openms-insight 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openms_insight/components/heatmap.py +92 -36
- openms_insight/components/lineplot.py +16 -3
- openms_insight/components/table.py +16 -3
- openms_insight/core/base.py +53 -24
- openms_insight/core/subprocess_preprocess.py +96 -0
- openms_insight/js-component/dist/assets/index.css +1 -1
- openms_insight/js-component/dist/assets/index.js +90 -90
- openms_insight/preprocessing/filtering.py +92 -15
- openms_insight/rendering/bridge.py +88 -27
- {openms_insight-0.1.0.dist-info → openms_insight-0.1.2.dist-info}/METADATA +12 -12
- {openms_insight-0.1.0.dist-info → openms_insight-0.1.2.dist-info}/RECORD +13 -12
- {openms_insight-0.1.0.dist-info → openms_insight-0.1.2.dist-info}/WHEEL +0 -0
- {openms_insight-0.1.0.dist-info → openms_insight-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -69,6 +69,7 @@ class Heatmap(BaseComponent):
|
|
|
69
69
|
x_column: str,
|
|
70
70
|
y_column: str,
|
|
71
71
|
data: Optional[pl.LazyFrame] = None,
|
|
72
|
+
data_path: Optional[str] = None,
|
|
72
73
|
intensity_column: str = 'intensity',
|
|
73
74
|
filters: Optional[Dict[str, str]] = None,
|
|
74
75
|
filter_defaults: Optional[Dict[str, Any]] = None,
|
|
@@ -97,6 +98,7 @@ class Heatmap(BaseComponent):
|
|
|
97
98
|
x_column: Name of column for x-axis values
|
|
98
99
|
y_column: Name of column for y-axis values
|
|
99
100
|
data: Polars LazyFrame with heatmap data. Optional if cache exists.
|
|
101
|
+
data_path: Path to parquet file (preferred for large datasets).
|
|
100
102
|
intensity_column: Name of column for intensity/color values
|
|
101
103
|
filters: Mapping of identifier names to column names for filtering
|
|
102
104
|
interactivity: Mapping of identifier names to column names for clicks.
|
|
@@ -142,11 +144,27 @@ class Heatmap(BaseComponent):
|
|
|
142
144
|
super().__init__(
|
|
143
145
|
cache_id=cache_id,
|
|
144
146
|
data=data,
|
|
147
|
+
data_path=data_path,
|
|
145
148
|
filters=filters,
|
|
146
149
|
filter_defaults=filter_defaults,
|
|
147
150
|
interactivity=interactivity,
|
|
148
151
|
cache_path=cache_path,
|
|
149
152
|
regenerate_cache=regenerate_cache,
|
|
153
|
+
# Pass component-specific params for subprocess recreation
|
|
154
|
+
x_column=x_column,
|
|
155
|
+
y_column=y_column,
|
|
156
|
+
intensity_column=intensity_column,
|
|
157
|
+
min_points=min_points,
|
|
158
|
+
x_bins=x_bins,
|
|
159
|
+
y_bins=y_bins,
|
|
160
|
+
zoom_identifier=zoom_identifier,
|
|
161
|
+
title=title,
|
|
162
|
+
x_label=x_label,
|
|
163
|
+
y_label=y_label,
|
|
164
|
+
colorscale=colorscale,
|
|
165
|
+
use_simple_downsample=use_simple_downsample,
|
|
166
|
+
use_streaming=use_streaming,
|
|
167
|
+
categorical_filters=categorical_filters,
|
|
150
168
|
**kwargs
|
|
151
169
|
)
|
|
152
170
|
|
|
@@ -211,6 +229,8 @@ class Heatmap(BaseComponent):
|
|
|
211
229
|
render time, the resulting data has ~min_points regardless of the
|
|
212
230
|
filter value selected.
|
|
213
231
|
|
|
232
|
+
Data is sorted by x, y columns for efficient range query predicate pushdown.
|
|
233
|
+
|
|
214
234
|
Example: For im_dimension with values [0, 1, 2, 3], creates:
|
|
215
235
|
- cat_level_im_dimension_0_0: 20K points with im_id=0
|
|
216
236
|
- cat_level_im_dimension_0_1: 20K points with im_id=1
|
|
@@ -271,9 +291,8 @@ class Heatmap(BaseComponent):
|
|
|
271
291
|
|
|
272
292
|
# Store level sizes for this filter value
|
|
273
293
|
self._preprocessed_data[f'cat_level_sizes_{filter_id}_{filter_value}'] = level_sizes
|
|
274
|
-
self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = len(level_sizes)
|
|
275
294
|
|
|
276
|
-
# Build each level
|
|
295
|
+
# Build each compressed level
|
|
277
296
|
for level_idx, target_size in enumerate(level_sizes):
|
|
278
297
|
# If target size equals total, skip downsampling - use all data
|
|
279
298
|
if target_size >= filtered_total:
|
|
@@ -297,15 +316,25 @@ class Heatmap(BaseComponent):
|
|
|
297
316
|
y_range=y_range,
|
|
298
317
|
)
|
|
299
318
|
|
|
300
|
-
#
|
|
319
|
+
# Sort by x, y for efficient range query predicate pushdown
|
|
320
|
+
level = level.sort([self._x_column, self._y_column])
|
|
321
|
+
# Store LazyFrame for streaming to disk
|
|
301
322
|
level_key = f'cat_level_{filter_id}_{filter_value}_{level_idx}'
|
|
302
|
-
self._preprocessed_data[level_key] = level
|
|
323
|
+
self._preprocessed_data[level_key] = level # Keep lazy
|
|
324
|
+
|
|
325
|
+
# Add full resolution as final level (for zoom fallback)
|
|
326
|
+
# Also sorted for consistent predicate pushdown behavior
|
|
327
|
+
num_compressed = len(level_sizes)
|
|
328
|
+
full_res_key = f'cat_level_{filter_id}_{filter_value}_{num_compressed}'
|
|
329
|
+
self._preprocessed_data[full_res_key] = filtered_data.sort(
|
|
330
|
+
[self._x_column, self._y_column]
|
|
331
|
+
)
|
|
332
|
+
self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = num_compressed + 1
|
|
303
333
|
|
|
304
334
|
# Also create global levels for when no categorical filter is selected
|
|
305
335
|
# (fallback to standard behavior)
|
|
306
336
|
level_sizes = compute_compression_levels(self._min_points, total)
|
|
307
337
|
self._preprocessed_data['level_sizes'] = level_sizes
|
|
308
|
-
self._preprocessed_data['num_levels'] = len(level_sizes)
|
|
309
338
|
|
|
310
339
|
for i, size in enumerate(level_sizes):
|
|
311
340
|
# If target size equals total, skip downsampling - use all data
|
|
@@ -329,13 +358,24 @@ class Heatmap(BaseComponent):
|
|
|
329
358
|
x_range=x_range,
|
|
330
359
|
y_range=y_range,
|
|
331
360
|
)
|
|
332
|
-
|
|
361
|
+
# Sort by x, y for efficient range query predicate pushdown
|
|
362
|
+
level = level.sort([self._x_column, self._y_column])
|
|
363
|
+
self._preprocessed_data[f'level_{i}'] = level # Keep lazy
|
|
364
|
+
|
|
365
|
+
# Add full resolution as final level (for zoom fallback)
|
|
366
|
+
# Also sorted for consistent predicate pushdown behavior
|
|
367
|
+
num_compressed = len(level_sizes)
|
|
368
|
+
self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
|
|
369
|
+
[self._x_column, self._y_column]
|
|
370
|
+
)
|
|
371
|
+
self._preprocessed_data['num_levels'] = num_compressed + 1
|
|
333
372
|
|
|
334
373
|
def _preprocess_streaming(self) -> None:
|
|
335
374
|
"""
|
|
336
|
-
Streaming preprocessing - levels stay lazy
|
|
375
|
+
Streaming preprocessing - levels stay lazy through caching.
|
|
337
376
|
|
|
338
|
-
Builds lazy query plans
|
|
377
|
+
Builds lazy query plans that are streamed to disk via sink_parquet().
|
|
378
|
+
Data is sorted by x, y columns for efficient range query predicate pushdown.
|
|
339
379
|
"""
|
|
340
380
|
# Get data ranges (minimal collect - just 4 values)
|
|
341
381
|
x_range, y_range = get_data_range(
|
|
@@ -379,12 +419,22 @@ class Heatmap(BaseComponent):
|
|
|
379
419
|
x_range=x_range,
|
|
380
420
|
y_range=y_range,
|
|
381
421
|
)
|
|
382
|
-
#
|
|
383
|
-
#
|
|
384
|
-
|
|
422
|
+
# Sort by x, y for efficient range query predicate pushdown
|
|
423
|
+
# This clusters spatially close points together in row groups
|
|
424
|
+
level = level.sort([self._x_column, self._y_column])
|
|
425
|
+
# Store LazyFrame for streaming to disk
|
|
426
|
+
# Base class will use sink_parquet() to stream without full materialization
|
|
427
|
+
self._preprocessed_data[f'level_{i}'] = level # Keep lazy
|
|
428
|
+
|
|
429
|
+
# Add full resolution as final level (for zoom fallback)
|
|
430
|
+
# Also sorted for consistent predicate pushdown behavior
|
|
431
|
+
num_compressed = len(level_sizes)
|
|
432
|
+
self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
|
|
433
|
+
[self._x_column, self._y_column]
|
|
434
|
+
)
|
|
385
435
|
|
|
386
|
-
# Store number of levels for reconstruction
|
|
387
|
-
self._preprocessed_data['num_levels'] =
|
|
436
|
+
# Store number of levels for reconstruction (includes full resolution)
|
|
437
|
+
self._preprocessed_data['num_levels'] = num_compressed + 1
|
|
388
438
|
|
|
389
439
|
def _preprocess_eager(self) -> None:
|
|
390
440
|
"""
|
|
@@ -392,6 +442,7 @@ class Heatmap(BaseComponent):
|
|
|
392
442
|
|
|
393
443
|
Uses more memory at init but faster rendering. Uses scipy-based
|
|
394
444
|
downsampling for better spatial distribution.
|
|
445
|
+
Data is sorted by x, y columns for efficient range query predicate pushdown.
|
|
395
446
|
"""
|
|
396
447
|
# Get data ranges
|
|
397
448
|
x_range, y_range = get_data_range(
|
|
@@ -434,16 +485,29 @@ class Heatmap(BaseComponent):
|
|
|
434
485
|
x_bins=self._x_bins,
|
|
435
486
|
y_bins=self._y_bins,
|
|
436
487
|
)
|
|
437
|
-
#
|
|
488
|
+
# Sort by x, y for efficient range query predicate pushdown
|
|
489
|
+
if isinstance(downsampled, pl.LazyFrame):
|
|
490
|
+
downsampled = downsampled.sort([self._x_column, self._y_column])
|
|
491
|
+
else:
|
|
492
|
+
downsampled = downsampled.sort([self._x_column, self._y_column])
|
|
493
|
+
# Store LazyFrame for streaming to disk
|
|
438
494
|
level_idx = len(level_sizes) - 1 - i
|
|
439
495
|
if isinstance(downsampled, pl.LazyFrame):
|
|
440
|
-
self._preprocessed_data[f'level_{level_idx}'] = downsampled
|
|
496
|
+
self._preprocessed_data[f'level_{level_idx}'] = downsampled # Keep lazy
|
|
441
497
|
else:
|
|
442
|
-
|
|
498
|
+
# DataFrame from downsample_2d - convert back to lazy
|
|
499
|
+
self._preprocessed_data[f'level_{level_idx}'] = downsampled.lazy()
|
|
443
500
|
current = downsampled
|
|
444
501
|
|
|
445
|
-
#
|
|
446
|
-
|
|
502
|
+
# Add full resolution as final level (for zoom fallback)
|
|
503
|
+
# Also sorted for consistent predicate pushdown behavior
|
|
504
|
+
num_compressed = len(level_sizes)
|
|
505
|
+
self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
|
|
506
|
+
[self._x_column, self._y_column]
|
|
507
|
+
)
|
|
508
|
+
|
|
509
|
+
# Store number of levels for reconstruction (includes full resolution)
|
|
510
|
+
self._preprocessed_data['num_levels'] = num_compressed + 1
|
|
447
511
|
|
|
448
512
|
def _get_levels(self) -> list:
|
|
449
513
|
"""
|
|
@@ -460,10 +524,6 @@ class Heatmap(BaseComponent):
|
|
|
460
524
|
if level_data is not None:
|
|
461
525
|
levels.append(level_data)
|
|
462
526
|
|
|
463
|
-
# Add full resolution at end (if raw data available)
|
|
464
|
-
if self._raw_data is not None:
|
|
465
|
-
levels.append(self._raw_data)
|
|
466
|
-
|
|
467
527
|
return levels
|
|
468
528
|
|
|
469
529
|
def _get_categorical_levels(
|
|
@@ -496,13 +556,7 @@ class Heatmap(BaseComponent):
|
|
|
496
556
|
if level_data is not None:
|
|
497
557
|
levels.append(level_data)
|
|
498
558
|
|
|
499
|
-
|
|
500
|
-
filtered_raw = None
|
|
501
|
-
if self._raw_data is not None and filter_id in self._filters:
|
|
502
|
-
column_name = self._filters[filter_id]
|
|
503
|
-
filtered_raw = self._raw_data.filter(pl.col(column_name) == filter_value)
|
|
504
|
-
|
|
505
|
-
return levels, filtered_raw
|
|
559
|
+
return levels, None # Full resolution included in cached levels
|
|
506
560
|
|
|
507
561
|
def _get_levels_for_state(self, state: Dict[str, Any]) -> Tuple[list, Optional[pl.LazyFrame]]:
|
|
508
562
|
"""
|
|
@@ -630,10 +684,11 @@ class Heatmap(BaseComponent):
|
|
|
630
684
|
|
|
631
685
|
if count >= self._min_points:
|
|
632
686
|
# This level has enough detail
|
|
633
|
-
if count > self._min_points
|
|
634
|
-
#
|
|
635
|
-
|
|
636
|
-
|
|
687
|
+
if count > self._min_points:
|
|
688
|
+
# Over limit - downsample to stay at/under max
|
|
689
|
+
# Use ZOOM range for binning (not global) to avoid sparse bins
|
|
690
|
+
zoom_x_range = (x0, x1)
|
|
691
|
+
zoom_y_range = (y0, y1)
|
|
637
692
|
if self._use_streaming or self._use_simple_downsample:
|
|
638
693
|
if self._use_simple_downsample:
|
|
639
694
|
return downsample_2d_simple(
|
|
@@ -650,8 +705,8 @@ class Heatmap(BaseComponent):
|
|
|
650
705
|
intensity_column=self._intensity_column,
|
|
651
706
|
x_bins=self._x_bins,
|
|
652
707
|
y_bins=self._y_bins,
|
|
653
|
-
x_range=
|
|
654
|
-
y_range=
|
|
708
|
+
x_range=zoom_x_range,
|
|
709
|
+
y_range=zoom_y_range,
|
|
655
710
|
).collect()
|
|
656
711
|
else:
|
|
657
712
|
return downsample_2d(
|
|
@@ -744,7 +799,8 @@ class Heatmap(BaseComponent):
|
|
|
744
799
|
df_pandas = df_pandas.sort_values(self._intensity_column).reset_index(drop=True)
|
|
745
800
|
else:
|
|
746
801
|
# No filters to apply - levels already filtered by categorical filter
|
|
747
|
-
|
|
802
|
+
schema_names = data.collect_schema().names()
|
|
803
|
+
available_cols = [c for c in columns_to_select if c in schema_names]
|
|
748
804
|
df_polars = data.select(available_cols).collect()
|
|
749
805
|
# Sort by intensity ascending so high-intensity points are drawn on top
|
|
750
806
|
df_polars = df_polars.sort(self._intensity_column)
|
|
@@ -45,6 +45,7 @@ class LinePlot(BaseComponent):
|
|
|
45
45
|
self,
|
|
46
46
|
cache_id: str,
|
|
47
47
|
data: Optional[pl.LazyFrame] = None,
|
|
48
|
+
data_path: Optional[str] = None,
|
|
48
49
|
filters: Optional[Dict[str, str]] = None,
|
|
49
50
|
filter_defaults: Optional[Dict[str, Any]] = None,
|
|
50
51
|
interactivity: Optional[Dict[str, str]] = None,
|
|
@@ -68,6 +69,7 @@ class LinePlot(BaseComponent):
|
|
|
68
69
|
cache_id: Unique identifier for this component's cache (MANDATORY).
|
|
69
70
|
Creates a folder {cache_path}/{cache_id}/ for cached data.
|
|
70
71
|
data: Polars LazyFrame with plot data. Optional if cache exists.
|
|
72
|
+
data_path: Path to parquet file (preferred for large datasets).
|
|
71
73
|
filters: Mapping of identifier names to column names for filtering.
|
|
72
74
|
Example: {'spectrum': 'scan_id'}
|
|
73
75
|
When 'spectrum' selection exists, plot shows only data where
|
|
@@ -116,11 +118,22 @@ class LinePlot(BaseComponent):
|
|
|
116
118
|
super().__init__(
|
|
117
119
|
cache_id=cache_id,
|
|
118
120
|
data=data,
|
|
121
|
+
data_path=data_path,
|
|
119
122
|
filters=filters,
|
|
120
123
|
filter_defaults=filter_defaults,
|
|
121
124
|
interactivity=interactivity,
|
|
122
125
|
cache_path=cache_path,
|
|
123
126
|
regenerate_cache=regenerate_cache,
|
|
127
|
+
# Pass component-specific params for subprocess recreation
|
|
128
|
+
x_column=x_column,
|
|
129
|
+
y_column=y_column,
|
|
130
|
+
title=title,
|
|
131
|
+
x_label=x_label,
|
|
132
|
+
y_label=y_label,
|
|
133
|
+
highlight_column=highlight_column,
|
|
134
|
+
annotation_column=annotation_column,
|
|
135
|
+
styling=styling,
|
|
136
|
+
config=config,
|
|
124
137
|
**kwargs
|
|
125
138
|
)
|
|
126
139
|
|
|
@@ -208,9 +221,9 @@ class LinePlot(BaseComponent):
|
|
|
208
221
|
'annotation_column': self._annotation_column,
|
|
209
222
|
}
|
|
210
223
|
|
|
211
|
-
#
|
|
212
|
-
# Base class will
|
|
213
|
-
self._preprocessed_data['data'] = data
|
|
224
|
+
# Store LazyFrame for streaming to disk (filter happens at render time)
|
|
225
|
+
# Base class will use sink_parquet() to stream without full materialization
|
|
226
|
+
self._preprocessed_data['data'] = data # Keep lazy
|
|
214
227
|
|
|
215
228
|
def _get_vue_component_name(self) -> str:
|
|
216
229
|
"""Return the Vue component name."""
|
|
@@ -49,6 +49,7 @@ class Table(BaseComponent):
|
|
|
49
49
|
self,
|
|
50
50
|
cache_id: str,
|
|
51
51
|
data: Optional[pl.LazyFrame] = None,
|
|
52
|
+
data_path: Optional[str] = None,
|
|
52
53
|
filters: Optional[Dict[str, str]] = None,
|
|
53
54
|
filter_defaults: Optional[Dict[str, Any]] = None,
|
|
54
55
|
interactivity: Optional[Dict[str, str]] = None,
|
|
@@ -72,6 +73,7 @@ class Table(BaseComponent):
|
|
|
72
73
|
cache_id: Unique identifier for this component's cache (MANDATORY).
|
|
73
74
|
Creates a folder {cache_path}/{cache_id}/ for cached data.
|
|
74
75
|
data: Polars LazyFrame with table data. Optional if cache exists.
|
|
76
|
+
data_path: Path to parquet file (preferred for large datasets).
|
|
75
77
|
filters: Mapping of identifier names to column names for filtering.
|
|
76
78
|
Example: {'spectrum': 'scan_id'}
|
|
77
79
|
When 'spectrum' selection exists, table shows only rows where
|
|
@@ -120,11 +122,22 @@ class Table(BaseComponent):
|
|
|
120
122
|
super().__init__(
|
|
121
123
|
cache_id=cache_id,
|
|
122
124
|
data=data,
|
|
125
|
+
data_path=data_path,
|
|
123
126
|
filters=filters,
|
|
124
127
|
filter_defaults=filter_defaults,
|
|
125
128
|
interactivity=interactivity,
|
|
126
129
|
cache_path=cache_path,
|
|
127
130
|
regenerate_cache=regenerate_cache,
|
|
131
|
+
# Pass component-specific params for subprocess recreation
|
|
132
|
+
column_definitions=column_definitions,
|
|
133
|
+
title=title,
|
|
134
|
+
index_field=index_field,
|
|
135
|
+
go_to_fields=go_to_fields,
|
|
136
|
+
layout=layout,
|
|
137
|
+
default_row=default_row,
|
|
138
|
+
initial_sort=initial_sort,
|
|
139
|
+
pagination=pagination,
|
|
140
|
+
page_size=page_size,
|
|
128
141
|
**kwargs
|
|
129
142
|
)
|
|
130
143
|
|
|
@@ -204,9 +217,9 @@ class Table(BaseComponent):
|
|
|
204
217
|
# Store column definitions in preprocessed data for serialization
|
|
205
218
|
self._preprocessed_data['column_definitions'] = self._column_definitions
|
|
206
219
|
|
|
207
|
-
#
|
|
208
|
-
# Base class will
|
|
209
|
-
self._preprocessed_data['data'] = data
|
|
220
|
+
# Store LazyFrame for streaming to disk (filter happens at render time)
|
|
221
|
+
# Base class will use sink_parquet() to stream without full materialization
|
|
222
|
+
self._preprocessed_data['data'] = data # Keep lazy
|
|
210
223
|
|
|
211
224
|
def _get_columns_to_select(self) -> Optional[List[str]]:
|
|
212
225
|
"""Get list of columns needed for this table."""
|
openms_insight/core/base.py
CHANGED
|
@@ -15,7 +15,8 @@ if TYPE_CHECKING:
|
|
|
15
15
|
|
|
16
16
|
# Cache format version - increment when cache structure changes
|
|
17
17
|
# Version 2: Added sorting by filter columns + smaller row groups for predicate pushdown
|
|
18
|
-
|
|
18
|
+
# Version 3: Downcast numeric types (Int64→Int32, Float64→Float32) for efficient transfer
|
|
19
|
+
CACHE_VERSION = 3
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class BaseComponent(ABC):
|
|
@@ -43,6 +44,7 @@ class BaseComponent(ABC):
|
|
|
43
44
|
self,
|
|
44
45
|
cache_id: str,
|
|
45
46
|
data: Optional[pl.LazyFrame] = None,
|
|
47
|
+
data_path: Optional[str] = None,
|
|
46
48
|
filters: Optional[Dict[str, str]] = None,
|
|
47
49
|
filter_defaults: Optional[Dict[str, Any]] = None,
|
|
48
50
|
interactivity: Optional[Dict[str, str]] = None,
|
|
@@ -57,6 +59,9 @@ class BaseComponent(ABC):
|
|
|
57
59
|
cache_id: Unique identifier for this component's cache (MANDATORY).
|
|
58
60
|
Creates a folder {cache_path}/{cache_id}/ for cached data.
|
|
59
61
|
data: Polars LazyFrame with source data. Optional if cache exists.
|
|
62
|
+
data_path: Path to parquet file with source data. Preferred over
|
|
63
|
+
data= for large datasets as preprocessing runs in a subprocess
|
|
64
|
+
to ensure memory is released after cache creation.
|
|
60
65
|
filters: Mapping of identifier names to column names for filtering.
|
|
61
66
|
Example: {'spectrum': 'scan_id'}
|
|
62
67
|
When 'spectrum' selection exists, component filters data where
|
|
@@ -73,6 +78,10 @@ class BaseComponent(ABC):
|
|
|
73
78
|
regenerate_cache: If True, regenerate cache even if valid cache exists.
|
|
74
79
|
**kwargs: Component-specific configuration options
|
|
75
80
|
"""
|
|
81
|
+
# Validate inputs
|
|
82
|
+
if data is not None and data_path is not None:
|
|
83
|
+
raise ValueError("Provide either 'data' or 'data_path', not both")
|
|
84
|
+
|
|
76
85
|
self._cache_id = cache_id
|
|
77
86
|
self._cache_dir = get_cache_dir(cache_path, cache_id)
|
|
78
87
|
self._filters = filters or {}
|
|
@@ -83,18 +92,33 @@ class BaseComponent(ABC):
|
|
|
83
92
|
|
|
84
93
|
# Check if we should load from cache or preprocess
|
|
85
94
|
if regenerate_cache or not self._is_cache_valid():
|
|
86
|
-
if data is None:
|
|
95
|
+
if data is None and data_path is None:
|
|
87
96
|
raise CacheMissError(
|
|
88
97
|
f"Cache not found at '{self._cache_dir}' and no data provided. "
|
|
89
|
-
f"Either provide data
|
|
98
|
+
f"Either provide data=, data_path=, or ensure cache exists."
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
if data_path is not None:
|
|
102
|
+
# Subprocess preprocessing - memory released after cache creation
|
|
103
|
+
from .subprocess_preprocess import preprocess_component
|
|
104
|
+
preprocess_component(
|
|
105
|
+
type(self),
|
|
106
|
+
data_path=data_path,
|
|
107
|
+
cache_id=cache_id,
|
|
108
|
+
cache_path=cache_path,
|
|
109
|
+
filters=filters,
|
|
110
|
+
filter_defaults=filter_defaults,
|
|
111
|
+
interactivity=interactivity,
|
|
112
|
+
**kwargs
|
|
90
113
|
)
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
114
|
+
self._raw_data = None
|
|
115
|
+
self._load_from_cache()
|
|
116
|
+
else:
|
|
117
|
+
# In-process preprocessing (backward compatible)
|
|
118
|
+
self._raw_data = data
|
|
119
|
+
self._validate_mappings()
|
|
120
|
+
self._preprocess()
|
|
121
|
+
self._save_to_cache()
|
|
98
122
|
else:
|
|
99
123
|
# Load from valid cache
|
|
100
124
|
self._raw_data = None
|
|
@@ -213,6 +237,8 @@ class BaseComponent(ABC):
|
|
|
213
237
|
|
|
214
238
|
def _save_to_cache(self) -> None:
|
|
215
239
|
"""Save preprocessed data to cache."""
|
|
240
|
+
from ..preprocessing.filtering import optimize_for_transfer, optimize_for_transfer_lazy
|
|
241
|
+
|
|
216
242
|
# Create directories
|
|
217
243
|
self._cache_dir.mkdir(parents=True, exist_ok=True)
|
|
218
244
|
preprocessed_dir = self._get_preprocessed_dir()
|
|
@@ -231,28 +257,24 @@ class BaseComponent(ABC):
|
|
|
231
257
|
"data_values": {},
|
|
232
258
|
}
|
|
233
259
|
|
|
234
|
-
# Save preprocessed data
|
|
235
|
-
|
|
260
|
+
# Save preprocessed data with type optimization for efficient transfer
|
|
261
|
+
# Float64→Float32 reduces Arrow payload size
|
|
262
|
+
# Int64→Int32 (when safe) avoids BigInt overhead in JavaScript
|
|
236
263
|
for key, value in self._preprocessed_data.items():
|
|
237
264
|
if isinstance(value, pl.LazyFrame):
|
|
238
265
|
filename = f"{key}.parquet"
|
|
239
266
|
filepath = preprocessed_dir / filename
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
row_group_size=row_group_size,
|
|
245
|
-
)
|
|
267
|
+
# Apply streaming-safe optimization (Float64→Float32 only)
|
|
268
|
+
# Int64 bounds checking would require collect(), breaking streaming
|
|
269
|
+
value = optimize_for_transfer_lazy(value)
|
|
270
|
+
value.sink_parquet(filepath, compression='zstd')
|
|
246
271
|
manifest["data_files"][key] = filename
|
|
247
272
|
elif isinstance(value, pl.DataFrame):
|
|
248
273
|
filename = f"{key}.parquet"
|
|
249
274
|
filepath = preprocessed_dir / filename
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
statistics=True,
|
|
254
|
-
row_group_size=row_group_size,
|
|
255
|
-
)
|
|
275
|
+
# Full optimization including Int64→Int32 with bounds checking
|
|
276
|
+
value = optimize_for_transfer(value)
|
|
277
|
+
value.write_parquet(filepath, compression='zstd')
|
|
256
278
|
manifest["data_files"][key] = filename
|
|
257
279
|
elif self._is_json_serializable(value):
|
|
258
280
|
manifest["data_values"][key] = value
|
|
@@ -261,6 +283,13 @@ class BaseComponent(ABC):
|
|
|
261
283
|
with open(self._get_manifest_path(), "w") as f:
|
|
262
284
|
json.dump(manifest, f, indent=2)
|
|
263
285
|
|
|
286
|
+
# Release memory - data is now safely on disk
|
|
287
|
+
self._preprocessed_data = {}
|
|
288
|
+
self._raw_data = None
|
|
289
|
+
|
|
290
|
+
# Reload as lazy scan_parquet() references
|
|
291
|
+
self._load_from_cache()
|
|
292
|
+
|
|
264
293
|
def _is_json_serializable(self, value: Any) -> bool:
|
|
265
294
|
"""Check if value can be JSON serialized."""
|
|
266
295
|
try:
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Subprocess-based preprocessing to ensure memory is released after cache creation.
|
|
2
|
+
|
|
3
|
+
When preprocessing large datasets (especially heatmaps with millions of points),
|
|
4
|
+
memory allocators like mimalloc retain freed memory. Running preprocessing in a
|
|
5
|
+
subprocess ensures all memory is returned to the OS when the subprocess exits.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import multiprocessing
|
|
9
|
+
import os
|
|
10
|
+
import traceback
|
|
11
|
+
from typing import Any, Dict, Type
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _preprocess_worker(
    component_class: Type,
    data_path: str,
    kwargs: Dict[str, Any],
    error_queue: multiprocessing.Queue,
) -> None:
    """Build a component from a parquet file and report the outcome on a queue.

    Intended to be used as the target of a ``multiprocessing.Process``.
    The component is constructed as ``component_class(data=..., **kwargs)``
    with a lazy scan of ``data_path``; the constructed object is discarded.

    Args:
        component_class: Callable invoked with ``data=`` plus ``kwargs``.
        data_path: Path passed to ``polars.scan_parquet``.
        kwargs: Keyword arguments forwarded unchanged to ``component_class``.
        error_queue: Receives exactly one item: ``None`` on success, or a
            ``(exception type name, message, formatted traceback)`` tuple of
            strings when an ``Exception`` is raised.
    """
    try:
        # Export the allocator setting BEFORE importing polars. Native
        # allocators generally read their options when the library is loaded,
        # so setting the variable after the import would likely come too late.
        # setdefault keeps any value inherited from the parent process.
        os.environ.setdefault("MIMALLOC_PURGE_DELAY", "0")

        import polars as pl

        # Lazy scan: no rows are read here; the component decides what to
        # materialize while it preprocesses and writes its cache.
        data = pl.scan_parquet(data_path)
        component_class(data=data, **kwargs)

        # Signal success explicitly so the parent can tell "finished cleanly"
        # apart from "died without reporting".
        error_queue.put(None)
    except Exception as e:
        # Exceptions are sent as plain strings, so the parent never needs to
        # unpickle an arbitrary exception class.
        error_queue.put((type(e).__name__, str(e), traceback.format_exc()))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def preprocess_component(
    component_class: Type,
    data_path: str,
    cache_id: str,
    cache_path: str,
    **kwargs,
) -> None:
    """
    Run component preprocessing in a subprocess to guarantee memory release.

    This is an internal function called by BaseComponent when data_path is
    provided. Users should use the component constructor directly:

        heatmap = Heatmap(
            data_path="/path/to/data.parquet",
            cache_id="my_heatmap",
            cache_path="/path/to/cache",
            x_column="rt",
            y_column="mz",
            intensity_column="intensity",
        )

    Args:
        component_class: The component class (e.g., Heatmap, Table)
        data_path: Path to the parquet file containing the data
        cache_id: Unique identifier for the cache
        cache_path: Directory for cache storage
        **kwargs: Additional arguments passed to component constructor

    Raises:
        RuntimeError: If the worker reports an exception (the message carries
            the original exception type, message and traceback), or if the
            worker exits with a non-zero exit code.
    """
    from queue import Empty

    # Everything the worker forwards to the component constructor.
    worker_kwargs = {
        "cache_id": cache_id,
        "cache_path": cache_path,
        **kwargs,
    }

    # Use spawn to get a fresh process (fork might copy memory)
    ctx = multiprocessing.get_context("spawn")
    error_queue = ctx.Queue()
    process = ctx.Process(
        target=_preprocess_worker,
        args=(component_class, data_path, worker_kwargs, error_queue),
    )
    process.start()

    # Read the worker's report BEFORE joining. A child that has put data on a
    # multiprocessing queue may not terminate until that data is consumed, so
    # join-then-read can deadlock; Queue.empty() is also documented as
    # unreliable. Polling with a timeout lets us notice a worker that died
    # without reporting anything (e.g. killed by the OS).
    error_info = None
    while True:
        try:
            error_info = error_queue.get(timeout=0.1)
            break
        except Empty:
            if process.is_alive():
                continue
            # Worker is gone: take one last look for a report that landed
            # between the timed-out get() and the liveness check.
            try:
                error_info = error_queue.get(timeout=0.1)
            except Empty:
                pass
            break

    process.join()

    if error_info is not None:
        exc_type, exc_msg, exc_tb = error_info
        raise RuntimeError(
            f"Subprocess preprocessing failed with {exc_type}: {exc_msg}\n"
            f"Subprocess traceback:\n{exc_tb}"
        )

    if process.exitcode != 0:
        raise RuntimeError(
            f"Preprocessing failed with exit code {process.exitcode}"
        )
|