openms-insight 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -69,6 +69,7 @@ class Heatmap(BaseComponent):
  x_column: str,
  y_column: str,
  data: Optional[pl.LazyFrame] = None,
+ data_path: Optional[str] = None,
  intensity_column: str = 'intensity',
  filters: Optional[Dict[str, str]] = None,
  filter_defaults: Optional[Dict[str, Any]] = None,
@@ -97,6 +98,7 @@ class Heatmap(BaseComponent):
  x_column: Name of column for x-axis values
  y_column: Name of column for y-axis values
  data: Polars LazyFrame with heatmap data. Optional if cache exists.
+ data_path: Path to parquet file (preferred for large datasets).
  intensity_column: Name of column for intensity/color values
  filters: Mapping of identifier names to column names for filtering
  interactivity: Mapping of identifier names to column names for clicks.
@@ -142,11 +144,27 @@ class Heatmap(BaseComponent):
  super().__init__(
  cache_id=cache_id,
  data=data,
+ data_path=data_path,
  filters=filters,
  filter_defaults=filter_defaults,
  interactivity=interactivity,
  cache_path=cache_path,
  regenerate_cache=regenerate_cache,
+ # Pass component-specific params for subprocess recreation
+ x_column=x_column,
+ y_column=y_column,
+ intensity_column=intensity_column,
+ min_points=min_points,
+ x_bins=x_bins,
+ y_bins=y_bins,
+ zoom_identifier=zoom_identifier,
+ title=title,
+ x_label=x_label,
+ y_label=y_label,
+ colorscale=colorscale,
+ use_simple_downsample=use_simple_downsample,
+ use_streaming=use_streaming,
+ categorical_filters=categorical_filters,
  **kwargs
  )

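The data_path route is documented later in this diff (see the new subprocess_preprocess module below); a minimal construction, borrowing the column names from that module's docstring, looks like this (the import path is assumed from the package name):

    from openms_insight import Heatmap  # import path assumed

    heatmap = Heatmap(
        data_path="/path/to/data.parquet",  # preprocessed in a spawned subprocess
        cache_id="my_heatmap",
        cache_path="/path/to/cache",
        x_column="rt",
        y_column="mz",
        intensity_column="intensity",
    )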
@@ -211,6 +229,8 @@ class Heatmap(BaseComponent):
  render time, the resulting data has ~min_points regardless of the
  filter value selected.

+ Data is sorted by x, y columns for efficient range query predicate pushdown.
+
  Example: For im_dimension with values [0, 1, 2, 3], creates:
  - cat_level_im_dimension_0_0: 20K points with im_id=0
  - cat_level_im_dimension_0_1: 20K points with im_id=1
@@ -271,9 +291,8 @@ class Heatmap(BaseComponent):

  # Store level sizes for this filter value
  self._preprocessed_data[f'cat_level_sizes_{filter_id}_{filter_value}'] = level_sizes
- self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = len(level_sizes)

- # Build each level
+ # Build each compressed level
  for level_idx, target_size in enumerate(level_sizes):
  # If target size equals total, skip downsampling - use all data
  if target_size >= filtered_total:
@@ -297,15 +316,25 @@ class Heatmap(BaseComponent):
  y_range=y_range,
  )

- # Collect and store
+ # Sort by x, y for efficient range query predicate pushdown
+ level = level.sort([self._x_column, self._y_column])
+ # Store LazyFrame for streaming to disk
  level_key = f'cat_level_{filter_id}_{filter_value}_{level_idx}'
- self._preprocessed_data[level_key] = level.collect()
+ self._preprocessed_data[level_key] = level # Keep lazy
+
+ # Add full resolution as final level (for zoom fallback)
+ # Also sorted for consistent predicate pushdown behavior
+ num_compressed = len(level_sizes)
+ full_res_key = f'cat_level_{filter_id}_{filter_value}_{num_compressed}'
+ self._preprocessed_data[full_res_key] = filtered_data.sort(
+ [self._x_column, self._y_column]
+ )
+ self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = num_compressed + 1

  # Also create global levels for when no categorical filter is selected
  # (fallback to standard behavior)
  level_sizes = compute_compression_levels(self._min_points, total)
  self._preprocessed_data['level_sizes'] = level_sizes
- self._preprocessed_data['num_levels'] = len(level_sizes)

  for i, size in enumerate(level_sizes):
  # If target size equals total, skip downsampling - use all data
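For orientation, the per-filter pyramid is addressable through the key scheme shown above; a hypothetical consumer could walk it like this (key names are taken from the diff, the helper itself is illustrative):

    def iter_cat_levels(preprocessed, filter_id, filter_value):
        # num_levels now counts the compressed levels plus the trailing full-resolution one
        n = preprocessed[f'cat_num_levels_{filter_id}_{filter_value}']
        for i in range(n):
            # index n - 1 is the sorted full-resolution LazyFrame
            yield preprocessed[f'cat_level_{filter_id}_{filter_value}_{i}']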
@@ -329,13 +358,24 @@ class Heatmap(BaseComponent):
  x_range=x_range,
  y_range=y_range,
  )
- self._preprocessed_data[f'level_{i}'] = level.collect()
+ # Sort by x, y for efficient range query predicate pushdown
+ level = level.sort([self._x_column, self._y_column])
+ self._preprocessed_data[f'level_{i}'] = level # Keep lazy
+
+ # Add full resolution as final level (for zoom fallback)
+ # Also sorted for consistent predicate pushdown behavior
+ num_compressed = len(level_sizes)
+ self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
+ [self._x_column, self._y_column]
+ )
+ self._preprocessed_data['num_levels'] = num_compressed + 1

  def _preprocess_streaming(self) -> None:
  """
- Streaming preprocessing - levels stay lazy until render.
+ Streaming preprocessing - levels stay lazy through caching.

- Builds lazy query plans and collects them for caching.
+ Builds lazy query plans that are streamed to disk via sink_parquet().
+ Data is sorted by x, y columns for efficient range query predicate pushdown.
  """
  # Get data ranges (minimal collect - just 4 values)
  x_range, y_range = get_data_range(
@@ -379,12 +419,22 @@ class Heatmap(BaseComponent):
  x_range=x_range,
  y_range=y_range,
  )
- # Collect and store as DataFrame for caching
- # Base class will serialize these to parquet
- self._preprocessed_data[f'level_{i}'] = level.collect()
+ # Sort by x, y for efficient range query predicate pushdown
+ # This clusters spatially close points together in row groups
+ level = level.sort([self._x_column, self._y_column])
+ # Store LazyFrame for streaming to disk
+ # Base class will use sink_parquet() to stream without full materialization
+ self._preprocessed_data[f'level_{i}'] = level # Keep lazy
+
+ # Add full resolution as final level (for zoom fallback)
+ # Also sorted for consistent predicate pushdown behavior
+ num_compressed = len(level_sizes)
+ self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
+ [self._x_column, self._y_column]
+ )

- # Store number of levels for reconstruction
- self._preprocessed_data['num_levels'] = len(level_sizes)
+ # Store number of levels for reconstruction (includes full resolution)
+ self._preprocessed_data['num_levels'] = num_compressed + 1

  def _preprocess_eager(self) -> None:
  """
@@ -392,6 +442,7 @@ class Heatmap(BaseComponent):

  Uses more memory at init but faster rendering. Uses scipy-based
  downsampling for better spatial distribution.
+ Data is sorted by x, y columns for efficient range query predicate pushdown.
  """
  # Get data ranges
  x_range, y_range = get_data_range(
@@ -434,16 +485,29 @@ class Heatmap(BaseComponent):
  x_bins=self._x_bins,
  y_bins=self._y_bins,
  )
- # Collect for caching - store with reversed index
+ # Sort by x, y for efficient range query predicate pushdown
+ if isinstance(downsampled, pl.LazyFrame):
+ downsampled = downsampled.sort([self._x_column, self._y_column])
+ else:
+ downsampled = downsampled.sort([self._x_column, self._y_column])
+ # Store LazyFrame for streaming to disk
  level_idx = len(level_sizes) - 1 - i
  if isinstance(downsampled, pl.LazyFrame):
- self._preprocessed_data[f'level_{level_idx}'] = downsampled.collect()
+ self._preprocessed_data[f'level_{level_idx}'] = downsampled # Keep lazy
  else:
- self._preprocessed_data[f'level_{level_idx}'] = downsampled
+ # DataFrame from downsample_2d - convert back to lazy
+ self._preprocessed_data[f'level_{level_idx}'] = downsampled.lazy()
  current = downsampled

- # Store number of levels for reconstruction
- self._preprocessed_data['num_levels'] = len(level_sizes)
+ # Add full resolution as final level (for zoom fallback)
+ # Also sorted for consistent predicate pushdown behavior
+ num_compressed = len(level_sizes)
+ self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
+ [self._x_column, self._y_column]
+ )
+
+ # Store number of levels for reconstruction (includes full resolution)
+ self._preprocessed_data['num_levels'] = num_compressed + 1

  def _get_levels(self) -> list:
  """
@@ -460,10 +524,6 @@ class Heatmap(BaseComponent):
  if level_data is not None:
  levels.append(level_data)

- # Add full resolution at end (if raw data available)
- if self._raw_data is not None:
- levels.append(self._raw_data)
-
  return levels

  def _get_categorical_levels(
@@ -496,13 +556,7 @@ class Heatmap(BaseComponent):
  if level_data is not None:
  levels.append(level_data)

- # Get filtered raw data for full resolution (if available)
- filtered_raw = None
- if self._raw_data is not None and filter_id in self._filters:
- column_name = self._filters[filter_id]
- filtered_raw = self._raw_data.filter(pl.col(column_name) == filter_value)
-
- return levels, filtered_raw
+ return levels, None # Full resolution included in cached levels

  def _get_levels_for_state(self, state: Dict[str, Any]) -> Tuple[list, Optional[pl.LazyFrame]]:
  """
@@ -630,10 +684,11 @@ class Heatmap(BaseComponent):

  if count >= self._min_points:
  # This level has enough detail
- if count > self._min_points * 2:
- # Still too many - downsample further
- x_range = self._preprocessed_data.get('x_range')
- y_range = self._preprocessed_data.get('y_range')
+ if count > self._min_points:
+ # Over limit - downsample to stay at/under max
+ # Use ZOOM range for binning (not global) to avoid sparse bins
+ zoom_x_range = (x0, x1)
+ zoom_y_range = (y0, y1)
  if self._use_streaming or self._use_simple_downsample:
  if self._use_simple_downsample:
  return downsample_2d_simple(
@@ -650,8 +705,8 @@ class Heatmap(BaseComponent):
  intensity_column=self._intensity_column,
  x_bins=self._x_bins,
  y_bins=self._y_bins,
- x_range=x_range,
- y_range=y_range,
+ x_range=zoom_x_range,
+ y_range=zoom_y_range,
  ).collect()
  else:
  return downsample_2d(
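Binning over the zoom range rather than the global range matters most at deep zoom: with x_range=(0, 1000) and x_bins=100 (illustrative values), each global bin is 10 units wide, so a zoom to (500, 510) would land almost entirely in a single bin; re-binning over (500, 510) spreads the full 100 bins across the visible window.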
@@ -744,7 +799,8 @@ class Heatmap(BaseComponent):
  df_pandas = df_pandas.sort_values(self._intensity_column).reset_index(drop=True)
  else:
  # No filters to apply - levels already filtered by categorical filter
- available_cols = [c for c in columns_to_select if c in data.columns]
+ schema_names = data.collect_schema().names()
+ available_cols = [c for c in columns_to_select if c in schema_names]
  df_polars = data.select(available_cols).collect()
  # Sort by intensity ascending so high-intensity points are drawn on top
  df_polars = df_polars.sort(self._intensity_column)
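The old check read .columns on a LazyFrame, which resolves the schema implicitly and is discouraged in recent polars releases; collect_schema() performs that resolution once, explicitly, and reads no data. A small sketch (path and column names illustrative):

    import polars as pl

    lf = pl.scan_parquet("points.parquet")  # illustrative path
    schema_names = lf.collect_schema().names()  # resolves the schema, reads no rows
    available = [c for c in ("rt", "mz", "intensity") if c in schema_names]
    df = lf.select(available).collect()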
@@ -45,6 +45,7 @@ class LinePlot(BaseComponent):
  self,
  cache_id: str,
  data: Optional[pl.LazyFrame] = None,
+ data_path: Optional[str] = None,
  filters: Optional[Dict[str, str]] = None,
  filter_defaults: Optional[Dict[str, Any]] = None,
  interactivity: Optional[Dict[str, str]] = None,
@@ -68,6 +69,7 @@ class LinePlot(BaseComponent):
  cache_id: Unique identifier for this component's cache (MANDATORY).
  Creates a folder {cache_path}/{cache_id}/ for cached data.
  data: Polars LazyFrame with plot data. Optional if cache exists.
+ data_path: Path to parquet file (preferred for large datasets).
  filters: Mapping of identifier names to column names for filtering.
  Example: {'spectrum': 'scan_id'}
  When 'spectrum' selection exists, plot shows only data where
@@ -116,11 +118,22 @@ class LinePlot(BaseComponent):
  super().__init__(
  cache_id=cache_id,
  data=data,
+ data_path=data_path,
  filters=filters,
  filter_defaults=filter_defaults,
  interactivity=interactivity,
  cache_path=cache_path,
  regenerate_cache=regenerate_cache,
+ # Pass component-specific params for subprocess recreation
+ x_column=x_column,
+ y_column=y_column,
+ title=title,
+ x_label=x_label,
+ y_label=y_label,
+ highlight_column=highlight_column,
+ annotation_column=annotation_column,
+ styling=styling,
+ config=config,
  **kwargs
  )

@@ -208,9 +221,9 @@ class LinePlot(BaseComponent):
  'annotation_column': self._annotation_column,
  }

- # Collect data for caching (filter happens at render time)
- # Base class will serialize this to parquet
- self._preprocessed_data['data'] = data.collect()
+ # Store LazyFrame for streaming to disk (filter happens at render time)
+ # Base class will use sink_parquet() to stream without full materialization
+ self._preprocessed_data['data'] = data # Keep lazy

  def _get_vue_component_name(self) -> str:
  """Return the Vue component name."""
@@ -49,6 +49,7 @@ class Table(BaseComponent):
  self,
  cache_id: str,
  data: Optional[pl.LazyFrame] = None,
+ data_path: Optional[str] = None,
  filters: Optional[Dict[str, str]] = None,
  filter_defaults: Optional[Dict[str, Any]] = None,
  interactivity: Optional[Dict[str, str]] = None,
@@ -72,6 +73,7 @@ class Table(BaseComponent):
  cache_id: Unique identifier for this component's cache (MANDATORY).
  Creates a folder {cache_path}/{cache_id}/ for cached data.
  data: Polars LazyFrame with table data. Optional if cache exists.
+ data_path: Path to parquet file (preferred for large datasets).
  filters: Mapping of identifier names to column names for filtering.
  Example: {'spectrum': 'scan_id'}
  When 'spectrum' selection exists, table shows only rows where
@@ -120,11 +122,22 @@ class Table(BaseComponent):
  super().__init__(
  cache_id=cache_id,
  data=data,
+ data_path=data_path,
  filters=filters,
  filter_defaults=filter_defaults,
  interactivity=interactivity,
  cache_path=cache_path,
  regenerate_cache=regenerate_cache,
+ # Pass component-specific params for subprocess recreation
+ column_definitions=column_definitions,
+ title=title,
+ index_field=index_field,
+ go_to_fields=go_to_fields,
+ layout=layout,
+ default_row=default_row,
+ initial_sort=initial_sort,
+ pagination=pagination,
+ page_size=page_size,
  **kwargs
  )

@@ -204,9 +217,9 @@ class Table(BaseComponent):
  # Store column definitions in preprocessed data for serialization
  self._preprocessed_data['column_definitions'] = self._column_definitions

- # Collect data for caching (filter happens at render time)
- # Base class will serialize this to parquet with optimized row groups
- self._preprocessed_data['data'] = data.collect()
+ # Store LazyFrame for streaming to disk (filter happens at render time)
+ # Base class will use sink_parquet() to stream without full materialization
+ self._preprocessed_data['data'] = data # Keep lazy

  def _get_columns_to_select(self) -> Optional[List[str]]:
  """Get list of columns needed for this table."""
@@ -15,7 +15,8 @@ if TYPE_CHECKING:

  # Cache format version - increment when cache structure changes
  # Version 2: Added sorting by filter columns + smaller row groups for predicate pushdown
- CACHE_VERSION = 2
+ # Version 3: Downcast numeric types (Int64→Int32, Float64→Float32) for efficient transfer
+ CACHE_VERSION = 3

 
  class BaseComponent(ABC):
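Assuming the manifest records CACHE_VERSION (consistent with the "increment when cache structure changes" comment), bumping it to 3 invalidates caches written by earlier versions, so the first run after upgrading regenerates them with the downcast types.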
@@ -43,6 +44,7 @@ class BaseComponent(ABC):
  self,
  cache_id: str,
  data: Optional[pl.LazyFrame] = None,
+ data_path: Optional[str] = None,
  filters: Optional[Dict[str, str]] = None,
  filter_defaults: Optional[Dict[str, Any]] = None,
  interactivity: Optional[Dict[str, str]] = None,
@@ -57,6 +59,9 @@ class BaseComponent(ABC):
  cache_id: Unique identifier for this component's cache (MANDATORY).
  Creates a folder {cache_path}/{cache_id}/ for cached data.
  data: Polars LazyFrame with source data. Optional if cache exists.
+ data_path: Path to parquet file with source data. Preferred over
+ data= for large datasets as preprocessing runs in a subprocess
+ to ensure memory is released after cache creation.
  filters: Mapping of identifier names to column names for filtering.
  Example: {'spectrum': 'scan_id'}
  When 'spectrum' selection exists, component filters data where
@@ -73,6 +78,10 @@ class BaseComponent(ABC):
  regenerate_cache: If True, regenerate cache even if valid cache exists.
  **kwargs: Component-specific configuration options
  """
+ # Validate inputs
+ if data is not None and data_path is not None:
+ raise ValueError("Provide either 'data' or 'data_path', not both")
+
  self._cache_id = cache_id
  self._cache_dir = get_cache_dir(cache_path, cache_id)
  self._filters = filters or {}
@@ -83,18 +92,33 @@ class BaseComponent(ABC):

  # Check if we should load from cache or preprocess
  if regenerate_cache or not self._is_cache_valid():
- if data is None:
+ if data is None and data_path is None:
  raise CacheMissError(
  f"Cache not found at '{self._cache_dir}' and no data provided. "
- f"Either provide data= or ensure cache exists from a previous run."
+ f"Either provide data=, data_path=, or ensure cache exists."
+ )
+
+ if data_path is not None:
+ # Subprocess preprocessing - memory released after cache creation
+ from .subprocess_preprocess import preprocess_component
+ preprocess_component(
+ type(self),
+ data_path=data_path,
+ cache_id=cache_id,
+ cache_path=cache_path,
+ filters=filters,
+ filter_defaults=filter_defaults,
+ interactivity=interactivity,
+ **kwargs
  )
- self._raw_data = data
- # Validate columns exist in data
- self._validate_mappings()
- # Run component-specific preprocessing
- self._preprocess()
- # Save to cache for next time
- self._save_to_cache()
+ self._raw_data = None
+ self._load_from_cache()
+ else:
+ # In-process preprocessing (backward compatible)
+ self._raw_data = data
+ self._validate_mappings()
+ self._preprocess()
+ self._save_to_cache()
  else:
  # Load from valid cache
  self._raw_data = None
@@ -213,6 +237,8 @@ class BaseComponent(ABC):

  def _save_to_cache(self) -> None:
  """Save preprocessed data to cache."""
+ from ..preprocessing.filtering import optimize_for_transfer, optimize_for_transfer_lazy
+
  # Create directories
  self._cache_dir.mkdir(parents=True, exist_ok=True)
  preprocessed_dir = self._get_preprocessed_dir()
@@ -231,28 +257,24 @@ class BaseComponent(ABC):
  "data_values": {},
  }

- # Save preprocessed data
- row_group_size = self._get_row_group_size()
+ # Save preprocessed data with type optimization for efficient transfer
+ # Float64→Float32 reduces Arrow payload size
+ # Int64→Int32 (when safe) avoids BigInt overhead in JavaScript
  for key, value in self._preprocessed_data.items():
  if isinstance(value, pl.LazyFrame):
  filename = f"{key}.parquet"
  filepath = preprocessed_dir / filename
- value.collect().write_parquet(
- filepath,
- compression='zstd',
- statistics=True,
- row_group_size=row_group_size,
- )
+ # Apply streaming-safe optimization (Float64→Float32 only)
+ # Int64 bounds checking would require collect(), breaking streaming
+ value = optimize_for_transfer_lazy(value)
+ value.sink_parquet(filepath, compression='zstd')
  manifest["data_files"][key] = filename
  elif isinstance(value, pl.DataFrame):
  filename = f"{key}.parquet"
  filepath = preprocessed_dir / filename
- value.write_parquet(
- filepath,
- compression='zstd',
- statistics=True,
- row_group_size=row_group_size,
- )
+ # Full optimization including Int64→Int32 with bounds checking
+ value = optimize_for_transfer(value)
+ value.write_parquet(filepath, compression='zstd')
  manifest["data_files"][key] = filename
  elif self._is_json_serializable(value):
  manifest["data_values"][key] = value
@@ -261,6 +283,13 @@ class BaseComponent(ABC):
  with open(self._get_manifest_path(), "w") as f:
  json.dump(manifest, f, indent=2)

+ # Release memory - data is now safely on disk
+ self._preprocessed_data = {}
+ self._raw_data = None
+
+ # Reload as lazy scan_parquet() references
+ self._load_from_cache()
+
  def _is_json_serializable(self, value: Any) -> bool:
  """Check if value can be JSON serialized."""
  try:
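_load_from_cache itself is outside this diff; given the manifest layout written above, it presumably restores data_values directly and rebuilds each data_files entry as a lazy scan, roughly (hypothetical reconstruction):

    import json
    import polars as pl

    def _load_from_cache(self) -> None:
        # Hypothetical sketch based on the manifest written in _save_to_cache
        with open(self._get_manifest_path()) as f:
            manifest = json.load(f)
        preprocessed_dir = self._get_preprocessed_dir()
        self._preprocessed_data = dict(manifest["data_values"])
        for key, filename in manifest["data_files"].items():
            # scan_parquet keeps each cached table lazy until render
            self._preprocessed_data[key] = pl.scan_parquet(preprocessed_dir / filename)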
@@ -0,0 +1,96 @@
+ """Subprocess-based preprocessing to ensure memory is released after cache creation.
+
+ When preprocessing large datasets (especially heatmaps with millions of points),
+ memory allocators like mimalloc retain freed memory. Running preprocessing in a
+ subprocess ensures all memory is returned to the OS when the subprocess exits.
+ """
+
+ import multiprocessing
+ import os
+ import traceback
+ from typing import Any, Dict, Type
+
+
+ def _preprocess_worker(
+ component_class: Type,
+ data_path: str,
+ kwargs: Dict[str, Any],
+ error_queue: multiprocessing.Queue,
+ ) -> None:
+ """Worker function that runs in subprocess to do preprocessing."""
+ try:
+ import polars as pl
+
+ # Set mimalloc to release memory aggressively (in case not inherited)
+ os.environ.setdefault("MIMALLOC_PURGE_DELAY", "0")
+
+ # Create component with data - this triggers preprocessing and cache save
+ data = pl.scan_parquet(data_path)
+ component_class(data=data, **kwargs)
+ # Subprocess exits here, releasing all memory
+ error_queue.put(None)
+ except Exception as e:
+ # Send exception info back to parent process
+ error_queue.put((type(e).__name__, str(e), traceback.format_exc()))
+
+
+ def preprocess_component(
+ component_class: Type,
+ data_path: str,
+ cache_id: str,
+ cache_path: str,
+ **kwargs,
+ ) -> None:
+ """
+ Run component preprocessing in a subprocess to guarantee memory release.
+
+ This is an internal function called by BaseComponent when data_path is
+ provided. Users should use the component constructor directly:
+
+ heatmap = Heatmap(
+ data_path="/path/to/data.parquet",
+ cache_id="my_heatmap",
+ cache_path="/path/to/cache",
+ x_column="rt",
+ y_column="mz",
+ intensity_column="intensity",
+ )
+
+ Args:
+ component_class: The component class (e.g., Heatmap, Table)
+ data_path: Path to the parquet file containing the data
+ cache_id: Unique identifier for the cache
+ cache_path: Directory for cache storage
+ **kwargs: Additional arguments passed to component constructor
+ """
+ # Prepare kwargs for subprocess
+ worker_kwargs = {
+ "cache_id": cache_id,
+ "cache_path": cache_path,
+ **kwargs,
+ }
+
+ # Use spawn to get a fresh process (fork might copy memory)
+ ctx = multiprocessing.get_context("spawn")
+ error_queue = ctx.Queue()
+ process = ctx.Process(
+ target=_preprocess_worker,
+ args=(component_class, data_path, worker_kwargs, error_queue),
+ )
+ process.start()
+ process.join()
+
+ # Check for errors from subprocess
+ if not error_queue.empty():
+ error_info = error_queue.get_nowait()
+ if error_info is not None:
+ exc_type, exc_msg, exc_tb = error_info
+ raise RuntimeError(
+ f"Subprocess preprocessing failed with {exc_type}: {exc_msg}\n"
+ f"Subprocess traceback:\n{exc_tb}"
+ )
+
+ if process.exitcode != 0:
+ raise RuntimeError(
+ f"Preprocessing failed with exit code {process.exitcode}"
+ )
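One constraint worth noting: the spawn context starts a fresh interpreter and pickles the worker arguments, so component_class must be importable at module top level and everything forwarded through **kwargs must be picklable; lambdas or locally defined callables passed as component options would fail at process.start().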