openms-insight 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openms_insight/components/heatmap.py +280 -137
- openms_insight/core/base.py +22 -9
- openms_insight/preprocessing/compression.py +55 -1
- {openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/METADATA +1 -1
- {openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/RECORD +7 -7
- {openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/WHEEL +0 -0
- {openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/licenses/LICENSE +0 -0
openms_insight/components/heatmap.py CHANGED
@@ -8,6 +8,7 @@ from ..core.base import BaseComponent
 from ..core.registry import register_component
 from ..preprocessing.compression import (
     compute_compression_levels,
+    compute_optimal_bins,
     downsample_2d,
     downsample_2d_simple,
     downsample_2d_streaming,
@@ -76,9 +77,10 @@ class Heatmap(BaseComponent):
         interactivity: Optional[Dict[str, str]] = None,
         cache_path: str = ".",
         regenerate_cache: bool = False,
-        min_points: int = …
-        …
+        min_points: int = 10000,
+        display_aspect_ratio: float = 16 / 9,
+        x_bins: Optional[int] = None,
+        y_bins: Optional[int] = None,
         zoom_identifier: str = "heatmap_zoom",
         title: Optional[str] = None,
         x_label: Optional[str] = None,
@@ -106,10 +108,17 @@ class Heatmap(BaseComponent):
             point's value in the corresponding column.
         cache_path: Base path for cache storage. Default "." (current dir).
         regenerate_cache: If True, regenerate cache even if valid cache exists.
-        min_points: Target …
-        …
+        min_points: Target number of points to display (default: 10000).
+            Cache levels are built at 2× this value; final downsample
+            at render time reduces to exactly min_points.
+        display_aspect_ratio: Expected display width/height ratio for
+            optimal bin computation during caching (default: 16/9).
+            At render time, the actual zoom region's aspect ratio is used.
+        x_bins: Number of bins along x-axis for downsampling. If None
+            (default), auto-computed from display_aspect_ratio such that
+            x_bins × y_bins ≈ 2×min_points with even spatial distribution.
+        y_bins: Number of bins along y-axis for downsampling. If None
+            (default), auto-computed from display_aspect_ratio.
         zoom_identifier: State key for storing zoom range (default: 'heatmap_zoom')
         title: Heatmap title displayed above the plot
         x_label: X-axis label (defaults to x_column)
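
Note on the 2× budget in the docstring above: cache levels keep twice the display budget so the render-time pass always has enough material to cut down to an exact count. A rough sketch of the arithmetic, illustrative only and not library code:

    min_points = 10_000              # points actually drawn
    cache_target = 2 * min_points    # coarsest cached level keeps a 2x buffer

    # Render time: the selected cache level is reduced to at most min_points,
    # so the drawn point count stays stable across zoom levels.
    def points_drawn(level_count: int) -> int:
        return min(level_count, min_points)

    assert points_drawn(cache_target) == 10_000  # buffered level -> exact budget
    assert points_drawn(3_000) == 3_000          # sparse region -> keep everything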
@@ -130,6 +139,7 @@ class Heatmap(BaseComponent):
         self._y_column = y_column
         self._intensity_column = intensity_column
         self._min_points = min_points
+        self._display_aspect_ratio = display_aspect_ratio
         self._x_bins = x_bins
         self._y_bins = y_bins
         self._zoom_identifier = zoom_identifier
@@ -155,6 +165,7 @@ class Heatmap(BaseComponent):
             y_column=y_column,
             intensity_column=intensity_column,
             min_points=min_points,
+            display_aspect_ratio=display_aspect_ratio,
             x_bins=x_bins,
             y_bins=y_bins,
             zoom_identifier=zoom_identifier,
@@ -180,6 +191,7 @@ class Heatmap(BaseComponent):
             "y_column": self._y_column,
             "intensity_column": self._intensity_column,
             "min_points": self._min_points,
+            "display_aspect_ratio": self._display_aspect_ratio,
             "x_bins": self._x_bins,
             "y_bins": self._y_bins,
             "use_simple_downsample": self._use_simple_downsample,
@@ -197,7 +209,10 @@ class Heatmap(BaseComponent):
         self._x_column = config.get("x_column")
         self._y_column = config.get("y_column")
         self._intensity_column = config.get("intensity_column", "intensity")
-        self._min_points = config.get("min_points", …)
+        self._min_points = config.get("min_points", 10000)
+        self._display_aspect_ratio = config.get("display_aspect_ratio", 16 / 9)
+        # x_bins/y_bins are computed during preprocessing and stored in cache
+        # Fallback to old defaults for backward compatibility with old caches
         self._x_bins = config.get("x_bins", 400)
         self._y_bins = config.get("y_bins", 50)
         self._use_simple_downsample = config.get("use_simple_downsample", False)
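
The backward-compatibility comment implies a simple round-trip: configs written by 0.1.4 carry the auto-computed bin counts, while configs from 0.1.3 caches lack the keys and fall back to the historical 400×50 grid. A small illustration with hypothetical config dicts:

    # Hypothetical cached configs, showing the from_config() fallback path:
    new_cache = {"min_points": 10000, "display_aspect_ratio": 16 / 9,
                 "x_bins": 188, "y_bins": 106}   # written by 0.1.4
    old_cache = {"min_points": 10000}            # written by 0.1.3

    for config in (new_cache, old_cache):
        x_bins = config.get("x_bins", 400)  # old default kept for old caches
        y_bins = config.get("y_bins", 50)
        print(x_bins, y_bins)               # 188 106, then 400 50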
@@ -242,14 +257,116 @@ class Heatmap(BaseComponent):
         else:
             self._preprocess_eager()
 
+    def _build_cascading_levels(
+        self,
+        source_data: pl.LazyFrame,
+        level_sizes: list,
+        x_range: tuple,
+        y_range: tuple,
+        cache_dir,
+        prefix: str = "level",
+    ) -> dict:
+        """
+        Build cascading compression levels from source data.
+
+        Each level is built from the previous larger level rather than from
+        raw data. This is efficient (raw data read once) and produces identical
+        results because the downsampling keeps top N highest-intensity points
+        per bin - points surviving at larger levels will also be selected at
+        smaller levels.
+
+        Args:
+            source_data: LazyFrame with raw/filtered data
+            level_sizes: List of target sizes for compressed levels (smallest first)
+            x_range: (x_min, x_max) for consistent bin boundaries
+            y_range: (y_min, y_max) for consistent bin boundaries
+            cache_dir: Path to save parquet files
+            prefix: Filename prefix (e.g., "level" or "cat_level_im_0")
+
+        Returns:
+            Dict with level LazyFrames keyed by "{prefix}_{idx}" and "num_levels"
+        """
+        import sys
+
+        result = {}
+        num_compressed = len(level_sizes)
+
+        # Get total count
+        total = source_data.select(pl.len()).collect().item()
+
+        # First: save full resolution as the largest level
+        full_res_path = cache_dir / f"{prefix}_{num_compressed}.parquet"
+        full_res = source_data.sort([self._x_column, self._y_column])
+        full_res.sink_parquet(full_res_path, compression="zstd")
+        print(
+            f"[HEATMAP] Saved {prefix}_{num_compressed} ({total:,} pts)",
+            file=sys.stderr,
+        )
+
+        # Start cascading from full resolution
+        current_source = pl.scan_parquet(full_res_path)
+        current_size = total
+
+        # Build compressed levels from largest to smallest
+        for i, target_size in enumerate(reversed(level_sizes)):
+            level_idx = num_compressed - 1 - i
+            level_path = cache_dir / f"{prefix}_{level_idx}.parquet"
+
+            # If target size equals or exceeds current, just copy reference
+            if target_size >= current_size:
+                level = current_source
+            elif self._use_simple_downsample:
+                level = downsample_2d_simple(
+                    current_source,
+                    max_points=target_size,
+                    intensity_column=self._intensity_column,
+                )
+            else:
+                level = downsample_2d_streaming(
+                    current_source,
+                    max_points=target_size,
+                    x_column=self._x_column,
+                    y_column=self._y_column,
+                    intensity_column=self._intensity_column,
+                    x_bins=self._x_bins,
+                    y_bins=self._y_bins,
+                    x_range=x_range,
+                    y_range=y_range,
+                )
+
+            # Sort and save immediately
+            level = level.sort([self._x_column, self._y_column])
+            level.sink_parquet(level_path, compression="zstd")
+
+            print(
+                f"[HEATMAP] Saved {prefix}_{level_idx} (target {target_size:,} pts)",
+                file=sys.stderr,
+            )
+
+            # Next iteration uses this level as source (cascading)
+            current_source = pl.scan_parquet(level_path)
+            current_size = target_size
+
+        # Load all levels back as LazyFrames
+        for i in range(num_compressed + 1):
+            level_path = cache_dir / f"{prefix}_{i}.parquet"
+            result[f"{prefix}_{i}"] = pl.scan_parquet(level_path)
+
+        result["num_levels"] = num_compressed + 1
+
+        return result
+
     def _preprocess_with_categorical_filters(self) -> None:
         """
-        Preprocess with per-filter-value compression levels.
+        Preprocess with per-filter-value compression levels using cascading.
 
         For each unique value of each categorical filter, creates separate
-        compression levels …
-        …
-        filter value selected.
+        compression levels using cascading (building smaller levels from larger).
+        This ensures that when a filter is applied at render time, the resulting
+        data has ~min_points regardless of the filter value selected.
+
+        Uses cascading downsampling for efficiency - each level is built from
+        the previous larger level rather than from raw data.
 
         Data is sorted by x, y columns for efficient range query predicate pushdown.
 
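
The correctness claim in the docstring (cascading equals a single pass) holds because top-N-by-intensity selection per bin is stable under composition when the bin grid stays fixed, which the method guarantees by passing the same x_range/y_range to every level. A self-contained check of that invariant in plain Python, simplified to a per-bin quota instead of a global point budget:

    from collections import defaultdict

    def top_k_per_bin(points, k):
        # points: (bin_id, intensity) pairs; keep the k most intense per bin
        bins = defaultdict(list)
        for p in points:
            bins[p[0]].append(p)
        kept = []
        for members in bins.values():
            kept.extend(sorted(members, key=lambda p: p[1], reverse=True)[:k])
        return sorted(kept)

    points = [(b, i) for b in range(4) for i in range(100)]
    direct = top_k_per_bin(points, 5)                       # straight from raw data
    cascaded = top_k_per_bin(top_k_per_bin(points, 50), 5)  # via a larger level
    assert direct == cascaded  # survivors at k=5 also survived at k=50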
@@ -261,6 +378,7 @@ class Heatmap(BaseComponent):
         import sys
 
         # Get data ranges (for the full dataset)
+        # These ranges are used for ALL levels to ensure consistent binning
         x_range, y_range = get_data_range(
             self._raw_data,
             self._x_column,
@@ -269,10 +387,31 @@ class Heatmap(BaseComponent):
         self._preprocessed_data["x_range"] = x_range
         self._preprocessed_data["y_range"] = y_range
 
+        # Compute optimal bins if not provided
+        # Cache at 2×min_points, use display_aspect_ratio for bin computation
+        cache_target = 2 * self._min_points
+        if self._x_bins is None or self._y_bins is None:
+            # Use display aspect ratio (not data aspect ratio) for optimal bins
+            self._x_bins, self._y_bins = compute_optimal_bins(
+                cache_target,
+                (0, self._display_aspect_ratio),  # Fake x_range matching aspect
+                (0, 1.0),  # Fake y_range
+            )
+            print(
+                f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+                f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+                f"display aspect: {self._display_aspect_ratio:.2f})",
+                file=sys.stderr,
+            )
+
         # Get total count
         total = self._raw_data.select(pl.len()).collect().item()
         self._preprocessed_data["total"] = total
 
+        # Create cache directory for immediate level saving
+        cache_dir = self._cache_dir / "preprocessed"
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
         # Store metadata about categorical filters
         self._preprocessed_data["has_categorical_filters"] = True
         self._preprocessed_data["categorical_filter_values"] = {}
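
The synthetic ranges passed to compute_optimal_bins deserve a note: the function only looks at the ratio of the spans, so a unit-height box whose width equals the display aspect makes the grid match the screen rather than the data's units. With the defaults, the call resolves to roughly:

    # Illustrative call; the values follow from the defaults above.
    x_bins, y_bins = compute_optimal_bins(
        20_000,        # cache_target = 2 * min_points
        (0, 16 / 9),   # fake x_range: span equals the display aspect ratio
        (0, 1.0),      # fake y_range: span of 1
    )
    # aspect 16/9 -> x_bins = 188, y_bins = 106; 188 * 106 = 19,928 ≈ 20,000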
@@ -309,7 +448,7 @@ class Heatmap(BaseComponent):
             unique_values
         )
 
-        # Create compression levels for each filter value
+        # Create compression levels for each filter value using cascading
        for filter_value in unique_values:
             # Filter data to this value
             filtered_data = self._raw_data.filter(
@@ -317,9 +456,9 @@ class Heatmap(BaseComponent):
             )
             filtered_total = filtered_data.select(pl.len()).collect().item()
 
-            # Compute level sizes for this filtered subset
+            # Compute level sizes for this filtered subset (2× for cache buffer)
             level_sizes = compute_compression_levels(
-                …
+                cache_target, filtered_total
             )
 
             print(
@@ -332,94 +471,71 @@ class Heatmap(BaseComponent):
                 f"cat_level_sizes_{filter_id}_{filter_value}"
             ] = level_sizes
 
-            # Build …
-            …
-                    intensity_column=self._intensity_column,
-                )
-            else:
-                level = downsample_2d_streaming(
-                    filtered_data,
-                    max_points=target_size,
-                    x_column=self._x_column,
-                    y_column=self._y_column,
-                    intensity_column=self._intensity_column,
-                    x_bins=self._x_bins,
-                    y_bins=self._y_bins,
-                    x_range=x_range,
-                    y_range=y_range,
-                )
-
-            # Sort by x, y for efficient range query predicate pushdown
-            level = level.sort([self._x_column, self._y_column])
-            # Store LazyFrame for streaming to disk
-            level_key = f"cat_level_{filter_id}_{filter_value}_{level_idx}"
-            self._preprocessed_data[level_key] = level  # Keep lazy
-
-            # Add full resolution as final level (for zoom fallback)
-            # Also sorted for consistent predicate pushdown behavior
-            num_compressed = len(level_sizes)
-            full_res_key = f"cat_level_{filter_id}_{filter_value}_{num_compressed}"
-            self._preprocessed_data[full_res_key] = filtered_data.sort(
-                [self._x_column, self._y_column]
+            # Build cascading levels using helper
+            prefix = f"cat_level_{filter_id}_{filter_value}"
+            levels_result = self._build_cascading_levels(
+                source_data=filtered_data,
+                level_sizes=level_sizes,
+                x_range=x_range,
+                y_range=y_range,
+                cache_dir=cache_dir,
+                prefix=prefix,
             )
-            …
+
+            # Copy results to preprocessed_data
+            for key, value in levels_result.items():
+                if key == "num_levels":
+                    self._preprocessed_data[
+                        f"cat_num_levels_{filter_id}_{filter_value}"
+                    ] = value
+                else:
+                    self._preprocessed_data[key] = value
 
         # Also create global levels for when no categorical filter is selected
-        # (fallback to standard behavior)
-        level_sizes = compute_compression_levels(…)
+        # (fallback to standard behavior) - using cascading with 2× cache buffer
+        level_sizes = compute_compression_levels(cache_target, total)
         self._preprocessed_data["level_sizes"] = level_sizes
 
-        …
+        # Build global cascading levels using helper
+        levels_result = self._build_cascading_levels(
+            source_data=self._raw_data,
+            level_sizes=level_sizes,
+            x_range=x_range,
+            y_range=y_range,
+            cache_dir=cache_dir,
+            prefix="level",
+        )
+
+        # Copy results to preprocessed_data
+        for key, value in levels_result.items():
+            if key == "num_levels":
+                self._preprocessed_data["num_levels"] = value
             else:
-                …
-                    self._raw_data,
-                    max_points=size,
-                    x_column=self._x_column,
-                    y_column=self._y_column,
-                    intensity_column=self._intensity_column,
-                    x_bins=self._x_bins,
-                    y_bins=self._y_bins,
-                    x_range=x_range,
-                    y_range=y_range,
-                )
-            # Sort by x, y for efficient range query predicate pushdown
-            level = level.sort([self._x_column, self._y_column])
-            self._preprocessed_data[f"level_{i}"] = level  # Keep lazy
+                self._preprocessed_data[key] = value
 
-        # …
-        num_compressed = len(level_sizes)
-        self._preprocessed_data[f"level_{num_compressed}"] = self._raw_data.sort(
-            [self._x_column, self._y_column]
-        )
-        …
+        # Mark that files are already saved
+        self._preprocessed_data["_files_already_saved"] = True
 
     def _preprocess_streaming(self) -> None:
         """
-        Streaming preprocessing - …
+        Streaming preprocessing with cascading - builds smaller levels from larger.
+
+        Uses cascading downsampling: each level is built from the previous larger
+        level rather than from raw data. This is more efficient (raw data read once)
+        and produces identical results because the downsampling algorithm keeps
+        the TOP N highest-intensity points per bin - points that survive at a larger
+        level will also be selected at smaller levels.
+
+        Levels are saved to disk immediately after creation, then read back as the
+        source for the next smaller level. This keeps memory low while enabling
+        cascading.
 
-        Builds lazy query plans that are streamed to disk via sink_parquet().
         Data is sorted by x, y columns for efficient range query predicate pushdown.
         """
+        import sys
+
         # Get data ranges (minimal collect - just 4 values)
+        # These ranges are used for ALL levels to ensure consistent binning
         x_range, y_range = get_data_range(
             self._raw_data,
             self._x_column,
@@ -428,55 +544,55 @@ class Heatmap(BaseComponent):
         self._preprocessed_data["x_range"] = x_range
         self._preprocessed_data["y_range"] = y_range
 
+        # Compute optimal bins if not provided
+        # Cache at 2×min_points, use display_aspect_ratio for bin computation
+        cache_target = 2 * self._min_points
+        if self._x_bins is None or self._y_bins is None:
+            # Use display aspect ratio (not data aspect ratio) for optimal bins
+            # This ensures even distribution in the expected display dimensions
+            self._x_bins, self._y_bins = compute_optimal_bins(
+                cache_target,
+                (0, self._display_aspect_ratio),  # Fake x_range matching aspect
+                (0, 1.0),  # Fake y_range
+            )
+            print(
+                f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+                f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+                f"display aspect: {self._display_aspect_ratio:.2f})",
+                file=sys.stderr,
+            )
+
         # Get total count
         total = self._raw_data.select(pl.len()).collect().item()
         self._preprocessed_data["total"] = total
 
-        # Compute target sizes for levels
-        level_sizes = compute_compression_levels(…)
+        # Compute target sizes for levels (use 2×min_points for smallest cache level)
+        level_sizes = compute_compression_levels(cache_target, total)
         self._preprocessed_data["level_sizes"] = level_sizes
 
-        # …
-        self.…
+        # Create cache directory for immediate level saving
+        cache_dir = self._cache_dir / "preprocessed"
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        # Build cascading levels using helper
+        levels_result = self._build_cascading_levels(
+            source_data=self._raw_data,
+            level_sizes=level_sizes,
+            x_range=x_range,
+            y_range=y_range,
+            cache_dir=cache_dir,
+            prefix="level",
+        )
 
-        …
-        …
-            if …
-                …
-            elif self._use_simple_downsample:
-                level = downsample_2d_simple(
-                    self._raw_data,
-                    max_points=size,
-                    intensity_column=self._intensity_column,
-                )
+        # Copy results to preprocessed_data
+        for key, value in levels_result.items():
+            if key == "num_levels":
+                self._preprocessed_data["num_levels"] = value
            else:
-                …
-                    self._raw_data,
-                    max_points=size,
-                    x_column=self._x_column,
-                    y_column=self._y_column,
-                    intensity_column=self._intensity_column,
-                    x_bins=self._x_bins,
-                    y_bins=self._y_bins,
-                    x_range=x_range,
-                    y_range=y_range,
-                )
-            # Sort by x, y for efficient range query predicate pushdown
-            # This clusters spatially close points together in row groups
-            level = level.sort([self._x_column, self._y_column])
-            # Store LazyFrame for streaming to disk
-            # Base class will use sink_parquet() to stream without full materialization
-            self._preprocessed_data[f"level_{i}"] = level  # Keep lazy
+                self._preprocessed_data[key] = value
 
-        # …
-        num_compressed = len(level_sizes)
-        self._preprocessed_data[f"level_{num_compressed}"] = self._raw_data.sort(
-            [self._x_column, self._y_column]
-        )
-
-        # Store number of levels for reconstruction (includes full resolution)
-        self._preprocessed_data["num_levels"] = num_compressed + 1
+        # Mark that files are already saved (base class should skip saving)
+        self._preprocessed_data["_files_already_saved"] = True
 
     def _preprocess_eager(self) -> None:
         """
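
The save-then-rescan pattern this docstring describes maps onto a short polars loop. A condensed sketch of the cascade (the shrink callable stands in for downsample_2d_streaming, cache_dir is a pathlib.Path, and sizes are smallest-first as produced by compute_compression_levels):

    import polars as pl

    def cascade(source: pl.LazyFrame, sizes: list, cache_dir, shrink) -> None:
        # Write level_N (full resolution) first, then derive each smaller
        # level from the previous one instead of from the raw data.
        n = len(sizes)
        source.sink_parquet(cache_dir / f"level_{n}.parquet")
        current = pl.scan_parquet(cache_dir / f"level_{n}.parquet")
        for i, target in enumerate(reversed(sizes)):
            idx = n - 1 - i
            shrink(current, target).sink_parquet(cache_dir / f"level_{idx}.parquet")
            # Re-open lazily so the next pass streams from disk, not memory
            current = pl.scan_parquet(cache_dir / f"level_{idx}.parquet")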
@@ -486,6 +602,8 @@ class Heatmap(BaseComponent):
         downsampling for better spatial distribution.
         Data is sorted by x, y columns for efficient range query predicate pushdown.
         """
+        import sys
+
         # Get data ranges
         x_range, y_range = get_data_range(
             self._raw_data,
@@ -495,12 +613,29 @@ class Heatmap(BaseComponent):
         self._preprocessed_data["x_range"] = x_range
         self._preprocessed_data["y_range"] = y_range
 
+        # Compute optimal bins if not provided
+        # Cache at 2×min_points, use display_aspect_ratio for bin computation
+        cache_target = 2 * self._min_points
+        if self._x_bins is None or self._y_bins is None:
+            # Use display aspect ratio (not data aspect ratio) for optimal bins
+            self._x_bins, self._y_bins = compute_optimal_bins(
+                cache_target,
+                (0, self._display_aspect_ratio),  # Fake x_range matching aspect
+                (0, 1.0),  # Fake y_range
+            )
+            print(
+                f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+                f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+                f"display aspect: {self._display_aspect_ratio:.2f})",
+                file=sys.stderr,
+            )
+
         # Get total count
         total = self._raw_data.select(pl.len()).collect().item()
         self._preprocessed_data["total"] = total
 
-        # Compute compression level target sizes
-        level_sizes = compute_compression_levels(…)
+        # Compute compression level target sizes (2× for cache buffer)
+        level_sizes = compute_compression_levels(cache_target, total)
         self._preprocessed_data["level_sizes"] = level_sizes
 
         # Build levels from largest to smallest
@@ -736,10 +871,18 @@ class Heatmap(BaseComponent):
         if count >= self._min_points:
             # This level has enough detail
             if count > self._min_points:
-                # Over limit - downsample to …
-                # …
+                # Over limit - downsample to exactly min_points
+                # Compute optimal bins from ACTUAL zoom region aspect ratio
                 zoom_x_range = (x0, x1)
                 zoom_y_range = (y0, y1)
+                render_x_bins, render_y_bins = compute_optimal_bins(
+                    self._min_points, zoom_x_range, zoom_y_range
+                )
+                print(
+                    f"[HEATMAP] Render downsample: {count:,} → {self._min_points:,} pts "
+                    f"(bins: {render_x_bins}x{render_y_bins})",
+                    file=sys.stderr,
+                )
                 if self._use_streaming or self._use_simple_downsample:
                     if self._use_simple_downsample:
                         return downsample_2d_simple(
@@ -754,8 +897,8 @@ class Heatmap(BaseComponent):
                             x_column=self._x_column,
                             y_column=self._y_column,
                             intensity_column=self._intensity_column,
-                            x_bins=…,
-                            y_bins=…,
+                            x_bins=render_x_bins,
+                            y_bins=render_y_bins,
                             x_range=zoom_x_range,
                             y_range=zoom_y_range,
                         ).collect()
@@ -766,8 +909,8 @@ class Heatmap(BaseComponent):
                     x_column=self._x_column,
                     y_column=self._y_column,
                     intensity_column=self._intensity_column,
-                    x_bins=…,
-                    y_bins=…,
+                    x_bins=render_x_bins,
+                    y_bins=render_y_bins,
                 ).collect()
                 return filtered
 
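
At render time the bin grid tracks the zoom window's real shape instead of the 16:9 assumption baked into the cache, so a wide selection gets a wide grid and a tall selection a tall one. A worked example with hypothetical zoom bounds:

    # Hypothetical zoom region: x-span 400, y-span 50 -> aspect ratio 8:1
    zoom_x_range = (100.0, 500.0)
    zoom_y_range = (0.0, 50.0)
    x_bins, y_bins = compute_optimal_bins(10_000, zoom_x_range, zoom_y_range)
    # y_bins = int(sqrt(10000 / 8)) = 35, x_bins = int(sqrt(10000 * 8)) = 282
    assert (x_bins, y_bins) == (282, 35)
    assert x_bins * y_bins <= 10_000  # truncation keeps the grid under budget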
openms_insight/core/base.py CHANGED
@@ -318,6 +318,9 @@ class BaseComponent(ABC):
             "data_values": {},
         }
 
+        # Check if files were already saved during preprocessing (e.g., cascading)
+        files_already_saved = self._preprocessed_data.pop("_files_already_saved", False)
+
         # Save preprocessed data with type optimization for efficient transfer
         # Float64→Float32 reduces Arrow payload size
         # Int64→Int32 (when safe) avoids BigInt overhead in JavaScript
@@ -325,18 +328,28 @@ class BaseComponent(ABC):
             if isinstance(value, pl.LazyFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                …
+
+                if files_already_saved and filepath.exists():
+                    # File was saved during preprocessing (cascading) - just register it
+                    manifest["data_files"][key] = filename
+                else:
+                    # Apply streaming-safe optimization (Float64→Float32 only)
+                    # Int64 bounds checking would require collect(), breaking streaming
+                    value = optimize_for_transfer_lazy(value)
+                    value.sink_parquet(filepath, compression="zstd")
+                    manifest["data_files"][key] = filename
             elif isinstance(value, pl.DataFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                …
+
+                if files_already_saved and filepath.exists():
+                    # File was saved during preprocessing - just register it
+                    manifest["data_files"][key] = filename
+                else:
+                    # Full optimization including Int64→Int32 with bounds checking
+                    value = optimize_for_transfer(value)
+                    value.write_parquet(filepath, compression="zstd")
+                    manifest["data_files"][key] = filename
             elif self._is_json_serializable(value):
                 manifest["data_values"][key] = value
 
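
The handshake between preprocessing and saving is a single sentinel key that pop() removes before serialization, so it never leaks into the manifest. A minimal sketch of the protocol, with the optimize/sink steps elided (illustrative, not library code):

    from pathlib import Path

    def save_preprocessed(preprocessed: dict, out_dir: Path) -> dict:
        manifest = {"data_files": {}, "data_values": {}}
        # pop() keeps the sentinel out of the saved payload
        already_saved = preprocessed.pop("_files_already_saved", False)
        for key, value in preprocessed.items():
            path = out_dir / f"{key}.parquet"
            if already_saved and path.exists():
                manifest["data_files"][key] = path.name  # register, don't rewrite
            else:
                ...  # optimize and sink/write the frame as in the diff above
                manifest["data_files"][key] = path.name
        return manifest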
openms_insight/preprocessing/compression.py CHANGED
@@ -6,7 +6,8 @@ data, enabling efficient visualization of datasets with millions of points.
 Supports both streaming (lazy) and eager downsampling approaches.
 """
 
-…
+import math
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import polars as pl
@@ -19,6 +20,59 @@ except ImportError:
     HAS_SCIPY = False
 
 
+def compute_optimal_bins(
+    target_points: int,
+    x_range: Tuple[float, float],
+    y_range: Tuple[float, float],
+) -> Tuple[int, int]:
+    """
+    Compute optimal x_bins, y_bins for even spatial distribution.
+
+    The bin grid matches the data's aspect ratio so bins are approximately
+    square in data space. Total bins ≈ target_points for 1 point per bin.
+
+    Solves the system:
+        x_bins × y_bins = target_points
+        x_bins / y_bins = aspect_ratio
+
+    Solution:
+        y_bins = sqrt(target_points / aspect_ratio)
+        x_bins = sqrt(target_points × aspect_ratio)
+
+    Args:
+        target_points: Target number of bins (and thus max points with 1 per bin)
+        x_range: (x_min, x_max) data range
+        y_range: (y_min, y_max) data range
+
+    Returns:
+        (x_bins, y_bins) tuple
+
+    Examples:
+        >>> compute_optimal_bins(10000, (0, 1000), (0, 100))  # 10:1 aspect
+        (316, 31)
+        >>> compute_optimal_bins(10000, (0, 100), (0, 100))  # 1:1 aspect
+        (100, 100)
+    """
+    x_span = x_range[1] - x_range[0]
+    y_span = y_range[1] - y_range[0]
+
+    # Handle edge cases
+    if y_span < 1e-10:
+        y_span = x_span if x_span > 1e-10 else 1.0
+    if x_span < 1e-10:
+        x_span = y_span
+
+    aspect_ratio = x_span / y_span
+
+    # Clamp to reasonable bounds (avoid extreme rectangles)
+    aspect_ratio = max(0.05, min(20.0, aspect_ratio))
+
+    y_bins = max(1, int(math.sqrt(target_points / aspect_ratio)))
+    x_bins = max(1, int(math.sqrt(target_points * aspect_ratio)))
+
+    return x_bins, y_bins
+
+
 def compute_compression_levels(min_size: int, total: int) -> List[int]:
     """
     Compute logarithmically-spaced compression level target sizes.
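
The closed-form in the docstring can be sanity-checked quickly, including the aspect clamp; a condensed re-statement for the check (illustrative, mirrors the function above with the degenerate-span handling omitted):

    import math

    def optimal_bins(target, x_range, y_range):
        x_span, y_span = x_range[1] - x_range[0], y_range[1] - y_range[0]
        aspect = max(0.05, min(20.0, x_span / y_span))
        return (max(1, int(math.sqrt(target * aspect))),
                max(1, int(math.sqrt(target / aspect))))

    assert optimal_bins(10_000, (0, 1000), (0, 100)) == (316, 31)   # 10:1
    assert optimal_bins(10_000, (0, 100), (0, 100)) == (100, 100)   # 1:1
    # A 1000:1 span ratio is clamped to 20:1, keeping the grid from
    # degenerating into a single row of bins:
    assert optimal_bins(10_000, (0, 1e6), (0, 1e3)) == (447, 22)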
{openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openms-insight
-Version: 0.1.3
+Version: 0.1.4
 Summary: Interactive visualization components for mass spectrometry data in Streamlit
 Project-URL: Homepage, https://github.com/t0mdavid-m/OpenMS-Insight
 Project-URL: Documentation, https://github.com/t0mdavid-m/OpenMS-Insight#readme
{openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/RECORD CHANGED
@@ -1,17 +1,17 @@
 openms_insight/__init__.py,sha256=Iv9w0J_7J3pMsyvM4xaYDMWt6IvrtAt6WqOmJ-_tUxk,1044
 openms_insight/components/__init__.py,sha256=T9mUxfgFUiHILmXh1VjcGVlnRvuxRMqi_GJJYOmJKwY,177
-openms_insight/components/heatmap.py,sha256=…
+openms_insight/components/heatmap.py,sha256=LigtpPbAPQpfjFljMWoEPPAc3t27Bl1ekr5uaR1Ctuk,44090
 openms_insight/components/lineplot.py,sha256=I-JPvDzCr3Nu8Boc1V4D8QQ1bHgTqvM6CbeoIe7zJ-s,30896
 openms_insight/components/sequenceview.py,sha256=0pDOE0xeoc1-85QZNGdNwwoBwXi-5MFfeb9pCcOi6rc,30274
 openms_insight/components/table.py,sha256=wmq1rjGVe4Ef0SAf5p85pfVCeyLlVevZnxBc9EIg2uk,16458
 openms_insight/core/__init__.py,sha256=EPjKX_FFQRgO8mWHs59I-o0BiuzEMzEU1Pfu9YOfLC4,338
-openms_insight/core/base.py,sha256=…
+openms_insight/core/base.py,sha256=P2cOrPvPIzxfYQ7xMn9e0BlyKEMrhOCgD9FAtyxTiCc,19408
 openms_insight/core/cache.py,sha256=3fnPDWjuWUnxazK2XflcUIeRZZPQ3N45kAKYu-xGBKw,1197
 openms_insight/core/registry.py,sha256=Hak80Jqhx0qa4gbd1YolNZnM6xBrS8I4U_X7zC0bQ8Y,2108
 openms_insight/core/state.py,sha256=_vNYxYHYFgIigbkqYwkIO6cBGFJyF2VN9dr7CBEAQbY,6873
 openms_insight/core/subprocess_preprocess.py,sha256=m9FbAAFy9Do1Exlh-m4Wo-LDwv6yHlEI4klz5OVwemc,3133
 openms_insight/preprocessing/__init__.py,sha256=hXKTI9zHtMtHojqXq_0V62xfNokozpnpRAwEnxs81fM,461
-openms_insight/preprocessing/compression.py,sha256=…
+openms_insight/preprocessing/compression.py,sha256=T4YbX9PUlfTfPit_kpuLZn8hYpqLYu3xtTme_CG2ymc,12241
 openms_insight/preprocessing/filtering.py,sha256=fkmaIXfR5hfjyWfaMYqaeybMHaZjvUZYaKCqvxPOWMQ,14152
 openms_insight/rendering/__init__.py,sha256=ApHvKeh87yY4GTIEai-tCeIXpNbwOXWlmcmIwMMRZYc,198
 openms_insight/rendering/bridge.py,sha256=i8cZq_ra13XpuV1KT0qC6Jf4VCAe4BGrLE-ybrFHwZE,19408
@@ -22,7 +22,7 @@ openms_insight/js-component/dist/assets/materialdesignicons-webfont.eot,sha256=C
 openms_insight/js-component/dist/assets/materialdesignicons-webfont.ttf,sha256=YeirpaTpgf4iz3yOi82-oAR251xiw38Bv37jM2HWhCg,1307660
 openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff,sha256=pZKKDVwvYk5G-Y2bFcL2AEU3f3xZTdeKF1kTLqO0Y-s,587984
 openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff2,sha256=Zi_vqPL4qVwYWI0hd0eJwQfGTnccvmWmmvRikcQxGvw,403216
-openms_insight-0.1.3.dist-info/METADATA,sha256=…
-openms_insight-0.1.3.dist-info/WHEEL,sha256=…
-openms_insight-0.1.3.dist-info/licenses/LICENSE,sha256=…
-openms_insight-0.1.3.dist-info/RECORD,,
+openms_insight-0.1.4.dist-info/METADATA,sha256=_c_eGoMj7wCxAWE5CHC6T2Emri6DEZRXwrQJ-RNptrI,12807
+openms_insight-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+openms_insight-0.1.4.dist-info/licenses/LICENSE,sha256=INFF4rOMmpah7Oi14hLqu7NTOsx56KRRNChAAUcfh2E,1823
+openms_insight-0.1.4.dist-info/RECORD,,
{openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/WHEEL
File without changes
{openms_insight-0.1.3.dist-info → openms_insight-0.1.4.dist-info}/licenses/LICENSE
File without changes