openms-insight 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openms_insight/components/heatmap.py

@@ -8,6 +8,7 @@ from ..core.base import BaseComponent
 from ..core.registry import register_component
 from ..preprocessing.compression import (
     compute_compression_levels,
+    compute_optimal_bins,
     downsample_2d,
     downsample_2d_simple,
     downsample_2d_streaming,
@@ -76,9 +77,10 @@ class Heatmap(BaseComponent):
         interactivity: Optional[Dict[str, str]] = None,
         cache_path: str = ".",
         regenerate_cache: bool = False,
-        min_points: int = 20000,
-        x_bins: int = 400,
-        y_bins: int = 50,
+        min_points: int = 10000,
+        display_aspect_ratio: float = 16 / 9,
+        x_bins: Optional[int] = None,
+        y_bins: Optional[int] = None,
         zoom_identifier: str = "heatmap_zoom",
         title: Optional[str] = None,
         x_label: Optional[str] = None,
@@ -106,10 +108,17 @@ class Heatmap(BaseComponent):
                 point's value in the corresponding column.
             cache_path: Base path for cache storage. Default "." (current dir).
             regenerate_cache: If True, regenerate cache even if valid cache exists.
-            min_points: Target size for smallest compression level and
-                threshold for level selection (default: 20000)
-            x_bins: Number of bins along x-axis for downsampling (default: 400)
-            y_bins: Number of bins along y-axis for downsampling (default: 50)
+            min_points: Target number of points to display (default: 10000).
+                Cache levels are built at 2× this value; final downsample
+                at render time reduces to exactly min_points.
+            display_aspect_ratio: Expected display width/height ratio for
+                optimal bin computation during caching (default: 16/9).
+                At render time, the actual zoom region's aspect ratio is used.
+            x_bins: Number of bins along x-axis for downsampling. If None
+                (default), auto-computed from display_aspect_ratio such that
+                x_bins × y_bins ≈ 2×min_points with even spatial distribution.
+            y_bins: Number of bins along y-axis for downsampling. If None
+                (default), auto-computed from display_aspect_ratio.
             zoom_identifier: State key for storing zoom range (default: 'heatmap_zoom')
             title: Heatmap title displayed above the plot
             x_label: X-axis label (defaults to x_column)
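The interplay of the new defaults is easier to see with concrete numbers. A minimal sketch, assuming only the documented behavior above (min_points=10000, display_aspect_ratio=16/9, cache levels built at 2× min_points); the numbers are illustrative, not output from the package:

```python
import math

# Illustrative only, derived from the documented defaults.
min_points = 10_000            # points shown after the render-time downsample
display_aspect_ratio = 16 / 9  # expected display width/height
cache_target = 2 * min_points  # cache levels hold a 2x buffer

# Bin grid shaped to the display aspect ratio (see compute_optimal_bins
# later in this diff): x_bins * y_bins ~= cache_target and
# x_bins / y_bins ~= aspect ratio.
y_bins = int(math.sqrt(cache_target / display_aspect_ratio))  # 106
x_bins = int(math.sqrt(cache_target * display_aspect_ratio))  # 188

print(x_bins, y_bins, x_bins * y_bins)  # 188 106 19928 (~= 20000)
```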
@@ -130,6 +139,7 @@ class Heatmap(BaseComponent):
         self._y_column = y_column
         self._intensity_column = intensity_column
         self._min_points = min_points
+        self._display_aspect_ratio = display_aspect_ratio
         self._x_bins = x_bins
         self._y_bins = y_bins
         self._zoom_identifier = zoom_identifier
@@ -155,6 +165,7 @@ class Heatmap(BaseComponent):
             y_column=y_column,
             intensity_column=intensity_column,
             min_points=min_points,
+            display_aspect_ratio=display_aspect_ratio,
             x_bins=x_bins,
             y_bins=y_bins,
             zoom_identifier=zoom_identifier,
@@ -180,6 +191,7 @@ class Heatmap(BaseComponent):
             "y_column": self._y_column,
             "intensity_column": self._intensity_column,
             "min_points": self._min_points,
+            "display_aspect_ratio": self._display_aspect_ratio,
             "x_bins": self._x_bins,
             "y_bins": self._y_bins,
             "use_simple_downsample": self._use_simple_downsample,
@@ -197,7 +209,10 @@ class Heatmap(BaseComponent):
         self._x_column = config.get("x_column")
         self._y_column = config.get("y_column")
         self._intensity_column = config.get("intensity_column", "intensity")
-        self._min_points = config.get("min_points", 20000)
+        self._min_points = config.get("min_points", 10000)
+        self._display_aspect_ratio = config.get("display_aspect_ratio", 16 / 9)
+        # x_bins/y_bins are computed during preprocessing and stored in cache
+        # Fallback to old defaults for backward compatibility with old caches
         self._x_bins = config.get("x_bins", 400)
         self._y_bins = config.get("y_bins", 50)
         self._use_simple_downsample = config.get("use_simple_downsample", False)
@@ -242,14 +257,116 @@ class Heatmap(BaseComponent):
         else:
             self._preprocess_eager()

+    def _build_cascading_levels(
+        self,
+        source_data: pl.LazyFrame,
+        level_sizes: list,
+        x_range: tuple,
+        y_range: tuple,
+        cache_dir,
+        prefix: str = "level",
+    ) -> dict:
+        """
+        Build cascading compression levels from source data.
+
+        Each level is built from the previous larger level rather than from
+        raw data. This is efficient (raw data read once) and produces identical
+        results because the downsampling keeps top N highest-intensity points
+        per bin - points surviving at larger levels will also be selected at
+        smaller levels.
+
+        Args:
+            source_data: LazyFrame with raw/filtered data
+            level_sizes: List of target sizes for compressed levels (smallest first)
+            x_range: (x_min, x_max) for consistent bin boundaries
+            y_range: (y_min, y_max) for consistent bin boundaries
+            cache_dir: Path to save parquet files
+            prefix: Filename prefix (e.g., "level" or "cat_level_im_0")
+
+        Returns:
+            Dict with level LazyFrames keyed by "{prefix}_{idx}" and "num_levels"
+        """
+        import sys
+
+        result = {}
+        num_compressed = len(level_sizes)
+
+        # Get total count
+        total = source_data.select(pl.len()).collect().item()
+
+        # First: save full resolution as the largest level
+        full_res_path = cache_dir / f"{prefix}_{num_compressed}.parquet"
+        full_res = source_data.sort([self._x_column, self._y_column])
+        full_res.sink_parquet(full_res_path, compression="zstd")
+        print(
+            f"[HEATMAP] Saved {prefix}_{num_compressed} ({total:,} pts)",
+            file=sys.stderr,
+        )
+
+        # Start cascading from full resolution
+        current_source = pl.scan_parquet(full_res_path)
+        current_size = total
+
+        # Build compressed levels from largest to smallest
+        for i, target_size in enumerate(reversed(level_sizes)):
+            level_idx = num_compressed - 1 - i
+            level_path = cache_dir / f"{prefix}_{level_idx}.parquet"
+
+            # If target size equals or exceeds current, just copy reference
+            if target_size >= current_size:
+                level = current_source
+            elif self._use_simple_downsample:
+                level = downsample_2d_simple(
+                    current_source,
+                    max_points=target_size,
+                    intensity_column=self._intensity_column,
+                )
+            else:
+                level = downsample_2d_streaming(
+                    current_source,
+                    max_points=target_size,
+                    x_column=self._x_column,
+                    y_column=self._y_column,
+                    intensity_column=self._intensity_column,
+                    x_bins=self._x_bins,
+                    y_bins=self._y_bins,
+                    x_range=x_range,
+                    y_range=y_range,
+                )
+
+            # Sort and save immediately
+            level = level.sort([self._x_column, self._y_column])
+            level.sink_parquet(level_path, compression="zstd")
+
+            print(
+                f"[HEATMAP] Saved {prefix}_{level_idx} (target {target_size:,} pts)",
+                file=sys.stderr,
+            )
+
+            # Next iteration uses this level as source (cascading)
+            current_source = pl.scan_parquet(level_path)
+            current_size = target_size
+
+        # Load all levels back as LazyFrames
+        for i in range(num_compressed + 1):
+            level_path = cache_dir / f"{prefix}_{i}.parquet"
+            result[f"{prefix}_{i}"] = pl.scan_parquet(level_path)
+
+        result["num_levels"] = num_compressed + 1
+
+        return result
+
     def _preprocess_with_categorical_filters(self) -> None:
         """
-        Preprocess with per-filter-value compression levels.
+        Preprocess with per-filter-value compression levels using cascading.

         For each unique value of each categorical filter, creates separate
-        compression levels. This ensures that when a filter is applied at
-        render time, the resulting data has ~min_points regardless of the
-        filter value selected.
+        compression levels using cascading (building smaller levels from larger).
+        This ensures that when a filter is applied at render time, the resulting
+        data has ~min_points regardless of the filter value selected.
+
+        Uses cascading downsampling for efficiency - each level is built from
+        the previous larger level rather than from raw data.

         Data is sorted by x, y columns for efficient range query predicate pushdown.

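The cascade's index bookkeeping is the subtle part of the method above: the full-resolution snapshot is written first under the highest index, and the loop then walks the level sizes from largest to smallest, each step reading the file written by the step before. A minimal sketch of the resulting file layout, with invented level sizes (the real ones come from compute_compression_levels):

```python
# Hypothetical walk-through of the cascade's file layout; the level sizes
# are made up for illustration and listed smallest-first, as documented.
level_sizes = [20_000, 80_000, 320_000]
num_compressed = len(level_sizes)  # 3

print(f"level_{num_compressed}: full resolution, written first")
for i, target_size in enumerate(reversed(level_sizes)):
    level_idx = num_compressed - 1 - i
    print(f"level_{level_idx}: ~{target_size:,} pts, built from level_{level_idx + 1}")

# Output:
# level_3: full resolution, written first
# level_2: ~320,000 pts, built from level_3
# level_1: ~80,000 pts, built from level_2
# level_0: ~20,000 pts, built from level_1
```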
@@ -261,6 +378,7 @@ class Heatmap(BaseComponent):
         import sys

         # Get data ranges (for the full dataset)
+        # These ranges are used for ALL levels to ensure consistent binning
         x_range, y_range = get_data_range(
             self._raw_data,
             self._x_column,
@@ -269,10 +387,31 @@ class Heatmap(BaseComponent):
         self._preprocessed_data["x_range"] = x_range
         self._preprocessed_data["y_range"] = y_range

+        # Compute optimal bins if not provided
+        # Cache at 2×min_points, use display_aspect_ratio for bin computation
+        cache_target = 2 * self._min_points
+        if self._x_bins is None or self._y_bins is None:
+            # Use display aspect ratio (not data aspect ratio) for optimal bins
+            self._x_bins, self._y_bins = compute_optimal_bins(
+                cache_target,
+                (0, self._display_aspect_ratio),  # Fake x_range matching aspect
+                (0, 1.0),  # Fake y_range
+            )
+            print(
+                f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+                f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+                f"display aspect: {self._display_aspect_ratio:.2f})",
+                file=sys.stderr,
+            )
+
         # Get total count
         total = self._raw_data.select(pl.len()).collect().item()
         self._preprocessed_data["total"] = total

+        # Create cache directory for immediate level saving
+        cache_dir = self._cache_dir / "preprocessed"
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
         # Store metadata about categorical filters
         self._preprocessed_data["has_categorical_filters"] = True
         self._preprocessed_data["categorical_filter_values"] = {}
@@ -309,7 +448,7 @@ class Heatmap(BaseComponent):
                 unique_values
             )

-            # Create compression levels for each filter value
+            # Create compression levels for each filter value using cascading
             for filter_value in unique_values:
                 # Filter data to this value
                 filtered_data = self._raw_data.filter(
@@ -317,9 +456,9 @@ class Heatmap(BaseComponent):
                 )
                 filtered_total = filtered_data.select(pl.len()).collect().item()

-                # Compute level sizes for this filtered subset
+                # Compute level sizes for this filtered subset (2× for cache buffer)
                 level_sizes = compute_compression_levels(
-                    self._min_points, filtered_total
+                    cache_target, filtered_total
                 )
                 print(

@@ -332,94 +471,71 @@ class Heatmap(BaseComponent):
                     f"cat_level_sizes_{filter_id}_{filter_value}"
                 ] = level_sizes

-                # Build each compressed level
-                for level_idx, target_size in enumerate(level_sizes):
-                    # If target size equals total, skip downsampling - use all data
-                    if target_size >= filtered_total:
-                        level = filtered_data
-                    elif self._use_simple_downsample:
-                        level = downsample_2d_simple(
-                            filtered_data,
-                            max_points=target_size,
-                            intensity_column=self._intensity_column,
-                        )
-                    else:
-                        level = downsample_2d_streaming(
-                            filtered_data,
-                            max_points=target_size,
-                            x_column=self._x_column,
-                            y_column=self._y_column,
-                            intensity_column=self._intensity_column,
-                            x_bins=self._x_bins,
-                            y_bins=self._y_bins,
-                            x_range=x_range,
-                            y_range=y_range,
-                        )
-
-                    # Sort by x, y for efficient range query predicate pushdown
-                    level = level.sort([self._x_column, self._y_column])
-                    # Store LazyFrame for streaming to disk
-                    level_key = f"cat_level_{filter_id}_{filter_value}_{level_idx}"
-                    self._preprocessed_data[level_key] = level  # Keep lazy
-
-                # Add full resolution as final level (for zoom fallback)
-                # Also sorted for consistent predicate pushdown behavior
-                num_compressed = len(level_sizes)
-                full_res_key = f"cat_level_{filter_id}_{filter_value}_{num_compressed}"
-                self._preprocessed_data[full_res_key] = filtered_data.sort(
-                    [self._x_column, self._y_column]
+                # Build cascading levels using helper
+                prefix = f"cat_level_{filter_id}_{filter_value}"
+                levels_result = self._build_cascading_levels(
+                    source_data=filtered_data,
+                    level_sizes=level_sizes,
+                    x_range=x_range,
+                    y_range=y_range,
+                    cache_dir=cache_dir,
+                    prefix=prefix,
                 )
-                self._preprocessed_data[
-                    f"cat_num_levels_{filter_id}_{filter_value}"
-                ] = num_compressed + 1
+
+                # Copy results to preprocessed_data
+                for key, value in levels_result.items():
+                    if key == "num_levels":
+                        self._preprocessed_data[
+                            f"cat_num_levels_{filter_id}_{filter_value}"
+                        ] = value
+                    else:
+                        self._preprocessed_data[key] = value

         # Also create global levels for when no categorical filter is selected
-        # (fallback to standard behavior)
-        level_sizes = compute_compression_levels(self._min_points, total)
+        # (fallback to standard behavior) - using cascading with 2× cache buffer
+        level_sizes = compute_compression_levels(cache_target, total)
         self._preprocessed_data["level_sizes"] = level_sizes

-        for i, size in enumerate(level_sizes):
-            # If target size equals total, skip downsampling - use all data
-            if size >= total:
-                level = self._raw_data
-            elif self._use_simple_downsample:
-                level = downsample_2d_simple(
-                    self._raw_data,
-                    max_points=size,
-                    intensity_column=self._intensity_column,
-                )
+        # Build global cascading levels using helper
+        levels_result = self._build_cascading_levels(
+            source_data=self._raw_data,
+            level_sizes=level_sizes,
+            x_range=x_range,
+            y_range=y_range,
+            cache_dir=cache_dir,
+            prefix="level",
+        )
+
+        # Copy results to preprocessed_data
+        for key, value in levels_result.items():
+            if key == "num_levels":
+                self._preprocessed_data["num_levels"] = value
             else:
-                level = downsample_2d_streaming(
-                    self._raw_data,
-                    max_points=size,
-                    x_column=self._x_column,
-                    y_column=self._y_column,
-                    intensity_column=self._intensity_column,
-                    x_bins=self._x_bins,
-                    y_bins=self._y_bins,
-                    x_range=x_range,
-                    y_range=y_range,
-                )
-            # Sort by x, y for efficient range query predicate pushdown
-            level = level.sort([self._x_column, self._y_column])
-            self._preprocessed_data[f"level_{i}"] = level  # Keep lazy
+                self._preprocessed_data[key] = value

-        # Add full resolution as final level (for zoom fallback)
-        # Also sorted for consistent predicate pushdown behavior
-        num_compressed = len(level_sizes)
-        self._preprocessed_data[f"level_{num_compressed}"] = self._raw_data.sort(
-            [self._x_column, self._y_column]
-        )
-        self._preprocessed_data["num_levels"] = num_compressed + 1
+        # Mark that files are already saved
+        self._preprocessed_data["_files_already_saved"] = True

     def _preprocess_streaming(self) -> None:
         """
-        Streaming preprocessing - levels stay lazy through caching.
+        Streaming preprocessing with cascading - builds smaller levels from larger.
+
+        Uses cascading downsampling: each level is built from the previous larger
+        level rather than from raw data. This is more efficient (raw data read once)
+        and produces identical results because the downsampling algorithm keeps
+        the TOP N highest-intensity points per bin - points that survive at a larger
+        level will also be selected at smaller levels.
+
+        Levels are saved to disk immediately after creation, then read back as the
+        source for the next smaller level. This keeps memory low while enabling
+        cascading.

-        Builds lazy query plans that are streamed to disk via sink_parquet().
         Data is sorted by x, y columns for efficient range query predicate pushdown.
         """
+        import sys
+
         # Get data ranges (minimal collect - just 4 values)
+        # These ranges are used for ALL levels to ensure consistent binning
         x_range, y_range = get_data_range(
             self._raw_data,
             self._x_column,
@@ -428,55 +544,55 @@ class Heatmap(BaseComponent):
         self._preprocessed_data["x_range"] = x_range
         self._preprocessed_data["y_range"] = y_range

+        # Compute optimal bins if not provided
+        # Cache at 2×min_points, use display_aspect_ratio for bin computation
+        cache_target = 2 * self._min_points
+        if self._x_bins is None or self._y_bins is None:
+            # Use display aspect ratio (not data aspect ratio) for optimal bins
+            # This ensures even distribution in the expected display dimensions
+            self._x_bins, self._y_bins = compute_optimal_bins(
+                cache_target,
+                (0, self._display_aspect_ratio),  # Fake x_range matching aspect
+                (0, 1.0),  # Fake y_range
+            )
+            print(
+                f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+                f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+                f"display aspect: {self._display_aspect_ratio:.2f})",
+                file=sys.stderr,
+            )
+
         # Get total count
         total = self._raw_data.select(pl.len()).collect().item()
         self._preprocessed_data["total"] = total

-        # Compute target sizes for levels
-        level_sizes = compute_compression_levels(self._min_points, total)
+        # Compute target sizes for levels (use 2×min_points for smallest cache level)
+        level_sizes = compute_compression_levels(cache_target, total)
         self._preprocessed_data["level_sizes"] = level_sizes

-        # Build and collect each level
-        self._preprocessed_data["levels"] = []
+        # Create cache directory for immediate level saving
+        cache_dir = self._cache_dir / "preprocessed"
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        # Build cascading levels using helper
+        levels_result = self._build_cascading_levels(
+            source_data=self._raw_data,
+            level_sizes=level_sizes,
+            x_range=x_range,
+            y_range=y_range,
+            cache_dir=cache_dir,
+            prefix="level",
+        )

-        for i, size in enumerate(level_sizes):
-            # If target size equals total, skip downsampling - use all data
-            if size >= total:
-                level = self._raw_data
-            elif self._use_simple_downsample:
-                level = downsample_2d_simple(
-                    self._raw_data,
-                    max_points=size,
-                    intensity_column=self._intensity_column,
-                )
+        # Copy results to preprocessed_data
+        for key, value in levels_result.items():
+            if key == "num_levels":
+                self._preprocessed_data["num_levels"] = value
             else:
-                level = downsample_2d_streaming(
-                    self._raw_data,
-                    max_points=size,
-                    x_column=self._x_column,
-                    y_column=self._y_column,
-                    intensity_column=self._intensity_column,
-                    x_bins=self._x_bins,
-                    y_bins=self._y_bins,
-                    x_range=x_range,
-                    y_range=y_range,
-                )
-            # Sort by x, y for efficient range query predicate pushdown
-            # This clusters spatially close points together in row groups
-            level = level.sort([self._x_column, self._y_column])
-            # Store LazyFrame for streaming to disk
-            # Base class will use sink_parquet() to stream without full materialization
-            self._preprocessed_data[f"level_{i}"] = level  # Keep lazy
+                self._preprocessed_data[key] = value

-        # Add full resolution as final level (for zoom fallback)
-        # Also sorted for consistent predicate pushdown behavior
-        num_compressed = len(level_sizes)
-        self._preprocessed_data[f"level_{num_compressed}"] = self._raw_data.sort(
-            [self._x_column, self._y_column]
-        )
-
-        # Store number of levels for reconstruction (includes full resolution)
-        self._preprocessed_data["num_levels"] = num_compressed + 1
+        # Mark that files are already saved (base class should skip saving)
+        self._preprocessed_data["_files_already_saved"] = True

     def _preprocess_eager(self) -> None:
         """
@@ -486,6 +602,8 @@ class Heatmap(BaseComponent):
         downsampling for better spatial distribution.
         Data is sorted by x, y columns for efficient range query predicate pushdown.
         """
+        import sys
+
         # Get data ranges
         x_range, y_range = get_data_range(
             self._raw_data,
@@ -495,12 +613,29 @@ class Heatmap(BaseComponent):
         self._preprocessed_data["x_range"] = x_range
         self._preprocessed_data["y_range"] = y_range

+        # Compute optimal bins if not provided
+        # Cache at 2×min_points, use display_aspect_ratio for bin computation
+        cache_target = 2 * self._min_points
+        if self._x_bins is None or self._y_bins is None:
+            # Use display aspect ratio (not data aspect ratio) for optimal bins
+            self._x_bins, self._y_bins = compute_optimal_bins(
+                cache_target,
+                (0, self._display_aspect_ratio),  # Fake x_range matching aspect
+                (0, 1.0),  # Fake y_range
+            )
+            print(
+                f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+                f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+                f"display aspect: {self._display_aspect_ratio:.2f})",
+                file=sys.stderr,
+            )
+
         # Get total count
         total = self._raw_data.select(pl.len()).collect().item()
         self._preprocessed_data["total"] = total

-        # Compute compression level target sizes
-        level_sizes = compute_compression_levels(self._min_points, total)
+        # Compute compression level target sizes (2× for cache buffer)
+        level_sizes = compute_compression_levels(cache_target, total)
         self._preprocessed_data["level_sizes"] = level_sizes

         # Build levels from largest to smallest
@@ -736,10 +871,18 @@ class Heatmap(BaseComponent):
         if count >= self._min_points:
             # This level has enough detail
             if count > self._min_points:
-                # Over limit - downsample to stay at/under max
-                # Use ZOOM range for binning (not global) to avoid sparse bins
+                # Over limit - downsample to exactly min_points
+                # Compute optimal bins from ACTUAL zoom region aspect ratio
                 zoom_x_range = (x0, x1)
                 zoom_y_range = (y0, y1)
+                render_x_bins, render_y_bins = compute_optimal_bins(
+                    self._min_points, zoom_x_range, zoom_y_range
+                )
+                print(
+                    f"[HEATMAP] Render downsample: {count:,} → {self._min_points:,} pts "
+                    f"(bins: {render_x_bins}x{render_y_bins})",
+                    file=sys.stderr,
+                )
                 if self._use_streaming or self._use_simple_downsample:
                     if self._use_simple_downsample:
                         return downsample_2d_simple(
@@ -754,8 +897,8 @@ class Heatmap(BaseComponent):
                             x_column=self._x_column,
                             y_column=self._y_column,
                             intensity_column=self._intensity_column,
-                            x_bins=self._x_bins,
-                            y_bins=self._y_bins,
+                            x_bins=render_x_bins,
+                            y_bins=render_y_bins,
                             x_range=zoom_x_range,
                             y_range=zoom_y_range,
                         ).collect()
@@ -766,8 +909,8 @@ class Heatmap(BaseComponent):
                     x_column=self._x_column,
                     y_column=self._y_column,
                     intensity_column=self._intensity_column,
-                    x_bins=self._x_bins,
-                    y_bins=self._y_bins,
+                    x_bins=render_x_bins,
+                    y_bins=render_y_bins,
                 ).collect()
             return filtered

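At render time the bins are recomputed from the zoom region itself, so the aspect-ratio clamp in compute_optimal_bins does real work: a zoom window that is numerically very wide and short would otherwise yield a degenerate grid. A sketch with a hypothetical zoom region, mirroring the math of compute_optimal_bins (defined later in this diff); the ranges and resulting numbers are illustrative:

```python
import math

# Hypothetical zoom window, invented for illustration.
min_points = 10_000
zoom_x_range = (400.0, 900.0)  # 500 units wide
zoom_y_range = (0.8, 1.1)      # 0.3 units tall

aspect = (zoom_x_range[1] - zoom_x_range[0]) / (zoom_y_range[1] - zoom_y_range[0])
aspect = max(0.05, min(20.0, aspect))  # raw aspect ~1667 is clamped to 20

render_y_bins = max(1, int(math.sqrt(min_points / aspect)))  # 22
render_x_bins = max(1, int(math.sqrt(min_points * aspect)))  # 447

print(render_x_bins, render_y_bins)  # 447 22 -> 9,834 bins, just under 10,000
```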
openms_insight/core/base.py

@@ -318,6 +318,9 @@ class BaseComponent(ABC):
             "data_values": {},
         }

+        # Check if files were already saved during preprocessing (e.g., cascading)
+        files_already_saved = self._preprocessed_data.pop("_files_already_saved", False)
+
         # Save preprocessed data with type optimization for efficient transfer
         # Float64→Float32 reduces Arrow payload size
         # Int64→Int32 (when safe) avoids BigInt overhead in JavaScript
@@ -325,18 +328,28 @@ class BaseComponent(ABC):
             if isinstance(value, pl.LazyFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                # Apply streaming-safe optimization (Float64→Float32 only)
-                # Int64 bounds checking would require collect(), breaking streaming
-                value = optimize_for_transfer_lazy(value)
-                value.sink_parquet(filepath, compression="zstd")
-                manifest["data_files"][key] = filename
+
+                if files_already_saved and filepath.exists():
+                    # File was saved during preprocessing (cascading) - just register it
+                    manifest["data_files"][key] = filename
+                else:
+                    # Apply streaming-safe optimization (Float64→Float32 only)
+                    # Int64 bounds checking would require collect(), breaking streaming
+                    value = optimize_for_transfer_lazy(value)
+                    value.sink_parquet(filepath, compression="zstd")
+                    manifest["data_files"][key] = filename
             elif isinstance(value, pl.DataFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                # Full optimization including Int64→Int32 with bounds checking
-                value = optimize_for_transfer(value)
-                value.write_parquet(filepath, compression="zstd")
-                manifest["data_files"][key] = filename
+
+                if files_already_saved and filepath.exists():
+                    # File was saved during preprocessing - just register it
+                    manifest["data_files"][key] = filename
+                else:
+                    # Full optimization including Int64→Int32 with bounds checking
+                    value = optimize_for_transfer(value)
+                    value.write_parquet(filepath, compression="zstd")
+                    manifest["data_files"][key] = filename
             elif self._is_json_serializable(value):
                 manifest["data_values"][key] = value

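The two hunks above form a small handshake: a component that already sank its levels to disk during preprocessing sets _files_already_saved, and the base class pops the flag and merely registers the existing parquet files instead of re-sinking them. A runnable sketch of that handshake, reduced to its essentials (the manifest bookkeeping and the optimize_for_transfer steps of the real base class are elided):

```python
from pathlib import Path

import polars as pl

preprocessed_dir = Path("cache/preprocessed")
preprocessed_dir.mkdir(parents=True, exist_ok=True)

# Pretend the cascading preprocess already wrote this level to disk.
pl.DataFrame({"x": [1.0], "y": [2.0]}).write_parquet(preprocessed_dir / "level_0.parquet")

preprocessed_data = {
    "level_0": pl.scan_parquet(preprocessed_dir / "level_0.parquet"),
    "num_levels": 1,
    "_files_already_saved": True,  # set by the cascading preprocess
}

manifest = {"data_files": {}, "data_values": {}}
files_already_saved = preprocessed_data.pop("_files_already_saved", False)

for key, value in preprocessed_data.items():
    if isinstance(value, pl.LazyFrame):
        filepath = preprocessed_dir / f"{key}.parquet"
        if files_already_saved and filepath.exists():
            manifest["data_files"][key] = filepath.name  # register, don't re-sink
        else:
            value.sink_parquet(filepath, compression="zstd")
            manifest["data_files"][key] = filepath.name
    else:
        manifest["data_values"][key] = value

print(manifest)
# {'data_files': {'level_0': 'level_0.parquet'}, 'data_values': {'num_levels': 1}}
```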
openms_insight/preprocessing/compression.py

@@ -6,7 +6,8 @@ data, enabling efficient visualization of datasets with millions of points.
 Supports both streaming (lazy) and eager downsampling approaches.
 """

-from typing import List, Optional, Union
+import math
+from typing import List, Optional, Tuple, Union

 import numpy as np
 import polars as pl
@@ -19,6 +20,59 @@ except ImportError:
     HAS_SCIPY = False


+def compute_optimal_bins(
+    target_points: int,
+    x_range: Tuple[float, float],
+    y_range: Tuple[float, float],
+) -> Tuple[int, int]:
+    """
+    Compute optimal x_bins, y_bins for even spatial distribution.
+
+    The bin grid matches the data's aspect ratio so bins are approximately
+    square in data space. Total bins ≈ target_points for 1 point per bin.
+
+    Solves the system:
+        x_bins × y_bins = target_points
+        x_bins / y_bins = aspect_ratio
+
+    Solution:
+        y_bins = sqrt(target_points / aspect_ratio)
+        x_bins = sqrt(target_points × aspect_ratio)
+
+    Args:
+        target_points: Target number of bins (and thus max points with 1 per bin)
+        x_range: (x_min, x_max) data range
+        y_range: (y_min, y_max) data range
+
+    Returns:
+        (x_bins, y_bins) tuple
+
+    Examples:
+        >>> compute_optimal_bins(10000, (0, 1000), (0, 100))  # 10:1 aspect
+        (316, 31)
+        >>> compute_optimal_bins(10000, (0, 100), (0, 100))  # 1:1 aspect
+        (100, 100)
+    """
+    x_span = x_range[1] - x_range[0]
+    y_span = y_range[1] - y_range[0]
+
+    # Handle edge cases
+    if y_span < 1e-10:
+        y_span = x_span if x_span > 1e-10 else 1.0
+    if x_span < 1e-10:
+        x_span = y_span
+
+    aspect_ratio = x_span / y_span
+
+    # Clamp to reasonable bounds (avoid extreme rectangles)
+    aspect_ratio = max(0.05, min(20.0, aspect_ratio))
+
+    y_bins = max(1, int(math.sqrt(target_points / aspect_ratio)))
+    x_bins = max(1, int(math.sqrt(target_points * aspect_ratio)))
+
+    return x_bins, y_bins
+
+
 def compute_compression_levels(min_size: int, total: int) -> List[int]:
     """
     Compute logarithmically-spaced compression level target sizes.
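A quick usage check for compute_optimal_bins, covering the doctest cases above plus the "fake range" trick the Heatmap uses during caching, where ranges of (0, aspect) × (0, 1) encode a pure ratio rather than real data bounds; the import path is taken from the package layout shown in the RECORD below:

```python
from openms_insight.preprocessing.compression import compute_optimal_bins

# Doctest cases from the function above.
assert compute_optimal_bins(10_000, (0, 1_000), (0, 100)) == (316, 31)  # 10:1
assert compute_optimal_bins(10_000, (0, 100), (0, 100)) == (100, 100)   # 1:1

# Caching path: 2 x min_points bins, shaped for a 16:9 display.
x_bins, y_bins = compute_optimal_bins(20_000, (0, 16 / 9), (0, 1.0))
print(x_bins, y_bins, x_bins * y_bins)  # 188 106 19928
```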
openms_insight-0.1.3.dist-info/METADATA → openms_insight-0.1.4.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openms-insight
-Version: 0.1.3
+Version: 0.1.4
 Summary: Interactive visualization components for mass spectrometry data in Streamlit
 Project-URL: Homepage, https://github.com/t0mdavid-m/OpenMS-Insight
 Project-URL: Documentation, https://github.com/t0mdavid-m/OpenMS-Insight#readme
openms_insight-0.1.3.dist-info/RECORD → openms_insight-0.1.4.dist-info/RECORD

@@ -1,17 +1,17 @@
 openms_insight/__init__.py,sha256=Iv9w0J_7J3pMsyvM4xaYDMWt6IvrtAt6WqOmJ-_tUxk,1044
 openms_insight/components/__init__.py,sha256=T9mUxfgFUiHILmXh1VjcGVlnRvuxRMqi_GJJYOmJKwY,177
-openms_insight/components/heatmap.py,sha256=hCvmtGsFYiZCoIrk4L9KsD7FC6j1GBljwZgKxjNkLPo,38143
+openms_insight/components/heatmap.py,sha256=LigtpPbAPQpfjFljMWoEPPAc3t27Bl1ekr5uaR1Ctuk,44090
 openms_insight/components/lineplot.py,sha256=I-JPvDzCr3Nu8Boc1V4D8QQ1bHgTqvM6CbeoIe7zJ-s,30896
 openms_insight/components/sequenceview.py,sha256=0pDOE0xeoc1-85QZNGdNwwoBwXi-5MFfeb9pCcOi6rc,30274
 openms_insight/components/table.py,sha256=wmq1rjGVe4Ef0SAf5p85pfVCeyLlVevZnxBc9EIg2uk,16458
 openms_insight/core/__init__.py,sha256=EPjKX_FFQRgO8mWHs59I-o0BiuzEMzEU1Pfu9YOfLC4,338
-openms_insight/core/base.py,sha256=t8G_hertREPf1qSqoDU6PGDKFV_mnvOyLUNtaNbnQvQ,18745
+openms_insight/core/base.py,sha256=P2cOrPvPIzxfYQ7xMn9e0BlyKEMrhOCgD9FAtyxTiCc,19408
 openms_insight/core/cache.py,sha256=3fnPDWjuWUnxazK2XflcUIeRZZPQ3N45kAKYu-xGBKw,1197
 openms_insight/core/registry.py,sha256=Hak80Jqhx0qa4gbd1YolNZnM6xBrS8I4U_X7zC0bQ8Y,2108
 openms_insight/core/state.py,sha256=_vNYxYHYFgIigbkqYwkIO6cBGFJyF2VN9dr7CBEAQbY,6873
 openms_insight/core/subprocess_preprocess.py,sha256=m9FbAAFy9Do1Exlh-m4Wo-LDwv6yHlEI4klz5OVwemc,3133
 openms_insight/preprocessing/__init__.py,sha256=hXKTI9zHtMtHojqXq_0V62xfNokozpnpRAwEnxs81fM,461
-openms_insight/preprocessing/compression.py,sha256=phe0D568lpNiwkPGI7AXWg9i3iX3xgEyi7JIqydCtxk,10664
+openms_insight/preprocessing/compression.py,sha256=T4YbX9PUlfTfPit_kpuLZn8hYpqLYu3xtTme_CG2ymc,12241
 openms_insight/preprocessing/filtering.py,sha256=fkmaIXfR5hfjyWfaMYqaeybMHaZjvUZYaKCqvxPOWMQ,14152
 openms_insight/rendering/__init__.py,sha256=ApHvKeh87yY4GTIEai-tCeIXpNbwOXWlmcmIwMMRZYc,198
 openms_insight/rendering/bridge.py,sha256=i8cZq_ra13XpuV1KT0qC6Jf4VCAe4BGrLE-ybrFHwZE,19408
@@ -22,7 +22,7 @@ openms_insight/js-component/dist/assets/materialdesignicons-webfont.eot,sha256=C
 openms_insight/js-component/dist/assets/materialdesignicons-webfont.ttf,sha256=YeirpaTpgf4iz3yOi82-oAR251xiw38Bv37jM2HWhCg,1307660
 openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff,sha256=pZKKDVwvYk5G-Y2bFcL2AEU3f3xZTdeKF1kTLqO0Y-s,587984
 openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff2,sha256=Zi_vqPL4qVwYWI0hd0eJwQfGTnccvmWmmvRikcQxGvw,403216
-openms_insight-0.1.3.dist-info/METADATA,sha256=k_FqXgopIKrICwhW5qcs1xXsYvfav1JOhTD6kQ410wA,12807
-openms_insight-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-openms_insight-0.1.3.dist-info/licenses/LICENSE,sha256=INFF4rOMmpah7Oi14hLqu7NTOsx56KRRNChAAUcfh2E,1823
-openms_insight-0.1.3.dist-info/RECORD,,
+openms_insight-0.1.4.dist-info/METADATA,sha256=_c_eGoMj7wCxAWE5CHC6T2Emri6DEZRXwrQJ-RNptrI,12807
+openms_insight-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+openms_insight-0.1.4.dist-info/licenses/LICENSE,sha256=INFF4rOMmpah7Oi14hLqu7NTOsx56KRRNChAAUcfh2E,1823
+openms_insight-0.1.4.dist-info/RECORD,,