openms-insight 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -8,6 +8,7 @@ from ..core.base import BaseComponent
  from ..core.registry import register_component
  from ..preprocessing.compression import (
  compute_compression_levels,
+ compute_optimal_bins,
  downsample_2d,
  downsample_2d_simple,
  downsample_2d_streaming,
@@ -22,10 +23,10 @@ def _make_zoom_cache_key(zoom: Optional[Dict[str, Any]]) -> tuple:
  if zoom is None:
  return (None,)
  return (
- ('x0', zoom.get('xRange', [-1, -1])[0]),
- ('x1', zoom.get('xRange', [-1, -1])[1]),
- ('y0', zoom.get('yRange', [-1, -1])[0]),
- ('y1', zoom.get('yRange', [-1, -1])[1]),
+ ("x0", zoom.get("xRange", [-1, -1])[0]),
+ ("x1", zoom.get("xRange", [-1, -1])[1]),
+ ("y0", zoom.get("yRange", [-1, -1])[0]),
+ ("y1", zoom.get("yRange", [-1, -1])[1]),
  )


@@ -66,28 +67,29 @@ class Heatmap(BaseComponent):
  def __init__(
  self,
  cache_id: str,
- x_column: str,
- y_column: str,
+ x_column: Optional[str] = None,
+ y_column: Optional[str] = None,
  data: Optional[pl.LazyFrame] = None,
  data_path: Optional[str] = None,
- intensity_column: str = 'intensity',
+ intensity_column: Optional[str] = None,
  filters: Optional[Dict[str, str]] = None,
  filter_defaults: Optional[Dict[str, Any]] = None,
  interactivity: Optional[Dict[str, str]] = None,
  cache_path: str = ".",
  regenerate_cache: bool = False,
- min_points: int = 20000,
- x_bins: int = 400,
- y_bins: int = 50,
- zoom_identifier: str = 'heatmap_zoom',
+ min_points: int = 10000,
+ display_aspect_ratio: float = 16 / 9,
+ x_bins: Optional[int] = None,
+ y_bins: Optional[int] = None,
+ zoom_identifier: str = "heatmap_zoom",
  title: Optional[str] = None,
  x_label: Optional[str] = None,
  y_label: Optional[str] = None,
- colorscale: str = 'Portland',
+ colorscale: str = "Portland",
  use_simple_downsample: bool = False,
  use_streaming: bool = True,
  categorical_filters: Optional[List[str]] = None,
- **kwargs
+ **kwargs,
  ):
  """
  Initialize the Heatmap component.
@@ -106,10 +108,17 @@ class Heatmap(BaseComponent):
  point's value in the corresponding column.
  cache_path: Base path for cache storage. Default "." (current dir).
  regenerate_cache: If True, regenerate cache even if valid cache exists.
- min_points: Target size for smallest compression level and
- threshold for level selection (default: 20000)
- x_bins: Number of bins along x-axis for downsampling (default: 400)
- y_bins: Number of bins along y-axis for downsampling (default: 50)
+ min_points: Target number of points to display (default: 10000).
+ Cache levels are built at 2× this value; final downsample
+ at render time reduces to exactly min_points.
+ display_aspect_ratio: Expected display width/height ratio for
+ optimal bin computation during caching (default: 16/9).
+ At render time, the actual zoom region's aspect ratio is used.
+ x_bins: Number of bins along x-axis for downsampling. If None
+ (default), auto-computed from display_aspect_ratio such that
+ x_bins × y_bins ≈ 2×min_points with even spatial distribution.
+ y_bins: Number of bins along y-axis for downsampling. If None
+ (default), auto-computed from display_aspect_ratio.
  zoom_identifier: State key for storing zoom range (default: 'heatmap_zoom')
  title: Heatmap title displayed above the plot
  x_label: X-axis label (defaults to x_column)
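The auto-binning rule in this docstring can be checked with a small standalone sketch. This is a hypothetical reimplementation (the package's actual `compute_optimal_bins` may round differently): solving `x_bins × y_bins ≈ target` with `x_bins / y_bins ≈ aspect` gives `y_bins = √(target / aspect)` and `x_bins = aspect × y_bins`.

```python
import math

def optimal_bins_sketch(target_points: int, aspect: float) -> tuple:
    """Hypothetical sketch: spread ~target_points grid cells so that
    x_bins / y_bins matches the expected display aspect ratio."""
    y_bins = max(1, round(math.sqrt(target_points / aspect)))
    x_bins = max(1, round(aspect * y_bins))
    return x_bins, y_bins

# New defaults: min_points=10000 → cache target 20000, aspect 16/9.
# y_bins ≈ √(20000 / (16/9)) ≈ 106, x_bins ≈ 188; 188 × 106 = 19,928 ≈ 2×min_points.
print(optimal_bins_sketch(2 * 10000, 16 / 9))  # (188, 106)
```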
@@ -130,6 +139,7 @@ class Heatmap(BaseComponent):
  self._y_column = y_column
  self._intensity_column = intensity_column
  self._min_points = min_points
+ self._display_aspect_ratio = display_aspect_ratio
  self._x_bins = x_bins
  self._y_bins = y_bins
  self._zoom_identifier = zoom_identifier
@@ -155,6 +165,7 @@ class Heatmap(BaseComponent):
  y_column=y_column,
  intensity_column=intensity_column,
  min_points=min_points,
+ display_aspect_ratio=display_aspect_ratio,
  x_bins=x_bins,
  y_bins=y_bins,
  zoom_identifier=zoom_identifier,
@@ -165,7 +176,7 @@ class Heatmap(BaseComponent):
  use_simple_downsample=use_simple_downsample,
  use_streaming=use_streaming,
  categorical_filters=categorical_filters,
- **kwargs
+ **kwargs,
  )

  def _get_cache_config(self) -> Dict[str, Any]:
@@ -176,17 +187,43 @@ class Heatmap(BaseComponent):
  Dict of config values that affect preprocessing
  """
  return {
- 'x_column': self._x_column,
- 'y_column': self._y_column,
- 'intensity_column': self._intensity_column,
- 'min_points': self._min_points,
- 'x_bins': self._x_bins,
- 'y_bins': self._y_bins,
- 'use_simple_downsample': self._use_simple_downsample,
- 'use_streaming': self._use_streaming,
- 'categorical_filters': sorted(self._categorical_filters),
+ "x_column": self._x_column,
+ "y_column": self._y_column,
+ "intensity_column": self._intensity_column,
+ "min_points": self._min_points,
+ "display_aspect_ratio": self._display_aspect_ratio,
+ "x_bins": self._x_bins,
+ "y_bins": self._y_bins,
+ "use_simple_downsample": self._use_simple_downsample,
+ "use_streaming": self._use_streaming,
+ "categorical_filters": sorted(self._categorical_filters),
+ "zoom_identifier": self._zoom_identifier,
+ "title": self._title,
+ "x_label": self._x_label,
+ "y_label": self._y_label,
+ "colorscale": self._colorscale,
  }

+ def _restore_cache_config(self, config: Dict[str, Any]) -> None:
+ """Restore component-specific configuration from cached config."""
+ self._x_column = config.get("x_column")
+ self._y_column = config.get("y_column")
+ self._intensity_column = config.get("intensity_column", "intensity")
+ self._min_points = config.get("min_points", 10000)
+ self._display_aspect_ratio = config.get("display_aspect_ratio", 16 / 9)
+ # x_bins/y_bins are computed during preprocessing and stored in cache
+ # Fallback to old defaults for backward compatibility with old caches
+ self._x_bins = config.get("x_bins", 400)
+ self._y_bins = config.get("y_bins", 50)
+ self._use_simple_downsample = config.get("use_simple_downsample", False)
+ self._use_streaming = config.get("use_streaming", True)
+ self._categorical_filters = config.get("categorical_filters", [])
+ self._zoom_identifier = config.get("zoom_identifier", "heatmap_zoom")
+ self._title = config.get("title")
+ self._x_label = config.get("x_label", self._x_column)
+ self._y_label = config.get("y_label", self._y_column)
+ self._colorscale = config.get("colorscale", "Portland")
+
  def get_state_dependencies(self) -> list:
  """
  Return list of state keys that affect this component's data.
@@ -220,14 +257,116 @@ class Heatmap(BaseComponent):
  else:
  self._preprocess_eager()

+ def _build_cascading_levels(
+ self,
+ source_data: pl.LazyFrame,
+ level_sizes: list,
+ x_range: tuple,
+ y_range: tuple,
+ cache_dir,
+ prefix: str = "level",
+ ) -> dict:
+ """
+ Build cascading compression levels from source data.
+
+ Each level is built from the previous larger level rather than from
+ raw data. This is efficient (raw data read once) and produces identical
+ results because the downsampling keeps top N highest-intensity points
+ per bin - points surviving at larger levels will also be selected at
+ smaller levels.
+
+ Args:
+ source_data: LazyFrame with raw/filtered data
+ level_sizes: List of target sizes for compressed levels (smallest first)
+ x_range: (x_min, x_max) for consistent bin boundaries
+ y_range: (y_min, y_max) for consistent bin boundaries
+ cache_dir: Path to save parquet files
+ prefix: Filename prefix (e.g., "level" or "cat_level_im_0")
+
+ Returns:
+ Dict with level LazyFrames keyed by "{prefix}_{idx}" and "num_levels"
+ """
+ import sys
+
+ result = {}
+ num_compressed = len(level_sizes)
+
+ # Get total count
+ total = source_data.select(pl.len()).collect().item()
+
+ # First: save full resolution as the largest level
+ full_res_path = cache_dir / f"{prefix}_{num_compressed}.parquet"
+ full_res = source_data.sort([self._x_column, self._y_column])
+ full_res.sink_parquet(full_res_path, compression="zstd")
+ print(
+ f"[HEATMAP] Saved {prefix}_{num_compressed} ({total:,} pts)",
+ file=sys.stderr,
+ )
+
+ # Start cascading from full resolution
+ current_source = pl.scan_parquet(full_res_path)
+ current_size = total
+
+ # Build compressed levels from largest to smallest
+ for i, target_size in enumerate(reversed(level_sizes)):
+ level_idx = num_compressed - 1 - i
+ level_path = cache_dir / f"{prefix}_{level_idx}.parquet"
+
+ # If target size equals or exceeds current, just copy reference
+ if target_size >= current_size:
+ level = current_source
+ elif self._use_simple_downsample:
+ level = downsample_2d_simple(
+ current_source,
+ max_points=target_size,
+ intensity_column=self._intensity_column,
+ )
+ else:
+ level = downsample_2d_streaming(
+ current_source,
+ max_points=target_size,
+ x_column=self._x_column,
+ y_column=self._y_column,
+ intensity_column=self._intensity_column,
+ x_bins=self._x_bins,
+ y_bins=self._y_bins,
+ x_range=x_range,
+ y_range=y_range,
+ )
+
+ # Sort and save immediately
+ level = level.sort([self._x_column, self._y_column])
+ level.sink_parquet(level_path, compression="zstd")
+
+ print(
+ f"[HEATMAP] Saved {prefix}_{level_idx} (target {target_size:,} pts)",
+ file=sys.stderr,
+ )
+
+ # Next iteration uses this level as source (cascading)
+ current_source = pl.scan_parquet(level_path)
+ current_size = target_size
+
+ # Load all levels back as LazyFrames
+ for i in range(num_compressed + 1):
+ level_path = cache_dir / f"{prefix}_{i}.parquet"
+ result[f"{prefix}_{i}"] = pl.scan_parquet(level_path)
+
+ result["num_levels"] = num_compressed + 1
+
+ return result
+
  def _preprocess_with_categorical_filters(self) -> None:
  """
- Preprocess with per-filter-value compression levels.
+ Preprocess with per-filter-value compression levels using cascading.

  For each unique value of each categorical filter, creates separate
- compression levels. This ensures that when a filter is applied at
- render time, the resulting data has ~min_points regardless of the
- filter value selected.
+ compression levels using cascading (building smaller levels from larger).
+ This ensures that when a filter is applied at render time, the resulting
+ data has ~min_points regardless of the filter value selected.
+
+ Uses cascading downsampling for efficiency - each level is built from
+ the previous larger level rather than from raw data.

  Data is sorted by x, y columns for efficient range query predicate pushdown.
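The cascading argument in this docstring rests on top-N-per-bin selection being nested: with fixed bin boundaries, a bin's top N points by intensity are a subset of its top M points for any M ≥ N, so downsampling the larger level again yields the same result as downsampling the raw data. A minimal sketch with polars (1-D binning and assumed column names for brevity; not the package's downsample_2d_streaming):

```python
import polars as pl

def top_n_per_bin(df: pl.DataFrame, n: int, x_bins: int = 4) -> pl.DataFrame:
    """Keep the n highest-intensity points per x-bin (fixed bin boundaries)."""
    return (
        df.with_columns((pl.col("x") * x_bins).floor().alias("bin"))
        .sort("intensity", descending=True)
        .group_by("bin", maintain_order=True)
        .head(n)
        .drop("bin")
    )

df = pl.DataFrame({
    "x": [i / 100 for i in range(100)],
    "intensity": [float((i * 37) % 100) for i in range(100)],
})

direct = top_n_per_bin(df, 2)                      # raw → top-2 per bin
cascaded = top_n_per_bin(top_n_per_bin(df, 5), 2)  # raw → top-5 → top-2
assert set(map(tuple, direct.rows())) == set(map(tuple, cascaded.rows()))
```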
@@ -239,202 +378,221 @@ class Heatmap(BaseComponent):
  import sys

  # Get data ranges (for the full dataset)
+ # These ranges are used for ALL levels to ensure consistent binning
  x_range, y_range = get_data_range(
  self._raw_data,
  self._x_column,
  self._y_column,
  )
- self._preprocessed_data['x_range'] = x_range
- self._preprocessed_data['y_range'] = y_range
+ self._preprocessed_data["x_range"] = x_range
+ self._preprocessed_data["y_range"] = y_range
+
+ # Compute optimal bins if not provided
+ # Cache at 2×min_points, use display_aspect_ratio for bin computation
+ cache_target = 2 * self._min_points
+ if self._x_bins is None or self._y_bins is None:
+ # Use display aspect ratio (not data aspect ratio) for optimal bins
+ self._x_bins, self._y_bins = compute_optimal_bins(
+ cache_target,
+ (0, self._display_aspect_ratio), # Fake x_range matching aspect
+ (0, 1.0), # Fake y_range
+ )
+ print(
+ f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+ f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+ f"display aspect: {self._display_aspect_ratio:.2f})",
+ file=sys.stderr,
+ )

  # Get total count
  total = self._raw_data.select(pl.len()).collect().item()
- self._preprocessed_data['total'] = total
+ self._preprocessed_data["total"] = total
+
+ # Create cache directory for immediate level saving
+ cache_dir = self._cache_dir / "preprocessed"
+ cache_dir.mkdir(parents=True, exist_ok=True)

  # Store metadata about categorical filters
- self._preprocessed_data['has_categorical_filters'] = True
- self._preprocessed_data['categorical_filter_values'] = {}
+ self._preprocessed_data["has_categorical_filters"] = True
+ self._preprocessed_data["categorical_filter_values"] = {}

  # Process each categorical filter
  for filter_id in self._categorical_filters:
  if filter_id not in self._filters:
- print(f"[HEATMAP] Warning: categorical_filter '{filter_id}' not in filters, skipping", file=sys.stderr)
+ print(
+ f"[HEATMAP] Warning: categorical_filter '{filter_id}' not in filters, skipping",
+ file=sys.stderr,
+ )
  continue

  column_name = self._filters[filter_id]

  # Get unique values for this filter
  unique_values = (
- self._raw_data
- .select(pl.col(column_name))
+ self._raw_data.select(pl.col(column_name))
  .unique()
  .collect()
  .to_series()
  .to_list()
  )
- unique_values = sorted([v for v in unique_values if v is not None and v >= 0])
+ unique_values = sorted(
+ [v for v in unique_values if v is not None and v >= 0]
+ )

- print(f"[HEATMAP] Categorical filter '{filter_id}' ({column_name}): {len(unique_values)} unique values", file=sys.stderr)
+ print(
+ f"[HEATMAP] Categorical filter '{filter_id}' ({column_name}): {len(unique_values)} unique values",
+ file=sys.stderr,
+ )

- self._preprocessed_data['categorical_filter_values'][filter_id] = unique_values
+ self._preprocessed_data["categorical_filter_values"][filter_id] = (
+ unique_values
+ )

- # Create compression levels for each filter value
+ # Create compression levels for each filter value using cascading
  for filter_value in unique_values:
  # Filter data to this value
- filtered_data = self._raw_data.filter(pl.col(column_name) == filter_value)
+ filtered_data = self._raw_data.filter(
+ pl.col(column_name) == filter_value
+ )
  filtered_total = filtered_data.select(pl.len()).collect().item()

- # Compute level sizes for this filtered subset
- level_sizes = compute_compression_levels(self._min_points, filtered_total)
-
- print(f"[HEATMAP] Value {filter_value}: {filtered_total:,} pts → levels {level_sizes}", file=sys.stderr)
-
- # Store level sizes for this filter value
- self._preprocessed_data[f'cat_level_sizes_{filter_id}_{filter_value}'] = level_sizes
-
- # Build each compressed level
- for level_idx, target_size in enumerate(level_sizes):
- # If target size equals total, skip downsampling - use all data
- if target_size >= filtered_total:
- level = filtered_data
- elif self._use_simple_downsample:
- level = downsample_2d_simple(
- filtered_data,
- max_points=target_size,
- intensity_column=self._intensity_column,
- )
- else:
- level = downsample_2d_streaming(
- filtered_data,
- max_points=target_size,
- x_column=self._x_column,
- y_column=self._y_column,
- intensity_column=self._intensity_column,
- x_bins=self._x_bins,
- y_bins=self._y_bins,
- x_range=x_range,
- y_range=y_range,
- )
-
- # Sort by x, y for efficient range query predicate pushdown
- level = level.sort([self._x_column, self._y_column])
- # Store LazyFrame for streaming to disk
- level_key = f'cat_level_{filter_id}_{filter_value}_{level_idx}'
- self._preprocessed_data[level_key] = level # Keep lazy
-
- # Add full resolution as final level (for zoom fallback)
- # Also sorted for consistent predicate pushdown behavior
- num_compressed = len(level_sizes)
- full_res_key = f'cat_level_{filter_id}_{filter_value}_{num_compressed}'
- self._preprocessed_data[full_res_key] = filtered_data.sort(
- [self._x_column, self._y_column]
+ # Compute level sizes for this filtered subset (2× for cache buffer)
+ level_sizes = compute_compression_levels(
+ cache_target, filtered_total
  )
- self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = num_compressed + 1

- # Also create global levels for when no categorical filter is selected
- # (fallback to standard behavior)
- level_sizes = compute_compression_levels(self._min_points, total)
- self._preprocessed_data['level_sizes'] = level_sizes
-
- for i, size in enumerate(level_sizes):
- # If target size equals total, skip downsampling - use all data
- if size >= total:
- level = self._raw_data
- elif self._use_simple_downsample:
- level = downsample_2d_simple(
- self._raw_data,
- max_points=size,
- intensity_column=self._intensity_column,
+ print(
+ f"[HEATMAP] Value {filter_value}: {filtered_total:,} pts → levels {level_sizes}",
+ file=sys.stderr,
  )
- else:
- level = downsample_2d_streaming(
- self._raw_data,
- max_points=size,
- x_column=self._x_column,
- y_column=self._y_column,
- intensity_column=self._intensity_column,
- x_bins=self._x_bins,
- y_bins=self._y_bins,
+
+ # Store level sizes for this filter value
+ self._preprocessed_data[
+ f"cat_level_sizes_{filter_id}_{filter_value}"
+ ] = level_sizes
+
+ # Build cascading levels using helper
+ prefix = f"cat_level_{filter_id}_{filter_value}"
+ levels_result = self._build_cascading_levels(
+ source_data=filtered_data,
+ level_sizes=level_sizes,
  x_range=x_range,
  y_range=y_range,
+ cache_dir=cache_dir,
+ prefix=prefix,
  )
- # Sort by x, y for efficient range query predicate pushdown
- level = level.sort([self._x_column, self._y_column])
- self._preprocessed_data[f'level_{i}'] = level # Keep lazy

- # Add full resolution as final level (for zoom fallback)
- # Also sorted for consistent predicate pushdown behavior
- num_compressed = len(level_sizes)
- self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
- [self._x_column, self._y_column]
+ # Copy results to preprocessed_data
+ for key, value in levels_result.items():
+ if key == "num_levels":
+ self._preprocessed_data[
+ f"cat_num_levels_{filter_id}_{filter_value}"
+ ] = value
+ else:
+ self._preprocessed_data[key] = value
+
+ # Also create global levels for when no categorical filter is selected
+ # (fallback to standard behavior) - using cascading with 2× cache buffer
+ level_sizes = compute_compression_levels(cache_target, total)
+ self._preprocessed_data["level_sizes"] = level_sizes
+
+ # Build global cascading levels using helper
+ levels_result = self._build_cascading_levels(
+ source_data=self._raw_data,
+ level_sizes=level_sizes,
+ x_range=x_range,
+ y_range=y_range,
+ cache_dir=cache_dir,
+ prefix="level",
  )
- self._preprocessed_data['num_levels'] = num_compressed + 1
+
+ # Copy results to preprocessed_data
+ for key, value in levels_result.items():
+ if key == "num_levels":
+ self._preprocessed_data["num_levels"] = value
+ else:
+ self._preprocessed_data[key] = value
+
+ # Mark that files are already saved
+ self._preprocessed_data["_files_already_saved"] = True

  def _preprocess_streaming(self) -> None:
  """
- Streaming preprocessing - levels stay lazy through caching.
+ Streaming preprocessing with cascading - builds smaller levels from larger.
+
+ Uses cascading downsampling: each level is built from the previous larger
+ level rather than from raw data. This is more efficient (raw data read once)
+ and produces identical results because the downsampling algorithm keeps
+ the TOP N highest-intensity points per bin - points that survive at a larger
+ level will also be selected at smaller levels.
+
+ Levels are saved to disk immediately after creation, then read back as the
+ source for the next smaller level. This keeps memory low while enabling
+ cascading.

- Builds lazy query plans that are streamed to disk via sink_parquet().
  Data is sorted by x, y columns for efficient range query predicate pushdown.
  """
+ import sys
+
  # Get data ranges (minimal collect - just 4 values)
+ # These ranges are used for ALL levels to ensure consistent binning
  x_range, y_range = get_data_range(
  self._raw_data,
  self._x_column,
  self._y_column,
  )
- self._preprocessed_data['x_range'] = x_range
- self._preprocessed_data['y_range'] = y_range
+ self._preprocessed_data["x_range"] = x_range
+ self._preprocessed_data["y_range"] = y_range
+
+ # Compute optimal bins if not provided
+ # Cache at 2×min_points, use display_aspect_ratio for bin computation
+ cache_target = 2 * self._min_points
+ if self._x_bins is None or self._y_bins is None:
+ # Use display aspect ratio (not data aspect ratio) for optimal bins
+ # This ensures even distribution in the expected display dimensions
+ self._x_bins, self._y_bins = compute_optimal_bins(
+ cache_target,
+ (0, self._display_aspect_ratio), # Fake x_range matching aspect
+ (0, 1.0), # Fake y_range
+ )
+ print(
+ f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+ f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+ f"display aspect: {self._display_aspect_ratio:.2f})",
+ file=sys.stderr,
+ )

  # Get total count
  total = self._raw_data.select(pl.len()).collect().item()
- self._preprocessed_data['total'] = total
-
- # Compute target sizes for levels
- level_sizes = compute_compression_levels(self._min_points, total)
- self._preprocessed_data['level_sizes'] = level_sizes
-
- # Build and collect each level
- self._preprocessed_data['levels'] = []
+ self._preprocessed_data["total"] = total
+
+ # Compute target sizes for levels (use 2×min_points for smallest cache level)
+ level_sizes = compute_compression_levels(cache_target, total)
+ self._preprocessed_data["level_sizes"] = level_sizes
+
+ # Create cache directory for immediate level saving
+ cache_dir = self._cache_dir / "preprocessed"
+ cache_dir.mkdir(parents=True, exist_ok=True)
+
+ # Build cascading levels using helper
+ levels_result = self._build_cascading_levels(
+ source_data=self._raw_data,
+ level_sizes=level_sizes,
+ x_range=x_range,
+ y_range=y_range,
+ cache_dir=cache_dir,
+ prefix="level",
+ )

- for i, size in enumerate(level_sizes):
- # If target size equals total, skip downsampling - use all data
- if size >= total:
- level = self._raw_data
- elif self._use_simple_downsample:
- level = downsample_2d_simple(
- self._raw_data,
- max_points=size,
- intensity_column=self._intensity_column,
- )
+ # Copy results to preprocessed_data
+ for key, value in levels_result.items():
+ if key == "num_levels":
+ self._preprocessed_data["num_levels"] = value
  else:
- level = downsample_2d_streaming(
- self._raw_data,
- max_points=size,
- x_column=self._x_column,
- y_column=self._y_column,
- intensity_column=self._intensity_column,
- x_bins=self._x_bins,
- y_bins=self._y_bins,
- x_range=x_range,
- y_range=y_range,
- )
- # Sort by x, y for efficient range query predicate pushdown
- # This clusters spatially close points together in row groups
- level = level.sort([self._x_column, self._y_column])
- # Store LazyFrame for streaming to disk
- # Base class will use sink_parquet() to stream without full materialization
- self._preprocessed_data[f'level_{i}'] = level # Keep lazy
-
- # Add full resolution as final level (for zoom fallback)
- # Also sorted for consistent predicate pushdown behavior
- num_compressed = len(level_sizes)
- self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
- [self._x_column, self._y_column]
- )
+ self._preprocessed_data[key] = value

- # Store number of levels for reconstruction (includes full resolution)
- self._preprocessed_data['num_levels'] = num_compressed + 1
+ # Mark that files are already saved (base class should skip saving)
+ self._preprocessed_data["_files_already_saved"] = True

  def _preprocess_eager(self) -> None:
  """
@@ -444,22 +602,41 @@ class Heatmap(BaseComponent):
  downsampling for better spatial distribution.
  Data is sorted by x, y columns for efficient range query predicate pushdown.
  """
+ import sys
+
  # Get data ranges
  x_range, y_range = get_data_range(
  self._raw_data,
  self._x_column,
  self._y_column,
  )
- self._preprocessed_data['x_range'] = x_range
- self._preprocessed_data['y_range'] = y_range
+ self._preprocessed_data["x_range"] = x_range
+ self._preprocessed_data["y_range"] = y_range
+
+ # Compute optimal bins if not provided
+ # Cache at 2×min_points, use display_aspect_ratio for bin computation
+ cache_target = 2 * self._min_points
+ if self._x_bins is None or self._y_bins is None:
+ # Use display aspect ratio (not data aspect ratio) for optimal bins
+ self._x_bins, self._y_bins = compute_optimal_bins(
+ cache_target,
+ (0, self._display_aspect_ratio), # Fake x_range matching aspect
+ (0, 1.0), # Fake y_range
+ )
+ print(
+ f"[HEATMAP] Auto-computed bins: {self._x_bins}x{self._y_bins} "
+ f"= {self._x_bins * self._y_bins:,} (cache target: {cache_target:,}, "
+ f"display aspect: {self._display_aspect_ratio:.2f})",
+ file=sys.stderr,
+ )

  # Get total count
  total = self._raw_data.select(pl.len()).collect().item()
- self._preprocessed_data['total'] = total
+ self._preprocessed_data["total"] = total

- # Compute compression level target sizes
- level_sizes = compute_compression_levels(self._min_points, total)
- self._preprocessed_data['level_sizes'] = level_sizes
+ # Compute compression level target sizes (2× for cache buffer)
+ level_sizes = compute_compression_levels(cache_target, total)
+ self._preprocessed_data["level_sizes"] = level_sizes

  # Build levels from largest to smallest
  if level_sizes:
@@ -493,21 +670,23 @@ class Heatmap(BaseComponent):
  # Store LazyFrame for streaming to disk
  level_idx = len(level_sizes) - 1 - i
  if isinstance(downsampled, pl.LazyFrame):
- self._preprocessed_data[f'level_{level_idx}'] = downsampled # Keep lazy
+ self._preprocessed_data[f"level_{level_idx}"] = (
+ downsampled # Keep lazy
+ )
  else:
  # DataFrame from downsample_2d - convert back to lazy
- self._preprocessed_data[f'level_{level_idx}'] = downsampled.lazy()
+ self._preprocessed_data[f"level_{level_idx}"] = downsampled.lazy()
  current = downsampled

  # Add full resolution as final level (for zoom fallback)
  # Also sorted for consistent predicate pushdown behavior
  num_compressed = len(level_sizes)
- self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
+ self._preprocessed_data[f"level_{num_compressed}"] = self._raw_data.sort(
  [self._x_column, self._y_column]
  )

  # Store number of levels for reconstruction (includes full resolution)
- self._preprocessed_data['num_levels'] = num_compressed + 1
+ self._preprocessed_data["num_levels"] = num_compressed + 1

  def _get_levels(self) -> list:
  """
@@ -516,11 +695,11 @@ class Heatmap(BaseComponent):
  Reconstructs the levels list from preprocessed data,
  adding full resolution at the end.
  """
- num_levels = self._preprocessed_data.get('num_levels', 0)
+ num_levels = self._preprocessed_data.get("num_levels", 0)
  levels = []

  for i in range(num_levels):
- level_data = self._preprocessed_data.get(f'level_{i}')
+ level_data = self._preprocessed_data.get(f"level_{i}")
  if level_data is not None:
  levels.append(level_data)

@@ -543,7 +722,7 @@ class Heatmap(BaseComponent):
  Returns ([], None) if no categorical levels exist for this filter
  """
  # Check if we have categorical levels for this filter/value
- num_levels_key = f'cat_num_levels_{filter_id}_{filter_value}'
+ num_levels_key = f"cat_num_levels_{filter_id}_{filter_value}"
  num_levels = self._preprocessed_data.get(num_levels_key, 0)

  if num_levels == 0:
@@ -551,14 +730,16 @@ class Heatmap(BaseComponent):

  levels = []
  for i in range(num_levels):
- level_key = f'cat_level_{filter_id}_{filter_value}_{i}'
+ level_key = f"cat_level_{filter_id}_{filter_value}_{i}"
  level_data = self._preprocessed_data.get(level_key)
  if level_data is not None:
  levels.append(level_data)

  return levels, None # Full resolution included in cached levels

- def _get_levels_for_state(self, state: Dict[str, Any]) -> Tuple[list, Optional[pl.LazyFrame]]:
+ def _get_levels_for_state(
+ self, state: Dict[str, Any]
+ ) -> Tuple[list, Optional[pl.LazyFrame]]:
  """
  Get appropriate compression levels based on current filter state.

@@ -573,8 +754,10 @@ class Heatmap(BaseComponent):
  Tuple of (levels list, raw data for full resolution)
  """
  # Check if we have categorical filters and a selected value
- if self._preprocessed_data.get('has_categorical_filters'):
- cat_filter_values = self._preprocessed_data.get('categorical_filter_values', {})
+ if self._preprocessed_data.get("has_categorical_filters"):
+ cat_filter_values = self._preprocessed_data.get(
+ "categorical_filter_values", {}
+ )

  for filter_id in self._categorical_filters:
  if filter_id not in cat_filter_values:
@@ -590,7 +773,9 @@ class Heatmap(BaseComponent):

  # Check if this value has per-filter levels
  if selected_value in cat_filter_values[filter_id]:
- levels, filtered_raw = self._get_categorical_levels(filter_id, selected_value)
+ levels, filtered_raw = self._get_categorical_levels(
+ filter_id, selected_value
+ )
  if levels:
  return levels, filtered_raw

@@ -599,22 +784,19 @@ class Heatmap(BaseComponent):

  def _get_vue_component_name(self) -> str:
  """Return the Vue component name."""
- return 'PlotlyHeatmap'
+ return "PlotlyHeatmap"

  def _get_data_key(self) -> str:
  """Return the key used to send primary data to Vue."""
- return 'heatmapData'
+ return "heatmapData"

  def _is_no_zoom(self, zoom: Optional[Dict[str, Any]]) -> bool:
  """Check if zoom state represents no zoom (full view)."""
  if zoom is None:
  return True
- x_range = zoom.get('xRange', [-1, -1])
- y_range = zoom.get('yRange', [-1, -1])
- return (
- x_range[0] < 0 and x_range[1] < 0 and
- y_range[0] < 0 and y_range[1] < 0
- )
+ x_range = zoom.get("xRange", [-1, -1])
+ y_range = zoom.get("yRange", [-1, -1])
+ return x_range[0] < 0 and x_range[1] < 0 and y_range[0] < 0 and y_range[1] < 0

  def _select_level_for_zoom(
  self,
@@ -641,8 +823,9 @@ class Heatmap(BaseComponent):
  Filtered Polars DataFrame at appropriate resolution
  """
  import sys
- x0, x1 = zoom['xRange']
- y0, y1 = zoom['yRange']
+
+ x0, x1 = zoom["xRange"]
+ y0, y1 = zoom["yRange"]

  # Add raw data as final level if available
  all_levels = list(levels)
@@ -658,10 +841,10 @@ class Heatmap(BaseComponent):

  # Filter to zoom range
  filtered_lazy = level_data.filter(
- (pl.col(self._x_column) >= x0) &
- (pl.col(self._x_column) <= x1) &
- (pl.col(self._y_column) >= y0) &
- (pl.col(self._y_column) <= y1)
+ (pl.col(self._x_column) >= x0)
+ & (pl.col(self._x_column) <= x1)
+ & (pl.col(self._y_column) >= y0)
+ & (pl.col(self._y_column) <= y1)
  )

  # Apply non-categorical filters if any
@@ -680,15 +863,26 @@ class Heatmap(BaseComponent):

  count = len(filtered)
  last_filtered = filtered
- print(f"[HEATMAP] Level {level_idx}: {count} pts in zoom range", file=sys.stderr)
+ print(
+ f"[HEATMAP] Level {level_idx}: {count} pts in zoom range",
+ file=sys.stderr,
+ )

  if count >= self._min_points:
  # This level has enough detail
  if count > self._min_points:
- # Over limit - downsample to stay at/under max
- # Use ZOOM range for binning (not global) to avoid sparse bins
+ # Over limit - downsample to exactly min_points
+ # Compute optimal bins from ACTUAL zoom region aspect ratio
  zoom_x_range = (x0, x1)
  zoom_y_range = (y0, y1)
+ render_x_bins, render_y_bins = compute_optimal_bins(
+ self._min_points, zoom_x_range, zoom_y_range
+ )
+ print(
+ f"[HEATMAP] Render downsample: {count:,} → {self._min_points:,} pts "
+ f"(bins: {render_x_bins}x{render_y_bins})",
+ file=sys.stderr,
+ )
  if self._use_streaming or self._use_simple_downsample:
  if self._use_simple_downsample:
  return downsample_2d_simple(
@@ -703,8 +897,8 @@ class Heatmap(BaseComponent):
  x_column=self._x_column,
  y_column=self._y_column,
  intensity_column=self._intensity_column,
- x_bins=self._x_bins,
- y_bins=self._y_bins,
+ x_bins=render_x_bins,
+ y_bins=render_y_bins,
  x_range=zoom_x_range,
  y_range=zoom_y_range,
  ).collect()
@@ -715,8 +909,8 @@ class Heatmap(BaseComponent):
  x_column=self._x_column,
  y_column=self._y_column,
  intensity_column=self._intensity_column,
- x_bins=self._x_bins,
- y_bins=self._y_bins,
+ x_bins=render_x_bins,
+ y_bins=render_y_bins,
  ).collect()
  return filtered
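The loop these hunks touch scans levels from coarsest to finest and returns the first one with at least min_points inside the zoom window, trimming any overshoot back to exactly min_points with bins matched to the zoom region. Its control flow, distilled into a hypothetical standalone form:

```python
from typing import Callable, Optional, Sequence, TypeVar

L = TypeVar("L")

def select_level(
    levels: Sequence[L],
    count_in_zoom: Callable[[L], int],
    min_points: int,
    downsample: Callable[[L, int], L],
) -> Optional[L]:
    """Walk levels coarse → fine; return the first with enough points
    in the zoom window, downsampling any overshoot to min_points."""
    last = None
    for level in levels:
        n = count_in_zoom(level)
        last = level
        if n >= min_points:
            return downsample(level, min_points) if n > min_points else level
    return last  # even the finest level is sparse here; return what we have
```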
@@ -740,6 +934,7 @@ class Heatmap(BaseComponent):
  Dict with heatmapData (pandas DataFrame) and _hash for change detection
  """
  import sys
+
  zoom = state.get(self._zoom_identifier)

  # Build columns to select
@@ -761,7 +956,9 @@ class Heatmap(BaseComponent):

  # Get levels based on current state (may use per-filter levels)
  levels, filtered_raw = self._get_levels_for_state(state)
- level_sizes = [len(l) if isinstance(l, pl.DataFrame) else '?' for l in levels]
+ level_sizes = [
+ len(lvl) if isinstance(lvl, pl.DataFrame) else "?" for lvl in levels
+ ]

  # Determine which filters still need to be applied at render time
  # (filters not in categorical_filters need runtime application)
@@ -775,12 +972,15 @@ class Heatmap(BaseComponent):
  # No zoom - use smallest level
  if not levels:
  # No levels available
- print(f"[HEATMAP] No levels available", file=sys.stderr)
- return {'heatmapData': pl.DataFrame().to_pandas(), '_hash': ''}
+ print("[HEATMAP] No levels available", file=sys.stderr)
+ return {"heatmapData": pl.DataFrame().to_pandas(), "_hash": ""}

  data = levels[0]
- using_cat = self._preprocessed_data.get('has_categorical_filters', False)
- print(f"[HEATMAP] No zoom → level 0 ({level_sizes[0]} pts), levels={level_sizes}, categorical={using_cat}", file=sys.stderr)
+ using_cat = self._preprocessed_data.get("has_categorical_filters", False)
+ print(
+ f"[HEATMAP] No zoom → level 0 ({level_sizes[0]} pts), levels={level_sizes}, categorical={using_cat}",
+ file=sys.stderr,
+ )

  # Ensure we have a LazyFrame
  if isinstance(data, pl.DataFrame):
@@ -796,7 +996,9 @@ class Heatmap(BaseComponent):
  filter_defaults=self._filter_defaults,
  )
  # Sort by intensity ascending so high-intensity points are drawn on top
- df_pandas = df_pandas.sort_values(self._intensity_column).reset_index(drop=True)
+ df_pandas = df_pandas.sort_values(self._intensity_column).reset_index(
+ drop=True
+ )
  else:
  # No filters to apply - levels already filtered by categorical filter
  schema_names = data.collect_schema().names()
@@ -817,13 +1019,16 @@ class Heatmap(BaseComponent):
  df_polars = df_polars.select(available_cols)
  # Sort by intensity ascending so high-intensity points are drawn on top
  df_polars = df_polars.sort(self._intensity_column)
- print(f"[HEATMAP] Selected {len(df_polars)} pts for zoom, levels={level_sizes}", file=sys.stderr)
+ print(
+ f"[HEATMAP] Selected {len(df_polars)} pts for zoom, levels={level_sizes}",
+ file=sys.stderr,
+ )
  data_hash = compute_dataframe_hash(df_polars)
  df_pandas = df_polars.to_pandas()

  return {
- 'heatmapData': df_pandas,
- '_hash': data_hash,
+ "heatmapData": df_pandas,
+ "_hash": data_hash,
  }

  def _get_component_args(self) -> Dict[str, Any]:
@@ -834,19 +1039,19 @@ class Heatmap(BaseComponent):
  Dict with all heatmap configuration for Vue
  """
  args: Dict[str, Any] = {
- 'componentType': self._get_vue_component_name(),
- 'xColumn': self._x_column,
- 'yColumn': self._y_column,
- 'intensityColumn': self._intensity_column,
- 'xLabel': self._x_label,
- 'yLabel': self._y_label,
- 'colorscale': self._colorscale,
- 'zoomIdentifier': self._zoom_identifier,
- 'interactivity': self._interactivity,
+ "componentType": self._get_vue_component_name(),
+ "xColumn": self._x_column,
+ "yColumn": self._y_column,
+ "intensityColumn": self._intensity_column,
+ "xLabel": self._x_label,
+ "yLabel": self._y_label,
+ "colorscale": self._colorscale,
+ "zoomIdentifier": self._zoom_identifier,
+ "interactivity": self._interactivity,
  }

  if self._title:
- args['title'] = self._title
+ args["title"] = self._title

  # Add any extra config options
  args.update(self._config)
@@ -858,7 +1063,7 @@ class Heatmap(BaseComponent):
  colorscale: Optional[str] = None,
  x_label: Optional[str] = None,
  y_label: Optional[str] = None,
- ) -> 'Heatmap':
+ ) -> "Heatmap":
  """
  Update heatmap styling.