openms-insight 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -229,6 +229,8 @@ class Heatmap(BaseComponent):
  render time, the resulting data has ~min_points regardless of the
  filter value selected.
 
+ Data is sorted by x, y columns for efficient range query predicate pushdown.
+
  Example: For im_dimension with values [0, 1, 2, 3], creates:
  - cat_level_im_dimension_0_0: 20K points with im_id=0
  - cat_level_im_dimension_0_1: 20K points with im_id=1
@@ -314,14 +316,19 @@ class Heatmap(BaseComponent):
  y_range=y_range,
  )
 
+ # Sort by x, y for efficient range query predicate pushdown
+ level = level.sort([self._x_column, self._y_column])
  # Store LazyFrame for streaming to disk
  level_key = f'cat_level_{filter_id}_{filter_value}_{level_idx}'
  self._preprocessed_data[level_key] = level # Keep lazy
 
  # Add full resolution as final level (for zoom fallback)
+ # Also sorted for consistent predicate pushdown behavior
  num_compressed = len(level_sizes)
  full_res_key = f'cat_level_{filter_id}_{filter_value}_{num_compressed}'
- self._preprocessed_data[full_res_key] = filtered_data
+ self._preprocessed_data[full_res_key] = filtered_data.sort(
+     [self._x_column, self._y_column]
+ )
  self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = num_compressed + 1
 
  # Also create global levels for when no categorical filter is selected
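The pattern in this hunk (and the ones that follow) is: build a downsampled level, sort it by the x and y columns, and keep it lazy so the base class can stream it to Parquet. A minimal sketch of that write path, assuming Polars and using placeholder column names, file name, and row group size that are not taken from the package:

```python
# Illustrative sketch (not from openms-insight): sorting a level before
# streaming it to Parquet. "rt"/"mz", the file name, and row_group_size
# are placeholders.
import polars as pl

level = pl.LazyFrame({
    "rt": [12.1, 3.4, 55.0, 20.2],       # x-axis column (example data)
    "mz": [500.1, 220.3, 780.9, 410.0],  # y-axis column (example data)
    "intensity": [1e4, 2e3, 5e5, 8e2],
})

# Sorting clusters nearby x/y values into the same Parquet row groups,
# so each group's min/max statistics on the sort keys become tight.
(
    level.sort(["rt", "mz"])
         .sink_parquet("level_0.parquet", compression="zstd", row_group_size=50_000)
)
```

Because the file is sorted, later range filters on the x/y columns can be answered by reading only the row groups whose statistics overlap the requested ranges.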
@@ -351,11 +358,16 @@ class Heatmap(BaseComponent):
  x_range=x_range,
  y_range=y_range,
  )
+ # Sort by x, y for efficient range query predicate pushdown
+ level = level.sort([self._x_column, self._y_column])
  self._preprocessed_data[f'level_{i}'] = level # Keep lazy
 
  # Add full resolution as final level (for zoom fallback)
+ # Also sorted for consistent predicate pushdown behavior
  num_compressed = len(level_sizes)
- self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+ self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
+     [self._x_column, self._y_column]
+ )
  self._preprocessed_data['num_levels'] = num_compressed + 1
 
  def _preprocess_streaming(self) -> None:
@@ -363,6 +375,7 @@ class Heatmap(BaseComponent):
  Streaming preprocessing - levels stay lazy through caching.
 
  Builds lazy query plans that are streamed to disk via sink_parquet().
+ Data is sorted by x, y columns for efficient range query predicate pushdown.
  """
  # Get data ranges (minimal collect - just 4 values)
  x_range, y_range = get_data_range(
@@ -406,13 +419,19 @@ class Heatmap(BaseComponent):
  x_range=x_range,
  y_range=y_range,
  )
+ # Sort by x, y for efficient range query predicate pushdown
+ # This clusters spatially close points together in row groups
+ level = level.sort([self._x_column, self._y_column])
  # Store LazyFrame for streaming to disk
  # Base class will use sink_parquet() to stream without full materialization
  self._preprocessed_data[f'level_{i}'] = level # Keep lazy
 
  # Add full resolution as final level (for zoom fallback)
+ # Also sorted for consistent predicate pushdown behavior
  num_compressed = len(level_sizes)
- self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+ self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
+     [self._x_column, self._y_column]
+ )
 
  # Store number of levels for reconstruction (includes full resolution)
  self._preprocessed_data['num_levels'] = num_compressed + 1
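On the read side, the benefit shows up when a viewport query filters on the same x/y columns: scan the level lazily and let the Parquet row-group statistics skip groups that fall outside the requested ranges. A sketch under the same assumptions as above (hypothetical bounds, placeholder column names, reading the file from the earlier write-path sketch):

```python
# Illustrative read path (not from openms-insight): a viewport range query
# over the sorted level. Row groups whose rt/mz statistics fall outside the
# requested ranges can be skipped via predicate pushdown.
import polars as pl

x_min, x_max = 10.0, 30.0     # hypothetical viewport bounds
y_min, y_max = 400.0, 600.0

viewport = (
    pl.scan_parquet("level_0.parquet")
      .filter(
          pl.col("rt").is_between(x_min, x_max)
          & pl.col("mz").is_between(y_min, y_max)
      )
      .collect()
)
```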
@@ -423,6 +442,7 @@ class Heatmap(BaseComponent):
 
  Uses more memory at init but faster rendering. Uses scipy-based
  downsampling for better spatial distribution.
+ Data is sorted by x, y columns for efficient range query predicate pushdown.
  """
  # Get data ranges
  x_range, y_range = get_data_range(
@@ -465,6 +485,11 @@ class Heatmap(BaseComponent):
  x_bins=self._x_bins,
  y_bins=self._y_bins,
  )
+ # Sort by x, y for efficient range query predicate pushdown
+ if isinstance(downsampled, pl.LazyFrame):
+     downsampled = downsampled.sort([self._x_column, self._y_column])
+ else:
+     downsampled = downsampled.sort([self._x_column, self._y_column])
  # Store LazyFrame for streaming to disk
  level_idx = len(level_sizes) - 1 - i
  if isinstance(downsampled, pl.LazyFrame):
@@ -475,8 +500,11 @@ class Heatmap(BaseComponent):
  current = downsampled
 
  # Add full resolution as final level (for zoom fallback)
+ # Also sorted for consistent predicate pushdown behavior
  num_compressed = len(level_sizes)
- self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+ self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
+     [self._x_column, self._y_column]
+ )
 
  # Store number of levels for reconstruction (includes full resolution)
  self._preprocessed_data['num_levels'] = num_compressed + 1
@@ -15,7 +15,8 @@ if TYPE_CHECKING:
 
  # Cache format version - increment when cache structure changes
  # Version 2: Added sorting by filter columns + smaller row groups for predicate pushdown
- CACHE_VERSION = 2
+ # Version 3: Downcast numeric types (Int64→Int32, Float64→Float32) for efficient transfer
+ CACHE_VERSION = 3
 
 
  class BaseComponent(ABC):
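The CACHE_VERSION bump to 3 means caches written by 0.1.1 (version 2, without the numeric downcasting below) are treated as stale and rebuilt. The validation code itself is not part of this diff; a hypothetical sketch of how such a check might look, assuming the manifest carries a version field (file and key names are placeholders):

```python
# Hypothetical sketch only; the actual cache-validation logic in
# BaseComponent is not shown in this diff.
import json
from pathlib import Path

CACHE_VERSION = 3  # value after this change

def cache_is_current(cache_dir: Path) -> bool:
    manifest_path = cache_dir / "manifest.json"   # assumed file name
    if not manifest_path.exists():
        return False
    try:
        manifest = json.loads(manifest_path.read_text())
    except json.JSONDecodeError:
        return False
    # Anything cached under version 2 (or missing the field) gets rebuilt.
    return manifest.get("cache_version") == CACHE_VERSION  # assumed key name
```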
@@ -236,6 +237,8 @@ class BaseComponent(ABC):
 
  def _save_to_cache(self) -> None:
      """Save preprocessed data to cache."""
+     from ..preprocessing.filtering import optimize_for_transfer, optimize_for_transfer_lazy
+
      # Create directories
      self._cache_dir.mkdir(parents=True, exist_ok=True)
      preprocessed_dir = self._get_preprocessed_dir()
@@ -254,17 +257,23 @@ class BaseComponent(ABC):
  "data_values": {},
  }
 
- # Save preprocessed data - stream LazyFrames directly to disk
+ # Save preprocessed data with type optimization for efficient transfer
+ # Float64→Float32 reduces Arrow payload size
+ # Int64→Int32 (when safe) avoids BigInt overhead in JavaScript
  for key, value in self._preprocessed_data.items():
      if isinstance(value, pl.LazyFrame):
          filename = f"{key}.parquet"
          filepath = preprocessed_dir / filename
-         # Stream directly to disk without full materialization
+         # Apply streaming-safe optimization (Float64→Float32 only)
+         # Int64 bounds checking would require collect(), breaking streaming
+         value = optimize_for_transfer_lazy(value)
          value.sink_parquet(filepath, compression='zstd')
          manifest["data_files"][key] = filename
      elif isinstance(value, pl.DataFrame):
          filename = f"{key}.parquet"
          filepath = preprocessed_dir / filename
+         # Full optimization including Int64→Int32 with bounds checking
+         value = optimize_for_transfer(value)
          value.write_parquet(filepath, compression='zstd')
          manifest["data_files"][key] = filename
      elif self._is_json_serializable(value):
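optimize_for_transfer and optimize_for_transfer_lazy are imported from ..preprocessing.filtering, but their bodies are not part of this diff. Based only on the comments above, they might look roughly like the following sketch; the actual implementations may differ:

```python
# Hypothetical sketches of the two helpers, inferred from the diff comments;
# not the package's real code.
import polars as pl

INT32_MIN, INT32_MAX = -(2**31), 2**31 - 1

def optimize_for_transfer_lazy(lf: pl.LazyFrame) -> pl.LazyFrame:
    # Streaming-safe: Float64 -> Float32 needs no bounds check, so the plan
    # stays lazy and can still be streamed with sink_parquet().
    return lf.with_columns(pl.col(pl.Float64).cast(pl.Float32))

def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
    # Eager variant: also downcast Int64 columns, but only when every value
    # fits in Int32 (avoids BigInt handling in JavaScript consumers).
    df = df.with_columns(pl.col(pl.Float64).cast(pl.Float32))
    for name, dtype in df.schema.items():
        if dtype == pl.Int64:
            s = df.get_column(name)
            lo, hi = s.min(), s.max()
            if lo is not None and lo >= INT32_MIN and hi <= INT32_MAX:
                df = df.with_columns(s.cast(pl.Int32))
    return df
```

The split between a lazy and an eager variant follows directly from the comments in the hunk: the Int64 bounds check needs the actual min/max values, which would force a collect() on a LazyFrame, so the streaming path only performs the float downcast.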