openms-insight 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openms_insight/components/heatmap.py +32 -4
- openms_insight/core/base.py +12 -3
- openms_insight/js-component/dist/assets/index.js +90 -90
- openms_insight/preprocessing/filtering.py +84 -0
- {openms_insight-0.1.1.dist-info → openms_insight-0.1.2.dist-info}/METADATA +1 -1
- {openms_insight-0.1.1.dist-info → openms_insight-0.1.2.dist-info}/RECORD +8 -8
- {openms_insight-0.1.1.dist-info → openms_insight-0.1.2.dist-info}/WHEEL +0 -0
- {openms_insight-0.1.1.dist-info → openms_insight-0.1.2.dist-info}/licenses/LICENSE +0 -0
openms_insight/components/heatmap.py
CHANGED

@@ -229,6 +229,8 @@ class Heatmap(BaseComponent):
         render time, the resulting data has ~min_points regardless of the
         filter value selected.

+        Data is sorted by x, y columns for efficient range query predicate pushdown.
+
         Example: For im_dimension with values [0, 1, 2, 3], creates:
         - cat_level_im_dimension_0_0: 20K points with im_id=0
         - cat_level_im_dimension_0_1: 20K points with im_id=1
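
The "range query predicate pushdown" the new docstring refers to is Polars skipping Parquet row groups whose min/max statistics fall outside a requested window; sorting by the x and y columns keeps those statistics tight. A minimal sketch of the kind of viewport query this is meant to accelerate (the cache path and the `rt`/`mz` column names are illustrative assumptions, not taken from the package):

```python
import polars as pl

# Hypothetical cached level file produced by the preprocessing step.
# Because rows are sorted by (x, y), each Parquet row group covers a narrow,
# contiguous slice of x values, so its min/max statistics are tight.
lazy = pl.scan_parquet("cache/preprocessed/level_0.parquet")

# A viewport query: Polars pushes these predicates into the Parquet reader,
# which can skip entire row groups outside the requested window instead of
# decoding them.
viewport = (
    lazy
    .filter(pl.col("rt").is_between(120.0, 180.0))   # assumed x column
    .filter(pl.col("mz").is_between(400.0, 600.0))   # assumed y column
    .collect()
)
```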
@@ -314,14 +316,19 @@ class Heatmap(BaseComponent):
                     y_range=y_range,
                 )

+                # Sort by x, y for efficient range query predicate pushdown
+                level = level.sort([self._x_column, self._y_column])
                 # Store LazyFrame for streaming to disk
                 level_key = f'cat_level_{filter_id}_{filter_value}_{level_idx}'
                 self._preprocessed_data[level_key] = level  # Keep lazy

             # Add full resolution as final level (for zoom fallback)
+            # Also sorted for consistent predicate pushdown behavior
             num_compressed = len(level_sizes)
             full_res_key = f'cat_level_{filter_id}_{filter_value}_{num_compressed}'
-            self._preprocessed_data[full_res_key] = filtered_data
+            self._preprocessed_data[full_res_key] = filtered_data.sort(
+                [self._x_column, self._y_column]
+            )
             self._preprocessed_data[f'cat_num_levels_{filter_id}_{filter_value}'] = num_compressed + 1

         # Also create global levels for when no categorical filter is selected
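
Note that `sort()` here is applied to a LazyFrame, so it only extends the query plan; nothing is materialized until the base class streams the level to disk (see `_save_to_cache` below). A rough sketch of that pattern under the same assumptions as above (illustrative paths and column names; the `row_group_size` value is likewise an assumption, echoing the "smaller row groups" note in base.py):

```python
import polars as pl

level = pl.scan_parquet("cache/raw_points.parquet")   # hypothetical input

# Adding sort() to a LazyFrame extends the query plan; no data is loaded yet.
level = level.sort(["rt", "mz"])

# Streaming the plan to disk avoids materializing the level in memory.
# Smaller row groups give tighter per-group statistics for later range queries.
level.sink_parquet(
    "cache/preprocessed/cat_level_im_dimension_0_0.parquet",
    compression="zstd",
    row_group_size=50_000,   # illustrative value
)
```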
@@ -351,11 +358,16 @@ class Heatmap(BaseComponent):
                 x_range=x_range,
                 y_range=y_range,
             )
+            # Sort by x, y for efficient range query predicate pushdown
+            level = level.sort([self._x_column, self._y_column])
             self._preprocessed_data[f'level_{i}'] = level  # Keep lazy

         # Add full resolution as final level (for zoom fallback)
+        # Also sorted for consistent predicate pushdown behavior
         num_compressed = len(level_sizes)
-        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
+            [self._x_column, self._y_column]
+        )
         self._preprocessed_data['num_levels'] = num_compressed + 1

     def _preprocess_streaming(self) -> None:
@@ -363,6 +375,7 @@ class Heatmap(BaseComponent):
         Streaming preprocessing - levels stay lazy through caching.

         Builds lazy query plans that are streamed to disk via sink_parquet().
+        Data is sorted by x, y columns for efficient range query predicate pushdown.
         """
         # Get data ranges (minimal collect - just 4 values)
         x_range, y_range = get_data_range(
@@ -406,13 +419,19 @@ class Heatmap(BaseComponent):
                 x_range=x_range,
                 y_range=y_range,
             )
+            # Sort by x, y for efficient range query predicate pushdown
+            # This clusters spatially close points together in row groups
+            level = level.sort([self._x_column, self._y_column])
             # Store LazyFrame for streaming to disk
             # Base class will use sink_parquet() to stream without full materialization
             self._preprocessed_data[f'level_{i}'] = level  # Keep lazy

         # Add full resolution as final level (for zoom fallback)
+        # Also sorted for consistent predicate pushdown behavior
         num_compressed = len(level_sizes)
-        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
+            [self._x_column, self._y_column]
+        )

         # Store number of levels for reconstruction (includes full resolution)
         self._preprocessed_data['num_levels'] = num_compressed + 1
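
Whether the range predicates actually reach the scan can be checked on the optimized query plan. A small sketch, again with assumed paths and column names; if pushdown applies, the predicates show up attached to the Parquet scan node rather than as a separate filter step after it:

```python
import polars as pl

query = (
    pl.scan_parquet("cache/preprocessed/level_2.parquet")   # hypothetical path
    .filter(pl.col("rt").is_between(120.0, 180.0))          # assumed x column
    .filter(pl.col("mz").is_between(400.0, 600.0))          # assumed y column
)

# Print the optimized plan; the scan node should carry the range selection.
print(query.explain())
```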
@@ -423,6 +442,7 @@ class Heatmap(BaseComponent):

         Uses more memory at init but faster rendering. Uses scipy-based
         downsampling for better spatial distribution.
+        Data is sorted by x, y columns for efficient range query predicate pushdown.
         """
         # Get data ranges
         x_range, y_range = get_data_range(
@@ -465,6 +485,11 @@ class Heatmap(BaseComponent):
                 x_bins=self._x_bins,
                 y_bins=self._y_bins,
             )
+            # Sort by x, y for efficient range query predicate pushdown
+            if isinstance(downsampled, pl.LazyFrame):
+                downsampled = downsampled.sort([self._x_column, self._y_column])
+            else:
+                downsampled = downsampled.sort([self._x_column, self._y_column])
             # Store LazyFrame for streaming to disk
             level_idx = len(level_sizes) - 1 - i
             if isinstance(downsampled, pl.LazyFrame):
@@ -475,8 +500,11 @@ class Heatmap(BaseComponent):
             current = downsampled

         # Add full resolution as final level (for zoom fallback)
+        # Also sorted for consistent predicate pushdown behavior
         num_compressed = len(level_sizes)
-        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data
+        self._preprocessed_data[f'level_{num_compressed}'] = self._raw_data.sort(
+            [self._x_column, self._y_column]
+        )

         # Store number of levels for reconstruction (includes full resolution)
         self._preprocessed_data['num_levels'] = num_compressed + 1
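
The heatmap changes leave the pyramid layout itself untouched: levels 0..num_compressed-1 hold downsampled data and the final level is the full-resolution zoom fallback, with `num_levels = num_compressed + 1`. A hypothetical consumer-side sketch of reading levels back from the cache (file layout, column name, level ordering, and ranges are illustrative assumptions):

```python
import polars as pl
from pathlib import Path

cache = Path("cache/preprocessed")   # hypothetical cache layout
num_levels = 5                       # e.g. read back from the stored 'num_levels' entry

def load_level(idx: int, x_min: float, x_max: float) -> pl.DataFrame:
    """Load one pyramid level restricted to the visible x window."""
    return (
        pl.scan_parquet(cache / f"level_{idx}.parquet")
        .filter(pl.col("rt").is_between(x_min, x_max))   # assumed x column
        .collect()
    )

# A downsampled level for an overview render (level 0 assumed coarsest here);
# the last index (num_levels - 1) is the full-resolution zoom fallback.
overview = load_level(0, 0.0, 3000.0)
zoomed_in = load_level(num_levels - 1, 1200.0, 1210.0)
```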
openms_insight/core/base.py
CHANGED

@@ -15,7 +15,8 @@ if TYPE_CHECKING:

 # Cache format version - increment when cache structure changes
 # Version 2: Added sorting by filter columns + smaller row groups for predicate pushdown
-CACHE_VERSION = 2
+# Version 3: Downcast numeric types (Int64→Int32, Float64→Float32) for efficient transfer
+CACHE_VERSION = 3


 class BaseComponent(ABC):
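
The constant only has an effect where stale caches are detected and rebuilt; that check is not part of this diff, so the following is a purely illustrative sketch of the usual pattern (the manifest file name, field name, and helper are assumptions, not the package's code):

```python
import json
from pathlib import Path

CACHE_VERSION = 3   # bumped in 0.1.2 for the downcast cache format

def cache_is_current(cache_dir: Path) -> bool:
    """Illustrative check: a cache written under a different format version is stale."""
    manifest_path = cache_dir / "manifest.json"   # hypothetical file name
    if not manifest_path.exists():
        return False
    manifest = json.loads(manifest_path.read_text())
    return manifest.get("cache_version") == CACHE_VERSION

# A component would re-run preprocessing when this returns False, which is why
# any change to the on-disk cache layout has to bump CACHE_VERSION.
```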
@@ -236,6 +237,8 @@ class BaseComponent(ABC):

     def _save_to_cache(self) -> None:
         """Save preprocessed data to cache."""
+        from ..preprocessing.filtering import optimize_for_transfer, optimize_for_transfer_lazy
+
         # Create directories
         self._cache_dir.mkdir(parents=True, exist_ok=True)
         preprocessed_dir = self._get_preprocessed_dir()
@@ -254,17 +257,23 @@ class BaseComponent(ABC):
             "data_values": {},
         }

-        # Save preprocessed data
+        # Save preprocessed data with type optimization for efficient transfer
+        # Float64→Float32 reduces Arrow payload size
+        # Int64→Int32 (when safe) avoids BigInt overhead in JavaScript
         for key, value in self._preprocessed_data.items():
             if isinstance(value, pl.LazyFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
-                #
+                # Apply streaming-safe optimization (Float64→Float32 only)
+                # Int64 bounds checking would require collect(), breaking streaming
+                value = optimize_for_transfer_lazy(value)
                 value.sink_parquet(filepath, compression='zstd')
                 manifest["data_files"][key] = filename
             elif isinstance(value, pl.DataFrame):
                 filename = f"{key}.parquet"
                 filepath = preprocessed_dir / filename
+                # Full optimization including Int64→Int32 with bounds checking
+                value = optimize_for_transfer(value)
                 value.write_parquet(filepath, compression='zstd')
                 manifest["data_files"][key] = filename
             elif self._is_json_serializable(value):
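
`optimize_for_transfer` and `optimize_for_transfer_lazy` live in the new openms_insight/preprocessing/filtering.py (+84 lines), whose body is not shown in this diff. Going only by the comments above (Float64→Float32 in both paths; Int64→Int32 with bounds checking in the eager path only), here is a minimal sketch of what such helpers could look like; this is an assumption-driven reconstruction, not the package's actual implementation:

```python
import polars as pl

def optimize_for_transfer_lazy(lf: pl.LazyFrame) -> pl.LazyFrame:
    """Streaming-safe variant: only Float64 -> Float32, so no data needs collecting."""
    schema = lf.collect_schema()   # resolves dtypes without materializing rows
    casts = [
        pl.col(name).cast(pl.Float32)
        for name, dtype in schema.items()
        if dtype == pl.Float64
    ]
    return lf.with_columns(casts) if casts else lf

def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
    """Eager variant: Float64 -> Float32, plus Int64 -> Int32 when all values fit."""
    casts = []
    for name, dtype in df.schema.items():
        if dtype == pl.Float64:
            casts.append(pl.col(name).cast(pl.Float32))
        elif dtype == pl.Int64:
            col_min, col_max = df[name].min(), df[name].max()
            # Downcast only when the whole column fits into a signed 32-bit range.
            if col_min is not None and col_min >= -2**31 and col_max < 2**31:
                casts.append(pl.col(name).cast(pl.Int32))
    return df.with_columns(casts) if casts else df
```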