openms-insight 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openms_insight/__init__.py +32 -0
- openms_insight/components/__init__.py +11 -0
- openms_insight/components/heatmap.py +823 -0
- openms_insight/components/lineplot.py +492 -0
- openms_insight/components/sequenceview.py +384 -0
- openms_insight/components/table.py +400 -0
- openms_insight/core/__init__.py +14 -0
- openms_insight/core/base.py +413 -0
- openms_insight/core/cache.py +39 -0
- openms_insight/core/registry.py +82 -0
- openms_insight/core/state.py +215 -0
- openms_insight/js-component/dist/assets/index.css +5 -0
- openms_insight/js-component/dist/assets/index.js +4220 -0
- openms_insight/js-component/dist/assets/materialdesignicons-webfont.eot +0 -0
- openms_insight/js-component/dist/assets/materialdesignicons-webfont.ttf +0 -0
- openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff +0 -0
- openms_insight/js-component/dist/assets/materialdesignicons-webfont.woff2 +0 -0
- openms_insight/js-component/dist/index.html +14 -0
- openms_insight/preprocessing/__init__.py +22 -0
- openms_insight/preprocessing/compression.py +338 -0
- openms_insight/preprocessing/filtering.py +316 -0
- openms_insight/rendering/__init__.py +8 -0
- openms_insight/rendering/bridge.py +312 -0
- openms_insight-0.1.0.dist-info/METADATA +256 -0
- openms_insight-0.1.0.dist-info/RECORD +27 -0
- openms_insight-0.1.0.dist-info/WHEEL +4 -0
- openms_insight-0.1.0.dist-info/licenses/LICENSE +29 -0
openms_insight/js-component/dist/index.html
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8">
+    <link rel="icon" href="/favicon.ico">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Streamlit Vue Components</title>
+    <script type="module" crossorigin src="./assets/index.js"></script>
+    <link rel="stylesheet" crossorigin href="./assets/index.css">
+  </head>
+  <body>
+    <div id="app"></div>
+  </body>
+</html>
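The bundle above is the static frontend served by the package's Streamlit components. As a minimal sketch of how such a pre-built dist/ folder is typically registered on the Python side: the component name "openms_insight_demo" and the render() wrapper below are assumptions for illustration, not code taken from openms_insight.

# Hypothetical illustration: wiring a pre-built Vite bundle into Streamlit.
import os
import streamlit.components.v1 as components

_DIST_DIR = os.path.join(os.path.dirname(__file__), "js-component", "dist")

# declare_component serves index.html and its assets/ from the given path;
# calling the returned function renders the frontend and returns its value.
_component = components.declare_component("openms_insight_demo", path=_DIST_DIR)

def render(data, key=None):
    # Arguments must be serializable; they arrive as props in the Vue app.
    return _component(data=data, key=key, default=None)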
openms_insight/preprocessing/__init__.py
@@ -0,0 +1,22 @@
+"""Preprocessing utilities for data transformation and filtering."""
+
+from .filtering import (
+    filter_by_selection,
+    filter_by_index,
+    filter_and_collect_cached,
+)
+
+from .compression import (
+    compute_compression_levels,
+    downsample_2d,
+    downsample_2d_simple,
+)
+
+__all__ = [
+    "filter_by_selection",
+    "filter_by_index",
+    "filter_and_collect_cached",
+    "compute_compression_levels",
+    "downsample_2d",
+    "downsample_2d_simple",
+]
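For orientation, the re-exported helpers can be imported directly from the subpackage; a tiny sketch (the call mirrors the docstring example in compression.py below):

# Minimal illustration (not from the package docs): using the public re-exports.
from openms_insight.preprocessing import compute_compression_levels

print(compute_compression_levels(20000, 1_000_000))  # [20000, 200000]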
openms_insight/preprocessing/compression.py
@@ -0,0 +1,338 @@
+"""Compression utilities for large 2D datasets (heatmaps).
+
+This module provides functions for multi-resolution downsampling of 2D scatter
+data, enabling efficient visualization of datasets with millions of points.
+
+Supports both streaming (lazy) and eager downsampling approaches.
+"""
+
+from typing import List, Optional, Union
+
+import numpy as np
+import polars as pl
+
+try:
+    from scipy.stats import binned_statistic_2d
+    HAS_SCIPY = True
+except ImportError:
+    HAS_SCIPY = False
+
+
+def compute_compression_levels(min_size: int, total: int) -> List[int]:
+    """
+    Compute logarithmically-spaced compression level target sizes.
+
+    Given a minimum target size and total data size, computes intermediate
+    compression levels at powers of 10.
+
+    Args:
+        min_size: Minimum/smallest compression level size (e.g., 20000)
+        total: Total number of data points
+
+    Returns:
+        List of target sizes, smallest first. Always returns at least one level.
+        For small datasets (total <= min_size), returns [total] to preserve all data.
+
+    Examples:
+        >>> compute_compression_levels(20000, 1_000_000)
+        [20000, 200000]
+        >>> compute_compression_levels(20000, 50_000)
+        [20000]
+        >>> compute_compression_levels(20000, 15_000)
+        [15000]
+    """
+    if total <= min_size:
+        # Still return at least one level with all data
+        return [total]
+
+    # Compute powers of 10 between min and total
+    min_power = int(np.log10(min_size))
+    max_power = int(np.log10(total))
+
+    if min_power >= max_power:
+        # Data is between min_size and 10x min_size - one downsampled level
+        return [min_size]
+
+    # Generate levels at each power of 10, scaled by the fractional part
+    scale_factor = int(10 ** (np.log10(min_size) % 1))
+    levels = np.logspace(
+        min_power,
+        max_power,
+        max_power - min_power + 1,
+        dtype='int'
+    ) * scale_factor
+
+    # Filter out levels >= total (don't include full resolution for large datasets)
+    levels = levels[levels < total].tolist()
+
+    # Ensure at least one level exists
+    if not levels:
+        levels = [min_size]
+
+    return levels
+
+
+def downsample_2d(
+    data: Union[pl.LazyFrame, pl.DataFrame],
+    max_points: int = 20000,
+    x_column: str = 'x',
+    y_column: str = 'y',
+    intensity_column: str = 'intensity',
+    x_bins: int = 400,
+    y_bins: int = 50,
+) -> pl.LazyFrame:
+    """
+    Downsample 2D scatter data while preserving high-intensity points.
+
+    Uses 2D binning to spatially partition data, then keeps the top N
+    highest-intensity points per bin. This preserves visually important
+    features (peaks) while reducing total point count.
+
+    Args:
+        data: Input data as Polars LazyFrame or DataFrame
+        max_points: Maximum number of points to keep
+        x_column: Name of x-axis column
+        y_column: Name of y-axis column
+        intensity_column: Name of intensity/value column for ranking
+        x_bins: Number of bins along x-axis
+        y_bins: Number of bins along y-axis
+
+    Returns:
+        Downsampled data as Polars LazyFrame
+
+    Raises:
+        ImportError: If scipy is not installed
+        ValueError: If x_bins * y_bins > max_points
+    """
+    if not HAS_SCIPY:
+        raise ImportError(
+            "scipy is required for downsample_2d. "
+            "Install with: pip install scipy"
+        )
+
+    if (x_bins * y_bins) > max_points:
+        raise ValueError(
+            f"Number of bins ({x_bins * y_bins}) exceeds max_points ({max_points}). "
+            "Reduce x_bins or y_bins."
+        )
+
+    # Ensure we're working with a LazyFrame
+    if isinstance(data, pl.DataFrame):
+        data = data.lazy()
+
+    # Sort by intensity (descending) to prioritize high-intensity points
+    sorted_data = (
+        data
+        .sort([x_column, intensity_column], descending=[False, True])
+        .with_columns([
+            pl.int_range(pl.len()).over(x_column).alias('_rank')
+        ])
+        .sort(['_rank', intensity_column], descending=[False, True])
+    )
+
+    # Collect for scipy binning (requires numpy arrays)
+    collected = sorted_data.collect()
+
+    total_count = len(collected)
+    if total_count <= max_points:
+        # No downsampling needed
+        return collected.drop('_rank').lazy()
+
+    # Extract arrays for scipy
+    x_array = collected[x_column].to_numpy()
+    y_array = collected[y_column].to_numpy()
+    intensity_array = collected[intensity_column].to_numpy()
+
+    # Compute 2D bins
+    count, _, _, mapping = binned_statistic_2d(
+        x_array, y_array, intensity_array, 'count',
+        bins=[x_bins, y_bins],
+        expand_binnumbers=True
+    )
+
+    # Add bin indices to dataframe
+    binned_data = (
+        collected.lazy()
+        .with_columns([
+            pl.Series('_x_bin', mapping[0] - 1),  # scipy uses 1-based indexing
+            pl.Series('_y_bin', mapping[1] - 1)
+        ])
+    )
+
+    # Compute max peaks per bin to stay under limit
+    counted_peaks = 0
+    max_peaks_per_bin = -1
+    new_count = 0
+
+    while (counted_peaks + new_count) < max_points:
+        max_peaks_per_bin += 1
+        counted_peaks += new_count
+        new_count = np.sum(count.flatten() >= (max_peaks_per_bin + 1))
+
+        if counted_peaks >= total_count:
+            break
+
+    # Keep top N peaks per bin
+    result = (
+        binned_data
+        .group_by(['_x_bin', '_y_bin'])
+        .head(max_peaks_per_bin)
+        .sort(intensity_column)
+        .drop(['_rank', '_x_bin', '_y_bin'])
+    )
+
+    return result
+
+
+def downsample_2d_simple(
+    data: Union[pl.LazyFrame, pl.DataFrame],
+    max_points: int = 20000,
+    intensity_column: str = 'intensity',
+) -> pl.LazyFrame:
+    """
+    Simple downsampling by keeping highest-intensity points.
+
+    A simpler alternative to downsample_2d that doesn't require scipy.
+    Less spatially aware but still preserves important peaks.
+
+    Args:
+        data: Input data as Polars LazyFrame or DataFrame
+        max_points: Maximum number of points to keep
+        intensity_column: Name of intensity column for ranking
+
+    Returns:
+        Downsampled data as Polars LazyFrame
+    """
+    if isinstance(data, pl.DataFrame):
+        data = data.lazy()
+
+    return (
+        data
+        .sort(intensity_column, descending=True)
+        .head(max_points)
+    )
+
+
+def downsample_2d_streaming(
+    data: Union[pl.LazyFrame, pl.DataFrame],
+    max_points: int = 20000,
+    x_column: str = 'x',
+    y_column: str = 'y',
+    intensity_column: str = 'intensity',
+    x_bins: int = 400,
+    y_bins: int = 50,
+    x_range: Optional[tuple] = None,
+    y_range: Optional[tuple] = None,
+) -> pl.LazyFrame:
+    """
+    Streaming 2D downsampling using pure Polars operations.
+
+    Uses Polars' lazy evaluation to downsample data without full materialization.
+    Creates spatial bins using integer division and keeps top-N highest-intensity
+    points per bin. Stays fully lazy - no .collect() is called.
+
+    Args:
+        data: Input data as Polars LazyFrame or DataFrame
+        max_points: Maximum number of points to keep
+        x_column: Name of x-axis column
+        y_column: Name of y-axis column
+        intensity_column: Name of intensity/value column for ranking
+        x_bins: Number of bins along x-axis
+        y_bins: Number of bins along y-axis
+        x_range: Optional (min, max) tuple for x-axis. If None, computed from data.
+        y_range: Optional (min, max) tuple for y-axis. If None, computed from data.
+
+    Returns:
+        Downsampled data as Polars LazyFrame (fully lazy, no collection)
+    """
+    if isinstance(data, pl.DataFrame):
+        data = data.lazy()
+
+    # Calculate points per bin
+    total_bins = x_bins * y_bins
+    points_per_bin = max(1, max_points // total_bins)
+
+    # Build binning expression using provided or computed ranges
+    if x_range is not None and y_range is not None:
+        x_min, x_max = x_range
+        y_min, y_max = y_range
+
+        # Use provided ranges for bin calculation
+        x_bin_expr = (
+            ((pl.col(x_column) - x_min) / (x_max - x_min + 1e-10) * x_bins)
+            .cast(pl.Int32)
+            .clip(0, x_bins - 1)
+            .alias('_x_bin')
+        )
+        y_bin_expr = (
+            ((pl.col(y_column) - y_min) / (y_max - y_min + 1e-10) * y_bins)
+            .cast(pl.Int32)
+            .clip(0, y_bins - 1)
+            .alias('_y_bin')
+        )
+
+        result = (
+            data
+            .with_columns([x_bin_expr, y_bin_expr])
+            .sort(intensity_column, descending=True)
+            .group_by(['_x_bin', '_y_bin'])
+            .head(points_per_bin)
+            .drop(['_x_bin', '_y_bin'])
+        )
+    else:
+        # Need to compute ranges - still lazy using over() window
+        # First pass: add normalized bin columns using min/max over entire frame
+        result = (
+            data
+            .with_columns([
+                # Compute bin indices using window functions for min/max
+                (
+                    (pl.col(x_column) - pl.col(x_column).min()) /
+                    (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10) * x_bins
+                ).cast(pl.Int32).clip(0, x_bins - 1).alias('_x_bin'),
+                (
+                    (pl.col(y_column) - pl.col(y_column).min()) /
+                    (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10) * y_bins
+                ).cast(pl.Int32).clip(0, y_bins - 1).alias('_y_bin'),
+            ])
+            .sort(intensity_column, descending=True)
+            .group_by(['_x_bin', '_y_bin'])
+            .head(points_per_bin)
+            .drop(['_x_bin', '_y_bin'])
+        )
+
+    return result
+
+
+def get_data_range(
+    data: Union[pl.LazyFrame, pl.DataFrame],
+    x_column: str,
+    y_column: str,
+) -> tuple:
+    """
+    Get the min/max ranges for x and y columns.
+
+    This requires a collect() operation but only fetches 4 scalar values.
+
+    Args:
+        data: Input data
+        x_column: X-axis column name
+        y_column: Y-axis column name
+
+    Returns:
+        Tuple of ((x_min, x_max), (y_min, y_max))
+    """
+    if isinstance(data, pl.DataFrame):
+        data = data.lazy()
+
+    stats = data.select([
+        pl.col(x_column).min().alias('x_min'),
+        pl.col(x_column).max().alias('x_max'),
+        pl.col(y_column).min().alias('y_min'),
+        pl.col(y_column).max().alias('y_max'),
+    ]).collect()
+
+    return (
+        (stats['x_min'][0], stats['x_max'][0]),
+        (stats['y_min'][0], stats['y_max'][0]),
+    )
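A short usage sketch for the downsampling helpers above, using a small synthetic frame. The column names match the function defaults; the data itself is invented for illustration.

# Illustration only: lazy top-N-per-bin downsampling of synthetic scatter data.
import numpy as np
import polars as pl

from openms_insight.preprocessing.compression import (
    downsample_2d_streaming,
    get_data_range,
)

# Synthetic scatter data: 1M points with x/y coordinates and an intensity value.
rng = np.random.default_rng(0)
n = 1_000_000
lf = pl.DataFrame({
    "x": rng.uniform(0, 100, n),
    "y": rng.uniform(0, 50, n),
    "intensity": rng.exponential(1.0, n),
}).lazy()

# Optionally pre-compute the axis ranges (one small collect of 4 scalars) so the
# streaming path can bin against fixed bounds.
x_range, y_range = get_data_range(lf, "x", "y")

# Fully lazy downsampling; nothing is materialized until .collect().
reduced = downsample_2d_streaming(
    lf, max_points=20_000, x_bins=400, y_bins=50,
    x_range=x_range, y_range=y_range,
)
print(reduced.collect().height)  # at most x_bins * y_bins * points_per_bin rows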
openms_insight/preprocessing/filtering.py
@@ -0,0 +1,316 @@
+"""Data filtering utilities for selection-based filtering."""
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import hashlib
+import pandas as pd
+import polars as pl
+import streamlit as st
+
+
+def _make_cache_key(
+    filters: Dict[str, str],
+    state: Dict[str, Any],
+    filter_defaults: Optional[Dict[str, Any]] = None,
+) -> Tuple[Tuple[str, Any], ...]:
+    """
+    Create a hashable cache key from filters and state.
+
+    Only includes state values for identifiers that are in filters,
+    so cache is invalidated only when relevant selections change.
+
+    Args:
+        filters: Mapping of identifier names to column names
+        state: Current selection state
+        filter_defaults: Optional default values for filters when state is None
+
+    Returns:
+        Tuple of (identifier, value) pairs for use as cache key
+    """
+    relevant_state = []
+    for identifier in sorted(filters.keys()):
+        value = state.get(identifier)
+        # Apply default if value is None and default exists
+        if value is None and filter_defaults and identifier in filter_defaults:
+            value = filter_defaults[identifier]
+        relevant_state.append((identifier, value))
+    return tuple(relevant_state)
+
+
+def compute_dataframe_hash(df: pl.DataFrame) -> str:
+    """
+    Compute an efficient hash for a DataFrame without pickling.
+
+    Uses shape, column names, and sampled values to create a fast hash
+    that detects data changes without materializing extra copies.
+
+    Args:
+        df: Polars DataFrame to hash
+
+    Returns:
+        SHA256 hash string
+    """
+    # Build hash from metadata and sampled content
+    hash_parts = [
+        str(df.shape),  # (rows, cols)
+        str(df.columns),  # Column names
+    ]
+
+    # For small DataFrames, hash first/last values of each column
+    # For large DataFrames, this is still O(1) memory
+    if len(df) > 0:
+        # Sample first and last row for change detection
+        first_row = df.head(1).to_dicts()[0] if len(df) > 0 else {}
+        last_row = df.tail(1).to_dicts()[0] if len(df) > 0 else {}
+        hash_parts.append(str(first_row))
+        hash_parts.append(str(last_row))
+
+        # Add sum of numeric columns for content verification
+        for col in df.columns:
+            dtype = df[col].dtype
+            if dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
+                         pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
+                         pl.Float32, pl.Float64):
+                try:
+                    col_sum = df[col].sum()
+                    hash_parts.append(f"{col}:{col_sum}")
+                except Exception:
+                    pass
+
+    hash_input = "|".join(hash_parts).encode()
+    return hashlib.sha256(hash_input).hexdigest()
+
+
+@st.cache_data(ttl=300, max_entries=100)
+def _cached_filter_and_collect(
+    _data: pl.LazyFrame,
+    filters_tuple: Tuple[Tuple[str, str], ...],
+    state_tuple: Tuple[Tuple[str, Any], ...],
+    columns_tuple: Optional[Tuple[str, ...]] = None,
+    filter_defaults_tuple: Optional[Tuple[Tuple[str, Any], ...]] = None,
+) -> Tuple[pd.DataFrame, str]:
+    """
+    Filter data and collect with caching.
+
+    This function is cached by Streamlit, so repeated calls with the same
+    filter state will return cached results without re-executing the query.
+
+    Returns pandas DataFrame for efficient Arrow serialization to frontend.
+
+    Args:
+        _data: LazyFrame to filter (underscore prefix tells st.cache_data to hash by id)
+        filters_tuple: Tuple of (identifier, column) pairs from filters dict
+        state_tuple: Tuple of (identifier, value) pairs for current selection state
+            (already has defaults applied from _make_cache_key)
+        columns_tuple: Optional tuple of column names to select (projection)
+        filter_defaults_tuple: Optional tuple of (identifier, default_value) pairs
+            (included for cache key differentiation)
+
+    Returns:
+        Tuple of (pandas DataFrame, hash string)
+    """
+    data = _data
+    filters = dict(filters_tuple)
+    state = dict(state_tuple)  # Already has defaults applied
+
+    # Apply column projection FIRST (before filters) for efficiency
+    # This ensures we only read needed columns from disk
+    if columns_tuple:
+        data = data.select(list(columns_tuple))
+
+    # Apply filters
+    # If ANY filter has no selection (and no default), return empty DataFrame
+    # This prevents loading millions of rows when no spectrum is selected
+    for identifier, column in filters.items():
+        selected_value = state.get(identifier)
+        if selected_value is None:
+            # No selection for this filter - return empty DataFrame
+            # Collect with limit 0 to get schema without data
+            df_polars = data.head(0).collect()
+            data_hash = compute_dataframe_hash(df_polars)
+            df_pandas = df_polars.to_pandas()
+            return (df_pandas, data_hash)
+
+        # Convert float to int for integer columns to handle JSON number parsing
+        # (JavaScript numbers come back as floats, but Polars Int64 needs int comparison)
+        if isinstance(selected_value, float) and selected_value.is_integer():
+            selected_value = int(selected_value)
+        data = data.filter(pl.col(column) == selected_value)
+
+    # Collect to Polars DataFrame
+    df_polars = data.collect()
+
+    # Compute hash efficiently (no pickle)
+    data_hash = compute_dataframe_hash(df_polars)
+
+    # Convert to pandas for Arrow serialization (zero-copy when possible)
+    df_pandas = df_polars.to_pandas()
+
+    return (df_pandas, data_hash)
+
+
+def filter_and_collect_cached(
+    data: Union[pl.LazyFrame, pl.DataFrame],
+    filters: Dict[str, str],
+    state: Dict[str, Any],
+    columns: Optional[List[str]] = None,
+    filter_defaults: Optional[Dict[str, Any]] = None,
+) -> Tuple[pd.DataFrame, str]:
+    """
+    Filter data based on selection state and collect, with caching.
+
+    This is the recommended function for components that need filtered data.
+    Results are cached based on filter state, so interactions that don't
+    change the filter values (e.g., clicking within already-filtered data)
+    will return cached results instantly.
+
+    Returns pandas DataFrame for efficient Arrow serialization to the frontend.
+    The hash is computed efficiently without pickling the data.
+
+    Args:
+        data: The data to filter (LazyFrame or DataFrame)
+        filters: Mapping of identifier names to column names for filtering
+        state: Current selection state with identifier values
+        columns: Optional list of column names to select (projection pushdown)
+        filter_defaults: Optional default values for filters when state is None.
+            When a filter's state value is None, the default is used instead.
+            Example: {"identification": -1} means None → -1 for identification filter.
+
+    Returns:
+        Tuple of (pandas DataFrame, hash string) with filters and projection applied
+    """
+    if isinstance(data, pl.DataFrame):
+        data = data.lazy()
+
+    # Convert to tuples for caching (dicts aren't hashable)
+    filters_tuple = tuple(sorted(filters.items()))
+    # Pass filter_defaults to _make_cache_key so defaults are applied to state
+    state_tuple = _make_cache_key(filters, state, filter_defaults)
+    columns_tuple = tuple(columns) if columns else None
+    filter_defaults_tuple = tuple(sorted(filter_defaults.items())) if filter_defaults else None
+
+    return _cached_filter_and_collect(
+        data,
+        filters_tuple,
+        state_tuple,
+        columns_tuple,
+        filter_defaults_tuple,
+    )
+
+
+def filter_by_selection(
+    data: Union[pl.LazyFrame, pl.DataFrame],
+    interactivity: Dict[str, str],
+    state: Dict[str, Any],
+) -> pl.LazyFrame:
+    """
+    Filter data based on selection state and interactivity mapping.
+
+    For each identifier in the interactivity mapping, if there's a
+    corresponding selection in state, filter the data to rows where
+    the mapped column equals the selected value.
+
+    Args:
+        data: The data to filter (LazyFrame or DataFrame)
+        interactivity: Mapping of identifier names to column names
+        state: Current selection state with identifier values
+
+    Returns:
+        Filtered LazyFrame
+    """
+    if isinstance(data, pl.DataFrame):
+        data = data.lazy()
+
+    for identifier, column in interactivity.items():
+        selected_value = state.get(identifier)
+        if selected_value is not None:
+            data = data.filter(pl.col(column) == selected_value)
+
+    return data
+
+
+def filter_by_index(
+    data: Union[pl.LazyFrame, pl.DataFrame],
+    index_column: str,
+    index_value: Any,
+) -> pl.LazyFrame:
+    """
+    Filter data to a single row by index value.
+
+    Args:
+        data: The data to filter
+        index_column: Name of the index column
+        index_value: The index value to filter to
+
+    Returns:
+        Filtered LazyFrame (typically 1 row)
+    """
+    if isinstance(data, pl.DataFrame):
+        data = data.lazy()
+
+    return data.filter(pl.col(index_column) == index_value)
+
+
+def filter_by_range(
+    data: Union[pl.LazyFrame, pl.DataFrame],
+    x_column: str,
+    y_column: str,
+    x_range: tuple,
+    y_range: tuple,
+) -> pl.LazyFrame:
+    """
+    Filter data within x/y range bounds.
+
+    Args:
+        data: The data to filter
+        x_column: Name of the x-axis column
+        y_column: Name of the y-axis column
+        x_range: Tuple of (min, max) for x-axis
+        y_range: Tuple of (min, max) for y-axis
+
+    Returns:
+        Filtered LazyFrame
+    """
+    if isinstance(data, pl.DataFrame):
+        data = data.lazy()
+
+    return data.filter(
+        (pl.col(x_column) >= x_range[0]) &
+        (pl.col(x_column) <= x_range[1]) &
+        (pl.col(y_column) >= y_range[0]) &
+        (pl.col(y_column) <= y_range[1])
+    )
+
+
+def slice_by_row_index(
+    data: Union[pl.LazyFrame, pl.DataFrame],
+    row_index: Optional[int],
+) -> pl.DataFrame:
+    """
+    Slice data to a single row by row position.
+
+    Args:
+        data: The data to slice
+        row_index: The row index (position) to extract
+
+    Returns:
+        DataFrame with single row, or empty DataFrame if index is None
+    """
+    if row_index is None:
+        # Return empty DataFrame with same schema
+        if isinstance(data, pl.LazyFrame):
+            return data.head(0).collect()
+        return data.head(0)
+
+    # For LazyFrames, add slice to query plan before collecting
+    # This allows Polars to optimize and avoid materializing all rows
+    if isinstance(data, pl.LazyFrame):
+        # Note: We can't check bounds without collecting, so we slice optimistically
+        # and return empty if result is empty
+        return data.slice(row_index, 1).collect()
+
+    # For DataFrames, check bounds first
+    if row_index < 0 or row_index >= len(data):
+        return data.head(0)
+
+    return data.slice(row_index, 1)
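A short usage sketch for the cached filtering path above. The identifiers, column names, and data are invented for illustration; inside a Streamlit app, the selection state would normally come from another component's click event.

# Illustration only: applying selection-state filters with caching.
import polars as pl
from openms_insight.preprocessing import filter_and_collect_cached

peaks = pl.DataFrame({
    "spectrum_id": [1, 1, 2, 2],
    "mz": [100.1, 200.2, 150.3, 300.4],
    "intensity": [10.0, 20.0, 5.0, 40.0],
}).lazy()

# Map the selection identifier to the column it filters on.
filters = {"spectrum": "spectrum_id"}
state = {"spectrum": 2.0}  # JS numbers arrive as floats; the helper casts to int

df, content_hash = filter_and_collect_cached(
    peaks, filters, state, columns=["spectrum_id", "mz", "intensity"]
)
print(len(df), content_hash[:8])  # 2 rows for spectrum 2, plus a change-detection hash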