openms-insight 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
- openms_insight/__init__.py +11 -7
- openms_insight/components/__init__.py +2 -2
- openms_insight/components/heatmap.py +433 -228
- openms_insight/components/lineplot.py +377 -82
- openms_insight/components/sequenceview.py +677 -213
- openms_insight/components/table.py +86 -58
- openms_insight/core/__init__.py +2 -2
- openms_insight/core/base.py +122 -54
- openms_insight/core/registry.py +6 -5
- openms_insight/core/state.py +33 -31
- openms_insight/core/subprocess_preprocess.py +1 -3
- openms_insight/js-component/dist/assets/index.css +1 -1
- openms_insight/js-component/dist/assets/index.js +105 -105
- openms_insight/preprocessing/__init__.py +5 -6
- openms_insight/preprocessing/compression.py +123 -67
- openms_insight/preprocessing/filtering.py +39 -13
- openms_insight/rendering/__init__.py +1 -1
- openms_insight/rendering/bridge.py +192 -42
- {openms_insight-0.1.2.dist-info → openms_insight-0.1.4.dist-info}/METADATA +163 -20
- openms_insight-0.1.4.dist-info/RECORD +28 -0
- openms_insight-0.1.2.dist-info/RECORD +0 -28
- {openms_insight-0.1.2.dist-info → openms_insight-0.1.4.dist-info}/WHEEL +0 -0
- {openms_insight-0.1.2.dist-info → openms_insight-0.1.4.dist-info}/licenses/LICENSE +0 -0
openms_insight/preprocessing/__init__.py

@@ -1,16 +1,15 @@
 """Preprocessing utilities for data transformation and filtering."""

-from .filtering import (
-    filter_by_selection,
-    filter_by_index,
-    filter_and_collect_cached,
-)
-
 from .compression import (
     compute_compression_levels,
     downsample_2d,
     downsample_2d_simple,
 )
+from .filtering import (
+    filter_and_collect_cached,
+    filter_by_index,
+    filter_by_selection,
+)

 __all__ = [
     "filter_by_selection",
openms_insight/preprocessing/compression.py

@@ -6,18 +6,73 @@ data, enabling efficient visualization of datasets with millions of points.
 Supports both streaming (lazy) and eager downsampling approaches.
 """

-from typing import List, Optional, Union
+import math
+from typing import List, Optional, Tuple, Union

 import numpy as np
 import polars as pl

 try:
     from scipy.stats import binned_statistic_2d
+
     HAS_SCIPY = True
 except ImportError:
     HAS_SCIPY = False


+def compute_optimal_bins(
+    target_points: int,
+    x_range: Tuple[float, float],
+    y_range: Tuple[float, float],
+) -> Tuple[int, int]:
+    """
+    Compute optimal x_bins, y_bins for even spatial distribution.
+
+    The bin grid matches the data's aspect ratio so bins are approximately
+    square in data space. Total bins ≈ target_points for 1 point per bin.
+
+    Solves the system:
+        x_bins × y_bins = target_points
+        x_bins / y_bins = aspect_ratio
+
+    Solution:
+        y_bins = sqrt(target_points / aspect_ratio)
+        x_bins = sqrt(target_points × aspect_ratio)
+
+    Args:
+        target_points: Target number of bins (and thus max points with 1 per bin)
+        x_range: (x_min, x_max) data range
+        y_range: (y_min, y_max) data range
+
+    Returns:
+        (x_bins, y_bins) tuple
+
+    Examples:
+        >>> compute_optimal_bins(10000, (0, 1000), (0, 100))  # 10:1 aspect
+        (316, 31)
+        >>> compute_optimal_bins(10000, (0, 100), (0, 100))  # 1:1 aspect
+        (100, 100)
+    """
+    x_span = x_range[1] - x_range[0]
+    y_span = y_range[1] - y_range[0]
+
+    # Handle edge cases
+    if y_span < 1e-10:
+        y_span = x_span if x_span > 1e-10 else 1.0
+    if x_span < 1e-10:
+        x_span = y_span
+
+    aspect_ratio = x_span / y_span
+
+    # Clamp to reasonable bounds (avoid extreme rectangles)
+    aspect_ratio = max(0.05, min(20.0, aspect_ratio))
+
+    y_bins = max(1, int(math.sqrt(target_points / aspect_ratio)))
+    x_bins = max(1, int(math.sqrt(target_points * aspect_ratio)))
+
+    return x_bins, y_bins
+
+
 def compute_compression_levels(min_size: int, total: int) -> List[int]:
     """
     Compute logarithmically-spaced compression level target sizes.
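The docstring example for the new `compute_optimal_bins` can be verified by hand; the sketch below just repeats the hunk's own arithmetic outside the package, so nothing in it is assumed beyond the code shown above.

```python
import math

# Reproduce the 10:1 docstring example: x_bins = sqrt(N * aspect),
# y_bins = sqrt(N / aspect), both truncated toward zero by int().
target_points = 10_000
aspect_ratio = (1000 - 0) / (100 - 0)  # x_span / y_span = 10.0

x_bins = max(1, int(math.sqrt(target_points * aspect_ratio)))  # int(316.22...) -> 316
y_bins = max(1, int(math.sqrt(target_points / aspect_ratio)))  # int(31.62...)  -> 31

assert (x_bins, y_bins) == (316, 31)
assert x_bins * y_bins <= target_points  # 9796 bins, at most one point per bin
```

Because both square roots are truncated, the product never exceeds `target_points`, which is what makes the one-point-per-bin budget safe.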
@@ -55,12 +110,10 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:

     # Generate levels at each power of 10, scaled by the fractional part
     scale_factor = int(10 ** (np.log10(min_size) % 1))
-    levels = np.logspace(
-        min_power,
-        max_power,
-        max_power - min_power + 1,
-        dtype='int'
-    ) * scale_factor
+    levels = (
+        np.logspace(min_power, max_power, max_power - min_power + 1, dtype="int")
+        * scale_factor
+    )

     # Filter out levels >= total (don't include full resolution for large datasets)
     levels = levels[levels < total].tolist()
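For intuition about the reformatted `levels` expression: `min_power` and `max_power` are assigned outside this hunk, so the sketch below assumes they are the integer decades of `min_size` and `total`, consistent with the "logarithmically-spaced" docstring. One level is generated per power of ten, then scaled.

```python
import numpy as np

min_size, total = 10_000, 3_000_000
min_power = int(np.log10(min_size))                 # assumed derivation: 4
max_power = int(np.log10(total))                    # assumed derivation: 6
scale_factor = int(10 ** (np.log10(min_size) % 1))  # 1 when min_size is a power of ten

levels = (
    np.logspace(min_power, max_power, max_power - min_power + 1, dtype="int")
    * scale_factor
)
print(levels.tolist())  # [10000, 100000, 1000000]; any level >= total is dropped next
```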
@@ -75,9 +128,9 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 def downsample_2d(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
 ) -> pl.LazyFrame:
@@ -106,8 +159,7 @@ def downsample_2d(
     """
     if not HAS_SCIPY:
         raise ImportError(
-            "scipy is required for downsample_2d. "
-            "Install with: pip install scipy"
+            "scipy is required for downsample_2d. Install with: pip install scipy"
         )

     if (x_bins * y_bins) > max_points:
@@ -122,12 +174,9 @@ def downsample_2d(

     # Sort by intensity (descending) to prioritize high-intensity points
     sorted_data = (
-        data
-        .sort([x_column, intensity_column], descending=[False, True])
-        .with_columns([
-            pl.int_range(pl.len()).over(x_column).alias('_rank')
-        ])
-        .sort(['_rank', intensity_column], descending=[False, True])
+        data.sort([x_column, intensity_column], descending=[False, True])
+        .with_columns([pl.int_range(pl.len()).over(x_column).alias("_rank")])
+        .sort(["_rank", intensity_column], descending=[False, True])
     )

     # Collect for scipy binning (requires numpy arrays)
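The rewritten `sorted_data` chain is behavior-preserving, and the `_rank` trick deserves a note: rows are numbered within each `x` group after an intensity sort, then re-sorted by that number, so the strongest peak of every `x` value precedes any group's second-strongest. A standalone illustration on toy data:

```python
import polars as pl

df = pl.DataFrame({"x": [1.0, 1.0, 2.0, 2.0], "intensity": [9.0, 1.0, 8.0, 2.0]})

ranked = (
    df.sort(["x", "intensity"], descending=[False, True])
    .with_columns([pl.int_range(pl.len()).over("x").alias("_rank")])
    .sort(["_rank", "intensity"], descending=[False, True])
)
# Rank-0 rows (each x group's top peak) come first: [9.0, 8.0, 2.0, 1.0]
print(ranked["intensity"].to_list())
```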
@@ -136,7 +185,7 @@ def downsample_2d(
     total_count = len(collected)
     if total_count <= max_points:
         # No downsampling needed
-        return collected.drop('_rank').lazy()
+        return collected.drop("_rank").lazy()

     # Extract arrays for scipy
     x_array = collected[x_column].to_numpy()
@@ -145,18 +194,20 @@ def downsample_2d(

     # Compute 2D bins
     count, _, _, mapping = binned_statistic_2d(
-        x_array, y_array, intensity_array, 'count',
+        x_array,
+        y_array,
+        intensity_array,
+        "count",
         bins=[x_bins, y_bins],
-        expand_binnumbers=True
+        expand_binnumbers=True,
     )

     # Add bin indices to dataframe
-    binned_data = (
-        collected.lazy()
-        .with_columns([
-            pl.Series('_x_bin', mapping[0] - 1),  # scipy uses 1-based indexing
-            pl.Series('_y_bin', mapping[1] - 1),
-        ])
+    binned_data = collected.lazy().with_columns(
+        [
+            pl.Series("_x_bin", mapping[0] - 1),  # scipy uses 1-based indexing
+            pl.Series("_y_bin", mapping[1] - 1),
+        ]
     )

     # Compute max peaks per bin to stay under limit
@@ -174,11 +225,10 @@ def downsample_2d(

     # Keep top N peaks per bin
     result = (
-        binned_data
-        .group_by(['_x_bin', '_y_bin'])
+        binned_data.group_by(["_x_bin", "_y_bin"])
         .head(max_peaks_per_bin)
         .sort(intensity_column)
-        .drop(['_rank', '_x_bin', '_y_bin'])
+        .drop(["_rank", "_x_bin", "_y_bin"])
     )

     return result
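End to end, `downsample_2d` sorts, bins with scipy, and keeps the top peaks per bin. A hypothetical smoke test on synthetic data (scipy required; the column names are the new defaults patched in above):

```python
import numpy as np
import polars as pl

from openms_insight.preprocessing import downsample_2d

rng = np.random.default_rng(0)
n = 500_000
peaks = pl.DataFrame(
    {
        "x": rng.uniform(0, 1000, n),      # e.g. retention time
        "y": rng.uniform(100, 1500, n),    # e.g. m/z
        "intensity": rng.exponential(1e4, n),
    }
)

# Returns a LazyFrame; only the top peaks per (x, y) bin survive.
reduced = downsample_2d(peaks, max_points=20_000).collect()
print(len(reduced))  # bounded by max_points
```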
@@ -187,7 +237,7 @@ def downsample_2d(
 def downsample_2d_simple(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    intensity_column: str = 'intensity',
+    intensity_column: str = "intensity",
 ) -> pl.LazyFrame:
     """
     Simple downsampling by keeping highest-intensity points.
@@ -206,19 +256,15 @@ def downsample_2d_simple(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()

-    return (
-        data
-        .sort(intensity_column, descending=True)
-        .head(max_points)
-    )
+    return data.sort(intensity_column, descending=True).head(max_points)


 def downsample_2d_streaming(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
     x_range: Optional[tuple] = None,
@@ -262,43 +308,51 @@ def downsample_2d_streaming(
             ((pl.col(x_column) - x_min) / (x_max - x_min + 1e-10) * x_bins)
             .cast(pl.Int32)
             .clip(0, x_bins - 1)
-            .alias('_x_bin')
+            .alias("_x_bin")
         )
         y_bin_expr = (
             ((pl.col(y_column) - y_min) / (y_max - y_min + 1e-10) * y_bins)
             .cast(pl.Int32)
             .clip(0, y_bins - 1)
-            .alias('_y_bin')
+            .alias("_y_bin")
         )

         result = (
-            data
-            .with_columns([x_bin_expr, y_bin_expr])
+            data.with_columns([x_bin_expr, y_bin_expr])
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
     else:
         # Need to compute ranges - still lazy using over() window
         # First pass: add normalized bin columns using min/max over entire frame
         result = (
-            data
-            .with_columns([
-                # Compute bin indices using window functions for min/max
-                ((pl.col(x_column) - pl.col(x_column).min()) /
-                 (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10)
-                 * x_bins)
-                .cast(pl.Int32).clip(0, x_bins - 1).alias('_x_bin'),
-                ((pl.col(y_column) - pl.col(y_column).min()) /
-                 (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10)
-                 * y_bins)
-                .cast(pl.Int32).clip(0, y_bins - 1).alias('_y_bin'),
-            ])
+            data.with_columns(
+                [
+                    # Compute bin indices using window functions for min/max
+                    (
+                        (pl.col(x_column) - pl.col(x_column).min())
+                        / (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10)
+                        * x_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, x_bins - 1)
+                    .alias("_x_bin"),
+                    (
+                        (pl.col(y_column) - pl.col(y_column).min())
+                        / (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10)
+                        * y_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, y_bins - 1)
+                    .alias("_y_bin"),
+                ]
+            )
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )

     return result
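The bin-index expression in the known-range branch changed only in quoting, but its edge behavior is worth pinning down: the `1e-10` added to the span keeps the range maximum from overflowing into bin `x_bins`, at the cost of nudging values that sit exactly on interior bin edges down one bin. Verified in isolation:

```python
import polars as pl

x_bins, x_min, x_max = 4, 0.0, 10.0
df = pl.DataFrame({"x": [0.0, 2.5, 5.0, 7.5, 10.0]})

binned = df.with_columns(
    ((pl.col("x") - x_min) / (x_max - x_min + 1e-10) * x_bins)
    .cast(pl.Int32)
    .clip(0, x_bins - 1)
    .alias("_x_bin")
)
print(binned["_x_bin"].to_list())  # [0, 0, 1, 2, 3] -- 2.5 truncates down to bin 0
```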
@@ -325,14 +379,16 @@ def get_data_range(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()

-    stats = data.select([
-        pl.col(x_column).min().alias('x_min'),
-        pl.col(x_column).max().alias('x_max'),
-        pl.col(y_column).min().alias('y_min'),
-        pl.col(y_column).max().alias('y_max'),
-    ]).collect()
+    stats = data.select(
+        [
+            pl.col(x_column).min().alias("x_min"),
+            pl.col(x_column).max().alias("x_max"),
+            pl.col(y_column).min().alias("y_min"),
+            pl.col(y_column).max().alias("y_max"),
+        ]
+    ).collect()

     return (
-        (stats['x_min'][0], stats['x_max'][0]),
-        (stats['y_min'][0], stats['y_max'][0]),
+        (stats["x_min"][0], stats["x_max"][0]),
+        (stats["y_min"][0], stats["y_max"][0]),
     )
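A quick check of `get_data_range` after the reformat. The nested-tuple return shape is taken directly from the hunk; the keyword names are an assumption based on the identifiers used in the body:

```python
import polars as pl

from openms_insight.preprocessing.compression import get_data_range

df = pl.DataFrame({"x": [2.0, 7.0, 4.0], "y": [0.5, 1.5, 1.0]})
# Assumed keyword signature (x_column=, y_column=), matching the body's names.
(x_min, x_max), (y_min, y_max) = get_data_range(df, x_column="x", y_column="y")
assert (x_min, x_max) == (2.0, 7.0)
assert (y_min, y_max) == (0.5, 1.5)
```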
openms_insight/preprocessing/filtering.py

@@ -1,11 +1,10 @@
 """Data filtering utilities for selection-based filtering."""

+import hashlib
 from typing import Any, Dict, List, Optional, Tuple, Union

-import hashlib
 import pandas as pd
 import polars as pl
-import streamlit as st


 def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
@@ -35,10 +34,12 @@ def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
         # JS safe integer is 2^53, but Int32 range is simpler and sufficient for most data
         if dtype == pl.Int64:
             # Get min/max in a single pass
-            stats = df.select([
-                pl.col(col).min().alias('min'),
-                pl.col(col).max().alias('max'),
-            ]).row(0)
+            stats = df.select(
+                [
+                    pl.col(col).min().alias("min"),
+                    pl.col(col).max().alias("max"),
+                ]
+            ).row(0)
             col_min, col_max = stats

             if col_min is not None and col_max is not None:
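`.row(0)` returns the min/max pair as a plain tuple, which the following `col_min, col_max = stats` unpacks. The actual bounds test sits after this hunk; the sketch below fills it in with Int32 limits as a labeled assumption, only to show how the downcast decision plugs together:

```python
import polars as pl

df = pl.DataFrame({"scan_id": pl.Series([1, 2, 3_000_000], dtype=pl.Int64)})
col = "scan_id"

col_min, col_max = df.select(
    [pl.col(col).min().alias("min"), pl.col(col).max().alias("max")]
).row(0)

# Assumed check (not shown in the hunk): shrink Int64 -> Int32 when all values fit.
if col_min is not None and col_max is not None:
    if -(2**31) <= col_min and col_max <= 2**31 - 1:
        df = df.with_columns(pl.col(col).cast(pl.Int32))

assert df["scan_id"].dtype == pl.Int32
```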
@@ -150,14 +151,39 @@ def compute_dataframe_hash(df: pl.DataFrame) -> str:
     # Add sum of numeric columns for content verification
     for col in df.columns:
         dtype = df[col].dtype
-        if dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
-                     pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
-                     pl.Float32, pl.Float64):
+        if dtype in (
+            pl.Int8,
+            pl.Int16,
+            pl.Int32,
+            pl.Int64,
+            pl.UInt8,
+            pl.UInt16,
+            pl.UInt32,
+            pl.UInt64,
+            pl.Float32,
+            pl.Float64,
+        ):
             try:
                 col_sum = df[col].sum()
                 hash_parts.append(f"{col}:{col_sum}")
             except Exception:
                 pass
+        elif dtype == pl.Boolean:
+            # Count True values for boolean columns (important for annotations)
+            try:
+                true_count = df[col].sum()  # True=1, False=0
+                hash_parts.append(f"{col}_bool:{true_count}")
+            except Exception:
+                pass
+        elif dtype == pl.Utf8 and col.startswith("_dynamic"):
+            # Hash content of dynamic string columns (annotations)
+            try:
+                # Use hash of all non-empty values for annotation text
+                non_empty = df[col].filter(pl.col(col) != "").to_list()
+                if non_empty:
+                    hash_parts.append(f"{col}_str:{hash(tuple(non_empty))}")
+            except Exception:
+                pass

     hash_input = "|".join(hash_parts).encode()
     return hashlib.sha256(hash_input).hexdigest()
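The new Boolean and `_dynamic*` Utf8 branches exist because this loop previously hashed only numeric sums, so flipping an annotation flag left the digest, and therefore anything cached against it, unchanged. An illustrative check (toy column names):

```python
import polars as pl

from openms_insight.preprocessing.filtering import compute_dataframe_hash

a = pl.DataFrame({"peak": [1, 2, 3], "annotated": [True, False, False]})
b = a.with_columns(pl.Series("annotated", [True, True, False]))

# Same shape and numeric sums; only a boolean flag flipped. The new
# "{col}_bool:{true_count}" hash part makes the digests differ.
assert compute_dataframe_hash(a) != compute_dataframe_hash(b)
```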
@@ -352,10 +378,10 @@ def filter_by_range(
         data = data.lazy()

     return data.filter(
-        (pl.col(x_column) >= x_range[0]) &
-        (pl.col(x_column) <= x_range[1]) &
-        (pl.col(y_column) >= y_range[0]) &
-        (pl.col(y_column) <= y_range[1])
+        (pl.col(x_column) >= x_range[0])
+        & (pl.col(x_column) <= x_range[1])
+        & (pl.col(y_column) >= y_range[0])
+        & (pl.col(y_column) <= y_range[1])
     )

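`filter_by_range` now writes the conjunction with leading `&`, a pure reformat. A hypothetical call, assuming keyword parameters named after the identifiers in the body:

```python
import polars as pl

from openms_insight.preprocessing.filtering import filter_by_range

df = pl.DataFrame({"x": [1.0, 5.0, 9.0], "y": [10.0, 50.0, 90.0]})

# Keep only points inside a zoom window; the result stays lazy.
window = filter_by_range(
    df, x_column="x", y_column="y", x_range=(0.0, 6.0), y_range=(0.0, 60.0)
)
print(window.collect())  # rows (1.0, 10.0) and (5.0, 50.0)
```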