openms-insight 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- openms_insight/__init__.py +11 -7
- openms_insight/components/__init__.py +2 -2
- openms_insight/components/heatmap.py +192 -102
- openms_insight/components/lineplot.py +377 -82
- openms_insight/components/sequenceview.py +677 -213
- openms_insight/components/table.py +86 -58
- openms_insight/core/__init__.py +2 -2
- openms_insight/core/base.py +113 -49
- openms_insight/core/registry.py +6 -5
- openms_insight/core/state.py +33 -31
- openms_insight/core/subprocess_preprocess.py +1 -3
- openms_insight/js-component/dist/assets/index.css +1 -1
- openms_insight/js-component/dist/assets/index.js +113 -113
- openms_insight/preprocessing/__init__.py +5 -6
- openms_insight/preprocessing/compression.py +68 -66
- openms_insight/preprocessing/filtering.py +119 -9
- openms_insight/rendering/__init__.py +1 -1
- openms_insight/rendering/bridge.py +192 -42
- {openms_insight-0.1.1.dist-info → openms_insight-0.1.3.dist-info}/METADATA +163 -20
- openms_insight-0.1.3.dist-info/RECORD +28 -0
- openms_insight-0.1.1.dist-info/RECORD +0 -28
- {openms_insight-0.1.1.dist-info → openms_insight-0.1.3.dist-info}/WHEEL +0 -0
- {openms_insight-0.1.1.dist-info → openms_insight-0.1.3.dist-info}/licenses/LICENSE +0 -0
openms_insight/preprocessing/__init__.py:

```diff
@@ -1,16 +1,15 @@
 """Preprocessing utilities for data transformation and filtering."""
 
-from .filtering import (
-    filter_by_selection,
-    filter_by_index,
-    filter_and_collect_cached,
-)
-
 from .compression import (
     compute_compression_levels,
     downsample_2d,
     downsample_2d_simple,
 )
+from .filtering import (
+    filter_and_collect_cached,
+    filter_by_index,
+    filter_by_selection,
+)
 
 __all__ = [
     "filter_by_selection",
```
openms_insight/preprocessing/compression.py:

```diff
@@ -13,6 +13,7 @@ import polars as pl
 
 try:
     from scipy.stats import binned_statistic_2d
+
     HAS_SCIPY = True
 except ImportError:
     HAS_SCIPY = False
```
```diff
@@ -55,12 +56,10 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 
     # Generate levels at each power of 10, scaled by the fractional part
     scale_factor = int(10 ** (np.log10(min_size) % 1))
-    levels = np.logspace(
-        min_power,
-        max_power,
-        max_power - min_power + 1,
-        dtype='int'
-    ) * scale_factor
+    levels = (
+        np.logspace(min_power, max_power, max_power - min_power + 1, dtype="int")
+        * scale_factor
+    )
 
     # Filter out levels >= total (don't include full resolution for large datasets)
     levels = levels[levels < total].tolist()
```
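For intuition, here is a minimal sketch of what the reformatted expression computes. The values of `min_power`, `max_power`, and `scale_factor` below are illustrative assumptions; in the real function they are derived from `min_size` and `total`:

```python
import numpy as np

# Assumed inputs for illustration; compute_compression_levels derives these.
min_power, max_power, scale_factor = 3, 6, 5
total = 1_000_000

# One candidate level per power of 10, scaled by the fractional log10 part
levels = (
    np.logspace(min_power, max_power, max_power - min_power + 1, dtype="int")
    * scale_factor
)
print(levels.tolist())                  # [5000, 50000, 500000, 5000000]
print(levels[levels < total].tolist())  # [5000, 50000, 500000]
```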
```diff
@@ -75,9 +74,9 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 def downsample_2d(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
 ) -> pl.LazyFrame:
```
```diff
@@ -106,8 +105,7 @@ def downsample_2d(
     """
     if not HAS_SCIPY:
         raise ImportError(
-            "scipy is required for downsample_2d. "
-            "Install with: pip install scipy"
+            "scipy is required for downsample_2d. Install with: pip install scipy"
         )
 
     if (x_bins * y_bins) > max_points:
```
```diff
@@ -122,12 +120,9 @@ def downsample_2d(
 
     # Sort by intensity (descending) to prioritize high-intensity points
     sorted_data = (
-        data
-        .sort([x_column, intensity_column], descending=[False, True])
-        .with_columns([
-            pl.int_range(pl.len()).over(x_column).alias('_rank')
-        ])
-        .sort(['_rank', intensity_column], descending=[False, True])
+        data.sort([x_column, intensity_column], descending=[False, True])
+        .with_columns([pl.int_range(pl.len()).over(x_column).alias("_rank")])
+        .sort(["_rank", intensity_column], descending=[False, True])
     )
 
     # Collect for scipy binning (requires numpy arrays)
```
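The rank-then-sort idiom here deserves a note: `pl.int_range(pl.len()).over(x_column)` assigns 0-based positions within each x group, so the second sort interleaves the highest-intensity row of every group before any group's second row. A toy sketch with assumed column names:

```python
import polars as pl

df = pl.DataFrame({"x": [1, 1, 2, 2], "intensity": [10.0, 5.0, 8.0, 9.0]})

out = (
    df.sort(["x", "intensity"], descending=[False, True])
    # 0, 1, 2, ... within each x group, in the sorted order
    .with_columns(pl.int_range(pl.len()).over("x").alias("_rank"))
    # all rank-0 rows come first, so every x slice survives a later head()
    .sort(["_rank", "intensity"], descending=[False, True])
)
print(out["x"].to_list())  # [1, 2, 2, 1] - the top row of each group leads
```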
```diff
@@ -136,7 +131,7 @@ def downsample_2d(
     total_count = len(collected)
     if total_count <= max_points:
         # No downsampling needed
-        return collected.drop('_rank').lazy()
+        return collected.drop("_rank").lazy()
 
     # Extract arrays for scipy
     x_array = collected[x_column].to_numpy()
```
```diff
@@ -145,18 +140,20 @@ def downsample_2d(
 
     # Compute 2D bins
     count, _, _, mapping = binned_statistic_2d(
-        x_array, y_array, intensity_array, 'count',
+        x_array,
+        y_array,
+        intensity_array,
+        "count",
         bins=[x_bins, y_bins],
-        expand_binnumbers=True
+        expand_binnumbers=True,
     )
 
     # Add bin indices to dataframe
-    binned_data = (
-        collected.lazy()
-        .with_columns([
-            pl.Series('_x_bin', mapping[0] - 1),  # scipy uses 1-based indexing
-            pl.Series('_y_bin', mapping[1] - 1)
-        ])
+    binned_data = collected.lazy().with_columns(
+        [
+            pl.Series("_x_bin", mapping[0] - 1),  # scipy uses 1-based indexing
+            pl.Series("_y_bin", mapping[1] - 1),
+        ]
     )
 
     # Compute max peaks per bin to stay under limit
```
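The `- 1` applied to the mapping rows exists because scipy reports bin membership 1-based when `expand_binnumbers=True`. A quick check (toy data, not from the package):

```python
import numpy as np
from scipy.stats import binned_statistic_2d

x = np.array([0.1, 0.9])
y = np.array([0.1, 0.9])
values = np.array([1.0, 2.0])

count, _, _, mapping = binned_statistic_2d(
    x, y, values, "count", bins=[2, 2], expand_binnumbers=True
)
# mapping has shape (2, n_points) with 1-based bin indices per dimension
print(mapping)      # [[1 2]
                    #  [1 2]]
print(mapping - 1)  # 0-based, ready to use as _x_bin/_y_bin
```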
```diff
@@ -174,11 +171,10 @@ def downsample_2d(
 
     # Keep top N peaks per bin
     result = (
-        binned_data
-        .group_by(['_x_bin', '_y_bin'])
+        binned_data.group_by(["_x_bin", "_y_bin"])
         .head(max_peaks_per_bin)
         .sort(intensity_column)
-        .drop(['_rank', '_x_bin', '_y_bin'])
+        .drop(["_rank", "_x_bin", "_y_bin"])
     )
 
     return result
```
```diff
@@ -187,7 +183,7 @@ def downsample_2d(
 def downsample_2d_simple(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    intensity_column: str = 'intensity',
+    intensity_column: str = "intensity",
 ) -> pl.LazyFrame:
     """
     Simple downsampling by keeping highest-intensity points.
```
```diff
@@ -206,19 +202,15 @@ def downsample_2d_simple(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    return (
-        data
-        .sort(intensity_column, descending=True)
-        .head(max_points)
-    )
+    return data.sort(intensity_column, descending=True).head(max_points)
 
 
 def downsample_2d_streaming(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
     x_range: Optional[tuple] = None,
```
```diff
@@ -262,43 +254,51 @@ def downsample_2d_streaming(
             ((pl.col(x_column) - x_min) / (x_max - x_min + 1e-10) * x_bins)
             .cast(pl.Int32)
             .clip(0, x_bins - 1)
-            .alias('_x_bin')
+            .alias("_x_bin")
         )
         y_bin_expr = (
             ((pl.col(y_column) - y_min) / (y_max - y_min + 1e-10) * y_bins)
             .cast(pl.Int32)
             .clip(0, y_bins - 1)
-            .alias('_y_bin')
+            .alias("_y_bin")
         )
 
         result = (
-            data
-            .with_columns([x_bin_expr, y_bin_expr])
+            data.with_columns([x_bin_expr, y_bin_expr])
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
     else:
         # Need to compute ranges - still lazy using over() window
         # First pass: add normalized bin columns using min/max over entire frame
         result = (
-            data
-            .with_columns([
-                # Compute bin indices using window functions for min/max
-                ((pl.col(x_column) - pl.col(x_column).min()) /
-                 (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10) * x_bins)
-                .cast(pl.Int32).clip(0, x_bins - 1)
-                .alias('_x_bin'),
-                ((pl.col(y_column) - pl.col(y_column).min()) /
-                 (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10) * y_bins)
-                .cast(pl.Int32).clip(0, y_bins - 1)
-                .alias('_y_bin')
-            ])
+            data.with_columns(
+                [
+                    # Compute bin indices using window functions for min/max
+                    (
+                        (pl.col(x_column) - pl.col(x_column).min())
+                        / (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10)
+                        * x_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, x_bins - 1)
+                    .alias("_x_bin"),
+                    (
+                        (pl.col(y_column) - pl.col(y_column).min())
+                        / (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10)
+                        * y_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, y_bins - 1)
+                    .alias("_y_bin"),
+                ]
+            )
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
 
     return result
```
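Because both branches express the binning as plain Polars expressions, the whole pipeline stays lazy and streamable. A self-contained sketch of the known-range branch, with assumed column names, ranges, and bin counts:

```python
import polars as pl

lf = pl.LazyFrame({
    "x": [0.0, 0.4, 0.9, 1.0],
    "y": [0.0, 0.5, 0.5, 1.0],
    "intensity": [1.0, 4.0, 2.0, 3.0],
})
x_bins, y_bins = 2, 2
x_min, x_max, y_min, y_max = 0.0, 1.0, 0.0, 1.0

# Normalize each coordinate into [0, n_bins), truncate to Int32, clip to range
x_bin = (
    ((pl.col("x") - x_min) / (x_max - x_min + 1e-10) * x_bins)
    .cast(pl.Int32)
    .clip(0, x_bins - 1)
    .alias("_x_bin")
)
y_bin = (
    ((pl.col("y") - y_min) / (y_max - y_min + 1e-10) * y_bins)
    .cast(pl.Int32)
    .clip(0, y_bins - 1)
    .alias("_y_bin")
)
result = (
    lf.with_columns([x_bin, y_bin])
    .sort("intensity", descending=True)
    .group_by(["_x_bin", "_y_bin"])
    .head(1)  # keep the single strongest point per cell
    .drop(["_x_bin", "_y_bin"])
)
print(result.collect())
```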
```diff
@@ -325,14 +325,16 @@ def get_data_range(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    stats = data.select([
-        pl.col(x_column).min().alias('x_min'),
-        pl.col(x_column).max().alias('x_max'),
-        pl.col(y_column).min().alias('y_min'),
-        pl.col(y_column).max().alias('y_max')
-    ]).collect()
+    stats = data.select(
+        [
+            pl.col(x_column).min().alias("x_min"),
+            pl.col(x_column).max().alias("x_max"),
+            pl.col(y_column).min().alias("y_min"),
+            pl.col(y_column).max().alias("y_max"),
+        ]
+    ).collect()
 
     return (
-        (stats['x_min'][0], stats['x_max'][0]),
-        (stats['y_min'][0], stats['y_max'][0])
+        (stats["x_min"][0], stats["x_max"][0]),
+        (stats["y_min"][0], stats["y_max"][0]),
     )
```
openms_insight/preprocessing/filtering.py:

```diff
@@ -1,11 +1,94 @@
 """Data filtering utilities for selection-based filtering."""
 
+import hashlib
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-import hashlib
 import pandas as pd
 import polars as pl
-
+
+
+def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
+    """
+    Optimize DataFrame types for efficient Arrow transfer to frontend.
+
+    This function downcasts numeric types to reduce Arrow payload size and
+    avoid BigInt overhead in JavaScript:
+    - Int64 → Int32 (if values fit): Avoids BigInt conversion in JS
+    - Float64 → Float32: Sufficient precision for visualization
+
+    Args:
+        df: Polars DataFrame to optimize
+
+    Returns:
+        DataFrame with optimized types
+    """
+    if len(df) == 0:
+        return df
+
+    casts = []
+
+    for col in df.columns:
+        dtype = df[col].dtype
+
+        # Downcast Int64 to Int32 to avoid BigInt in JavaScript
+        # JS safe integer is 2^53, but Int32 range is simpler and sufficient for most data
+        if dtype == pl.Int64:
+            # Get min/max in a single pass
+            stats = df.select(
+                [
+                    pl.col(col).min().alias("min"),
+                    pl.col(col).max().alias("max"),
+                ]
+            ).row(0)
+            col_min, col_max = stats
+
+            if col_min is not None and col_max is not None:
+                # Int32 range: -2,147,483,648 to 2,147,483,647
+                if col_min >= -2147483648 and col_max <= 2147483647:
+                    casts.append(pl.col(col).cast(pl.Int32))
+
+        # Downcast Float64 to Float32 (sufficient for display)
+        # Float32 has ~7 significant digits - enough for visualization
+        elif dtype == pl.Float64:
+            casts.append(pl.col(col).cast(pl.Float32))
+
+    if casts:
+        df = df.with_columns(casts)
+
+    return df
+
+
+def optimize_for_transfer_lazy(lf: pl.LazyFrame) -> pl.LazyFrame:
+    """
+    Optimize LazyFrame types for efficient Arrow transfer (streaming-safe).
+
+    Unlike optimize_for_transfer(), this only applies optimizations that don't
+    require knowing the data values, preserving the ability to stream via sink_parquet().
+
+    Currently applies:
+    - Float64 → Float32: Always safe, no bounds check needed
+
+    Int64 → Int32 is NOT applied here because it requires bounds checking.
+    Use optimize_for_transfer() on collected DataFrames for full optimization.
+
+    Args:
+        lf: Polars LazyFrame to optimize
+
+    Returns:
+        LazyFrame with Float64 columns cast to Float32
+    """
+    schema = lf.collect_schema()
+    casts = []
+
+    for col, dtype in zip(schema.names(), schema.dtypes()):
+        # Only Float64 → Float32 is safe without bounds checking
+        if dtype == pl.Float64:
+            casts.append(pl.col(col).cast(pl.Float32))
+
+    if casts:
+        lf = lf.with_columns(casts)
+
+    return lf
 
 
 def _make_cache_key(
```
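A hypothetical usage sketch of the new helper; the import path follows this file's location in the diff summary, though whether it is re-exported from the package `__init__` is not shown here:

```python
import polars as pl

from openms_insight.preprocessing.filtering import optimize_for_transfer

df = pl.DataFrame({"scan": [1, 2, 3], "mz": [100.5, 200.25, 300.125]})
print(df.schema)    # scan: Int64, mz: Float64 (Polars defaults)

slim = optimize_for_transfer(df)
print(slim.schema)  # scan: Int32, mz: Float32 - smaller Arrow payload,
                    # and no BigInt values on the JavaScript side
```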
```diff
@@ -68,14 +151,39 @@ def compute_dataframe_hash(df: pl.DataFrame) -> str:
     # Add sum of numeric columns for content verification
     for col in df.columns:
         dtype = df[col].dtype
-        if dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
-                     pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
-                     pl.Float32, pl.Float64):
+        if dtype in (
+            pl.Int8,
+            pl.Int16,
+            pl.Int32,
+            pl.Int64,
+            pl.UInt8,
+            pl.UInt16,
+            pl.UInt32,
+            pl.UInt64,
+            pl.Float32,
+            pl.Float64,
+        ):
             try:
                 col_sum = df[col].sum()
                 hash_parts.append(f"{col}:{col_sum}")
             except Exception:
                 pass
+        elif dtype == pl.Boolean:
+            # Count True values for boolean columns (important for annotations)
+            try:
+                true_count = df[col].sum()  # True=1, False=0
+                hash_parts.append(f"{col}_bool:{true_count}")
+            except Exception:
+                pass
+        elif dtype == pl.Utf8 and col.startswith("_dynamic"):
+            # Hash content of dynamic string columns (annotations)
+            try:
+                # Use hash of all non-empty values for annotation text
+                non_empty = df[col].filter(pl.col(col) != "").to_list()
+                if non_empty:
+                    hash_parts.append(f"{col}_str:{hash(tuple(non_empty))}")
+            except Exception:
+                pass
 
     hash_input = "|".join(hash_parts).encode()
     return hashlib.sha256(hash_input).hexdigest()
```
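The new Boolean branch relies on Polars counting `True` values when summing a Boolean column, so flipping a single annotation flag changes the resulting hash:

```python
import polars as pl

flags = pl.Series("annotated", [True, False, True])
print(flags.sum())  # 2 - True counts as 1, so the hash captures the toggle count
```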
```diff
@@ -133,6 +241,8 @@ def _filter_and_collect(
         data = data.filter(pl.col(column) == selected_value)
 
     # Collect to Polars DataFrame
+    # Note: Type optimization (Int64→Int32, Float64→Float32) is applied at cache
+    # creation time in base.py._save_to_cache(), so data is already optimized
     df_polars = data.collect()
 
     # Compute hash efficiently (no pickle)
```
```diff
@@ -268,10 +378,10 @@ def filter_by_range(
         data = data.lazy()
 
     return data.filter(
-        (pl.col(x_column) >= x_range[0]) &
-        (pl.col(x_column) <= x_range[1]) &
-        (pl.col(y_column) >= y_range[0]) &
-        (pl.col(y_column) <= y_range[1])
+        (pl.col(x_column) >= x_range[0])
+        & (pl.col(x_column) <= x_range[1])
+        & (pl.col(y_column) >= y_range[0])
+        & (pl.col(y_column) <= y_range[1])
     )
 
 
```
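One detail worth flagging in this hunk: the predicates must be combined with the `&` operator (which the reformat merely moved to the line starts), because Python's `and` would try to coerce a Polars expression to a plain bool. A minimal sketch with assumed column names and ranges:

```python
import polars as pl

lf = pl.LazyFrame({"x": [1, 5, 9], "y": [2, 6, 10]})
x_range, y_range = (2, 8), (3, 9)

filtered = lf.filter(
    (pl.col("x") >= x_range[0])
    & (pl.col("x") <= x_range[1])
    & (pl.col("y") >= y_range[0])
    & (pl.col("y") <= y_range[1])
)
print(filtered.collect())  # only the row x=5, y=6 survives
```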