openms-insight 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1,16 +1,15 @@
 """Preprocessing utilities for data transformation and filtering."""
 
-from .filtering import (
-    filter_by_selection,
-    filter_by_index,
-    filter_and_collect_cached,
-)
-
 from .compression import (
     compute_compression_levels,
     downsample_2d,
     downsample_2d_simple,
 )
+from .filtering import (
+    filter_and_collect_cached,
+    filter_by_index,
+    filter_by_selection,
+)
 
 __all__ = [
     "filter_by_selection",
@@ -13,6 +13,7 @@ import polars as pl
 
 try:
     from scipy.stats import binned_statistic_2d
+
     HAS_SCIPY = True
 except ImportError:
     HAS_SCIPY = False
@@ -55,12 +56,10 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 
     # Generate levels at each power of 10, scaled by the fractional part
     scale_factor = int(10 ** (np.log10(min_size) % 1))
-    levels = np.logspace(
-        min_power,
-        max_power,
-        max_power - min_power + 1,
-        dtype='int'
-    ) * scale_factor
+    levels = (
+        np.logspace(min_power, max_power, max_power - min_power + 1, dtype="int")
+        * scale_factor
+    )
 
     # Filter out levels >= total (don't include full resolution for large datasets)
     levels = levels[levels < total].tolist()
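
The collapsed expression is behavior-preserving: one candidate level per power of ten, each scaled by the truncated leading digit of min_size. A worked sketch of what it computes (min_power and max_power are not shown in this hunk; deriving them as the integer decades of min_size and total is an assumption):

    import numpy as np

    min_size, total = 2_500, 1_000_000
    min_power = int(np.log10(min_size))  # 3 (assumed derivation, not in the hunk)
    max_power = int(np.log10(total))     # 6

    scale_factor = int(10 ** (np.log10(min_size) % 1))  # 2 (mantissa truncated)
    levels = (
        np.logspace(min_power, max_power, max_power - min_power + 1, dtype="int")
        * scale_factor
    )
    # One level per decade, full resolution excluded: [2000, 20000, 200000]
    print(levels[levels < total].tolist())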
@@ -75,9 +74,9 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 def downsample_2d(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
 ) -> pl.LazyFrame:
@@ -106,8 +105,7 @@ def downsample_2d(
     """
    if not HAS_SCIPY:
        raise ImportError(
-            "scipy is required for downsample_2d. "
-            "Install with: pip install scipy"
+            "scipy is required for downsample_2d. Install with: pip install scipy"
        )
 
    if (x_bins * y_bins) > max_points:
@@ -122,12 +120,9 @@ def downsample_2d(
 
     # Sort by intensity (descending) to prioritize high-intensity points
     sorted_data = (
-        data
-        .sort([x_column, intensity_column], descending=[False, True])
-        .with_columns([
-            pl.int_range(pl.len()).over(x_column).alias('_rank')
-        ])
-        .sort(['_rank', intensity_column], descending=[False, True])
+        data.sort([x_column, intensity_column], descending=[False, True])
+        .with_columns([pl.int_range(pl.len()).over(x_column).alias("_rank")])
+        .sort(["_rank", intensity_column], descending=[False, True])
     )
 
     # Collect for scipy binning (requires numpy arrays)
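
The collapsed chain keeps the same logic: sort each x group by descending intensity, then number rows within the group so that `_rank` 0 is the group's most intense point. A minimal standalone illustration of the window trick:

    import polars as pl

    df = pl.DataFrame({"x": [1, 1, 2], "intensity": [5.0, 9.0, 7.0]})
    ranked = df.sort(["x", "intensity"], descending=[False, True]).with_columns(
        pl.int_range(pl.len()).over("x").alias("_rank")
    )
    print(ranked)  # x=1 rows get _rank 0 (intensity 9.0) and 1; x=2 gets _rank 0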
@@ -136,7 +131,7 @@ def downsample_2d(
     total_count = len(collected)
     if total_count <= max_points:
         # No downsampling needed
-        return collected.drop('_rank').lazy()
+        return collected.drop("_rank").lazy()
 
     # Extract arrays for scipy
     x_array = collected[x_column].to_numpy()
@@ -145,18 +140,20 @@ def downsample_2d(
 
     # Compute 2D bins
     count, _, _, mapping = binned_statistic_2d(
-        x_array, y_array, intensity_array, 'count',
+        x_array,
+        y_array,
+        intensity_array,
+        "count",
         bins=[x_bins, y_bins],
-        expand_binnumbers=True
+        expand_binnumbers=True,
     )
 
     # Add bin indices to dataframe
-    binned_data = (
-        collected.lazy()
-        .with_columns([
-            pl.Series('_x_bin', mapping[0] - 1),  # scipy uses 1-based indexing
-            pl.Series('_y_bin', mapping[1] - 1)
-        ])
+    binned_data = collected.lazy().with_columns(
+        [
+            pl.Series("_x_bin", mapping[0] - 1),  # scipy uses 1-based indexing
+            pl.Series("_y_bin", mapping[1] - 1),
+        ]
     )
 
     # Compute max peaks per bin to stay under limit
@@ -174,11 +171,10 @@ def downsample_2d(
 
     # Keep top N peaks per bin
     result = (
-        binned_data
-        .group_by(['_x_bin', '_y_bin'])
+        binned_data.group_by(["_x_bin", "_y_bin"])
         .head(max_peaks_per_bin)
         .sort(intensity_column)
-        .drop(['_rank', '_x_bin', '_y_bin'])
+        .drop(["_rank", "_x_bin", "_y_bin"])
     )
 
     return result
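
All of the downsample_2d hunks above are Black-style reformatting with no functional change. A usage sketch, assuming the wheel installs as the openms_insight import package and scipy is available:

    import numpy as np
    import polars as pl

    from openms_insight.preprocessing import downsample_2d  # assumed import path

    rng = np.random.default_rng(0)
    n = 100_000
    peaks = pl.LazyFrame(
        {
            "x": rng.uniform(0.0, 60.0, n),      # e.g. retention time
            "y": rng.uniform(100.0, 2000.0, n),  # e.g. m/z
            "intensity": rng.exponential(1e4, n),
        }
    )

    reduced = downsample_2d(peaks, max_points=20_000).collect()
    assert len(reduced) <= 20_000  # top peaks per grid cell, under the cap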
@@ -187,7 +183,7 @@ def downsample_2d(
 def downsample_2d_simple(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    intensity_column: str = 'intensity',
+    intensity_column: str = "intensity",
 ) -> pl.LazyFrame:
     """
     Simple downsampling by keeping highest-intensity points.
@@ -206,19 +202,15 @@ def downsample_2d_simple(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    return (
-        data
-        .sort(intensity_column, descending=True)
-        .head(max_points)
-    )
+    return data.sort(intensity_column, descending=True).head(max_points)
 
 
 def downsample_2d_streaming(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
     x_range: Optional[tuple] = None,
@@ -262,43 +254,51 @@
             ((pl.col(x_column) - x_min) / (x_max - x_min + 1e-10) * x_bins)
             .cast(pl.Int32)
             .clip(0, x_bins - 1)
-            .alias('_x_bin')
+            .alias("_x_bin")
         )
         y_bin_expr = (
             ((pl.col(y_column) - y_min) / (y_max - y_min + 1e-10) * y_bins)
             .cast(pl.Int32)
             .clip(0, y_bins - 1)
-            .alias('_y_bin')
+            .alias("_y_bin")
         )
 
         result = (
-            data
-            .with_columns([x_bin_expr, y_bin_expr])
+            data.with_columns([x_bin_expr, y_bin_expr])
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
     else:
         # Need to compute ranges - still lazy using over() window
         # First pass: add normalized bin columns using min/max over entire frame
         result = (
-            data
-            .with_columns([
-                # Compute bin indices using window functions for min/max
-                (
-                    (pl.col(x_column) - pl.col(x_column).min()) /
-                    (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10) * x_bins
-                ).cast(pl.Int32).clip(0, x_bins - 1).alias('_x_bin'),
-                (
-                    (pl.col(y_column) - pl.col(y_column).min()) /
-                    (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10) * y_bins
-                ).cast(pl.Int32).clip(0, y_bins - 1).alias('_y_bin'),
-            ])
+            data.with_columns(
+                [
+                    # Compute bin indices using window functions for min/max
+                    (
+                        (pl.col(x_column) - pl.col(x_column).min())
+                        / (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10)
+                        * x_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, x_bins - 1)
+                    .alias("_x_bin"),
+                    (
+                        (pl.col(y_column) - pl.col(y_column).min())
+                        / (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10)
+                        * y_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, y_bins - 1)
+                    .alias("_y_bin"),
+                ]
+            )
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
 
     return result
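
downsample_2d_streaming is likewise only reformatted. When x_range and y_range are supplied, the bin edges are constants and the whole plan stays lazy. The function is not re-exported in the package __init__ shown above, so this sketch imports it from the submodule (import path and input file are assumptions):

    import polars as pl

    from openms_insight.preprocessing.compression import (  # assumed import path
        downsample_2d_streaming,
    )

    lf = pl.scan_parquet("peaks.parquet")  # hypothetical input file
    reduced = downsample_2d_streaming(
        lf,
        max_points=20_000,
        x_range=(0.0, 60.0),  # known bounds keep the plan fully lazy
        y_range=(100.0, 2000.0),
    )
    print(reduced.collect())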
@@ -325,14 +325,16 @@ def get_data_range(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    stats = data.select([
-        pl.col(x_column).min().alias('x_min'),
-        pl.col(x_column).max().alias('x_max'),
-        pl.col(y_column).min().alias('y_min'),
-        pl.col(y_column).max().alias('y_max'),
-    ]).collect()
+    stats = data.select(
+        [
+            pl.col(x_column).min().alias("x_min"),
+            pl.col(x_column).max().alias("x_max"),
+            pl.col(y_column).min().alias("y_min"),
+            pl.col(y_column).max().alias("y_max"),
+        ]
+    ).collect()
 
     return (
-        (stats['x_min'][0], stats['x_max'][0]),
-        (stats['y_min'][0], stats['y_max'][0]),
+        (stats["x_min"][0], stats["x_max"][0]),
+        (stats["y_min"][0], stats["y_max"][0]),
     )
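
get_data_range receives the same cosmetic treatment; it collects only the four min/max aggregates, so it is a cheap precursor to the streaming downsampler. Sketch (the "x"/"y" column-name defaults are assumed, as is the import):

    import polars as pl

    lf = pl.LazyFrame({"x": [0.0, 4.2, 1.1], "y": [100.0, 950.0, 420.0]})
    (x_min, x_max), (y_min, y_max) = get_data_range(lf)  # from this module
    print((x_min, x_max), (y_min, y_max))  # (0.0, 4.2) (100.0, 950.0)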
@@ -1,11 +1,94 @@
 """Data filtering utilities for selection-based filtering."""
 
+import hashlib
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-import hashlib
 import pandas as pd
 import polars as pl
-import streamlit as st
+
+
+def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
+    """
+    Optimize DataFrame types for efficient Arrow transfer to frontend.
+
+    This function downcasts numeric types to reduce Arrow payload size and
+    avoid BigInt overhead in JavaScript:
+    - Int64 → Int32 (if values fit): Avoids BigInt conversion in JS
+    - Float64 → Float32: Sufficient precision for visualization
+
+    Args:
+        df: Polars DataFrame to optimize
+
+    Returns:
+        DataFrame with optimized types
+    """
+    if len(df) == 0:
+        return df
+
+    casts = []
+
+    for col in df.columns:
+        dtype = df[col].dtype
+
+        # Downcast Int64 to Int32 to avoid BigInt in JavaScript
+        # JS safe integer is 2^53, but Int32 range is simpler and sufficient for most data
+        if dtype == pl.Int64:
+            # Get min/max in a single pass
+            stats = df.select(
+                [
+                    pl.col(col).min().alias("min"),
+                    pl.col(col).max().alias("max"),
+                ]
+            ).row(0)
+            col_min, col_max = stats
+
+            if col_min is not None and col_max is not None:
+                # Int32 range: -2,147,483,648 to 2,147,483,647
+                if col_min >= -2147483648 and col_max <= 2147483647:
+                    casts.append(pl.col(col).cast(pl.Int32))
+
+        # Downcast Float64 to Float32 (sufficient for display)
+        # Float32 has ~7 significant digits - enough for visualization
+        elif dtype == pl.Float64:
+            casts.append(pl.col(col).cast(pl.Float32))
+
+    if casts:
+        df = df.with_columns(casts)
+
+    return df
+
+
+def optimize_for_transfer_lazy(lf: pl.LazyFrame) -> pl.LazyFrame:
+    """
+    Optimize LazyFrame types for efficient Arrow transfer (streaming-safe).
+
+    Unlike optimize_for_transfer(), this only applies optimizations that don't
+    require knowing the data values, preserving the ability to stream via sink_parquet().
+
+    Currently applies:
+    - Float64 → Float32: Always safe, no bounds check needed
+
+    Int64 → Int32 is NOT applied here because it requires bounds checking.
+    Use optimize_for_transfer() on collected DataFrames for full optimization.
+
+    Args:
+        lf: Polars LazyFrame to optimize
+
+    Returns:
+        LazyFrame with Float64 columns cast to Float32
+    """
+    schema = lf.collect_schema()
+    casts = []
+
+    for col, dtype in zip(schema.names(), schema.dtypes()):
+        # Only Float64 → Float32 is safe without bounds checking
+        if dtype == pl.Float64:
+            casts.append(pl.col(col).cast(pl.Float32))
+
+    if casts:
+        lf = lf.with_columns(casts)
+
+    return lf
 
 
 def _make_cache_key(
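
This hunk opens the filtering module: besides moving `import hashlib` into the standard-library group and dropping the module-level `streamlit` import, it adds two downcasting helpers. Their effect, in a minimal sketch (assuming both are importable from this module):

    import polars as pl

    df = pl.DataFrame(
        {
            "scan_id": pl.Series([1, 2, 3], dtype=pl.Int64),
            "counts": pl.Series([0, 0, 2**40], dtype=pl.Int64),  # exceeds Int32
            "mz": pl.Series([100.5, 200.25, 300.125], dtype=pl.Float64),
        }
    )

    out = optimize_for_transfer(df)
    print(out.schema)
    # scan_id -> Int32 (fits), counts -> Int64 (kept: out of Int32 range),
    # mz -> Float32 (always narrowed)

    lazy_out = optimize_for_transfer_lazy(df.lazy())
    print(lazy_out.collect_schema())  # only mz narrows; Int64 needs a bounds check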
@@ -68,14 +151,39 @@ def compute_dataframe_hash(df: pl.DataFrame) -> str:
     # Add sum of numeric columns for content verification
     for col in df.columns:
         dtype = df[col].dtype
-        if dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
-                     pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
-                     pl.Float32, pl.Float64):
+        if dtype in (
+            pl.Int8,
+            pl.Int16,
+            pl.Int32,
+            pl.Int64,
+            pl.UInt8,
+            pl.UInt16,
+            pl.UInt32,
+            pl.UInt64,
+            pl.Float32,
+            pl.Float64,
+        ):
             try:
                 col_sum = df[col].sum()
                 hash_parts.append(f"{col}:{col_sum}")
             except Exception:
                 pass
+        elif dtype == pl.Boolean:
+            # Count True values for boolean columns (important for annotations)
+            try:
+                true_count = df[col].sum()  # True=1, False=0
+                hash_parts.append(f"{col}_bool:{true_count}")
+            except Exception:
+                pass
+        elif dtype == pl.Utf8 and col.startswith("_dynamic"):
+            # Hash content of dynamic string columns (annotations)
+            try:
+                # Use hash of all non-empty values for annotation text
+                non_empty = df[col].filter(pl.col(col) != "").to_list()
+                if non_empty:
+                    hash_parts.append(f"{col}_str:{hash(tuple(non_empty))}")
+            except Exception:
+                pass
 
     hash_input = "|".join(hash_parts).encode()
     return hashlib.sha256(hash_input).hexdigest()
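
With the widened dispatch, Boolean columns and `_dynamic*` string columns now contribute to the content hash, so toggling an annotation flag invalidates the corresponding cache entry. An illustrative sketch (compute_dataframe_hash assumed importable from this module):

    import polars as pl

    a = pl.DataFrame({"mz": [100.5, 200.25], "_annotated": [True, False]})
    b = a.with_columns(pl.Series("_annotated", [False, False]))

    # The numeric column sums match, but the Boolean True-counts (1 vs 0)
    # differ, so the two frames now hash differently.
    assert compute_dataframe_hash(a) != compute_dataframe_hash(b)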
@@ -133,6 +241,8 @@ def _filter_and_collect(
         data = data.filter(pl.col(column) == selected_value)
 
     # Collect to Polars DataFrame
+    # Note: Type optimization (Int64→Int32, Float64→Float32) is applied at cache
+    # creation time in base.py._save_to_cache(), so data is already optimized
     df_polars = data.collect()
 
     # Compute hash efficiently (no pickle)
@@ -268,10 +378,10 @@ def filter_by_range(
         data = data.lazy()
 
     return data.filter(
-        (pl.col(x_column) >= x_range[0]) &
-        (pl.col(x_column) <= x_range[1]) &
-        (pl.col(y_column) >= y_range[0]) &
-        (pl.col(y_column) <= y_range[1])
+        (pl.col(x_column) >= x_range[0])
+        & (pl.col(x_column) <= x_range[1])
+        & (pl.col(y_column) >= y_range[0])
+        & (pl.col(y_column) <= y_range[1])
     )
 
 
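filter_by_range changes only operator placement (leading `&`, Black style). Usage sketch (the "x"/"y" column-name defaults and the keyword argument names are assumed from the body above):

    import polars as pl

    lf = pl.LazyFrame(
        {"x": [0.5, 1.5, 2.5], "y": [10.0, 20.0, 30.0], "intensity": [1.0, 2.0, 3.0]}
    )
    window = filter_by_range(lf, x_range=(1.0, 3.0), y_range=(15.0, 35.0))
    print(window.collect())  # rows with 1.0 <= x <= 3.0 and 15.0 <= y <= 35.0
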
@@ -1,6 +1,6 @@
 """Rendering utilities for Python-to-Vue communication."""
 
-from .bridge import render_component, get_vue_component_function
+from .bridge import get_vue_component_function, render_component
 
 __all__ = [
     "render_component",