openms-insight 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,15 @@
 """Preprocessing utilities for data transformation and filtering."""
 
-from .filtering import (
-    filter_by_selection,
-    filter_by_index,
-    filter_and_collect_cached,
-)
-
 from .compression import (
     compute_compression_levels,
     downsample_2d,
     downsample_2d_simple,
 )
+from .filtering import (
+    filter_and_collect_cached,
+    filter_by_index,
+    filter_by_selection,
+)
 
 __all__ = [
     "filter_by_selection",
@@ -13,6 +13,7 @@ import polars as pl
 
 try:
     from scipy.stats import binned_statistic_2d
+
     HAS_SCIPY = True
 except ImportError:
     HAS_SCIPY = False
@@ -55,12 +56,10 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 
     # Generate levels at each power of 10, scaled by the fractional part
     scale_factor = int(10 ** (np.log10(min_size) % 1))
-    levels = np.logspace(
-        min_power,
-        max_power,
-        max_power - min_power + 1,
-        dtype='int'
-    ) * scale_factor
+    levels = (
+        np.logspace(min_power, max_power, max_power - min_power + 1, dtype="int")
+        * scale_factor
+    )
 
     # Filter out levels >= total (don't include full resolution for large datasets)
     levels = levels[levels < total].tolist()
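As a worked example of the scale-factor arithmetic above (values chosen for illustration): with min_size = 5000, log10(5000) ≈ 3.699, so scale_factor = int(10**0.699) = 5, and the logspace levels 1, 10, 100, 1000 are scaled to 5, 50, 500, 5000 before any level >= total is dropped.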
@@ -75,9 +74,9 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 def downsample_2d(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
 ) -> pl.LazyFrame:
@@ -106,8 +105,7 @@ def downsample_2d(
     """
    if not HAS_SCIPY:
         raise ImportError(
-            "scipy is required for downsample_2d. "
-            "Install with: pip install scipy"
+            "scipy is required for downsample_2d. Install with: pip install scipy"
         )
 
     if (x_bins * y_bins) > max_points:
@@ -122,12 +120,9 @@ def downsample_2d(
 
     # Sort by intensity (descending) to prioritize high-intensity points
     sorted_data = (
-        data
-        .sort([x_column, intensity_column], descending=[False, True])
-        .with_columns([
-            pl.int_range(pl.len()).over(x_column).alias('_rank')
-        ])
-        .sort(['_rank', intensity_column], descending=[False, True])
+        data.sort([x_column, intensity_column], descending=[False, True])
+        .with_columns([pl.int_range(pl.len()).over(x_column).alias("_rank")])
+        .sort(["_rank", intensity_column], descending=[False, True])
     )
 
     # Collect for scipy binning (requires numpy arrays)
@@ -136,7 +131,7 @@ def downsample_2d(
     total_count = len(collected)
     if total_count <= max_points:
         # No downsampling needed
-        return collected.drop('_rank').lazy()
+        return collected.drop("_rank").lazy()
 
     # Extract arrays for scipy
     x_array = collected[x_column].to_numpy()
@@ -145,18 +140,20 @@ def downsample_2d(
 
     # Compute 2D bins
     count, _, _, mapping = binned_statistic_2d(
-        x_array, y_array, intensity_array, 'count',
+        x_array,
+        y_array,
+        intensity_array,
+        "count",
         bins=[x_bins, y_bins],
-        expand_binnumbers=True
+        expand_binnumbers=True,
     )
 
     # Add bin indices to dataframe
-    binned_data = (
-        collected.lazy()
-        .with_columns([
-            pl.Series('_x_bin', mapping[0] - 1),  # scipy uses 1-based indexing
-            pl.Series('_y_bin', mapping[1] - 1)
-        ])
+    binned_data = collected.lazy().with_columns(
+        [
+            pl.Series("_x_bin", mapping[0] - 1),  # scipy uses 1-based indexing
+            pl.Series("_y_bin", mapping[1] - 1),
+        ]
     )
 
     # Compute max peaks per bin to stay under limit
@@ -174,11 +171,10 @@ def downsample_2d(
 
     # Keep top N peaks per bin
     result = (
-        binned_data
-        .group_by(['_x_bin', '_y_bin'])
+        binned_data.group_by(["_x_bin", "_y_bin"])
         .head(max_peaks_per_bin)
         .sort(intensity_column)
-        .drop(['_rank', '_x_bin', '_y_bin'])
+        .drop(["_rank", "_x_bin", "_y_bin"])
     )
 
     return result
@@ -187,7 +183,7 @@ def downsample_2d(
 def downsample_2d_simple(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    intensity_column: str = 'intensity',
+    intensity_column: str = "intensity",
 ) -> pl.LazyFrame:
     """
     Simple downsampling by keeping highest-intensity points.
@@ -206,19 +202,15 @@ def downsample_2d_simple(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    return (
-        data
-        .sort(intensity_column, descending=True)
-        .head(max_points)
-    )
+    return data.sort(intensity_column, descending=True).head(max_points)
 
 
 def downsample_2d_streaming(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
     x_range: Optional[tuple] = None,
@@ -262,43 +254,51 @@ def downsample_2d_streaming(
             ((pl.col(x_column) - x_min) / (x_max - x_min + 1e-10) * x_bins)
             .cast(pl.Int32)
             .clip(0, x_bins - 1)
-            .alias('_x_bin')
+            .alias("_x_bin")
         )
         y_bin_expr = (
             ((pl.col(y_column) - y_min) / (y_max - y_min + 1e-10) * y_bins)
             .cast(pl.Int32)
             .clip(0, y_bins - 1)
-            .alias('_y_bin')
+            .alias("_y_bin")
         )
 
         result = (
-            data
-            .with_columns([x_bin_expr, y_bin_expr])
+            data.with_columns([x_bin_expr, y_bin_expr])
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
     else:
         # Need to compute ranges - still lazy using over() window
         # First pass: add normalized bin columns using min/max over entire frame
         result = (
-            data
-            .with_columns([
-                # Compute bin indices using window functions for min/max
-                (
-                    (pl.col(x_column) - pl.col(x_column).min()) /
-                    (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10) * x_bins
-                ).cast(pl.Int32).clip(0, x_bins - 1).alias('_x_bin'),
-                (
-                    (pl.col(y_column) - pl.col(y_column).min()) /
-                    (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10) * y_bins
-                ).cast(pl.Int32).clip(0, y_bins - 1).alias('_y_bin'),
-            ])
+            data.with_columns(
+                [
+                    # Compute bin indices using window functions for min/max
+                    (
+                        (pl.col(x_column) - pl.col(x_column).min())
+                        / (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10)
+                        * x_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, x_bins - 1)
+                    .alias("_x_bin"),
+                    (
+                        (pl.col(y_column) - pl.col(y_column).min())
+                        / (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10)
+                        * y_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, y_bins - 1)
+                    .alias("_y_bin"),
+                ]
+            )
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
 
     return result
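The fixed-range branch above reduces to a normalize, scale, truncate, clamp pipeline per coordinate. A minimal sketch of that expression on toy data (the frame contents, range, and bin count here are invented for illustration):

import polars as pl

df = pl.DataFrame({"x": [0.0, 50.0, 99.9], "intensity": [1.0, 5.0, 3.0]})
x_min, x_max, x_bins = 0.0, 100.0, 400

# Same steps as x_bin_expr: normalize to [0, 1), scale to the bin count,
# truncate to Int32, then clamp to the valid index range.
out = df.with_columns(
    ((pl.col("x") - x_min) / (x_max - x_min + 1e-10) * x_bins)
    .cast(pl.Int32)
    .clip(0, x_bins - 1)
    .alias("_x_bin")
)
print(out["_x_bin"].to_list())  # [0, 199, 399]

The 1e-10 in the denominator keeps a point at exactly x_max out of the out-of-range bin x_bins and also guards against division by zero when the range is degenerate.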
@@ -325,14 +325,16 @@ def get_data_range(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    stats = data.select([
-        pl.col(x_column).min().alias('x_min'),
-        pl.col(x_column).max().alias('x_max'),
-        pl.col(y_column).min().alias('y_min'),
-        pl.col(y_column).max().alias('y_max'),
-    ]).collect()
+    stats = data.select(
+        [
+            pl.col(x_column).min().alias("x_min"),
+            pl.col(x_column).max().alias("x_max"),
+            pl.col(y_column).min().alias("y_min"),
+            pl.col(y_column).max().alias("y_max"),
+        ]
+    ).collect()
 
     return (
-        (stats['x_min'][0], stats['x_max'][0]),
-        (stats['y_min'][0], stats['y_max'][0]),
+        (stats["x_min"][0], stats["x_max"][0]),
+        (stats["y_min"][0], stats["y_max"][0]),
     )
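All of the compression hunks above are formatting changes (black-style quoting and line wrapping); signatures and behavior are unchanged. A minimal usage sketch of the downsampling entry points, assuming the distribution installs as the openms_insight module and relying on the preprocessing re-exports shown in the first hunk (sample data invented):

import numpy as np
import polars as pl

from openms_insight.preprocessing import downsample_2d_simple  # assumed module path

rng = np.random.default_rng(0)
df = pl.DataFrame(
    {
        "x": rng.uniform(0.0, 100.0, 100_000),
        "y": rng.uniform(0.0, 50.0, 100_000),
        "intensity": rng.exponential(1.0, 100_000),
    }
)

# Keep only the 20,000 highest-intensity points; the result stays lazy.
small = downsample_2d_simple(df, max_points=20_000).collect()
print(small.height)  # 20000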
@@ -1,11 +1,10 @@
 """Data filtering utilities for selection-based filtering."""
 
+import hashlib
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-import hashlib
 import pandas as pd
 import polars as pl
-import streamlit as st
 
 
 def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
@@ -35,10 +34,12 @@ def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
         # JS safe integer is 2^53, but Int32 range is simpler and sufficient for most data
         if dtype == pl.Int64:
             # Get min/max in a single pass
-            stats = df.select([
-                pl.col(col).min().alias('min'),
-                pl.col(col).max().alias('max'),
-            ]).row(0)
+            stats = df.select(
+                [
+                    pl.col(col).min().alias("min"),
+                    pl.col(col).max().alias("max"),
+                ]
+            ).row(0)
             col_min, col_max = stats
 
             if col_min is not None and col_max is not None:
@@ -150,14 +151,39 @@ def compute_dataframe_hash(df: pl.DataFrame) -> str:
     # Add sum of numeric columns for content verification
     for col in df.columns:
         dtype = df[col].dtype
-        if dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
-                     pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
-                     pl.Float32, pl.Float64):
+        if dtype in (
+            pl.Int8,
+            pl.Int16,
+            pl.Int32,
+            pl.Int64,
+            pl.UInt8,
+            pl.UInt16,
+            pl.UInt32,
+            pl.UInt64,
+            pl.Float32,
+            pl.Float64,
+        ):
             try:
                 col_sum = df[col].sum()
                 hash_parts.append(f"{col}:{col_sum}")
             except Exception:
                 pass
+        elif dtype == pl.Boolean:
+            # Count True values for boolean columns (important for annotations)
+            try:
+                true_count = df[col].sum()  # True=1, False=0
+                hash_parts.append(f"{col}_bool:{true_count}")
+            except Exception:
+                pass
+        elif dtype == pl.Utf8 and col.startswith("_dynamic"):
+            # Hash content of dynamic string columns (annotations)
+            try:
+                # Use hash of all non-empty values for annotation text
+                non_empty = df[col].filter(pl.col(col) != "").to_list()
+                if non_empty:
+                    hash_parts.append(f"{col}_str:{hash(tuple(non_empty))}")
+            except Exception:
+                pass
 
     hash_input = "|".join(hash_parts).encode()
     return hashlib.sha256(hash_input).hexdigest()
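The substantive change in this file is to compute_dataframe_hash: 0.1.3 additionally folds Boolean columns (via their True count) and "_dynamic*" string columns into the digest, so edits to selection flags or annotation text now invalidate caches keyed on the hash. A small sketch of the effect, assuming the module path openms_insight.preprocessing.filtering and invented sample data:

import polars as pl

from openms_insight.preprocessing.filtering import compute_dataframe_hash  # assumed path

df = pl.DataFrame({"id": [1, 2], "selected": [False, False]})
h1 = compute_dataframe_hash(df)

# Flip one boolean cell; shape and numeric sums are unchanged,
# but the True count for "selected" goes from 0 to 1.
df2 = df.with_columns(pl.Series("selected", [True, False]))
h2 = compute_dataframe_hash(df2)

assert h1 != h2  # boolean content now participates in the hash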
@@ -352,10 +378,10 @@ def filter_by_range(
         data = data.lazy()
 
     return data.filter(
-        (pl.col(x_column) >= x_range[0]) &
-        (pl.col(x_column) <= x_range[1]) &
-        (pl.col(y_column) >= y_range[0]) &
-        (pl.col(y_column) <= y_range[1])
+        (pl.col(x_column) >= x_range[0])
+        & (pl.col(x_column) <= x_range[1])
+        & (pl.col(y_column) >= y_range[0])
+        & (pl.col(y_column) <= y_range[1])
     )
 
 
@@ -1,6 +1,6 @@
 """Rendering utilities for Python-to-Vue communication."""
 
-from .bridge import render_component, get_vue_component_function
+from .bridge import get_vue_component_function, render_component
 
 __all__ = [
     "render_component",