openms-insight 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,15 @@
 """Preprocessing utilities for data transformation and filtering."""
 
-from .filtering import (
-    filter_by_selection,
-    filter_by_index,
-    filter_and_collect_cached,
-)
-
 from .compression import (
     compute_compression_levels,
     downsample_2d,
     downsample_2d_simple,
 )
+from .filtering import (
+    filter_and_collect_cached,
+    filter_by_index,
+    filter_by_selection,
+)
 
 __all__ = [
     "filter_by_selection",
@@ -6,18 +6,73 @@ data, enabling efficient visualization of datasets with millions of points.
 Supports both streaming (lazy) and eager downsampling approaches.
 """
 
-from typing import List, Optional, Union
+import math
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import polars as pl
 
 try:
     from scipy.stats import binned_statistic_2d
+
     HAS_SCIPY = True
 except ImportError:
     HAS_SCIPY = False
 
 
+def compute_optimal_bins(
+    target_points: int,
+    x_range: Tuple[float, float],
+    y_range: Tuple[float, float],
+) -> Tuple[int, int]:
+    """
+    Compute optimal x_bins, y_bins for even spatial distribution.
+
+    The bin grid matches the data's aspect ratio so bins are approximately
+    square in data space. Total bins ≈ target_points for 1 point per bin.
+
+    Solves the system:
+        x_bins × y_bins = target_points
+        x_bins / y_bins = aspect_ratio
+
+    Solution:
+        y_bins = sqrt(target_points / aspect_ratio)
+        x_bins = sqrt(target_points × aspect_ratio)
+
+    Args:
+        target_points: Target number of bins (and thus max points with 1 per bin)
+        x_range: (x_min, x_max) data range
+        y_range: (y_min, y_max) data range
+
+    Returns:
+        (x_bins, y_bins) tuple
+
+    Examples:
+        >>> compute_optimal_bins(10000, (0, 1000), (0, 100))  # 10:1 aspect
+        (316, 31)
+        >>> compute_optimal_bins(10000, (0, 100), (0, 100))  # 1:1 aspect
+        (100, 100)
+    """
+    x_span = x_range[1] - x_range[0]
+    y_span = y_range[1] - y_range[0]
+
+    # Handle edge cases
+    if y_span < 1e-10:
+        y_span = x_span if x_span > 1e-10 else 1.0
+    if x_span < 1e-10:
+        x_span = y_span
+
+    aspect_ratio = x_span / y_span
+
+    # Clamp to reasonable bounds (avoid extreme rectangles)
+    aspect_ratio = max(0.05, min(20.0, aspect_ratio))
+
+    y_bins = max(1, int(math.sqrt(target_points / aspect_ratio)))
+    x_bins = max(1, int(math.sqrt(target_points * aspect_ratio)))
+
+    return x_bins, y_bins
+
+
 def compute_compression_levels(min_size: int, total: int) -> List[int]:
     """
     Compute logarithmically-spaced compression level target sizes.
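The compute_optimal_bins helper added above sizes the bin grid from the data's aspect ratio. A minimal worked sketch of the same arithmetic, reproducing the 10:1 example from its docstring (values chosen purely for illustration):

    import math

    target_points = 10_000
    x_span, y_span = 1000.0, 100.0  # e.g. x covers 0-1000, y covers 0-100
    aspect_ratio = max(0.05, min(20.0, x_span / y_span))  # clamped, here 10.0

    y_bins = max(1, int(math.sqrt(target_points / aspect_ratio)))  # sqrt(1000)   -> 31
    x_bins = max(1, int(math.sqrt(target_points * aspect_ratio)))  # sqrt(100000) -> 316
    print(x_bins, y_bins, x_bins * y_bins)  # 316 31 9796, roughly one point per bin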
@@ -55,12 +110,10 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 
     # Generate levels at each power of 10, scaled by the fractional part
     scale_factor = int(10 ** (np.log10(min_size) % 1))
-    levels = np.logspace(
-        min_power,
-        max_power,
-        max_power - min_power + 1,
-        dtype='int'
-    ) * scale_factor
+    levels = (
+        np.logspace(min_power, max_power, max_power - min_power + 1, dtype="int")
+        * scale_factor
+    )
 
     # Filter out levels >= total (don't include full resolution for large datasets)
     levels = levels[levels < total].tolist()
@@ -75,9 +128,9 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 def downsample_2d(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
 ) -> pl.LazyFrame:
@@ -106,8 +159,7 @@ def downsample_2d(
     """
     if not HAS_SCIPY:
         raise ImportError(
-            "scipy is required for downsample_2d. "
-            "Install with: pip install scipy"
+            "scipy is required for downsample_2d. Install with: pip install scipy"
         )
 
     if (x_bins * y_bins) > max_points:
@@ -122,12 +174,9 @@ def downsample_2d(
 
     # Sort by intensity (descending) to prioritize high-intensity points
     sorted_data = (
-        data
-        .sort([x_column, intensity_column], descending=[False, True])
-        .with_columns([
-            pl.int_range(pl.len()).over(x_column).alias('_rank')
-        ])
-        .sort(['_rank', intensity_column], descending=[False, True])
+        data.sort([x_column, intensity_column], descending=[False, True])
+        .with_columns([pl.int_range(pl.len()).over(x_column).alias("_rank")])
+        .sort(["_rank", intensity_column], descending=[False, True])
     )
 
     # Collect for scipy binning (requires numpy arrays)
@@ -136,7 +185,7 @@ def downsample_2d(
     total_count = len(collected)
     if total_count <= max_points:
         # No downsampling needed
-        return collected.drop('_rank').lazy()
+        return collected.drop("_rank").lazy()
 
     # Extract arrays for scipy
     x_array = collected[x_column].to_numpy()
@@ -145,18 +194,20 @@ def downsample_2d(
 
     # Compute 2D bins
     count, _, _, mapping = binned_statistic_2d(
-        x_array, y_array, intensity_array, 'count',
+        x_array,
+        y_array,
+        intensity_array,
+        "count",
         bins=[x_bins, y_bins],
-        expand_binnumbers=True
+        expand_binnumbers=True,
     )
 
     # Add bin indices to dataframe
-    binned_data = (
-        collected.lazy()
-        .with_columns([
-            pl.Series('_x_bin', mapping[0] - 1),  # scipy uses 1-based indexing
-            pl.Series('_y_bin', mapping[1] - 1)
-        ])
+    binned_data = collected.lazy().with_columns(
+        [
+            pl.Series("_x_bin", mapping[0] - 1),  # scipy uses 1-based indexing
+            pl.Series("_y_bin", mapping[1] - 1),
+        ]
     )
 
     # Compute max peaks per bin to stay under limit
@@ -174,11 +225,10 @@ def downsample_2d(
 
     # Keep top N peaks per bin
     result = (
-        binned_data
-        .group_by(['_x_bin', '_y_bin'])
+        binned_data.group_by(["_x_bin", "_y_bin"])
         .head(max_peaks_per_bin)
         .sort(intensity_column)
-        .drop(['_rank', '_x_bin', '_y_bin'])
+        .drop(["_rank", "_x_bin", "_y_bin"])
     )
 
     return result
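The diff does not show a call site for downsample_2d; the following is a hedged usage sketch with synthetic data (the import path is assumed from the package layout above, and scipy must be installed):

    import numpy as np
    import polars as pl

    from openms_insight.preprocessing import downsample_2d  # assumed import path

    # Synthetic peak map with the default column names expected by downsample_2d.
    rng = np.random.default_rng(0)
    n = 1_000_000
    peaks = pl.DataFrame(
        {
            "x": rng.uniform(0, 1000, n),
            "y": rng.uniform(100, 1500, n),
            "intensity": rng.exponential(1e4, n),
        }
    )

    # Keeps at most max_points rows, favouring high-intensity points within
    # each of the x_bins * y_bins spatial cells.
    reduced = downsample_2d(peaks, max_points=20_000, x_bins=400, y_bins=50).collect()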
@@ -187,7 +237,7 @@ def downsample_2d(
 def downsample_2d_simple(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    intensity_column: str = 'intensity',
+    intensity_column: str = "intensity",
 ) -> pl.LazyFrame:
     """
     Simple downsampling by keeping highest-intensity points.
@@ -206,19 +256,15 @@ def downsample_2d_simple(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    return (
-        data
-        .sort(intensity_column, descending=True)
-        .head(max_points)
-    )
+    return data.sort(intensity_column, descending=True).head(max_points)
 
 
 def downsample_2d_streaming(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
     x_range: Optional[tuple] = None,
@@ -262,43 +308,51 @@ def downsample_2d_streaming(
             ((pl.col(x_column) - x_min) / (x_max - x_min + 1e-10) * x_bins)
             .cast(pl.Int32)
             .clip(0, x_bins - 1)
-            .alias('_x_bin')
+            .alias("_x_bin")
         )
         y_bin_expr = (
             ((pl.col(y_column) - y_min) / (y_max - y_min + 1e-10) * y_bins)
             .cast(pl.Int32)
             .clip(0, y_bins - 1)
-            .alias('_y_bin')
+            .alias("_y_bin")
         )
 
         result = (
-            data
-            .with_columns([x_bin_expr, y_bin_expr])
+            data.with_columns([x_bin_expr, y_bin_expr])
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
     else:
         # Need to compute ranges - still lazy using over() window
         # First pass: add normalized bin columns using min/max over entire frame
         result = (
-            data
-            .with_columns([
-                # Compute bin indices using window functions for min/max
-                (
-                    (pl.col(x_column) - pl.col(x_column).min()) /
-                    (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10) * x_bins
-                ).cast(pl.Int32).clip(0, x_bins - 1).alias('_x_bin'),
-                (
-                    (pl.col(y_column) - pl.col(y_column).min()) /
-                    (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10) * y_bins
-                ).cast(pl.Int32).clip(0, y_bins - 1).alias('_y_bin'),
-            ])
+            data.with_columns(
+                [
+                    # Compute bin indices using window functions for min/max
+                    (
+                        (pl.col(x_column) - pl.col(x_column).min())
+                        / (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10)
+                        * x_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, x_bins - 1)
+                    .alias("_x_bin"),
+                    (
+                        (pl.col(y_column) - pl.col(y_column).min())
+                        / (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10)
+                        * y_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, y_bins - 1)
+                    .alias("_y_bin"),
+                ]
+            )
            .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
 
     return result
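downsample_2d_streaming keeps the whole pipeline lazy: bin indices come from Polars expressions and only the top points_per_bin rows per bin survive. A hedged sketch of wiring it up with get_data_range and the new compute_optimal_bins (input file and import path are placeholders; parameter names are taken from the signatures in this diff):

    import polars as pl

    from openms_insight.preprocessing.compression import (  # assumed import path
        compute_optimal_bins,
        downsample_2d_streaming,
        get_data_range,
    )

    lf = pl.scan_parquet("peaks.parquet")  # placeholder input

    # One cheap pass for the ranges, then an aspect-ratio-aware grid.
    x_range, y_range = get_data_range(lf, x_column="x", y_column="y")
    x_bins, y_bins = compute_optimal_bins(20_000, x_range, y_range)

    reduced = downsample_2d_streaming(
        lf,
        max_points=20_000,
        x_bins=x_bins,
        y_bins=y_bins,
        x_range=x_range,  # precomputed ranges select the fully lazy branch
        y_range=y_range,
    ).collect()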
@@ -325,14 +379,16 @@ def get_data_range(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    stats = data.select([
-        pl.col(x_column).min().alias('x_min'),
-        pl.col(x_column).max().alias('x_max'),
-        pl.col(y_column).min().alias('y_min'),
-        pl.col(y_column).max().alias('y_max'),
-    ]).collect()
+    stats = data.select(
+        [
+            pl.col(x_column).min().alias("x_min"),
+            pl.col(x_column).max().alias("x_max"),
+            pl.col(y_column).min().alias("y_min"),
+            pl.col(y_column).max().alias("y_max"),
+        ]
+    ).collect()
 
     return (
-        (stats['x_min'][0], stats['x_max'][0]),
-        (stats['y_min'][0], stats['y_max'][0]),
+        (stats["x_min"][0], stats["x_max"][0]),
+        (stats["y_min"][0], stats["y_max"][0]),
     )
@@ -1,11 +1,10 @@
 """Data filtering utilities for selection-based filtering."""
 
+import hashlib
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-import hashlib
 import pandas as pd
 import polars as pl
-import streamlit as st
 
 
 def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
@@ -35,10 +34,12 @@ def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
         # JS safe integer is 2^53, but Int32 range is simpler and sufficient for most data
         if dtype == pl.Int64:
             # Get min/max in a single pass
-            stats = df.select([
-                pl.col(col).min().alias('min'),
-                pl.col(col).max().alias('max'),
-            ]).row(0)
+            stats = df.select(
+                [
+                    pl.col(col).min().alias("min"),
+                    pl.col(col).max().alias("max"),
+                ]
+            ).row(0)
             col_min, col_max = stats
 
             if col_min is not None and col_max is not None:
@@ -150,14 +151,39 @@ def compute_dataframe_hash(df: pl.DataFrame) -> str:
     # Add sum of numeric columns for content verification
     for col in df.columns:
         dtype = df[col].dtype
-        if dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
-                     pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
-                     pl.Float32, pl.Float64):
+        if dtype in (
+            pl.Int8,
+            pl.Int16,
+            pl.Int32,
+            pl.Int64,
+            pl.UInt8,
+            pl.UInt16,
+            pl.UInt32,
+            pl.UInt64,
+            pl.Float32,
+            pl.Float64,
+        ):
             try:
                 col_sum = df[col].sum()
                 hash_parts.append(f"{col}:{col_sum}")
             except Exception:
                 pass
+        elif dtype == pl.Boolean:
+            # Count True values for boolean columns (important for annotations)
+            try:
+                true_count = df[col].sum()  # True=1, False=0
+                hash_parts.append(f"{col}_bool:{true_count}")
+            except Exception:
+                pass
+        elif dtype == pl.Utf8 and col.startswith("_dynamic"):
+            # Hash content of dynamic string columns (annotations)
+            try:
+                # Use hash of all non-empty values for annotation text
+                non_empty = df[col].filter(pl.col(col) != "").to_list()
+                if non_empty:
+                    hash_parts.append(f"{col}_str:{hash(tuple(non_empty))}")
+            except Exception:
+                pass
 
     hash_input = "|".join(hash_parts).encode()
     return hashlib.sha256(hash_input).hexdigest()
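compute_dataframe_hash now folds Boolean columns and the text of _dynamic* string columns into the digest, so toggling an annotation flag or editing annotation text yields a different hash. A small hedged sketch of the observable effect (column names invented for illustration, import path assumed):

    import polars as pl

    from openms_insight.preprocessing.filtering import compute_dataframe_hash  # assumed path

    df = pl.DataFrame(
        {
            "x": [1.0, 2.0, 3.0],
            "selected": [False, False, False],     # Boolean column now contributes its True-count
            "_dynamic_label": ["", "peak A", ""],  # _dynamic* text is hashed as well
        }
    )

    h1 = compute_dataframe_hash(df)
    h2 = compute_dataframe_hash(df.with_columns(pl.Series("selected", [True, False, False])))
    assert h1 != h2  # flipping a single annotation flag changes the hash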
@@ -352,10 +378,10 @@ def filter_by_range(
         data = data.lazy()
 
     return data.filter(
-        (pl.col(x_column) >= x_range[0]) &
-        (pl.col(x_column) <= x_range[1]) &
-        (pl.col(y_column) >= y_range[0]) &
-        (pl.col(y_column) <= y_range[1])
+        (pl.col(x_column) >= x_range[0])
+        & (pl.col(x_column) <= x_range[1])
+        & (pl.col(y_column) >= y_range[0])
+        & (pl.col(y_column) <= y_range[1])
     )
 
 
@@ -1,6 +1,6 @@
 """Rendering utilities for Python-to-Vue communication."""
 
-from .bridge import render_component, get_vue_component_function
+from .bridge import get_vue_component_function, render_component
 
 __all__ = [
     "render_component",