openms-insight 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
+ <!DOCTYPE html>
+ <html lang="en">
+   <head>
+     <meta charset="UTF-8">
+     <link rel="icon" href="/favicon.ico">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>Streamlit Vue Components</title>
+     <script type="module" crossorigin src="./assets/index.js"></script>
+     <link rel="stylesheet" crossorigin href="./assets/index.css">
+   </head>
+   <body>
+     <div id="app"></div>
+   </body>
+ </html>
@@ -0,0 +1,22 @@
+ """Preprocessing utilities for data transformation and filtering."""
+
+ from .filtering import (
+     filter_by_selection,
+     filter_by_index,
+     filter_and_collect_cached,
+ )
+
+ from .compression import (
+     compute_compression_levels,
+     downsample_2d,
+     downsample_2d_simple,
+ )
+
+ __all__ = [
+     "filter_by_selection",
+     "filter_by_index",
+     "filter_and_collect_cached",
+     "compute_compression_levels",
+     "downsample_2d",
+     "downsample_2d_simple",
+ ]
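
The __init__ above defines the package's public preprocessing API: everything in __all__ is re-exported so callers do not need to know the submodule layout. A minimal import sketch, assuming the distribution installs as an openms_insight package (the diff omits the wheel's internal file paths):

    # Assumed top-level package name; the diff does not show file paths.
    from openms_insight.preprocessing import (
        compute_compression_levels,
        downsample_2d_simple,
        filter_by_index,
    )

    # Smallest-first target sizes for a million-point dataset: [20000, 200000].
    print(compute_compression_levels(20_000, 1_000_000))
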
@@ -0,0 +1,338 @@
+ """Compression utilities for large 2D datasets (heatmaps).
+
+ This module provides functions for multi-resolution downsampling of 2D scatter
+ data, enabling efficient visualization of datasets with millions of points.
+
+ Supports both streaming (lazy) and eager downsampling approaches.
+ """
+
+ from typing import List, Optional, Union
+
+ import numpy as np
+ import polars as pl
+
+ try:
+     from scipy.stats import binned_statistic_2d
+     HAS_SCIPY = True
+ except ImportError:
+     HAS_SCIPY = False
+
+
+ def compute_compression_levels(min_size: int, total: int) -> List[int]:
+     """
+     Compute logarithmically-spaced compression level target sizes.
+
+     Given a minimum target size and total data size, computes intermediate
+     compression levels at powers of 10.
+
+     Args:
+         min_size: Minimum/smallest compression level size (e.g., 20000)
+         total: Total number of data points
+
+     Returns:
+         List of target sizes, smallest first. Always returns at least one level.
+         For small datasets (total <= min_size), returns [total] to preserve all data.
+
+     Examples:
+         >>> compute_compression_levels(20000, 1_000_000)
+         [20000, 200000]
+         >>> compute_compression_levels(20000, 50_000)
+         [20000]
+         >>> compute_compression_levels(20000, 15_000)
+         [15000]
+     """
+     if total <= min_size:
+         # Still return at least one level with all data
+         return [total]
+
+     # Compute powers of 10 between min and total
+     min_power = int(np.log10(min_size))
+     max_power = int(np.log10(total))
+
+     if min_power >= max_power:
+         # Data is between min_size and 10x min_size - one downsampled level
+         return [min_size]
+
+     # Generate levels at each power of 10, scaled by the fractional part
+     scale_factor = int(10 ** (np.log10(min_size) % 1))
+     levels = np.logspace(
+         min_power,
+         max_power,
+         max_power - min_power + 1,
+         dtype='int'
+     ) * scale_factor
+
+     # Filter out levels >= total (don't include full resolution for large datasets)
+     levels = levels[levels < total].tolist()
+
+     # Ensure at least one level exists
+     if not levels:
+         levels = [min_size]
+
+     return levels
+
+
+ def downsample_2d(
+     data: Union[pl.LazyFrame, pl.DataFrame],
+     max_points: int = 20000,
+     x_column: str = 'x',
+     y_column: str = 'y',
+     intensity_column: str = 'intensity',
+     x_bins: int = 400,
+     y_bins: int = 50,
+ ) -> pl.LazyFrame:
+     """
+     Downsample 2D scatter data while preserving high-intensity points.
+
+     Uses 2D binning to spatially partition data, then keeps the top N
+     highest-intensity points per bin. This preserves visually important
+     features (peaks) while reducing total point count.
+
+     Args:
+         data: Input data as Polars LazyFrame or DataFrame
+         max_points: Maximum number of points to keep
+         x_column: Name of x-axis column
+         y_column: Name of y-axis column
+         intensity_column: Name of intensity/value column for ranking
+         x_bins: Number of bins along x-axis
+         y_bins: Number of bins along y-axis
+
+     Returns:
+         Downsampled data as Polars LazyFrame
+
+     Raises:
+         ImportError: If scipy is not installed
+         ValueError: If x_bins * y_bins > max_points
+     """
+     if not HAS_SCIPY:
+         raise ImportError(
+             "scipy is required for downsample_2d. "
+             "Install with: pip install scipy"
+         )
+
+     if (x_bins * y_bins) > max_points:
+         raise ValueError(
+             f"Number of bins ({x_bins * y_bins}) exceeds max_points ({max_points}). "
+             "Reduce x_bins or y_bins."
+         )
+
+     # Ensure we're working with a LazyFrame
+     if isinstance(data, pl.DataFrame):
+         data = data.lazy()
+
+     # Rank points by intensity within each x value, then order by rank so the
+     # highest-intensity point of every x value comes first
+     sorted_data = (
+         data
+         .sort([x_column, intensity_column], descending=[False, True])
+         .with_columns([
+             pl.int_range(pl.len()).over(x_column).alias('_rank')
+         ])
+         .sort(['_rank', intensity_column], descending=[False, True])
+     )
+
+     # Collect for scipy binning (requires numpy arrays)
+     collected = sorted_data.collect()
+
+     total_count = len(collected)
+     if total_count <= max_points:
+         # No downsampling needed
+         return collected.drop('_rank').lazy()
+
+     # Extract arrays for scipy
+     x_array = collected[x_column].to_numpy()
+     y_array = collected[y_column].to_numpy()
+     intensity_array = collected[intensity_column].to_numpy()
+
+     # Compute 2D bins
+     count, _, _, mapping = binned_statistic_2d(
+         x_array, y_array, intensity_array, 'count',
+         bins=[x_bins, y_bins],
+         expand_binnumbers=True
+     )
+
+     # Add bin indices to dataframe
+     binned_data = (
+         collected.lazy()
+         .with_columns([
+             pl.Series('_x_bin', mapping[0] - 1),  # scipy uses 1-based indexing
+             pl.Series('_y_bin', mapping[1] - 1)
+         ])
+     )
+
+     # Compute max peaks per bin to stay under limit
+     counted_peaks = 0
+     max_peaks_per_bin = -1
+     new_count = 0
+
+     while (counted_peaks + new_count) < max_points:
+         max_peaks_per_bin += 1
+         counted_peaks += new_count
+         new_count = np.sum(count.flatten() >= (max_peaks_per_bin + 1))
+
+         if counted_peaks >= total_count:
+             break
+
+     # Keep top N peaks per bin
+     result = (
+         binned_data
+         .group_by(['_x_bin', '_y_bin'])
+         .head(max_peaks_per_bin)
+         .sort(intensity_column)
+         .drop(['_rank', '_x_bin', '_y_bin'])
+     )
+
+     return result
+
+
+ def downsample_2d_simple(
+     data: Union[pl.LazyFrame, pl.DataFrame],
+     max_points: int = 20000,
+     intensity_column: str = 'intensity',
+ ) -> pl.LazyFrame:
+     """
+     Simple downsampling by keeping the highest-intensity points.
+
+     A simpler alternative to downsample_2d that doesn't require scipy.
+     Less spatially aware but still preserves important peaks.
+
+     Args:
+         data: Input data as Polars LazyFrame or DataFrame
+         max_points: Maximum number of points to keep
+         intensity_column: Name of intensity column for ranking
+
+     Returns:
+         Downsampled data as Polars LazyFrame
+     """
+     if isinstance(data, pl.DataFrame):
+         data = data.lazy()
+
+     return (
+         data
+         .sort(intensity_column, descending=True)
+         .head(max_points)
+     )
+
+
+ def downsample_2d_streaming(
+     data: Union[pl.LazyFrame, pl.DataFrame],
+     max_points: int = 20000,
+     x_column: str = 'x',
+     y_column: str = 'y',
+     intensity_column: str = 'intensity',
+     x_bins: int = 400,
+     y_bins: int = 50,
+     x_range: Optional[tuple] = None,
+     y_range: Optional[tuple] = None,
+ ) -> pl.LazyFrame:
+     """
+     Streaming 2D downsampling using pure Polars operations.
+
+     Uses Polars' lazy evaluation to downsample data without full materialization.
+     Creates spatial bins by normalizing coordinates onto a fixed grid and keeps
+     the top-N highest-intensity points per bin. Stays fully lazy - no .collect()
+     is called.
+
+     Args:
+         data: Input data as Polars LazyFrame or DataFrame
+         max_points: Maximum number of points to keep
+         x_column: Name of x-axis column
+         y_column: Name of y-axis column
+         intensity_column: Name of intensity/value column for ranking
+         x_bins: Number of bins along x-axis
+         y_bins: Number of bins along y-axis
+         x_range: Optional (min, max) tuple for x-axis. If None, computed from data.
+         y_range: Optional (min, max) tuple for y-axis. If None, computed from data.
+
+     Returns:
+         Downsampled data as Polars LazyFrame (fully lazy, no collection)
+     """
+     if isinstance(data, pl.DataFrame):
+         data = data.lazy()
+
+     # Calculate points per bin
+     total_bins = x_bins * y_bins
+     points_per_bin = max(1, max_points // total_bins)
+
+     # Build binning expressions using provided or computed ranges
+     if x_range is not None and y_range is not None:
+         x_min, x_max = x_range
+         y_min, y_max = y_range
+
+         # Use provided ranges for bin calculation
+         x_bin_expr = (
+             ((pl.col(x_column) - x_min) / (x_max - x_min + 1e-10) * x_bins)
+             .cast(pl.Int32)
+             .clip(0, x_bins - 1)
+             .alias('_x_bin')
+         )
+         y_bin_expr = (
+             ((pl.col(y_column) - y_min) / (y_max - y_min + 1e-10) * y_bins)
+             .cast(pl.Int32)
+             .clip(0, y_bins - 1)
+             .alias('_y_bin')
+         )
+
+         result = (
+             data
+             .with_columns([x_bin_expr, y_bin_expr])
+             .sort(intensity_column, descending=True)
+             .group_by(['_x_bin', '_y_bin'])
+             .head(points_per_bin)
+             .drop(['_x_bin', '_y_bin'])
+         )
+     else:
+         # Ranges not provided: derive them inside the query with whole-frame
+         # min/max expressions, so the plan stays lazy with no separate pass
+         result = (
+             data
+             .with_columns([
+                 # Normalize each coordinate by its min/max and map onto bin indices
+                 (
+                     (pl.col(x_column) - pl.col(x_column).min()) /
+                     (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10) * x_bins
+                 ).cast(pl.Int32).clip(0, x_bins - 1).alias('_x_bin'),
+                 (
+                     (pl.col(y_column) - pl.col(y_column).min()) /
+                     (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10) * y_bins
+                 ).cast(pl.Int32).clip(0, y_bins - 1).alias('_y_bin'),
+             ])
+             .sort(intensity_column, descending=True)
+             .group_by(['_x_bin', '_y_bin'])
+             .head(points_per_bin)
+             .drop(['_x_bin', '_y_bin'])
+         )
+
+     return result
+
+
+ def get_data_range(
+     data: Union[pl.LazyFrame, pl.DataFrame],
+     x_column: str,
+     y_column: str,
+ ) -> tuple:
+     """
+     Get the min/max ranges for the x and y columns.
+
+     This requires a collect() operation but only fetches 4 scalar values.
+
+     Args:
+         data: Input data
+         x_column: X-axis column name
+         y_column: Y-axis column name
+
+     Returns:
+         Tuple of ((x_min, x_max), (y_min, y_max))
+     """
+     if isinstance(data, pl.DataFrame):
+         data = data.lazy()
+
+     stats = data.select([
+         pl.col(x_column).min().alias('x_min'),
+         pl.col(x_column).max().alias('x_max'),
+         pl.col(y_column).min().alias('y_min'),
+         pl.col(y_column).max().alias('y_max'),
+     ]).collect()
+
+     return (
+         (stats['x_min'][0], stats['x_max'][0]),
+         (stats['y_min'][0], stats['y_max'][0]),
+     )
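
The compression module is meant to be used as a small pipeline: compute_compression_levels picks the per-level target sizes, and one of the downsample_2d* functions produces each level. A minimal sketch of that flow on synthetic data; the import path openms_insight.preprocessing is an assumption based on the package name (the diff does not show file paths), and the scipy-free downsample_2d_simple variant is used so the example runs without scipy:

    import numpy as np
    import polars as pl

    # Assumed import path; see note above.
    from openms_insight.preprocessing import (
        compute_compression_levels,
        downsample_2d_simple,
    )

    # Synthetic heatmap-like scatter data: x, y and an intensity per point.
    rng = np.random.default_rng(0)
    n = 500_000
    points = pl.DataFrame({
        "x": rng.uniform(0.0, 100.0, n),
        "y": rng.uniform(0.0, 50.0, n),
        "intensity": rng.exponential(1.0, n),
    })

    # Target sizes, smallest first: [20000, 200000] for 500k points.
    levels = compute_compression_levels(min_size=20_000, total=len(points))

    # One pre-downsampled frame per level; each call stays lazy until collect().
    pyramid = {size: downsample_2d_simple(points, max_points=size).collect()
               for size in levels}

    for size, frame in pyramid.items():
        print(size, frame.height)

A frontend can then serve the level whose size best fits the current zoom, and switch to downsample_2d or downsample_2d_streaming when spatial binning matters more than raw speed.
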
@@ -0,0 +1,316 @@
+ """Data filtering utilities for selection-based filtering."""
+
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import hashlib
+ import pandas as pd
+ import polars as pl
+ import streamlit as st
+
+
+ def _make_cache_key(
+     filters: Dict[str, str],
+     state: Dict[str, Any],
+     filter_defaults: Optional[Dict[str, Any]] = None,
+ ) -> Tuple[Tuple[str, Any], ...]:
+     """
+     Create a hashable cache key from filters and state.
+
+     Only includes state values for identifiers that are in filters,
+     so the cache is invalidated only when relevant selections change.
+
+     Args:
+         filters: Mapping of identifier names to column names
+         state: Current selection state
+         filter_defaults: Optional default values for filters when state is None
+
+     Returns:
+         Tuple of (identifier, value) pairs for use as a cache key
+     """
+     relevant_state = []
+     for identifier in sorted(filters.keys()):
+         value = state.get(identifier)
+         # Apply default if value is None and a default exists
+         if value is None and filter_defaults and identifier in filter_defaults:
+             value = filter_defaults[identifier]
+         relevant_state.append((identifier, value))
+     return tuple(relevant_state)
+
+
+ def compute_dataframe_hash(df: pl.DataFrame) -> str:
+     """
+     Compute an efficient hash for a DataFrame without pickling.
+
+     Uses shape, column names, and sampled values to create a fast hash
+     that detects data changes without materializing extra copies.
+
+     Args:
+         df: Polars DataFrame to hash
+
+     Returns:
+         SHA256 hash string
+     """
+     # Build hash from metadata and sampled content
+     hash_parts = [
+         str(df.shape),    # (rows, cols)
+         str(df.columns),  # Column names
+     ]
+
+     # Sample the first and last row for change detection; this stays O(1)
+     # in extra memory even for large DataFrames
+     if len(df) > 0:
+         # First and last row as dicts
+         first_row = df.head(1).to_dicts()[0]
+         last_row = df.tail(1).to_dicts()[0]
+         hash_parts.append(str(first_row))
+         hash_parts.append(str(last_row))
+
+     # Add the sum of each numeric column for content verification
+     for col in df.columns:
+         dtype = df[col].dtype
+         if dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
+                      pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
+                      pl.Float32, pl.Float64):
+             try:
+                 col_sum = df[col].sum()
+                 hash_parts.append(f"{col}:{col_sum}")
+             except Exception:
+                 pass
+
+     hash_input = "|".join(hash_parts).encode()
+     return hashlib.sha256(hash_input).hexdigest()
+
+
+ @st.cache_data(ttl=300, max_entries=100)
+ def _cached_filter_and_collect(
+     _data: pl.LazyFrame,
+     filters_tuple: Tuple[Tuple[str, str], ...],
+     state_tuple: Tuple[Tuple[str, Any], ...],
+     columns_tuple: Optional[Tuple[str, ...]] = None,
+     filter_defaults_tuple: Optional[Tuple[Tuple[str, Any], ...]] = None,
+ ) -> Tuple[pd.DataFrame, str]:
+     """
+     Filter data and collect, with caching.
+
+     This function is cached by Streamlit, so repeated calls with the same
+     filter state will return cached results without re-executing the query.
+
+     Returns a pandas DataFrame for efficient Arrow serialization to the frontend.
+
+     Args:
+         _data: LazyFrame to filter (the underscore prefix tells st.cache_data not to hash it)
+         filters_tuple: Tuple of (identifier, column) pairs from the filters dict
+         state_tuple: Tuple of (identifier, value) pairs for the current selection state
+             (defaults already applied by _make_cache_key)
+         columns_tuple: Optional tuple of column names to select (projection)
+         filter_defaults_tuple: Optional tuple of (identifier, default_value) pairs
+             (included for cache key differentiation)
+
+     Returns:
+         Tuple of (pandas DataFrame, hash string)
+     """
+     data = _data
+     filters = dict(filters_tuple)
+     state = dict(state_tuple)  # Already has defaults applied
+
+     # Apply column projection FIRST (before filters) for efficiency
+     # This ensures we only read the needed columns from disk
+     if columns_tuple:
+         data = data.select(list(columns_tuple))
+
+     # Apply filters
+     # If ANY filter has no selection (and no default), return an empty DataFrame
+     # This prevents loading millions of rows when no spectrum is selected
+     for identifier, column in filters.items():
+         selected_value = state.get(identifier)
+         if selected_value is None:
+             # No selection for this filter - return an empty DataFrame
+             # head(0) collects the schema without any data
+             df_polars = data.head(0).collect()
+             data_hash = compute_dataframe_hash(df_polars)
+             df_pandas = df_polars.to_pandas()
+             return (df_pandas, data_hash)
+
+         # Convert float to int for integer columns to handle JSON number parsing
+         # (JavaScript numbers come back as floats, but Polars Int64 needs int comparison)
+         if isinstance(selected_value, float) and selected_value.is_integer():
+             selected_value = int(selected_value)
+         data = data.filter(pl.col(column) == selected_value)
+
+     # Collect to a Polars DataFrame
+     df_polars = data.collect()
+
+     # Compute the hash efficiently (no pickle)
+     data_hash = compute_dataframe_hash(df_polars)
+
+     # Convert to pandas for Arrow serialization (zero-copy when possible)
+     df_pandas = df_polars.to_pandas()
+
+     return (df_pandas, data_hash)
+
+
+ def filter_and_collect_cached(
+     data: Union[pl.LazyFrame, pl.DataFrame],
+     filters: Dict[str, str],
+     state: Dict[str, Any],
+     columns: Optional[List[str]] = None,
+     filter_defaults: Optional[Dict[str, Any]] = None,
+ ) -> Tuple[pd.DataFrame, str]:
+     """
+     Filter data based on selection state and collect, with caching.
+
+     This is the recommended function for components that need filtered data.
+     Results are cached based on filter state, so interactions that don't
+     change the filter values (e.g., clicking within already-filtered data)
+     will return cached results instantly.
+
+     Returns a pandas DataFrame for efficient Arrow serialization to the frontend.
+     The hash is computed efficiently without pickling the data.
+
+     Args:
+         data: The data to filter (LazyFrame or DataFrame)
+         filters: Mapping of identifier names to column names for filtering
+         state: Current selection state with identifier values
+         columns: Optional list of column names to select (projection pushdown)
+         filter_defaults: Optional default values for filters when state is None.
+             When a filter's state value is None, the default is used instead.
+             Example: {"identification": -1} means None → -1 for the identification filter.
+
+     Returns:
+         Tuple of (pandas DataFrame, hash string) with filters and projection applied
+     """
+     if isinstance(data, pl.DataFrame):
+         data = data.lazy()
+
+     # Convert to tuples for caching (dicts aren't hashable)
+     filters_tuple = tuple(sorted(filters.items()))
+     # Pass filter_defaults to _make_cache_key so defaults are applied to the state
+     state_tuple = _make_cache_key(filters, state, filter_defaults)
+     columns_tuple = tuple(columns) if columns else None
+     filter_defaults_tuple = tuple(sorted(filter_defaults.items())) if filter_defaults else None
+
+     return _cached_filter_and_collect(
+         data,
+         filters_tuple,
+         state_tuple,
+         columns_tuple,
+         filter_defaults_tuple,
+     )
+
+
+ def filter_by_selection(
+     data: Union[pl.LazyFrame, pl.DataFrame],
+     interactivity: Dict[str, str],
+     state: Dict[str, Any],
+ ) -> pl.LazyFrame:
+     """
+     Filter data based on selection state and an interactivity mapping.
+
+     For each identifier in the interactivity mapping, if there is a
+     corresponding selection in state, filter the data to rows where
+     the mapped column equals the selected value.
+
+     Args:
+         data: The data to filter (LazyFrame or DataFrame)
+         interactivity: Mapping of identifier names to column names
+         state: Current selection state with identifier values
+
+     Returns:
+         Filtered LazyFrame
+     """
+     if isinstance(data, pl.DataFrame):
+         data = data.lazy()
+
+     for identifier, column in interactivity.items():
+         selected_value = state.get(identifier)
+         if selected_value is not None:
+             data = data.filter(pl.col(column) == selected_value)
+
+     return data
+
+
+ def filter_by_index(
+     data: Union[pl.LazyFrame, pl.DataFrame],
+     index_column: str,
+     index_value: Any,
+ ) -> pl.LazyFrame:
+     """
+     Filter data to a single row by index value.
+
+     Args:
+         data: The data to filter
+         index_column: Name of the index column
+         index_value: The index value to filter to
+
+     Returns:
+         Filtered LazyFrame (typically 1 row)
+     """
+     if isinstance(data, pl.DataFrame):
+         data = data.lazy()
+
+     return data.filter(pl.col(index_column) == index_value)
+
+
+ def filter_by_range(
+     data: Union[pl.LazyFrame, pl.DataFrame],
+     x_column: str,
+     y_column: str,
+     x_range: tuple,
+     y_range: tuple,
+ ) -> pl.LazyFrame:
+     """
+     Filter data within x/y range bounds.
+
+     Args:
+         data: The data to filter
+         x_column: Name of the x-axis column
+         y_column: Name of the y-axis column
+         x_range: Tuple of (min, max) for the x-axis
+         y_range: Tuple of (min, max) for the y-axis
+
+     Returns:
+         Filtered LazyFrame
+     """
+     if isinstance(data, pl.DataFrame):
+         data = data.lazy()
+
+     return data.filter(
+         (pl.col(x_column) >= x_range[0]) &
+         (pl.col(x_column) <= x_range[1]) &
+         (pl.col(y_column) >= y_range[0]) &
+         (pl.col(y_column) <= y_range[1])
+     )
+
+
+ def slice_by_row_index(
+     data: Union[pl.LazyFrame, pl.DataFrame],
+     row_index: Optional[int],
+ ) -> pl.DataFrame:
+     """
+     Slice data to a single row by row position.
+
+     Args:
+         data: The data to slice
+         row_index: The row index (position) to extract
+
+     Returns:
+         DataFrame with a single row, or an empty DataFrame if the index is None
+     """
+     if row_index is None:
+         # Return an empty DataFrame with the same schema
+         if isinstance(data, pl.LazyFrame):
+             return data.head(0).collect()
+         return data.head(0)
+
+     # For LazyFrames, add the slice to the query plan before collecting
+     # so Polars can optimize and avoid materializing all rows
+     if isinstance(data, pl.LazyFrame):
+         # Note: we can't check bounds without collecting, so we slice optimistically
+         # and return an empty frame if the result is empty
+         return data.slice(row_index, 1).collect()
+
+     # For DataFrames, check bounds first
+     if row_index < 0 or row_index >= len(data):
+         return data.head(0)
+
+     return data.slice(row_index, 1)
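
filter_and_collect_cached is the piece a Streamlit component would call on every rerun: the selection-state dict is reduced to a hashable cache key, defaults are applied for missing selections, and the collected result comes back as a pandas frame plus a cheap content hash. A hedged sketch of a caller follows; the Parquet path, column names, and session-state key are illustrative and not taken from the package:

    import polars as pl
    import streamlit as st

    # Assumed import path based on the package name; not shown in the diff.
    from openms_insight.preprocessing import filter_and_collect_cached

    # Hypothetical per-spectrum peak table, scanned lazily from Parquet.
    peaks = pl.scan_parquet("peaks.parquet")

    # The "spectrum" identifier filters on the spectrum_id column.
    filters = {"spectrum": "spectrum_id"}
    state = {"spectrum": st.session_state.get("selected_spectrum")}

    df, data_hash = filter_and_collect_cached(
        peaks,
        filters=filters,
        state=state,
        columns=["spectrum_id", "mz", "intensity"],  # projection pushdown
        filter_defaults={"spectrum": 0},             # used while nothing is selected
    )

    st.caption(f"{len(df)} rows, content hash {data_hash[:8]}")
    st.dataframe(df)

Note that the LazyFrame argument itself is excluded from the cache key (underscore prefix), so only the filter and state tuples drive cache hits; the 300-second ttl on the cached helper is what eventually refreshes entries if the underlying dataset is swapped out.
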