openms-insight 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- openms_insight/__init__.py +11 -7
- openms_insight/components/__init__.py +2 -2
- openms_insight/components/heatmap.py +192 -102
- openms_insight/components/lineplot.py +377 -82
- openms_insight/components/sequenceview.py +677 -213
- openms_insight/components/table.py +86 -58
- openms_insight/core/__init__.py +2 -2
- openms_insight/core/base.py +113 -49
- openms_insight/core/registry.py +6 -5
- openms_insight/core/state.py +33 -31
- openms_insight/core/subprocess_preprocess.py +1 -3
- openms_insight/js-component/dist/assets/index.css +1 -1
- openms_insight/js-component/dist/assets/index.js +113 -113
- openms_insight/preprocessing/__init__.py +5 -6
- openms_insight/preprocessing/compression.py +68 -66
- openms_insight/preprocessing/filtering.py +119 -9
- openms_insight/rendering/__init__.py +1 -1
- openms_insight/rendering/bridge.py +192 -42
- {openms_insight-0.1.1.dist-info → openms_insight-0.1.3.dist-info}/METADATA +163 -20
- openms_insight-0.1.3.dist-info/RECORD +28 -0
- openms_insight-0.1.1.dist-info/RECORD +0 -28
- {openms_insight-0.1.1.dist-info → openms_insight-0.1.3.dist-info}/WHEEL +0 -0
- {openms_insight-0.1.1.dist-info → openms_insight-0.1.3.dist-info}/licenses/LICENSE +0 -0
openms_insight/preprocessing/__init__.py:

```diff
@@ -1,16 +1,15 @@
 """Preprocessing utilities for data transformation and filtering."""
 
-from .filtering import (
-    filter_by_selection,
-    filter_by_index,
-    filter_and_collect_cached,
-)
-
 from .compression import (
     compute_compression_levels,
     downsample_2d,
     downsample_2d_simple,
 )
+from .filtering import (
+    filter_and_collect_cached,
+    filter_by_index,
+    filter_by_selection,
+)
 
 __all__ = [
     "filter_by_selection",
```
openms_insight/preprocessing/compression.py:

```diff
@@ -13,6 +13,7 @@ import polars as pl
 
 try:
     from scipy.stats import binned_statistic_2d
+
     HAS_SCIPY = True
 except ImportError:
     HAS_SCIPY = False
```
```diff
@@ -55,12 +56,10 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 
     # Generate levels at each power of 10, scaled by the fractional part
     scale_factor = int(10 ** (np.log10(min_size) % 1))
-    levels = np.logspace(
-        min_power,
-        max_power,
-        max_power - min_power + 1,
-        dtype='int'
-    ) * scale_factor
+    levels = (
+        np.logspace(min_power, max_power, max_power - min_power + 1, dtype="int")
+        * scale_factor
+    )
 
     # Filter out levels >= total (don't include full resolution for large datasets)
     levels = levels[levels < total].tolist()
```
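For intuition, here is a minimal sketch of what the reformatted expression computes. The values of `min_power`, `max_power`, and `scale_factor` below are illustrative assumptions; in the real function they are derived from `min_size` and `total`:

```python
import numpy as np

# Assumed inputs for illustration; compute_compression_levels derives these.
min_power, max_power, scale_factor = 3, 6, 5
total = 1_000_000

# One candidate level per power of 10, scaled by the fractional log10 part
levels = (
    np.logspace(min_power, max_power, max_power - min_power + 1, dtype="int")
    * scale_factor
)
print(levels.tolist())                  # [5000, 50000, 500000, 5000000]
print(levels[levels < total].tolist())  # [5000, 50000, 500000]
```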
```diff
@@ -75,9 +74,9 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 def downsample_2d(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
 ) -> pl.LazyFrame:
```
```diff
@@ -106,8 +105,7 @@ def downsample_2d(
     """
     if not HAS_SCIPY:
         raise ImportError(
-            "scipy is required for downsample_2d. "
-            "Install with: pip install scipy"
+            "scipy is required for downsample_2d. Install with: pip install scipy"
         )
 
     if (x_bins * y_bins) > max_points:
```
```diff
@@ -122,12 +120,9 @@ def downsample_2d(
 
     # Sort by intensity (descending) to prioritize high-intensity points
     sorted_data = (
-        data
-        .sort([x_column, intensity_column], descending=[False, True])
-        .with_columns([
-            pl.int_range(pl.len()).over(x_column).alias('_rank')
-        ])
-        .sort(['_rank', intensity_column], descending=[False, True])
+        data.sort([x_column, intensity_column], descending=[False, True])
+        .with_columns([pl.int_range(pl.len()).over(x_column).alias("_rank")])
+        .sort(["_rank", intensity_column], descending=[False, True])
     )
 
     # Collect for scipy binning (requires numpy arrays)
```
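The rank-then-sort idiom here deserves a note: `pl.int_range(pl.len()).over(x_column)` assigns 0-based positions within each x group, so the second sort interleaves the highest-intensity row of every group before any group's second row. A toy sketch with assumed column names:

```python
import polars as pl

df = pl.DataFrame({"x": [1, 1, 2, 2], "intensity": [10.0, 5.0, 8.0, 9.0]})

out = (
    df.sort(["x", "intensity"], descending=[False, True])
    # 0, 1, 2, ... within each x group, in the sorted order
    .with_columns(pl.int_range(pl.len()).over("x").alias("_rank"))
    # all rank-0 rows come first, so every x slice survives a later head()
    .sort(["_rank", "intensity"], descending=[False, True])
)
print(out["x"].to_list())  # [1, 2, 2, 1] - the top row of each group leads
```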
```diff
@@ -136,7 +131,7 @@ def downsample_2d(
     total_count = len(collected)
     if total_count <= max_points:
         # No downsampling needed
-        return collected.drop('_rank').lazy()
+        return collected.drop("_rank").lazy()
 
     # Extract arrays for scipy
     x_array = collected[x_column].to_numpy()
```
```diff
@@ -145,18 +140,20 @@ def downsample_2d(
 
     # Compute 2D bins
     count, _, _, mapping = binned_statistic_2d(
-        x_array, y_array, intensity_array, 'count',
+        x_array,
+        y_array,
+        intensity_array,
+        "count",
         bins=[x_bins, y_bins],
-        expand_binnumbers=True
+        expand_binnumbers=True,
     )
 
     # Add bin indices to dataframe
-    binned_data = (
-        collected.lazy()
-        .with_columns([
-            pl.Series('_x_bin', mapping[0] - 1),  # scipy uses 1-based indexing
-            pl.Series('_y_bin', mapping[1] - 1)
-        ])
+    binned_data = collected.lazy().with_columns(
+        [
+            pl.Series("_x_bin", mapping[0] - 1),  # scipy uses 1-based indexing
+            pl.Series("_y_bin", mapping[1] - 1),
+        ]
     )
 
     # Compute max peaks per bin to stay under limit
```
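The `- 1` applied to the mapping rows exists because scipy reports bin membership 1-based when `expand_binnumbers=True`. A quick check (toy data, not from the package):

```python
import numpy as np
from scipy.stats import binned_statistic_2d

x = np.array([0.1, 0.9])
y = np.array([0.1, 0.9])
values = np.array([1.0, 2.0])

count, _, _, mapping = binned_statistic_2d(
    x, y, values, "count", bins=[2, 2], expand_binnumbers=True
)
# mapping has shape (2, n_points) with 1-based bin indices per dimension
print(mapping)      # [[1 2]
                    #  [1 2]]
print(mapping - 1)  # 0-based, ready to use as _x_bin/_y_bin
```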
```diff
@@ -174,11 +171,10 @@ def downsample_2d(
 
     # Keep top N peaks per bin
     result = (
-        binned_data
-        .group_by(['_x_bin', '_y_bin'])
+        binned_data.group_by(["_x_bin", "_y_bin"])
         .head(max_peaks_per_bin)
         .sort(intensity_column)
-        .drop(['_rank', '_x_bin', '_y_bin'])
+        .drop(["_rank", "_x_bin", "_y_bin"])
     )
 
     return result
```
```diff
@@ -187,7 +183,7 @@ def downsample_2d(
 def downsample_2d_simple(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    intensity_column: str = 'intensity',
+    intensity_column: str = "intensity",
 ) -> pl.LazyFrame:
     """
     Simple downsampling by keeping highest-intensity points.
```
```diff
@@ -206,19 +202,15 @@ def downsample_2d_simple(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    return (
-        data
-        .sort(intensity_column, descending=True)
-        .head(max_points)
-    )
+    return data.sort(intensity_column, descending=True).head(max_points)
 
 
 def downsample_2d_streaming(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
     x_range: Optional[tuple] = None,
```
```diff
@@ -262,43 +254,51 @@ def downsample_2d_streaming(
             ((pl.col(x_column) - x_min) / (x_max - x_min + 1e-10) * x_bins)
             .cast(pl.Int32)
             .clip(0, x_bins - 1)
-            .alias('_x_bin')
+            .alias("_x_bin")
         )
         y_bin_expr = (
             ((pl.col(y_column) - y_min) / (y_max - y_min + 1e-10) * y_bins)
             .cast(pl.Int32)
             .clip(0, y_bins - 1)
-            .alias('_y_bin')
+            .alias("_y_bin")
         )
 
         result = (
-            data
-            .with_columns([x_bin_expr, y_bin_expr])
+            data.with_columns([x_bin_expr, y_bin_expr])
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
     else:
         # Need to compute ranges - still lazy using over() window
         # First pass: add normalized bin columns using min/max over entire frame
         result = (
-            data
-            .with_columns([
-                # Compute bin indices using window functions for min/max
-                ((pl.col(x_column) - pl.col(x_column).min()) /
-                 (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10) * x_bins)
-                .cast(pl.Int32).clip(0, x_bins - 1)
-                .alias('_x_bin'),
-                ((pl.col(y_column) - pl.col(y_column).min()) /
-                 (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10) * y_bins)
-                .cast(pl.Int32).clip(0, y_bins - 1)
-                .alias('_y_bin')
-            ])
+            data.with_columns(
+                [
+                    # Compute bin indices using window functions for min/max
+                    (
+                        (pl.col(x_column) - pl.col(x_column).min())
+                        / (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10)
+                        * x_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, x_bins - 1)
+                    .alias("_x_bin"),
+                    (
+                        (pl.col(y_column) - pl.col(y_column).min())
+                        / (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10)
+                        * y_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, y_bins - 1)
+                    .alias("_y_bin"),
+                ]
+            )
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
 
     return result
```
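Because both branches express the binning as plain Polars expressions, the whole pipeline stays lazy and streamable. A self-contained sketch of the known-range branch, with assumed column names, ranges, and bin counts:

```python
import polars as pl

lf = pl.LazyFrame({
    "x": [0.0, 0.4, 0.9, 1.0],
    "y": [0.0, 0.5, 0.5, 1.0],
    "intensity": [1.0, 4.0, 2.0, 3.0],
})
x_bins, y_bins = 2, 2
x_min, x_max, y_min, y_max = 0.0, 1.0, 0.0, 1.0

# Normalize each coordinate into [0, n_bins), truncate to Int32, clip to range
x_bin = (
    ((pl.col("x") - x_min) / (x_max - x_min + 1e-10) * x_bins)
    .cast(pl.Int32)
    .clip(0, x_bins - 1)
    .alias("_x_bin")
)
y_bin = (
    ((pl.col("y") - y_min) / (y_max - y_min + 1e-10) * y_bins)
    .cast(pl.Int32)
    .clip(0, y_bins - 1)
    .alias("_y_bin")
)
result = (
    lf.with_columns([x_bin, y_bin])
    .sort("intensity", descending=True)
    .group_by(["_x_bin", "_y_bin"])
    .head(1)  # keep the single strongest point per cell
    .drop(["_x_bin", "_y_bin"])
)
print(result.collect())
```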
```diff
@@ -325,14 +325,16 @@ def get_data_range(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    stats = data.select([
-        pl.col(x_column).min().alias('x_min'),
-        pl.col(x_column).max().alias('x_max'),
-        pl.col(y_column).min().alias('y_min'),
-        pl.col(y_column).max().alias('y_max')
-    ]).collect()
+    stats = data.select(
+        [
+            pl.col(x_column).min().alias("x_min"),
+            pl.col(x_column).max().alias("x_max"),
+            pl.col(y_column).min().alias("y_min"),
+            pl.col(y_column).max().alias("y_max"),
+        ]
+    ).collect()
 
     return (
-        (stats['x_min'][0], stats['x_max'][0]),
-        (stats['y_min'][0], stats['y_max'][0])
+        (stats["x_min"][0], stats["x_max"][0]),
+        (stats["y_min"][0], stats["y_max"][0]),
     )
```
openms_insight/preprocessing/filtering.py:

```diff
@@ -1,11 +1,94 @@
 """Data filtering utilities for selection-based filtering."""
 
+import hashlib
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-import hashlib
 import pandas as pd
 import polars as pl
-
+
+
+def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
+    """
+    Optimize DataFrame types for efficient Arrow transfer to frontend.
+
+    This function downcasts numeric types to reduce Arrow payload size and
+    avoid BigInt overhead in JavaScript:
+    - Int64 → Int32 (if values fit): Avoids BigInt conversion in JS
+    - Float64 → Float32: Sufficient precision for visualization
+
+    Args:
+        df: Polars DataFrame to optimize
+
+    Returns:
+        DataFrame with optimized types
+    """
+    if len(df) == 0:
+        return df
+
+    casts = []
+
+    for col in df.columns:
+        dtype = df[col].dtype
+
+        # Downcast Int64 to Int32 to avoid BigInt in JavaScript
+        # JS safe integer is 2^53, but Int32 range is simpler and sufficient for most data
+        if dtype == pl.Int64:
+            # Get min/max in a single pass
+            stats = df.select(
+                [
+                    pl.col(col).min().alias("min"),
+                    pl.col(col).max().alias("max"),
+                ]
+            ).row(0)
+            col_min, col_max = stats
+
+            if col_min is not None and col_max is not None:
+                # Int32 range: -2,147,483,648 to 2,147,483,647
+                if col_min >= -2147483648 and col_max <= 2147483647:
+                    casts.append(pl.col(col).cast(pl.Int32))
+
+        # Downcast Float64 to Float32 (sufficient for display)
+        # Float32 has ~7 significant digits - enough for visualization
+        elif dtype == pl.Float64:
+            casts.append(pl.col(col).cast(pl.Float32))
+
+    if casts:
+        df = df.with_columns(casts)
+
+    return df
+
+
+def optimize_for_transfer_lazy(lf: pl.LazyFrame) -> pl.LazyFrame:
+    """
+    Optimize LazyFrame types for efficient Arrow transfer (streaming-safe).
+
+    Unlike optimize_for_transfer(), this only applies optimizations that don't
+    require knowing the data values, preserving the ability to stream via sink_parquet().
+
+    Currently applies:
+    - Float64 → Float32: Always safe, no bounds check needed
+
+    Int64 → Int32 is NOT applied here because it requires bounds checking.
+    Use optimize_for_transfer() on collected DataFrames for full optimization.
+
+    Args:
+        lf: Polars LazyFrame to optimize
+
+    Returns:
+        LazyFrame with Float64 columns cast to Float32
+    """
+    schema = lf.collect_schema()
+    casts = []
+
+    for col, dtype in zip(schema.names(), schema.dtypes()):
+        # Only Float64 → Float32 is safe without bounds checking
+        if dtype == pl.Float64:
+            casts.append(pl.col(col).cast(pl.Float32))
+
+    if casts:
+        lf = lf.with_columns(casts)
+
+    return lf
 
 
 def _make_cache_key(
```
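A hypothetical usage sketch of the new helper; the import path follows this file's location in the diff summary, though whether it is re-exported from the package `__init__` is not shown here:

```python
import polars as pl

from openms_insight.preprocessing.filtering import optimize_for_transfer

df = pl.DataFrame({"scan": [1, 2, 3], "mz": [100.5, 200.25, 300.125]})
print(df.schema)    # scan: Int64, mz: Float64 (Polars defaults)

slim = optimize_for_transfer(df)
print(slim.schema)  # scan: Int32, mz: Float32 - smaller Arrow payload,
                    # and no BigInt values on the JavaScript side
```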
```diff
@@ -68,14 +151,39 @@ def compute_dataframe_hash(df: pl.DataFrame) -> str:
     # Add sum of numeric columns for content verification
     for col in df.columns:
         dtype = df[col].dtype
-        if dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
-                     pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
-                     pl.Float32, pl.Float64):
+        if dtype in (
+            pl.Int8,
+            pl.Int16,
+            pl.Int32,
+            pl.Int64,
+            pl.UInt8,
+            pl.UInt16,
+            pl.UInt32,
+            pl.UInt64,
+            pl.Float32,
+            pl.Float64,
+        ):
             try:
                 col_sum = df[col].sum()
                 hash_parts.append(f"{col}:{col_sum}")
             except Exception:
                 pass
+        elif dtype == pl.Boolean:
+            # Count True values for boolean columns (important for annotations)
+            try:
+                true_count = df[col].sum()  # True=1, False=0
+                hash_parts.append(f"{col}_bool:{true_count}")
+            except Exception:
+                pass
+        elif dtype == pl.Utf8 and col.startswith("_dynamic"):
+            # Hash content of dynamic string columns (annotations)
+            try:
+                # Use hash of all non-empty values for annotation text
+                non_empty = df[col].filter(pl.col(col) != "").to_list()
+                if non_empty:
+                    hash_parts.append(f"{col}_str:{hash(tuple(non_empty))}")
+            except Exception:
+                pass
 
     hash_input = "|".join(hash_parts).encode()
     return hashlib.sha256(hash_input).hexdigest()
```
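The new Boolean branch relies on Polars counting `True` values when summing a Boolean column, so flipping a single annotation flag changes the resulting hash:

```python
import polars as pl

flags = pl.Series("annotated", [True, False, True])
print(flags.sum())  # 2 - True counts as 1, so the hash captures the toggle count
```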
```diff
@@ -133,6 +241,8 @@ def _filter_and_collect(
         data = data.filter(pl.col(column) == selected_value)
 
     # Collect to Polars DataFrame
+    # Note: Type optimization (Int64→Int32, Float64→Float32) is applied at cache
+    # creation time in base.py._save_to_cache(), so data is already optimized
     df_polars = data.collect()
 
     # Compute hash efficiently (no pickle)
```
```diff
@@ -268,10 +378,10 @@ def filter_by_range(
         data = data.lazy()
 
     return data.filter(
-        (pl.col(x_column) >= x_range[0]) &
-        (pl.col(x_column) <= x_range[1]) &
-        (pl.col(y_column) >= y_range[0]) &
-        (pl.col(y_column) <= y_range[1])
+        (pl.col(x_column) >= x_range[0])
+        & (pl.col(x_column) <= x_range[1])
+        & (pl.col(y_column) >= y_range[0])
+        & (pl.col(y_column) <= y_range[1])
     )
 
 
```
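One detail worth flagging in this hunk: the predicates must be combined with the `&` operator (which the reformat merely moved to the line starts), because Python's `and` would try to coerce a Polars expression to a plain bool. A minimal sketch with assumed column names and ranges:

```python
import polars as pl

lf = pl.LazyFrame({"x": [1, 5, 9], "y": [2, 6, 10]})
x_range, y_range = (2, 8), (3, 9)

filtered = lf.filter(
    (pl.col("x") >= x_range[0])
    & (pl.col("x") <= x_range[1])
    & (pl.col("y") >= y_range[0])
    & (pl.col("y") <= y_range[1])
)
print(filtered.collect())  # only the row x=5, y=6 survives
```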