openms-insight 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,15 @@
 """Preprocessing utilities for data transformation and filtering."""
 
-from .filtering import (
-    filter_by_selection,
-    filter_by_index,
-    filter_and_collect_cached,
-)
-
 from .compression import (
     compute_compression_levels,
     downsample_2d,
     downsample_2d_simple,
 )
+from .filtering import (
+    filter_and_collect_cached,
+    filter_by_index,
+    filter_by_selection,
+)
 
 __all__ = [
     "filter_by_selection",
@@ -6,18 +6,73 @@ data, enabling efficient visualization of datasets with millions of points.
 Supports both streaming (lazy) and eager downsampling approaches.
 """
 
-from typing import List, Optional, Union
+import math
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import polars as pl
 
 try:
     from scipy.stats import binned_statistic_2d
+
     HAS_SCIPY = True
 except ImportError:
     HAS_SCIPY = False
 
 
+def compute_optimal_bins(
+    target_points: int,
+    x_range: Tuple[float, float],
+    y_range: Tuple[float, float],
+) -> Tuple[int, int]:
+    """
+    Compute optimal x_bins, y_bins for even spatial distribution.
+
+    The bin grid matches the data's aspect ratio so bins are approximately
+    square in data space. Total bins ≈ target_points for 1 point per bin.
+
+    Solves the system:
+        x_bins × y_bins = target_points
+        x_bins / y_bins = aspect_ratio
+
+    Solution:
+        y_bins = sqrt(target_points / aspect_ratio)
+        x_bins = sqrt(target_points × aspect_ratio)
+
+    Args:
+        target_points: Target number of bins (and thus max points with 1 per bin)
+        x_range: (x_min, x_max) data range
+        y_range: (y_min, y_max) data range
+
+    Returns:
+        (x_bins, y_bins) tuple
+
+    Examples:
+        >>> compute_optimal_bins(10000, (0, 1000), (0, 100))  # 10:1 aspect
+        (316, 31)
+        >>> compute_optimal_bins(10000, (0, 100), (0, 100))  # 1:1 aspect
+        (100, 100)
+    """
+    x_span = x_range[1] - x_range[0]
+    y_span = y_range[1] - y_range[0]
+
+    # Handle edge cases
+    if y_span < 1e-10:
+        y_span = x_span if x_span > 1e-10 else 1.0
+    if x_span < 1e-10:
+        x_span = y_span
+
+    aspect_ratio = x_span / y_span
+
+    # Clamp to reasonable bounds (avoid extreme rectangles)
+    aspect_ratio = max(0.05, min(20.0, aspect_ratio))
+
+    y_bins = max(1, int(math.sqrt(target_points / aspect_ratio)))
+    x_bins = max(1, int(math.sqrt(target_points * aspect_ratio)))
+
+    return x_bins, y_bins
+
+
 def compute_compression_levels(min_size: int, total: int) -> List[int]:
     """
     Compute logarithmically-spaced compression level target sizes.
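The compute_optimal_bins helper added above sizes the bin grid from the data's aspect ratio. A minimal worked sketch of the same arithmetic, reproducing the 10:1 example from its docstring (values chosen purely for illustration):

    import math

    target_points = 10_000
    x_span, y_span = 1000.0, 100.0  # e.g. x covers 0-1000, y covers 0-100
    aspect_ratio = max(0.05, min(20.0, x_span / y_span))  # clamped, here 10.0

    y_bins = max(1, int(math.sqrt(target_points / aspect_ratio)))  # sqrt(1000)   -> 31
    x_bins = max(1, int(math.sqrt(target_points * aspect_ratio)))  # sqrt(100000) -> 316
    print(x_bins, y_bins, x_bins * y_bins)  # 316 31 9796, roughly one point per bin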
@@ -55,12 +110,10 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 
     # Generate levels at each power of 10, scaled by the fractional part
     scale_factor = int(10 ** (np.log10(min_size) % 1))
-    levels = np.logspace(
-        min_power,
-        max_power,
-        max_power - min_power + 1,
-        dtype='int'
-    ) * scale_factor
+    levels = (
+        np.logspace(min_power, max_power, max_power - min_power + 1, dtype="int")
+        * scale_factor
+    )
 
     # Filter out levels >= total (don't include full resolution for large datasets)
     levels = levels[levels < total].tolist()
@@ -75,9 +128,9 @@ def compute_compression_levels(min_size: int, total: int) -> List[int]:
 def downsample_2d(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
 ) -> pl.LazyFrame:
@@ -106,8 +159,7 @@ def downsample_2d(
     """
     if not HAS_SCIPY:
         raise ImportError(
-            "scipy is required for downsample_2d. "
-            "Install with: pip install scipy"
+            "scipy is required for downsample_2d. Install with: pip install scipy"
         )
 
     if (x_bins * y_bins) > max_points:
@@ -122,12 +174,9 @@ def downsample_2d(
 
     # Sort by intensity (descending) to prioritize high-intensity points
     sorted_data = (
-        data
-        .sort([x_column, intensity_column], descending=[False, True])
-        .with_columns([
-            pl.int_range(pl.len()).over(x_column).alias('_rank')
-        ])
-        .sort(['_rank', intensity_column], descending=[False, True])
+        data.sort([x_column, intensity_column], descending=[False, True])
+        .with_columns([pl.int_range(pl.len()).over(x_column).alias("_rank")])
+        .sort(["_rank", intensity_column], descending=[False, True])
     )
 
     # Collect for scipy binning (requires numpy arrays)
@@ -136,7 +185,7 @@ def downsample_2d(
     total_count = len(collected)
     if total_count <= max_points:
         # No downsampling needed
-        return collected.drop('_rank').lazy()
+        return collected.drop("_rank").lazy()
 
     # Extract arrays for scipy
     x_array = collected[x_column].to_numpy()
@@ -145,18 +194,20 @@ def downsample_2d(
 
     # Compute 2D bins
     count, _, _, mapping = binned_statistic_2d(
-        x_array, y_array, intensity_array, 'count',
+        x_array,
+        y_array,
+        intensity_array,
+        "count",
         bins=[x_bins, y_bins],
-        expand_binnumbers=True
+        expand_binnumbers=True,
     )
 
     # Add bin indices to dataframe
-    binned_data = (
-        collected.lazy()
-        .with_columns([
-            pl.Series('_x_bin', mapping[0] - 1),  # scipy uses 1-based indexing
-            pl.Series('_y_bin', mapping[1] - 1)
-        ])
+    binned_data = collected.lazy().with_columns(
+        [
+            pl.Series("_x_bin", mapping[0] - 1),  # scipy uses 1-based indexing
+            pl.Series("_y_bin", mapping[1] - 1),
+        ]
     )
 
     # Compute max peaks per bin to stay under limit
@@ -174,11 +225,10 @@ def downsample_2d(
 
     # Keep top N peaks per bin
     result = (
-        binned_data
-        .group_by(['_x_bin', '_y_bin'])
+        binned_data.group_by(["_x_bin", "_y_bin"])
         .head(max_peaks_per_bin)
         .sort(intensity_column)
-        .drop(['_rank', '_x_bin', '_y_bin'])
+        .drop(["_rank", "_x_bin", "_y_bin"])
     )
 
     return result
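The diff does not show a call site for downsample_2d; the following is a hedged usage sketch with synthetic data (the import path is assumed from the package layout above, and scipy must be installed):

    import numpy as np
    import polars as pl

    from openms_insight.preprocessing import downsample_2d  # assumed import path

    # Synthetic peak map with the default column names expected by downsample_2d.
    rng = np.random.default_rng(0)
    n = 1_000_000
    peaks = pl.DataFrame(
        {
            "x": rng.uniform(0, 1000, n),
            "y": rng.uniform(100, 1500, n),
            "intensity": rng.exponential(1e4, n),
        }
    )

    # Keeps at most max_points rows, favouring high-intensity points within
    # each of the x_bins * y_bins spatial cells.
    reduced = downsample_2d(peaks, max_points=20_000, x_bins=400, y_bins=50).collect()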
@@ -187,7 +237,7 @@ def downsample_2d(
 def downsample_2d_simple(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    intensity_column: str = 'intensity',
+    intensity_column: str = "intensity",
 ) -> pl.LazyFrame:
     """
     Simple downsampling by keeping highest-intensity points.
@@ -206,19 +256,15 @@ def downsample_2d_simple(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    return (
-        data
-        .sort(intensity_column, descending=True)
-        .head(max_points)
-    )
+    return data.sort(intensity_column, descending=True).head(max_points)
 
 
 def downsample_2d_streaming(
     data: Union[pl.LazyFrame, pl.DataFrame],
     max_points: int = 20000,
-    x_column: str = 'x',
-    y_column: str = 'y',
-    intensity_column: str = 'intensity',
+    x_column: str = "x",
+    y_column: str = "y",
+    intensity_column: str = "intensity",
     x_bins: int = 400,
     y_bins: int = 50,
     x_range: Optional[tuple] = None,
@@ -262,43 +308,51 @@ def downsample_2d_streaming(
             ((pl.col(x_column) - x_min) / (x_max - x_min + 1e-10) * x_bins)
             .cast(pl.Int32)
             .clip(0, x_bins - 1)
-            .alias('_x_bin')
+            .alias("_x_bin")
         )
         y_bin_expr = (
             ((pl.col(y_column) - y_min) / (y_max - y_min + 1e-10) * y_bins)
             .cast(pl.Int32)
             .clip(0, y_bins - 1)
-            .alias('_y_bin')
+            .alias("_y_bin")
         )
 
         result = (
-            data
-            .with_columns([x_bin_expr, y_bin_expr])
+            data.with_columns([x_bin_expr, y_bin_expr])
             .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
     else:
         # Need to compute ranges - still lazy using over() window
         # First pass: add normalized bin columns using min/max over entire frame
         result = (
-            data
-            .with_columns([
-                # Compute bin indices using window functions for min/max
-                (
-                    (pl.col(x_column) - pl.col(x_column).min()) /
-                    (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10) * x_bins
-                ).cast(pl.Int32).clip(0, x_bins - 1).alias('_x_bin'),
-                (
-                    (pl.col(y_column) - pl.col(y_column).min()) /
-                    (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10) * y_bins
-                ).cast(pl.Int32).clip(0, y_bins - 1).alias('_y_bin'),
-            ])
+            data.with_columns(
+                [
+                    # Compute bin indices using window functions for min/max
+                    (
+                        (pl.col(x_column) - pl.col(x_column).min())
+                        / (pl.col(x_column).max() - pl.col(x_column).min() + 1e-10)
+                        * x_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, x_bins - 1)
+                    .alias("_x_bin"),
+                    (
+                        (pl.col(y_column) - pl.col(y_column).min())
+                        / (pl.col(y_column).max() - pl.col(y_column).min() + 1e-10)
+                        * y_bins
+                    )
+                    .cast(pl.Int32)
+                    .clip(0, y_bins - 1)
+                    .alias("_y_bin"),
+                ]
+            )
            .sort(intensity_column, descending=True)
-            .group_by(['_x_bin', '_y_bin'])
+            .group_by(["_x_bin", "_y_bin"])
             .head(points_per_bin)
-            .drop(['_x_bin', '_y_bin'])
+            .drop(["_x_bin", "_y_bin"])
         )
 
     return result
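downsample_2d_streaming keeps the whole pipeline lazy: bin indices come from Polars expressions and only the top points_per_bin rows per bin survive. A hedged sketch of wiring it up with get_data_range and the new compute_optimal_bins (input file and import path are placeholders; parameter names are taken from the signatures in this diff):

    import polars as pl

    from openms_insight.preprocessing.compression import (  # assumed import path
        compute_optimal_bins,
        downsample_2d_streaming,
        get_data_range,
    )

    lf = pl.scan_parquet("peaks.parquet")  # placeholder input

    # One cheap pass for the ranges, then an aspect-ratio-aware grid.
    x_range, y_range = get_data_range(lf, x_column="x", y_column="y")
    x_bins, y_bins = compute_optimal_bins(20_000, x_range, y_range)

    reduced = downsample_2d_streaming(
        lf,
        max_points=20_000,
        x_bins=x_bins,
        y_bins=y_bins,
        x_range=x_range,  # precomputed ranges select the fully lazy branch
        y_range=y_range,
    ).collect()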
@@ -325,14 +379,16 @@ def get_data_range(
     if isinstance(data, pl.DataFrame):
         data = data.lazy()
 
-    stats = data.select([
-        pl.col(x_column).min().alias('x_min'),
-        pl.col(x_column).max().alias('x_max'),
-        pl.col(y_column).min().alias('y_min'),
-        pl.col(y_column).max().alias('y_max'),
-    ]).collect()
+    stats = data.select(
+        [
+            pl.col(x_column).min().alias("x_min"),
+            pl.col(x_column).max().alias("x_max"),
+            pl.col(y_column).min().alias("y_min"),
+            pl.col(y_column).max().alias("y_max"),
+        ]
+    ).collect()
 
     return (
-        (stats['x_min'][0], stats['x_max'][0]),
-        (stats['y_min'][0], stats['y_max'][0]),
+        (stats["x_min"][0], stats["x_max"][0]),
+        (stats["y_min"][0], stats["y_max"][0]),
     )
@@ -1,11 +1,10 @@
 """Data filtering utilities for selection-based filtering."""
 
+import hashlib
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-import hashlib
 import pandas as pd
 import polars as pl
-import streamlit as st
 
 
 def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
@@ -35,10 +34,12 @@ def optimize_for_transfer(df: pl.DataFrame) -> pl.DataFrame:
         # JS safe integer is 2^53, but Int32 range is simpler and sufficient for most data
         if dtype == pl.Int64:
             # Get min/max in a single pass
-            stats = df.select([
-                pl.col(col).min().alias('min'),
-                pl.col(col).max().alias('max'),
-            ]).row(0)
+            stats = df.select(
+                [
+                    pl.col(col).min().alias("min"),
+                    pl.col(col).max().alias("max"),
+                ]
+            ).row(0)
             col_min, col_max = stats
 
             if col_min is not None and col_max is not None:
@@ -150,14 +151,39 @@ def compute_dataframe_hash(df: pl.DataFrame) -> str:
     # Add sum of numeric columns for content verification
     for col in df.columns:
         dtype = df[col].dtype
-        if dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
-                     pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
-                     pl.Float32, pl.Float64):
+        if dtype in (
+            pl.Int8,
+            pl.Int16,
+            pl.Int32,
+            pl.Int64,
+            pl.UInt8,
+            pl.UInt16,
+            pl.UInt32,
+            pl.UInt64,
+            pl.Float32,
+            pl.Float64,
+        ):
             try:
                 col_sum = df[col].sum()
                 hash_parts.append(f"{col}:{col_sum}")
             except Exception:
                 pass
+        elif dtype == pl.Boolean:
+            # Count True values for boolean columns (important for annotations)
+            try:
+                true_count = df[col].sum()  # True=1, False=0
+                hash_parts.append(f"{col}_bool:{true_count}")
+            except Exception:
+                pass
+        elif dtype == pl.Utf8 and col.startswith("_dynamic"):
+            # Hash content of dynamic string columns (annotations)
+            try:
+                # Use hash of all non-empty values for annotation text
+                non_empty = df[col].filter(pl.col(col) != "").to_list()
+                if non_empty:
+                    hash_parts.append(f"{col}_str:{hash(tuple(non_empty))}")
+            except Exception:
+                pass
 
     hash_input = "|".join(hash_parts).encode()
     return hashlib.sha256(hash_input).hexdigest()
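compute_dataframe_hash now folds Boolean columns and the text of _dynamic* string columns into the digest, so toggling an annotation flag or editing annotation text yields a different hash. A small hedged sketch of the observable effect (column names invented for illustration, import path assumed):

    import polars as pl

    from openms_insight.preprocessing.filtering import compute_dataframe_hash  # assumed path

    df = pl.DataFrame(
        {
            "x": [1.0, 2.0, 3.0],
            "selected": [False, False, False],     # Boolean column now contributes its True-count
            "_dynamic_label": ["", "peak A", ""],  # _dynamic* text is hashed as well
        }
    )

    h1 = compute_dataframe_hash(df)
    h2 = compute_dataframe_hash(df.with_columns(pl.Series("selected", [True, False, False])))
    assert h1 != h2  # flipping a single annotation flag changes the hash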
@@ -352,10 +378,10 @@ def filter_by_range(
         data = data.lazy()
 
     return data.filter(
-        (pl.col(x_column) >= x_range[0]) &
-        (pl.col(x_column) <= x_range[1]) &
-        (pl.col(y_column) >= y_range[0]) &
-        (pl.col(y_column) <= y_range[1])
+        (pl.col(x_column) >= x_range[0])
+        & (pl.col(x_column) <= x_range[1])
+        & (pl.col(y_column) >= y_range[0])
+        & (pl.col(y_column) <= y_range[1])
     )
 
 
@@ -1,6 +1,6 @@
 """Rendering utilities for Python-to-Vue communication."""
 
-from .bridge import render_component, get_vue_component_function
+from .bridge import get_vue_component_function, render_component
 
 __all__ = [
     "render_component",