PyPI - masster - Versions diffs - 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl - Mend

masster 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (13) hide show

masster/_version.py +1 -1
masster/lib/__init__.py +9 -0
masster/lib/lib.py +598 -0
masster/sample/sample5_schema.json +44 -44
masster/study/h5.py +0 -13
masster/study/helpers.py +1 -0
masster/study/plot.py +136 -289
masster/study/study5_schema.json +149 -149
{masster-0.4.9.dist-info → masster-0.4.11.dist-info}/METADATA +1 -1
{masster-0.4.9.dist-info → masster-0.4.11.dist-info}/RECORD +13 -11
{masster-0.4.9.dist-info → masster-0.4.11.dist-info}/WHEEL +0 -0
{masster-0.4.9.dist-info → masster-0.4.11.dist-info}/entry_points.txt +0 -0
{masster-0.4.9.dist-info → masster-0.4.11.dist-info}/licenses/LICENSE +0 -0

masster/study/plot.py CHANGED Viewed

@@ -226,8 +226,7 @@ def _isolated_show_panel_notebook(panel_obj):
 def plot_alignment(
     self,
-    samples=None,
-    maps: bool = True,
+    samples=50,
     filename: str | None = None,
     width: int = 450,
     height: int = 450,
@@ -235,322 +234,172 @@ def plot_alignment(
 ):
     """Visualize retention time alignment using two synchronized Bokeh scatter plots.
-    - When ``maps=True`` the function reads ``self.features_maps`` (list of FeatureMap)
-      and builds two side-by-side plots: Original RT (left) and Current/Aligned RT (right).
-    - When ``maps=False`` the function uses ``self.features_df`` and expects an
-      ``rt_original`` column (before) and ``rt`` column (after).
+    Uses ``features_df`` to create side-by-side plots showing Original RT (left)
+    and Current/Aligned RT (right). If no alignment has been performed yet,
+    both plots show the current RT values.
-    Parameters
+    Parameters:
     - samples: List of sample identifiers (sample_uids or sample_names), or single int for random selection, or None for all samples.
-    - maps: whether to use feature maps (default True).
     - filename: optional HTML file path to save the plot.
     - width/height: pixel size of each subplot.
     - markersize: base marker size.
-    Returns
+    Returns:
     - Bokeh layout (row) containing the two synchronized plots.
     """
     # Local imports so the module can be used even if bokeh isn't needed elsewhere
     from bokeh.models import ColumnDataSource, HoverTool
-    from bokeh.plotting import figure, show, output_file
+    from bokeh.plotting import figure
     import pandas as pd
-    # Get sample_uids to filter by if specified
-    sample_uids = self._get_sample_uids(samples) if samples is not None else None
+    # Check if features_df exists
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.error("No features_df found. Load features first.")
+        return
-    # Build the before/after tabular data used for plotting
-    before_data: list[dict[str, Any]] = []
-    after_data: list[dict[str, Any]] = []
+    # Check required columns
+    required_cols = ["rt", "mz", "inty"]
+    missing = [c for c in required_cols if c not in self.features_df.columns]
+    if missing:
+        self.logger.error(f"Missing required columns in features_df: {missing}")
+        return
-    if maps:
-        # Ensure feature maps are loaded
-        if self.features_maps is None or len(self.features_maps) == 0:
-            self.load_features()
+    # Check if alignment has been performed
+    has_alignment = "rt_original" in self.features_df.columns
+    if not has_alignment:
+        self.logger.warning("Column 'rt_original' not found - alignment has not been performed yet.")
+        self.logger.info("Showing current RT values for both plots. Run align() first to see alignment comparison.")
+    # Get sample_uids to filter by if specified
+    sample_uids = self._get_sample_uids(samples) if samples is not None else None
-        fmaps = self.features_maps or []
+    # Start with full features_df
+    features_df = self.features_df
-        if not fmaps:
-            self.logger.error("No feature maps available for plotting.")
+    # Filter by selected samples if specified
+    if sample_uids is not None:
+        features_df = features_df.filter(pl.col("sample_uid").is_in(sample_uids))
+        if features_df.is_empty():
+            self.logger.error("No features found for the selected samples.")
             return
-        # Filter feature maps by selected samples if specified
-        if sample_uids is not None:
-            # Create mapping from sample_uid to map_id and filter accordingly
-            if hasattr(self, "samples_df") and self.samples_df is not None and not self.samples_df.is_empty():
-                samples_info = self.samples_df.to_pandas()
-                # Filter samples_info to only selected sample_uids and get their map_ids
-                selected_samples = samples_info[samples_info["sample_uid"].isin(sample_uids)]
-                if selected_samples.empty:
-                    self.logger.error("No matching samples found for the provided sample_uids.")
-                    return
-                # Get the map_ids for selected samples
-                selected_map_ids = selected_samples["map_id"].tolist()
-                # Filter feature maps based on map_ids
-                filtered_maps = []
-                for map_id in selected_map_ids:
-                    if 0 <= map_id < len(fmaps):
-                        filtered_maps.append(fmaps[map_id])
-                fmaps = filtered_maps
-                samples_info = selected_samples.reset_index(drop=True)
-                if not fmaps:
-                    self.logger.error("No feature maps found for the selected samples.")
-                    return
-            else:
-                self.logger.warning("Cannot filter feature maps: no samples_df available")
+    # Determine sample column
+    sample_col = "sample_uid" if "sample_uid" in features_df.columns else "sample_name"
+    if sample_col not in features_df.columns:
+        self.logger.error("No sample identifier column found in features_df.")
+        return
-        if not fmaps:
-            self.logger.error("No feature maps available after filtering.")
-            return
+    # Get unique samples
+    samples_list = features_df.select(pl.col(sample_col)).unique().to_series().to_list()
-        # Reference (first) sample: use current RT for both before and after
-        ref = fmaps[0]
-        ref_rt = [f.getRT() for f in ref]
-        ref_mz = [f.getMZ() for f in ref]
-        ref_inty = [f.getIntensity() for f in ref]
-        max_ref_inty = max(ref_inty) if ref_inty else 1
-        # Get sample metadata for reference (first) sample
-        if hasattr(self, "samples_df") and self.samples_df is not None and not self.samples_df.is_empty():
-            if 'samples_info' not in locals():
-                samples_info = self.samples_df.to_pandas()
-            ref_sample_uid = (
-                samples_info.iloc[0]["sample_uid"] if "sample_uid" in samples_info.columns else "Reference_UID"
-            )
-            ref_sample_name = (
-                samples_info.iloc[0]["sample_name"] if "sample_name" in samples_info.columns else "Reference"
-            )
-        else:
-            ref_sample_uid = "Reference_UID"
-            ref_sample_name = "Reference"
+    # Build plotting data
+    before_data: list[dict[str, Any]] = []
+    after_data: list[dict[str, Any]] = []
+    for sample_idx, sample in enumerate(samples_list):
+        # Filter sample data
+        sample_data = features_df.filter(pl.col(sample_col) == sample)
+        # Sample data if too large for performance
+        max_points_per_sample = 10000
+        if sample_data.height > max_points_per_sample:
+            self.logger.info(f"Sample {sample}: Sampling {max_points_per_sample} points from {sample_data.height} features for performance")
+            sample_data = sample_data.sample(n=max_points_per_sample, seed=42)
+        # Calculate max intensity for alpha scaling
+        max_inty = sample_data.select(pl.col("inty").max()).item() or 1
+        # Get sample information
+        sample_name = str(sample)
+        sample_uid = sample if sample_col == "sample_uid" else sample_data.select(pl.col("sample_uid")).item() if "sample_uid" in sample_data.columns else sample
+        # Select columns to process
+        cols_to_select = ["rt", "mz", "inty"]
+        if has_alignment:
+            cols_to_select.append("rt_original")
+        sample_dict = sample_data.select(cols_to_select).to_dicts()
+        for row_dict in sample_dict:
+            rt_original = row_dict.get("rt_original", row_dict["rt"]) if has_alignment else row_dict["rt"]
+            rt_current = row_dict["rt"]
+            mz = row_dict["mz"]
+            inty = row_dict["inty"]
+            alpha = inty / max_inty
+            size = markersize + 2 if sample_idx == 0 else markersize
-        for rt, mz, inty in zip(ref_rt, ref_mz, ref_inty):
             before_data.append({
-                "rt": rt,
+                "rt": rt_original,
                 "mz": mz,
                 "inty": inty,
-                "alpha": inty / max_ref_inty,
-                "sample_idx": 0,
-                "sample_name": ref_sample_name,
-                "sample_uid": ref_sample_uid,
-                "size": markersize + 2,
+                "alpha": alpha,
+                "sample_idx": sample_idx,
+                "sample_name": sample_name,
+                "sample_uid": sample_uid,
+                "size": size,
             })
             after_data.append({
-                "rt": rt,
+                "rt": rt_current,
                 "mz": mz,
                 "inty": inty,
-                "alpha": inty / max_ref_inty,
-                "sample_idx": 0,
-                "sample_name": ref_sample_name,
-                "sample_uid": ref_sample_uid,
-                "size": markersize + 2,
+                "alpha": alpha,
+                "sample_idx": sample_idx,
+                "sample_name": sample_name,
+                "sample_uid": sample_uid,
+                "size": size,
             })
-        # Remaining samples - now using filtered feature maps and samples_info
-        for sample_idx, fm in enumerate(fmaps[1:], start=1):
-            mz_vals = []
-            inty_vals = []
-            original_rt = []
-            aligned_rt = []
-            for f in fm:
-                try:
-                    orig = f.getMetaValue("original_RT")
-                except Exception:
-                    orig = None
-                if orig is None:
-                    original_rt.append(f.getRT())
-                else:
-                    original_rt.append(orig)
-                aligned_rt.append(f.getRT())
-                mz_vals.append(f.getMZ())
-                inty_vals.append(f.getIntensity())
-            if not inty_vals:
-                continue
-            max_inty = max(inty_vals)
-            # Get sample metadata from filtered samples_info
-            if hasattr(self, "samples_df") and self.samples_df is not None and not self.samples_df.is_empty():
-                # Use filtered samples_info if it exists from the filtering above
-                if 'samples_info' in locals() and sample_idx < len(samples_info):
-                    sample_name = samples_info.iloc[sample_idx].get("sample_name", f"Sample {sample_idx}")
-                    sample_uid = samples_info.iloc[sample_idx].get("sample_uid", f"Sample_{sample_idx}_UID")
-                else:
-                    # Fallback to original samples_df if filtered samples_info is not available
-                    all_samples_info = self.samples_df.to_pandas()
-                    if sample_idx < len(all_samples_info):
-                        sample_name = all_samples_info.iloc[sample_idx].get("sample_name", f"Sample {sample_idx}")
-                        sample_uid = all_samples_info.iloc[sample_idx].get("sample_uid", f"Sample_{sample_idx}_UID")
-                    else:
-                        sample_name = f"Sample {sample_idx}"
-                        sample_uid = f"Sample_{sample_idx}_UID"
-            else:
-                sample_name = f"Sample {sample_idx}"
-                sample_uid = f"Sample_{sample_idx}_UID"
-            for rt, mz, inty in zip(original_rt, mz_vals, inty_vals):
-                before_data.append({
-                    "rt": rt,
-                    "mz": mz,
-                    "inty": inty,
-                    "alpha": inty / max_inty,
-                    "sample_idx": sample_idx,
-                    "sample_name": sample_name,
-                    "sample_uid": sample_uid,
-                    "size": markersize,
-                })
-            for rt, mz, inty in zip(aligned_rt, mz_vals, inty_vals):
-                after_data.append({
-                    "rt": rt,
-                    "mz": mz,
-                    "inty": inty,
-                    "alpha": inty / max_inty,
-                    "sample_idx": sample_idx,
-                    "sample_name": sample_name,
-                    "sample_uid": sample_uid,
-                    "size": markersize,
-                })
-    else:
-        # Use features_df
-        if self.features_df is None or self.features_df.is_empty():
-            self.logger.error("No features_df found. Load features first.")
-            return
-        required_cols = ["rt", "mz", "inty"]
-        missing = [c for c in required_cols if c not in self.features_df.columns]
-        if missing:
-            self.logger.error(f"Missing required columns in features_df: {missing}")
-            return
-        if "rt_original" not in self.features_df.columns:
-            self.logger.error("Column 'rt_original' not found in features_df. Alignment may not have been performed.")
-            return
-        # Use Polars instead of pandas
-        features_df = self.features_df
-        # Filter by selected samples if specified
-        if sample_uids is not None:
-            features_df = features_df.filter(pl.col("sample_uid").is_in(sample_uids))
-            if features_df.is_empty():
-                self.logger.error("No features found for the selected samples.")
-                return
-        sample_col = "sample_uid" if "sample_uid" in features_df.columns else "sample_name"
-        if sample_col not in features_df.columns:
-            self.logger.error("No sample identifier column found in features_df.")
-            return
-        # Get unique samples using Polars
-        samples = features_df.select(pl.col(sample_col)).unique().to_series().to_list()
-        for sample_idx, sample in enumerate(samples):
-            # Filter sample data using Polars
-            sample_data = features_df.filter(pl.col(sample_col) == sample)
-            # Calculate max intensity using Polars
-            max_inty = sample_data.select(pl.col("inty").max()).item()
-            max_inty = max_inty if max_inty and max_inty > 0 else 1
+    # Check if we have any data to plot
+    if not before_data:
+        self.logger.error("No data to plot.")
+        return
-            sample_name = str(sample)
-            # Get sample_uid - if sample_col is 'sample_uid', use sample directly
-            if sample_col == "sample_uid":
-                sample_uid = sample
-            else:
-                # Try to get sample_uid from the first row if it exists
-                if "sample_uid" in sample_data.columns:
-                    sample_uid = sample_data.select(pl.col("sample_uid")).item()
-                else:
-                    sample_uid = sample
-            # Convert to dict for iteration - more efficient than row-by-row processing
-            sample_dict = sample_data.select(["rt_original", "rt", "mz", "inty"]).to_dicts()
-            for row_dict in sample_dict:
-                rt_original = row_dict["rt_original"]
-                rt_current = row_dict["rt"]
-                mz = row_dict["mz"]
-                inty = row_dict["inty"]
-                alpha = inty / max_inty
-                size = markersize + 2 if sample_idx == 0 else markersize
-                before_data.append({
-                    "rt": rt_original,
-                    "mz": mz,
-                    "inty": inty,
-                    "alpha": alpha,
-                    "sample_idx": sample_idx,
-                    "sample_name": sample_name,
-                    "sample_uid": sample_uid,
-                    "size": size,
-                })
-                after_data.append({
-                    "rt": rt_current,
-                    "mz": mz,
-                    "inty": inty,
-                    "alpha": alpha,
-                    "sample_idx": sample_idx,
-                    "sample_name": sample_name,
-                    "sample_uid": sample_uid,
-                    "size": size,
-                })
-    # Get sample colors from samples_df using sample indices
-    # Extract unique sample information from the dictionaries we created
-    if before_data:
-        # Create mapping from sample_idx to sample_uid more efficiently
-        sample_idx_to_uid = {}
-        for item in before_data:
-            if item["sample_idx"] not in sample_idx_to_uid:
-                sample_idx_to_uid[item["sample_idx"]] = item["sample_uid"]
-    else:
-        sample_idx_to_uid = {}
+    # Get sample colors from samples_df
+    sample_idx_to_uid = {}
+    for item in before_data:
+        if item["sample_idx"] not in sample_idx_to_uid:
+            sample_idx_to_uid[item["sample_idx"]] = item["sample_uid"]
-    # Get colors from samples_df
+    # Get colors from samples_df if available
     sample_uids_list = list(sample_idx_to_uid.values())
+    color_map: dict[int, str] = {}
     if sample_uids_list and hasattr(self, "samples_df") and self.samples_df is not None:
-        sample_colors = (
-            self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids_list))
-            .select(["sample_uid", "sample_color"])
-            .to_dict(as_series=False)
-        )
-        uid_to_color = dict(zip(sample_colors["sample_uid"], sample_colors["sample_color"]))
+        try:
+            sample_colors = (
+                self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids_list))
+                .select(["sample_uid", "sample_color"])
+                .to_dict(as_series=False)
+            )
+            uid_to_color = dict(zip(sample_colors["sample_uid"], sample_colors["sample_color"]))
+            for sample_idx, sample_uid in sample_idx_to_uid.items():
+                color_map[sample_idx] = uid_to_color.get(sample_uid, "#1f77b4")
+        except Exception:
+            # Fallback to default colors if sample colors not available
+            for sample_idx in sample_idx_to_uid.keys():
+                color_map[sample_idx] = "#1f77b4"
     else:
-        uid_to_color = {}
-    # Create color map for sample indices
-    color_map: dict[int, str] = {}
-    for sample_idx, sample_uid in sample_idx_to_uid.items():
-        color_map[sample_idx] = uid_to_color.get(sample_uid, "#1f77b4")  # fallback to blue
-    # Add sample_color to data dictionaries before creating DataFrames
-    if before_data:
-        for item in before_data:
-            item["sample_color"] = color_map.get(item["sample_idx"], "#1f77b4")
+        # Default colors
+        for sample_idx in sample_idx_to_uid.keys():
+            color_map[sample_idx] = "#1f77b4"
-    if after_data:
-        for item in after_data:
-            item["sample_color"] = color_map.get(item["sample_idx"], "#1f77b4")
+    # Add sample_color to data
+    for item in before_data + after_data:
+        item["sample_color"] = color_map.get(item["sample_idx"], "#1f77b4")
-    # Now create DataFrames with the sample_color already included
-    before_df = pd.DataFrame(before_data) if before_data else pd.DataFrame()
-    after_df = pd.DataFrame(after_data) if after_data else pd.DataFrame()
+    # Create DataFrames
+    before_df = pd.DataFrame(before_data)
+    after_df = pd.DataFrame(after_data)
     # Create Bokeh figures
+    title_before = "Original RT" if has_alignment else "Current RT (No Alignment)"
+    title_after = "Aligned RT" if has_alignment else "Current RT (Copy)"
     p1 = figure(
         width=width,
         height=height,
-        title="Original RT",
+        title=title_before,
         x_axis_label="Retention Time (s)",
         y_axis_label="m/z",
         tools="pan,wheel_zoom,box_zoom,reset,save",
@@ -563,7 +412,7 @@ def plot_alignment(
     p2 = figure(
         width=width,
         height=height,
-        title="Current RT",
+        title=title_after,
         x_axis_label="Retention Time (s)",
         y_axis_label="m/z",
         tools="pan,wheel_zoom,box_zoom,reset,save",
@@ -575,16 +424,15 @@ def plot_alignment(
     p2.border_fill_color = "white"
     p2.min_border = 0
-    # Get unique sample indices for iteration
-    unique_samples = sorted(list({item["sample_idx"] for item in before_data})) if before_data else []
+    # Plot data by sample
+    unique_samples = sorted(list({item["sample_idx"] for item in before_data}))
     renderers_before = []
     renderers_after = []
     for sample_idx in unique_samples:
         sb = before_df[before_df["sample_idx"] == sample_idx]
         sa = after_df[after_df["sample_idx"] == sample_idx]
-        color = color_map.get(sample_idx, "#000000")
+        color = color_map.get(sample_idx, "#1f77b4")
         if not sb.empty:
             src = ColumnDataSource(sb)
@@ -623,8 +471,7 @@ def plot_alignment(
     )
     p2.add_tools(hover2)
-    # Create layout with both plots side by side
-    # Use the aliased bokeh_row and set sizing_mode, width and height to avoid validation warnings.
+    # Create layout
     layout = bokeh_row(p1, p2, sizing_mode="fixed", width=width, height=height)
     # Apply consistent save/display behavior
@@ -878,7 +725,7 @@ def plot_consensus_2d(
 def plot_samples_2d(
     self,
-    samples=None,
+    samples=100,
     filename=None,
     markersize=2,
     size="dynamic",
@@ -1112,7 +959,7 @@ def plot_samples_2d(
 def plot_bpc(
     self,
-    samples=None,
+    samples=100,
     title: str | None = None,
     filename: str | None = None,
     width: int = 1000,
@@ -1288,7 +1135,7 @@ def plot_eic(
     self,
     mz,
     mz_tol=None,
-    samples=None,
+    samples=100,
     title: str | None = None,
     filename: str | None = None,
     width: int = 1000,
@@ -1457,7 +1304,7 @@ def plot_eic(
 def plot_rt_correction(
     self,
-    samples=None,
+    samples=200,
     title: str | None = None,
     filename: str | None = None,
     width: int = 1000,
@@ -1611,7 +1458,7 @@ def plot_rt_correction(
 def plot_chrom(
     self,
     uids=None,
-    samples=None,
+    samples=100,
     filename=None,
     aligned=True,
     width=800,
@@ -2309,7 +2156,7 @@ def plot_pca(
 def plot_tic(
     self,
-    samples=None,
+    samples=100,
     title: str | None = None,
     filename: str | None = None,
     width: int = 1000,

masster 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl

Potentially problematic release.

masster 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl