masster 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/load.py +5 -4
- masster/study/defaults/align_def.py +0 -204
- masster/study/defaults/fill_def.py +9 -1
- masster/study/defaults/merge_def.py +20 -69
- masster/study/export.py +25 -5
- masster/study/h5.py +160 -42
- masster/study/helpers.py +430 -53
- masster/study/load.py +986 -158
- masster/study/merge.py +683 -1076
- masster/study/plot.py +43 -38
- masster/study/processing.py +337 -280
- masster/study/study.py +58 -135
- masster/wizard/wizard.py +20 -6
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/METADATA +1 -1
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/RECORD +19 -20
- masster/study/defaults/fill_chrom_def.py +0 -260
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/WHEEL +0 -0
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/entry_points.txt +0 -0
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/licenses/LICENSE +0 -0
masster/study/processing.py
CHANGED
@@ -15,77 +15,6 @@ from masster.study.defaults import (
 )


-def _generate_feature_maps_on_demand_for_align(study):
-    """
-    Generate feature maps on-demand from study.features_df for alignment operations.
-    Returns temporary feature maps that are not cached in the study.
-
-    Args:
-        study: Study object containing features_df and samples_df
-
-    Returns:
-        list: List of temporary FeatureMap objects
-    """
-    import polars as pl
-    import pyopenms as oms
-
-    if study.features_df is None or len(study.features_df) == 0:
-        study.logger.error("No features_df available for generating feature maps")
-        return []
-
-    temp_feature_maps = []
-
-    # Process each sample in order
-    for sample_index, row_dict in enumerate(study.samples_df.iter_rows(named=True)):
-        sample_uid = row_dict["sample_uid"]
-        sample_name = row_dict["sample_name"]
-
-        # Get features for this sample from features_df
-        sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
-
-        # Create new FeatureMap
-        feature_map = oms.FeatureMap()
-
-        # Convert DataFrame features to OpenMS Features
-        for feature_row in sample_features.iter_rows(named=True):
-            feature = oms.Feature()
-
-            # Set properties from DataFrame (handle missing values gracefully)
-            try:
-                # Skip features with missing critical data
-                if feature_row["mz"] is None:
-                    study.logger.warning("Skipping feature due to missing mz")
-                    continue
-                if feature_row["rt"] is None:
-                    study.logger.warning("Skipping feature due to missing rt")
-                    continue
-                if feature_row["inty"] is None:
-                    study.logger.warning("Skipping feature due to missing inty")
-                    continue
-
-                feature.setUniqueId(int(feature_row["feature_id"]))
-                feature.setMZ(float(feature_row["mz"]))
-                feature.setRT(float(feature_row["rt"]))
-                feature.setIntensity(float(feature_row["inty"]))
-
-                # Handle optional fields that might be None
-                if feature_row.get("quality") is not None:
-                    feature.setOverallQuality(float(feature_row["quality"]))
-                if feature_row.get("charge") is not None:
-                    feature.setCharge(int(feature_row["charge"]))
-
-                # Add to feature map
-                feature_map.push_back(feature)
-            except (ValueError, TypeError) as e:
-                study.logger.warning(f"Skipping feature due to conversion error: {e}")
-                continue
-
-        temp_feature_maps.append(feature_map)
-
-    study.logger.debug(f"Generated {len(temp_feature_maps)} temporary feature maps from features_df for alignment")
-    return temp_feature_maps
-
-
 def align(self, **kwargs):
     """Align feature maps using pose clustering or KD algorithm and update feature RTs.

@@ -103,30 +32,7 @@ def align(self, **kwargs):
         - algorithm (str): Alignment algorithm ('pc' for PoseClustering, 'kd' for KD).

     KD algorithm specific parameters:
-        - min_samples (int): Minimum number of samples required for KD alignment.
-        - nr_partitions (int): Number of partitions in m/z dimension.
-        - warp_enabled (bool): Enable non-linear retention time transformation.
-        - warp_rt_tol (float): RT tolerance for the LOWESS fit.
         - warp_mz_tol (float): m/z tolerance for the LOWESS fit.
-        - warp_max_pairwise_log_fc (float): Maximum absolute log10 fold-change threshold for pairing.
-        - warp_min_rel_cc_size (float): Minimum relative connected component size.
-        - warp_max_nr_conflicts (int): Allow up to this many conflicts per connected component for alignment.
-        - link_rt_tol (float): Width of RT tolerance window for linking features.
-        - link_mz_tol (float): m/z tolerance for linking features.
-        - link_charge_merging (str): Charge merging strategy for linking features.
-        - link_adduct_merging (str): Adduct merging strategy for linking features.
-        - distance_RT_exponent (float): Exponent for normalized RT differences.
-        - distance_RT_weight (float): Weight factor for final RT distances.
-        - distance_MZ_exponent (float): Exponent for normalized m/z differences.
-        - distance_MZ_weight (float): Weight factor for final m/z distances.
-        - distance_intensity_exponent (float): Exponent for differences in relative intensity.
-        - distance_intensity_weight (float): Weight factor for final intensity distances.
-        - distance_intensity_log_transform (str): Log-transform intensities.
-        - LOWESS_span (float): Fraction of datapoints for each local regression.
-        - LOWESS_num_iterations (int): Number of robustifying iterations for LOWESS fitting.
-        - LOWESS_delta (float): Parameter for LOWESS computations (negative auto-computes).
-        - LOWESS_interpolation_type (str): Method for interpolation between datapoints.
-        - LOWESS_extrapolation_type (str): Method for extrapolation outside data range.
     """
     # parameters initialization
     params = align_defaults()
@@ -155,145 +61,31 @@ def align(self, **kwargs):
            )
        else:
            self.logger.warning(f"Unknown parameter '{key}' ignored")
-    # end of parameter initialization

    # Store parameters in the Study object
    self.update_history(["align"], params.to_dict())
    self.logger.debug("Parameters stored to align")

-    #
-
-
+    # Ensure rt_original exists before starting alignment (both algorithms need this)
+    if "rt_original" not in self.features_df.columns:
+        # add column 'rt_original' after 'rt'
+        rt_index = self.features_df.columns.get_loc("rt") + 1
+        self.features_df.insert(rt_index, "rt_original", 0)
+        self.features_df["rt_original"] = self.features_df["rt"]
+        self.logger.debug("Created rt_original column from current rt values")

    # Choose alignment algorithm
    algorithm = params.get("algorithm").lower()

    if algorithm == "pc":
-        _align_pose_clustering(self,
+        _align_pose_clustering(self, params)

    elif algorithm == "kd":
-        _align_kd_algorithm(self,
+        _align_kd_algorithm(self, params)
    else:
        self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
-        # Clean up temporary feature maps to release memory
-        del fmaps
        return

-    # check if rt_original exists in features_df, if not, add it after rt
-    if "rt_original" not in self.features_df.columns:
-        # add column 'rt_original' after 'rt'
-        rt_index = self.features_df.columns.get_loc("rt") + 1
-        self.features_df.insert(rt_index, "rt_original", 0)
-        self.features_df["rt_original"] = self.features_df["rt"]
-
-    # iterate through all feature_maps and add the transformed retention times to the features_df
-
-    # Build a fast lookup for (sample_uid, featureUid) to index in features_df
-    feats = self.features_df
-
-    # Pre-build sample_uid lookup for faster access
-    self.logger.debug("Build sample_uid lookup for fast access...")
-    sample_uid_lookup = {
-        idx: row_dict["sample_uid"]
-        for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
-    }
-
-    # Build the main lookup using feature_uid (not feature_id)
-    if "feature_id" in feats.columns:
-        # Create lookup mapping (sample_uid, feature_id) to DataFrame index using Polars
-        # Since we need a pandas-style index lookup, we'll create a simple dict
-        sample_uids = feats.get_column("sample_uid").to_list()
-
-        # Handle feature_id column - it might be Object type due to conversion
-        feature_id_col = feats.get_column("feature_id")
-        if feature_id_col.dtype == pl.Object:
-            # If it's Object type, convert to list and let Python handle the conversion
-            feature_ids = feature_id_col.to_list()
-            # Convert to strings if they're not already
-            feature_ids = [str(fid) if fid is not None else None for fid in feature_ids]
-        else:
-            # Safe to cast normally
-            feature_ids = feature_id_col.cast(pl.Utf8).to_list()
-
-        lookup = {
-            (sample_uid, feature_id): idx
-            for idx, (sample_uid, feature_id) in enumerate(
-                zip(sample_uids, feature_ids, strict=True),
-            )
-        }
-    else:
-        # fallback: skip if feature_uid column missing
-        lookup = {}
-        self.logger.warning("feature_id column not found in features_df")
-
-    # Pre-allocate update lists for better performance
-    all_update_idx = []
-    all_update_rt = []
-    all_update_rt_original = []
-
-    tdqm_disable = self.log_level not in ["TRACE", "DEBUG"]
-
-    for index, fm in tqdm(
-        list(enumerate(fmaps)),
-        total=len(fmaps),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract RTs",
-        disable=tdqm_disable,
-    ):
-        sample_uid = sample_uid_lookup.get(index)
-        if sample_uid is None:
-            continue
-
-        # Collect all updates for this feature map
-        for f in fm:
-            feature_uid = str(f.getUniqueId())
-            idx = lookup.get((sample_uid, feature_uid))
-            if idx is not None:
-                rt = round(f.getRT(), 3)
-                # rt_or = round(f.getMetaValue("original_RT"), 3) if f.metaValueExists("original_RT") else rt
-                all_update_idx.append(idx)
-                all_update_rt.append(rt)
-                # all_update_rt_original.append(rt_or)
-
-    # Single batch update for all features at once
-    if all_update_idx:
-        # Build a full-length Python list of rt values, update specified indices,
-        # then replace the DataFrame column with a Series that has the same length
-        try:
-            current_rt = self.features_df["rt"].to_list()
-        except Exception:
-            current_rt = [None] * self.features_df.height
-
-        # Defensive: ensure list length equals dataframe height
-        if len(current_rt) != self.features_df.height:
-            current_rt = [None] * self.features_df.height
-
-        for idx, new_rt in zip(all_update_idx, all_update_rt):
-            current_rt[idx] = new_rt
-
-        new_cols = [pl.Series("rt", current_rt)]
-
-        # Update rt_original if corresponding updates were collected
-        if "all_update_rt_original" in locals() and all_update_rt_original:
-            try:
-                current_rt_orig = (
-                    self.features_df["rt_original"].to_list()
-                    if "rt_original" in self.features_df.columns
-                    else [None] * self.features_df.height
-                )
-            except Exception:
-                current_rt_orig = [None] * self.features_df.height
-
-            if len(current_rt_orig) != self.features_df.height:
-                current_rt_orig = [None] * self.features_df.height
-
-            for idx, new_orig in zip(all_update_idx, all_update_rt_original):
-                current_rt_orig[idx] = new_orig
-
-            new_cols.append(pl.Series("rt_original", current_rt_orig))
-
-        # Replace columns in one call
-        self.features_df = self.features_df.with_columns(*new_cols)
-
    self.logger.debug("Alignment completed successfully.")

    # Reset consensus data structures after alignment since RT changes invalidate consensus
@@ -307,6 +99,9 @@ def align(self, **kwargs):
    if not self.consensus_ms2.is_empty():
        self.consensus_ms2 = pl.DataFrame()
        consensus_reset_count += 1
+    if not self.id_df.is_empty():
+        self.id_df = pl.DataFrame()
+        consensus_reset_count += 1

    # Remove merge and find_ms2 parameters from history since they need to be re-run
    keys_to_remove = ["merge", "find_ms2"]
@@ -319,17 +114,13 @@ def align(self, **kwargs):
            self.logger.debug(f"Removed {key} from history")

    if consensus_reset_count > 0 or history_removed_count > 0:
-        self.logger.
+        self.logger.debug(
            f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed",
        )

    if params.get("save_features"):
        self.save_samples()

-    # Clean up temporary feature maps to release memory
-    del fmaps
-    self.logger.debug("Temporary feature maps deleted to release memory")
-

 def find_ms2(self, **kwargs):
    """
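Taken together, the hunks above change align() so that feature maps are regenerated on demand inside the algorithm-specific helpers and all consensus-level state (including the new id_df reset) is invalidated after RT changes. For orientation, a minimal usage sketch; the Study setup shown here is hypothetical, and parameter names follow the docstring above:

# Sketch only: assumes a Study with samples and features already loaded;
# the import path and constructor arguments are assumptions, not from the diff.
from masster import Study

study = Study(folder="./my_study")        # hypothetical setup
study.align(algorithm="pc")               # PoseClustering alignment
study.align(algorithm="kd", rt_tol=10.0)  # KD alignment; rt_tol value is illustrative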
@@ -803,12 +594,73 @@ def _find_closest_valley(chrom, rt, dir="left", threshold=0.9):
    return chrom.rt[idx]


-def _align_pose_clustering(study_obj,
+def _align_pose_clustering(study_obj, params):
    """Perform alignment using PoseClustering algorithm."""
    import pyopenms as oms
+    import polars as pl
    from tqdm import tqdm
    from datetime import datetime

+    # Generate temporary feature maps on-demand from features_df for PoseClustering
+    study_obj.logger.debug("Generating feature maps on-demand from features_df for PoseClustering alignment")
+
+    tdqm_disable = study_obj.log_level not in ["TRACE", "DEBUG", "INFO"]
+    fmaps = []
+
+    # Process each sample in order with progress bar
+    for sample_index, row_dict in tqdm(
+        list(enumerate(study_obj.samples_df.iter_rows(named=True))),
+        total=len(study_obj.samples_df),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study_obj.log_label}Generate feature maps",
+        disable=tdqm_disable,
+    ):
+        sample_uid = row_dict["sample_uid"]
+        sample_name = row_dict["sample_name"]
+
+        # Get features for this sample from features_df
+        sample_features = study_obj.features_df.filter(pl.col("sample_uid") == sample_uid)
+
+        # Create new FeatureMap
+        feature_map = oms.FeatureMap()
+
+        # Convert DataFrame features to OpenMS Features
+        for feature_row in sample_features.iter_rows(named=True):
+            feature = oms.Feature()
+
+            # Set properties from DataFrame (handle missing values gracefully)
+            try:
+                # Skip features with missing critical data
+                if feature_row["mz"] is None:
+                    study_obj.logger.warning("Skipping feature due to missing mz")
+                    continue
+                if feature_row["rt"] is None:
+                    study_obj.logger.warning("Skipping feature due to missing rt")
+                    continue
+                if feature_row["inty"] is None:
+                    study_obj.logger.warning("Skipping feature due to missing inty")
+                    continue
+
+                feature.setUniqueId(int(feature_row["feature_id"]))
+                feature.setMZ(float(feature_row["mz"]))
+                feature.setRT(float(feature_row["rt"]))
+                feature.setIntensity(float(feature_row["inty"]))
+
+                # Handle optional fields that might be None
+                if feature_row.get("quality") is not None:
+                    feature.setOverallQuality(float(feature_row["quality"]))
+                if feature_row.get("charge") is not None:
+                    feature.setCharge(int(feature_row["charge"]))
+
+                # Add to feature map
+                feature_map.push_back(feature)
+            except (ValueError, TypeError) as e:
+                study_obj.logger.warning(f"Skipping feature due to conversion error: {e}")
+                continue
+
+        fmaps.append(feature_map)
+
+    study_obj.logger.debug(f"Generated {len(fmaps)} feature maps from features_df for PoseClustering alignment")
+
    # Create PC-specific OpenMS parameters
    params_oms = oms.Param()
    params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
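The block above inlines the features_df-to-FeatureMap conversion that 0.5.1 kept in the now-removed _generate_feature_maps_on_demand_for_align helper. A self-contained sketch of the same conversion pattern, with illustrative column values (requires pyopenms and polars):

import polars as pl
import pyopenms as oms

# Illustrative stand-in for study.features_df
features_df = pl.DataFrame({
    "sample_uid": [1, 1],
    "feature_id": [101, 102],
    "mz": [445.12, 446.13],
    "rt": [120.5, 121.0],
    "inty": [1e6, 2e5],
})

fmap = oms.FeatureMap()
for row in features_df.filter(pl.col("sample_uid") == 1).iter_rows(named=True):
    f = oms.Feature()
    f.setUniqueId(int(row["feature_id"]))
    f.setMZ(float(row["mz"]))
    f.setRT(float(row["rt"]))
    f.setIntensity(float(row["inty"]))
    fmap.push_back(f)  # same push_back API as used in the diff
print(fmap.size())  # -> 2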
@@ -847,7 +699,6 @@ def _align_pose_clustering(study_obj, fmaps, params):
    study_obj.logger.debug(f"Parameters for alignment: {params}")

    # Perform alignment and transformation of feature maps to the reference map (exclude reference map)
-    tdqm_disable = study_obj.log_level not in ["TRACE", "DEBUG", "INFO"]
    for index, fm in tqdm(
        list(enumerate(fmaps)),
        total=len(fmaps),
@@ -879,14 +730,126 @@ def _align_pose_clustering(study_obj, fmaps, params):
            continue

    study_obj.alignment_ref_index = ref_index
+
+    # Process feature maps and update features_df with transformed retention times
+    # Build a fast lookup for (sample_uid, featureUid) to index in features_df
+    feats = study_obj.features_df
+
+    # Pre-build sample_uid lookup for faster access
+    study_obj.logger.debug("Build sample_uid lookup for fast access...")
+    sample_uid_lookup = {
+        idx: row_dict["sample_uid"]
+        for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True))
+    }
+
+    # Build the main lookup using feature_uid (not feature_id)
+    if "feature_id" in feats.columns:
+        # Create lookup mapping (sample_uid, feature_id) to DataFrame index using Polars
+        # Since we need a pandas-style index lookup, we'll create a simple dict
+        sample_uids = feats.get_column("sample_uid").to_list()
+
+        # Handle feature_id column - it might be Object type due to conversion
+        feature_id_col = feats.get_column("feature_id")
+        if feature_id_col.dtype == pl.Object:
+            # If it's Object type, convert to list and let Python handle the conversion
+            feature_ids = feature_id_col.to_list()
+            # Convert to strings if they're not already
+            feature_ids = [str(fid) if fid is not None else None for fid in feature_ids]
+        else:
+            # Safe to cast normally
+            feature_ids = feature_id_col.cast(pl.Utf8).to_list()
+
+        lookup = {
+            (sample_uid, feature_id): idx
+            for idx, (sample_uid, feature_id) in enumerate(
+                zip(sample_uids, feature_ids, strict=True),
+            )
+        }
+    else:
+        # fallback: skip if feature_uid column missing
+        lookup = {}
+        study_obj.logger.warning("feature_id column not found in features_df")
+
+    # Pre-allocate update lists for better performance
+    all_update_idx = []
+    all_update_rt = []
+    all_update_rt_original = []
+
+    for index, fm in tqdm(
+        list(enumerate(fmaps)),
+        total=len(fmaps),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study_obj.log_label}Extract RTs",
+        disable=tdqm_disable,
+    ):
+        sample_uid = sample_uid_lookup.get(index)
+        if sample_uid is None:
+            continue

+        # Collect all updates for this feature map
+        for f in fm:
+            feature_uid = str(f.getUniqueId())
+            idx = lookup.get((sample_uid, feature_uid))
+            if idx is not None:
+                rt = round(f.getRT(), 3)
+                # rt_or = round(f.getMetaValue("original_RT"), 3) if f.metaValueExists("original_RT") else rt
+                all_update_idx.append(idx)
+                all_update_rt.append(rt)
+                # all_update_rt_original.append(rt_or)
+
+    # Single batch update for all features at once
+    if all_update_idx:
+        # Build a full-length Python list of rt values, update specified indices,
+        # then replace the DataFrame column with a Series that has the same length
+        try:
+            current_rt = study_obj.features_df["rt"].to_list()
+        except Exception:
+            current_rt = [None] * study_obj.features_df.height
+
+        # Defensive: ensure list length equals dataframe height
+        if len(current_rt) != study_obj.features_df.height:
+            current_rt = [None] * study_obj.features_df.height
+
+        for idx, new_rt in zip(all_update_idx, all_update_rt):
+            current_rt[idx] = new_rt
+
+        new_cols = [pl.Series("rt", current_rt)]
+
+        # Update rt_original if corresponding updates were collected
+        if "all_update_rt_original" in locals() and all_update_rt_original:
+            try:
+                current_rt_orig = (
+                    study_obj.features_df["rt_original"].to_list()
+                    if "rt_original" in study_obj.features_df.columns
+                    else [None] * study_obj.features_df.height
+                )
+            except Exception:
+                current_rt_orig = [None] * study_obj.features_df.height
+
+            if len(current_rt_orig) != study_obj.features_df.height:
+                current_rt_orig = [None] * study_obj.features_df.height
+
+            for idx, new_orig in zip(all_update_idx, all_update_rt_original):
+                current_rt_orig[idx] = new_orig
+
+            new_cols.append(pl.Series("rt_original", current_rt_orig))
+
+        # Replace columns in one call
+        study_obj.features_df = study_obj.features_df.with_columns(*new_cols)
+
+    # Clean up temporary feature maps to release memory
+    del fmaps
+    study_obj.logger.debug("Temporary feature maps deleted to release memory")

-
+
+def _align_kd_algorithm(study_obj, params):
    """
-    Custom KD-tree / reference-based alignment.
+    Custom KD-tree / reference-based alignment working directly with features_df.
    """
    import bisect
    import statistics
+    import pyopenms as oms
+    import polars as pl
+    from datetime import datetime

    # Pull parameter values - map standard align params to our algorithm
    # Use rt_tol (standard align param) instead of warp_rt_tol for RT tolerance
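The "single batch update" in the hunk above avoids per-row DataFrame writes: it reads the rt column out once, patches it positionally in plain Python, and swaps the whole column back in one with_columns call. A minimal sketch of that pattern on toy data:

import polars as pl

df = pl.DataFrame({"rt": [10.0, 20.0, 30.0, 40.0]})
update_idx = [1, 3]          # positional indices to patch
update_rt = [21.5, 39.8]     # new values

current_rt = df["rt"].to_list()          # one read of the full column
for i, v in zip(update_idx, update_rt):
    current_rt[i] = v                    # cheap in-place list updates
df = df.with_columns(pl.Series("rt", current_rt))  # one write back
print(df["rt"].to_list())  # [10.0, 21.5, 30.0, 39.8]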
@@ -919,26 +882,64 @@ def _align_kd_algorithm(study_obj, fmaps, params):
        _raw_mp = None
    max_points = int(_raw_mp) if _raw_mp is not None else 1000
    study_obj.logger.info(
-        f"
+        f"KD align: rt_tol={params.get('rt_tol')}, max_points={max_points}",
    )

-    #
-    if
-        study_obj.logger.error("No
-        raise ValueError("No
+    # Work directly with features_df instead of feature maps
+    if study_obj.features_df is None or study_obj.features_df.is_empty():
+        study_obj.logger.error("No features_df available for alignment. Cannot proceed with KD alignment.")
+        raise ValueError("No features_df available for alignment. This usually indicates that features were not detected properly.")
+
+    # OPTIMIZATION 1: Group all features by sample_uid in ONE operation instead of filtering repeatedly
+    study_obj.logger.debug("Grouping features efficiently (major speedup)...")
+
+    # rt_original should already exist (created in main align() function)
+    if "rt_original" not in study_obj.features_df.columns:
+        raise ValueError("rt_original column missing - this should have been created by align() function")
+
+    sample_groups = study_obj.features_df.group_by("sample_uid", maintain_order=True)
+    sample_feature_data = sample_groups.agg([
+        pl.len().alias("feature_count"),
+        pl.col("mz").alias("mzs"),
+        pl.col("rt_original").alias("rt_originals")  # Use original RT values for alignment
+    ]).sort("feature_count", descending=True)
+
+    if sample_feature_data.is_empty():
+        study_obj.logger.error("No features found in any sample for alignment.")
+        raise ValueError("No features found in any sample for alignment.")
+
+    # Choose reference sample (sample with most features)
+    ref_sample_uid = sample_feature_data.row(0, named=True)["sample_uid"]
+
+    # Find the index of this sample in samples_df
+    ref_index = None
+    sample_uid_to_index = {}
+    for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True)):
+        sample_uid = row_dict["sample_uid"]
+        sample_uid_to_index[sample_uid] = idx
+        if sample_uid == ref_sample_uid:
+            ref_index = idx
+
+    if ref_index is None:
+        study_obj.logger.error(f"Could not find reference sample {ref_sample_uid} in samples_df")
+        raise ValueError(f"Could not find reference sample {ref_sample_uid} in samples_df")

-    # Choose reference map (largest number of features)
-    ref_index = max(range(len(fmaps)), key=lambda i: fmaps[i].size())
-    ref_map = fmaps[ref_index]
    study_obj.alignment_ref_index = ref_index
-
-
-
-
-
-
+
+    # OPTIMIZATION 2: Get reference features efficiently from pre-grouped data
+    # Always use rt_original for alignment input to ensure consistent results
+    ref_row = sample_feature_data.filter(pl.col("sample_uid") == ref_sample_uid).row(0, named=True)
+    ref_mzs_list = ref_row["mzs"]
+    ref_rts_list = ref_row["rt_originals"]  # Use original RT values
+
+    # Create sorted reference features for binary search
+    ref_features = list(zip(ref_mzs_list, ref_rts_list))
    ref_features.sort(key=lambda x: x[0])
    ref_mzs = [mz for mz, _ in ref_features]
+
+    study_obj.logger.debug(
+        f"Reference sample UID {ref_sample_uid} (index {ref_index}, sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) has {len(ref_features)} features",
+    )

    def find_best_match(mz: float, rt: float):
        mz_tol_abs = mz * ppm_tol * 1e-6
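The group_by/agg step above collects each sample's m/z and original RT values into list columns in a single pass, so the reference sample (the one with the most features) can be picked without re-filtering features_df per sample. A toy sketch of the same Polars pattern:

import polars as pl

features_df = pl.DataFrame({
    "sample_uid": [1, 1, 2, 2, 2],
    "mz": [100.0, 200.0, 100.1, 200.1, 300.1],
    "rt_original": [10.0, 20.0, 10.2, 20.2, 30.2],
})

sample_feature_data = (
    features_df.group_by("sample_uid", maintain_order=True)
    .agg(
        pl.len().alias("feature_count"),
        pl.col("mz").alias("mzs"),
        pl.col("rt_original").alias("rt_originals"),
    )
    .sort("feature_count", descending=True)
)
ref_sample_uid = sample_feature_data.row(0, named=True)["sample_uid"]
print(ref_sample_uid)  # 2 (the sample with the most features)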
@@ -971,45 +972,59 @@ def _align_kd_algorithm(study_obj, fmaps, params):
        except Exception:
            pass

-
+    # OPTIMIZATION 3: Process samples using pre-grouped data (eliminates expensive filtering)
+    transformations = {}

-    for
+    for row in sample_feature_data.iter_rows(named=True):
+        sample_uid = row["sample_uid"]
+        sample_mzs = row["mzs"]
+        sample_rts = row["rt_originals"]  # Use original RT values for alignment input
+
        td = oms.TransformationDescription()
-
-
+        sample_index = sample_uid_to_index.get(sample_uid)
+
+        if sample_index is None:
+            study_obj.logger.warning(f"Sample UID {sample_uid} not found in samples_df, skipping")
+            continue
+
+        # Skip empty samples
+        if not sample_mzs or not sample_rts:
+            transformations[sample_uid] = td
            continue
-
-
-
+
+        # Identity for reference sample
+        if sample_uid == ref_sample_uid:
+            rts = [rt for rt in sample_rts if rt is not None]
            lo, hi = (min(rts), max(rts)) if rts else (0.0, 0.0)
            try:
                _set_pairs(td, [(lo, lo), (hi, hi)])
                td.fitModel("linear", oms.Param())
            except Exception:
                pass
-            transformations
+            transformations[sample_uid] = td
            continue

-        #
+        # OPTIMIZATION 4: Process pairs using pre-loaded data arrays (no DataFrame operations)
        pairs_raw = []
-        for
-
-
-
-
-
+        for mz, rt in zip(sample_mzs, sample_rts):
+            if mz is not None and rt is not None:
+                match = find_best_match(mz, rt)
+                if match:
+                    obs_rt, ref_rt = match
+                    if abs(obs_rt - ref_rt) <= rt_pair_tol:
+                        pairs_raw.append((obs_rt, ref_rt))

        if not pairs_raw:
            # Fallback identity
-            rts = [
+            rts = [rt for rt in sample_rts if rt is not None]
            lo, hi = (min(rts), max(rts)) if rts else (0.0, 0.0)
            try:
                _set_pairs(td, [(lo, lo), (hi, hi)])
                td.fitModel("linear", oms.Param())
            except Exception:
                pass
-            transformations
-            study_obj.logger.debug(f"
+            transformations[sample_uid] = td
+            study_obj.logger.debug(f"Sample {sample_uid}: no anchors -> identity transform")
            continue

        # Deduplicate and downsample
@@ -1041,9 +1056,9 @@ def _align_kd_algorithm(study_obj, fmaps, params):
            td.fitModel(model, oms.Param())
        except Exception as e:
            study_obj.logger.debug(
-                f"
+                f"Sample {sample_uid}: {model} fitting failed ({e}); fallback to linear two-point shift",
            )
-            rts = [
+            rts = [rt for rt in sample_rts if rt is not None]
            lo, hi = (min(rts), max(rts)) if rts else (0.0, 1.0)
            td = oms.TransformationDescription()
            try:
@@ -1053,28 +1068,39 @@ def _align_kd_algorithm(study_obj, fmaps, params):
                pass

        study_obj.logger.debug(
-            f"
+            f"Sample {sample_uid}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s",
        )
-        transformations
+        transformations[sample_uid] = td

-    # Apply transformations
-
-
-
-
-
-
-
-
-
-
-
+    # OPTIMIZATION 5: Apply transformations efficiently using vectorized operations
+    study_obj.logger.debug("Applying RT transformations efficiently...")
+
+    # Apply transformations to RT values starting from rt_original
+    def transform_rt_vectorized(sample_uid: int, rt_original: float) -> float:
+        if sample_uid in transformations and rt_original is not None:
+            try:
+                trafo = transformations[sample_uid]
+                return trafo.apply(float(rt_original))
+            except Exception:
+                return rt_original
+        return rt_original
+
+    # Use Polars' efficient struct operations for vectorized transformation
+    # Apply transformation to rt_original and store result in rt column
+    study_obj.features_df = study_obj.features_df.with_columns(
+        pl.struct(["sample_uid", "rt_original"]).map_elements(
+            lambda x: transform_rt_vectorized(x["sample_uid"], x["rt_original"]),
+            return_dtype=pl.Float64
+        ).alias("rt")
+    )

    study_obj.logger.info(
-        f"Alignment completed. Reference index {ref_index}.",
+        f"Alignment completed. Reference sample UID {ref_sample_uid} (index {ref_index}).",
    )


+
+
 def _align_pose_clustering_fallback(study_obj, fmaps, params):
    """Fallback PoseClustering alignment with minimal parameters."""
    import pyopenms as oms
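Despite the "vectorized" label, the transformation above applies a Python callback row by row via map_elements over a struct of the two input columns; the gain over the old path is a single column rewrite instead of many filtered updates. A toy sketch of the struct/map_elements pattern, with a per-sample shift dict standing in for the fitted OpenMS transforms:

import polars as pl

df = pl.DataFrame({"sample_uid": [1, 1, 2], "rt_original": [10.0, 20.0, 10.0]})
shifts = {1: 0.5, 2: -0.3}  # illustrative stand-in for the fitted transformations

df = df.with_columns(
    pl.struct(["sample_uid", "rt_original"])
    .map_elements(
        lambda x: x["rt_original"] + shifts.get(x["sample_uid"], 0.0),
        return_dtype=pl.Float64,
    )
    .alias("rt")
)
print(df["rt"].to_list())  # [10.5, 20.5, 9.7]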
@@ -1107,7 +1133,7 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
    study_obj.alignment_ref_index = ref_index


-def find_iso(self, rt_tol=0.1, mz_tol=0.01):
+def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
    """
    Find isotope patterns for consensus features by searching raw MS1 data.
    OPTIMIZED VERSION: Each sample file is loaded only once for maximum efficiency.
@@ -1123,6 +1149,7 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):
    Parameters:
        rt_tol (float): RT tolerance for scan matching in seconds
        mz_tol (float): Additional m/z tolerance for isotope matching in Da
+        uids (list, optional): List of consensus_uid values to process. If None, process all consensus features.
    """
    if self.consensus_df is None or self.consensus_df.is_empty():
        self.logger.error("No consensus features found. Please run merge() first.")
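A minimal call sketch for the new uids filter, continuing the hypothetical study object from the earlier sketch (UID values are illustrative; the diff shows that a scalar is wrapped into a list internally):

study.find_iso(rt_tol=0.1, mz_tol=0.01)  # all consensus features
study.find_iso(uids=[42, 57])            # only these consensus_uid values
study.find_iso(uids=7)                   # scalar is wrapped into [7] internally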
@@ -1148,6 +1175,20 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):

    self.logger.info("Extracting isotopomers from raw MS1 data...")

+    # Filter consensus features if uids is specified
+    if uids is not None:
+        if not isinstance(uids, (list, tuple)):
+            uids = [uids]
+        # Filter consensus_df to only process specified UIDs
+        consensus_df_filtered = self.consensus_df.filter(pl.col("consensus_uid").is_in(uids))
+        if consensus_df_filtered.is_empty():
+            self.logger.warning(f"No consensus features found with specified UIDs: {uids}")
+            return
+        self.logger.debug(f"Processing {len(consensus_df_filtered)} consensus features (UIDs: {uids})")
+    else:
+        consensus_df_filtered = self.consensus_df
+        self.logger.debug(f"Processing all {len(consensus_df_filtered)} consensus features")
+
    # Isotope mass shifts to search for (up to 7x 13C isotopes)
    isotope_shifts = [
        0.33,
@@ -1169,7 +1210,13 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):
    self.logger.debug("Building sample-to-consensus mapping using vectorized operations...")

    # Step 1: Join consensus_mapping with features to get intensities in one operation
-
+    # Apply UID filtering if specified
+    if uids is not None:
+        consensus_mapping_filtered = self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))
+    else:
+        consensus_mapping_filtered = self.consensus_mapping_df
+
+    consensus_with_features = consensus_mapping_filtered.join(
        self.features_df.select(['feature_uid', 'sample_uid', 'inty', 'mz', 'rt', 'rt_original']),
        on=['feature_uid', 'sample_uid'],
        how='left'
@@ -1214,19 +1261,19 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):

    # Initialize failed consensus features (those not in the mapping)
    processed_consensus_uids = set(best_features_with_paths['consensus_uid'].to_list())
-    for consensus_row in
+    for consensus_row in consensus_df_filtered.iter_rows(named=True):
        consensus_uid = consensus_row["consensus_uid"]
        if consensus_uid not in processed_consensus_uids:
            consensus_iso_data[consensus_uid] = None

-    self.logger.debug(f"Will read {len(sample_to_consensus)} unique sample files for {len(
+    self.logger.debug(f"Will read {len(sample_to_consensus)} unique sample files for {len(consensus_df_filtered)} consensus features")

    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

    # OPTIMIZATION 2: Process by sample file (load each file only once)
    for sample_path, consensus_list in tqdm(
        sample_to_consensus.items(),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Process from files",
        disable=tdqm_disable,
    ):
        try:
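The loop above iterates a sample_to_consensus mapping so each raw file is opened once, however many consensus features it contributes to. The diff does not show how that mapping is built; one plausible construction of such a path-to-items mapping, purely as an assumed sketch:

from collections import defaultdict

# Toy stand-ins for (sample file, consensus feature) pairs
pairs = [("a.mzML", 1), ("a.mzML", 2), ("b.mzML", 3)]

sample_to_consensus = defaultdict(list)
for path, consensus_uid in pairs:
    sample_to_consensus[path].append(consensus_uid)

for path, consensus_list in sample_to_consensus.items():
    # open `path` once, then handle every consensus feature that needs it
    print(path, consensus_list)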
@@ -1245,6 +1292,16 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):
            base_mz = best_feature["mz"]
            original_rt = best_feature.get("rt_original", best_feature["rt"])

+            # Skip if RT or mz is None or invalid
+            if original_rt is None:
+                original_rt = best_feature["rt"]
+                self.logger.debug(f"original_rt is None. Using aligned rt instead")
+
+            if base_mz is None:
+                self.logger.warning(f"Skipping consensus_uid {consensus_uid}: base_mz is None")
+                consensus_iso_data[consensus_uid] = None
+                continue
+
            # Find MS1 scans near the original RT
            rt_min = original_rt - rt_tol
            rt_max = original_rt + rt_tol