masster 0.4.4-py3-none-any.whl → 0.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.
Files changed (39)
  1. masster/__init__.py +8 -8
  2. masster/chromatogram.py +1 -1
  3. masster/data/libs/urine.csv +3 -3
  4. masster/logger.py +11 -11
  5. masster/sample/__init__.py +1 -1
  6. masster/sample/adducts.py +338 -264
  7. masster/sample/defaults/find_adducts_def.py +21 -8
  8. masster/sample/h5.py +561 -282
  9. masster/sample/helpers.py +131 -75
  10. masster/sample/lib.py +4 -4
  11. masster/sample/load.py +31 -17
  12. masster/sample/parameters.py +1 -1
  13. masster/sample/plot.py +7 -7
  14. masster/sample/processing.py +117 -87
  15. masster/sample/sample.py +103 -90
  16. masster/sample/sample5_schema.json +44 -44
  17. masster/sample/save.py +35 -12
  18. masster/spectrum.py +1 -1
  19. masster/study/__init__.py +1 -1
  20. masster/study/defaults/align_def.py +5 -1
  21. masster/study/defaults/identify_def.py +3 -1
  22. masster/study/defaults/study_def.py +58 -25
  23. masster/study/export.py +360 -210
  24. masster/study/h5.py +560 -158
  25. masster/study/helpers.py +496 -203
  26. masster/study/helpers_optimized.py +1 -1
  27. masster/study/id.py +538 -349
  28. masster/study/load.py +233 -143
  29. masster/study/plot.py +71 -71
  30. masster/study/processing.py +456 -254
  31. masster/study/save.py +15 -5
  32. masster/study/study.py +213 -131
  33. masster/study/study5_schema.json +149 -149
  34. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/METADATA +3 -1
  35. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/RECORD +39 -39
  36. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
  37. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
  38. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
  39. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,7 @@ import pyopenms as oms

  from tqdm import tqdm

- from masster.study.defaults import (
+ from master.study.defaults import (
  align_defaults,
  find_ms2_defaults,
  integrate_defaults,
@@ -115,7 +115,8 @@ def align(self, **kwargs):
  # Pre-build sample_uid lookup for faster access
  self.logger.debug("Build sample_uid lookup for fast access...")
  sample_uid_lookup = {
- idx: row_dict["sample_uid"] for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
+ idx: row_dict["sample_uid"]
+ for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
  }

  # Build the main lookup using feature_uid (not feature_id)
@@ -215,7 +216,7 @@ def align(self, **kwargs):
  self.features_df = self.features_df.with_columns(*new_cols)

  self.logger.debug("Alignment completed successfully.")
-
+
  # Reset consensus data structures after alignment since RT changes invalidate consensus
  consensus_reset_count = 0
  if not self.consensus_df.is_empty():
@@ -227,7 +228,7 @@ def align(self, **kwargs):
  if not self.consensus_ms2.is_empty():
  self.consensus_ms2 = pl.DataFrame()
  consensus_reset_count += 1
-
+
  # Remove merge and find_ms2 parameters from history since they need to be re-run
  keys_to_remove = ["merge", "find_ms2"]
  history_removed_count = 0
@@ -237,9 +238,11 @@ def align(self, **kwargs):
  del self.history[key]
  history_removed_count += 1
  self.logger.debug(f"Removed {key} from history")
-
+
  if consensus_reset_count > 0 or history_removed_count > 0:
- self.logger.info(f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed")
+ self.logger.info(
+ f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed",
+ )

  if params.get("save_features"):
  self.save_samples()
@@ -290,7 +293,10 @@ def merge(self, **kwargs):
  algorithm = params.get("algorithm")
  min_samples = params.get("min_samples")
  link_ms2 = params.get("link_ms2")
- mz_tol = kwargs.get("mz_tol", 0.01) # Default values for parameters not in defaults class
+ mz_tol = kwargs.get(
+ "mz_tol",
+ 0.01,
+ ) # Default values for parameters not in defaults class
  rt_tol = kwargs.get("rt_tol", 1.0)

  if len(self.samples_df) > 200 and algorithm == "qt":
@@ -399,7 +405,10 @@ def merge(self, **kwargs):
  consensus_map.setUniqueIds()

  # create a dict to map uid to feature_uid using self.features_df
- feature_uid_map = {row["feature_id"]: row["feature_uid"] for row in self.features_df.iter_rows(named=True)}
+ feature_uid_map = {
+ row["feature_id"]: row["feature_uid"]
+ for row in self.features_df.iter_rows(named=True)
+ }
  imax = consensus_map.size()

  # Pre-build fast lookup tables for features_df data
@@ -426,7 +435,9 @@ def merge(self, **kwargs):

  for row in self.features_df.iter_rows(named=True):
  feature_uid = row["feature_uid"]
- features_lookup[feature_uid] = {col: row[col] for col in feature_columns if col in self.features_df.columns}
+ features_lookup[feature_uid] = {
+ col: row[col] for col in feature_columns if col in self.features_df.columns
+ }

  # create a list to store the consensus mapping
  consensus_mapping = []
@@ -453,11 +464,13 @@ def merge(self, **kwargs):
  # this is a feature that was removed but is still in the feature maps
  continue
  fuid = feature_uid_map[fuid]
- consensus_mapping.append({
- "consensus_uid": i,
- "sample_uid": f.getMapIndex() + 1,
- "feature_uid": fuid,
- })
+ consensus_mapping.append(
+ {
+ "consensus_uid": i,
+ "sample_uid": f.getMapIndex() + 1,
+ "feature_uid": fuid,
+ },
+ )
  uids.append(fuid)

  # Get feature data from lookup instead of DataFrame filtering
@@ -471,43 +484,99 @@ def merge(self, **kwargs):

  # Compute statistics using vectorized operations on collected data
  # Convert to numpy arrays for faster computation
- rt_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
- mz_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
- rt_start_values = np.array([
- fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None
- ])
- rt_end_values = np.array([fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None])
- rt_delta_values = np.array([
- fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None
- ])
- mz_start_values = np.array([
- fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None
- ])
- mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
- inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
- coherence_values = np.array([
- fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None
- ])
- prominence_values = np.array([
- fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None
- ])
- prominence_scaled_values = np.array([
- fd.get("chrom_prominence_scaled", 0)
- for fd in feature_data_list
- if fd.get("chrom_prominence_scaled") is not None
- ])
- height_scaled_values = np.array([
- fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None
- ])
- iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
- charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])
+ rt_values = np.array(
+ [fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None],
+ )
+ mz_values = np.array(
+ [fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
+ )
+ rt_start_values = np.array(
+ [
+ fd.get("rt_start", 0)
+ for fd in feature_data_list
+ if fd.get("rt_start") is not None
+ ],
+ )
+ rt_end_values = np.array(
+ [
+ fd.get("rt_end", 0)
+ for fd in feature_data_list
+ if fd.get("rt_end") is not None
+ ],
+ )
+ rt_delta_values = np.array(
+ [
+ fd.get("rt_delta", 0)
+ for fd in feature_data_list
+ if fd.get("rt_delta") is not None
+ ],
+ )
+ mz_start_values = np.array(
+ [
+ fd.get("mz_start", 0)
+ for fd in feature_data_list
+ if fd.get("mz_start") is not None
+ ],
+ )
+ mz_end_values = np.array(
+ [
+ fd.get("mz_end", 0)
+ for fd in feature_data_list
+ if fd.get("mz_end") is not None
+ ],
+ )
+ inty_values = np.array(
+ [
+ fd.get("inty", 0)
+ for fd in feature_data_list
+ if fd.get("inty") is not None
+ ],
+ )
+ coherence_values = np.array(
+ [
+ fd.get("chrom_coherence", 0)
+ for fd in feature_data_list
+ if fd.get("chrom_coherence") is not None
+ ],
+ )
+ prominence_values = np.array(
+ [
+ fd.get("chrom_prominence", 0)
+ for fd in feature_data_list
+ if fd.get("chrom_prominence") is not None
+ ],
+ )
+ prominence_scaled_values = np.array(
+ [
+ fd.get("chrom_prominence_scaled", 0)
+ for fd in feature_data_list
+ if fd.get("chrom_prominence_scaled") is not None
+ ],
+ )
+ height_scaled_values = np.array(
+ [
+ fd.get("chrom_height_scaled", 0)
+ for fd in feature_data_list
+ if fd.get("chrom_height_scaled") is not None
+ ],
+ )
+ iso_values = np.array(
+ [fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
+ )
+ charge_values = np.array(
+ [
+ fd.get("charge", 0)
+ for fd in feature_data_list
+ if fd.get("charge") is not None
+ ],
+ )

  # adduct_values
  # Collect all adducts from feature_data_list to create consensus adduct information
  # Only consider adducts that are in study._get_adducts() plus items with '?'
  all_adducts = []
  adduct_masses = {}
-
+
  # Get valid adducts from study._get_adducts()
  valid_adducts = set()
  try:
@@ -516,7 +585,7 @@ def merge(self, **kwargs):
  valid_adducts.update(study_adducts_df["name"].to_list())
  except Exception as e:
  self.logger.warning(f"Could not retrieve study adducts: {e}")
-
+
  # Always allow '?' adducts
  valid_adducts.add("?")

@@ -527,7 +596,7 @@ def merge(self, **kwargs):

  if adduct is not None:
  # Only include adducts that are valid (from study._get_adducts() or contain '?')
- if adduct in valid_adducts or '?' in adduct:
+ if adduct in valid_adducts or "?" in adduct:
  all_adducts.append(adduct)
  if adduct_mass is not None:
  adduct_masses[adduct] = adduct_mass
@@ -535,33 +604,37 @@ def merge(self, **kwargs):
  # Calculate adduct_values for the consensus feature
  adduct_values = []
  if all_adducts:
- adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
+ adduct_counts = {
+ adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
+ }
  total_count = sum(adduct_counts.values())
  for adduct, count in adduct_counts.items():
  percentage = (count / total_count) * 100 if total_count > 0 else 0
  mass = adduct_masses.get(adduct, None)
  # Store as list with [name, num, %] format for the adducts column
- adduct_values.append([
- str(adduct),
- int(count),
- float(round(percentage, 2))
- ])
+ adduct_values.append(
+ [
+ str(adduct),
+ int(count),
+ float(round(percentage, 2)),
+ ],
+ )

  # Sort adduct_values by count in descending order
  adduct_values.sort(key=lambda x: x[1], reverse=True) # Sort by count (index 1)
  # Store adduct_values for use in metadata
  consensus_adduct_values = adduct_values
-
+
  # Extract top adduct information for new columns
  adduct_top = None
  adduct_charge_top = None
  adduct_mass_neutral_top = None
  adduct_mass_shift_top = None
-
+
  if consensus_adduct_values:
  top_adduct_name = consensus_adduct_values[0][0] # Get top adduct name
  adduct_top = top_adduct_name
-
+
  # Parse adduct information to extract charge and mass shift
  # Handle "?" as "H" and parse common adduct formats
  if top_adduct_name == "?" or top_adduct_name == "[M+?]+":
@@ -577,33 +650,37 @@ def merge(self, **kwargs):
  study_adducts_df = self._get_adducts()
  if not study_adducts_df.is_empty():
  # Look for exact match in study adducts
- matching_adduct = study_adducts_df.filter(pl.col("name") == top_adduct_name)
+ matching_adduct = study_adducts_df.filter(
+ pl.col("name") == top_adduct_name,
+ )
  if not matching_adduct.is_empty():
  adduct_row = matching_adduct.row(0, named=True)
  adduct_charge_top = adduct_row["charge"]
  adduct_mass_shift_top = adduct_row["mass_shift"]
  adduct_found = True
  except Exception as e:
- self.logger.warning(f"Could not lookup adduct in study adducts: {e}")
-
+ self.logger.warning(
+ f"Could not lookup adduct in study adducts: {e}",
+ )
+
  if not adduct_found:
  # Fallback to regex parsing
  import re
-
+
  # Pattern for adducts like [M+H]+, [M-H]-, [M+Na]+, etc.
- pattern = r'\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])'
+ pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
  match = re.match(pattern, top_adduct_name)
-
+
  if match:
  sign = match.group(1)
  element = match.group(2)
  multiplier_str = match.group(3)
  charge_sign = match.group(4)
-
+
  multiplier = int(multiplier_str) if multiplier_str else 1
  charge = multiplier if charge_sign == "+" else -multiplier
  adduct_charge_top = charge
-
+
  # Calculate mass shift based on element
  element_masses = {
  "H": 1.007825,
@@ -617,9 +694,16 @@ def merge(self, **kwargs):
  "CH3COO": 59.013851,
  "H2O": 18.010565,
  }
-
- base_mass = element_masses.get(element, 1.007825) # Default to H if unknown
- mass_shift = base_mass * multiplier if sign == "+" else -base_mass * multiplier
+
+ base_mass = element_masses.get(
+ element,
+ 1.007825,
+ ) # Default to H if unknown
+ mass_shift = (
+ base_mass * multiplier
+ if sign == "+"
+ else -base_mass * multiplier
+ )
  adduct_mass_shift_top = mass_shift
  else:
  # Default fallback
@@ -627,8 +711,8 @@ def merge(self, **kwargs):
  adduct_mass_shift_top = 1.007825
  else:
  # No valid adducts found - assign default based on study polarity
- study_polarity = getattr(self, 'polarity', 'positive')
- if study_polarity in ['negative', 'neg']:
+ study_polarity = getattr(self, "polarity", "positive")
+ if study_polarity in ["negative", "neg"]:
  # Negative mode default
  adduct_top = "[M-?]1-"
  adduct_charge_top = -1
@@ -638,14 +722,18 @@ def merge(self, **kwargs):
  adduct_top = "[M+?]1+"
  adduct_charge_top = 1
  adduct_mass_shift_top = 1.007825 # H mass (gain of proton)
-
+
  # Create a single default adduct entry in the adducts list for consistency
  consensus_adduct_values = [[adduct_top, 1, 100.0]]
-
+
  # Calculate neutral mass from consensus mz (for both cases)
- consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
+ consensus_mz = (
+ round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
+ )
  if adduct_charge_top and adduct_mass_shift_top is not None:
- adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
+ adduct_mass_neutral_top = (
+ consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
+ )

  # Calculate number of MS2 spectra
  ms2_count = 0
@@ -654,48 +742,95 @@ def merge(self, **kwargs):
  if ms2_scans is not None:
  ms2_count += len(ms2_scans)

- metadata_list.append({
- "consensus_uid": int(i), # "consensus_id": i,
- "consensus_id": str(feature.getUniqueId()),
- "quality": round(float(feature.getQuality()), 3),
- "number_samples": len(feature_data_list),
- # "number_ext": int(len(features_list)),
- "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
- "mz": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
- "rt_min": round(float(np.min(rt_values)), 3) if len(rt_values) > 0 else 0.0,
- "rt_max": round(float(np.max(rt_values)), 3) if len(rt_values) > 0 else 0.0,
- "rt_mean": round(float(np.mean(rt_values)), 3) if len(rt_values) > 0 else 0.0,
- "rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
- "rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
- "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
- "mz_min": round(float(np.min(mz_values)), 4) if len(mz_values) > 0 else 0.0,
- "mz_max": round(float(np.max(mz_values)), 4) if len(mz_values) > 0 else 0.0,
- "mz_mean": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
- "mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
- "mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
- "inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
- "bl": -1.0,
- "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
- "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
- "chrom_prominence_scaled_mean": round(
- float(np.mean(prominence_scaled_values)),
- 3,
- )
- if len(prominence_scaled_values) > 0
- else 0.0,
- "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3)
- if len(height_scaled_values) > 0
- else 0.0,
- "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
- "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
- "number_ms2": int(ms2_count),
- "adducts": consensus_adduct_values if consensus_adduct_values else [], # Ensure it's always a list
- # New columns for top-ranked adduct information
- "adduct_top": adduct_top,
- "adduct_charge_top": adduct_charge_top,
- "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
- "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
- })
+ metadata_list.append(
+ {
+ "consensus_uid": int(i), # "consensus_id": i,
+ "consensus_id": str(feature.getUniqueId()),
+ "quality": round(float(feature.getQuality()), 3),
+ "number_samples": len(feature_data_list),
+ # "number_ext": int(len(features_list)),
+ "rt": round(float(np.mean(rt_values)), 4)
+ if len(rt_values) > 0
+ else 0.0,
+ "mz": round(float(np.mean(mz_values)), 4)
+ if len(mz_values) > 0
+ else 0.0,
+ "rt_min": round(float(np.min(rt_values)), 3)
+ if len(rt_values) > 0
+ else 0.0,
+ "rt_max": round(float(np.max(rt_values)), 3)
+ if len(rt_values) > 0
+ else 0.0,
+ "rt_mean": round(float(np.mean(rt_values)), 3)
+ if len(rt_values) > 0
+ else 0.0,
+ "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
+ if len(rt_start_values) > 0
+ else 0.0,
+ "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
+ if len(rt_end_values) > 0
+ else 0.0,
+ "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
+ if len(rt_delta_values) > 0
+ else 0.0,
+ "mz_min": round(float(np.min(mz_values)), 4)
+ if len(mz_values) > 0
+ else 0.0,
+ "mz_max": round(float(np.max(mz_values)), 4)
+ if len(mz_values) > 0
+ else 0.0,
+ "mz_mean": round(float(np.mean(mz_values)), 4)
+ if len(mz_values) > 0
+ else 0.0,
+ "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
+ if len(mz_start_values) > 0
+ else 0.0,
+ "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
+ if len(mz_end_values) > 0
+ else 0.0,
+ "inty_mean": round(float(np.mean(inty_values)), 0)
+ if len(inty_values) > 0
+ else 0.0,
+ "bl": -1.0,
+ "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
+ if len(coherence_values) > 0
+ else 0.0,
+ "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0)
+ if len(prominence_values) > 0
+ else 0.0,
+ "chrom_prominence_scaled_mean": round(
+ float(np.mean(prominence_scaled_values)),
+ 3,
+ )
+ if len(prominence_scaled_values) > 0
+ else 0.0,
+ "chrom_height_scaled_mean": round(
+ float(np.mean(height_scaled_values)),
+ 3,
+ )
+ if len(height_scaled_values) > 0
+ else 0.0,
+ "iso_mean": round(float(np.mean(iso_values)), 2)
+ if len(iso_values) > 0
+ else 0.0,
+ "charge_mean": round(float(np.mean(charge_values)), 2)
+ if len(charge_values) > 0
+ else 0.0,
+ "number_ms2": int(ms2_count),
+ "adducts": consensus_adduct_values
+ if consensus_adduct_values
+ else [], # Ensure it's always a list
+ # New columns for top-ranked adduct information
+ "adduct_top": adduct_top,
+ "adduct_charge_top": adduct_charge_top,
+ "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
+ if adduct_mass_neutral_top is not None
+ else None,
+ "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
+ if adduct_mass_shift_top is not None
+ else None,
+ },
+ )

  consensus_mapping_df = pl.DataFrame(consensus_mapping)
  # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
@@ -736,72 +871,74 @@ def merge(self, **kwargs):
  )

  self.consensus_map = consensus_map
-
+
  # Add adduct grouping and adduct_of assignment
  if len(self.consensus_df) > 0:
  # Get rt_tol and mz_tol from kwargs or use defaults from merge_defaults
  adduct_rt_tol = rt_tol # Use the same rt_tol from merge parameters
  adduct_mz_tol = mz_tol # Use the same mz_tol from merge parameters
-
+
  # Initialize new columns
  adduct_group_list = []
  adduct_of_list = []
-
+
  # Get relevant columns for grouping
  consensus_data = []
  for row in self.consensus_df.iter_rows(named=True):
- consensus_data.append({
- "consensus_uid": row["consensus_uid"],
- "rt": row["rt"],
- "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
- "adduct_top": row.get("adduct_top"),
- "inty_mean": row.get("inty_mean", 0),
- })
-
+ consensus_data.append(
+ {
+ "consensus_uid": row["consensus_uid"],
+ "rt": row["rt"],
+ "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
+ "adduct_top": row.get("adduct_top"),
+ "inty_mean": row.get("inty_mean", 0),
+ },
+ )
+
  # Group features with similar neutral mass and RT
  group_id = 1
  assigned_groups = {} # consensus_uid -> group_id
  groups = {} # group_id -> [consensus_uids]
-
+
  for i, feature in enumerate(consensus_data):
  consensus_uid = feature["consensus_uid"]
-
+
  if consensus_uid in assigned_groups:
  continue
-
+
  neutral_mass = feature["adduct_mass_neutral_top"]
  rt = feature["rt"]
-
+
  # Skip if neutral mass is None
  if neutral_mass is None:
  assigned_groups[consensus_uid] = 0 # No group assignment
  continue
-
+
  # Find all features that could belong to the same group
  group_members = [consensus_uid]
-
+
  for j, other_feature in enumerate(consensus_data):
  if i == j:
  continue
-
+
  other_uid = other_feature["consensus_uid"]
  if other_uid in assigned_groups:
  continue
-
+
  other_neutral_mass = other_feature["adduct_mass_neutral_top"]
  other_rt = other_feature["rt"]
-
+
  if other_neutral_mass is None:
  continue
-
+
  # Check if features have similar neutral mass and RT
  mass_diff = abs(neutral_mass - other_neutral_mass)
  rt_diff = abs(rt - other_rt) / 60.0 # Convert to minutes for rt_tol
-
+
  if mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol:
  group_members.append(other_uid)
  assigned_groups[other_uid] = group_id
-
+
  if len(group_members) > 1:
  # Multiple members - create a group
  for member_uid in group_members:
@@ -813,26 +950,29 @@ def merge(self, **kwargs):
  assigned_groups[consensus_uid] = group_id
  groups[group_id] = [consensus_uid]
  group_id += 1
-
+
  # Determine adduct_of for each group
  group_adduct_of = {} # group_id -> consensus_uid of most important adduct
-
+
  for grp_id, member_uids in groups.items():
  # Find the most important adduct in this group
  # Priority: [M+H]+ > [M-H]- > highest intensity
  best_uid = None
  best_priority = -1
  best_intensity = 0
-
+
  for uid in member_uids:
  # Find the feature data
- feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
+ feature_data = next(
+ (f for f in consensus_data if f["consensus_uid"] == uid),
+ None,
+ )
  if not feature_data:
  continue
-
+
  adduct = feature_data.get("adduct_top", "")
  intensity = feature_data.get("inty_mean", 0)
-
+
  priority = 0
  if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
  priority = 3 # Highest priority for [M+H]+ or H
@@ -840,34 +980,41 @@ def merge(self, **kwargs):
  priority = 2 # Second priority for [M-H]-
  elif adduct and "M" in adduct:
  priority = 1 # Third priority for other molecular adducts
-
+
  # Choose based on priority first, then intensity
- if (priority > best_priority or
- (priority == best_priority and intensity > best_intensity)):
+ if priority > best_priority or (
+ priority == best_priority and intensity > best_intensity
+ ):
  best_uid = uid
  best_priority = priority
  best_intensity = intensity
-
+
  group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
+
  # Build the final lists in the same order as consensus_df
  for row in self.consensus_df.iter_rows(named=True):
  consensus_uid = row["consensus_uid"]
  group = assigned_groups.get(consensus_uid, 0)
  adduct_of = group_adduct_of.get(group, consensus_uid)
-
+
  adduct_group_list.append(group)
  adduct_of_list.append(adduct_of)
-
+
  # Add the new columns to consensus_df
- self.consensus_df = self.consensus_df.with_columns([
- pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
- pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
- ])
-
+ self.consensus_df = self.consensus_df.with_columns(
+ [
+ pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
+ pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
+ ],
+ )
+
  # calculate the completeness of the consensus map
  if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
- c = len(self.consensus_mapping_df) / len(self.consensus_df) / len(self.samples_df)
+ c = (
+ len(self.consensus_mapping_df)
+ / len(self.consensus_df)
+ / len(self.samples_df)
+ )
  self.logger.info(
  f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
  )
@@ -938,7 +1085,9 @@ def find_ms2(self, **kwargs):
  ]
  for row in feats.iter_rows(named=True):
  feature_uid = row["feature_uid"]
- feature_lookup[feature_uid] = {col: row[col] for col in relevant_cols if col in feats.columns}
+ feature_lookup[feature_uid] = {
+ col: row[col] for col in relevant_cols if col in feats.columns
+ }
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

  # Process consensus mapping in batch
@@ -960,20 +1109,26 @@ def find_ms2(self, **kwargs):
  for j in range(len(ms2_specs)):
  spec = ms2_specs[j]
  scanid = ms2_scans[j]
- data.append({
- "consensus_uid": int(mapping_row["consensus_uid"]),
- "feature_uid": int(mapping_row["feature_uid"]),
- "sample_uid": int(mapping_row["sample_uid"]),
- "scan_id": int(scanid),
- "energy": round(spec.energy, 1) if hasattr(spec, "energy") and spec.energy is not None else None,
- "prec_inty": round(inty, 0) if inty is not None else None,
- "prec_coherence": round(chrom_coherence, 3) if chrom_coherence is not None else None,
- "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
- if chrom_prominence_scaled is not None
- else None,
- "number_frags": len(spec.mz),
- "spec": spec,
- })
+ data.append(
+ {
+ "consensus_uid": int(mapping_row["consensus_uid"]),
+ "feature_uid": int(mapping_row["feature_uid"]),
+ "sample_uid": int(mapping_row["sample_uid"]),
+ "scan_id": int(scanid),
+ "energy": round(spec.energy, 1)
+ if hasattr(spec, "energy") and spec.energy is not None
+ else None,
+ "prec_inty": round(inty, 0) if inty is not None else None,
+ "prec_coherence": round(chrom_coherence, 3)
+ if chrom_coherence is not None
+ else None,
+ "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
+ if chrom_prominence_scaled is not None
+ else None,
+ "number_frags": len(spec.mz),
+ "spec": spec,
+ },
+ )
  self.consensus_ms2 = pl.DataFrame(data)
  if not self.consensus_ms2.is_empty():
  unique_consensus_features = self.consensus_ms2["consensus_uid"].n_unique()
@@ -1006,7 +1161,10 @@ def filter_consensus(
  else:
  if isinstance(coherence, tuple) and len(coherence) == 2:
  min_coherence, max_coherence = coherence
- cons = cons[(cons["chrom_coherence"] >= min_coherence) & (cons["chrom_coherence"] <= max_coherence)]
+ cons = cons[
+ (cons["chrom_coherence"] >= min_coherence)
+ & (cons["chrom_coherence"] <= max_coherence)
+ ]
  else:
  cons = cons[cons["chrom_coherence"] >= coherence]
  after_coherence = len(cons)
@@ -1017,7 +1175,9 @@ def filter_consensus(
  if quality is not None:
  if isinstance(quality, tuple) and len(quality) == 2:
  min_quality, max_quality = quality
- cons = cons[(cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)]
+ cons = cons[
+ (cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)
+ ]
  else:
  cons = cons[cons["quality"] >= quality]
  after_quality = len(cons)
@@ -1028,7 +1188,10 @@ def filter_consensus(
  if number_samples is not None:
  if isinstance(number_samples, tuple) and len(number_samples) == 2:
  min_number, max_number = number_samples
- cons = cons[(cons["number_samples"] >= min_number) & (cons["number_samples"] <= max_number)]
+ cons = cons[
+ (cons["number_samples"] >= min_number)
+ & (cons["number_samples"] <= max_number)
+ ]
  else:
  cons = cons[cons["number_samples"] >= number_samples]
  after_number_samples = len(cons)
@@ -1105,11 +1268,13 @@ def _integrate_chrom_impl(self, **kwargs):

  # Merge consensus_mapping with consensus_df to get rt_start_mean and rt_end_mean
  # Use Polars join operation instead of pandas merge
- consensus_subset = self.consensus_df.select([
- "consensus_uid",
- "rt_start_mean",
- "rt_end_mean",
- ])
+ consensus_subset = self.consensus_df.select(
+ [
+ "consensus_uid",
+ "rt_start_mean",
+ "rt_end_mean",
+ ],
+ )
  df1 = self.consensus_mapping_df.join(
  consensus_subset,
  on="consensus_uid",
@@ -1154,9 +1319,9 @@ def _integrate_chrom_impl(self, **kwargs):
  if chrom is None or len(chrom) == 0:
  update_rows.append(row_idx)
  chroms.append(None)
- rt_starts.append(float('nan'))
- rt_ends.append(float('nan'))
- rt_deltas.append(float('nan'))
+ rt_starts.append(float("nan"))
+ rt_ends.append(float("nan"))
+ rt_deltas.append(float("nan"))
  chrom_areas.append(-1.0)
  continue
  ## TODO expose parameters
@@ -1186,9 +1351,13 @@ def _integrate_chrom_impl(self, **kwargs):
  if update_rows:
  # Create mapping from row index to new values
  row_to_chrom = {update_rows[i]: chroms[i] for i in range(len(update_rows))}
- row_to_rt_start = {update_rows[i]: rt_starts[i] for i in range(len(update_rows))}
+ row_to_rt_start = {
+ update_rows[i]: rt_starts[i] for i in range(len(update_rows))
+ }
  row_to_rt_end = {update_rows[i]: rt_ends[i] for i in range(len(update_rows))}
- row_to_rt_delta = {update_rows[i]: rt_deltas[i] for i in range(len(update_rows))}
+ row_to_rt_delta = {
+ update_rows[i]: rt_deltas[i] for i in range(len(update_rows))
+ }
  row_to_chrom_area = {
  update_rows[i]: float(chrom_areas[i]) if chrom_areas[i] is not None else 0.0
  for i in range(len(update_rows))
@@ -1202,58 +1371,60 @@ def _integrate_chrom_impl(self, **kwargs):

  # Update columns conditionally
  try:
- self.features_df = df_with_index.with_columns([
- # Update chrom column - use when() to update only specific rows
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_chrom.get(x, None),
- return_dtype=pl.Object,
- ),
- )
- .otherwise(pl.col("chrom"))
- .alias("chrom"),
- # Update rt_start column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_rt_start.get(x, None),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("rt_start"))
- .alias("rt_start"),
- # Update rt_end column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_rt_end.get(x, None),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("rt_end"))
- .alias("rt_end"),
- # Update rt_delta column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_rt_delta.get(x, None),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("rt_delta"))
- .alias("rt_delta"),
- # Update chrom_area column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_chrom_area.get(x, 0),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("chrom_area"))
- .alias("chrom_area"),
- ]).drop("__row_idx") # Remove the temporary row index column
+ self.features_df = df_with_index.with_columns(
+ [
+ # Update chrom column - use when() to update only specific rows
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_chrom.get(x, None),
+ return_dtype=pl.Object,
+ ),
+ )
+ .otherwise(pl.col("chrom"))
+ .alias("chrom"),
+ # Update rt_start column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_rt_start.get(x, None),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("rt_start"))
+ .alias("rt_start"),
+ # Update rt_end column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_rt_end.get(x, None),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("rt_end"))
+ .alias("rt_end"),
+ # Update rt_delta column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_rt_delta.get(x, None),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("rt_delta"))
+ .alias("rt_delta"),
+ # Update chrom_area column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_chrom_area.get(x, 0),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("chrom_area"))
+ .alias("chrom_area"),
+ ],
+ ).drop("__row_idx") # Remove the temporary row index column

  self.logger.debug(
  f"Integration completed. Updated {len(update_rows)} features with chromatogram data.",
@@ -1344,9 +1515,18 @@ def _align_pose_clustering(study_obj, fmaps, params):
  params_oms.setValue("pairfinder:ignore_charge", "true")
  params_oms.setValue("max_num_peaks_considered", 1000)
  params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_tol"))
- params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
- params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
- params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
+ params_oms.setValue(
+ "pairfinder:distance_MZ:max_difference",
+ params.get("mz_max_diff"),
+ )
+ params_oms.setValue(
+ "superimposer:rt_pair_distance_fraction",
+ params.get("rt_pair_distance_frac"),
+ )
+ params_oms.setValue(
+ "superimposer:mz_pair_max_distance",
+ params.get("mz_pair_max_distance"),
+ )
  params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
  params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
  params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
@@ -1355,7 +1535,9 @@ def _align_pose_clustering(study_obj, fmaps, params):
  study_obj.logger.info("Starting alignment with PoseClustering")

  # Set ref_index to feature map index with largest number of features
- ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+ ref_index = [
+ i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
+ ][-1]
  study_obj.logger.debug(
  f"Reference map is {study_obj.samples_df.row(ref_index, named=True)['sample_name']}",
  )
@@ -1374,7 +1556,10 @@ def _align_pose_clustering(study_obj, fmaps, params):
  ):
  if index == ref_index:
  continue
- if params.get("skip_blanks") and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank":
+ if (
+ params.get("skip_blanks")
+ and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank"
+ ):
  continue
  trafo = oms.TransformationDescription()
  aligner.align(fm, trafo)
@@ -1393,19 +1578,28 @@ def _align_kd_algorithm(study_obj, fmaps, params):

  # Pull parameter values - map standard align params to our algorithm
  # Use rt_tol (standard align param) instead of warp_rt_tol for RT tolerance
- rt_pair_tol = float(params.get("rt_tol")) if params.get("rt_tol") is not None else 2.0
+ rt_pair_tol = (
+ float(params.get("rt_tol")) if params.get("rt_tol") is not None else 2.0
+ )
  # Use mz_max_diff (standard align param) converted to ppm
- mz_max_diff_da = float(params.get("mz_max_diff")) if params.get("mz_max_diff") is not None else 0.02
+ mz_max_diff_da = (
+ float(params.get("mz_max_diff"))
+ if params.get("mz_max_diff") is not None
+ else 0.02
+ )
  # Convert Da to ppm (assuming ~400 m/z average for metabolomics): 0.01 Da / 400 * 1e6 = 25 ppm
  ppm_tol = mz_max_diff_da / 400.0 * 1e6
  # Allow override with warp_mz_tol if specifically set (but not from defaults)
  try:
  warp_mz_from_params = params.get("warp_mz_tol")
- if warp_mz_from_params is not None and warp_mz_from_params != params.__class__().warp_mz_tol:
+ if (
+ warp_mz_from_params is not None
+ and warp_mz_from_params != params.__class__().warp_mz_tol
+ ):
  ppm_tol = float(warp_mz_from_params)
  except (KeyError, AttributeError):
  pass
-
+
  # Safely retrieve optional parameter max_anchor_points (not yet part of defaults)
  try:
  _raw_mp = params.get("max_anchor_points")
@@ -1413,7 +1607,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  _raw_mp = None
  max_points = int(_raw_mp) if _raw_mp is not None else 1000
  study_obj.logger.info(
- f"Align time axes with rt_tol={params.get('rt_tol')}, min_samples={params.get('min_samples')}, max_points={max_points}"
+ f"Align time axes with rt_tol={params.get('rt_tol')}, min_samples={params.get('min_samples')}, max_points={max_points}",
  )

  # Choose reference map (largest number of features)
@@ -1421,7 +1615,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  ref_map = fmaps[ref_index]
  study_obj.alignment_ref_index = ref_index
  study_obj.logger.debug(
- f"Reference map index {ref_index} (sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) size={ref_map.size()}"
+ f"Reference map index {ref_index} (sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) size={ref_map.size()}",
  )

  # Extract and sort reference features by m/z for binary search
@@ -1445,7 +1639,10 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  best_drt = drt
  return best

- def _set_pairs(td_obj: 'oms.TransformationDescription', pairs): # Helper for pyopenms API variability
+ def _set_pairs(
+ td_obj: oms.TransformationDescription,
+ pairs,
+ ): # Helper for pyopenms API variability
  # Always provide list of lists to satisfy strict type expectations
  conv = [[float(a), float(b)] for a, b in pairs]
  try:
@@ -1527,7 +1724,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  td.fitModel(model, oms.Param())
  except Exception as e:
  study_obj.logger.debug(
- f"Map {i}: {model} fitting failed ({e}); fallback to linear two-point shift"
+ f"Map {i}: {model} fitting failed ({e}); fallback to linear two-point shift",
  )
  rts = [f.getRT() for f in fmap]
  lo, hi = (min(rts), max(rts)) if rts else (0.0, 1.0)
@@ -1539,7 +1736,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  pass

  study_obj.logger.debug(
- f"Map {i}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s"
+ f"Map {i}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s",
  )
  transformations.append(td)

@@ -1557,7 +1754,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  study_obj.logger.warning(f"Map {i}: failed applying transformation ({e})")

  study_obj.logger.info(
- f"Custom KD alignment completed. Reference index {ref_index}. Applied {len(transformations)} transformations."
+ f"Custom KD alignment completed. Reference index {ref_index}. Applied {len(transformations)} transformations.",
  )


@@ -1566,13 +1763,18 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
  import pyopenms as oms

  aligner = oms.MapAlignmentAlgorithmPoseClustering()
- ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+ ref_index = [
+ i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
+ ][-1]

  # Set up basic parameters for pose clustering
  pc_params = oms.Param()
  pc_params.setValue("max_num_peaks_considered", 1000)
  pc_params.setValue("pairfinder:distance_RT:max_difference", params.get("rt_tol"))
- pc_params.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
+ pc_params.setValue(
+ "pairfinder:distance_MZ:max_difference",
+ params.get("mz_max_diff"),
+ )

  aligner.setParameters(pc_params)
  aligner.setReference(fmaps[ref_index])