masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/study/processing.py
CHANGED
@@ -36,16 +36,16 @@ def align(self, **kwargs):
     """
     # parameters initialization
     params = align_defaults()
-
+
     # Handle 'params' keyword argument specifically (like merge does)
-    if
-        provided_params = kwargs.pop(
+    if "params" in kwargs:
+        provided_params = kwargs.pop("params")
         if isinstance(provided_params, align_defaults):
             params = provided_params
             self.logger.debug("Using provided align_defaults parameters from 'params' argument")
         else:
             self.logger.warning("'params' argument is not an align_defaults instance, ignoring")
-
+
     # Process remaining kwargs
     for key, value in kwargs.items():
         if isinstance(value, align_defaults):
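The reworked block above follows a common pattern: accept either a prepared defaults object through a `params` keyword or plain keyword overrides. A minimal sketch of that pattern, using a hypothetical `AlignDefaults` dataclass as a stand-in for masster's own `align_defaults` class:

```python
from dataclasses import dataclass


@dataclass
class AlignDefaults:
    # Hypothetical stand-in for masster's align_defaults.
    rt_tol: float = 2.0
    mz_max_diff: float = 0.02


def align(**kwargs):
    params = AlignDefaults()
    # A whole defaults object can be passed under the 'params' key ...
    if "params" in kwargs:
        provided = kwargs.pop("params")
        if isinstance(provided, AlignDefaults):
            params = provided
    # ... and any remaining keywords override individual fields.
    for key, value in kwargs.items():
        if hasattr(params, key):
            setattr(params, key, value)
    return params


print(align(params=AlignDefaults(rt_tol=5.0), mz_max_diff=0.01))
```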
@@ -68,7 +68,7 @@ def align(self, **kwargs):
 
     # Ensure rt_original exists before starting alignment (both algorithms need this)
     if "rt_original" not in self.features_df.columns:
-        # add column 'rt_original' after 'rt'
+        # add column 'rt_original' after 'rt'
         rt_index = self.features_df.columns.get_loc("rt") + 1
         self.features_df.insert(rt_index, "rt_original", 0)
         self.features_df["rt_original"] = self.features_df["rt"]
@@ -174,9 +174,7 @@ def find_ms2(self, **kwargs):
     ]
     for row in feats.iter_rows(named=True):
         feature_uid = row["feature_uid"]
-        feature_lookup[feature_uid] = {
-            col: row[col] for col in relevant_cols if col in feats.columns
-        }
+        feature_lookup[feature_uid] = {col: row[col] for col in relevant_cols if col in feats.columns}
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
 
     # Process consensus mapping in batch
@@ -204,13 +202,9 @@ def find_ms2(self, **kwargs):
     "feature_uid": int(mapping_row["feature_uid"]),
     "sample_uid": int(mapping_row["sample_uid"]),
     "scan_id": int(scanid),
-    "energy": round(spec.energy, 1)
-    if hasattr(spec, "energy") and spec.energy is not None
-    else None,
+    "energy": round(spec.energy, 1) if hasattr(spec, "energy") and spec.energy is not None else None,
     "prec_inty": round(inty, 0) if inty is not None else None,
-    "prec_coherence": round(chrom_coherence, 3)
-    if chrom_coherence is not None
-    else None,
+    "prec_coherence": round(chrom_coherence, 3) if chrom_coherence is not None else None,
     "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
     if chrom_prominence_scaled is not None
     else None,
@@ -250,10 +244,7 @@ def filter_consensus(
     else:
         if isinstance(coherence, tuple) and len(coherence) == 2:
             min_coherence, max_coherence = coherence
-            cons = cons[
-                (cons["chrom_coherence"] >= min_coherence)
-                & (cons["chrom_coherence"] <= max_coherence)
-            ]
+            cons = cons[(cons["chrom_coherence"] >= min_coherence) & (cons["chrom_coherence"] <= max_coherence)]
         else:
             cons = cons[cons["chrom_coherence"] >= coherence]
     after_coherence = len(cons)
@@ -264,9 +255,7 @@ def filter_consensus(
     if quality is not None:
         if isinstance(quality, tuple) and len(quality) == 2:
             min_quality, max_quality = quality
-            cons = cons[
-                (cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)
-            ]
+            cons = cons[(cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)]
         else:
             cons = cons[cons["quality"] >= quality]
     after_quality = len(cons)
@@ -277,10 +266,7 @@ def filter_consensus(
     if number_samples is not None:
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_number, max_number = number_samples
-            cons = cons[
-                (cons["number_samples"] >= min_number)
-                & (cons["number_samples"] <= max_number)
-            ]
+            cons = cons[(cons["number_samples"] >= min_number) & (cons["number_samples"] <= max_number)]
         else:
             cons = cons[cons["number_samples"] >= number_samples]
     after_number_samples = len(cons)
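All three thresholds in filter_consensus follow the same scalar-or-(min, max) convention. A small standalone sketch of that convention as a helper (pandas assumed, since the hunks index `cons[...]` with boolean masks; the helper name is illustrative, not masster API):

```python
import pandas as pd


def filter_range(df: pd.DataFrame, column: str, threshold) -> pd.DataFrame:
    """Keep rows where column >= a scalar threshold, or within a (min, max) tuple."""
    if isinstance(threshold, tuple) and len(threshold) == 2:
        lo, hi = threshold
        return df[(df[column] >= lo) & (df[column] <= hi)]
    return df[df[column] >= threshold]


cons = pd.DataFrame({"quality": [0.2, 0.6, 0.9], "number_samples": [1, 3, 7]})
print(filter_range(cons, "quality", 0.5))            # scalar: lower bound only
print(filter_range(cons, "number_samples", (2, 5)))  # tuple: closed range
```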
@@ -437,13 +423,9 @@ def _integrate_chrom_impl(self, **kwargs):
     if update_rows:
         # Create mapping from row index to new values
         row_to_chrom = {update_rows[i]: chroms[i] for i in range(len(update_rows))}
-        row_to_rt_start = {
-            update_rows[i]: rt_starts[i] for i in range(len(update_rows))
-        }
+        row_to_rt_start = {update_rows[i]: rt_starts[i] for i in range(len(update_rows))}
         row_to_rt_end = {update_rows[i]: rt_ends[i] for i in range(len(update_rows))}
-        row_to_rt_delta = {
-            update_rows[i]: rt_deltas[i] for i in range(len(update_rows))
-        }
+        row_to_rt_delta = {update_rows[i]: rt_deltas[i] for i in range(len(update_rows))}
         row_to_chrom_area = {
             update_rows[i]: float(chrom_areas[i]) if chrom_areas[i] is not None else 0.0
             for i in range(len(update_rows))
@@ -598,10 +580,10 @@ def _align_pose_clustering(study_obj, params):
 
     # Generate temporary feature maps on-demand from features_df for PoseClustering
     study_obj.logger.debug("Generating feature maps on-demand from features_df for PoseClustering alignment")
-
+
     tdqm_disable = study_obj.log_level not in ["TRACE", "DEBUG", "INFO"]
     fmaps = []
-
+
     # Process each sample in order with progress bar
     for sample_index, row_dict in tqdm(
         list(enumerate(study_obj.samples_df.iter_rows(named=True))),
@@ -611,17 +593,17 @@ def _align_pose_clustering(study_obj, params):
     ):
         sample_uid = row_dict["sample_uid"]
         sample_name = row_dict["sample_name"]
-
+
         # Get features for this sample from features_df
         sample_features = study_obj.features_df.filter(pl.col("sample_uid") == sample_uid)
-
+
         # Create new FeatureMap
         feature_map = oms.FeatureMap()
-
+
         # Convert DataFrame features to OpenMS Features
         for feature_row in sample_features.iter_rows(named=True):
             feature = oms.Feature()
-
+
             # Set properties from DataFrame (handle missing values gracefully)
             try:
                 # Skip features with missing critical data
@@ -639,7 +621,7 @@ def _align_pose_clustering(study_obj, params):
                 feature.setMZ(float(feature_row["mz"]))
                 feature.setRT(float(feature_row["rt"]))
                 feature.setIntensity(float(feature_row["inty"]))
-
+
                 # Handle optional fields that might be None
                 if feature_row.get("quality") is not None:
                     feature.setOverallQuality(float(feature_row["quality"]))
@@ -651,9 +633,9 @@ def _align_pose_clustering(study_obj, params):
             except (ValueError, TypeError) as e:
                 study_obj.logger.warning(f"Skipping feature due to conversion error: {e}")
                 continue
-
+
         fmaps.append(feature_map)
-
+
     study_obj.logger.debug(f"Generated {len(fmaps)} feature maps from features_df for PoseClustering alignment")
 
     # Create PC-specific OpenMS parameters
@@ -684,10 +666,8 @@ def _align_pose_clustering(study_obj, params):
     )
 
     # Set ref_index to feature map index with largest number of features
-    ref_index = [
-        i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
-    ][-1]
-
+    ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+
     aligner.setParameters(params_oms)
     aligner.setReference(fmaps[ref_index])
     study_obj.logger.debug(f"Parameters for alignment: {params}")
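The collapsed one-liner above picks the index of the feature map with the most features by sorting all sizes and taking the last index. A sketch of the same selection (not the code the package uses) alongside the arguably clearer `max(..., key=...)` form:

```python
# Sizes of hypothetical feature maps; fm.size() supplies these in the real code.
sizes = [120, 450, 87, 450]

# Sort-based form, as in the diff: index of the last element after sorting by size.
ref_index_sorted = [i for i, _ in sorted(enumerate(sizes), key=lambda x: x[1])][-1]

# Equivalent max-based form (ties may resolve to a different index than the sort-based form).
ref_index_max = max(range(len(sizes)), key=lambda i: sizes[i])

print(ref_index_sorted, ref_index_max)
```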
@@ -701,18 +681,17 @@ def _align_pose_clustering(study_obj, params):
     ):
         if index == ref_index:
             continue
-        if (
-            params.get("skip_blanks")
-            and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank"
-        ):
+        if params.get("skip_blanks") and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank":
             continue
-
+
         # Skip feature maps with insufficient data points for alignment
         if fm.size() < 2:
             sample_name = study_obj.samples_df.row(index, named=True)["sample_name"]
-            study_obj.logger.warning(
+            study_obj.logger.warning(
+                f"Skipping alignment for sample '{sample_name}' - insufficient features ({fm.size()} features)"
+            )
             continue
-
+
         try:
             trafo = oms.TransformationDescription()
             aligner.align(fm, trafo)
@@ -724,7 +703,7 @@ def _align_pose_clustering(study_obj, params):
             continue
 
     study_obj.alignment_ref_index = ref_index
-
+
     # Process feature maps and update features_df with transformed retention times
     # Build a fast lookup for (sample_uid, featureUid) to index in features_df
     feats = study_obj.features_df
@@ -732,8 +711,7 @@ def _align_pose_clustering(study_obj, params):
     # Pre-build sample_uid lookup for faster access
     study_obj.logger.debug("Build sample_uid lookup for fast access...")
     sample_uid_lookup = {
-        idx: row_dict["sample_uid"]
-        for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True))
+        idx: row_dict["sample_uid"] for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True))
     }
 
     # Build the main lookup using feature_uid (not feature_id)
@@ -833,7 +811,7 @@ def _align_pose_clustering(study_obj, params):
     # Clean up temporary feature maps to release memory
     del fmaps
     study_obj.logger.debug("Temporary feature maps deleted to release memory")
-
+
     # Resolve reference sample UID from the reference index
     ref_sample_uid = sample_uid_lookup.get(ref_index)
     study_obj.logger.success(
@@ -853,24 +831,15 @@ def _align_kd_algorithm(study_obj, params):
 
     # Pull parameter values - map standard align params to our algorithm
     # Use rt_tol (standard align param) instead of warp_rt_tol for RT tolerance
-    rt_pair_tol = (
-        float(params.get("rt_tol")) if params.get("rt_tol") is not None else 2.0
-    )
+    rt_pair_tol = float(params.get("rt_tol")) if params.get("rt_tol") is not None else 2.0
     # Use mz_max_diff (standard align param) converted to ppm
-    mz_max_diff_da = (
-        float(params.get("mz_max_diff"))
-        if params.get("mz_max_diff") is not None
-        else 0.02
-    )
+    mz_max_diff_da = float(params.get("mz_max_diff")) if params.get("mz_max_diff") is not None else 0.02
     # Convert Da to ppm (assuming ~400 m/z average for metabolomics): 0.01 Da / 400 * 1e6 = 25 ppm
     ppm_tol = mz_max_diff_da / 400.0 * 1e6
     # Allow override with warp_mz_tol if specifically set (but not from defaults)
     try:
         warp_mz_from_params = params.get("warp_mz_tol")
-        if (
-            warp_mz_from_params is not None
-            and warp_mz_from_params != params.__class__().warp_mz_tol
-        ):
+        if warp_mz_from_params is not None and warp_mz_from_params != params.__class__().warp_mz_tol:
             ppm_tol = float(warp_mz_from_params)
     except (KeyError, AttributeError):
         pass
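The Da-to-ppm conversion in this hunk assumes a representative m/z of about 400. As a worked check, 0.01 Da at m/z 400 is 0.01 / 400 × 1e6 = 25 ppm (matching the comment), and the default mz_max_diff of 0.02 Da gives 50 ppm. A small sketch of the same conversion as a function:

```python
def da_to_ppm(mz_tol_da: float, reference_mz: float = 400.0) -> float:
    """Convert an absolute m/z tolerance in Da to ppm at a reference m/z."""
    return mz_tol_da / reference_mz * 1e6


print(da_to_ppm(0.01))  # 25.0 ppm, as in the comment in the diff
print(da_to_ppm(0.02))  # 50.0 ppm, the default mz_max_diff of 0.02 Da
```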
@@ -888,29 +857,31 @@ def _align_kd_algorithm(study_obj, params):
     # Work directly with features_df instead of feature maps
     if study_obj.features_df is None or study_obj.features_df.is_empty():
         study_obj.logger.error("No features_df available for alignment. Cannot proceed with KD alignment.")
-        raise ValueError(
-
+        raise ValueError(
+            "No features_df available for alignment. This usually indicates that features were not detected properly."
+        )
+
     # OPTIMIZATION 1: Group all features by sample_uid in ONE operation instead of filtering repeatedly
     study_obj.logger.debug("Grouping features efficiently (major speedup)...")
-
+
     # rt_original should already exist (created in main align() function)
     if "rt_original" not in study_obj.features_df.columns:
         raise ValueError("rt_original column missing - this should have been created by align() function")
-
+
     sample_groups = study_obj.features_df.group_by("sample_uid", maintain_order=True)
     sample_feature_data = sample_groups.agg([
         pl.len().alias("feature_count"),
         pl.col("mz").alias("mzs"),
-        pl.col("rt_original").alias("rt_originals")  # Use original RT values for alignment
+        pl.col("rt_original").alias("rt_originals"),  # Use original RT values for alignment
     ]).sort("feature_count", descending=True)
-
+
     if sample_feature_data.is_empty():
         study_obj.logger.error("No features found in any sample for alignment.")
         raise ValueError("No features found in any sample for alignment.")
-
+
     # Choose reference sample (sample with most features)
     ref_sample_uid = sample_feature_data.row(0, named=True)["sample_uid"]
-
+
     # Find the index of this sample in samples_df
     ref_index = None
     sample_uid_to_index = {}
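The `group_by(...).agg(...)` call above collects each sample's m/z and original RT values into list columns in a single pass, then sorts so the sample with the most features comes first. A self-contained sketch of the same Polars pattern on toy data (column names mirror the diff; the values are made up):

```python
import polars as pl

features = pl.DataFrame({
    "sample_uid": [1, 1, 2, 2, 2],
    "mz": [100.05, 200.10, 100.06, 200.09, 300.20],
    "rt_original": [30.1, 65.4, 30.3, 65.0, 120.7],
})

# One pass: per-sample feature count plus list columns of m/z and original RT.
per_sample = (
    features.group_by("sample_uid", maintain_order=True)
    .agg([
        pl.len().alias("feature_count"),
        pl.col("mz").alias("mzs"),
        pl.col("rt_original").alias("rt_originals"),
    ])
    .sort("feature_count", descending=True)
)
print(per_sample)  # the first row is the sample with the most features (the reference)
```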
@@ -919,24 +890,24 @@ def _align_kd_algorithm(study_obj, params):
         sample_uid_to_index[sample_uid] = idx
         if sample_uid == ref_sample_uid:
             ref_index = idx
-
+
     if ref_index is None:
         study_obj.logger.error(f"Could not find reference sample {ref_sample_uid} in samples_df")
         raise ValueError(f"Could not find reference sample {ref_sample_uid} in samples_df")
-
+
     study_obj.alignment_ref_index = ref_index
-
+
     # OPTIMIZATION 2: Get reference features efficiently from pre-grouped data
     # Always use rt_original for alignment input to ensure consistent results
     ref_row = sample_feature_data.filter(pl.col("sample_uid") == ref_sample_uid).row(0, named=True)
     ref_mzs_list = ref_row["mzs"]
     ref_rts_list = ref_row["rt_originals"]  # Use original RT values
-
+
     # Create sorted reference features for binary search
     ref_features = list(zip(ref_mzs_list, ref_rts_list))
     ref_features.sort(key=lambda x: x[0])
     ref_mzs = [mz for mz, _ in ref_features]
-
+
     study_obj.logger.debug(
         f"Reference sample UID {ref_sample_uid} (index {ref_index}, sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) has {len(ref_features)} features",
     )
@@ -979,19 +950,19 @@ def _align_kd_algorithm(study_obj, params):
         sample_uid = row["sample_uid"]
         sample_mzs = row["mzs"]
         sample_rts = row["rt_originals"]  # Use original RT values for alignment input
-
+
         td = oms.TransformationDescription()
         sample_index = sample_uid_to_index.get(sample_uid)
-
+
         if sample_index is None:
             study_obj.logger.warning(f"Sample UID {sample_uid} not found in samples_df, skipping")
             continue
-
+
         # Skip empty samples
         if not sample_mzs or not sample_rts:
            transformations[sample_uid] = td
            continue
-
+
         # Identity for reference sample
         if sample_uid == ref_sample_uid:
             rts = [rt for rt in sample_rts if rt is not None]
@@ -1074,7 +1045,7 @@ def _align_kd_algorithm(study_obj, params):
 
     # OPTIMIZATION 5: Apply transformations efficiently using vectorized operations
     study_obj.logger.debug("Applying RT transformations efficiently...")
-
+
     # Apply transformations to RT values starting from rt_original
     def transform_rt_vectorized(sample_uid: int, rt_original: float) -> float:
         if sample_uid in transformations and rt_original is not None:
@@ -1084,14 +1055,13 @@ def _align_kd_algorithm(study_obj, params):
             except Exception:
                 return rt_original
         return rt_original
-
+
     # Use Polars' efficient struct operations for vectorized transformation
     # Apply transformation to rt_original and store result in rt column
     study_obj.features_df = study_obj.features_df.with_columns(
-        pl.struct(["sample_uid", "rt_original"])
-
-
-        ).alias("rt")
+        pl.struct(["sample_uid", "rt_original"])
+        .map_elements(lambda x: transform_rt_vectorized(x["sample_uid"], x["rt_original"]), return_dtype=pl.Float64)
+        .alias("rt")
     )
 
     study_obj.logger.success(
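The transformation above is applied row-wise through `pl.struct(...).map_elements(...)`, which packs the needed columns into one struct so a Python callback can see both values at once. A minimal sketch of that idiom (the offset dictionary is a stand-in for the per-sample OpenMS `TransformationDescription` objects):

```python
import polars as pl

# Hypothetical per-sample RT offsets standing in for OpenMS transformations.
offsets = {1: 0.0, 2: -1.5}

df = pl.DataFrame({
    "sample_uid": [1, 2, 2],
    "rt_original": [30.0, 31.4, 66.2],
})


def transform_rt(sample_uid: int, rt_original: float) -> float:
    return rt_original + offsets.get(sample_uid, 0.0)


df = df.with_columns(
    pl.struct(["sample_uid", "rt_original"])
    .map_elements(lambda row: transform_rt(row["sample_uid"], row["rt_original"]), return_dtype=pl.Float64)
    .alias("rt")
)
print(df)
```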
@@ -1099,16 +1069,12 @@ def _align_kd_algorithm(study_obj, params):
     )
 
 
-
-
 def _align_pose_clustering_fallback(study_obj, fmaps, params):
     """Fallback PoseClustering alignment with minimal parameters."""
     import pyopenms as oms
 
     aligner = oms.MapAlignmentAlgorithmPoseClustering()
-    ref_index = [
-        i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
-    ][-1]
+    ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
 
     # Set up basic parameters for pose clustering
     pc_params = oms.Param()
@@ -1137,15 +1103,15 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
     """
     Find isotope patterns for consensus features by searching raw MS1 data.
     OPTIMIZED VERSION: Each sample file is loaded only once for maximum efficiency.
-
+
     For each consensus feature:
-    1. Find the associated feature with highest intensity
+    1. Find the associated feature with highest intensity
     2. Load the corresponding sample5 file to access raw MS1 data
     3. Use original_rt (before alignment) to find the correct scan
     4. Search for isotope patterns in raw MS1 spectra
     5. Look for isotope patterns: 0.33, 0.50, 0.66, 1.00, 1.50, 2.00, 3.00, 4.00, 5.00 Da
     6. Store results as numpy arrays with [mz, inty] in the iso column
-
+
     Parameters:
         rt_tol (float): RT tolerance for scan matching in seconds
         mz_tol (float): Additional m/z tolerance for isotope matching in Da
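Step 5 of the docstring boils down to scanning fixed mass offsets above the monoisotopic m/z. A small sketch of how such search windows can be derived from a shift list (the shift values are copied from the docstring; the helper is illustrative, not part of masster):

```python
# Mass shifts (Da) listed in the docstring above.
isotope_shifts = [0.33, 0.50, 0.66, 1.00, 1.50, 2.00, 3.00, 4.00, 5.00]


def isotope_windows(base_mz: float, mz_tol: float = 0.01):
    """Yield (target_mz, mz_min, mz_max) search windows for each isotope shift."""
    for shift in isotope_shifts:
        target = base_mz + shift
        yield target, target - mz_tol, target + mz_tol


for target, lo, hi in isotope_windows(400.2005):
    print(f"search {lo:.4f}-{hi:.4f} for an isotopologue near {target:.4f}")
```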
@@ -1154,27 +1120,25 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
     if self.consensus_df is None or self.consensus_df.is_empty():
         self.logger.error("No consensus features found. Please run merge() first.")
         return
-
+
     if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
         self.logger.error("No consensus mapping found. Please run merge() first.")
         return
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features found.")
         return
-
+
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.error("No samples found.")
         return
-
+
     # Add iso column if it doesn't exist
     if "iso" not in self.consensus_df.columns:
-        self.consensus_df = self.consensus_df.with_columns(
-
-        )
-
+        self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.Object).alias("iso"))
+
     self.logger.info("Extracting isotopomers from raw MS1 data...")
-
+
     # Filter consensus features if uids is specified
     if uids is not None:
         if not isinstance(uids, (list, tuple)):
@@ -1188,7 +1152,7 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
     else:
         consensus_df_filtered = self.consensus_df
         self.logger.debug(f"Processing all {len(consensus_df_filtered)} consensus features")
-
+
     # Isotope mass shifts to search for (up to 7x 13C isotopes)
     isotope_shifts = [
         0.33,
@@ -1203,73 +1167,73 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
         6.02010,
         7.02345,
     ]
-
+
     consensus_iso_data = {}
-
+
     # SUPER OPTIMIZATION: Vectorized pre-calculation using joins (10-100x faster)
     self.logger.debug("Building sample-to-consensus mapping using vectorized operations...")
-
+
     # Step 1: Join consensus_mapping with features to get intensities in one operation
     # Apply UID filtering if specified
     if uids is not None:
         consensus_mapping_filtered = self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))
     else:
         consensus_mapping_filtered = self.consensus_mapping_df
-
+
     consensus_with_features = consensus_mapping_filtered.join(
-        self.features_df.select([
-        on=[
-        how=
+        self.features_df.select(["feature_uid", "sample_uid", "inty", "mz", "rt", "rt_original"]),
+        on=["feature_uid", "sample_uid"],
+        how="left",
     )
-
+
     # Step 2: Find the best feature (highest intensity) for each consensus using window functions
-    best_features =
-
-
-
-
-        pl.col(
-
-
+    best_features = (
+        consensus_with_features.with_columns(
+            pl.col("inty").fill_null(0)  # Handle null intensities
+        )
+        .with_columns(pl.col("inty").max().over("consensus_uid").alias("max_inty"))
+        .filter(pl.col("inty") == pl.col("max_inty"))
+        .group_by("consensus_uid")
+        .first()
+    )  # Take first if there are ties
+
     # Step 3: Join with samples to get sample paths in one operation
     best_features_with_paths = best_features.join(
-        self.samples_df.select([
-
-
-    ).filter(
-        pl.col('sample_path').is_not_null()
-    )
-
+        self.samples_df.select(["sample_uid", "sample_path"]), on="sample_uid", how="left"
+    ).filter(pl.col("sample_path").is_not_null())
+
     # Step 4: Group by sample path for batch processing (much faster than nested loops)
     sample_to_consensus = {}
     for row in best_features_with_paths.iter_rows(named=True):
-        sample_path = row[
-        consensus_uid = row[
-
+        sample_path = row["sample_path"]
+        consensus_uid = row["consensus_uid"]
+
         # Create feature data dictionary for compatibility
         feature_data = {
-
-
-
-
+            "mz": row["mz"],
+            "rt": row["rt"],
+            "rt_original": row.get("rt_original", row["rt"]),
+            "inty": row["inty"],
        }
-
+
        if sample_path not in sample_to_consensus:
            sample_to_consensus[sample_path] = []
-
+
        sample_to_consensus[sample_path].append((consensus_uid, feature_data))
-
+
     # Initialize failed consensus features (those not in the mapping)
-    processed_consensus_uids = set(best_features_with_paths[
+    processed_consensus_uids = set(best_features_with_paths["consensus_uid"].to_list())
     for consensus_row in consensus_df_filtered.iter_rows(named=True):
         consensus_uid = consensus_row["consensus_uid"]
         if consensus_uid not in processed_consensus_uids:
             consensus_iso_data[consensus_uid] = None
-
-    self.logger.debug(
-
+
+    self.logger.debug(
+        f"Will read {len(sample_to_consensus)} unique sample files for {len(consensus_df_filtered)} consensus features"
+    )
+
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
     # OPTIMIZATION 2: Process by sample file (load each file only once)
     for sample_path, consensus_list in tqdm(
         sample_to_consensus.items(),
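Step 2 above selects, for each consensus feature, the mapped feature with the highest intensity by combining a window maximum with a filter. A compact sketch of that "best row per group" idiom on toy data:

```python
import polars as pl

mapped = pl.DataFrame({
    "consensus_uid": [10, 10, 11, 11],
    "feature_uid": [1, 2, 3, 4],
    "inty": [5.0e4, 9.0e4, None, 1.2e5],
})

best = (
    mapped.with_columns(pl.col("inty").fill_null(0))  # treat missing intensities as 0
    .with_columns(pl.col("inty").max().over("consensus_uid").alias("max_inty"))
    .filter(pl.col("inty") == pl.col("max_inty"))
    .group_by("consensus_uid")
    .first()  # break ties by taking the first matching row
)
print(best.sort("consensus_uid"))
```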
@@ -1279,126 +1243,115 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
         try:
             # Load MS1 data once per sample
             ms1_df = self._load_ms1(sample_path)
-
+
             if ms1_df is None or ms1_df.is_empty():
                 # Mark all consensus features from this sample as failed
                 for consensus_uid, _ in consensus_list:
                     consensus_iso_data[consensus_uid] = None
                 continue
-
+
             # Process all consensus features for this sample
             for consensus_uid, best_feature in consensus_list:
                 # Get the original RT (before alignment correction)
                 base_mz = best_feature["mz"]
                 original_rt = best_feature.get("rt_original", best_feature["rt"])
-
+
                 # Skip if RT or mz is None or invalid
                 if original_rt is None:
-                    original_rt = best_feature["rt"]
+                    original_rt = best_feature["rt"]
                     self.logger.debug(f"original_rt is None. Using aligned rt instead")
-
+
                 if base_mz is None:
                     self.logger.warning(f"Skipping consensus_uid {consensus_uid}: base_mz is None")
                     consensus_iso_data[consensus_uid] = None
                     continue
-
+
                 # Find MS1 scans near the original RT
                 rt_min = original_rt - rt_tol
                 rt_max = original_rt + rt_tol
-
+
                 # Filter MS1 data for scans within RT window
-                ms1_window = ms1_df.filter(
-
-                )
-
+                ms1_window = ms1_df.filter((pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max))
+
                 if ms1_window.is_empty():
                     consensus_iso_data[consensus_uid] = None
                     continue
-
+
                 isotope_matches = []
-
+
                 # Search for each isotope shift
                 for shift in isotope_shifts:
                     target_mz = base_mz + shift
                     mz_min_iso = target_mz - mz_tol
                     mz_max_iso = target_mz + mz_tol
-
+
                     # Find peaks in MS1 data within m/z tolerance
-                    isotope_peaks = ms1_window.filter(
-
-                    )
-
+                    isotope_peaks = ms1_window.filter((pl.col("mz") >= mz_min_iso) & (pl.col("mz") <= mz_max_iso))
+
                     if not isotope_peaks.is_empty():
                         # Get the peak with maximum intensity for this isotope
-                        max_peak = isotope_peaks.filter(
-
-                        ).row(0, named=True)
-
+                        max_peak = isotope_peaks.filter(pl.col("inty") == pl.col("inty").max()).row(0, named=True)
+
                         # Store as float with specific precision: m/z to 4 decimals, intensity rounded to integer
                         mz_formatted = round(float(max_peak["mz"]), 4)
                         inty_formatted = float(round(max_peak["inty"]))  # Round to integer, but keep as float
                         isotope_matches.append([mz_formatted, inty_formatted])
-
+
                 # Store results as numpy array
                 if isotope_matches:
                     consensus_iso_data[consensus_uid] = np.array(isotope_matches)
                 else:
                     consensus_iso_data[consensus_uid] = None
-
+
         except Exception as e:
             self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
             # Mark all consensus features from this sample as failed
             for consensus_uid, _ in consensus_list:
                 consensus_iso_data[consensus_uid] = None
             continue
-
+
     # Update consensus_df with isotope data
     # Create mapping function for update
     def get_iso_data(uid):
         return consensus_iso_data.get(uid, None)
-
+
     # Update the iso column
     self.consensus_df = self.consensus_df.with_columns(
-        pl.col("consensus_uid").map_elements(
-            lambda uid: get_iso_data(uid),
-            return_dtype=pl.Object
-        ).alias("iso")
+        pl.col("consensus_uid").map_elements(lambda uid: get_iso_data(uid), return_dtype=pl.Object).alias("iso")
     )
-
+
     # Count how many consensus features have isotope data
     iso_count = sum(1 for data in consensus_iso_data.values() if data is not None and len(data) > 0)
-
-    self.logger.success(
+
+    self.logger.success(
+        f"Isotope detection completed. Found isotope patterns for {iso_count}/{len(self.consensus_df)} consensus features."
+    )
 
 
 def reset_iso(self):
     """
     Reset the iso column in consensus_df to None, clearing all isotope data.
-
+
     This function clears any previously computed isotope patterns from the
     consensus_df, setting the 'iso' column to None for all features. This
     is useful before re-running isotope detection with different parameters
     or to clear isotope data entirely.
-
+
     Returns:
         None
     """
     if self.consensus_df is None:
         self.logger.warning("No consensus_df found. Nothing to reset.")
         return
-
+
     if "iso" not in self.consensus_df.columns:
         self.logger.warning("No 'iso' column found in consensus_df. Nothing to reset.")
         return
-
+
     # Count how many features currently have isotope data
-    iso_count = self.consensus_df.select(
-
-    ).item(0, "count")
-
+    iso_count = self.consensus_df.select(pl.col("iso").is_not_null().sum().alias("count")).item(0, "count")
+
     # Reset the iso column to None
-    self.consensus_df = self.consensus_df.with_columns(
-
-    )
-
+    self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.Object).alias("iso"))
+
     self.logger.info(f"Reset isotope data for {iso_count} features. All 'iso' values set to None.")