masster 0.5.22-py3-none-any.whl → 0.5.24-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of masster might be problematic.

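For orientation before the diff: the first id.py hunk below changes lib_load so that the library source is dispatched on file extension (".json" → import_json, ".csv" → import_csv, anything else falls back to CSV), and later hunks add optional 13C isotopologue generation via iso="13C". The sketch below is purely illustrative — the full lib_load signature is not part of this diff, so the import path, the positional study argument, and the study object itself are assumptions inferred from the hunks:

    # Hypothetical usage sketch; names are inferred from the hunks below, not from masster's docs.
    from masster.study.id import lib_load  # module shown in this diff: masster/study/id.py

    # `study` is assumed to be an existing masster Study object (the functions in this
    # file expect it to expose lib_df, consensus_df, parameters, and a logger).
    lib_load(
        study,
        "standards.json",     # lib_source: ".json" -> import_json(), ".csv" -> import_csv(),
                              # any other extension falls back to CSV for backward compatibility
        polarity="positive",  # forwarded to the Lib import call
        adducts=["[M+H]+"],   # forwarded to the Lib import call
        iso="13C",            # also generate 13C isotopologue entries sharing one quant_group
    )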
masster/study/id.py CHANGED
@@ -60,11 +60,11 @@ def lib_load(
60
60
  )
61
61
 
62
62
  lib_obj = Lib()
63
-
63
+
64
64
  # Determine file type by extension
65
- if lib_source.lower().endswith('.json'):
65
+ if lib_source.lower().endswith(".json"):
66
66
  lib_obj.import_json(lib_source, polarity=polarity, adducts=adducts)
67
- elif lib_source.lower().endswith('.csv'):
67
+ elif lib_source.lower().endswith(".csv"):
68
68
  lib_obj.import_csv(lib_source, polarity=polarity, adducts=adducts)
69
69
  else:
70
70
  # Default to CSV behavior for backward compatibility
@@ -112,15 +112,13 @@ def lib_load(
112
112
  # Add source_id column with filename (without path) if loading from CSV/JSON
113
113
  if isinstance(lib_source, str):
114
114
  import os
115
+
115
116
  filename_only = os.path.basename(lib_source)
116
117
  filtered_lf = filtered_lf.with_columns(pl.lit(filename_only).alias("source_id"))
117
118
 
118
119
  # Ensure required columns exist and set correct values
119
- required_columns = {
120
- "quant_group": pl.Int64,
121
- "iso": pl.Int64
122
- }
123
-
120
+ required_columns = {"quant_group": pl.Int64, "iso": pl.Int64}
121
+
124
122
  for col_name, col_dtype in required_columns.items():
125
123
  if col_name == "quant_group":
126
124
  # Set quant_group using cmpd_uid (same for isotopomers of same compound)
@@ -133,21 +131,24 @@ def lib_load(
133
131
  if col_name not in filtered_lf.columns:
134
132
  # Default to zero for iso
135
133
  filtered_lf = filtered_lf.with_columns(pl.lit(0).cast(col_dtype).alias(col_name))
136
-
134
+
137
135
  # Generate 13C isotopes if requested
138
136
  original_count = len(filtered_lf)
139
- if iso == '13C':
137
+ if iso == "13C":
140
138
  filtered_lf = _generate_13c_isotopes(filtered_lf)
141
139
  # Update the log message to show the correct count after isotope generation
142
140
  if isinstance(lib_source, str):
143
141
  import os
142
+
144
143
  filename_only = os.path.basename(lib_source)
145
- print(f"Generated 13C isotopes: {len(filtered_lf)} total entries ({original_count} original + {len(filtered_lf) - original_count} isotopes) from {filename_only}")
144
+ print(
145
+ f"Generated 13C isotopes: {len(filtered_lf)} total entries ({original_count} original + {len(filtered_lf) - original_count} isotopes) from {filename_only}"
146
+ )
146
147
 
147
148
  # Reorder columns to place quant_group after rt and iso after formula
148
149
  column_order = []
149
150
  columns_list = list(filtered_lf.columns)
150
-
151
+
151
152
  for col in columns_list:
152
153
  if col not in column_order: # Only add if not already added
153
154
  column_order.append(col)
@@ -156,22 +157,17 @@ def lib_load(
156
157
  elif col == "formula" and "iso" in columns_list and "iso" not in column_order:
157
158
  column_order.append("iso")
158
159
 
159
-
160
160
  # Add to existing lib_df instead of replacing
161
- if (
162
- hasattr(study, "lib_df")
163
- and study.lib_df is not None
164
- and not study.lib_df.is_empty()
165
- ):
161
+ if hasattr(study, "lib_df") and study.lib_df is not None and not study.lib_df.is_empty():
166
162
  # Check for schema compatibility and handle mismatches
167
163
  existing_cols = set(study.lib_df.columns)
168
164
  new_cols = set(filtered_lf.columns)
169
-
165
+
170
166
  # If schemas don't match, we need to align them
171
167
  if existing_cols != new_cols:
172
168
  # Get union of all columns
173
169
  all_cols = existing_cols.union(new_cols)
174
-
170
+
175
171
  # Add missing columns to existing data with appropriate defaults
176
172
  for col in new_cols - existing_cols:
177
173
  if col == "probability":
@@ -180,10 +176,12 @@ def lib_load(
180
176
  try:
181
177
  adduct_prob_map = _get_adduct_probabilities(study)
182
178
  study.lib_df = study.lib_df.with_columns(
183
- pl.col("adduct").map_elements(
179
+ pl.col("adduct")
180
+ .map_elements(
184
181
  lambda adduct: adduct_prob_map.get(adduct, 1.0) if adduct is not None else 1.0,
185
- return_dtype=pl.Float64
186
- ).alias("probability")
182
+ return_dtype=pl.Float64,
183
+ )
184
+ .alias("probability")
187
185
  )
188
186
  except Exception:
189
187
  study.lib_df = study.lib_df.with_columns(pl.lit(1.0).alias("probability"))
@@ -200,16 +198,16 @@ def lib_load(
200
198
  else:
201
199
  # Default to null for other columns
202
200
  study.lib_df = study.lib_df.with_columns(pl.lit(None).alias(col))
203
-
201
+
204
202
  # Add missing columns to new data with appropriate defaults
205
203
  for col in existing_cols - new_cols:
206
204
  if col not in ["probability", "iso", "quant_group"]: # These should already be handled
207
205
  filtered_lf = filtered_lf.with_columns(pl.lit(None).alias(col))
208
-
206
+
209
207
  # Ensure column order matches for concatenation - use existing column order
210
208
  existing_column_order = list(study.lib_df.columns)
211
209
  filtered_lf = filtered_lf.select(existing_column_order)
212
-
210
+
213
211
  # Concatenate with existing data
214
212
  study.lib_df = pl.concat([study.lib_df, filtered_lf])
215
213
  else:
@@ -218,14 +216,14 @@ def lib_load(
218
216
  study.lib_df = (
219
217
  filtered_lf.clone()
220
218
  if hasattr(filtered_lf, "clone")
221
- else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, 'to_dict') else filtered_lf)
219
+ else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, "to_dict") else filtered_lf)
222
220
  )
223
221
  except Exception:
224
222
  try:
225
223
  study.lib_df = (
226
224
  pl.from_pandas(filtered_lf)
227
225
  if hasattr(filtered_lf, "to_pandas")
228
- else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, 'to_dict') else filtered_lf)
226
+ else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, "to_dict") else filtered_lf)
229
227
  )
230
228
  except Exception:
231
229
  study.lib_df = pl.DataFrame()
@@ -265,20 +263,17 @@ def _setup_identify_parameters(params, kwargs):
265
263
  # Override parameters with any provided kwargs
266
264
  if kwargs:
267
265
  # Handle parameter name mapping for backwards compatibility
268
- param_mapping = {
269
- 'rt_tolerance': 'rt_tol',
270
- 'mz_tolerance': 'mz_tol'
271
- }
272
-
266
+ param_mapping = {"rt_tolerance": "rt_tol", "mz_tolerance": "mz_tol"}
267
+
273
268
  for param_name, value in kwargs.items():
274
269
  # Check if we need to map the parameter name
275
270
  mapped_name = param_mapping.get(param_name, param_name)
276
-
271
+
277
272
  if hasattr(params, mapped_name):
278
273
  setattr(params, mapped_name, value)
279
274
  elif hasattr(params, param_name):
280
275
  setattr(params, param_name, value)
281
-
276
+
282
277
  return params
283
278
 
284
279
 
@@ -287,9 +282,7 @@ def _smart_reset_id_results(study, target_uids, logger):
287
282
  if target_uids is not None:
288
283
  # Selective reset: only clear results for features being re-identified
289
284
  if hasattr(study, "id_df") and study.id_df is not None and not study.id_df.is_empty():
290
- study.id_df = study.id_df.filter(
291
- ~pl.col("consensus_uid").is_in(target_uids)
292
- )
285
+ study.id_df = study.id_df.filter(~pl.col("consensus_uid").is_in(target_uids))
293
286
  if logger:
294
287
  logger.debug(f"Cleared previous results for {len(target_uids)} specific features")
295
288
  elif not hasattr(study, "id_df"):
@@ -305,21 +298,23 @@ def _get_cached_adduct_probabilities(study, logger):
305
298
  """Get adduct probabilities with caching to avoid repeated expensive computation."""
306
299
  # Check if we have cached results and cache key matches current parameters
307
300
  current_cache_key = _get_adduct_cache_key(study)
308
-
309
- if (hasattr(study, '_cached_adduct_probs') and
310
- hasattr(study, '_cached_adduct_key') and
311
- study._cached_adduct_key == current_cache_key):
301
+
302
+ if (
303
+ hasattr(study, "_cached_adduct_probs")
304
+ and hasattr(study, "_cached_adduct_key")
305
+ and study._cached_adduct_key == current_cache_key
306
+ ):
312
307
  if logger:
313
308
  logger.debug("Using cached adduct probabilities")
314
309
  return study._cached_adduct_probs
315
-
310
+
316
311
  # Compute and cache
317
312
  if logger:
318
313
  logger.debug("Computing adduct probabilities...")
319
314
  adduct_prob_map = _get_adduct_probabilities(study)
320
315
  study._cached_adduct_probs = adduct_prob_map
321
316
  study._cached_adduct_key = current_cache_key
322
-
317
+
323
318
  if logger:
324
319
  logger.debug(f"Computed and cached probabilities for {len(adduct_prob_map)} adducts")
325
320
  return adduct_prob_map
@@ -327,28 +322,30 @@ def _get_cached_adduct_probabilities(study, logger):
327
322
 
328
323
  def _get_adduct_cache_key(study):
329
324
  """Generate a cache key based on adduct-related parameters."""
330
- if hasattr(study, 'parameters') and hasattr(study.parameters, 'adducts'):
331
- adducts_str = '|'.join(sorted(study.parameters.adducts)) if study.parameters.adducts else ""
332
- min_prob = getattr(study.parameters, 'adduct_min_probability', 0.04)
325
+ if hasattr(study, "parameters") and hasattr(study.parameters, "adducts"):
326
+ adducts_str = "|".join(sorted(study.parameters.adducts)) if study.parameters.adducts else ""
327
+ min_prob = getattr(study.parameters, "adduct_min_probability", 0.04)
333
328
  return f"adducts:{adducts_str}:min_prob:{min_prob}"
334
329
  return "default"
335
330
 
336
331
 
337
332
  def clear_identification_cache(study):
338
333
  """Clear cached identification data (useful when parameters change)."""
339
- cache_attrs = ['_cached_adduct_probs', '_cached_adduct_key']
334
+ cache_attrs = ["_cached_adduct_probs", "_cached_adduct_key"]
340
335
  for attr in cache_attrs:
341
336
  if hasattr(study, attr):
342
337
  delattr(study, attr)
343
338
 
344
339
 
345
- def _perform_identification_matching(consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger):
340
+ def _perform_identification_matching(
341
+ consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger
342
+ ):
346
343
  """Perform optimized identification matching using vectorized operations where possible."""
347
344
  results = []
348
-
345
+
349
346
  # Get library data as arrays for faster access
350
347
  lib_df = study.lib_df
351
-
348
+
352
349
  if logger:
353
350
  consensus_count = len(consensus_to_process)
354
351
  lib_count = len(lib_df)
@@ -361,7 +358,7 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
361
358
  cons_uid = cons_row.get("consensus_uid")
362
359
  cons_mz = cons_row.get("mz")
363
360
  cons_rt = cons_row.get("rt")
364
-
361
+
365
362
  if cons_mz is None:
366
363
  if logger:
367
364
  logger.debug(f"Skipping consensus feature {cons_uid} - no m/z value")
@@ -372,18 +369,14 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
372
369
  matches = _find_matches_vectorized(
373
370
  lib_df, cons_mz, cons_rt, effective_mz_tol, effective_rt_tol, logger, cons_uid
374
371
  )
375
-
372
+
376
373
  # Convert matches to result format
377
374
  match_results = []
378
375
  if not matches.is_empty():
379
376
  for match_row in matches.iter_rows(named=True):
380
377
  mz_delta = abs(cons_mz - match_row.get("mz")) if match_row.get("mz") is not None else None
381
378
  lib_rt = match_row.get("rt")
382
- rt_delta = (
383
- abs(cons_rt - lib_rt)
384
- if (cons_rt is not None and lib_rt is not None)
385
- else None
386
- )
379
+ rt_delta = abs(cons_rt - lib_rt) if (cons_rt is not None and lib_rt is not None) else None
387
380
 
388
381
  # Get library probability as base score, then multiply by adduct probability
389
382
  lib_probability = match_row.get("probability", 1.0) if match_row.get("probability") is not None else 1.0
@@ -400,22 +393,20 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
400
393
  "matcher": "ms1",
401
394
  "score": score,
402
395
  })
403
-
396
+
404
397
  results.append({"consensus_uid": cons_uid, "matches": match_results})
405
-
398
+
406
399
  return results
407
400
 
408
401
 
409
402
  def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, cons_uid):
410
403
  """
411
404
  Find library matches using optimized vectorized operations.
412
-
405
+
413
406
  FIXED VERSION: Prevents incorrect matching of same compound to different m/z values.
414
407
  """
415
408
  # Filter by m/z tolerance using vectorized operations
416
- matches = lib_df.filter(
417
- (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
418
- )
409
+ matches = lib_df.filter((pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol))
419
410
 
420
411
  initial_match_count = len(matches)
421
412
 
@@ -423,14 +414,11 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
423
414
  if rt_tol is not None and cons_rt is not None and not matches.is_empty():
424
415
  # First, check if any m/z matches have RT data
425
416
  rt_candidates = matches.filter(pl.col("rt").is_not_null())
426
-
417
+
427
418
  if not rt_candidates.is_empty():
428
419
  # Apply RT filtering to candidates with RT data
429
- rt_matches = rt_candidates.filter(
430
- (pl.col("rt") >= cons_rt - rt_tol) &
431
- (pl.col("rt") <= cons_rt + rt_tol)
432
- )
433
-
420
+ rt_matches = rt_candidates.filter((pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol))
421
+
434
422
  if not rt_matches.is_empty():
435
423
  matches = rt_matches
436
424
  if logger:
@@ -458,12 +446,14 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
458
446
  strict_matches = matches.filter(
459
447
  (pl.col("mz") >= cons_mz - strict_mz_tol) & (pl.col("mz") <= cons_mz + strict_mz_tol)
460
448
  )
461
-
449
+
462
450
  if not strict_matches.is_empty():
463
451
  # Use strict matches if available
464
452
  matches = strict_matches
465
453
  if logger:
466
- logger.debug(f"Consensus {cons_uid}: Using {len(matches)} strict m/z matches (within {strict_mz_tol:.6f} Da)")
454
+ logger.debug(
455
+ f"Consensus {cons_uid}: Using {len(matches)} strict m/z matches (within {strict_mz_tol:.6f} Da)"
456
+ )
467
457
  else:
468
458
  if logger:
469
459
  logger.debug(f"Consensus {cons_uid}: No strict matches, using {len(matches)} loose matches")
@@ -472,21 +462,18 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
472
462
  if not matches.is_empty() and len(matches) > 1:
473
463
  if "formula" in matches.columns and "adduct" in matches.columns:
474
464
  pre_dedup_count = len(matches)
475
-
465
+
476
466
  # Calculate m/z error for sorting
477
- matches = matches.with_columns([
478
- (pl.col("mz") - cons_mz).abs().alias("mz_error_abs")
479
- ])
480
-
467
+ matches = matches.with_columns([(pl.col("mz") - cons_mz).abs().alias("mz_error_abs")])
468
+
481
469
  # Group by formula and adduct, but keep the most accurate m/z match
482
470
  matches = (
483
- matches
484
- .sort(["mz_error_abs", "lib_uid"]) # Sort by m/z accuracy first, then lib_uid for consistency
471
+ matches.sort(["mz_error_abs", "lib_uid"]) # Sort by m/z accuracy first, then lib_uid for consistency
485
472
  .group_by(["formula", "adduct"], maintain_order=True)
486
473
  .first()
487
474
  .drop("mz_error_abs") # Remove the temporary column
488
475
  )
489
-
476
+
490
477
  post_dedup_count = len(matches)
491
478
  if logger and post_dedup_count < pre_dedup_count:
492
479
  logger.debug(
@@ -512,10 +499,10 @@ def _update_identification_results(study, results, logger):
512
499
  "score": match["score"],
513
500
  "iso": 0, # Default to zero
514
501
  })
515
-
502
+
516
503
  # Convert to DataFrame and append to existing results
517
504
  new_results_df = pl.DataFrame(records) if records else pl.DataFrame()
518
-
505
+
519
506
  if not new_results_df.is_empty():
520
507
  if hasattr(study, "id_df") and study.id_df is not None and not study.id_df.is_empty():
521
508
  # Check if existing id_df has the iso column
@@ -524,11 +511,11 @@ def _update_identification_results(study, results, logger):
524
511
  study.id_df = study.id_df.with_columns(pl.lit(0).alias("iso"))
525
512
  if logger:
526
513
  logger.debug("Added 'iso' column to existing id_df for schema compatibility")
527
-
514
+
528
515
  study.id_df = pl.concat([study.id_df, new_results_df])
529
516
  else:
530
517
  study.id_df = new_results_df
531
-
518
+
532
519
  if logger:
533
520
  logger.debug(f"Added {len(records)} identification results to study.id_df")
534
521
  elif not hasattr(study, "id_df"):
@@ -539,7 +526,7 @@ def _finalize_identification_results(study, params, logger):
539
526
  """Apply final scoring adjustments and update consensus columns."""
540
527
  # Apply scoring adjustments based on compound and formula counts
541
528
  _apply_scoring_adjustments(study, params)
542
-
529
+
543
530
  # Update consensus_df with top-scoring identification results
544
531
  _update_consensus_id_columns(study, logger)
545
532
 
@@ -568,7 +555,7 @@ def _validate_identify_inputs(study, logger=None):
568
555
  if logger:
569
556
  logger.error("Library (study.lib_df) is empty; call lib_load() first")
570
557
  raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
571
-
558
+
572
559
  return True
573
560
 
574
561
 
@@ -612,8 +599,6 @@ def _prepare_consensus_features(study, features, logger=None):
612
599
  return consensus_to_process, target_uids
613
600
 
614
601
 
615
-
616
-
617
602
  def _get_adduct_probabilities(study):
618
603
  """Get adduct probabilities from _get_adducts() results."""
619
604
  adducts_df = _get_adducts(study)
@@ -624,45 +609,42 @@ def _get_adduct_probabilities(study):
624
609
  return adduct_prob_map
625
610
 
626
611
 
627
-
628
- def _create_identification_results(consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger=None):
612
+ def _create_identification_results(
613
+ consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger=None
614
+ ):
629
615
  """Create identification results by matching consensus features against library (DEPRECATED - use optimized version)."""
630
616
  # This function is now deprecated in favor of _perform_identification_matching
631
617
  # Keep for backward compatibility but redirect to optimized version
632
618
  results = _perform_identification_matching(
633
619
  consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger
634
620
  )
635
-
621
+
636
622
  # Convert to legacy format for compatibility
637
623
  legacy_results = []
638
624
  features_with_matches = 0
639
625
  total_matches = 0
640
-
626
+
641
627
  for result in results:
642
628
  if result["matches"]:
643
629
  features_with_matches += 1
644
630
  total_matches += len(result["matches"])
645
-
631
+
646
632
  for match in result["matches"]:
647
633
  legacy_results.append({
648
634
  "consensus_uid": result["consensus_uid"],
649
635
  "lib_uid": match["lib_uid"],
650
- "mz_delta": match["mz_delta"],
636
+ "mz_delta": match["mz_delta"],
651
637
  "rt_delta": match["rt_delta"],
652
638
  "matcher": match["matcher"],
653
639
  "score": match["score"],
654
640
  })
655
-
641
+
656
642
  return legacy_results, features_with_matches, total_matches
657
643
 
658
644
 
659
645
  def _apply_scoring_adjustments(study, params):
660
646
  """Apply scoring adjustments based on compound and formula counts using optimized operations."""
661
- if (
662
- not study.id_df.is_empty()
663
- and hasattr(study, "lib_df")
664
- and not study.lib_df.is_empty()
665
- ):
647
+ if not study.id_df.is_empty() and hasattr(study, "lib_df") and not study.lib_df.is_empty():
666
648
  # Get penalty parameters
667
649
  heteroatoms = getattr(params, "heteroatoms", ["Cl", "Br", "F", "I"])
668
650
  heteroatom_penalty = getattr(params, "heteroatom_penalty", 0.7)
@@ -685,15 +667,14 @@ def _apply_scoring_adjustments(study, params):
685
667
 
686
668
  # Join stats back and apply all penalties in one with_columns operation
687
669
  heteroatom_conditions = [pl.col("formula").str.contains(atom) for atom in heteroatoms]
688
- has_heteroatoms = pl.fold(
689
- acc=pl.lit(False),
690
- function=lambda acc, x: acc | x,
691
- exprs=heteroatom_conditions
692
- ) if heteroatom_conditions else pl.lit(False)
670
+ has_heteroatoms = (
671
+ pl.fold(acc=pl.lit(False), function=lambda acc, x: acc | x, exprs=heteroatom_conditions)
672
+ if heteroatom_conditions
673
+ else pl.lit(False)
674
+ )
693
675
 
694
676
  study.id_df = (
695
- id_with_lib
696
- .join(stats, on="consensus_uid", how="left")
677
+ id_with_lib.join(stats, on="consensus_uid", how="left")
697
678
  .with_columns([
698
679
  # Apply all penalties in sequence using case-when chains
699
680
  pl.when(pl.col("formula").is_not_null() & has_heteroatoms)
@@ -716,7 +697,7 @@ def _apply_scoring_adjustments(study, params):
716
697
  ])
717
698
  .select([
718
699
  "consensus_uid",
719
- "lib_uid",
700
+ "lib_uid",
720
701
  "mz_delta",
721
702
  "rt_delta",
722
703
  "matcher",
@@ -728,7 +709,7 @@ def _apply_scoring_adjustments(study, params):
728
709
  def _update_consensus_id_columns(study, logger=None):
729
710
  """
730
711
  Update consensus_df with top-scoring identification results using safe in-place updates.
731
-
712
+
732
713
  FIXED VERSION: Prevents same compound from being assigned to vastly different m/z values.
733
714
  """
734
715
  try:
@@ -736,15 +717,15 @@ def _update_consensus_id_columns(study, logger=None):
736
717
  if logger:
737
718
  logger.debug("No identification results to process")
738
719
  return
739
-
720
+
740
721
  if not hasattr(study, "lib_df") or study.lib_df is None or study.lib_df.is_empty():
741
722
  if logger:
742
723
  logger.debug("No library data available")
743
724
  return
744
-
725
+
745
726
  if not hasattr(study, "consensus_df") or study.consensus_df is None or study.consensus_df.is_empty():
746
727
  if logger:
747
- logger.debug("No consensus data available")
728
+ logger.debug("No consensus data available")
748
729
  return
749
730
 
750
731
  # Get library columns we need (include mz for validation)
@@ -754,50 +735,45 @@ def _update_consensus_id_columns(study, logger=None):
754
735
 
755
736
  # FIX 1: Join identification results with consensus m/z for validation
756
737
  id_with_consensus = study.id_df.join(
757
- study.consensus_df.select(["consensus_uid", "mz"]),
758
- on="consensus_uid",
759
- how="left",
760
- suffix="_consensus"
738
+ study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left", suffix="_consensus"
761
739
  )
762
740
 
763
741
  # FIX 2: Validate m/z accuracy - filter out poor matches
764
742
  id_with_lib = id_with_consensus.join(
765
- study.lib_df.select(["lib_uid", "mz"]),
766
- on="lib_uid",
767
- how="left",
768
- suffix="_lib"
743
+ study.lib_df.select(["lib_uid", "mz"]), on="lib_uid", how="left", suffix="_lib"
769
744
  )
770
-
745
+
771
746
  # Calculate actual m/z error and filter out excessive errors
772
- id_validated = id_with_lib.with_columns([
773
- (pl.col("mz") - pl.col("mz_lib")).abs().alias("actual_mz_error")
774
- ])
775
-
747
+ id_validated = id_with_lib.with_columns([(pl.col("mz") - pl.col("mz_lib")).abs().alias("actual_mz_error")])
748
+
776
749
  # Filter out matches with excessive m/z error
777
750
  max_reasonable_error = 0.02 # 20 millidalton maximum error
778
751
  id_validated = id_validated.filter(
779
752
  (pl.col("actual_mz_error") <= max_reasonable_error) | pl.col("actual_mz_error").is_null()
780
753
  )
781
-
754
+
782
755
  if logger:
783
756
  original_count = len(id_with_consensus)
784
757
  validated_count = len(id_validated)
785
758
  if validated_count < original_count:
786
- logger.warning(f"Filtered out {original_count - validated_count} identifications with excessive m/z error (>{max_reasonable_error:.3f} Da)")
759
+ logger.warning(
760
+ f"Filtered out {original_count - validated_count} identifications with excessive m/z error (>{max_reasonable_error:.3f} Da)"
761
+ )
787
762
 
788
763
  # Get top-scoring identification for each consensus feature (from validated results)
789
764
  top_ids = (
790
- id_validated
791
- .sort(["consensus_uid", "score"], descending=[False, True])
765
+ id_validated.sort(["consensus_uid", "score"], descending=[False, True])
792
766
  .group_by("consensus_uid", maintain_order=True)
793
767
  .first()
794
768
  .join(study.lib_df.select(lib_columns), on="lib_uid", how="left")
795
769
  .select([
796
770
  "consensus_uid",
797
771
  "name",
798
- pl.col("class").alias("id_top_class") if "class" in lib_columns else pl.lit(None, dtype=pl.String).alias("id_top_class"),
772
+ pl.col("class").alias("id_top_class")
773
+ if "class" in lib_columns
774
+ else pl.lit(None, dtype=pl.String).alias("id_top_class"),
799
775
  pl.col("adduct").alias("id_top_adduct"),
800
- pl.col("score").alias("id_top_score")
776
+ pl.col("score").alias("id_top_score"),
801
777
  ])
802
778
  .rename({"name": "id_top_name"})
803
779
  )
@@ -805,28 +781,23 @@ def _update_consensus_id_columns(study, logger=None):
805
781
  # FIX 3: Check for conflicts where same compound+adduct assigned to very different m/z
806
782
  if not top_ids.is_empty():
807
783
  compound_groups = (
808
- top_ids
809
- .join(study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left")
784
+ top_ids.join(study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left")
810
785
  .group_by(["id_top_name", "id_top_adduct"])
811
786
  .agg([
812
787
  pl.col("consensus_uid").count().alias("count"),
813
788
  pl.col("mz").min().alias("mz_min"),
814
- pl.col("mz").max().alias("mz_max")
815
- ])
816
- .with_columns([
817
- (pl.col("mz_max") - pl.col("mz_min")).alias("mz_range")
789
+ pl.col("mz").max().alias("mz_max"),
818
790
  ])
791
+ .with_columns([(pl.col("mz_max") - pl.col("mz_min")).alias("mz_range")])
819
792
  )
820
-
793
+
821
794
  # Find problematic assignments (same compound+adduct with >0.1 Da m/z range)
822
- problematic = compound_groups.filter(
823
- (pl.col("count") > 1) & (pl.col("mz_range") > 0.1)
824
- )
825
-
795
+ problematic = compound_groups.filter((pl.col("count") > 1) & (pl.col("mz_range") > 0.1))
796
+
826
797
  if not problematic.is_empty() and logger:
827
798
  for row in problematic.iter_rows(named=True):
828
799
  name = row["id_top_name"]
829
- adduct = row["id_top_adduct"]
800
+ adduct = row["id_top_adduct"]
830
801
  count = row["count"]
831
802
  mz_range = row["mz_range"]
832
803
  logger.warning(
@@ -836,15 +807,13 @@ def _update_consensus_id_columns(study, logger=None):
836
807
  # Ensure we have the id_top columns in consensus_df
837
808
  for col_name, dtype in [
838
809
  ("id_top_name", pl.String),
839
- ("id_top_class", pl.String),
810
+ ("id_top_class", pl.String),
840
811
  ("id_top_adduct", pl.String),
841
812
  ("id_top_score", pl.Float64),
842
- ("id_source", pl.String)
813
+ ("id_source", pl.String),
843
814
  ]:
844
815
  if col_name not in study.consensus_df.columns:
845
- study.consensus_df = study.consensus_df.with_columns(
846
- pl.lit(None, dtype=dtype).alias(col_name)
847
- )
816
+ study.consensus_df = study.consensus_df.with_columns(pl.lit(None, dtype=dtype).alias(col_name))
848
817
 
849
818
  # Create a mapping dictionary for efficient updates
850
819
  id_mapping = {}
@@ -854,42 +823,36 @@ def _update_consensus_id_columns(study, logger=None):
854
823
  "id_top_name": row["id_top_name"],
855
824
  "id_top_class": row["id_top_class"],
856
825
  "id_top_adduct": row["id_top_adduct"],
857
- "id_top_score": row["id_top_score"]
826
+ "id_top_score": row["id_top_score"],
858
827
  }
859
828
 
860
829
  # Update consensus_df using map_elements (safer than join for avoiding duplicates)
861
830
  if id_mapping:
862
831
  study.consensus_df = study.consensus_df.with_columns([
863
- pl.col("consensus_uid").map_elements(
864
- lambda uid: id_mapping.get(uid, {}).get("id_top_name"),
865
- return_dtype=pl.String
866
- ).alias("id_top_name"),
867
- pl.col("consensus_uid").map_elements(
868
- lambda uid: id_mapping.get(uid, {}).get("id_top_class"),
869
- return_dtype=pl.String
870
- ).alias("id_top_class"),
871
- pl.col("consensus_uid").map_elements(
872
- lambda uid: id_mapping.get(uid, {}).get("id_top_adduct"),
873
- return_dtype=pl.String
874
- ).alias("id_top_adduct"),
875
- pl.col("consensus_uid").map_elements(
876
- lambda uid: id_mapping.get(uid, {}).get("id_top_score"),
877
- return_dtype=pl.Float64
878
- ).alias("id_top_score")
832
+ pl.col("consensus_uid")
833
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_name"), return_dtype=pl.String)
834
+ .alias("id_top_name"),
835
+ pl.col("consensus_uid")
836
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_class"), return_dtype=pl.String)
837
+ .alias("id_top_class"),
838
+ pl.col("consensus_uid")
839
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_adduct"), return_dtype=pl.String)
840
+ .alias("id_top_adduct"),
841
+ pl.col("consensus_uid")
842
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_score"), return_dtype=pl.Float64)
843
+ .alias("id_top_score"),
879
844
  ])
880
845
 
881
846
  if logger:
882
847
  num_updated = len(id_mapping)
883
848
  logger.debug(f"Updated consensus_df with top identifications for {num_updated} features")
884
-
849
+
885
850
  except Exception as e:
886
851
  if logger:
887
852
  logger.error(f"Error updating consensus_df with identification results: {e}")
888
853
  # Don't re-raise to avoid breaking the identification process
889
854
 
890
855
 
891
-
892
-
893
856
  def identify(study, features=None, params=None, **kwargs):
894
857
  """Identify consensus features against the loaded library.
895
858
 
@@ -915,12 +878,12 @@ def identify(study, features=None, params=None, **kwargs):
915
878
  """
916
879
  # Get logger from study if available
917
880
  logger = getattr(study, "logger", None)
918
-
881
+
919
882
  # Setup parameters early
920
883
  params = _setup_identify_parameters(params, kwargs)
921
884
  effective_mz_tol = getattr(params, "mz_tol", 0.01)
922
885
  effective_rt_tol = getattr(params, "rt_tol", 2.0)
923
-
886
+
924
887
  if logger:
925
888
  logger.debug(
926
889
  f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
@@ -937,7 +900,7 @@ def identify(study, features=None, params=None, **kwargs):
937
900
 
938
901
  # Smart reset of id_df: only clear results for features being re-identified
939
902
  _smart_reset_id_results(study, target_uids, logger)
940
-
903
+
941
904
  # Cache adduct probabilities (expensive operation)
942
905
  adduct_prob_map = _get_cached_adduct_probabilities(study, logger)
943
906
 
@@ -1037,9 +1000,7 @@ def get_id(study, features=None) -> pl.DataFrame:
1037
1000
  # Join with consensus_df to get consensus feature m/z and RT
1038
1001
  consensus_cols = ["consensus_uid", "mz", "rt"]
1039
1002
  # Only select columns that exist in consensus_df
1040
- available_consensus_cols = [
1041
- col for col in consensus_cols if col in study.consensus_df.columns
1042
- ]
1003
+ available_consensus_cols = [col for col in consensus_cols if col in study.consensus_df.columns]
1043
1004
 
1044
1005
  result_df = result_df.join(
1045
1006
  study.consensus_df.select(available_consensus_cols),
@@ -1101,9 +1062,7 @@ def get_id(study, features=None) -> pl.DataFrame:
1101
1062
  column_order.extend(remaining_cols)
1102
1063
 
1103
1064
  # Filter out None values and select existing columns
1104
- final_column_order = [
1105
- col for col in column_order if col is not None and col in result_df.columns
1106
- ]
1065
+ final_column_order = [col for col in column_order if col is not None and col in result_df.columns]
1107
1066
 
1108
1067
  result_df = result_df.select(final_column_order)
1109
1068
 
@@ -1115,10 +1074,7 @@ def get_id(study, features=None) -> pl.DataFrame:
1115
1074
  pl.col("cmpd_uid").n_unique().alias("num_cmpds")
1116
1075
  if "cmpd_uid" in result_df.columns
1117
1076
  else pl.lit(None).alias("num_cmpds"),
1118
- pl.col("formula")
1119
- .filter(pl.col("formula").is_not_null())
1120
- .n_unique()
1121
- .alias("num_formulas")
1077
+ pl.col("formula").filter(pl.col("formula").is_not_null()).n_unique().alias("num_formulas")
1122
1078
  if "formula" in result_df.columns
1123
1079
  else pl.lit(None).alias("num_formulas"),
1124
1080
  ],
@@ -1177,9 +1133,7 @@ def get_id(study, features=None) -> pl.DataFrame:
1177
1133
 
1178
1134
  # Get the highest scoring entry's RT as reference
1179
1135
  reference_rt = (
1180
- group_df["rt"][0]
1181
- if "rt" in group_df.columns and group_df["rt"][0] is not None
1182
- else None
1136
+ group_df["rt"][0] if "rt" in group_df.columns and group_df["rt"][0] is not None else None
1183
1137
  )
1184
1138
 
1185
1139
  # Filter entries: keep those with same RT as highest scoring entry
@@ -1193,11 +1147,7 @@ def get_id(study, features=None) -> pl.DataFrame:
1193
1147
  rt_filtered = group_df
1194
1148
 
1195
1149
  # Check multiply charged constraint
1196
- if (
1197
- "z" in rt_filtered.columns
1198
- and "adduct" in rt_filtered.columns
1199
- and len(rt_filtered) > 0
1200
- ):
1150
+ if "z" in rt_filtered.columns and "adduct" in rt_filtered.columns and len(rt_filtered) > 0:
1201
1151
  # Check if there are multiply charged adducts
1202
1152
  multiply_charged = rt_filtered.filter(
1203
1153
  (pl.col("z") > 1) | (pl.col("z") < -1),
@@ -1259,7 +1209,7 @@ def id_reset(study):
1259
1209
  if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
1260
1210
  if logger:
1261
1211
  logger.debug("Resetting id_top_* columns in consensus_df")
1262
-
1212
+
1263
1213
  # Check which columns exist before trying to update them
1264
1214
  id_columns_to_reset = []
1265
1215
  for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
@@ -1268,7 +1218,7 @@ def id_reset(study):
1268
1218
  id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
1269
1219
  else:
1270
1220
  id_columns_to_reset.append(pl.lit(None, dtype=pl.String).alias(col))
1271
-
1221
+
1272
1222
  if id_columns_to_reset:
1273
1223
  study.consensus_df = study.consensus_df.with_columns(id_columns_to_reset)
1274
1224
 
@@ -1306,24 +1256,24 @@ def lib_reset(study):
1306
1256
  if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
1307
1257
  if logger:
1308
1258
  logger.debug("Checking for consensus features created by lib_to_consensus()")
1309
-
1259
+
1310
1260
  try:
1311
1261
  # Filter for features created by lib_to_consensus()
1312
1262
  # These can be identified by:
1313
1263
  # 1. number_samples < 1 (set to 0.0 by lib_to_consensus)
1314
1264
  # 2. AND have corresponding entries in consensus_mapping_df with sample_uid = 0 (virtual sample)
1315
-
1265
+
1316
1266
  # First check if we have any features with number_samples < 1
1317
1267
  potential_lib_features = study.consensus_df.filter(pl.col("number_samples") < 1)
1318
-
1268
+
1319
1269
  if potential_lib_features is not None and not potential_lib_features.is_empty():
1320
1270
  # Further filter by checking if they have sample_uid = 0 in consensus_mapping_df
1321
1271
  # This ensures we only remove library-derived features, not legitimate features with 0 samples
1322
1272
  if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
1323
- lib_consensus_uids = study.consensus_mapping_df.filter(
1324
- pl.col("sample_uid") == 0
1325
- )["consensus_uid"].unique().to_list()
1326
-
1273
+ lib_consensus_uids = (
1274
+ study.consensus_mapping_df.filter(pl.col("sample_uid") == 0)["consensus_uid"].unique().to_list()
1275
+ )
1276
+
1327
1277
  if lib_consensus_uids:
1328
1278
  lib_consensus_features = potential_lib_features.filter(
1329
1279
  pl.col("consensus_uid").is_in(lib_consensus_uids)
@@ -1335,15 +1285,15 @@ def lib_reset(study):
1335
1285
  lib_consensus_features = potential_lib_features
1336
1286
  else:
1337
1287
  lib_consensus_features = pl.DataFrame() # No features with number_samples < 1
1338
-
1288
+
1339
1289
  if lib_consensus_features is not None and not lib_consensus_features.is_empty():
1340
1290
  num_lib_features = len(lib_consensus_features)
1341
1291
  if logger:
1342
1292
  logger.info(f"Removing {num_lib_features} consensus features created by lib_to_consensus()")
1343
-
1293
+
1344
1294
  # Use consensus_delete to remove these features and all dependent data
1345
1295
  study.consensus_delete(lib_consensus_features)
1346
-
1296
+
1347
1297
  if logger:
1348
1298
  logger.debug("Successfully removed library-derived consensus features")
1349
1299
  else:
@@ -1375,7 +1325,7 @@ def lib_reset(study):
1375
1325
  if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
1376
1326
  if logger:
1377
1327
  logger.debug("Resetting id_top_* columns in consensus_df")
1378
-
1328
+
1379
1329
  # Check which columns exist before trying to update them
1380
1330
  id_columns_to_reset = []
1381
1331
  for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
@@ -1384,7 +1334,7 @@ def lib_reset(study):
1384
1334
  id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
1385
1335
  else:
1386
1336
  id_columns_to_reset.append(pl.lit(None, dtype=pl.String).alias(col))
1387
-
1337
+
1388
1338
  if id_columns_to_reset:
1389
1339
  study.consensus_df = study.consensus_df.with_columns(id_columns_to_reset)
1390
1340
 
@@ -1399,7 +1349,7 @@ def lib_reset(study):
1399
1349
  if logger:
1400
1350
  logger.debug("Removing 'lib_load' from history")
1401
1351
  del study.history["lib_load"]
1402
-
1352
+
1403
1353
  if "lib_to_consensus" in study.history:
1404
1354
  if logger:
1405
1355
  logger.debug("Removing 'lib_to_consensus' from history")
@@ -1445,9 +1395,7 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
1445
1395
  adducts_list_to_use = adducts_list
1446
1396
  if adducts_list_to_use is None:
1447
1397
  adducts_list_to_use = (
1448
- study.parameters.adducts
1449
- if hasattr(study.parameters, "adducts") and study.parameters.adducts
1450
- else []
1398
+ study.parameters.adducts if hasattr(study.parameters, "adducts") and study.parameters.adducts else []
1451
1399
  )
1452
1400
 
1453
1401
  # Get parameters with study-specific defaults
@@ -1561,11 +1509,9 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
1561
1509
  {
1562
1510
  "components": components,
1563
1511
  "formatted_name": formatted_name,
1564
- "total_mass_shift": float(pos_spec["mass_shift"])
1565
- + float(neut_spec["mass_shift"]),
1512
+ "total_mass_shift": float(pos_spec["mass_shift"]) + float(neut_spec["mass_shift"]),
1566
1513
  "total_charge": total_charge,
1567
- "combined_probability": float(pos_spec["probability"])
1568
- * float(neut_spec["probability"]),
1514
+ "combined_probability": float(pos_spec["probability"]) * float(neut_spec["probability"]),
1569
1515
  "complexity": 2,
1570
1516
  },
1571
1517
  )
@@ -1739,9 +1685,7 @@ def _format_adduct_name(components: list[dict]) -> str:
1739
1685
  elif abs(total_charge) == 1:
1740
1686
  charge_str = "1+" if total_charge > 0 else "1-"
1741
1687
  else:
1742
- charge_str = (
1743
- f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
1744
- )
1688
+ charge_str = f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
1745
1689
 
1746
1690
  return f"[M{formula}]{charge_str}"
1747
1691
 
@@ -1749,53 +1693,53 @@ def _format_adduct_name(components: list[dict]) -> str:
1749
1693
  def _generate_13c_isotopes(lib_df):
1750
1694
  """
1751
1695
  Generate 13C isotope variants for library entries.
1752
-
1696
+
1753
1697
  For each compound with n carbon atoms, creates n+1 entries:
1754
1698
  - iso=0: original compound (no 13C)
1755
1699
  - iso=1: one 13C isotope (+1.00335 Da)
1756
1700
  - iso=2: two 13C isotopes (+2.00670 Da)
1757
1701
  - ...
1758
1702
  - iso=n: n 13C isotopes (+n*1.00335 Da)
1759
-
1703
+
1760
1704
  All isotopomers share the same quant_group.
1761
-
1705
+
1762
1706
  Args:
1763
1707
  lib_df: Polars DataFrame with library entries
1764
-
1708
+
1765
1709
  Returns:
1766
1710
  Polars DataFrame with additional 13C isotope entries
1767
1711
  """
1768
1712
  if lib_df.is_empty():
1769
1713
  return lib_df
1770
-
1714
+
1771
1715
  # First, ensure all original entries have iso=0
1772
1716
  original_df = lib_df.with_columns(pl.lit(0).alias("iso"))
1773
-
1717
+
1774
1718
  isotope_entries = []
1775
1719
  next_lib_uid = lib_df["lib_uid"].max() + 1 if len(lib_df) > 0 else 1
1776
-
1720
+
1777
1721
  # Mass difference for one 13C isotope
1778
1722
  c13_mass_shift = 1.00335 # Mass difference between 13C and 12C
1779
-
1723
+
1780
1724
  for row in original_df.iter_rows(named=True):
1781
1725
  formula = row.get("formula", "")
1782
1726
  if not formula:
1783
1727
  continue
1784
-
1728
+
1785
1729
  # Count carbon atoms in the formula
1786
1730
  carbon_count = _count_carbon_atoms(formula)
1787
1731
  if carbon_count == 0:
1788
1732
  continue
1789
-
1733
+
1790
1734
  # Get the original quant_group to keep it consistent across isotopes
1791
1735
  # All isotopomers of the same compound should have the same quant_group
1792
1736
  quant_group = row.get("quant_group", row.get("cmpd_uid", row.get("lib_uid", 1)))
1793
-
1737
+
1794
1738
  # Generate isotope variants (1 to n 13C atoms)
1795
1739
  for iso_num in range(1, carbon_count + 1):
1796
1740
  # Calculate mass shift for this number of 13C isotopes
1797
1741
  mass_shift = iso_num * c13_mass_shift
1798
-
1742
+
1799
1743
  # Create new entry
1800
1744
  isotope_entry = dict(row) # Copy all fields
1801
1745
  isotope_entry["lib_uid"] = next_lib_uid
@@ -1803,10 +1747,10 @@ def _generate_13c_isotopes(lib_df):
1803
1747
  isotope_entry["m"] = row["m"] + mass_shift
1804
1748
  isotope_entry["mz"] = (row["m"] + mass_shift) / abs(row["z"]) if row["z"] != 0 else row["m"] + mass_shift
1805
1749
  isotope_entry["quant_group"] = quant_group # Keep same quant_group
1806
-
1750
+
1807
1751
  isotope_entries.append(isotope_entry)
1808
1752
  next_lib_uid += 1
1809
-
1753
+
1810
1754
  # Combine original entries (now with iso=0) with isotope entries
1811
1755
  if isotope_entries:
1812
1756
  isotope_df = pl.DataFrame(isotope_entries)
@@ -1818,7 +1762,7 @@ def _generate_13c_isotopes(lib_df):
1818
1762
  # Get common schema
1819
1763
  original_schema = original_df.schema
1820
1764
  isotope_schema = isotope_df.schema
1821
-
1765
+
1822
1766
  # Cast isotope_df columns to match original_df schema where possible
1823
1767
  cast_exprs = []
1824
1768
  for col_name in isotope_df.columns:
@@ -1827,7 +1771,7 @@ def _generate_13c_isotopes(lib_df):
1827
1771
  cast_exprs.append(pl.col(col_name).cast(target_dtype, strict=False))
1828
1772
  else:
1829
1773
  cast_exprs.append(pl.col(col_name))
1830
-
1774
+
1831
1775
  isotope_df_cast = isotope_df.select(cast_exprs)
1832
1776
  return pl.concat([original_df, isotope_df_cast])
1833
1777
  else:
@@ -1837,75 +1781,75 @@ def _generate_13c_isotopes(lib_df):
1837
1781
  def _count_carbon_atoms(formula: str) -> int:
1838
1782
  """
1839
1783
  Count the number of carbon atoms in a molecular formula.
1840
-
1784
+
1841
1785
  Args:
1842
1786
  formula: Molecular formula string like "C6H12O6"
1843
-
1787
+
1844
1788
  Returns:
1845
1789
  Number of carbon atoms
1846
1790
  """
1847
1791
  import re
1848
-
1792
+
1849
1793
  if not formula or not isinstance(formula, str):
1850
1794
  return 0
1851
-
1795
+
1852
1796
  # Look for carbon followed by optional number
1853
1797
  # C followed by digits, or just C (which means 1)
1854
- carbon_matches = re.findall(r'C(\d*)', formula)
1855
-
1798
+ carbon_matches = re.findall(r"C(\d*)", formula)
1799
+
1856
1800
  total_carbons = 0
1857
1801
  for match in carbon_matches:
1858
- if match == '':
1802
+ if match == "":
1859
1803
  # Just 'C' without number means 1 carbon
1860
1804
  total_carbons += 1
1861
1805
  else:
1862
1806
  # 'C' followed by number
1863
1807
  total_carbons += int(match)
1864
-
1808
+
1865
1809
  return total_carbons
1866
1810
 
1867
1811
 
1868
1812
  def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_tol: float = 2.0):
1869
1813
  """Create consensus features from library entries instead of features_df.
1870
-
1814
+
1871
1815
  This method takes all rows from lib_df and creates corresponding entries in
1872
- consensus_df with the same columns as merge(). Instead of relying on
1816
+ consensus_df with the same columns as merge(). Instead of relying on
1873
1817
  features_df, it populates consensus features directly from library data.
1874
-
1818
+
1875
1819
  Before creating new features, it checks for pre-existing consensus features:
1876
1820
  - If rt in lib_df is null: picks consensus feature with matching mz and largest inty_mean
1877
1821
  - If rt is not null: picks consensus feature with matching mz and rt within tolerance
1878
1822
  - If a match is found, skips to the next library entry
1879
-
1823
+
1880
1824
  Args:
1881
1825
  study: Study instance with lib_df populated
1882
- chrom_fhwm: Chromatographic full width at half maximum in seconds
1826
+ chrom_fhwm: Chromatographic full width at half maximum in seconds
1883
1827
  to infer rt_start_mean and rt_end_mean (default: 5.0)
1884
1828
  mz_tol: m/z tolerance for matching existing consensus features (default: 0.01)
1885
1829
  rt_tol: RT tolerance for matching existing consensus features (default: 2.0)
1886
-
1830
+
1887
1831
  Side effects:
1888
1832
  Adds rows to study.consensus_df and study.consensus_mapping_df
1889
1833
  Calls study.find_ms2() at the end
1890
1834
  """
1891
1835
  # Get logger from study if available
1892
1836
  logger = getattr(study, "logger", None)
1893
-
1837
+
1894
1838
  # Validate inputs
1895
1839
  if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
1896
1840
  if logger:
1897
1841
  logger.error("Library (study.lib_df) is empty; call lib_load() first")
1898
1842
  raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
1899
-
1843
+
1900
1844
  if logger:
1901
1845
  logger.info(f"Creating consensus features from {len(study.lib_df)} library entries")
1902
-
1846
+
1903
1847
  # Initialize consensus DataFrames if they don't exist
1904
1848
  if not hasattr(study, "consensus_df") or study.consensus_df is None:
1905
1849
  study.consensus_df = pl.DataFrame()
1906
1850
  if not hasattr(study, "consensus_mapping_df") or study.consensus_mapping_df is None:
1907
1851
  study.consensus_mapping_df = pl.DataFrame()
1908
-
1852
+
1909
1853
  # Get cached adducts for consistent adduct handling
1910
1854
  cached_adducts_df = None
1911
1855
  cached_valid_adducts = None
@@ -1919,26 +1863,26 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
1919
1863
  if logger:
1920
1864
  logger.warning(f"Could not retrieve study adducts: {e}")
1921
1865
  cached_valid_adducts = set()
1922
-
1866
+
1923
1867
  # Always allow '?' adducts
1924
1868
  cached_valid_adducts.add("?")
1925
-
1869
+
1926
1870
  # Get starting consensus_uid counter
1927
1871
  if not study.consensus_df.is_empty():
1928
1872
  max_existing_uid = study.consensus_df["consensus_uid"].max()
1929
1873
  consensus_uid_counter = int(max_existing_uid) + 1 if max_existing_uid is not None else 0
1930
1874
  else:
1931
1875
  consensus_uid_counter = 0
1932
-
1876
+
1933
1877
  # Track [M+H] iso=0 and [M-H] iso=0 entries for adduct grouping
1934
1878
  base_adduct_groups = {} # key: (mz, adduct_base), value: adduct_group
1935
-
1879
+
1936
1880
  # Process each library entry
1937
1881
  consensus_metadata = []
1938
1882
  consensus_mapping_list = []
1939
1883
  matched_count = 0
1940
1884
  skipped_count = 0
1941
-
1885
+
1942
1886
  for lib_row in study.lib_df.iter_rows(named=True):
1943
1887
  # Extract basic library data
1944
1888
  lib_uid = lib_row.get("lib_uid")
@@ -1947,21 +1891,19 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
1947
1891
  iso = lib_row.get("iso", 0)
1948
1892
  adduct = lib_row.get("adduct")
1949
1893
  z = lib_row.get("z", 1) # charge
1950
-
1894
+
1951
1895
  # Skip entries without essential data
1952
1896
  if mz is None:
1953
1897
  if logger:
1954
1898
  logger.warning(f"Skipping library entry {lib_uid} - no m/z value")
1955
1899
  continue
1956
-
1900
+
1957
1901
  # Check for pre-existing consensus features
1958
1902
  existing_match = None
1959
1903
  if not study.consensus_df.is_empty():
1960
1904
  # Filter by m/z tolerance first
1961
- mz_matches = study.consensus_df.filter(
1962
- (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol)
1963
- )
1964
-
1905
+ mz_matches = study.consensus_df.filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
1906
+
1965
1907
  if not mz_matches.is_empty():
1966
1908
  if rt is None:
1967
1909
  # If rt is null, pick the consensus feature with largest inty_mean
@@ -1974,7 +1916,7 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
1974
1916
  )
1975
1917
  if not rt_matches.is_empty():
1976
1918
  existing_match = rt_matches.sort("inty_mean", descending=True).head(1)
1977
-
1919
+
1978
1920
  if existing_match is not None and len(existing_match) > 0:
1979
1921
  # Found a matching consensus feature, skip this library entry
1980
1922
  matched_count += 1
@@ -1982,27 +1924,29 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
1982
1924
  match_uid = existing_match["consensus_uid"][0]
1983
1925
  match_mz = existing_match["mz"][0]
1984
1926
  match_rt = existing_match["rt"][0]
1985
- logger.debug(f"Library entry {lib_uid} (mz={mz:.4f}, rt={rt}) matched existing consensus {match_uid} (mz={match_mz:.4f}, rt={match_rt})")
1927
+ logger.debug(
1928
+ f"Library entry {lib_uid} (mz={mz:.4f}, rt={rt}) matched existing consensus {match_uid} (mz={match_mz:.4f}, rt={match_rt})"
1929
+ )
1986
1930
  continue
1987
-
1931
+
1988
1932
  # No match found, create new consensus feature
1989
1933
  # Handle missing RT - use 0 as placeholder
1990
1934
  if rt is None:
1991
1935
  rt = 0.0
1992
1936
  if logger and skipped_count < 5: # Log first few
1993
1937
  logger.debug(f"Library entry {lib_uid} has no RT, using 0.0")
1994
-
1938
+
1995
1939
  # Calculate RT range based on chrom_fhwm
1996
1940
  half_width = chrom_fhwm / 2.0
1997
1941
  rt_start = rt - half_width
1998
1942
  rt_end = rt + half_width
1999
-
1943
+
2000
1944
  # Get adduct information
2001
1945
  adduct_top = adduct if adduct else "?"
2002
1946
  adduct_charge_top = None
2003
1947
  adduct_mass_shift_top = None
2004
1948
  adduct_mass_neutral_top = None
2005
-
1949
+
2006
1950
  # Parse adduct to get charge and mass shift
2007
1951
  if adduct_top and cached_adducts_df is not None and not cached_adducts_df.is_empty():
2008
1952
  # Look for exact match in study adducts
@@ -2011,7 +1955,7 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2011
1955
  adduct_row = matching_adduct.row(0, named=True)
2012
1956
  adduct_charge_top = adduct_row["charge"]
2013
1957
  adduct_mass_shift_top = adduct_row["mass_shift"]
2014
-
1958
+
2015
1959
  # Fallback to default values if not found
2016
1960
  if adduct_charge_top is None:
2017
1961
  adduct_charge_top = int(z) if z else 1
@@ -2029,15 +1973,15 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2029
1973
  adduct_mass_shift_top = 1.007825
2030
1974
  if adduct_top == "?":
2031
1975
  adduct_top = "[M+?]1+"
2032
-
1976
+
2033
1977
  # Calculate neutral mass
2034
1978
  if adduct_charge_top and adduct_mass_shift_top is not None:
2035
1979
  adduct_mass_neutral_top = mz * abs(adduct_charge_top) - adduct_mass_shift_top
2036
-
1980
+
2037
1981
  # Determine adduct group for isotopologues and related adducts
2038
1982
  adduct_group = consensus_uid_counter # Default: each entry gets its own group
2039
1983
  adduct_of = 0 # Default: this is the base adduct
2040
-
1984
+
2041
1985
  # Track base adducts ([M+H] iso=0 or [M-H] iso=0) for grouping
2042
1986
  base_adduct_key = None
2043
1987
  if iso == 0 and adduct_top in ["[M+H]+", "[M+H]1+", "[M-H]-", "[M-H]1-"]:
@@ -2049,21 +1993,22 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2049
1993
  # Calculate the base m/z (subtract isotope mass shifts)
2050
1994
  c13_mass_shift = 1.00335
2051
1995
  base_mz = mz - (iso * c13_mass_shift / abs(adduct_charge_top))
2052
-
1996
+
2053
1997
  # Look for matching base adduct
2054
1998
  for (stored_mz, stored_adduct), stored_group in base_adduct_groups.items():
2055
1999
  if abs(stored_mz - base_mz) < mz_tol and stored_adduct == adduct_top:
2056
2000
  adduct_group = stored_group
2057
2001
  adduct_of = stored_group
2058
2002
  break
2059
-
2003
+
2060
2004
  # Create adduct values list with proper structure (format: structured data with fields: adduct, count, percentage, mass)
2061
2005
  adduct_values = [{"adduct": adduct_top, "count": 1, "percentage": 100.0, "mass": 0.0}]
2062
-
2006
+
2063
2007
  # Generate unique consensus_id string
2064
2008
  import uuid
2065
- consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
2066
-
2009
+
2010
+ consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
2011
+
2067
2012
  # Build consensus metadata with requested modifications for new entries
2068
2013
  metadata = {
2069
2014
  "consensus_uid": consensus_uid_counter,
@@ -2096,7 +2041,9 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2096
2041
  "adducts": adduct_values,
2097
2042
  "adduct_charge_top": adduct_charge_top,
2098
2043
  "adduct_group": adduct_group, # Use calculated adduct group
2099
- "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
2044
+ "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
2045
+ if adduct_mass_neutral_top is not None
2046
+ else None,
2100
2047
  "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
2101
2048
  "adduct_of": adduct_of, # Use calculated adduct_of
2102
2049
  "adduct_top": adduct_top,
@@ -2105,9 +2052,9 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2105
2052
  "id_top_adduct": None, # Set to null as requested
2106
2053
  "id_top_score": None, # Set to null as requested
2107
2054
  }
2108
-
2055
+
2109
2056
  consensus_metadata.append(metadata)
2110
-
2057
+
2111
2058
  # Create mapping entry (maps to library entry as "virtual" feature)
2112
2059
  # Use lib_uid as the feature_uid and a virtual sample_uid of 0
2113
2060
  # Match existing consensus_mapping_df column order: consensus_uid, feature_uid, sample_uid
@@ -2116,18 +2063,20 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2116
2063
  "feature_uid": lib_uid, # Use lib_uid as feature reference
2117
2064
  "sample_uid": 0, # Virtual sample for library entries
2118
2065
  })
2119
-
2066
+
2120
2067
  consensus_uid_counter += 1
2121
-
2068
+
2122
2069
  # Log matching statistics
2123
2070
  if logger:
2124
2071
  total_processed = matched_count + len(consensus_metadata)
2125
- logger.info(f"Processed {total_processed} library entries: {matched_count} matched existing consensus features, {len(consensus_metadata)} created new features")
2126
-
2072
+ logger.info(
2073
+ f"Processed {total_processed} library entries: {matched_count} matched existing consensus features, {len(consensus_metadata)} created new features"
2074
+ )
2075
+
2127
2076
  # Convert to DataFrames with proper schema alignment
2128
2077
  if consensus_metadata:
2129
2078
  new_consensus_df = pl.DataFrame(consensus_metadata, strict=False)
2130
-
2079
+
2131
2080
  # Ensure schema compatibility with existing consensus_df
2132
2081
  if not study.consensus_df.is_empty():
2133
2082
  # Cast columns to match existing schema
@@ -2143,36 +2092,36 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2143
2092
  cast_exprs.append(pl.col(col_name).cast(target_dtype, strict=False))
2144
2093
  else:
2145
2094
  cast_exprs.append(pl.col(col_name))
2146
-
2095
+
2147
2096
  new_consensus_df = new_consensus_df.select(cast_exprs)
2148
-
2097
+
2149
2098
  new_consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)
2150
-
2099
+
2151
2100
  # Append to existing DataFrames
2152
2101
  if not study.consensus_df.is_empty():
2153
2102
  study.consensus_df = pl.concat([study.consensus_df, new_consensus_df])
2154
2103
  else:
2155
2104
  study.consensus_df = new_consensus_df
2156
-
2105
+
2157
2106
  if not study.consensus_mapping_df.is_empty():
2158
2107
  study.consensus_mapping_df = pl.concat([study.consensus_mapping_df, new_consensus_mapping_df])
2159
2108
  else:
2160
2109
  study.consensus_mapping_df = new_consensus_mapping_df
2161
-
2110
+
2162
2111
  if logger:
2163
2112
  logger.info(f"Added {len(consensus_metadata)} consensus features from library")
2164
2113
  else:
2165
2114
  if logger:
2166
2115
  logger.warning("No valid consensus features created from library")
2167
2116
  return
2168
-
2117
+
2169
2118
  # Store operation in history
2170
2119
  if hasattr(study, "update_history"):
2171
2120
  study.update_history(
2172
2121
  ["lib_to_consensus"],
2173
2122
  {"chrom_fhwm": chrom_fhwm, "lib_entries": len(study.lib_df)},
2174
2123
  )
2175
-
2124
+
2176
2125
  # Perform find_ms2 at the end
2177
2126
  try:
2178
2127
  if hasattr(study, "find_ms2"):
@@ -2185,6 +2134,6 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
2185
2134
  except Exception as e:
2186
2135
  if logger:
2187
2136
  logger.warning(f"find_ms2 failed: {e}")
2188
-
2137
+
2189
2138
  if logger:
2190
2139
  logger.success(f"lib_to_consensus completed: {len(consensus_metadata)} features added")