masster 0.4.11-py3-none-any.whl → 0.4.13-py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of masster might be problematic.

masster/study/id.py CHANGED
@@ -16,13 +16,7 @@ def lib_load(
16
16
  polarity: str | None = None,
17
17
  adducts: list | None = None,
18
18
  ):
19
- """Load a co # Add compound and formula count columns
20
- if "consensus_uid" in result_df.columns:
21
- # Calculate counts per consensus_uid
22
- count_stats = result_df.group_by("consensus_uid").agg([
23
- pl.col("cmpd_uid").filter(pl.col("cmpd_uid").is_not_null()).n_unique().alias("num_cmpds") if "cmpd_uid" in result_df.columns else pl.lit(None).alias("num_cmpds"),
24
- pl.col("formula").filter(pl.col("formula").is_not_null()).n_unique().alias("num_formulas") if "formula" in result_df.columns else pl.lit(None).alias("num_formulas")
25
- ])library into the study.
19
+ """Load a compound library into the study.
26
20
 
27
21
  Args:
28
22
  study: Study instance
@@ -117,14 +111,17 @@ def lib_load(
117
111
  study.lib_df = (
118
112
  filtered_lf.clone()
119
113
  if hasattr(filtered_lf, "clone")
120
- else pl.DataFrame(filtered_lf)
114
+ else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, 'to_dict') else filtered_lf)
121
115
  )
122
116
  except Exception:
123
- study.lib_df = (
124
- pl.from_pandas(filtered_lf)
125
- if hasattr(filtered_lf, "to_pandas")
126
- else pl.DataFrame(filtered_lf)
127
- )
117
+ try:
118
+ study.lib_df = (
119
+ pl.from_pandas(filtered_lf)
120
+ if hasattr(filtered_lf, "to_pandas")
121
+ else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, 'to_dict') else filtered_lf)
122
+ )
123
+ except Exception:
124
+ study.lib_df = pl.DataFrame()
128
125
 
129
126
  # Store this operation in history
130
127
  if hasattr(study, "store_history"):
@@ -134,29 +131,8 @@ def lib_load(
134
131
  )
135
132
 
136
133
 
137
- def identify(study, features=None, params=None, **kwargs):
138
- """Identify consensus features against the loaded library.
139
-
140
- Matches consensus_df.mz against lib_df.mz within mz_tolerance. If rt_tolerance
141
- is provided and both consensus and library entries have rt values, RT is
142
- used as an additional filter.
143
-
144
- Args:
145
- study: Study instance
146
- features: Optional DataFrame or list of consensus_uids to identify.
147
- If None, identifies all consensus features.
148
- params: Optional identify_defaults instance with matching tolerances and scoring parameters.
149
- If None, uses default parameters.
150
- **kwargs: Individual parameter overrides (mz_tol, rt_tol, heteroatom_penalty,
151
- multiple_formulas_penalty, multiple_compounds_penalty, heteroatoms)
152
-
153
- The resulting DataFrame is stored as study.id_df. Columns:
154
- - consensus_uid
155
- - lib_uid
156
- - mz_delta
157
- - rt_delta (nullable)
158
- - score (adduct probability from _get_adducts with penalties applied)
159
- """
134
+ def _setup_identify_parameters(params, kwargs):
135
+ """Setup identification parameters with fallbacks and overrides."""
160
136
  # Import defaults class
161
137
  try:
162
138
  from masster.study.defaults.identify_def import identify_defaults
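Note: _setup_identify_parameters applies keyword overrides with a hasattr/setattr loop, so unknown keys are silently ignored rather than raising. A small self-contained sketch of that pattern; IdentifyDefaults is a stand-in for the real identify_defaults class, with the 0.01/2.0 defaults taken from the getattr fallbacks used later in identify():

class IdentifyDefaults:
    def __init__(self):
        self.mz_tol = 0.01   # Da
        self.rt_tol = 2.0    # min

def apply_overrides(params, **kwargs):
    # Only attributes that already exist on the defaults object are overridden.
    for name, value in kwargs.items():
        if hasattr(params, name):
            setattr(params, name, value)
    return params

params = apply_overrides(IdentifyDefaults(), mz_tol=0.005, not_a_param=1)
assert params.mz_tol == 0.005 and params.rt_tol == 2.0
assert not hasattr(params, "not_a_param")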
@@ -184,19 +160,251 @@ def identify(study, features=None, params=None, **kwargs):
184
160
  for param_name, value in kwargs.items():
185
161
  if hasattr(params, param_name):
186
162
  setattr(params, param_name, value)
163
+
164
+ return params
187
165
 
188
- # Get effective tolerances from params (now possibly overridden)
189
- effective_mz_tol = getattr(params, "mz_tol", 0.01)
190
- effective_rt_tol = getattr(params, "rt_tol", 2.0)
191
- # Get logger from study if available
192
- logger = getattr(study, "logger", None)
193
166
 
167
+ def _smart_reset_id_results(study, target_uids, logger):
168
+ """Smart reset of identification results - only clear what's being re-identified."""
169
+ if target_uids is not None:
170
+ # Selective reset: only clear results for features being re-identified
171
+ if hasattr(study, "id_df") and study.id_df is not None and not study.id_df.is_empty():
172
+ study.id_df = study.id_df.filter(
173
+ ~pl.col("consensus_uid").is_in(target_uids)
174
+ )
175
+ if logger:
176
+ logger.debug(f"Cleared previous results for {len(target_uids)} specific features")
177
+ elif not hasattr(study, "id_df"):
178
+ study.id_df = pl.DataFrame()
179
+ else:
180
+ # Full reset: clear all results
181
+ study.id_df = pl.DataFrame()
182
+ if logger:
183
+ logger.debug("Cleared all previous identification results")
184
+
185
+
186
+ def _get_cached_adduct_probabilities(study, logger):
187
+ """Get adduct probabilities with caching to avoid repeated expensive computation."""
188
+ # Check if we have cached results and cache key matches current parameters
189
+ current_cache_key = _get_adduct_cache_key(study)
190
+
191
+ if (hasattr(study, '_cached_adduct_probs') and
192
+ hasattr(study, '_cached_adduct_key') and
193
+ study._cached_adduct_key == current_cache_key):
194
+ if logger:
195
+ logger.debug("Using cached adduct probabilities")
196
+ return study._cached_adduct_probs
197
+
198
+ # Compute and cache
199
+ if logger:
200
+ logger.debug("Computing adduct probabilities...")
201
+ adduct_prob_map = _get_adduct_probabilities(study)
202
+ study._cached_adduct_probs = adduct_prob_map
203
+ study._cached_adduct_key = current_cache_key
204
+
194
205
  if logger:
206
+ logger.debug(f"Computed and cached probabilities for {len(adduct_prob_map)} adducts")
207
+ return adduct_prob_map
208
+
209
+
210
+ def _get_adduct_cache_key(study):
211
+ """Generate a cache key based on adduct-related parameters."""
212
+ if hasattr(study, 'parameters') and hasattr(study.parameters, 'adducts'):
213
+ adducts_str = '|'.join(sorted(study.parameters.adducts)) if study.parameters.adducts else ""
214
+ min_prob = getattr(study.parameters, 'adduct_min_probability', 0.04)
215
+ return f"adducts:{adducts_str}:min_prob:{min_prob}"
216
+ return "default"
217
+
218
+
219
+ def clear_identification_cache(study):
220
+ """Clear cached identification data (useful when parameters change)."""
221
+ cache_attrs = ['_cached_adduct_probs', '_cached_adduct_key']
222
+ for attr in cache_attrs:
223
+ if hasattr(study, attr):
224
+ delattr(study, attr)
225
+
226
+
227
+ def _perform_identification_matching(consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger):
228
+ """Perform optimized identification matching using vectorized operations where possible."""
229
+ results = []
230
+
231
+ # Get library data as arrays for faster access
232
+ lib_df = study.lib_df
233
+
234
+ if logger:
235
+ consensus_count = len(consensus_to_process)
236
+ lib_count = len(lib_df)
195
237
  logger.debug(
196
- f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
238
+ f"Identifying {consensus_count} consensus features against {lib_count} library entries",
239
+ )
240
+
241
+ # Process each consensus feature
242
+ for cons_row in consensus_to_process.iter_rows(named=True):
243
+ cons_uid = cons_row.get("consensus_uid")
244
+ cons_mz = cons_row.get("mz")
245
+ cons_rt = cons_row.get("rt")
246
+
247
+ if cons_mz is None:
248
+ if logger:
249
+ logger.debug(f"Skipping consensus feature {cons_uid} - no m/z value")
250
+ results.append({"consensus_uid": cons_uid, "matches": []})
251
+ continue
252
+
253
+ # Find matches using vectorized filtering
254
+ matches = _find_matches_vectorized(
255
+ lib_df, cons_mz, cons_rt, effective_mz_tol, effective_rt_tol, logger, cons_uid
197
256
  )
257
+
258
+ # Convert matches to result format
259
+ match_results = []
260
+ if not matches.is_empty():
261
+ for match_row in matches.iter_rows(named=True):
262
+ mz_delta = abs(cons_mz - match_row.get("mz")) if match_row.get("mz") is not None else None
263
+ lib_rt = match_row.get("rt")
264
+ rt_delta = (
265
+ abs(cons_rt - lib_rt)
266
+ if (cons_rt is not None and lib_rt is not None)
267
+ else None
268
+ )
269
+
270
+ # Get adduct probability from cached map
271
+ adduct = match_row.get("adduct")
272
+ score = adduct_prob_map.get(adduct, 1.0) if adduct else 1.0
273
+
274
+ match_results.append({
275
+ "lib_uid": match_row.get("lib_uid"),
276
+ "mz_delta": mz_delta,
277
+ "rt_delta": rt_delta,
278
+ "matcher": "ms1",
279
+ "score": score,
280
+ })
281
+
282
+ results.append({"consensus_uid": cons_uid, "matches": match_results})
283
+
284
+ return results
285
+
286
+
287
+ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, cons_uid):
288
+ """Find library matches using optimized vectorized operations."""
289
+ # Filter by m/z tolerance using vectorized operations
290
+ matches = lib_df.filter(
291
+ (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
292
+ )
293
+
294
+ initial_match_count = len(matches)
295
+
296
+ # Apply RT filter if available
297
+ if rt_tol is not None and cons_rt is not None and not matches.is_empty():
298
+ rt_matches = matches.filter(
299
+ pl.col("rt").is_not_null() &
300
+ (pl.col("rt") >= cons_rt - rt_tol) &
301
+ (pl.col("rt") <= cons_rt + rt_tol)
302
+ )
303
+
304
+ if not rt_matches.is_empty():
305
+ matches = rt_matches
306
+ if logger:
307
+ logger.debug(
308
+ f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(matches)} after RT filter"
309
+ )
310
+ else:
311
+ if logger:
312
+ logger.debug(
313
+ f"Consensus {cons_uid}: {initial_match_count} m/z matches, 0 after RT filter - using m/z matches only"
314
+ )
315
+
316
+ # Optimized deduplication using Polars operations
317
+ if not matches.is_empty() and len(matches) > 1:
318
+ if "formula" in matches.columns and "adduct" in matches.columns:
319
+ pre_dedup_count = len(matches)
320
+
321
+ # Use Polars group_by with maintain_order for consistent results
322
+ matches = (
323
+ matches
324
+ .sort("lib_uid") # Ensure consistent ordering
325
+ .group_by(["formula", "adduct"], maintain_order=True)
326
+ .first()
327
+ )
328
+
329
+ post_dedup_count = len(matches)
330
+ if logger and post_dedup_count < pre_dedup_count:
331
+ logger.debug(
332
+ f"Consensus {cons_uid}: deduplicated {pre_dedup_count} to {post_dedup_count} matches"
333
+ )
334
+
335
+ return matches
336
+
337
+
338
+ def _update_identification_results(study, results, logger):
339
+ """Update study.id_df with new identification results."""
340
+ # Flatten results into records
341
+ records = []
342
+ for result in results:
343
+ consensus_uid = result["consensus_uid"]
344
+ for match in result["matches"]:
345
+ records.append({
346
+ "consensus_uid": consensus_uid,
347
+ "lib_uid": match["lib_uid"],
348
+ "mz_delta": match["mz_delta"],
349
+ "rt_delta": match["rt_delta"],
350
+ "matcher": match["matcher"],
351
+ "score": match["score"],
352
+ })
353
+
354
+ # Convert to DataFrame and append to existing results
355
+ new_results_df = pl.DataFrame(records) if records else pl.DataFrame()
356
+
357
+ if not new_results_df.is_empty():
358
+ if hasattr(study, "id_df") and study.id_df is not None and not study.id_df.is_empty():
359
+ study.id_df = pl.concat([study.id_df, new_results_df])
360
+ else:
361
+ study.id_df = new_results_df
362
+
363
+ if logger:
364
+ logger.debug(f"Added {len(records)} identification results to study.id_df")
365
+ elif not hasattr(study, "id_df"):
366
+ study.id_df = pl.DataFrame()
367
+
368
+
369
+ def _finalize_identification_results(study, params, logger):
370
+ """Apply final scoring adjustments and update consensus columns."""
371
+ # Apply scoring adjustments based on compound and formula counts
372
+ _apply_scoring_adjustments(study, params)
373
+
374
+ # Update consensus_df with top-scoring identification results
375
+ _update_consensus_id_columns(study, logger)
376
+
377
+
378
+ def _store_identification_history(study, effective_mz_tol, effective_rt_tol, target_uids, params, kwargs):
379
+ """Store identification operation in study history."""
380
+ if hasattr(study, "store_history"):
381
+ history_params = {"mz_tol": effective_mz_tol, "rt_tol": effective_rt_tol}
382
+ if target_uids is not None:
383
+ history_params["features"] = target_uids
384
+ if params is not None and hasattr(params, "to_dict"):
385
+ history_params["params"] = params.to_dict()
386
+ if kwargs:
387
+ history_params["kwargs"] = kwargs
388
+ study.store_history(["identify"], history_params)
389
+
390
+
391
+ def _validate_identify_inputs(study, logger=None):
392
+ """Validate inputs for identification process."""
393
+ if getattr(study, "consensus_df", None) is None or study.consensus_df.is_empty():
394
+ if logger:
395
+ logger.warning("No consensus features found for identification")
396
+ return False
397
+
398
+ if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
399
+ if logger:
400
+ logger.error("Library (study.lib_df) is empty; call lib_load() first")
401
+ raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
402
+
403
+ return True
404
+
198
405
 
199
- # Determine which features to process
406
+ def _prepare_consensus_features(study, features, logger=None):
407
+ """Prepare consensus features for identification."""
200
408
  target_uids = None
201
409
  if features is not None:
202
410
  if hasattr(features, "columns"): # DataFrame-like
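Note: _find_matches_vectorized above replaces per-row Python comparisons with whole-column boolean filters. A self-contained toy example of the same m/z window plus optional RT refinement; the column names mirror lib_df, but the numbers are made up:

import polars as pl

lib_df = pl.DataFrame({
    "lib_uid": [1, 2, 3],
    "mz": [180.0634, 180.0637, 181.0712],
    "rt": [5.1, None, 7.9],
})

cons_mz, cons_rt, mz_tol, rt_tol = 180.0635, 5.0, 0.005, 0.5

# Vectorized m/z window over the whole library.
matches = lib_df.filter(
    (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
)

# The RT refinement is only kept when it leaves at least one hit, mirroring the
# "fall back to m/z-only matches" behaviour above.
rt_matches = matches.filter(
    pl.col("rt").is_not_null()
    & (pl.col("rt") >= cons_rt - rt_tol)
    & (pl.col("rt") <= cons_rt + rt_tol)
)
if not rt_matches.is_empty():
    matches = rt_matches

print(matches)  # only lib_uid 1 survives both filters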
@@ -219,38 +427,6 @@ def identify(study, features=None, params=None, **kwargs):
219
427
  if logger:
220
428
  logger.debug(f"Identifying {len(target_uids)} specified features")
221
429
 
222
- # Clear previous identification results for target features only
223
- if hasattr(study, "id_df") and not study.id_df.is_empty():
224
- if target_uids is not None:
225
- # Keep results for features NOT being re-identified
226
- study.id_df = study.id_df.filter(
227
- ~pl.col("consensus_uid").is_in(target_uids),
228
- )
229
- if logger:
230
- logger.debug(
231
- f"Cleared previous identification results for {len(target_uids)} features",
232
- )
233
- else:
234
- # Clear all results if no specific features specified
235
- study.id_df = pl.DataFrame()
236
- if logger:
237
- logger.debug("Cleared all previous identification results")
238
- elif not hasattr(study, "id_df"):
239
- study.id_df = pl.DataFrame()
240
- if logger:
241
- logger.debug("Initialized empty id_df")
242
-
243
- # Validate inputs
244
- if getattr(study, "consensus_df", None) is None or study.consensus_df.is_empty():
245
- if logger:
246
- logger.warning("No consensus features found for identification")
247
- return
248
-
249
- if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
250
- if logger:
251
- logger.error("Library (study.lib_df) is empty; call lib_load() first")
252
- raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
253
-
254
430
  # Filter consensus features if target_uids specified
255
431
  consensus_to_process = study.consensus_df
256
432
  if target_uids is not None:
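Note: both the removed block above and the new _smart_reset_id_results helper keep existing results for features that are not being re-identified by anti-filtering on consensus_uid. Toy illustration with made-up data:

import polars as pl

id_df = pl.DataFrame({
    "consensus_uid": [10, 10, 11],
    "lib_uid": [1, 2, 3],
    "score": [0.9, 0.8, 0.7],
})

target_uids = [10]  # features about to be re-identified

# Drop rows for the targeted features; results for everything else survive.
id_df = id_df.filter(~pl.col("consensus_uid").is_in(target_uids))
print(id_df)  # only the consensus_uid 11 row remains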
@@ -262,270 +438,294 @@ def identify(study, features=None, params=None, **kwargs):
262
438
  logger.warning(
263
439
  "No consensus features found matching specified features",
264
440
  )
265
- return
441
+ return None, target_uids
442
+
443
+ return consensus_to_process, target_uids
266
444
 
267
- consensus_count = len(consensus_to_process)
268
- lib_count = len(study.lib_df)
269
445
 
270
- if logger:
271
- if target_uids is not None:
272
- logger.debug(
273
- f"Identifying {consensus_count} specified consensus features against {lib_count} library entries",
274
- )
275
- else:
276
- logger.debug(
277
- f"Identifying {consensus_count} consensus features against {lib_count} library entries",
278
- )
279
446
 
280
- # Get adduct probabilities
281
- adducts_df = study._get_adducts()
447
+
448
+ def _get_adduct_probabilities(study):
449
+ """Get adduct probabilities from _get_adducts() results."""
450
+ adducts_df = _get_adducts(study)
282
451
  adduct_prob_map = {}
283
452
  if not adducts_df.is_empty():
284
453
  for row in adducts_df.iter_rows(named=True):
285
454
  adduct_prob_map[row.get("name")] = row.get("probability", 1.0)
455
+ return adduct_prob_map
286
456
 
287
- results = []
288
- features_with_matches = 0
289
- total_matches = 0
290
- rt_filtered_compounds = 0
291
- multiply_charged_filtered = 0
292
457
 
293
- # Iterate consensus rows and find matching lib rows by m/z +/- tolerance
294
- for cons in consensus_to_process.iter_rows(named=True):
295
- cons_mz = cons.get("mz")
296
- cons_rt = cons.get("rt")
297
- cons_uid = cons.get("consensus_uid")
298
458
 
299
- if cons_mz is None:
300
- if logger:
301
- logger.debug(f"Skipping consensus feature {cons_uid} - no m/z value")
302
- continue
459
+ def _create_identification_results(consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger=None):
460
+ """Create identification results by matching consensus features against library (DEPRECATED - use optimized version)."""
461
+ # This function is now deprecated in favor of _perform_identification_matching
462
+ # Keep for backward compatibility but redirect to optimized version
463
+ results = _perform_identification_matching(
464
+ consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger
465
+ )
466
+
467
+ # Convert to legacy format for compatibility
468
+ legacy_results = []
469
+ features_with_matches = 0
470
+ total_matches = 0
471
+
472
+ for result in results:
473
+ if result["matches"]:
474
+ features_with_matches += 1
475
+ total_matches += len(result["matches"])
476
+
477
+ for match in result["matches"]:
478
+ legacy_results.append({
479
+ "consensus_uid": result["consensus_uid"],
480
+ "lib_uid": match["lib_uid"],
481
+ "mz_delta": match["mz_delta"],
482
+ "rt_delta": match["rt_delta"],
483
+ "matcher": match["matcher"],
484
+ "score": match["score"],
485
+ })
486
+
487
+ return legacy_results, features_with_matches, total_matches
488
+
489
+
490
+ def _apply_scoring_adjustments(study, params):
491
+ """Apply scoring adjustments based on compound and formula counts using optimized operations."""
492
+ if (
493
+ not study.id_df.is_empty()
494
+ and hasattr(study, "lib_df")
495
+ and not study.lib_df.is_empty()
496
+ ):
497
+ # Get penalty parameters
498
+ heteroatoms = getattr(params, "heteroatoms", ["Cl", "Br", "F", "I"])
499
+ heteroatom_penalty = getattr(params, "heteroatom_penalty", 0.7)
500
+ formulas_penalty = getattr(params, "multiple_formulas_penalty", 0.8)
501
+ compounds_penalty = getattr(params, "multiple_compounds_penalty", 0.8)
303
502
 
304
- # Filter lib by mz window
305
- matches = study.lib_df.filter(
306
- (pl.col("mz") >= cons_mz - effective_mz_tol)
307
- & (pl.col("mz") <= cons_mz + effective_mz_tol),
503
+ # Single join to get all needed library information
504
+ lib_columns = ["lib_uid", "cmpd_uid", "formula"]
505
+ id_with_lib = study.id_df.join(
506
+ study.lib_df.select(lib_columns),
507
+ on="lib_uid",
508
+ how="left",
308
509
  )
309
510
 
310
- initial_matches = len(matches)
511
+ # Calculate all statistics in one group_by operation
512
+ stats = id_with_lib.group_by("consensus_uid").agg([
513
+ pl.col("cmpd_uid").n_unique().alias("num_cmpds"),
514
+ pl.col("formula").filter(pl.col("formula").is_not_null()).n_unique().alias("num_formulas"),
515
+ ])
311
516
 
312
- # If rt_tol provided and consensus RT present, prefer rt-filtered hits
313
- if effective_rt_tol is not None and cons_rt is not None:
314
- rt_matches = matches.filter(
315
- pl.col("rt").is_not_null()
316
- & (pl.col("rt") >= cons_rt - effective_rt_tol)
317
- & (pl.col("rt") <= cons_rt + effective_rt_tol),
318
- )
319
- if not rt_matches.is_empty():
320
- matches = rt_matches
321
- if logger:
322
- logger.debug(
323
- f"Consensus {cons_uid}: {initial_matches} m/z matches, {len(matches)} after RT filter",
324
- )
325
- else:
326
- if logger:
327
- logger.debug(
328
- f"Consensus {cons_uid}: {initial_matches} m/z matches, 0 after RT filter - using m/z matches only",
329
- )
517
+ # Join stats back and apply all penalties in one with_columns operation
518
+ heteroatom_conditions = [pl.col("formula").str.contains(atom) for atom in heteroatoms]
519
+ has_heteroatoms = pl.fold(
520
+ acc=pl.lit(False),
521
+ function=lambda acc, x: acc | x,
522
+ exprs=heteroatom_conditions
523
+ ) if heteroatom_conditions else pl.lit(False)
330
524
 
331
- # Apply scoring-based filtering system
332
- if not matches.is_empty():
333
- filtered_matches = matches.clone()
334
- else:
335
- filtered_matches = pl.DataFrame()
525
+ study.id_df = (
526
+ id_with_lib
527
+ .join(stats, on="consensus_uid", how="left")
528
+ .with_columns([
529
+ # Apply all penalties in sequence using case-when chains
530
+ pl.when(pl.col("formula").is_not_null() & has_heteroatoms)
531
+ .then(pl.col("score") * heteroatom_penalty)
532
+ .otherwise(pl.col("score"))
533
+ .alias("score_temp1")
534
+ ])
535
+ .with_columns([
536
+ pl.when(pl.col("num_formulas") > 1)
537
+ .then(pl.col("score_temp1") * formulas_penalty)
538
+ .otherwise(pl.col("score_temp1"))
539
+ .alias("score_temp2")
540
+ ])
541
+ .with_columns([
542
+ pl.when(pl.col("num_cmpds") > 1)
543
+ .then(pl.col("score_temp2") * compounds_penalty)
544
+ .otherwise(pl.col("score_temp2"))
545
+ .round(4)
546
+ .alias("score")
547
+ ])
548
+ .select([
549
+ "consensus_uid",
550
+ "lib_uid",
551
+ "mz_delta",
552
+ "rt_delta",
553
+ "matcher",
554
+ "score",
555
+ ])
556
+ )
336
557
 
337
- if not filtered_matches.is_empty():
338
- features_with_matches += 1
339
- feature_match_count = len(filtered_matches)
340
- total_matches += feature_match_count
341
558
 
559
+ def _update_consensus_id_columns(study, logger=None):
560
+ """Update consensus_df with top-scoring identification results using safe in-place updates."""
561
+ try:
562
+ if not hasattr(study, "id_df") or study.id_df is None or study.id_df.is_empty():
342
563
  if logger:
343
- logger.debug(
344
- f"Consensus {cons_uid} (mz={cons_mz:.5f}): {feature_match_count} library matches",
564
+ logger.debug("No identification results to process")
565
+ return
566
+
567
+ if not hasattr(study, "lib_df") or study.lib_df is None or study.lib_df.is_empty():
568
+ if logger:
569
+ logger.debug("No library data available")
570
+ return
571
+
572
+ if not hasattr(study, "consensus_df") or study.consensus_df is None or study.consensus_df.is_empty():
573
+ if logger:
574
+ logger.debug("No consensus data available")
575
+ return
576
+
577
+ # Get library columns we need
578
+ lib_columns = ["lib_uid", "name", "adduct"]
579
+ if "class" in study.lib_df.columns:
580
+ lib_columns.append("class")
581
+
582
+ # Get top-scoring identification for each consensus feature
583
+ top_ids = (
584
+ study.id_df
585
+ .sort(["consensus_uid", "score"], descending=[False, True])
586
+ .group_by("consensus_uid", maintain_order=True)
587
+ .first()
588
+ .join(study.lib_df.select(lib_columns), on="lib_uid", how="left")
589
+ .select([
590
+ "consensus_uid",
591
+ "name",
592
+ pl.col("class").alias("id_top_class") if "class" in lib_columns else pl.lit(None, dtype=pl.String).alias("id_top_class"),
593
+ pl.col("adduct").alias("id_top_adduct"),
594
+ pl.col("score").alias("id_top_score")
595
+ ])
596
+ .rename({"name": "id_top_name"})
597
+ )
598
+
599
+ # Ensure we have the id_top columns in consensus_df
600
+ for col_name, dtype in [
601
+ ("id_top_name", pl.String),
602
+ ("id_top_class", pl.String),
603
+ ("id_top_adduct", pl.String),
604
+ ("id_top_score", pl.Float64)
605
+ ]:
606
+ if col_name not in study.consensus_df.columns:
607
+ study.consensus_df = study.consensus_df.with_columns(
608
+ pl.lit(None, dtype=dtype).alias(col_name)
345
609
  )
346
610
 
347
- for m in filtered_matches.iter_rows(named=True):
348
- mz_delta = abs(cons_mz - m.get("mz")) if m.get("mz") is not None else None
349
- lib_rt = m.get("rt")
350
- rt_delta = (
351
- abs(cons_rt - lib_rt)
352
- if (cons_rt is not None and lib_rt is not None)
353
- else None
354
- )
611
+ # Create a mapping dictionary for efficient updates
612
+ id_mapping = {}
613
+ for row in top_ids.iter_rows(named=True):
614
+ consensus_uid = row["consensus_uid"]
615
+ id_mapping[consensus_uid] = {
616
+ "id_top_name": row["id_top_name"],
617
+ "id_top_class": row["id_top_class"],
618
+ "id_top_adduct": row["id_top_adduct"],
619
+ "id_top_score": row["id_top_score"]
620
+ }
621
+
622
+ # Update consensus_df using map_elements (safer than join for avoiding duplicates)
623
+ if id_mapping:
624
+ study.consensus_df = study.consensus_df.with_columns([
625
+ pl.col("consensus_uid").map_elements(
626
+ lambda uid: id_mapping.get(uid, {}).get("id_top_name"),
627
+ return_dtype=pl.String
628
+ ).alias("id_top_name"),
629
+ pl.col("consensus_uid").map_elements(
630
+ lambda uid: id_mapping.get(uid, {}).get("id_top_class"),
631
+ return_dtype=pl.String
632
+ ).alias("id_top_class"),
633
+ pl.col("consensus_uid").map_elements(
634
+ lambda uid: id_mapping.get(uid, {}).get("id_top_adduct"),
635
+ return_dtype=pl.String
636
+ ).alias("id_top_adduct"),
637
+ pl.col("consensus_uid").map_elements(
638
+ lambda uid: id_mapping.get(uid, {}).get("id_top_score"),
639
+ return_dtype=pl.Float64
640
+ ).alias("id_top_score")
641
+ ])
355
642
 
356
- # Get adduct probability from _get_adducts() results
357
- adduct = m.get("adduct")
358
- score = adduct_prob_map.get(adduct, 1.0) if adduct else 1.0
643
+ if logger:
644
+ num_updated = len(id_mapping)
645
+ logger.debug(f"Updated consensus_df with top identifications for {num_updated} features")
646
+
647
+ except Exception as e:
648
+ if logger:
649
+ logger.error(f"Error updating consensus_df with identification results: {e}")
650
+ # Don't re-raise to avoid breaking the identification process
359
651
 
360
- results.append(
361
- {
362
- "consensus_uid": cons.get("consensus_uid"),
363
- "lib_uid": m.get("lib_uid"),
364
- "mz_delta": mz_delta,
365
- "rt_delta": rt_delta,
366
- "matcher": "ms1",
367
- "score": score,
368
- },
369
- )
370
652
 
371
- # Merge new results with existing results
372
- new_results_df = pl.DataFrame(results) if results else pl.DataFrame()
373
653
 
374
- if not new_results_df.is_empty():
375
- if hasattr(study, "id_df") and not study.id_df.is_empty():
376
- # Concatenate new results with existing results
377
- study.id_df = pl.concat([study.id_df, new_results_df])
378
- else:
379
- # First results
380
- study.id_df = new_results_df
381
654
 
382
- # Apply scoring adjustments based on compound and formula counts
383
- if (
384
- not study.id_df.is_empty()
385
- and hasattr(study, "lib_df")
386
- and not study.lib_df.is_empty()
387
- ):
388
- # Join with lib_df to get compound and formula information
389
- id_with_lib = study.id_df.join(
390
- study.lib_df.select(["lib_uid", "cmpd_uid", "formula"]),
391
- on="lib_uid",
392
- how="left",
393
- )
655
+ def identify(study, features=None, params=None, **kwargs):
656
+ """Identify consensus features against the loaded library.
394
657
 
395
- # Calculate counts per consensus_uid
396
- count_stats = id_with_lib.group_by("consensus_uid").agg(
397
- [
398
- pl.col("cmpd_uid").n_unique().alias("num_cmpds"),
399
- pl.col("formula")
400
- .filter(pl.col("formula").is_not_null())
401
- .n_unique()
402
- .alias("num_formulas"),
403
- ],
404
- )
658
+ Matches consensus_df.mz against lib_df.mz within mz_tolerance. If rt_tolerance
659
+ is provided and both consensus and library entries have rt values, RT is
660
+ used as an additional filter.
405
661
 
406
- # Join counts back to id_df
407
- id_with_counts = study.id_df.join(count_stats, on="consensus_uid", how="left")
662
+ Args:
663
+ study: Study instance
664
+ features: Optional DataFrame or list of consensus_uids to identify.
665
+ If None, identifies all consensus features.
666
+ params: Optional identify_defaults instance with matching tolerances and scoring parameters.
667
+ If None, uses default parameters.
668
+ **kwargs: Individual parameter overrides (mz_tol, rt_tol, heteroatom_penalty,
669
+ multiple_formulas_penalty, multiple_compounds_penalty, heteroatoms)
408
670
 
409
- # Join with lib_df again to get formula information for heteroatom penalty
410
- id_with_formula = id_with_counts.join(
411
- study.lib_df.select(["lib_uid", "formula"]),
412
- on="lib_uid",
413
- how="left",
671
+ The resulting DataFrame is stored as study.id_df. Columns:
672
+ - consensus_uid
673
+ - lib_uid
674
+ - mz_delta
675
+ - rt_delta (nullable)
676
+ - score (adduct probability from _get_adducts with penalties applied)
677
+ """
678
+ # Get logger from study if available
679
+ logger = getattr(study, "logger", None)
680
+
681
+ # Setup parameters early
682
+ params = _setup_identify_parameters(params, kwargs)
683
+ effective_mz_tol = getattr(params, "mz_tol", 0.01)
684
+ effective_rt_tol = getattr(params, "rt_tol", 2.0)
685
+
686
+ if logger:
687
+ logger.debug(
688
+ f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
414
689
  )
415
690
 
416
- # Apply scoring penalties
417
- heteroatoms = getattr(params, "heteroatoms", ["Cl", "Br", "F", "I"])
418
- heteroatom_penalty = getattr(params, "heteroatom_penalty", 0.7)
419
- formulas_penalty = getattr(params, "multiple_formulas_penalty", 0.8)
420
- compounds_penalty = getattr(params, "multiple_compounds_penalty", 0.8)
691
+ # Validate inputs early
692
+ if not _validate_identify_inputs(study, logger):
693
+ return
421
694
 
422
- # Build heteroatom condition
423
- heteroatom_condition = None
424
- for atom in heteroatoms:
425
- atom_condition = pl.col("formula").str.contains(atom)
426
- if heteroatom_condition is None:
427
- heteroatom_condition = atom_condition
428
- else:
429
- heteroatom_condition = heteroatom_condition | atom_condition
695
+ # Prepare consensus features and determine target UIDs early
696
+ consensus_to_process, target_uids = _prepare_consensus_features(study, features, logger)
697
+ if consensus_to_process is None:
698
+ return
430
699
 
431
- # Apply penalties
432
- study.id_df = (
433
- id_with_formula.with_columns(
434
- [
435
- # Heteroatom penalty: if formula contains specified heteroatoms, apply penalty
436
- pl.when(
437
- pl.col("formula").is_not_null() & heteroatom_condition,
438
- )
439
- .then(pl.col("score") * heteroatom_penalty)
440
- .otherwise(pl.col("score"))
441
- .alias("score_temp0"),
442
- ],
443
- )
444
- .with_columns(
445
- [
446
- # If num_formulas > 1, apply multiple formulas penalty
447
- pl.when(pl.col("num_formulas") > 1)
448
- .then(pl.col("score_temp0") * formulas_penalty)
449
- .otherwise(pl.col("score_temp0"))
450
- .alias("score_temp1"),
451
- ],
452
- )
453
- .with_columns(
454
- [
455
- # If num_cmpds > 1, apply multiple compounds penalty
456
- pl.when(pl.col("num_cmpds") > 1)
457
- .then(pl.col("score_temp1") * compounds_penalty)
458
- .otherwise(pl.col("score_temp1"))
459
- .round(4) # Round to 4 decimal places
460
- .alias("score"),
461
- ],
462
- )
463
- .select(
464
- [
465
- "consensus_uid",
466
- "lib_uid",
467
- "mz_delta",
468
- "rt_delta",
469
- "matcher",
470
- "score",
471
- ],
472
- )
473
- )
700
+ # Smart reset of id_df: only clear results for features being re-identified
701
+ _smart_reset_id_results(study, target_uids, logger)
702
+
703
+ # Cache adduct probabilities (expensive operation)
704
+ adduct_prob_map = _get_cached_adduct_probabilities(study, logger)
474
705
 
475
- # Store this operation in history
476
- if hasattr(study, "store_history"):
477
- history_params = {"mz_tol": effective_mz_tol, "rt_tol": effective_rt_tol}
478
- if features is not None:
479
- history_params["features"] = target_uids
480
- if params is not None and hasattr(params, "to_dict"):
481
- history_params["params"] = params.to_dict()
482
- if kwargs:
483
- history_params["kwargs"] = kwargs
484
- study.store_history(["identify"], history_params)
706
+ # Perform identification with optimized matching
707
+ results = _perform_identification_matching(
708
+ consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger
709
+ )
485
710
 
486
- if logger:
487
- if rt_filtered_compounds > 0:
488
- logger.debug(
489
- f"RT consistency filtering applied to {rt_filtered_compounds} compound groups",
490
- )
711
+ # Update or append results to study.id_df
712
+ _update_identification_results(study, results, logger)
491
713
 
492
- if multiply_charged_filtered > 0:
493
- logger.debug(
494
- f"Excluded {multiply_charged_filtered} multiply charged adducts (no [M+H]+ or [M-H]- coeluting)",
495
- )
714
+ # Apply scoring adjustments and update consensus columns
715
+ _finalize_identification_results(study, params, logger)
496
716
 
717
+ # Store operation in history
718
+ _store_identification_history(study, effective_mz_tol, effective_rt_tol, target_uids, params, kwargs)
719
+
720
+ # Log final statistics
721
+ consensus_count = len(consensus_to_process)
722
+ if logger:
723
+ features_with_matches = len([r for r in results if len(r["matches"]) > 0])
724
+ total_matches = sum(len(r["matches"]) for r in results)
497
725
  logger.info(
498
726
  f"Identification completed: {features_with_matches}/{consensus_count} features matched, {total_matches} total identifications",
499
727
  )
500
728
 
501
- if total_matches > 0:
502
- # Calculate some statistics
503
- mz_deltas = [r["mz_delta"] for r in results if r["mz_delta"] is not None]
504
- rt_deltas = [r["rt_delta"] for r in results if r["rt_delta"] is not None]
505
- scores = [r["score"] for r in results if r["score"] is not None]
506
-
507
- if mz_deltas:
508
- avg_mz_delta = sum(mz_deltas) / len(mz_deltas)
509
- max_mz_delta = max(mz_deltas)
510
- logger.debug(
511
- f"m/z accuracy: average Δ={avg_mz_delta:.5f} Da, max Δ={max_mz_delta:.5f} Da",
512
- )
513
-
514
- if rt_deltas:
515
- avg_rt_delta = sum(rt_deltas) / len(rt_deltas)
516
- max_rt_delta = max(rt_deltas)
517
- logger.debug(
518
- f"RT accuracy: average Δ={avg_rt_delta:.2f} min, max Δ={max_rt_delta:.2f} min",
519
- )
520
-
521
- if scores:
522
- avg_score = sum(scores) / len(scores)
523
- min_score = min(scores)
524
- max_score = max(scores)
525
- logger.debug(
526
- f"Adduct probability scores: average={avg_score:.3f}, min={min_score:.3f}, max={max_score:.3f}",
527
- )
528
-
529
729
 
530
730
  def get_id(study, features=None) -> pl.DataFrame:
531
731
  """Get identification results with comprehensive annotation data.
@@ -795,6 +995,7 @@ def id_reset(study):
795
995
  Removes:
796
996
  - study.id_df (identification results DataFrame)
797
997
  - 'identify' from study.history
998
+ - Resets id_top_* columns in consensus_df to null
798
999
 
799
1000
  Args:
800
1001
  study: Study instance to reset
@@ -808,6 +1009,23 @@ def id_reset(study):
808
1009
  logger.debug("Removing id_df")
809
1010
  delattr(study, "id_df")
810
1011
 
1012
+ # Reset id_top_* columns in consensus_df
1013
+ if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
1014
+ if logger:
1015
+ logger.debug("Resetting id_top_* columns in consensus_df")
1016
+
1017
+ # Check which columns exist before trying to update them
1018
+ id_columns_to_reset = []
1019
+ for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]:
1020
+ if col in study.consensus_df.columns:
1021
+ if col == "id_top_score":
1022
+ id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
1023
+ else:
1024
+ id_columns_to_reset.append(pl.lit(None, dtype=pl.String).alias(col))
1025
+
1026
+ if id_columns_to_reset:
1027
+ study.consensus_df = study.consensus_df.with_columns(id_columns_to_reset)
1028
+
811
1029
  # Remove identify from history
812
1030
  if hasattr(study, "history") and "identify" in study.history:
813
1031
  if logger:
@@ -827,6 +1045,7 @@ def lib_reset(study):
827
1045
  - study._lib (library object reference)
828
1046
  - 'identify' from study.history
829
1047
  - 'lib_load' from study.history (if exists)
1048
+ - Resets id_top_* columns in consensus_df to null
830
1049
 
831
1050
  Args:
832
1051
  study: Study instance to reset
@@ -852,6 +1071,23 @@ def lib_reset(study):
852
1071
  logger.debug("Removing _lib reference")
853
1072
  delattr(study, "_lib")
854
1073
 
1074
+ # Reset id_top_* columns in consensus_df
1075
+ if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
1076
+ if logger:
1077
+ logger.debug("Resetting id_top_* columns in consensus_df")
1078
+
1079
+ # Check which columns exist before trying to update them
1080
+ id_columns_to_reset = []
1081
+ for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]:
1082
+ if col in study.consensus_df.columns:
1083
+ if col == "id_top_score":
1084
+ id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
1085
+ else:
1086
+ id_columns_to_reset.append(pl.lit(None, dtype=pl.String).alias(col))
1087
+
1088
+ if id_columns_to_reset:
1089
+ study.consensus_df = study.consensus_df.with_columns(id_columns_to_reset)
1090
+
855
1091
  # Remove from history
856
1092
  if hasattr(study, "history"):
857
1093
  if "identify" in study.history:
@@ -868,7 +1104,7 @@ def lib_reset(study):
868
1104
  logger.info("Library and identification data reset completed")
869
1105
 
870
1106
 
871
- def _get_adducts(self, adducts_list: list = None, **kwargs):
1107
+ def _get_adducts(study, adducts_list: list = None, **kwargs):
872
1108
  """
873
1109
  Generate comprehensive adduct specifications for study-level adduct filtering.
874
1110
 
@@ -901,10 +1137,11 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
901
1137
  # Import required modules
902
1138
 
903
1139
  # Use provided adducts list or get from study parameters
904
- if adducts_list is None:
905
- adducts_list = (
906
- self.parameters.adducts
907
- if hasattr(self.parameters, "adducts") and self.parameters.adducts
1140
+ adducts_list_to_use = adducts_list
1141
+ if adducts_list_to_use is None:
1142
+ adducts_list_to_use = (
1143
+ study.parameters.adducts
1144
+ if hasattr(study.parameters, "adducts") and study.parameters.adducts
908
1145
  else []
909
1146
  )
910
1147
 
@@ -914,13 +1151,13 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
914
1151
  max_combinations = kwargs.get("max_combinations", 3) # Up to 3 combinations
915
1152
  min_probability = kwargs.get(
916
1153
  "min_probability",
917
- getattr(self.parameters, "adduct_min_probability", 0.04),
1154
+ getattr(study.parameters, "adduct_min_probability", 0.04),
918
1155
  )
919
1156
 
920
1157
  # Parse base adduct specifications
921
1158
  base_specs = []
922
1159
 
923
- for adduct_str in adducts_list:
1160
+ for adduct_str in adducts_list_to_use:
924
1161
  if not isinstance(adduct_str, str) or ":" not in adduct_str:
925
1162
  continue
926
1163
 
@@ -934,7 +1171,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
934
1171
  probability = float(parts[2])
935
1172
 
936
1173
  # Calculate mass shift from formula
937
- mass_shift = self._calculate_formula_mass_shift(formula_part)
1174
+ mass_shift = _calculate_formula_mass_shift(study, formula_part)
938
1175
 
939
1176
  base_specs.append(
940
1177
  {
@@ -972,7 +1209,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
972
1209
  # 1. Single adducts (filter out neutral adducts with charge == 0)
973
1210
  for spec in base_specs:
974
1211
  if charge_min <= spec["charge"] <= charge_max and spec["charge"] != 0:
975
- formatted_name = self._format_adduct_name([spec])
1212
+ formatted_name = _format_adduct_name([spec])
976
1213
  combinations_list.append(
977
1214
  {
978
1215
  "components": [spec],
@@ -991,15 +1228,16 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
991
1228
  total_charge = base_charge * multiplier
992
1229
  if charge_min <= total_charge <= charge_max and total_charge != 0:
993
1230
  components = [spec] * multiplier
994
- formatted_name = self._format_adduct_name(components)
1231
+ formatted_name = _format_adduct_name(components)
1232
+ probability_multiplied = float(spec["probability"]) ** multiplier
995
1233
 
996
1234
  combinations_list.append(
997
1235
  {
998
1236
  "components": components,
999
1237
  "formatted_name": formatted_name,
1000
- "total_mass_shift": spec["mass_shift"] * multiplier,
1238
+ "total_mass_shift": float(spec["mass_shift"]) * multiplier,
1001
1239
  "total_charge": total_charge,
1002
- "combined_probability": spec["probability"] ** multiplier,
1240
+ "combined_probability": probability_multiplied,
1003
1241
  "complexity": multiplier,
1004
1242
  },
1005
1243
  )
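Note: the change in this hunk makes the multiplier arithmetic use explicit floats before exponentiation. Worked example with an illustrative proton adduct spec; the numbers below are not masster defaults:

spec = {"mass_shift": 1.00728, "charge": 1, "probability": 0.9}
multiplier = 2  # e.g. a doubled adduct such as [M+2H]2+

total_mass_shift = float(spec["mass_shift"]) * multiplier        # 2.01456
total_charge = spec["charge"] * multiplier                       # 2
combined_probability = float(spec["probability"]) ** multiplier  # 0.81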
@@ -1012,16 +1250,16 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
1012
1250
  total_charge = pos_spec["charge"] + neut_spec["charge"]
1013
1251
  if charge_min <= total_charge <= charge_max and total_charge != 0:
1014
1252
  components = [pos_spec, neut_spec]
1015
- formatted_name = self._format_adduct_name(components)
1253
+ formatted_name = _format_adduct_name(components)
1016
1254
  combinations_list.append(
1017
1255
  {
1018
1256
  "components": components,
1019
1257
  "formatted_name": formatted_name,
1020
- "total_mass_shift": pos_spec["mass_shift"]
1021
- + neut_spec["mass_shift"],
1258
+ "total_mass_shift": float(pos_spec["mass_shift"])
1259
+ + float(neut_spec["mass_shift"]),
1022
1260
  "total_charge": total_charge,
1023
- "combined_probability": pos_spec["probability"]
1024
- * neut_spec["probability"],
1261
+ "combined_probability": float(pos_spec["probability"])
1262
+ * float(neut_spec["probability"]),
1025
1263
  "complexity": 2,
1026
1264
  },
1027
1265
  )
@@ -1051,9 +1289,11 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
1051
1289
  adducts_df = adducts_df.filter(pl.col("probability") >= min_probability)
1052
1290
  adducts_after_filter = len(adducts_df)
1053
1291
 
1054
- self.logger.debug(
1055
- f"Study adducts: generated {adducts_before_filter}, filtered to {adducts_after_filter} (min_prob={min_probability})",
1056
- )
1292
+ logger = getattr(study, "logger", None)
1293
+ if logger:
1294
+ logger.debug(
1295
+ f"Study adducts: generated {adducts_before_filter}, filtered to {adducts_after_filter} (min_prob={min_probability})",
1296
+ )
1057
1297
 
1058
1298
  else:
1059
1299
  # Return empty DataFrame with correct schema
@@ -1070,7 +1310,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
1070
1310
  return adducts_df
1071
1311
 
1072
1312
 
1073
- def _calculate_formula_mass_shift(self, formula: str) -> float:
1313
+ def _calculate_formula_mass_shift(study, formula: str) -> float:
1074
1314
  """Calculate mass shift from formula string like "+H", "-H2O", "+Na-H", etc."""
1075
1315
  # Standard atomic masses
1076
1316
  atomic_masses = {
@@ -1121,7 +1361,7 @@ def _calculate_formula_mass_shift(self, formula: str) -> float:
1121
1361
  continue
1122
1362
 
1123
1363
  # Parse element and count (e.g., "H2O" -> H:2, O:1)
1124
- elements = self._parse_element_counts(part)
1364
+ elements = _parse_element_counts(part)
1125
1365
 
1126
1366
  for element, count in elements.items():
1127
1367
  if element in atomic_masses:
@@ -1130,9 +1370,9 @@ def _calculate_formula_mass_shift(self, formula: str) -> float:
1130
1370
  return total_mass
1131
1371
 
1132
1372
 
1133
- def _parse_element_counts(self, formula_part: str) -> dict[str, int]:
1373
+ def _parse_element_counts(formula_part: str) -> dict[str, int]:
1134
1374
  """Parse element counts from a formula part like 'H2O' -> {'H': 2, 'O': 1}"""
1135
- elements = {}
1375
+ elements: dict[str, int] = {}
1136
1376
  i = 0
1137
1377
 
1138
1378
  while i < len(formula_part):
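Note: _parse_element_counts walks the formula part character by character; its documented contract ('H2O' -> {'H': 2, 'O': 1}) can also be met with a short regex, shown here purely as an equivalent sketch rather than the actual implementation:

import re

def parse_element_counts(formula_part: str) -> dict[str, int]:
    # One capital letter, optional lowercase letters, optional digit count.
    counts: dict[str, int] = {}
    for element, num in re.findall(r"([A-Z][a-z]*)(\d*)", formula_part):
        counts[element] = counts.get(element, 0) + (int(num) if num else 1)
    return counts

assert parse_element_counts("H2O") == {"H": 2, "O": 1}
assert parse_element_counts("C2H3O2") == {"C": 2, "H": 3, "O": 2}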
@@ -1156,7 +1396,7 @@ def _parse_element_counts(self, formula_part: str) -> dict[str, int]:
1156
1396
  return elements
1157
1397
 
1158
1398
 
1159
- def _format_adduct_name(self, components: list[dict]) -> str:
1399
+ def _format_adduct_name(components: list[dict]) -> str:
1160
1400
  """Format adduct name from components like [M+H]1+ or [M+2H]2+"""
1161
1401
  if not components:
1162
1402
  return "[M]"