masster 0.5.12__py3-none-any.whl → 0.5.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged as possibly problematic.
- masster/_version.py +1 -1
- masster/lib/lib.py +371 -57
- masster/study/helpers.py +1 -0
- masster/study/id.py +237 -39
- masster/study/importers.py +331 -0
- masster/study/merge.py +3 -1
- masster/study/plot.py +93 -29
- masster/study/study.py +4 -0
- masster/study/study5_schema.json +12 -0
- masster/wizard/__init__.py +4 -4
- masster/wizard/wizard.py +437 -19
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/METADATA +1 -1
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/RECORD +16 -15
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/WHEEL +0 -0
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/entry_points.txt +0 -0
- {masster-0.5.12.dist-info → masster-0.5.14.dist-info}/licenses/LICENSE +0 -0
masster/study/id.py
CHANGED
@@ -145,16 +145,61 @@ def lib_load(
             column_order.append("quant_group")
         elif col == "formula" and "iso" in columns_list and "iso" not in column_order:
             column_order.append("iso")
-
-    # Apply the column ordering
-    filtered_lf = filtered_lf.select(column_order)

+
     # Add to existing lib_df instead of replacing
     if (
         hasattr(study, "lib_df")
         and study.lib_df is not None
         and not study.lib_df.is_empty()
     ):
+        # Check for schema compatibility and handle mismatches
+        existing_cols = set(study.lib_df.columns)
+        new_cols = set(filtered_lf.columns)
+
+        # If schemas don't match, we need to align them
+        if existing_cols != new_cols:
+            # Get union of all columns
+            all_cols = existing_cols.union(new_cols)
+
+            # Add missing columns to existing data with appropriate defaults
+            for col in new_cols - existing_cols:
+                if col == "probability":
+                    # Add probability column to existing data - try to calculate from adduct
+                    if "adduct" in study.lib_df.columns:
+                        try:
+                            adduct_prob_map = _get_adduct_probabilities(study)
+                            study.lib_df = study.lib_df.with_columns(
+                                pl.col("adduct").map_elements(
+                                    lambda adduct: adduct_prob_map.get(adduct, 1.0) if adduct is not None else 1.0,
+                                    return_dtype=pl.Float64
+                                ).alias("probability")
+                            )
+                        except Exception:
+                            study.lib_df = study.lib_df.with_columns(pl.lit(1.0).alias("probability"))
+                    else:
+                        study.lib_df = study.lib_df.with_columns(pl.lit(1.0).alias("probability"))
+                elif col == "iso":
+                    study.lib_df = study.lib_df.with_columns(pl.lit(0).cast(pl.Int64).alias("iso"))
+                elif col == "quant_group":
+                    # Set quant_group using cmpd_uid or lib_uid
+                    if "cmpd_uid" in study.lib_df.columns:
+                        study.lib_df = study.lib_df.with_columns(pl.col("cmpd_uid").cast(pl.Int64).alias("quant_group"))
+                    else:
+                        study.lib_df = study.lib_df.with_columns(pl.col("lib_uid").cast(pl.Int64).alias("quant_group"))
+                else:
+                    # Default to null for other columns
+                    study.lib_df = study.lib_df.with_columns(pl.lit(None).alias(col))
+
+            # Add missing columns to new data with appropriate defaults
+            for col in existing_cols - new_cols:
+                if col not in ["probability", "iso", "quant_group"]:  # These should already be handled
+                    filtered_lf = filtered_lf.with_columns(pl.lit(None).alias(col))
+
+            # Ensure column order matches for concatenation - use existing column order
+            existing_column_order = list(study.lib_df.columns)
+            filtered_lf = filtered_lf.select(existing_column_order)
+
         # Concatenate with existing data
         study.lib_df = pl.concat([study.lib_df, filtered_lf])
     else:
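The added block above pads whichever frame is missing columns with nulls and then enforces a single column order before pl.concat. A minimal, self-contained polars sketch of the same pattern, with invented column names and data (not masster code); how="vertical_relaxed" is used here so the null-typed placeholder columns can coerce to the other frame's dtypes:

import polars as pl

existing = pl.DataFrame({"lib_uid": [1, 2], "name": ["glucose", "alanine"]})
incoming = pl.DataFrame({"lib_uid": [3], "name": ["caffeine"], "iso": [0]})

# Pad each side with the columns only the other side has
for col in set(incoming.columns) - set(existing.columns):
    existing = existing.with_columns(pl.lit(None).alias(col))
for col in set(existing.columns) - set(incoming.columns):
    incoming = incoming.with_columns(pl.lit(None).alias(col))

# Enforce one column order, then concatenate; relaxed mode lets the
# null placeholder columns adopt the concrete dtypes from the other frame
combined = pl.concat([existing, incoming.select(existing.columns)], how="vertical_relaxed")
print(combined)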
@@ -209,8 +254,19 @@ def _setup_identify_parameters(params, kwargs):

     # Override parameters with any provided kwargs
     if kwargs:
+        # Handle parameter name mapping for backwards compatibility
+        param_mapping = {
+            'rt_tolerance': 'rt_tol',
+            'mz_tolerance': 'mz_tol'
+        }
+
         for param_name, value in kwargs.items():
-            if hasattr(params, param_name):
+            # Check if we need to map the parameter name
+            mapped_name = param_mapping.get(param_name, param_name)
+
+            if hasattr(params, mapped_name):
+                setattr(params, mapped_name, value)
+            elif hasattr(params, param_name):
                 setattr(params, param_name, value)

     return params
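The mapping added above keeps older keyword names working: rt_tolerance and mz_tolerance are routed to rt_tol and mz_tol when the parameter object has those attributes. A small self-contained sketch of that behaviour; the _Params class is a stand-in, not the masster parameter object:

# Hypothetical stand-in for the identify() parameter object
class _Params:
    def __init__(self):
        self.mz_tol = 0.01
        self.rt_tol = 10.0

param_mapping = {"rt_tolerance": "rt_tol", "mz_tolerance": "mz_tol"}

def apply_kwargs(params, **kwargs):
    for name, value in kwargs.items():
        mapped = param_mapping.get(name, name)   # translate legacy names
        if hasattr(params, mapped):
            setattr(params, mapped, value)
        elif hasattr(params, name):
            setattr(params, name, value)
    return params

p = apply_kwargs(_Params(), rt_tolerance=5.0)  # legacy kwarg ends up as p.rt_tol == 5.0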
@@ -319,9 +375,13 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
                 else None
             )

-            # Get
+            # Get library probability as base score, then multiply by adduct probability
+            lib_probability = match_row.get("probability", 1.0) if match_row.get("probability") is not None else 1.0
             adduct = match_row.get("adduct")
-
+            adduct_probability = adduct_prob_map.get(adduct, 1.0) if adduct else 1.0
+            score = lib_probability * adduct_probability
+            # Scale to 0-100 and round to 1 decimal place
+            score = round(score * 100.0, 1)

             match_results.append({
                 "lib_uid": match_row.get("lib_uid"),
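With this change the score is the library probability multiplied by the adduct probability, scaled to 0-100 and rounded to one decimal. For example (illustrative numbers only):

lib_probability = 0.9      # taken from the library's probability column
adduct_probability = 0.8   # taken from the adduct probability map
score = round(lib_probability * adduct_probability * 100.0, 1)
print(score)  # 72.0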
@@ -337,7 +397,11 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t


 def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, cons_uid):
-    """
+    """
+    Find library matches using optimized vectorized operations.
+
+    FIXED VERSION: Prevents incorrect matching of same compound to different m/z values.
+    """
     # Filter by m/z tolerance using vectorized operations
     matches = lib_df.filter(
         (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
@@ -345,43 +409,78 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c

     initial_match_count = len(matches)

-    # Apply RT filter if available
+    # Apply RT filter if available - STRICT VERSION (no fallback)
     if rt_tol is not None and cons_rt is not None and not matches.is_empty():
-
-
-            (pl.col("rt") >= cons_rt - rt_tol) &
-            (pl.col("rt") <= cons_rt + rt_tol)
-        )
+        # First, check if any m/z matches have RT data
+        rt_candidates = matches.filter(pl.col("rt").is_not_null())

-        if not
-
+        if not rt_candidates.is_empty():
+            # Apply RT filtering to candidates with RT data
+            rt_matches = rt_candidates.filter(
+                (pl.col("rt") >= cons_rt - rt_tol) &
+                (pl.col("rt") <= cons_rt + rt_tol)
+            )
+
+            if not rt_matches.is_empty():
+                matches = rt_matches
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, {len(matches)} after RT filter"
+                    )
+            else:
+                # NO FALLBACK - if RT filtering finds no matches, return empty
+                matches = rt_matches  # This is empty
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: RT filtering eliminated all {len(rt_candidates)} candidates (rt_tol={rt_tol}s) - no matches returned"
+                    )
+        else:
+            # No RT data in library matches - return empty if strict RT filtering requested
             if logger:
                 logger.debug(
-                    f"Consensus {cons_uid}: {initial_match_count} m/z matches
+                    f"Consensus {cons_uid}: {initial_match_count} m/z matches but none have library RT data - no matches returned due to RT filtering"
                 )
+            matches = pl.DataFrame()  # Return empty DataFrame
+
+    # FIX 1: Add stricter m/z validation - prioritize more accurate matches
+    if not matches.is_empty():
+        strict_mz_tol = mz_tol * 0.5  # Use 50% of tolerance as strict threshold
+        strict_matches = matches.filter(
+            (pl.col("mz") >= cons_mz - strict_mz_tol) & (pl.col("mz") <= cons_mz + strict_mz_tol)
+        )
+
+        if not strict_matches.is_empty():
+            # Use strict matches if available
+            matches = strict_matches
+            if logger:
+                logger.debug(f"Consensus {cons_uid}: Using {len(matches)} strict m/z matches (within {strict_mz_tol:.6f} Da)")
         else:
             if logger:
-                logger.debug(
-                    f"Consensus {cons_uid}: {initial_match_count} m/z matches, 0 after RT filter - using m/z matches only"
-                )
+                logger.debug(f"Consensus {cons_uid}: No strict matches, using {len(matches)} loose matches")

-    #
+    # FIX 2: Improved deduplication - prioritize by m/z accuracy
     if not matches.is_empty() and len(matches) > 1:
         if "formula" in matches.columns and "adduct" in matches.columns:
             pre_dedup_count = len(matches)

-            #
+            # Calculate m/z error for sorting
+            matches = matches.with_columns([
+                (pl.col("mz") - cons_mz).abs().alias("mz_error_abs")
+            ])
+
+            # Group by formula and adduct, but keep the most accurate m/z match
             matches = (
                 matches
-                .sort("lib_uid")  #
+                .sort(["mz_error_abs", "lib_uid"])  # Sort by m/z accuracy first, then lib_uid for consistency
                 .group_by(["formula", "adduct"], maintain_order=True)
                 .first()
+                .drop("mz_error_abs")  # Remove the temporary column
             )

             post_dedup_count = len(matches)
             if logger and post_dedup_count < pre_dedup_count:
                 logger.debug(
-                    f"Consensus {cons_uid}: deduplicated {pre_dedup_count} to {post_dedup_count} matches"
+                    f"Consensus {cons_uid}: deduplicated {pre_dedup_count} to {post_dedup_count} matches (m/z accuracy prioritized)"
                 )

     return matches
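Two ideas drive the rewritten matcher: prefer candidates that fall within half the m/z tolerance when any exist, and, when several library rows share a formula and adduct, keep only the row closest to the consensus m/z. A standalone sketch of that deduplication step on invented candidates (not masster code):

import polars as pl

cons_mz = 180.0634  # hypothetical consensus feature m/z
matches = pl.DataFrame({
    "lib_uid": [1, 2, 3],
    "formula": ["C6H12O6", "C6H12O6", "C7H8N4O2"],
    "adduct":  ["[M+H]+", "[M+H]+", "[M+H]+"],
    "mz":      [180.0639, 180.0700, 195.0877],
})

deduped = (
    matches
    .with_columns((pl.col("mz") - cons_mz).abs().alias("mz_error_abs"))
    .sort(["mz_error_abs", "lib_uid"])                  # most accurate m/z first
    .group_by(["formula", "adduct"], maintain_order=True)
    .first()                                            # one row per formula/adduct
    .drop("mz_error_abs")
)
print(deduped)  # lib_uid 1 wins the C6H12O6/[M+H]+ group; lib_uid 3 is kept as is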
@@ -617,7 +716,11 @@ def _apply_scoring_adjustments(study, params):


 def _update_consensus_id_columns(study, logger=None):
-    """
+    """
+    Update consensus_df with top-scoring identification results using safe in-place updates.
+
+    FIXED VERSION: Prevents same compound from being assigned to vastly different m/z values.
+    """
     try:
         if not hasattr(study, "id_df") or study.id_df is None or study.id_df.is_empty():
             if logger:
@@ -634,14 +737,47 @@ def _update_consensus_id_columns(study, logger=None):
                 logger.debug("No consensus data available")
             return

-        # Get library columns we need
-        lib_columns = ["lib_uid", "name", "adduct"]
+        # Get library columns we need (include mz for validation)
+        lib_columns = ["lib_uid", "name", "adduct", "mz"]
         if "class" in study.lib_df.columns:
             lib_columns.append("class")

-        #
+        # FIX 1: Join identification results with consensus m/z for validation
+        id_with_consensus = study.id_df.join(
+            study.consensus_df.select(["consensus_uid", "mz"]),
+            on="consensus_uid",
+            how="left",
+            suffix="_consensus"
+        )
+
+        # FIX 2: Validate m/z accuracy - filter out poor matches
+        id_with_lib = id_with_consensus.join(
+            study.lib_df.select(["lib_uid", "mz"]),
+            on="lib_uid",
+            how="left",
+            suffix="_lib"
+        )
+
+        # Calculate actual m/z error and filter out excessive errors
+        id_validated = id_with_lib.with_columns([
+            (pl.col("mz") - pl.col("mz_lib")).abs().alias("actual_mz_error")
+        ])
+
+        # Filter out matches with excessive m/z error
+        max_reasonable_error = 0.02  # 20 millidalton maximum error
+        id_validated = id_validated.filter(
+            (pl.col("actual_mz_error") <= max_reasonable_error) | pl.col("actual_mz_error").is_null()
+        )
+
+        if logger:
+            original_count = len(id_with_consensus)
+            validated_count = len(id_validated)
+            if validated_count < original_count:
+                logger.warning(f"Filtered out {original_count - validated_count} identifications with excessive m/z error (>{max_reasonable_error:.3f} Da)")
+
+        # Get top-scoring identification for each consensus feature (from validated results)
         top_ids = (
-
+            id_validated
             .sort(["consensus_uid", "score"], descending=[False, True])
             .group_by("consensus_uid", maintain_order=True)
             .first()
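The validation added above joins each identification back to its library m/z and drops rows that are more than 0.02 Da off. A toy example of the same join-and-filter, with column names mirroring the diff and made-up values:

import polars as pl

id_df = pl.DataFrame({"consensus_uid": [10, 11], "lib_uid": [1, 2], "mz": [180.063, 255.232]})
lib_df = pl.DataFrame({"lib_uid": [1, 2], "mz": [180.064, 255.300]})

max_reasonable_error = 0.02  # Da
validated = (
    id_df
    .join(lib_df.select(["lib_uid", "mz"]), on="lib_uid", how="left", suffix="_lib")
    .with_columns((pl.col("mz") - pl.col("mz_lib")).abs().alias("actual_mz_error"))
    .filter((pl.col("actual_mz_error") <= max_reasonable_error) | pl.col("actual_mz_error").is_null())
)
print(validated)  # consensus_uid 10 survives; 11 is 0.068 Da off and is dropped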
@@ -656,12 +792,44 @@ def _update_consensus_id_columns(study, logger=None):
             .rename({"name": "id_top_name"})
         )

+        # FIX 3: Check for conflicts where same compound+adduct assigned to very different m/z
+        if not top_ids.is_empty():
+            compound_groups = (
+                top_ids
+                .join(study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left")
+                .group_by(["id_top_name", "id_top_adduct"])
+                .agg([
+                    pl.col("consensus_uid").count().alias("count"),
+                    pl.col("mz").min().alias("mz_min"),
+                    pl.col("mz").max().alias("mz_max")
+                ])
+                .with_columns([
+                    (pl.col("mz_max") - pl.col("mz_min")).alias("mz_range")
+                ])
+            )
+
+            # Find problematic assignments (same compound+adduct with >0.1 Da m/z range)
+            problematic = compound_groups.filter(
+                (pl.col("count") > 1) & (pl.col("mz_range") > 0.1)
+            )
+
+            if not problematic.is_empty() and logger:
+                for row in problematic.iter_rows(named=True):
+                    name = row["id_top_name"]
+                    adduct = row["id_top_adduct"]
+                    count = row["count"]
+                    mz_range = row["mz_range"]
+                    logger.warning(
+                        f"Identification conflict detected: '{name}' ({adduct}) assigned to {count} features with {mz_range:.4f} Da m/z range"
+                    )
+
         # Ensure we have the id_top columns in consensus_df
         for col_name, dtype in [
             ("id_top_name", pl.String),
             ("id_top_class", pl.String),
             ("id_top_adduct", pl.String),
-            ("id_top_score", pl.Float64)
+            ("id_top_score", pl.Float64),
+            ("id_source", pl.String)
         ]:
             if col_name not in study.consensus_df.columns:
                 study.consensus_df = study.consensus_df.with_columns(
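The conflict check added above groups the top hits by name and adduct and warns when one compound/adduct pair is spread over features more than 0.1 Da apart. Roughly, on invented data:

import polars as pl

top_ids = pl.DataFrame({
    "consensus_uid": [10, 11, 12],
    "id_top_name":   ["glucose", "glucose", "caffeine"],
    "id_top_adduct": ["[M+H]+", "[M+H]+", "[M+H]+"],
    "mz":            [180.063, 180.563, 195.088],
})

conflicts = (
    top_ids
    .group_by(["id_top_name", "id_top_adduct"])
    .agg([
        pl.col("consensus_uid").count().alias("count"),
        (pl.col("mz").max() - pl.col("mz").min()).alias("mz_range"),
    ])
    .filter((pl.col("count") > 1) & (pl.col("mz_range") > 0.1))
)
print(conflicts)  # flags the glucose/[M+H]+ pair assigned 0.5 Da apart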
@@ -782,7 +950,7 @@ def identify(study, features=None, params=None, **kwargs):
     if logger:
         features_with_matches = len([r for r in results if len(r["matches"]) > 0])
         total_matches = sum(len(r["matches"]) for r in results)
-        logger.
+        logger.success(
             f"Identification completed: {features_with_matches}/{consensus_count} features matched, {total_matches} total identifications",
         )

@@ -805,6 +973,8 @@ def get_id(study, features=None) -> pl.DataFrame:
         - mz (consensus feature m/z)
         - rt (consensus feature RT)
         - name (compound name from library)
+        - shortname (short name from library, if available)
+        - class (compound class from library, if available)
         - formula (molecular formula from library)
         - adduct (adduct type from library)
         - smiles (SMILES notation from library)
@@ -872,6 +1042,8 @@ def get_id(study, features=None) -> pl.DataFrame:
     lib_cols = [
         "lib_uid",
         "name",
+        "shortname",
+        "class",
         "formula",
         "adduct",
         "smiles",
@@ -900,6 +1072,8 @@ def get_id(study, features=None) -> pl.DataFrame:
         "cmpd_uid" if "cmpd_uid" in result_df.columns else None,
         "lib_uid",
         "name" if "name" in result_df.columns else None,
+        "shortname" if "shortname" in result_df.columns else None,
+        "class" if "class" in result_df.columns else None,
         "formula" if "formula" in result_df.columns else None,
         "adduct" if "adduct" in result_df.columns else None,
         "mz" if "mz" in result_df.columns else None,
@@ -951,6 +1125,8 @@ def get_id(study, features=None) -> pl.DataFrame:
         "cmpd_uid",
         "lib_uid",
         "name",
+        "shortname",
+        "class",
         "formula",
         "adduct",
         "mz",
@@ -1076,7 +1252,7 @@ def id_reset(study):

     # Check which columns exist before trying to update them
     id_columns_to_reset = []
-    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]:
+    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
         if col in study.consensus_df.columns:
             if col == "id_top_score":
                 id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
@@ -1093,7 +1269,7 @@ def id_reset(study):
         del study.history["identify"]

     if logger:
-        logger.
+        logger.info("Identification data reset completed")


 def lib_reset(study):
@@ -1122,11 +1298,33 @@ def lib_reset(study):
         logger.debug("Checking for consensus features created by lib_to_consensus()")

     try:
-        # Filter for features
-        #
-
-
-
+        # Filter for features created by lib_to_consensus()
+        # These can be identified by:
+        # 1. number_samples < 1 (set to 0.0 by lib_to_consensus)
+        # 2. AND have corresponding entries in consensus_mapping_df with sample_uid = 0 (virtual sample)
+
+        # First check if we have any features with number_samples < 1
+        potential_lib_features = study.consensus_df.filter(pl.col("number_samples") < 1)
+
+        if potential_lib_features is not None and not potential_lib_features.is_empty():
+            # Further filter by checking if they have sample_uid = 0 in consensus_mapping_df
+            # This ensures we only remove library-derived features, not legitimate features with 0 samples
+            if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
+                lib_consensus_uids = study.consensus_mapping_df.filter(
+                    pl.col("sample_uid") == 0
+                )["consensus_uid"].unique().to_list()
+
+                if lib_consensus_uids:
+                    lib_consensus_features = potential_lib_features.filter(
+                        pl.col("consensus_uid").is_in(lib_consensus_uids)
+                    )
+                else:
+                    lib_consensus_features = pl.DataFrame()  # No library features found
+            else:
+                # If no consensus_mapping_df, fall back to number_samples < 1 only
+                lib_consensus_features = potential_lib_features
+        else:
+            lib_consensus_features = pl.DataFrame()  # No features with number_samples < 1

         if lib_consensus_features is not None and not lib_consensus_features.is_empty():
             num_lib_features = len(lib_consensus_features)
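lib_reset() now treats a consensus feature as library-derived only if it has number_samples < 1 and is mapped to the virtual sample (sample_uid == 0). A minimal sketch of that double filter on toy frames (not masster code):

import polars as pl

consensus_df = pl.DataFrame({"consensus_uid": [1, 2, 3], "number_samples": [5, 0, 0]})
consensus_mapping_df = pl.DataFrame({"consensus_uid": [1, 2, 3], "sample_uid": [7, 0, 4]})

# UIDs that were mapped to the virtual sample created by lib_to_consensus()
lib_uids = (
    consensus_mapping_df
    .filter(pl.col("sample_uid") == 0)["consensus_uid"]
    .unique()
    .to_list()
)

# Only features that satisfy both conditions are removed by the reset
lib_features = consensus_df.filter(
    (pl.col("number_samples") < 1) & pl.col("consensus_uid").is_in(lib_uids)
)
print(lib_features)  # only consensus_uid 2: zero samples AND mapped to the virtual sample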
@@ -1170,7 +1368,7 @@ def lib_reset(study):

     # Check which columns exist before trying to update them
     id_columns_to_reset = []
-    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]:
+    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
         if col in study.consensus_df.columns:
             if col == "id_top_score":
                 id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
@@ -1198,7 +1396,7 @@ def lib_reset(study):
         del study.history["lib_to_consensus"]

     if logger:
-        logger.
+        logger.info("Library and identification data reset completed")


 def _get_adducts(study, adducts_list: list | None = None, **kwargs):
|