PyPI - masster - Versions diffs - 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl - Mend

masster 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (33)

masster/_version.py +1 -1
masster/data/libs/aa_nort.json +240 -0
masster/data/libs/ccm_nort.json +1319 -0
masster/lib/lib.py +1 -1
masster/logger.py +0 -6
masster/sample/adducts.py +1 -1
masster/sample/defaults/find_adducts_def.py +1 -1
masster/sample/h5.py +152 -2
masster/sample/helpers.py +91 -5
masster/sample/id.py +1160 -0
masster/sample/importers.py +715 -0
masster/sample/plot.py +175 -71
masster/sample/sample.py +26 -5
masster/sample/sample5_schema.json +99 -1
masster/sample/save.py +724 -1
masster/study/defaults/study_def.py +8 -12
masster/study/export.py +216 -65
masster/study/id.py +59 -12
masster/study/importers.py +384 -1
masster/study/load.py +0 -11
masster/study/merge.py +153 -0
masster/study/plot.py +197 -0
masster/study/study.py +6 -4
masster/study/study5_schema.json +15 -0
masster/wizard/wizard.py +13 -14
{masster-0.5.28.dist-info → masster-0.6.2.dist-info}/METADATA +17 -18
{masster-0.5.28.dist-info → masster-0.6.2.dist-info}/RECORD +30 -29
masster/data/libs/aa.csv +0 -22
masster/data/libs/ccm.csv +0 -120
masster/data/libs/urine.csv +0 -4693
{masster-0.5.28.dist-info → masster-0.6.2.dist-info}/WHEEL +0 -0
{masster-0.5.28.dist-info → masster-0.6.2.dist-info}/entry_points.txt +0 -0
{masster-0.5.28.dist-info → masster-0.6.2.dist-info}/licenses/LICENSE +0 -0

masster/study/defaults/study_def.py CHANGED Viewed

@@ -96,19 +96,15 @@ class study_defaults:
             "adducts": {
                 "dtype": "list[str]",
                 "description": "List of adduct specifications in OpenMS format (element:charge:probability). Charged adduct probabilities must sum to 1.0.",
-                "default": ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"],
+                "default": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05"],
                 "examples": {
-                    "positive": ["H:+:0.8", "Na:+:0.1", "NH4:+:0.1"],
-                    "negative": [
-                        "H-1:-:0.95",
-                        "Cl:-:0.05",
-                        "CH2O2:0:0.2",
-                        "H-2-O:0:0.2",
-                    ],
+                    "positive": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05", "-H2O:0:0.15"],
+                    "negative": ["-H:-1:0.95", "+Cl:-1:0.05", "+CH2O2:0:0.2", "-H2O:0:0.2"],
                 },
                 "validation_rules": [
-                    "Format: element:charge:probability",
-                    "Charge must be +, -, or 0 (neutral)",
+                    "Format: formula:charge:probability (e.g., '+H:1:0.65', '-H:-1:0.95', '-H2O:0:0.15')",
+                    "Formula must start with + or - to indicate gain/loss (e.g., '+H', '-H', '+Na', '-H2O')",
+                    "Charge must be an integer (positive, negative, or 0 for neutral)",
                     "Probability must be between 0.0 and 1.0",
                     "Sum of all charged adduct probabilities must equal 1.0",
                 ],
@@ -128,7 +124,7 @@ class study_defaults:
         """Set polarity-specific defaults for adducts if not explicitly provided."""
         # If adducts is None, set based on polarity
         if self.adducts is None:
-            if self.polarity.lower() in ["positive", "pos"]:
+            if self.polarity.lower() in ["positive", "pos", "+"]:
                 self.adducts = [
                     "+H:1:0.65",
                     "+Na:1:0.15",
@@ -136,7 +132,7 @@ class study_defaults:
                     "+K:1:0.05",
                     "-H2O:0:0.15",
                 ]
-            elif self.polarity.lower() in ["negative", "neg"]:
+            elif self.polarity.lower() in ["negative", "neg", "-"]:
                 self.adducts = [
                     "-H:-1:0.9",
                     "+Cl:-1:0.1",

masster/study/export.py CHANGED Viewed

@@ -524,7 +524,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
         # Import here to avoid circular imports
         from masster.study.id import get_id
-        # Get full enriched identification data for SOME section
+        # Get full enriched identification data for SME section
         full_id_data = get_id(self)
         if full_id_data is not None and not full_id_data.is_empty():
             # Get top scoring identification for each consensus_uid for SML section
@@ -828,8 +828,8 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
     smf_header = [
         "SFH",
         "SMF_ID",
-        "SOME_ID_REFS",
-        "SOME_ID_REF_ambiguity_code",
+        "SME_ID_REFS",
+        "SME_ID_REF_ambiguity_code",
         "adduct_ion",
         "isotopomer",
         "exp_mass_to_charge",
@@ -847,40 +847,40 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
     # SMF table uses the same consensus features as SML, just different metadata
     for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
-        # References to SOME entries - each SMF can reference multiple SOME entries for the same consensus_uid
-        some_refs = "null"
-        some_ambiguity = "null"
+        # References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
+        SME_refs = "null"
+        SME_ambiguity = "null"
         consensus_uid = row["consensus_uid"]
         if full_id_data is not None:
-            # Find all SOME entries for this consensus_uid
-            some_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
-            if some_matches.height > 0:
-                # Generate SOME IDs - we'll create a mapping in the SOME section
+            # Find all SME entries for this consensus_uid
+            SME_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
+            if SME_matches.height > 0:
+                # Generate SME IDs - we'll create a mapping in the SME section
                 # For now, use a simple approach based on consensus_uid and lib_uid
-                some_ids = []
-                for i, some_row in enumerate(some_matches.iter_rows(named=True)):
-                    # Create a unique SOME ID based on consensus_uid and position
-                    some_id_base = consensus_uid * 1000  # Ensure uniqueness across consensus features
-                    some_id = some_id_base + i + 1
-                    some_ids.append(str(some_id))
-                if some_ids:
-                    some_refs = "|".join(some_ids)
+                SME_ids = []
+                for i, SME_row in enumerate(SME_matches.iter_rows(named=True)):
+                    # Create a unique SME ID based on consensus_uid and position
+                    SME_id_base = consensus_uid * 1000  # Ensure uniqueness across consensus features
+                    SME_id = SME_id_base + i + 1
+                    SME_ids.append(str(SME_id))
+                if SME_ids:
+                    SME_refs = "|".join(SME_ids)
                     # Set ambiguity code: 1=ambiguous identification, 2=multiple evidence same molecule, 3=both
-                    if len(some_ids) > 1:
+                    if len(SME_ids) > 1:
                         # Check if all identifications point to the same compound
                         unique_cmpds = {
                             match["cmpd_uid"]
-                            for match in some_matches.iter_rows(named=True)
+                            for match in SME_matches.iter_rows(named=True)
                             if match.get("cmpd_uid") is not None
                         }
                         if len(unique_cmpds) > 1:
-                            some_ambiguity = "1"  # Ambiguous identification
+                            SME_ambiguity = "1"  # Ambiguous identification
                         else:
-                            some_ambiguity = "2"  # Multiple evidence for same molecule
+                            SME_ambiguity = "2"  # Multiple evidence for same molecule
                     else:
-                        some_ambiguity = "null"
+                        SME_ambiguity = "null"
         # Format isotopomer according to mzTab-M specification
         iso_value = row.get("iso_mean", 0)
@@ -892,8 +892,8 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
         smf_row = [
             "SMF",
             str(idx),
-            some_refs,
-            some_ambiguity,
+            SME_refs,
+            SME_ambiguity,
             adduct_list[idx - 1],  # adduct_ion
             isotopomer,  # isotopomer formatted according to mzTab-M specification
             safe_str(row.get("mz", "null")),  # exp_mass_to_charge
@@ -943,16 +943,16 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
         for line in smf_lines:
             f.write(line + "\n")
-    # --- SOME (Small Molecule Evidence) table ---
+    # --- SME (Small Molecule Evidence) table ---
     if full_id_data is not None and not full_id_data.is_empty():
-        some_lines = []
+        SME_lines = []
         # Add comment about spectra_ref being dummy placeholders
-        some_lines.append(
+        SME_lines.append(
             "COM\tThe spectra_ref are dummy placeholders, as the annotation was based on aggregated data",
         )
-        some_header = [
-            "SHE",
-            "SOME_ID",
+        SME_header = [
+            "SEH",
+            "SME_ID",
             "evidence_input_id",
             "database_identifier",
             "chemical_formula",
@@ -971,9 +971,9 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
             "id_confidence_measure[1]",
             "rank",
         ]
-        some_lines.append("\t".join(some_header))
+        SME_lines.append("\t".join(SME_header))
-        # Create SOME entries for all identification results using enriched data
+        # Create SME entries for all identification results using enriched data
         for consensus_uid in self.consensus_df.select("consensus_uid").to_series().unique():
             # Get consensus feature data for this consensus_uid
             consensus_feature_data = self.consensus_df.filter(
@@ -984,16 +984,16 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
             consensus_row = consensus_feature_data.row(0, named=True)
             # Get all identification results for this consensus feature from enriched data
-            some_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
+            SME_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
-            if some_matches.height > 0:
+            if SME_matches.height > 0:
                 # Sort by score descending to maintain rank order
-                some_matches = some_matches.sort("score", descending=True)
+                SME_matches = SME_matches.sort("score", descending=True)
-                for i, some_row in enumerate(some_matches.iter_rows(named=True)):
-                    # Generate unique SOME_ID
-                    some_id_base = consensus_uid * 1000
-                    some_id = some_id_base + i + 1
+                for i, SME_row in enumerate(SME_matches.iter_rows(named=True)):
+                    # Generate unique SME_ID
+                    SME_id_base = consensus_uid * 1000
+                    SME_id = SME_id_base + i + 1
                     # Create evidence input ID using consensus_uid:mz:rt format
                     consensus_mz = consensus_row.get("mz", 0)
@@ -1002,15 +1002,15 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
                     # Database identifier - use db_id if available, otherwise fallback to cmpd_uid
                     db_id = "null"
-                    if some_row.get("db_id") is not None and some_row["db_id"] != "":
-                        db_id = safe_str(some_row["db_id"])
-                    elif some_row.get("cmpd_uid") is not None:
-                        db_id = f"cmpd:{some_row['cmpd_uid']}"
+                    if SME_row.get("db_id") is not None and SME_row["db_id"] != "":
+                        db_id = safe_str(SME_row["db_id"])
+                    elif SME_row.get("cmpd_uid") is not None:
+                        db_id = f"cmpd:{SME_row['cmpd_uid']}"
                     # Get adduct information
                     adduct_ion = "null"
-                    if some_row.get("adduct") is not None and some_row["adduct"] != "":
-                        adduct_ion = safe_str(some_row["adduct"])
+                    if SME_row.get("adduct") is not None and SME_row["adduct"] != "":
+                        adduct_ion = safe_str(SME_row["adduct"])
                         # Replace ? with H for better mzTab compatibility
                         adduct_ion = adduct_ion.replace("?", "H")
@@ -1019,29 +1019,32 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
                     # Identification method
                     id_method = "[MS, MS:1002888, small molecule confidence measure, ]"
-                    if some_row.get("matcher") is not None:
-                        id_method = f"[MS, MS:1002888, {some_row['matcher']}, ]"
+                    if SME_row.get("matcher") is not None:
+                        id_method = f"[MS, MS:1002888, {SME_row['matcher']}, ]"
-                    # MS level - assume MS1 for now
-                    ms_level = "[MS, MS:1000511, ms level, 1]"
+                    # MS level - check if ms1 exists in matched
+                    if 'ms1' in SME_row['matcher'].lower():
+                        ms_level = "[MS, MS:1000511, ms level, 1]"
+                    else:
+                        ms_level = "[MS,MS:1000511, ms level, 2]"
                     # Experimental mass-to-charge from consensus feature
                     exp_mz = safe_str(consensus_mz)
                     # Theoretical mass-to-charge from lib_df
                     theoretical_mz = "null"
-                    if some_row.get("mz") is not None:  # This comes from lib_df via get_id() join
-                        theoretical_mz = safe_str(some_row["mz"])
+                    if SME_row.get("mz") is not None:  # This comes from lib_df via get_id() join
+                        theoretical_mz = safe_str(SME_row["mz"])
-                    some_line = [
-                        "SOME",
-                        str(some_id),
+                    SME_line = [
+                        "SME",
+                        str(SME_id),
                         evidence_id,
                         db_id,
-                        safe_str(some_row.get("formula", "null")),
-                        safe_str(some_row.get("smiles", "null")),
-                        safe_str(some_row.get("inchi", "null")),
-                        safe_str(some_row.get("name", "null")),
+                        safe_str(SME_row.get("formula", "null")),
+                        safe_str(SME_row.get("smiles", "null")),
+                        safe_str(SME_row.get("inchi", "null")),
+                        safe_str(SME_row.get("name", "null")),
                         "null",  # uri - not available in current data
                         "null",  # derivatized_form
                         adduct_ion,
@@ -1053,15 +1056,15 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
                         spectra_ref,
                         id_method,
                         ms_level,
-                        safe_str(some_row.get("score", "null")),
+                        safe_str(SME_row.get("score", "null")),
                         str(i + 1),  # rank within this consensus feature
                     ]
-                    some_lines.append("\t".join(some_line))
+                    SME_lines.append("\t".join(SME_line))
-        # Write SOME table
+        # Write SME table
         with open(filename, "a", encoding="utf-8") as f:
             f.write("\n")
-            for line in some_lines:
+            for line in SME_lines:
                 f.write(line + "\n")
     # --- MGF table ---
@@ -1125,7 +1128,7 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
     self.logger.success(f"Exported mzTab-M to {filename}")
-def export_xlsx(self, filename: str | None = None) -> None:
+def export_excel(self, filename: str | None = None) -> None:
     """
     Export the study data to an Excel workbook with multiple worksheets.
@@ -1390,3 +1393,151 @@ def export_parquet(self, filename: str | None = None) -> None:
         self.logger.success(f"Study exported to {len(exported_files)} Parquet files.")
     else:
         self.logger.error("No Parquet files were created - no data available to export")
+def export_slaw(self, filename="features_slaw.csv"):
+    """
+    Export the consensus features DataFrame to a SLAW-formatted CSV file.
+    This method exports the consensus features to a CSV format compatible with SLAW,
+    including feature metadata and intensity quantification across all samples. The file
+    contains comprehensive feature information including m/z, RT, annotations, isotopic
+    patterns, MS2 data, and intensity values for each sample.
+    Parameters:
+        filename (str): The path to the output CSV file. Defaults to 'features_slaw.csv'.
+    Side Effects:
+        Writes the exported data to the specified CSV file and logs the export operation.
+    """
+    if self.consensus_df is None:
+        self.logger.warning("No consensus features found. Cannot export to SLAW format.")
+        return
+    # Make filename absolute if not already
+    if not os.path.isabs(filename):
+        if self.folder is not None:
+            filename = os.path.join(self.folder, filename)
+        else:
+            filename = os.path.join(os.getcwd(), filename)
+    df = self.consensus_df
+    # Get consensus matrix for quantification across samples
+    try:
+        quant_matrix = self.get_consensus_matrix()
+    except Exception as e:
+        self.logger.error(f"Error getting consensus matrix: {e}")
+        return
+    # Evaluate the charge column
+    if "charge_mean" in df.columns:
+        charge_series = df.select(
+            pl.when(pl.col("charge_mean") == 0)
+            .then(1 if self.polarity == "positive" else -1)
+            .otherwise(pl.col("charge_mean"))
+            .alias("charge")
+        ).get_column("charge")
+    else:
+        charge_series = pl.Series([1 if self.polarity == "positive" else -1] * len(df))
+    # Evaluate the group column (from adduct_group_top)
+    # Features with adduct_group_top == 0 should each get a unique group index
+    if "adduct_group_top" in df.columns:
+        max_adduct_group = df.get_column("adduct_group_top").max()
+        if max_adduct_group is None:
+            max_adduct_group = 0
+        group_series = df.select(
+            pl.when(pl.col("adduct_group_top") == 0)
+            .then(max_adduct_group + 1 + pl.int_range(pl.len()).over(pl.col("adduct_group_top") == 0))
+            .otherwise(pl.col("adduct_group_top"))
+            .alias("group")
+        ).get_column("group")
+    else:
+        group_series = pl.Series([None] * len(df))
+    # Evaluate the annotation column (adduct + isotope info)
+    if "adduct_top" in df.columns and "iso_mean" in df.columns:
+        annotation_series = df.select(
+            pl.when(pl.col("iso_mean") == 0)
+            .then(pl.col("adduct_top").str.replace(r"\?", "H"))
+            .otherwise(pl.col("adduct_top").str.replace(r"\?", "H") + " +" + pl.col("iso_mean").cast(pl.Int64).cast(pl.Utf8))
+            .alias("annotation")
+        ).get_column("annotation")
+    elif "adduct_top" in df.columns:
+        annotation_series = df.get_column("adduct_top").str.replace(r"\?", "H")
+    else:
+        annotation_series = pl.Series([""] * len(df))
+    # Get sample columns from quant_matrix (excluding consensus_uid)
+    sample_columns = [col for col in quant_matrix.columns if col != "consensus_uid"]
+    # Create SLAW columns with appropriate mappings from consensus_df
+    slaw_data = {
+        "feature_id": df.get_column("consensus_id") if "consensus_id" in df.columns else pl.Series(range(1, len(df) + 1)),
+        "mz": df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
+        "rt": df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "group": group_series,
+        "annotation": annotation_series,
+        "neutral_mass": df.get_column("adduct_neutral_mass_top") if "adduct_neutral_mass_top" in df.columns else pl.Series([None] * len(df)),
+        "charge": charge_series,
+        "main_id": df.get_column("main_id") if "main_id" in df.columns else df.get_column("consensus_id") if "consensus_id" in df.columns else pl.Series(range(1, len(df) + 1)),
+        "ion": df.get_column("adduct_top").str.replace(r"\?", "H") if "adduct_top" in df.columns else pl.Series([""] * len(df)),
+        "iso": df.get_column("iso_mean").cast(pl.Int64) if "iso_mean" in df.columns else pl.Series([0] * len(df)),
+        "clique": df.get_column("clique") if "clique" in df.columns else pl.Series([None] * len(df)),
+        "num_detection": df.get_column("num_detection") if "num_detection" in df.columns else pl.Series([1] * len(df)),
+        "total_detection": df.get_column("total_detection") if "total_detection" in df.columns else pl.Series([1] * len(df)),
+        "mz_mean": df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
+        "mz_min": df.get_column("mz_min") if "mz_min" in df.columns else df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
+        "mz_max": df.get_column("mz_max") if "mz_max" in df.columns else df.get_column("mz") if "mz" in df.columns else pl.Series([None] * len(df)),
+        "rt_mean": df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "rt_min": df.get_column("rt_min") if "rt_min" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "rt_max": df.get_column("rt_max") if "rt_max" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "rt_cor_mean": df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "rt_cor_min": df.get_column("rt_min") if "rt_min" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "rt_cor_max": df.get_column("rt_max") if "rt_max" in df.columns else df.get_column("rt") if "rt" in df.columns else pl.Series([None] * len(df)),
+        "height_mean": df.get_column("height_mean") if "height_mean" in df.columns else pl.Series([None] * len(df)),
+        "height_min": df.get_column("height_min") if "height_min" in df.columns else pl.Series([None] * len(df)),
+        "height_max": df.get_column("height_max") if "height_max" in df.columns else pl.Series([None] * len(df)),
+        "intensity_mean": df.get_column("inty_mean") if "inty_mean" in df.columns else pl.Series([None] * len(df)),
+        "intensity_min": df.get_column("inty_min") if "inty_min" in df.columns else pl.Series([None] * len(df)),
+        "intensity_max": df.get_column("inty_max") if "inty_max" in df.columns else pl.Series([None] * len(df)),
+        "SN_mean": df.get_column("sn_mean") if "sn_mean" in df.columns else pl.Series([None] * len(df)),
+        "SN_min": df.get_column("sn_min") if "sn_min" in df.columns else pl.Series([None] * len(df)),
+        "SN_max": df.get_column("sn_max") if "sn_max" in df.columns else pl.Series([None] * len(df)),
+        "peakwidth_mean": df.get_column("fwhm_mean") if "fwhm_mean" in df.columns else pl.Series([None] * len(df)),
+        "peakwidth_min": df.get_column("fwhm_min") if "fwhm_min" in df.columns else pl.Series([None] * len(df)),
+        "peakwidth_max": df.get_column("fwhm_max") if "fwhm_max" in df.columns else pl.Series([None] * len(df)),
+        "ms2_mgf_id": pl.Series([""] * len(df)),  # Not available in study
+        "ms2_num_fused": pl.Series([None] * len(df)),  # Not available in study
+        "ms2_source": pl.Series([""] * len(df)),  # Not available in study
+        "isotopic_pattern_annot": pl.Series([""] * len(df)),  # Not available in study
+        "isotopic_pattern_rel": pl.Series([""] * len(df)),  # Not available in study
+        "isotopic_pattern_abs": pl.Series([""] * len(df)),  # Not available in study
+    }
+    # Add quantification columns for each sample
+    for sample_col in sample_columns:
+        quant_column_name = f"quant_{sample_col}"
+        # Join with quant_matrix to get values for this sample
+        sample_values = quant_matrix.join(
+            df.select("consensus_uid"),
+            on="consensus_uid",
+            how="right"
+        ).get_column(sample_col)
+        slaw_data[quant_column_name] = sample_values
+    # Create the polars DataFrame
+    slaw_df = pl.DataFrame(slaw_data)
+    # Convert to pandas for CSV export
+    pandas_df = slaw_df.to_pandas()
+    # Export to CSV with comma separator - only quote when necessary (QUOTE_MINIMAL)
+    try:
+        pandas_df.to_csv(filename, sep=',', index=False, quoting=0)  # quoting=0 means QUOTE_MINIMAL
+        self.logger.success(f"Features exported to {filename} (SLAW format)")
+        self.logger.debug(f"Exported {len(slaw_df)} features with {len(slaw_df.columns)} columns")
+    except PermissionError:
+        self.logger.error(f"Permission denied: Cannot write to {filename}. The file may be open in another program. Please close it and try again.")

masster/study/id.py CHANGED Viewed

@@ -24,7 +24,8 @@ def lib_load(
         lib_source: either a CSV/JSON file path (str) or a Lib instance
         polarity: ionization polarity ("positive" or "negative") - used when lib_source is a CSV/JSON path.
                  If None, uses study.polarity automatically.
-        adducts: specific adducts to generate - used when lib_source is a CSV/JSON path
+        adducts: specific adducts to generate - used when lib_source is a CSV/JSON path.
+                 If None, uses study.parameters.adducts if available.
         iso: isotope generation mode ("13C" to generate 13C isotopes, None for no isotopes)
     Side effects:
@@ -51,6 +52,18 @@ def lib_load(
         else:
             polarity = "positive"  # Default fallback
         study.logger.debug(f"Using study polarity: {polarity}")
+    # Use study.parameters.adducts if adducts not explicitly provided
+    # If study.parameters.adducts is also None, lib will use its default adducts for the polarity
+    if adducts is None:
+        if hasattr(study, "parameters") and hasattr(study.parameters, "adducts"):
+            adducts = study.parameters.adducts
+            if adducts:
+                study.logger.debug(f"Using study.parameters.adducts: {adducts}")
+            else:
+                study.logger.debug(f"study.parameters.adducts is None, lib will use default adducts for {polarity} mode")
+        else:
+            study.logger.debug(f"study.parameters.adducts not found, lib will use default adducts for {polarity} mode")
     # Handle string input (CSV or JSON file path)
     if isinstance(lib_source, str):
@@ -403,42 +416,64 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
     """
     Find library matches using optimized vectorized operations.
-    FIXED VERSION: Prevents incorrect matching of same compound to different m/z values.
+    Automatically skips RT filtering if library has no RT data for the matched entries.
     """
     # Filter by m/z tolerance using vectorized operations
     matches = lib_df.filter((pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol))
     initial_match_count = len(matches)
-    # Apply RT filter if available - STRICT VERSION (no fallback)
+    # Apply RT filter if requested AND if data is available
+    # Strategy: Handle mixed RT/no-RT entries properly by treating them separately
     if rt_tol is not None and cons_rt is not None and not matches.is_empty():
-        # First, check if any m/z matches have RT data
+        # Separate entries with and without RT data
         rt_candidates = matches.filter(pl.col("rt").is_not_null())
+        no_rt_entries = matches.filter(pl.col("rt").is_null())
         if not rt_candidates.is_empty():
             # Apply RT filtering to candidates with RT data
             rt_matches = rt_candidates.filter((pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol))
-            if not rt_matches.is_empty():
+            # Combine RT-filtered matches with entries that have no RT data
+            # Rationale: Entries without RT can't be filtered by RT, so include them
+            if not rt_matches.is_empty() and not no_rt_entries.is_empty():
+                # Both RT matches and no-RT entries exist
+                matches = pl.concat([rt_matches, no_rt_entries])
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
+                        f"{len(rt_matches)} passed RT filter, {len(no_rt_entries)} with no RT → {len(matches)} total matches"
+                    )
+            elif not rt_matches.is_empty():
+                # Only RT matches, no entries without RT
                 matches = rt_matches
                 if logger:
                     logger.debug(
-                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, {len(matches)} after RT filter"
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
+                        f"{len(matches)} passed RT filter"
+                    )
+            elif not no_rt_entries.is_empty():
+                # No RT matches passed filter, but there are entries without RT
+                matches = no_rt_entries
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT but none passed RT filter, "
+                        f"using {len(matches)} entries with no RT data"
                     )
             else:
-                # NO FALLBACK - if RT filtering finds no matches, return empty
-                matches = rt_matches  # This is empty
+                # No RT matches and no entries without RT - return empty
+                matches = pl.DataFrame()
                 if logger:
                     logger.debug(
                         f"Consensus {cons_uid}: RT filtering eliminated all {len(rt_candidates)} candidates (rt_tol={rt_tol}s) - no matches returned"
                     )
         else:
-            # No RT data in library matches - return empty if strict RT filtering requested
+            # All m/z matches have no RT data - keep all m/z matches
             if logger:
                 logger.debug(
-                    f"Consensus {cons_uid}: {initial_match_count} m/z matches but none have library RT data - no matches returned due to RT filtering"
+                    f"Consensus {cons_uid}: {initial_match_count} m/z matches, all have no RT data - using m/z matches only"
                 )
-            matches = pl.DataFrame()  # Return empty DataFrame
+            # matches already contains the m/z-filtered results (which are all no_rt_entries)
     # FIX 1: Add stricter m/z validation - prioritize more accurate matches
     if not matches.is_empty():
@@ -884,6 +919,18 @@ def identify(study, features=None, params=None, **kwargs):
     effective_mz_tol = getattr(params, "mz_tol", 0.01)
     effective_rt_tol = getattr(params, "rt_tol", 2.0)
+    # Check if library has RT data - if not, disable RT filtering
+    if effective_rt_tol is not None and hasattr(study, "lib_df") and study.lib_df is not None:
+        if "rt" in study.lib_df.columns:
+            # Check if library has any non-null RT values
+            rt_count = study.lib_df.filter(pl.col("rt").is_not_null()).shape[0]
+            if rt_count == 0:
+                if logger:
+                    logger.info(
+                        f"Library has no retention time data - disabling RT filtering (was rt_tol={effective_rt_tol})"
+                    )
+                effective_rt_tol = None
     if logger:
         logger.debug(
             f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
@@ -1483,7 +1530,7 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
             if charge_min <= abs(total_charge) <= charge_max and total_charge != 0:
                 components = [spec] * multiplier
                 formatted_name = _format_adduct_name(components)
-                probability_multiplied = float(spec["probability"]) ** multiplier
+                probability_multiplied = (float(spec["probability"]) ** multiplier) / 2.0
                 combinations_list.append(
                     {

masster 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl

Potentially problematic release.

masster 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl