PyPI - masster - Versions diffs - 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl - Mend

masster 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (33) hide show

masster/_version.py +1 -1
masster/data/libs/aa_nort.json +240 -0
masster/data/libs/ccm_nort.json +1319 -0
masster/lib/lib.py +1 -1
masster/logger.py +0 -6
masster/sample/adducts.py +1 -1
masster/sample/defaults/find_adducts_def.py +1 -1
masster/sample/h5.py +152 -2
masster/sample/helpers.py +91 -5
masster/sample/id.py +1160 -0
masster/sample/importers.py +715 -0
masster/sample/plot.py +175 -71
masster/sample/sample.py +26 -5
masster/sample/sample5_schema.json +99 -1
masster/sample/save.py +724 -1
masster/study/defaults/study_def.py +8 -12
masster/study/export.py +216 -65
masster/study/id.py +59 -12
masster/study/importers.py +384 -1
masster/study/load.py +0 -11
masster/study/merge.py +153 -0
masster/study/plot.py +197 -0
masster/study/study.py +6 -4
masster/study/study5_schema.json +15 -0
masster/wizard/wizard.py +13 -14
{masster-0.5.28.dist-info → masster-0.6.2.dist-info}/METADATA +17 -18
{masster-0.5.28.dist-info → masster-0.6.2.dist-info}/RECORD +30 -29
masster/data/libs/aa.csv +0 -22
masster/data/libs/ccm.csv +0 -120
masster/data/libs/urine.csv +0 -4693
{masster-0.5.28.dist-info → masster-0.6.2.dist-info}/WHEEL +0 -0
{masster-0.5.28.dist-info → masster-0.6.2.dist-info}/entry_points.txt +0 -0
{masster-0.5.28.dist-info → masster-0.6.2.dist-info}/licenses/LICENSE +0 -0

masster/sample/save.py CHANGED Viewed

@@ -808,7 +808,7 @@ def export_dda_stats(self, filename="stats.csv"):
     self.logger.success(f"DDA statistics exported to {filename}")
-def export_xlsx(self, filename="features.xlsx"):
+def export_excel(self, filename="features.xlsx"):
     """
     Export the features DataFrame to an Excel file.
@@ -857,6 +857,143 @@ def export_xlsx(self, filename="features.xlsx"):
     self.logger.debug(f"Exported {len(clean_df)} features with {len(exportable_columns)} columns")
def export_slaw(self, filename="features_slaw.csv"):
    """
    Export the features DataFrame to a SLAW-formatted CSV file.

    The output is a comma-separated file (minimal quoting) with one row per
    feature. Columns that are missing from ``self.features_df`` are filled
    with a constant placeholder (``None``, ``""``, ``0`` or a running index,
    depending on the column) so the column layout is always the same.
    Because a single sample is exported, every ``*_mean`` / ``*_min`` /
    ``*_max`` triple carries the same per-feature value, and the detection
    counts are always 1.

    Parameters:
        filename (str): The path to the output CSV file. Defaults to
            'features_slaw.csv'. Relative paths are made absolute.

    Returns:
        None

    Side Effects:
        Writes the exported data to the specified CSV file and logs the
        export operation. If ``self.features_df`` is None a warning is logged
        and nothing is written. A ``PermissionError`` while writing is logged
        as an error instead of being raised.
    """
    if self.features_df is None:
        self.logger.warning("No features found. Cannot export to SLAW format.")
        return

    import polars as pl

    filename = os.path.abspath(filename)

    # The quantification column is named after the raw file of this sample.
    if self.file_path is not None:
        base_name = os.path.splitext(os.path.basename(self.file_path))[0]
    else:
        base_name = "sample"
    quant_column_name = f"quant_{base_name}.csv"

    df = self.features_df
    n_rows = len(df)
    available = set(df.columns)

    def constant(value):
        """Return a Series of length ``n_rows`` filled with ``value``."""
        return pl.Series([value] * n_rows)

    def column_or(*names, fill=None):
        """Return the first column in ``names`` that exists, else a constant Series."""
        for name in names:
            if name in available:
                return df.get_column(name)
        return constant(fill)

    def running_index():
        """Return 1..n_rows, used when no feature identifier column exists."""
        return pl.Series(range(1, n_rows + 1))

    # Charge: an adduct_charge of 0 falls back to the sign of the polarity.
    default_charge = 1 if self.polarity == "positive" else -1
    if "adduct_charge" in available:
        charge_series = df.select(
            pl.when(pl.col("adduct_charge") == 0)
            .then(default_charge)
            .otherwise(pl.col("adduct_charge"))
            .alias("charge")
        ).get_column("charge")
    else:
        charge_series = constant(default_charge)

    # Group: features with adduct_group == 0 each get their own index,
    # numbered upwards from max(adduct_group) + 1.
    if "adduct_group" in available:
        max_adduct_group = df.get_column("adduct_group").max()
        if max_adduct_group is None:
            max_adduct_group = 0
        ungrouped = pl.col("adduct_group") == 0
        group_series = df.select(
            pl.when(ungrouped)
            .then(max_adduct_group + 1 + pl.int_range(pl.len()).over(ungrouped))
            .otherwise(pl.col("adduct_group"))
            .alias("group")
        ).get_column("group")
    else:
        group_series = constant(None)

    # Ion label: every "?" placeholder in the adduct becomes "H". A literal
    # replace_all is used so that all occurrences are substituted, matching
    # the plain str.replace("?", "H") used by export_mztab.
    has_adduct = "adduct" in available
    ion_expr = pl.col("adduct").str.replace_all("?", "H", literal=True)
    if has_adduct:
        ion_series = df.select(ion_expr.alias("ion")).get_column("ion")
    else:
        ion_series = constant("")

    # Annotation: the ion for the monoisotopic peak (iso == 0), otherwise
    # the ion followed by " +<iso>".
    if has_adduct and "iso" in available:
        annotation_series = df.select(
            pl.when(pl.col("iso") == 0)
            .then(ion_expr)
            .otherwise(ion_expr + " +" + pl.col("iso").cast(pl.Utf8))
            .alias("annotation")
        ).get_column("annotation")
    else:
        annotation_series = ion_series

    if "feature_id" in available:
        feature_id_series = df.get_column("feature_id")
    else:
        feature_id_series = running_index()
    if "main_id" in available:
        main_id_series = df.get_column("main_id")
    else:
        main_id_series = feature_id_series

    mz_series = column_or("mz")
    rt_series = column_or("rt")
    rt_start_series = column_or("rt_start", "rt")
    rt_end_series = column_or("rt_end", "rt")
    height_series = column_or("height")
    inty_series = column_or("inty")
    sn_series = column_or("sn")

    if "rt_end" in available and "rt_start" in available:
        peakwidth_series = df.get_column("rt_end") - df.get_column("rt_start")
    else:
        peakwidth_series = constant(None)

    # Number of fused MS2 scans is only defined when ms2_scans is a list column.
    if "ms2_scans" in available and df["ms2_scans"].dtype == pl.List:
        ms2_num_fused_series = df.get_column("ms2_scans").list.len()
    else:
        ms2_num_fused_series = constant(None)

    # Insertion order of this dict defines the column order of the output
    # (kept exactly as in the previous implementation).
    slaw_data = {
        "feature_id": feature_id_series,
        "mz": mz_series,
        "rt": rt_series,
        "group": group_series,
        "annotation": annotation_series,
        "neutral_mass": column_or("adduct_neutral_mass"),
        "charge": charge_series,
        "main_id": main_id_series,
        "ion": ion_series,
        "iso": column_or("iso", fill=0),
        "clique": column_or("clique"),
        "num_detection": constant(1),  # Single sample always 1
        "total_detection": constant(1),  # Single sample always 1
        "mz_mean": mz_series,
        "mz_min": mz_series,
        "mz_max": mz_series,
        "rt_mean": rt_series,
        "rt_min": rt_start_series,
        "rt_max": rt_end_series,
        "rt_cor_mean": rt_series,
        "rt_cor_min": rt_start_series,
        "rt_cor_max": rt_end_series,
        "height_mean": height_series,
        "height_min": height_series,
        "height_max": height_series,
        "intensity_mean": inty_series,
        "intensity_min": inty_series,
        "intensity_max": inty_series,
        "SN_mean": sn_series,
        "SN_min": sn_series,
        "SN_max": sn_series,
        "peakwidth_mean": peakwidth_series,
        "peakwidth_min": peakwidth_series,
        "peakwidth_max": peakwidth_series,
        "ms2_mgf_id": column_or("ms2_mgf_id", fill=""),
        "ms2_num_fused": ms2_num_fused_series,
        "ms2_source": column_or("ms2_source", fill=""),
        "isotopic_pattern_annot": column_or("isotopic_pattern_annot", fill=""),
        "isotopic_pattern_rel": column_or("isotopic_pattern_rel", fill=""),
        "isotopic_pattern_abs": column_or("isotopic_pattern_abs", fill=""),
        quant_column_name: inty_series,
    }
    slaw_df = pl.DataFrame(slaw_data)

    # The CSV is written through pandas with a comma separator.
    pandas_df = slaw_df.to_pandas()
    try:
        # quoting=0 is csv.QUOTE_MINIMAL: quote only fields that need it.
        pandas_df.to_csv(filename, sep=',', index=False, quoting=0)
        self.logger.success(f"Features exported to {filename} (SLAW format)")
        self.logger.debug(f"Exported {len(slaw_df)} features with {len(slaw_df.columns)} columns")
    except PermissionError:
        self.logger.error(f"Permission denied: Cannot write to {filename}. The file may be open in another program. Please close it and try again.")
 def export_chrom(self, filename="chrom.csv"):
     # saves self.chrom_df to a csv file. Remove the scan_uid and chrom columns if the file already exists
     if self.chrom_df is None:
@@ -872,3 +1009,589 @@ def export_chrom(self, filename="chrom.csv"):
     if "chrom" in data.columns:
         data = data.drop("chrom")
     data.to_csv(filename, index=False)
def export_mztab(self, filename=None, title=None, description=None, include_mgf=False, **kwargs):
    """
    Export the sample as an mzTab-M file.

    The file contains, in this order: the MTD metadata section, the SML
    (small molecule) table, the SMF (small molecule feature) table, the SME
    (small molecule evidence) table when identification data is available,
    and - only when requested - the MS2 spectra as ``COM``-prefixed MGF rows.
    All sections are assembled in memory and written in one go, so a failure
    while building the tables does not leave a partial file behind.

    Args:
        filename (str, optional): Path to the output mzTab-M file. Defaults to
            "sample.mztab". Relative paths are made absolute.
        title (str, optional): Human-readable title for the file. Accepted
            for interface compatibility; not used by this implementation.
        description (str, optional): Human-readable description. Accepted
            for interface compatibility; not used by this implementation.
        include_mgf (bool, optional): Include MGF table with MS2 spectra.
            Defaults to False.
        **kwargs: Additional metadata or export options. Currently ignored.

    Returns:
        None

    Side Effects:
        Writes ``filename`` and logs the result. If ``self.features_df`` is
        None a warning is logged and no file is written.
    """
    if self.features_df is None:
        self.logger.warning("No features found. Cannot export to mzTab-M format.")
        return

    # Imported locally so the function does not rely on module-level names.
    from datetime import datetime

    import polars as pl

    from masster._version import __version__

    def safe_str(value, default="null"):
        """Convert value to string, replacing None and blank strings with 'null'."""
        if value is None:
            return default
        str_val = str(value)
        return str_val if str_val.strip() != "" else default

    if filename is None:
        filename = "sample.mztab"
    if not os.path.isabs(filename):
        filename = os.path.abspath(filename)

    # --- Identification data -------------------------------------------------
    # evidence_by_uid maps feature_uid -> identification rows ordered by
    # descending score; element 0 is therefore the top hit used for SML.
    full_id_data = None
    evidence_by_uid = {}
    try:
        from masster.sample.id import get_id

        full_id_data = get_id(self)
        if full_id_data is not None and not full_id_data.is_empty():
            ranked = full_id_data.sort("score", descending=True)
            for evidence in ranked.iter_rows(named=True):
                uid = evidence["feature_uid"]
                if uid is None:
                    continue
                evidence_by_uid.setdefault(uid, []).append(evidence)
        else:
            self.logger.info("No identification data available for mzTab export")
    except Exception as e:
        # Best effort: identification is optional, export continues without it.
        self.logger.debug(f"Could not retrieve identification data: {e}")
        full_id_data = None
        evidence_by_uid = {}

    # --- MS2 spectra (only if requested) -------------------------------------
    mgf_data = None
    mgf_mapping: dict[int, list[int]] = {}
    if include_mgf:
        mgf_rows = []
        mgf_index = 1
        for feature_row in self.features_df.iter_rows(named=True):
            feature_uid = feature_row["feature_uid"]
            feature_id = feature_row.get("feature_id", feature_uid)
            if feature_row.get("ms2_scans") is None:
                continue
            ms2_scans = feature_row["ms2_scans"]
            if not isinstance(ms2_scans, list):
                ms2_scans = [ms2_scans]
            # Null rt/mz must not break the fixed-precision title formatting.
            title_rt = feature_row.get("rt", 0) or 0
            title_mz = feature_row.get("mz", 0) or 0
            for scan_uid in ms2_scans:
                spec = self.get_spectrum(scan_uid)
                if spec is None or len(spec.mz) == 0:
                    continue
                mgf_rows.append(
                    {
                        "mgf_index": mgf_index,
                        "feature_uid": feature_uid,
                        "feature_id": feature_id,
                        "rtinseconds": feature_row.get("rt", 0),
                        "pepmass": feature_row.get("mz", 0),
                        "energy": spec.energy if hasattr(spec, "energy") else 0,
                        "mslevel": spec.ms_level if hasattr(spec, "ms_level") else 2,
                        "title": f"uid:{feature_uid}, rt:{title_rt:.2f}, mz:{title_mz:.4f}",
                        "spec_mz": spec.mz,
                        "spec_int": spec.inty,
                        "spec_len": len(spec.mz),
                    }
                )
                # Remember which MGF entries belong to which feature.
                mgf_mapping.setdefault(feature_uid, []).append(mgf_index)
                mgf_index += 1
        if mgf_rows:
            mgf_data = pl.DataFrame(mgf_rows)

    # --- MTD (metadata) section ----------------------------------------------
    # Prefer the sample label; fall back to the raw file stem, then "sample".
    sample_name = getattr(self, "label", None)
    if not sample_name:
        source_path = getattr(self, "file_path", None)
        if source_path:
            sample_name = os.path.splitext(os.path.basename(source_path))[0]
        else:
            sample_name = "sample"

    mtd_lines = [
        f"COM\tfile generated by MASSter {__version__} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "\nMTD\tmzTab-version\t2.2.0-M",
        f"MTD\tmzTab-id\t{sample_name}",
        "",
        # CV definitions
        "MTD\tcv[1]-label\tMS",
        "MTD\tcv[1]-full_name\tPSI-MS controlled vocabulary",
        "MTD\tcv[1]-version\t4.1.199",
        "MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo",
        "",
        # Quantification units
        "MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
        "MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
        "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]",
        # Identification confidence
        "MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
        "",
        # Software
        "MTD\tsoftware[1]\t[MS, MS:1003430, OpenMS, unknown]",
        f"MTD\tsoftware[2]\t[MS, MS:1002878, MASSter, {__version__}]",
        "MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]",
        "",
    ]

    # Database information depends on whether a compound library was used.
    if full_id_data is not None and hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
        mtd_lines.append('MTD\tdatabase[1]\t[, , "compound library", ]')
        mtd_lines.append("MTD\tdatabase[1]-prefix\tcmpd")
    else:
        mtd_lines.append('MTD\tdatabase[1]\t[, , "PubChem", ]')
        mtd_lines.append("MTD\tdatabase[1]-prefix\tCID")
    mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
    mtd_lines.append("MTD\tdatabase[1]-uri\thttps://pubchem.ncbi.nlm.nih.gov/")

    # Single sample metadata
    mtd_lines.append(f"\nMTD\tsample[1]\t{sample_name}")
    mtd_lines.append(f"MTD\tsample[1]-description\t{sample_name}")
    mtd_lines.append("MTD\tms_run[1]-location\tfile://unknown")

    # Scan polarity: anything other than "negative"/"neg" is reported as positive.
    sample_polarity = getattr(self, "polarity", "positive")
    if sample_polarity in ["negative", "neg"]:
        scan_polarity_cv = "[MS, MS:1000129, negative scan, ]"
    else:
        scan_polarity_cv = "[MS, MS:1000130, positive scan, ]"
    mtd_lines.extend(
        [
            f"MTD\tms_run[1]-scan_polarity\t{scan_polarity_cv}",
            "MTD\tassay[1]\tAssay_1",
            "MTD\tassay[1]-sample_ref\tsample[1]",
            "MTD\tassay[1]-ms_run_ref\tms_run[1]",
            "",
            "MTD\tstudy_variable[1]\tundefined",
            "MTD\tstudy_variable[1]-assay_refs\tassay[1]",
            "MTD\tstudy_variable[1]-description\tSingle sample",
        ]
    )

    # --- SML and SMF tables (one pass over the features) ---------------------
    sml_header = [
        "SMH",
        "SML_ID",
        "SMF_ID_REFS",
        "database_identifier",
        "chemical_formula",
        "smiles",
        "inchi",
        "chemical_name",
        "uri",
        "theoretical_neutral_mass",
        "adduct_ions",
        "reliability",
        "best_id_confidence_measure",
        "best_id_confidence_value",
        "opt_global_mgf_index",
        "abundance_assay[1]",
        "abundance_study_variable[1]",
        "abundance_variation_study_variable[1]",
    ]
    smf_header = [
        "SFH",
        "SMF_ID",
        "SME_ID_REFS",
        "SME_ID_REF_ambiguity_code",
        "adduct_ion",
        "isotopomer",
        "exp_mass_to_charge",
        "charge",
        "retention_time_in_seconds",
        "retention_time_in_seconds_start",
        "retention_time_in_seconds_end",
        "abundance_assay[1]",
        "abundance_study_variable[1]",
        "abundance_variation_study_variable[1]",
    ]
    sml_lines = ["\t".join(sml_header)]
    smf_lines = ["\t".join(smf_header)]

    for idx, row in enumerate(self.features_df.iter_rows(named=True), 1):
        feature_uid = row["feature_uid"]
        matches = evidence_by_uid.get(feature_uid, [])
        id_info = matches[0] if matches else None

        # "?" in an adduct is a placeholder that is written as "H".
        adduct = "null"
        if row.get("adduct") is not None:
            adduct = str(row["adduct"]).replace("?", "H")

        # Identification fields default to "null" / reliability 4 (unknown).
        database_identifier = "null"
        chemical_formula = "null"
        smiles_val = "null"
        inchi_val = "null"
        chemical_name = "null"
        best_id_confidence_measure = "null"
        best_id_confidence_value = "null"
        reliability = "4"
        theoretical_neutral_mass = "null"
        if id_info:
            if id_info.get("cmpd_uid") is not None:
                database_identifier = f"cmpd:{id_info['cmpd_uid']}"
            if id_info.get("formula") is not None and id_info["formula"] != "":
                chemical_formula = safe_str(id_info["formula"])
            if id_info.get("smiles") is not None and id_info["smiles"] != "":
                smiles_val = safe_str(id_info["smiles"])
            if id_info.get("inchi") is not None and id_info["inchi"] != "":
                inchi_val = safe_str(id_info["inchi"])
            if id_info.get("name") is not None and id_info["name"] != "":
                chemical_name = safe_str(id_info["name"])
            if id_info.get("neutral_mass") is not None:
                theoretical_neutral_mass = safe_str(id_info["neutral_mass"])
            elif id_info.get("mass") is not None:
                theoretical_neutral_mass = safe_str(id_info["mass"])
            if id_info.get("matcher") is not None:
                best_id_confidence_measure = f"[MS, MS:1002888, {id_info['matcher']}, ]"
            score = id_info.get("score")
            if score is not None:
                best_id_confidence_value = safe_str(score)
            # A null score is ranked like 0 instead of raising on comparison.
            ranking_score = 0 if score is None else score
            if ranking_score >= 0.8:
                reliability = "2a"  # High confidence compound match
            elif ranking_score >= 0.5:
                reliability = "2b"  # Moderate confidence match
            elif ranking_score >= 0.2:
                reliability = "3"  # Compound class level
            else:
                reliability = "4"  # Unknown compound

        mgf_indexes = mgf_mapping.get(feature_uid, [])
        abundance_str = safe_str(row.get("inty", None))

        sml_lines.append(
            "\t".join(
                [
                    "SML",
                    str(idx),
                    str(idx),  # SMF_ID_REFS - same as SML_ID for single features
                    database_identifier,
                    chemical_formula,
                    smiles_val,
                    inchi_val,
                    chemical_name,
                    safe_str(row.get("uri", "null")),
                    theoretical_neutral_mass,
                    adduct,
                    reliability,
                    best_id_confidence_measure,
                    best_id_confidence_value,
                    ",".join(map(str, mgf_indexes)) if mgf_indexes else "null",
                    abundance_str,  # abundance_assay[1]
                    abundance_str,  # abundance_study_variable[1] (same for single sample)
                    "null",  # abundance_variation_study_variable[1] (no variation for single sample)
                ]
            )
        )

        # References to SME entries: IDs are feature_uid * 1000 + rank, the
        # same scheme used when the SME table is built below.
        # NOTE(review): assumes feature_uid is an integer and that a feature
        # has fewer than 1000 evidence rows - confirm.
        sme_refs = "null"
        sme_ambiguity = "null"
        if matches:
            sme_refs = "|".join(str(feature_uid * 1000 + rank) for rank in range(1, len(matches) + 1))
            if len(matches) > 1:
                unique_cmpds = {m["cmpd_uid"] for m in matches if m.get("cmpd_uid") is not None}
                # "1": ambiguous identification, "2": multiple evidence for same molecule
                sme_ambiguity = "1" if len(unique_cmpds) > 1 else "2"

        iso_value = row.get("iso", 0)
        if iso_value is not None and round(iso_value) != 0:
            isotopomer = f'[MS,MS:1002957,"isotopomer MS peak","+{round(iso_value)}"]'
        else:
            isotopomer = "null"

        smf_lines.append(
            "\t".join(
                [
                    "SMF",
                    str(idx),
                    sme_refs,
                    sme_ambiguity,
                    adduct,  # adduct_ion
                    isotopomer,
                    safe_str(row.get("mz", "null")),  # exp_mass_to_charge
                    safe_str(row.get("charge", "null")),
                    safe_str(row.get("rt", "null")),  # retention_time_in_seconds
                    safe_str(row.get("rt_start", "null")),
                    safe_str(row.get("rt_end", "null")),
                    abundance_str,  # abundance_assay[1]
                    abundance_str,  # abundance_study_variable[1]
                    "null",  # abundance_variation_study_variable[1]
                ]
            )
        )

    # --- SME (Small Molecule Evidence) table ---------------------------------
    sme_lines = []
    if full_id_data is not None and not full_id_data.is_empty():
        sme_lines.append(
            "COM\tThe spectra_ref are dummy placeholders, as the annotation was based on aggregated data",
        )
        sme_header = [
            "SEH",
            "SME_ID",
            "evidence_input_id",
            "database_identifier",
            "chemical_formula",
            "smiles",
            "inchi",
            "chemical_name",
            "uri",
            "derivatized_form",
            "adduct_ion",
            "exp_mass_to_charge",
            "charge",
            "theoretical_mass_to_charge",
            "spectra_ref",
            "identification_method",
            "ms_level",
            "id_confidence_measure[1]",
            "rank",
        ]
        sme_lines.append("\t".join(sme_header))

        # Each feature_uid is handled once (first row wins), in feature order.
        seen_uids = set()
        for feature_row in self.features_df.iter_rows(named=True):
            feature_uid = feature_row["feature_uid"]
            if feature_uid in seen_uids:
                continue
            seen_uids.add(feature_uid)
            matches = evidence_by_uid.get(feature_uid)
            if not matches:
                continue

            feature_mz = feature_row.get("mz", 0)
            feature_rt = feature_row.get("rt", 0)
            feature_id = feature_row.get("feature_id", feature_uid)
            # Null mz/rt are formatted as 0 here rather than raising.
            evidence_id = (
                f"feature_uid={feature_uid}:feature_id={feature_id}"
                f":mz={(feature_mz or 0):.4f}:rt={(feature_rt or 0):.2f}"
            )
            exp_mz = safe_str(feature_mz)
            charge_str = safe_str(feature_row.get("charge", "1"))

            for rank, evidence in enumerate(matches, 1):
                sme_id = feature_uid * 1000 + rank

                db_id = "null"
                if evidence.get("db_id") is not None and evidence["db_id"] != "":
                    db_id = safe_str(evidence["db_id"])
                elif evidence.get("cmpd_uid") is not None:
                    db_id = f"cmpd:{evidence['cmpd_uid']}"

                adduct_ion = "null"
                if evidence.get("adduct") is not None and evidence["adduct"] != "":
                    adduct_ion = safe_str(evidence["adduct"]).replace("?", "H")

                # Placeholder reference, see the COM line above the header.
                spectra_ref = "ms_run[1]:spectrum=0"

                matcher = evidence.get("matcher")
                if matcher is not None:
                    id_method = f"[MS, MS:1002888, {matcher}, ]"
                else:
                    id_method = "[MS, MS:1002888, small molecule confidence measure, ]"

                # MS level 1 only when the matcher name mentions "ms1";
                # a missing matcher is reported as level 2.
                if matcher is not None and "ms1" in str(matcher).lower():
                    ms_level = "[MS, MS:1000511, ms level, 1]"
                else:
                    ms_level = "[MS,MS:1000511, ms level, 2]"

                theoretical_mz = "null"
                if evidence.get("mz") is not None:
                    theoretical_mz = safe_str(evidence["mz"])

                sme_lines.append(
                    "\t".join(
                        [
                            "SME",
                            str(sme_id),
                            evidence_id,
                            db_id,
                            safe_str(evidence.get("formula", "null")),
                            safe_str(evidence.get("smiles", "null")),
                            safe_str(evidence.get("inchi", "null")),
                            safe_str(evidence.get("name", "null")),
                            "null",  # uri
                            "null",  # derivatized_form
                            adduct_ion,
                            exp_mz,
                            charge_str,
                            theoretical_mz,
                            spectra_ref,
                            id_method,
                            ms_level,
                            safe_str(evidence.get("score", "null")),
                            str(rank),  # rank
                        ]
                    )
                )

    # --- MGF table (written as COM rows) -------------------------------------
    mgf_lines = []
    if include_mgf and mgf_data is not None and len(mgf_data) > 0:
        mgf_header = [
            "COM",
            "MGH",
            "mgf_id",
            "prec_id",
            "prec_rt",
            "prec_mz",
            "prec_int",
            "energy",
            "level",
            "title",
            "spec_tic",
            "spec_len",
            "spec_mz",
            "spec_int",
        ]
        mgf_lines.append("\t".join(mgf_header))
        for row in mgf_data.iter_rows(named=True):
            spectrum_mz = row["spec_mz"]
            spectrum_inty = row["spec_int"]
            spec_tic = sum(spectrum_inty) if spectrum_inty else 0
            spec_len = row["spec_len"] if row["spec_len"] is not None else 0
            # Peaks are serialised as pipe-separated strings.
            spec_mz_str = "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
            spec_int_str = "|".join([f"{int(inty)}" for inty in spectrum_inty]) if spectrum_inty else ""
            mgf_lines.append(
                "\t".join(
                    [
                        "COM",
                        "MGF",
                        str(row["mgf_index"]) if row["mgf_index"] is not None else "null",
                        str(row["feature_id"]) if row["feature_id"] is not None else "null",
                        f"{row['rtinseconds']:.2f}" if row["rtinseconds"] is not None else "null",
                        f"{row['pepmass']:.4f}" if row["pepmass"] is not None else "null",
                        "null",  # prec_int
                        str(row["energy"]) if row["energy"] is not None else "null",
                        str(row["mslevel"]) if row["mslevel"] is not None else "null",
                        str(row["title"]) if row["title"] is not None else "null",
                        f"{int(spec_tic)}" if spec_tic > 0 else "null",
                        str(spec_len) if spec_len > 0 else "null",
                        spec_mz_str if spec_mz_str else "null",
                        spec_int_str if spec_int_str else "null",
                    ]
                )
            )

    # --- Write everything in one go ------------------------------------------
    # Sections are separated by a single blank line; empty optional sections
    # (SME, MGF) are omitted entirely.
    sections = [mtd_lines, sml_lines, smf_lines]
    if sme_lines:
        sections.append(sme_lines)
    if mgf_lines:
        sections.append(mgf_lines)
    with open(filename, "w", encoding="utf-8") as f:
        for position, section in enumerate(sections):
            if position:
                f.write("\n")
            f.writelines(line + "\n" for line in section)

    self.logger.success(f"Exported mzTab-M to {filename}")

masster 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl

Potentially problematic release.

masster 0.5.28__py3-none-any.whl → 0.6.2__py3-none-any.whl