masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/export.py CHANGED
@@ -10,9 +10,9 @@ import polars as pl
 
 from tqdm import tqdm
 
-from master.spectrum import combine_peaks
-from master.study.defaults import export_mgf_defaults
-from master._version import get_version
+from masster.spectrum import combine_peaks
+from masster.study.defaults import export_mgf_defaults
+from masster._version import get_version
 
 
 def _get_mgf_df(self, **kwargs):
@@ -107,11 +107,7 @@ def _get_mgf_df(self, **kwargs):
 mask = mask & (spec.inty >= inty_min)
 for attr in spec.__dict__:
 arr = getattr(spec, attr)
-if (
-isinstance(arr, list | np.ndarray)
-and hasattr(arr, "__len__")
-and len(arr) == length
-):
+if isinstance(arr, list | np.ndarray) and hasattr(arr, "__len__") and len(arr) == length:
 setattr(spec, attr, np.array(arr)[mask])
 return spec
 
@@ -121,12 +117,8 @@ def _get_mgf_df(self, **kwargs):
 return None
 
 # Prepare spectrum data
-spectrum_mz = (
-spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
-)
-spectrum_inty = (
-spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
-)
+spectrum_mz = spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
+spectrum_inty = spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
 
 # Determine MS level
 ms_level = spect.ms_level if spect.ms_level is not None else 1
@@ -266,11 +258,7 @@ def _get_mgf_df(self, **kwargs):
 
 elif selection == "all":
 if merge:
-specs = [
-row_e["spec"]
-for row_e in cons_ms2.iter_rows(named=True)
-if row_e["spec"] is not None
-]
+specs = [row_e["spec"] for row_e in cons_ms2.iter_rows(named=True) if row_e["spec"] is not None]
 if not specs:
 continue
 spect = combine_peaks(specs)
@@ -422,6 +410,13 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 description (str, optional): Human-readable description.
 **kwargs: Additional metadata or export options.
 """
+
+def safe_str(value, default="null"):
+"""Convert value to string, replacing empty strings with 'null'"""
+if value is None:
+return default
+str_val = str(value)
+return str_val if str_val.strip() != "" else default
 if filename is None:
 filename = "study.mztab"
 if not os.path.isabs(filename):
@@ -435,16 +430,14 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 top_id_data = None
 try:
 # Import here to avoid circular imports
-from master.study.id import get_id
-
+from masster.study.id import get_id
 id_data = get_id(self)
 if id_data is not None and not id_data.is_empty():
 # Get top scoring identification for each consensus_uid for SML section
-top_id_data = (
-id_data.group_by("consensus_uid")
-.agg(pl.all().sort_by("score", descending=True).first())
-.sort("consensus_uid")
-)
+top_id_data = (id_data
+.group_by("consensus_uid")
+.agg(pl.all().sort_by("score", descending=True).first())
+.sort("consensus_uid"))
 else:
 self.logger.info("No identification data available for mzTab export")
 except Exception as e:
@@ -468,9 +461,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 
 # --- Prepare MTD (metadata) section ---
 mtd_lines = []
-mtd_lines.append(
-f"COM\tfile generated by MASSter {get_version()} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
-)
+mtd_lines.append(f"COM\tfile generated by MASSter {get_version()} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
 mtd_lines.append("\nMTD\tmzTab-version\t2.2.0-M")
 id = self.label if self.label else self.folder
 mtd_lines.append(f"MTD\tmzTab-id\t{id}")
@@ -478,67 +469,58 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 mtd_lines.append("MTD\tcv[1]-label\tMS")
 mtd_lines.append("MTD\tcv[1]-full_name\tPSI-MS controlled vocabulary")
 mtd_lines.append("MTD\tcv[1]-version\t4.1.199")
-mtd_lines.append(
-"MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo",
-)
+mtd_lines.append("MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo")
 mtd_lines.append("")
-mtd_lines.append(
-"MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
-)
-mtd_lines.append(
-"MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
-)
+mtd_lines.append("MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
+mtd_lines.append("MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
 mtd_lines.append(
 "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]",
 )
-
+
 # Add identification confidence measures if identification data is available
 if id_data is not None:
-mtd_lines.append(
-"MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
-)
+mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
 else:
-mtd_lines.append(
-"MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
-)
-
+mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
+
 mtd_lines.append("")
 mtd_lines.append("MTD\tsoftware[1]\t[MS, MS:1003430, OpenMS, unknown]")
 mtd_lines.append(f"MTD\tsoftware[2]\t[MS, MS:1002878, MASSter, {get_version()}]")
-mtd_lines.append(
-"MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]",
-)
+mtd_lines.append("MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]")
 mtd_lines.append("")
-
+
 # Database information - updated based on identification data
-if (
-id_data is not None
-and hasattr(self, "lib_df")
-and self.lib_df is not None
-and not self.lib_df.is_empty()
-):
+if id_data is not None and hasattr(self, 'lib_df') and self.lib_df is not None and not self.lib_df.is_empty():
 mtd_lines.append('MTD\tdatabase[1]\t[, , "compound library", ]')
 mtd_lines.append("MTD\tdatabase[1]-prefix\tcmpd")
 mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
-mtd_lines.append("MTD\tdatabase[1]-uri\tnull")
+mtd_lines.append("MTD\tdatabase[1]-uri\thttps://pubchem.ncbi.nlm.nih.gov/")
 else:
-mtd_lines.append('MTD\tdatabase[1]\t[, , "no database", null]')
-mtd_lines.append("MTD\tdatabase[1]-prefix\tnull")
+mtd_lines.append('MTD\tdatabase[1]\t[, , "PubChem", ]')
+mtd_lines.append("MTD\tdatabase[1]-prefix\tCID")
 mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
-mtd_lines.append("MTD\tdatabase[1]-uri\tnull")
-
+mtd_lines.append("MTD\tdatabase[1]-uri\thttps://pubchem.ncbi.nlm.nih.gov/")
+
 # Get abundance matrix to determine the number of assays needed
 abundance_matrix = self.get_consensus_matrix()
-
+
 # Get sample columns (excluding consensus_uid)
 sample_columns = [col for col in abundance_matrix.columns if col != "consensus_uid"]
 n_assays = len(sample_columns)
-
+
 # Define samples, ms_runs, and assays based on the abundance matrix columns
+# Determine scan polarity based on study polarity
+study_polarity = getattr(self, 'polarity', 'positive')
+if study_polarity in ['negative', 'neg']:
+scan_polarity_cv = "[MS, MS:1000129, negative scan, ]"
+else:
+scan_polarity_cv = "[MS, MS:1000130, positive scan, ]"
+
 for i, sample_col in enumerate(sample_columns, 1):
 mtd_lines.append(f"\nMTD\tsample[{i}]\t{sample_col}")
 mtd_lines.append(f"MTD\tsample[{i}]-description\t{sample_col}")
 mtd_lines.append(f"MTD\tms_run[{i}]-location\tfile://unknown")
+mtd_lines.append(f"MTD\tms_run[{i}]-scan_polarity\t{scan_polarity_cv}")
 mtd_lines.append(f"MTD\tassay[{i}]\tAssay_{i}")
 mtd_lines.append(f"MTD\tassay[{i}]-sample_ref\tsample[{i}]")
 mtd_lines.append(f"MTD\tassay[{i}]-ms_run_ref\tms_run[{i}]")
@@ -575,24 +557,15 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 # round to int - handle both Polars and Pandas DataFrames
 if hasattr(abundance_matrix, "with_columns"):
 # Polars DataFrame
-numeric_cols = [
-col
-for col in abundance_matrix.columns
-if abundance_matrix[col].dtype.is_numeric()
-]
-abundance_matrix = abundance_matrix.with_columns(
-[abundance_matrix[col].round(0) for col in numeric_cols],
-)
+numeric_cols = [col for col in abundance_matrix.columns if abundance_matrix[col].dtype.is_numeric()]
+abundance_matrix = abundance_matrix.with_columns([abundance_matrix[col].round(0) for col in numeric_cols])
 else:
 # Pandas DataFrame
 abundance_matrix = abundance_matrix.round(0)
 
 # Use the n_assays already calculated from abundance matrix columns
 sml_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
-sml_header += [
-"abundance_study_variable[1]",
-"abundance_variation_study_variable[1]",
-]
+sml_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
 sml_lines.append("\t".join(sml_header))
 
 # get adducts from consensus_df['adduct_top'] - use the top-ranked adduct directly
@@ -602,7 +575,9 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 # Use adduct_top if available, otherwise fall back to null
 if "adduct_top" in row and row["adduct_top"] is not None:
 adduct = str(row["adduct_top"])
-
+# Replace ? with H for better mzTab compatibility
+adduct = adduct.replace("?", "H")
+
 adduct_list.append(adduct)
 
 for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
@@ -613,56 +588,63 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 id_matches = top_id_data.filter(pl.col("consensus_uid") == consensus_uid)
 if id_matches.height > 0:
 id_info = id_matches.row(0, named=True)
-
+
 # Populate identification fields
 database_identifier = "null"
 chemical_formula = "null"
 smiles_val = "null"
-inchi_val = "null"
+inchi_val = "null"
 chemical_name = "null"
 best_id_confidence_measure = "null"
 best_id_confidence_value = "null"
 reliability = "4" # Default: unknown compound
-
+theoretical_neutral_mass = "null" # Only set when we have database identification
+
 if id_info:
 # Use cmpd_uid as database identifier with prefix
 if id_info.get("cmpd_uid") is not None:
 database_identifier = f"cmpd:{id_info['cmpd_uid']}"
-
+
 # Chemical formula
 if id_info.get("formula") is not None and id_info["formula"] != "":
-chemical_formula = str(id_info["formula"])
-
+chemical_formula = safe_str(id_info["formula"])
+
 # SMILES
 if id_info.get("smiles") is not None and id_info["smiles"] != "":
-smiles_val = str(id_info["smiles"])
-
+smiles_val = safe_str(id_info["smiles"])
+
 # InChI
 if id_info.get("inchi") is not None and id_info["inchi"] != "":
-inchi_val = str(id_info["inchi"])
-
+inchi_val = safe_str(id_info["inchi"])
+
 # Chemical name
 if id_info.get("name") is not None and id_info["name"] != "":
-chemical_name = str(id_info["name"])
-
+chemical_name = safe_str(id_info["name"])
+
+# Theoretical neutral mass - only from identification data, not consensus_df
+if id_info.get("neutral_mass") is not None:
+theoretical_neutral_mass = safe_str(id_info["neutral_mass"])
+elif id_info.get("mass") is not None:
+theoretical_neutral_mass = safe_str(id_info["mass"])
+
 # Identification confidence
 if id_info.get("matcher") is not None:
 best_id_confidence_measure = f"[MS, MS:1002888, {id_info['matcher']}, ]"
-
+
 if id_info.get("score") is not None:
-best_id_confidence_value = str(id_info["score"])
-
+best_id_confidence_value = safe_str(id_info["score"])
+
 # Set reliability based on identification quality
 # Using mzTab-M hr-ms identification levels: 2a=compound match, 2b=library spectrum match, 3=compound class, 4=unknown
 if id_info.get("score", 0) >= 0.8:
 reliability = "2a" # High confidence compound match
 elif id_info.get("score", 0) >= 0.5:
-reliability = "2b" # Moderate confidence match
+reliability = "2b" # Moderate confidence match
 elif id_info.get("score", 0) >= 0.2:
-reliability = "3" # Compound class level
+reliability = "3" # Compound class level
 else:
-reliability = "4" # Unknown compound
-
+reliability = "4" # Unknown compound
+
 # Get MGF indexes for this consensus feature
 mgf_indexes = mgf_mapping.get(row["consensus_uid"], [])
 
@@ -675,10 +657,8 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 smiles_val,
 inchi_val,
 chemical_name,
-str(row.get("uri", "null")),
-str(
-row.get("adduct_mass_neutral_top", "null"),
-), # Use calculated neutral mass from adduct analysis
+safe_str(row.get("uri", "null")),
+theoretical_neutral_mass, # Only set when database_identifier is not null
 adduct_list[idx - 1],
 reliability,
 best_id_confidence_measure,
@@ -688,24 +668,29 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 # Add abundance values for each assay
 consensus_uid = row["consensus_uid"]
 # Check if consensus_uid exists in the abundance_matrix (Polars)
-filtered_matrix = abundance_matrix.filter(
-pl.col("consensus_uid") == consensus_uid,
-)
+filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
 if filtered_matrix.height > 0:
 # Get the first (and should be only) matching row
 abundance_row = filtered_matrix.row(0, named=True)
 # Extract values excluding the consensus_uid column
-abundance_values = [
-abundance_row[col]
-for col in abundance_matrix.columns
-if col != "consensus_uid"
-]
-sml_row += [
-str(val) if val is not None else "null" for val in abundance_values
-]
+abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
+sml_row += [safe_str(val) if val is not None else "null" for val in abundance_values]
+
+# Calculate study variable statistics
+non_null_values = [val for val in abundance_values if val is not None]
+if non_null_values:
+abundance_study_variable = sum(non_null_values) / len(non_null_values)
+abundance_variation_study_variable = (
+sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)
+) ** 0.5 if len(non_null_values) > 1 else 0
+else:
+abundance_study_variable = "null"
+abundance_variation_study_variable = "null"
+
+sml_row += [safe_str(abundance_study_variable), safe_str(abundance_variation_study_variable)]
 else:
 sml_row += ["null"] * n_assays
-sml_row += ["null", "null"]
+sml_row += ["null", "null"] # Study variable columns
 sml_lines.append("\t".join(sml_row))
 with open(filename, "a", encoding="utf-8") as f:
 f.write("\n")
@@ -717,8 +702,8 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 smf_header = [
 "SFH",
 "SMF_ID",
-"SOME_ID_REFS",
-"SOME_ID_REF_ambiguity_code",
+"SME_ID_REFS",
+"SME_ID_REF_ambiguity_code",
 "adduct_ion",
 "isotopomer",
 "exp_mass_to_charge",
@@ -728,96 +713,99 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 "retention_time_in_seconds_end",
 ]
 smf_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
+smf_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
 smf_lines.append("\t".join(smf_header))
 
 # SMF table uses the same consensus features as SML, just different metadata
 for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
-# References to SOME entries - each SMF can reference multiple SOME entries for the same consensus_uid
-some_refs = "null"
-some_ambiguity = "null"
+# References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
+sme_refs = "null"
+sme_ambiguity = "null"
 consensus_uid = row["consensus_uid"]
-
+
 if id_data is not None:
-# Find all SOME entries for this consensus_uid
-some_matches = id_data.filter(pl.col("consensus_uid") == consensus_uid)
-if some_matches.height > 0:
-# Generate SOME IDs - we'll create a mapping in the SOME section
+# Find all SME entries for this consensus_uid
+sme_matches = id_data.filter(pl.col("consensus_uid") == consensus_uid)
+if sme_matches.height > 0:
+# Generate SME IDs - we'll create a mapping in the SME section
 # For now, use a simple approach based on consensus_uid and lib_uid
-some_ids = []
-for i, some_row in enumerate(some_matches.iter_rows(named=True)):
-# Create a unique SOME ID based on consensus_uid and position
-some_id_base = (
-consensus_uid * 1000
-) # Ensure uniqueness across consensus features
-some_id = some_id_base + i + 1
-some_ids.append(str(some_id))
-
-if some_ids:
-some_refs = "|".join(some_ids)
+sme_ids = []
+for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
+# Create a unique SME ID based on consensus_uid and position
+sme_id_base = consensus_uid * 1000 # Ensure uniqueness across consensus features
+sme_id = sme_id_base + i + 1
+sme_ids.append(str(sme_id))
+
+if sme_ids:
+sme_refs = "|".join(sme_ids)
 # Set ambiguity code: 1=ambiguous identification, 2=multiple evidence same molecule, 3=both
-if len(some_ids) > 1:
+if len(sme_ids) > 1:
 # Check if all identifications point to the same compound
-unique_cmpds = {
-match["cmpd_uid"]
-for match in some_matches.iter_rows(named=True)
-if match.get("cmpd_uid") is not None
-}
+unique_cmpds = set(match["cmpd_uid"] for match in sme_matches.iter_rows(named=True)
+if match.get("cmpd_uid") is not None)
 if len(unique_cmpds) > 1:
-some_ambiguity = "1" # Ambiguous identification
+sme_ambiguity = "1" # Ambiguous identification
 else:
-some_ambiguity = "2" # Multiple evidence for same molecule
+sme_ambiguity = "2" # Multiple evidence for same molecule
 else:
-some_ambiguity = "null"
-
+sme_ambiguity = "null"
+
 smf_row = [
 "SMF",
 str(idx),
-some_refs,
-some_ambiguity,
+sme_refs,
+sme_ambiguity,
 adduct_list[idx - 1], # adduct_ion
-str(row.get("isotopomer", "null")),
-str(row.get("mz", "null")), # exp_mass_to_charge
-str(row.get("adduct_charge_top", "null")), # Use top-ranked adduct charge
-str(row.get("rt", "null")), # retention_time_in_seconds
-str(row.get("retention_time_in_seconds_start", "null")),
-str(row.get("retention_time_in_seconds_end", "null")),
+safe_str(row.get("isotopomer", "null")),
+safe_str(row.get("mz", "null")), # exp_mass_to_charge
+safe_str(row.get("adduct_charge_top", "null")), # Use top-ranked adduct charge
+safe_str(row.get("rt", "null")), # retention_time_in_seconds
+safe_str(row.get("retention_time_in_seconds_start", "null")),
+safe_str(row.get("retention_time_in_seconds_end", "null")),
 ]
 # Add abundance values for each assay - same as SML (Polars)
 consensus_uid = row["consensus_uid"]
-filtered_matrix = abundance_matrix.filter(
-pl.col("consensus_uid") == consensus_uid,
-)
+filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
 if filtered_matrix.height > 0:
 # Get the first (and should be only) matching row
 abundance_row = filtered_matrix.row(0, named=True)
 # Extract values excluding the consensus_uid column
-abundance_values = [
-abundance_row[col]
-for col in abundance_matrix.columns
-if col != "consensus_uid"
-]
-smf_row += [
-str(val) if val is not None else "null" for val in abundance_values
-]
+abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
+abundance_strings = [safe_str(val) if val is not None else "null" for val in abundance_values]
+smf_row += abundance_strings
+
+# Calculate study variable statistics (same as in SML section)
+non_null_values = [val for val in abundance_values if val is not None]
+if non_null_values:
+abundance_study_variable = sum(non_null_values) / len(non_null_values)
+abundance_variation_study_variable = (
+sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)
+) ** 0.5 if len(non_null_values) > 1 else 0
+else:
+abundance_study_variable = "null"
+abundance_variation_study_variable = "null"
+
+smf_row += [safe_str(abundance_study_variable), safe_str(abundance_variation_study_variable)]
 else:
 smf_row += ["null"] * n_assays
+smf_row += ["null", "null"] # Study variable columns
 smf_lines.append("\t".join(smf_row))
 with open(filename, "a", encoding="utf-8") as f:
 f.write("\n")
 for line in smf_lines:
 f.write(line + "\n")
 
-# --- SOME (Small Molecule Evidence) table ---
+# --- SME (Small Molecule Evidence) table ---
 if id_data is not None and not id_data.is_empty():
-some_lines = []
-some_header = [
-"SHE",
-"SOME_ID",
+sme_lines = []
+sme_header = [
+"SEH",
+"SME_ID",
 "evidence_input_id",
 "database_identifier",
 "chemical_formula",
 "smiles",
-"inchi",
+"inchi",
 "chemical_name",
 "uri",
 "derivatized_form",
@@ -831,87 +819,81 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 "id_confidence_measure[1]",
 "rank",
 ]
-some_lines.append("\t".join(some_header))
-
-# Create SOME entries for all identification results
-for consensus_uid in (
-self.consensus_df.select("consensus_uid").to_series().unique()
-):
+sme_lines.append("\t".join(sme_header))
+
+# Create SME entries for all identification results
+for consensus_uid in self.consensus_df.select("consensus_uid").to_series().unique():
 # Get consensus feature data for this consensus_uid
-consensus_feature_data = self.consensus_df.filter(
-pl.col("consensus_uid") == consensus_uid,
-)
+consensus_feature_data = self.consensus_df.filter(pl.col("consensus_uid") == consensus_uid)
 if consensus_feature_data.height == 0:
 continue
 consensus_row = consensus_feature_data.row(0, named=True)
-
+
 # Get all identification results for this consensus feature
-some_matches = id_data.filter(pl.col("consensus_uid") == consensus_uid)
-
-if some_matches.height > 0:
+sme_matches = id_data.filter(pl.col("consensus_uid") == consensus_uid)
+
+if sme_matches.height > 0:
 # Sort by score descending to maintain rank order
-some_matches = some_matches.sort("score", descending=True)
-
-for i, some_row in enumerate(some_matches.iter_rows(named=True)):
-# Generate unique SOME_ID
-some_id_base = consensus_uid * 1000
-some_id = some_id_base + i + 1
-
+sme_matches = sme_matches.sort("score", descending=True)
+
+for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
+# Generate unique SME_ID
+sme_id_base = consensus_uid * 1000
+sme_id = sme_id_base + i + 1
+
 # Create evidence input ID - use consensus feature info
-evidence_id = f"consensus_feature:{consensus_uid}:mz={some_row.get('mz', 0):.4f}:rt={some_row.get('rt', 0):.2f}"
-
+evidence_id = f"consensus_feature:{consensus_uid}:mz={sme_row.get('mz', 0):.4f}:rt={sme_row.get('rt', 0):.2f}"
+
 # Database identifier with prefix
 db_id = "null"
-if some_row.get("cmpd_uid") is not None:
-db_id = f"cmpd:{some_row['cmpd_uid']}"
-
-# Get adduct information
+if sme_row.get("cmpd_uid") is not None:
+db_id = f"cmpd:{sme_row['cmpd_uid']}"
+
+# Get adduct information
 adduct_ion = "null"
-if some_row.get("adduct") is not None and some_row["adduct"] != "":
-adduct_ion = str(some_row["adduct"])
-
-# Spectra reference - reference to the consensus feature
-spectra_ref = f"consensus_feature:{consensus_uid}"
-
+if sme_row.get("adduct") is not None and sme_row["adduct"] != "":
+adduct_ion = safe_str(sme_row["adduct"])
+# Replace ? with H for better mzTab compatibility
+adduct_ion = adduct_ion.replace("?", "H")
+
+# Spectra reference - reference to first ms_run with spectrum index 0
+spectra_ref = "ms_run[1]:spectrum=0"
+
 # Identification method
 id_method = "[MS, MS:1002888, small molecule confidence measure, ]"
-if some_row.get("matcher") is not None:
-id_method = f"[MS, MS:1002888, {some_row['matcher']}, ]"
-
+if sme_row.get("matcher") is not None:
+id_method = f"[MS, MS:1002888, {sme_row['matcher']}, ]"
+
 # MS level - assume MS1 for now
 ms_level = "[MS, MS:1000511, ms level, 1]"
-
-some_line = [
-"SOME",
-str(some_id),
+
+sme_line = [
+"SME",
+str(sme_id),
 evidence_id,
 db_id,
-str(some_row.get("formula", "null")),
-str(some_row.get("smiles", "null")),
-str(some_row.get("inchi", "null")),
-str(some_row.get("name", "null")),
+safe_str(sme_row.get("formula", "null")),
+safe_str(sme_row.get("smiles", "null")),
+safe_str(sme_row.get("inchi", "null")),
+safe_str(sme_row.get("name", "null")),
 "null", # uri - not available in current data
 "null", # derivatized_form
 adduct_ion,
-str(some_row.get("mz", "null")),
-str(
-consensus_row.get("adduct_charge_top", "1"),
-), # Use consensus feature's top adduct charge
-str(
-some_row.get("mz", "null"),
-), # theoretical m/z same as experimental for now
+safe_str(sme_row.get("mz", "null")),
+safe_str(consensus_row.get("adduct_charge_top", "1")), # Use consensus feature's top adduct charge
+safe_str(sme_row.get("mz", "null")), # theoretical m/z same as experimental for now
 spectra_ref,
 id_method,
 ms_level,
-str(some_row.get("score", "null")),
+safe_str(sme_row.get("score", "null")),
 str(i + 1), # rank within this consensus feature
 ]
-some_lines.append("\t".join(some_line))
-
-# Write SOME table
+sme_lines.append("\t".join(sme_line))
+
+# Write SME table
 with open(filename, "a", encoding="utf-8") as f:
 f.write("\n")
-for line in some_lines:
+for line in sme_lines:
 f.write(line + "\n")
 
 # --- MGF table ---
@@ -945,23 +927,15 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 spec_len = row["spec_len"] if row["spec_len"] is not None else 0
 
 # Format spectrum data as pipe-separated strings
-spec_mz_str = (
-"|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
-)
-spec_int_str = (
-"|".join([f"{int(inty)}" for inty in spectrum_inty])
-if spectrum_inty
-else ""
-)
+spec_mz_str = "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
+spec_int_str = "|".join([f"{int(inty)}" for inty in spectrum_inty]) if spectrum_inty else ""
 
 mgf_row = [
 "COM",
 "MGF",
 str(row["mgf_index"]) if row["mgf_index"] is not None else "null",
 str(row["feature_id"]) if row["feature_id"] is not None else "null",
-f"{row['rtinseconds']:.2f}"
-if row["rtinseconds"] is not None
-else "null",
+f"{row['rtinseconds']:.2f}" if row["rtinseconds"] is not None else "null",
 f"{row['pepmass']:.4f}" if row["pepmass"] is not None else "null",
 "null", # prec_int - not available in current data
 str(row["energy"]) if row["energy"] is not None else "null",