PyPI - masster - Versions diffs - 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl - Mend

masster 0.4.3 (py3-none-any.whl) → 0.4.5 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (54) hide show

masster/__init__.py +8 -8
masster/_version.py +1 -1
masster/chromatogram.py +1 -1
masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
masster/data/libs/__pycache__/ccm.cpython-312.pyc +0 -0
masster/data/libs/__pycache__/urine.cpython-312.pyc +0 -0
masster/data/libs/ccm.csv +120 -0
masster/data/libs/urine.csv +4693 -0
masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
masster/logger.py +11 -11
masster/sample/__init__.py +1 -1
masster/sample/adducts.py +338 -264
masster/sample/defaults/find_adducts_def.py +21 -8
masster/sample/h5.py +561 -282
masster/sample/helpers.py +131 -75
masster/sample/lib.py +4 -4
masster/sample/load.py +31 -17
masster/sample/parameters.py +1 -1
masster/sample/plot.py +7 -7
masster/sample/processing.py +117 -87
masster/sample/sample.py +103 -90
masster/sample/sample5_schema.json +196 -0
masster/sample/save.py +35 -12
masster/spectrum.py +1 -1
masster/study/__init__.py +1 -1
masster/study/defaults/align_def.py +5 -1
masster/study/defaults/identify_def.py +3 -1
masster/study/defaults/study_def.py +58 -25
masster/study/export.py +360 -210
masster/study/h5.py +560 -158
masster/study/helpers.py +496 -203
masster/study/helpers_optimized.py +1 -1
masster/study/id.py +538 -349
masster/study/load.py +233 -143
masster/study/plot.py +71 -71
masster/study/processing.py +456 -254
masster/study/save.py +15 -5
masster/study/study.py +213 -131
masster/study/study5_schema.json +360 -0
masster-0.4.5.dist-info/METADATA +131 -0
masster-0.4.5.dist-info/RECORD +71 -0
masster-0.4.3.dist-info/METADATA +0 -791
masster-0.4.3.dist-info/RECORD +0 -56
{masster-0.4.3.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
{masster-0.4.3.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
{masster-0.4.3.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
{masster-0.4.3.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0

masster/study/export.py CHANGED Viewed

@@ -10,9 +10,9 @@ import polars as pl
 from tqdm import tqdm
-from masster.spectrum import combine_peaks
-from masster.study.defaults import export_mgf_defaults
-from masster._version import get_version
+from masster.spectrum import combine_peaks
+from masster.study.defaults import export_mgf_defaults
+from masster._version import get_version
 def _get_mgf_df(self, **kwargs):
@@ -107,7 +107,11 @@ def _get_mgf_df(self, **kwargs):
             mask = mask & (spec.inty >= inty_min)
         for attr in spec.__dict__:
             arr = getattr(spec, attr)
-            if isinstance(arr, list | np.ndarray) and hasattr(arr, "__len__") and len(arr) == length:
+            if (
+                isinstance(arr, list | np.ndarray)
+                and hasattr(arr, "__len__")
+                and len(arr) == length
+            ):
                 setattr(spec, attr, np.array(arr)[mask])
         return spec
@@ -117,8 +121,12 @@ def _get_mgf_df(self, **kwargs):
             return None
         # Prepare spectrum data
-        spectrum_mz = spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
-        spectrum_inty = spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
+        spectrum_mz = (
+            spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
+        )
+        spectrum_inty = (
+            spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
+        )
         # Determine MS level
         ms_level = spect.ms_level if spect.ms_level is not None else 1
@@ -258,7 +266,11 @@ def _get_mgf_df(self, **kwargs):
             elif selection == "all":
                 if merge:
-                    specs = [row_e["spec"] for row_e in cons_ms2.iter_rows(named=True) if row_e["spec"] is not None]
+                    specs = [
+                        row_e["spec"]
+                        for row_e in cons_ms2.iter_rows(named=True)
+                        if row_e["spec"] is not None
+                    ]
                     if not specs:
                         continue
                     spect = combine_peaks(specs)
@@ -410,13 +422,14 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         description (str, optional): Human-readable description.
         **kwargs: Additional metadata or export options.
     """
     def safe_str(value, default="null"):
         """Convert value to string, replacing empty strings with 'null'"""
         if value is None:
             return default
         str_val = str(value)
         return str_val if str_val.strip() != "" else default
     if filename is None:
         filename = "study.mztab"
     if not os.path.isabs(filename):
@@ -431,17 +444,23 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     full_id_data = None
     try:
         # Import here to avoid circular imports
-        from masster.study.id import get_id
-        # Get full enriched identification data for SME section
+        from masster.study.id import get_id
+        # Get full enriched identification data for SME section
         full_id_data = get_id(self)
         if full_id_data is not None and not full_id_data.is_empty():
             # Get top scoring identification for each consensus_uid for SML section
-            top_id_data = (full_id_data
-                          .group_by("consensus_uid")
-                          .agg(pl.all().sort_by("score", descending=True).first())
-                          .sort("consensus_uid"))
+            top_id_data = (
+                full_id_data.group_by("consensus_uid")
+                .agg(pl.all().sort_by("score", descending=True).first())
+                .sort("consensus_uid")
+            )
             # Keep raw id_data for backward compatibility (if needed elsewhere)
-            id_data = self.id_df if hasattr(self, 'id_df') and self.id_df is not None else None
+            id_data = (
+                self.id_df
+                if hasattr(self, "id_df") and self.id_df is not None
+                else None
+            )
         else:
             self.logger.info("No identification data available for mzTab export")
     except Exception as e:
@@ -466,7 +485,9 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     # --- Prepare MTD (metadata) section ---
     mtd_lines = []
-    mtd_lines.append(f"COM\tfile generated by MASSter {get_version()} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    mtd_lines.append(
+        f"COM\tfile generated by MASSter {get_version()} on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+    )
     mtd_lines.append("\nMTD\tmzTab-version\t2.2.0-M")
     id = self.label if self.label else self.folder
     mtd_lines.append(f"MTD\tmzTab-id\t{id}")
@@ -474,28 +495,45 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     mtd_lines.append("MTD\tcv[1]-label\tMS")
     mtd_lines.append("MTD\tcv[1]-full_name\tPSI-MS controlled vocabulary")
     mtd_lines.append("MTD\tcv[1]-version\t4.1.199")
-    mtd_lines.append("MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo")
+    mtd_lines.append(
+        "MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo",
+    )
     mtd_lines.append("")
-    mtd_lines.append("MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
-    mtd_lines.append("MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
+    mtd_lines.append(
+        "MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
+    )
+    mtd_lines.append(
+        "MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]",
+    )
     mtd_lines.append(
         "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]",
     )
     # Add identification confidence measures if identification data is available
     if full_id_data is not None:
-        mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
+        mtd_lines.append(
+            "MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
+        )
     else:
-        mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
+        mtd_lines.append(
+            "MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]",
+        )
     mtd_lines.append("")
     mtd_lines.append("MTD\tsoftware[1]\t[MS, MS:1003430, OpenMS, unknown]")
     mtd_lines.append(f"MTD\tsoftware[2]\t[MS, MS:1002878, MASSter, {get_version()}]")
-    mtd_lines.append("MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]")
+    mtd_lines.append(
+        "MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]",
+    )
     mtd_lines.append("")
     # Database information - updated based on identification data
-    if full_id_data is not None and hasattr(self, 'lib_df') and self.lib_df is not None and not self.lib_df.is_empty():
+    if (
+        full_id_data is not None
+        and hasattr(self, "lib_df")
+        and self.lib_df is not None
+        and not self.lib_df.is_empty()
+    ):
         mtd_lines.append('MTD\tdatabase[1]\t[, , "compound library", ]')
         mtd_lines.append("MTD\tdatabase[1]-prefix\tcmpd")
         mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
@@ -505,22 +543,22 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         mtd_lines.append("MTD\tdatabase[1]-prefix\tCID")
         mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
         mtd_lines.append("MTD\tdatabase[1]-uri\thttps://pubchem.ncbi.nlm.nih.gov/")
     # Get abundance matrix to determine the number of assays needed
     abundance_matrix = self.get_consensus_matrix()
     # Get sample columns (excluding consensus_uid)
     sample_columns = [col for col in abundance_matrix.columns if col != "consensus_uid"]
     n_assays = len(sample_columns)
     # Define samples, ms_runs, and assays based on the abundance matrix columns
     # Determine scan polarity based on study polarity
-    study_polarity = getattr(self, 'polarity', 'positive')
-    if study_polarity in ['negative', 'neg']:
+    study_polarity = getattr(self, "polarity", "positive")
+    if study_polarity in ["negative", "neg"]:
         scan_polarity_cv = "[MS, MS:1000129, negative scan, ]"
     else:
         scan_polarity_cv = "[MS, MS:1000130, positive scan, ]"
     for i, sample_col in enumerate(sample_columns, 1):
         mtd_lines.append(f"\nMTD\tsample[{i}]\t{sample_col}")
         mtd_lines.append(f"MTD\tsample[{i}]-description\t{sample_col}")
@@ -562,15 +600,24 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     # round to int - handle both Polars and Pandas DataFrames
     if hasattr(abundance_matrix, "with_columns"):
         # Polars DataFrame
-        numeric_cols = [col for col in abundance_matrix.columns if abundance_matrix[col].dtype.is_numeric()]
-        abundance_matrix = abundance_matrix.with_columns([abundance_matrix[col].round(0) for col in numeric_cols])
+        numeric_cols = [
+            col
+            for col in abundance_matrix.columns
+            if abundance_matrix[col].dtype.is_numeric()
+        ]
+        abundance_matrix = abundance_matrix.with_columns(
+            [abundance_matrix[col].round(0) for col in numeric_cols],
+        )
     else:
         # Pandas DataFrame
         abundance_matrix = abundance_matrix.round(0)
     # Use the n_assays already calculated from abundance matrix columns
     sml_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
-    sml_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
+    sml_header += [
+        "abundance_study_variable[1]",
+        "abundance_variation_study_variable[1]",
+    ]
     sml_lines.append("\t".join(sml_header))
     # get adducts from consensus_df['adduct_top'] - use the top-ranked adduct directly
@@ -582,7 +629,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
             adduct = str(row["adduct_top"])
             # Replace ? with H for better mzTab compatibility
             adduct = adduct.replace("?", "H")
         adduct_list.append(adduct)
     for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
@@ -593,63 +640,65 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
             id_matches = top_id_data.filter(pl.col("consensus_uid") == consensus_uid)
             if id_matches.height > 0:
                 id_info = id_matches.row(0, named=True)
         # Populate identification fields
         database_identifier = "null"
         chemical_formula = "null"
         smiles_val = "null"
-        inchi_val = "null"
+        inchi_val = "null"
         chemical_name = "null"
         best_id_confidence_measure = "null"
         best_id_confidence_value = "null"
         reliability = "4"  # Default: unknown compound
-        theoretical_neutral_mass = "null"  # Only set when we have database identification
+        theoretical_neutral_mass = (
+            "null"  # Only set when we have database identification
+        )
         if id_info:
             # Use cmpd_uid as database identifier with prefix
             if id_info.get("cmpd_uid") is not None:
                 database_identifier = f"cmpd:{id_info['cmpd_uid']}"
             # Chemical formula
             if id_info.get("formula") is not None and id_info["formula"] != "":
                 chemical_formula = safe_str(id_info["formula"])
             # SMILES
             if id_info.get("smiles") is not None and id_info["smiles"] != "":
                 smiles_val = safe_str(id_info["smiles"])
             # InChI
             if id_info.get("inchi") is not None and id_info["inchi"] != "":
                 inchi_val = safe_str(id_info["inchi"])
             # Chemical name
             if id_info.get("name") is not None and id_info["name"] != "":
                 chemical_name = safe_str(id_info["name"])
             # Theoretical neutral mass - only from identification data, not consensus_df
             if id_info.get("neutral_mass") is not None:
                 theoretical_neutral_mass = safe_str(id_info["neutral_mass"])
             elif id_info.get("mass") is not None:
                 theoretical_neutral_mass = safe_str(id_info["mass"])
             # Identification confidence
             if id_info.get("matcher") is not None:
                 best_id_confidence_measure = f"[MS, MS:1002888, {id_info['matcher']}, ]"
             if id_info.get("score") is not None:
                 best_id_confidence_value = safe_str(id_info["score"])
             # Set reliability based on identification quality
             # Using mzTab-M hr-ms identification levels: 2a=compound match, 2b=library spectrum match, 3=compound class, 4=unknown
             if id_info.get("score", 0) >= 0.8:
                 reliability = "2a"  # High confidence compound match
             elif id_info.get("score", 0) >= 0.5:
-                reliability = "2b"  # Moderate confidence match
+                reliability = "2b"  # Moderate confidence match
             elif id_info.get("score", 0) >= 0.2:
-                reliability = "3"   # Compound class level
+                reliability = "3"  # Compound class level
             else:
-                reliability = "4"   # Unknown compound
+                reliability = "4"  # Unknown compound
         # Get MGF indexes for this consensus feature
         mgf_indexes = mgf_mapping.get(row["consensus_uid"], [])
@@ -673,26 +722,45 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         # Add abundance values for each assay
         consensus_uid = row["consensus_uid"]
         # Check if consensus_uid exists in the abundance_matrix (Polars)
-        filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
+        filtered_matrix = abundance_matrix.filter(
+            pl.col("consensus_uid") == consensus_uid,
+        )
         if filtered_matrix.height > 0:
             # Get the first (and should be only) matching row
             abundance_row = filtered_matrix.row(0, named=True)
             # Extract values excluding the consensus_uid column
-            abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
-            sml_row += [safe_str(val) if val is not None else "null" for val in abundance_values]
-            # Calculate study variable statistics
+            abundance_values = [
+                abundance_row[col]
+                for col in abundance_matrix.columns
+                if col != "consensus_uid"
+            ]
+            sml_row += [
+                safe_str(val) if val is not None else "null" for val in abundance_values
+            ]
+            # Calculate study variable statistics
             non_null_values = [val for val in abundance_values if val is not None]
             if non_null_values:
                 abundance_study_variable = sum(non_null_values) / len(non_null_values)
                 abundance_variation_study_variable = (
-                    sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)
-                ) ** 0.5 if len(non_null_values) > 1 else 0
+                    (
+                        sum(
+                            (x - abundance_study_variable) ** 2 for x in non_null_values
+                        )
+                        / len(non_null_values)
+                    )
+                    ** 0.5
+                    if len(non_null_values) > 1
+                    else 0
+                )
             else:
                 abundance_study_variable = "null"
                 abundance_variation_study_variable = "null"
-            sml_row += [safe_str(abundance_study_variable), safe_str(abundance_variation_study_variable)]
+            sml_row += [
+                safe_str(abundance_study_variable),
+                safe_str(abundance_variation_study_variable),
+            ]
         else:
             sml_row += ["null"] * n_assays
             sml_row += ["null", "null"]  # Study variable columns
@@ -707,8 +775,8 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     smf_header = [
         "SFH",
         "SMF_ID",
-        "SME_ID_REFS",
-        "SME_ID_REF_ambiguity_code",
+        "SME_ID_REFS",
+        "SME_ID_REF_ambiguity_code",
         "adduct_ion",
         "isotopomer",
         "exp_mass_to_charge",
@@ -718,86 +786,115 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         "retention_time_in_seconds_end",
     ]
     smf_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
-    smf_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
+    smf_header += [
+        "abundance_study_variable[1]",
+        "abundance_variation_study_variable[1]",
+    ]
     smf_lines.append("\t".join(smf_header))
     # SMF table uses the same consensus features as SML, just different metadata
     for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
-        # References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
-        sme_refs = "null"
-        sme_ambiguity = "null"
+        # References to SME entries - each SMF can reference multiple SME entries for the same consensus_uid
+        some_refs = "null"
+        some_ambiguity = "null"
         consensus_uid = row["consensus_uid"]
         if full_id_data is not None:
-            # Find all SME entries for this consensus_uid
-            sme_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
-            if sme_matches.height > 0:
-                # Generate SME IDs - we'll create a mapping in the SME section
+            # Find all SME entries for this consensus_uid
+            some_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
+            if some_matches.height > 0:
+                # Generate SME IDs - we'll create a mapping in the SME section
                 # For now, use a simple approach based on consensus_uid and lib_uid
-                sme_ids = []
-                for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
-                    # Create a unique SME ID based on consensus_uid and position
-                    sme_id_base = consensus_uid * 1000  # Ensure uniqueness across consensus features
-                    sme_id = sme_id_base + i + 1
-                    sme_ids.append(str(sme_id))
-                if sme_ids:
-                    sme_refs = "|".join(sme_ids)
+                some_ids = []
+                for i, some_row in enumerate(some_matches.iter_rows(named=True)):
+                    # Create a unique SME ID based on consensus_uid and position
+                    some_id_base = (
+                        consensus_uid * 1000
+                    )  # Ensure uniqueness across consensus features
+                    some_id = some_id_base + i + 1
+                    some_ids.append(str(some_id))
+                if some_ids:
+                    some_refs = "|".join(some_ids)
                     # Set ambiguity code: 1=ambiguous identification, 2=multiple evidence same molecule, 3=both
-                    if len(sme_ids) > 1:
+                    if len(some_ids) > 1:
                         # Check if all identifications point to the same compound
-                        unique_cmpds = set(match["cmpd_uid"] for match in sme_matches.iter_rows(named=True)
-                                         if match.get("cmpd_uid") is not None)
+                        unique_cmpds = {
+                            match["cmpd_uid"]
+                            for match in some_matches.iter_rows(named=True)
+                            if match.get("cmpd_uid") is not None
+                        }
                         if len(unique_cmpds) > 1:
-                            sme_ambiguity = "1"  # Ambiguous identification
+                            some_ambiguity = "1"  # Ambiguous identification
                         else:
-                            sme_ambiguity = "2"  # Multiple evidence for same molecule
+                            some_ambiguity = "2"  # Multiple evidence for same molecule
                     else:
-                        sme_ambiguity = "null"
+                        some_ambiguity = "null"
         # Format isotopomer according to mzTab-M specification
         iso_value = row.get("iso_mean", 0)
         if iso_value is not None and round(iso_value) != 0:
-            isotopomer = f"[MS,MS:1002957,\"isotopomer MS peak\",\"+{round(iso_value)}\"]"
+            isotopomer = f'[MS,MS:1002957,"isotopomer MS peak","+{round(iso_value)}"]'
         else:
             isotopomer = "null"
         smf_row = [
             "SMF",
             str(idx),
-            sme_refs,
-            sme_ambiguity,
+            some_refs,
+            some_ambiguity,
             adduct_list[idx - 1],  # adduct_ion
             isotopomer,  # isotopomer formatted according to mzTab-M specification
             safe_str(row.get("mz", "null")),  # exp_mass_to_charge
-            safe_str(row.get("adduct_charge_top", "null")),  # Use top-ranked adduct charge
+            safe_str(
+                row.get("adduct_charge_top", "null"),
+            ),  # Use top-ranked adduct charge
             safe_str(row.get("rt", "null")),  # retention_time_in_seconds
             safe_str(row.get("retention_time_in_seconds_start", "null")),
             safe_str(row.get("retention_time_in_seconds_end", "null")),
         ]
         # Add abundance values for each assay - same as SML (Polars)
         consensus_uid = row["consensus_uid"]
-        filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
+        filtered_matrix = abundance_matrix.filter(
+            pl.col("consensus_uid") == consensus_uid,
+        )
         if filtered_matrix.height > 0:
             # Get the first (and should be only) matching row
             abundance_row = filtered_matrix.row(0, named=True)
             # Extract values excluding the consensus_uid column
-            abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
-            abundance_strings = [safe_str(val) if val is not None else "null" for val in abundance_values]
+            abundance_values = [
+                abundance_row[col]
+                for col in abundance_matrix.columns
+                if col != "consensus_uid"
+            ]
+            abundance_strings = [
+                safe_str(val) if val is not None else "null" for val in abundance_values
+            ]
             smf_row += abundance_strings
             # Calculate study variable statistics (same as in SML section)
             non_null_values = [val for val in abundance_values if val is not None]
             if non_null_values:
                 abundance_study_variable = sum(non_null_values) / len(non_null_values)
                 abundance_variation_study_variable = (
-                    sum((x - abundance_study_variable) ** 2 for x in non_null_values) / len(non_null_values)
-                ) ** 0.5 if len(non_null_values) > 1 else 0
+                    (
+                        sum(
+                            (x - abundance_study_variable) ** 2 for x in non_null_values
+                        )
+                        / len(non_null_values)
+                    )
+                    ** 0.5
+                    if len(non_null_values) > 1
+                    else 0
+                )
             else:
                 abundance_study_variable = "null"
                 abundance_variation_study_variable = "null"
-            smf_row += [safe_str(abundance_study_variable), safe_str(abundance_variation_study_variable)]
+            smf_row += [
+                safe_str(abundance_study_variable),
+                safe_str(abundance_variation_study_variable),
+            ]
         else:
             smf_row += ["null"] * n_assays
             smf_row += ["null", "null"]  # Study variable columns
@@ -807,19 +904,21 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         for line in smf_lines:
             f.write(line + "\n")
-    # --- SME (Small Molecule Evidence) table ---
+    # --- SME (Small Molecule Evidence) table ---
     if full_id_data is not None and not full_id_data.is_empty():
-        sme_lines = []
+        some_lines = []
         # Add comment about spectra_ref being dummy placeholders
-        sme_lines.append("COM\tThe spectra_ref are dummy placeholders, as the annotation was based on aggregated data")
-        sme_header = [
-            "SEH",
-            "SME_ID",
+        some_lines.append(
+            "COM\tThe spectra_ref are dummy placeholders, as the annotation was based on aggregated data",
+        )
+        some_header = [
+            "SEH",
+            "SME_ID",
             "evidence_input_id",
             "database_identifier",
             "chemical_formula",
             "smiles",
-            "inchi",
+            "inchi",
             "chemical_name",
             "uri",
             "derivatized_form",
@@ -833,93 +932,101 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
             "id_confidence_measure[1]",
             "rank",
         ]
-        sme_lines.append("\t".join(sme_header))
-        # Create SME entries for all identification results using enriched data
-        for consensus_uid in self.consensus_df.select("consensus_uid").to_series().unique():
+        some_lines.append("\t".join(some_header))
+        # Create SOME entries for all identification results using enriched data
+        for consensus_uid in (
+            self.consensus_df.select("consensus_uid").to_series().unique()
+        ):
             # Get consensus feature data for this consensus_uid
-            consensus_feature_data = self.consensus_df.filter(pl.col("consensus_uid") == consensus_uid)
+            consensus_feature_data = self.consensus_df.filter(
+                pl.col("consensus_uid") == consensus_uid,
+            )
             if consensus_feature_data.height == 0:
                 continue
             consensus_row = consensus_feature_data.row(0, named=True)
             # Get all identification results for this consensus feature from enriched data
-            sme_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
-            if sme_matches.height > 0:
+            some_matches = full_id_data.filter(pl.col("consensus_uid") == consensus_uid)
+            if some_matches.height > 0:
                 # Sort by score descending to maintain rank order
-                sme_matches = sme_matches.sort("score", descending=True)
-                for i, sme_row in enumerate(sme_matches.iter_rows(named=True)):
-                    # Generate unique SME_ID
-                    sme_id_base = consensus_uid * 1000
-                    sme_id = sme_id_base + i + 1
+                some_matches = some_matches.sort("score", descending=True)
+                for i, some_row in enumerate(some_matches.iter_rows(named=True)):
+                    # Generate unique SOME_ID
+                    some_id_base = consensus_uid * 1000
+                    some_id = some_id_base + i + 1
                     # Create evidence input ID using consensus_uid:mz:rt format
                     consensus_mz = consensus_row.get("mz", 0)
                     consensus_rt = consensus_row.get("rt", 0)
                     evidence_id = f"consensus_uid={consensus_uid}:mz={consensus_mz:.4f}:rt={consensus_rt:.2f}"
                     # Database identifier - use db_id if available, otherwise fallback to cmpd_uid
                     db_id = "null"
-                    if sme_row.get("db_id") is not None and sme_row["db_id"] != "":
-                        db_id = safe_str(sme_row["db_id"])
-                    elif sme_row.get("cmpd_uid") is not None:
-                        db_id = f"cmpd:{sme_row['cmpd_uid']}"
-                    # Get adduct information
+                    if some_row.get("db_id") is not None and some_row["db_id"] != "":
+                        db_id = safe_str(some_row["db_id"])
+                    elif some_row.get("cmpd_uid") is not None:
+                        db_id = f"cmpd:{some_row['cmpd_uid']}"
+                    # Get adduct information
                     adduct_ion = "null"
-                    if sme_row.get("adduct") is not None and sme_row["adduct"] != "":
-                        adduct_ion = safe_str(sme_row["adduct"])
+                    if some_row.get("adduct") is not None and some_row["adduct"] != "":
+                        adduct_ion = safe_str(some_row["adduct"])
                         # Replace ? with H for better mzTab compatibility
                         adduct_ion = adduct_ion.replace("?", "H")
                     # Spectra reference - reference to first ms_run with spectrum index 0
                     spectra_ref = "ms_run[1]:spectrum=0"
                     # Identification method
                     id_method = "[MS, MS:1002888, small molecule confidence measure, ]"
-                    if sme_row.get("matcher") is not None:
-                        id_method = f"[MS, MS:1002888, {sme_row['matcher']}, ]"
+                    if some_row.get("matcher") is not None:
+                        id_method = f"[MS, MS:1002888, {some_row['matcher']}, ]"
                     # MS level - assume MS1 for now
                     ms_level = "[MS, MS:1000511, ms level, 1]"
                     # Experimental mass-to-charge from consensus feature
                     exp_mz = safe_str(consensus_mz)
-                    # Theoretical mass-to-charge from lib_df
+                    # Theoretical mass-to-charge from lib_df
                     theoretical_mz = "null"
-                    if sme_row.get("mz") is not None:  # This comes from lib_df via get_id() join
-                        theoretical_mz = safe_str(sme_row["mz"])
-                    sme_line = [
-                        "SME",
-                        str(sme_id),
+                    if (
+                        some_row.get("mz") is not None
+                    ):  # This comes from lib_df via get_id() join
+                        theoretical_mz = safe_str(some_row["mz"])
+                    some_line = [
+                        "SOME",
+                        str(some_id),
                         evidence_id,
                         db_id,
-                        safe_str(sme_row.get("formula", "null")),
-                        safe_str(sme_row.get("smiles", "null")),
-                        safe_str(sme_row.get("inchi", "null")),
-                        safe_str(sme_row.get("name", "null")),
+                        safe_str(some_row.get("formula", "null")),
+                        safe_str(some_row.get("smiles", "null")),
+                        safe_str(some_row.get("inchi", "null")),
+                        safe_str(some_row.get("name", "null")),
                         "null",  # uri - not available in current data
                         "null",  # derivatized_form
                         adduct_ion,
                         exp_mz,  # experimental m/z from consensus feature
-                        safe_str(consensus_row.get("adduct_charge_top", "1")),  # Use consensus feature's top adduct charge
+                        safe_str(
+                            consensus_row.get("adduct_charge_top", "1"),
+                        ),  # Use consensus feature's top adduct charge
                         theoretical_mz,  # theoretical m/z from lib_df
                         spectra_ref,
                         id_method,
                         ms_level,
-                        safe_str(sme_row.get("score", "null")),
+                        safe_str(some_row.get("score", "null")),
                         str(i + 1),  # rank within this consensus feature
                     ]
-                    sme_lines.append("\t".join(sme_line))
-        # Write SME table
+                    some_lines.append("\t".join(some_line))
+        # Write SOME table
         with open(filename, "a", encoding="utf-8") as f:
             f.write("\n")
-            for line in sme_lines:
+            for line in some_lines:
                 f.write(line + "\n")
     # --- MGF table ---
@@ -953,15 +1060,23 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
             spec_len = row["spec_len"] if row["spec_len"] is not None else 0
             # Format spectrum data as pipe-separated strings
-            spec_mz_str = "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
-            spec_int_str = "|".join([f"{int(inty)}" for inty in spectrum_inty]) if spectrum_inty else ""
+            spec_mz_str = (
+                "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
+            )
+            spec_int_str = (
+                "|".join([f"{int(inty)}" for inty in spectrum_inty])
+                if spectrum_inty
+                else ""
+            )
             mgf_row = [
                 "COM",
                 "MGF",
                 str(row["mgf_index"]) if row["mgf_index"] is not None else "null",
                 str(row["feature_id"]) if row["feature_id"] is not None else "null",
-                f"{row['rtinseconds']:.2f}" if row["rtinseconds"] is not None else "null",
+                f"{row['rtinseconds']:.2f}"
+                if row["rtinseconds"] is not None
+                else "null",
                 f"{row['pepmass']:.4f}" if row["pepmass"] is not None else "null",
                 "null",  # prec_int - not available in current data
                 str(row["energy"]) if row["energy"] is not None else "null",
@@ -986,94 +1101,110 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
 def export_xlsx(self, filename: str = None) -> None:
     """
     Export the study data to an Excel workbook with multiple worksheets.
     The Excel file contains three worksheets:
     - consensus_df: Consensus features dataframe
-    - matrix: Consensus matrix with samples as columns (get_consensus_matrix)
+    - matrix: Consensus matrix with samples as columns (get_consensus_matrix)
     - identification: Identification results with library annotations (get_id)
     Args:
-        filename (str, optional): Path to the output Excel file. Defaults to "study.xlsx"
+        filename (str, optional): Path to the output Excel file. Defaults to "study.xlsx"
                                 in the study folder.
     """
     try:
         import openpyxl
     except ImportError:
-        self.logger.error("openpyxl package is required for Excel export. Install with: pip install openpyxl")
+        self.logger.error(
+            "openpyxl package is required for Excel export. Install with: pip install openpyxl",
+        )
         return
     # Set default filename
     if filename is None:
         filename = "study.xlsx"
     # Make filename absolute if not already
     if not os.path.isabs(filename):
         if self.folder is not None:
             filename = os.path.join(self.folder, filename)
         else:
             filename = os.path.join(os.getcwd(), filename)
-    self.logger.debug(f"Exporting study to Excel...")
+    self.logger.debug("Exporting study to Excel...")
     # Prepare data for export in the desired order
     from collections import OrderedDict
     worksheets = OrderedDict()
     # 1. Samples dataframe (first worksheet)
     if self.samples_df is not None and not self.samples_df.is_empty():
         samples_pandas = self.samples_df.to_pandas()
-        worksheets['samples'] = samples_pandas
+        worksheets["samples"] = samples_pandas
         self.logger.debug(f"Added samples worksheet with {len(samples_pandas)} rows")
     else:
         self.logger.warning("samples_df is empty or None, skipping worksheet")
     # 2. Consensus dataframe (renamed to 'consensus')
     if self.consensus_df is not None and not self.consensus_df.is_empty():
         consensus_pandas = self.consensus_df.to_pandas()
-        worksheets['consensus'] = consensus_pandas
-        self.logger.debug(f"Added consensus worksheet with {len(consensus_pandas)} rows")
+        worksheets["consensus"] = consensus_pandas
+        self.logger.debug(
+            f"Added consensus worksheet with {len(consensus_pandas)} rows",
+        )
     else:
         self.logger.warning("consensus_df is empty or None, skipping worksheet")
     # 3. Identification results
     try:
-        from masster.study.id import get_id
+        from master.study.id import get_id
         id_df = get_id(self)
         if id_df is not None and not id_df.is_empty():
             id_pandas = id_df.to_pandas()
-            worksheets['identification'] = id_pandas
-            self.logger.debug(f"Added identification worksheet with {len(id_pandas)} rows")
+            worksheets["identification"] = id_pandas
+            self.logger.debug(
+                f"Added identification worksheet with {len(id_pandas)} rows",
+            )
         else:
-            self.logger.warning("get_id() returned empty data, skipping identification worksheet")
+            self.logger.warning(
+                "get_id() returned empty data, skipping identification worksheet",
+            )
     except Exception as e:
-        self.logger.warning(f"Error getting identification data: {e}. Skipping identification worksheet.")
+        self.logger.warning(
+            f"Error getting identification data: {e}. Skipping identification worksheet.",
+        )
     # 4. Consensus matrix (last worksheet)
     try:
         matrix_df = self.get_consensus_matrix()
         if matrix_df is not None and not matrix_df.is_empty():
             matrix_pandas = matrix_df.to_pandas()
-            worksheets['matrix'] = matrix_pandas
+            worksheets["matrix"] = matrix_pandas
             self.logger.debug(f"Added matrix worksheet with {len(matrix_pandas)} rows")
         else:
-            self.logger.warning("get_consensus_matrix() returned empty data, skipping matrix worksheet")
+            self.logger.warning(
+                "get_consensus_matrix() returned empty data, skipping matrix worksheet",
+            )
     except Exception as e:
         self.logger.error(f"Error getting consensus matrix: {e}")
     # Check if we have any data to export
     if not worksheets:
         self.logger.error("No data available to export to Excel")
         return
     # Write to Excel file
     try:
-        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
+        with pd.ExcelWriter(filename, engine="openpyxl") as writer:
             for sheet_name, data in worksheets.items():
                 data.to_excel(writer, sheet_name=sheet_name, index=False)
-                self.logger.debug(f"Written worksheet '{sheet_name}' with shape {data.shape}")
+                self.logger.debug(
+                    f"Written worksheet '{sheet_name}' with shape {data.shape}",
+                )
         self.logger.info(f"Study exported to {filename}")
     except Exception as e:
         self.logger.error(f"Error writing Excel file: {e}")
@@ -1081,13 +1212,13 @@ def export_xlsx(self, filename: str = None) -> None:
 def export_parquet(self, basename: str = None) -> None:
     """
     Export the study data to multiple Parquet files with different suffixes.
     The export creates separate Parquet files for each dataset:
     - <basename>_samples.parquet: Samples dataframe
     - <basename>_consensus.parquet: Consensus features dataframe
     - <basename>_identification.parquet: Identification results with library annotations
     - <basename>_matrix.parquet: Consensus matrix with samples as columns
     Args:
         basename (str, optional): Base name for the output files. Defaults to "study"
                                  in the study folder.
@@ -1095,59 +1226,74 @@ def export_parquet(self, basename: str = None) -> None:
     # Set default basename
     if basename is None:
         basename = "study"
     # Make basename absolute path if not already (without extension)
     if not os.path.isabs(basename):
         if self.folder is not None:
             basename = os.path.join(self.folder, basename)
         else:
             basename = os.path.join(os.getcwd(), basename)
     self.logger.debug(f"Exporting study to Parquet files with basename: {basename}")
     exported_files = []
     # 1. Samples dataframe
     if self.samples_df is not None and not self.samples_df.is_empty():
         samples_file = f"{basename}_samples.parquet"
         try:
             self.samples_df.write_parquet(samples_file)
             exported_files.append(samples_file)
-            self.logger.debug(f"Exported samples to {samples_file} ({self.samples_df.height} rows)")
+            self.logger.debug(
+                f"Exported samples to {samples_file} ({self.samples_df.height} rows)",
+            )
         except Exception as e:
             self.logger.error(f"Error writing samples parquet file: {e}")
     else:
-        self.logger.warning("samples_df is empty or None, skipping samples parquet file")
+        self.logger.warning(
+            "samples_df is empty or None, skipping samples parquet file",
+        )
     # 2. Consensus dataframe
     if self.consensus_df is not None and not self.consensus_df.is_empty():
         consensus_file = f"{basename}_consensus.parquet"
         try:
             self.consensus_df.write_parquet(consensus_file)
             exported_files.append(consensus_file)
-            self.logger.debug(f"Exported consensus to {consensus_file} ({self.consensus_df.height} rows)")
+            self.logger.debug(
+                f"Exported consensus to {consensus_file} ({self.consensus_df.height} rows)",
+            )
         except Exception as e:
             self.logger.error(f"Error writing consensus parquet file: {e}")
     else:
-        self.logger.warning("consensus_df is empty or None, skipping consensus parquet file")
+        self.logger.warning(
+            "consensus_df is empty or None, skipping consensus parquet file",
+        )
     # 3. Identification results
     try:
-        from masster.study.id import get_id
+        from master.study.id import get_id
         id_df = get_id(self)
         if id_df is not None and not id_df.is_empty():
             identification_file = f"{basename}_identification.parquet"
             try:
                 id_df.write_parquet(identification_file)
                 exported_files.append(identification_file)
-                self.logger.debug(f"Exported identification to {identification_file} ({id_df.height} rows)")
+                self.logger.debug(
+                    f"Exported identification to {identification_file} ({id_df.height} rows)",
+                )
             except Exception as e:
                 self.logger.error(f"Error writing identification parquet file: {e}")
         else:
-            self.logger.warning("get_id() returned empty data, skipping identification parquet file")
+            self.logger.warning(
+                "get_id() returned empty data, skipping identification parquet file",
+            )
     except Exception as e:
-        self.logger.warning(f"Error getting identification data: {e}. Skipping identification parquet file.")
+        self.logger.warning(
+            f"Error getting identification data: {e}. Skipping identification parquet file.",
+        )
     # 4. Consensus matrix
     try:
         matrix_df = self.get_consensus_matrix()
@@ -1156,14 +1302,18 @@ def export_parquet(self, basename: str = None) -> None:
             try:
                 matrix_df.write_parquet(matrix_file)
                 exported_files.append(matrix_file)
-                self.logger.debug(f"Exported matrix to {matrix_file} ({matrix_df.height} rows)")
+                self.logger.debug(
+                    f"Exported matrix to {matrix_file} ({matrix_df.height} rows)",
+                )
             except Exception as e:
                 self.logger.error(f"Error writing matrix parquet file: {e}")
         else:
-            self.logger.warning("get_consensus_matrix() returned empty data, skipping matrix parquet file")
+            self.logger.warning(
+                "get_consensus_matrix() returned empty data, skipping matrix parquet file",
+            )
     except Exception as e:
         self.logger.error(f"Error getting consensus matrix: {e}")
     # Report results
     if exported_files:
         self.logger.info(f"Study exported to {len(exported_files)} Parquet files:")

masster 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

Potentially problematic release.

masster 0.4.3py3-none-any.whl → 0.4.5py3-none-any.whl