PyPI - masster - Versions diffs - 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl - Mend

masster 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (34) hide show

masster/docs/SCX_API_Documentation.md +0 -0
masster/docs/SCX_DLL_Analysis.md +0 -0
masster/logger.py +92 -78
masster/sample/defaults/find_features_def.py +90 -94
masster/sample/defaults/sample_def.py +15 -0
masster/sample/h5.py +2 -2
masster/sample/helpers.py +137 -136
masster/sample/lib.py +11 -11
masster/sample/load.py +13 -9
masster/sample/plot.py +167 -60
masster/sample/processing.py +150 -153
masster/sample/sample.py +4 -4
masster/sample/sample5_schema.json +62 -62
masster/sample/save.py +16 -13
masster/sample/sciex.py +187 -176
masster/study/defaults/align_def.py +224 -6
masster/study/defaults/fill_chrom_def.py +1 -5
masster/study/defaults/integrate_chrom_def.py +1 -5
masster/study/defaults/study_def.py +2 -2
masster/study/export.py +144 -131
masster/study/h5.py +193 -133
masster/study/helpers.py +293 -245
masster/study/helpers_optimized.py +99 -57
masster/study/load.py +51 -25
masster/study/plot.py +453 -17
masster/study/processing.py +197 -123
masster/study/save.py +7 -7
masster/study/study.py +97 -88
masster/study/study5_schema.json +82 -82
{masster-0.3.9.dist-info → masster-0.3.11.dist-info}/METADATA +1 -1
{masster-0.3.9.dist-info → masster-0.3.11.dist-info}/RECORD +34 -32
{masster-0.3.9.dist-info → masster-0.3.11.dist-info}/WHEEL +0 -0
{masster-0.3.9.dist-info → masster-0.3.11.dist-info}/entry_points.txt +0 -0
{masster-0.3.9.dist-info → masster-0.3.11.dist-info}/licenses/LICENSE +0 -0

masster/study/export.py CHANGED Viewed

@@ -18,13 +18,13 @@ from masster._version import get_version
 def _get_mgf_df(self, **kwargs):
     """
     Generate MGF data as a Polars DataFrame.
     This is the core data generation function used by export_mgf().
     Parameters:
-        **kwargs: Keyword arguments for export parameters. Same as export_mgf()
+        **kwargs: Keyword arguments for export parameters. Same as export_mgf()
                  except return_data is not relevant here.
     Returns:
         pl.DataFrame: DataFrame with columns:
             - mgf_index: MGF index
@@ -115,37 +115,37 @@ def _get_mgf_df(self, **kwargs):
         """Create a dictionary representing an ion for the DataFrame."""
         if spect is None:
             return None
         # Prepare spectrum data
-        spectrum_mz = spect.mz.tolist() if hasattr(spect.mz, 'tolist') else list(spect.mz)
-        spectrum_inty = spect.inty.tolist() if hasattr(spect.inty, 'tolist') else list(spect.inty)
+        spectrum_mz = spect.mz.tolist() if hasattr(spect.mz, "tolist") else list(spect.mz)
+        spectrum_inty = spect.inty.tolist() if hasattr(spect.inty, "tolist") else list(spect.inty)
         # Determine MS level
         ms_level = spect.ms_level if spect.ms_level is not None else 1
         # Get energy if available
-        energy = getattr(spect, 'energy', None)
+        energy = getattr(spect, "energy", None)
         # Determine spectrum type based on MS level
         spec_type = f"MS{ms_level}" if ms_level > 1 else "MS1"
         # Calculate spectrum length
         spec_len = len(spectrum_mz)
         return {
-            'mgf_index': mgf_id,
-            'title': title,
-            'feature_id': id,
-            'feature_uid': uid,
-            'charge': charge,
-            'pepmass': mz,
-            'rtinseconds': rt,
-            'mslevel': ms_level,
-            'type': spec_type,
-            'energy': energy,
-            'spec_len': spec_len,
-            'spec_mz': spectrum_mz,
-            'spec_int': spectrum_inty,
+            "mgf_index": mgf_id,
+            "title": title,
+            "feature_id": id,
+            "feature_uid": uid,
+            "charge": charge,
+            "pepmass": mz,
+            "rtinseconds": rt,
+            "mslevel": ms_level,
+            "type": spec_type,
+            "energy": energy,
+            "spec_len": spec_len,
+            "spec_mz": spectrum_mz,
+            "spec_int": spectrum_inty,
         }
     # Collect all ion data
@@ -153,7 +153,7 @@ def _get_mgf_df(self, **kwargs):
     skip = 0
     mgf_counter = 0
     self.logger.info(f"Generating MGF data for {len(grouped)} consensus features...")
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
     for _consensus_uid, cons_ms2 in tqdm(
         grouped,
@@ -308,7 +308,7 @@ def _get_mgf_df(self, **kwargs):
     # Convert to Polars DataFrame
     if not ion_data:
         return pl.DataFrame()
     return pl.DataFrame(ion_data)
@@ -336,13 +336,13 @@ def export_mgf(self, **kwargs):
         verbose (bool): Enable verbose logging (default: False).
         precursor_trim (float): Precursor trimming value (default: -10).
         centroid_algo (str): Centroiding algorithm (default: "lmp").
     Returns:
         None: Writes MGF file to disk.
     """
     # Get mgf data as DataFrame
     mgf_data = self._get_mgf_df(**kwargs)
     if mgf_data is None or len(mgf_data) == 0:
         self.logger.warning("No MGF data generated.")
         return
@@ -355,9 +355,9 @@ def export_mgf(self, **kwargs):
         else:
             if hasattr(params, key):
                 params.set(key, value, validate=True)
     filename = params.get("filename")
     # Prepare output path
     if not os.path.isabs(filename):
         if self.folder is not None:
@@ -370,7 +370,7 @@ def export_mgf(self, **kwargs):
         for row in mgf_data.iter_rows(named=True):
             # Write BEGIN IONS
             f.write("BEGIN IONS\n")
             # Write metadata
             if row["mgf_index"] is not None:
                 f.write(f"INDEX={row['mgf_index']}\n")
@@ -381,19 +381,19 @@ def export_mgf(self, **kwargs):
             f.write(f"PEPMASS={row['pepmass']}\n")
             f.write(f"RTINSECONDS={row['rtinseconds']}\n")
             f.write(f"MSLEVEL={row['mslevel']}\n")
             if row["energy"] is not None:
                 f.write(f"ENERGY={row['energy']}\n")
             # Write spectrum data
             spectrum_mz = row["spec_mz"]
             spectrum_inty = row["spec_int"]
             for mz_val, inty in zip(spectrum_mz, spectrum_inty, strict=False):
                 f.write(f"{mz_val:.5f} {inty:.0f}\n")
             # Write END IONS
             f.write("END IONS\n\n")
     self.logger.info(f"Exported {len(mgf_data)} spectra to {filename}")
@@ -414,45 +414,47 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
             filename = os.path.join(self.folder, filename)
         else:
             filename = os.path.join(os.getcwd(), filename)
     # get mgf data
     mgf_data = self._get_mgf_df(**kwargs)
     # Create mapping from feature_uid to MGF indexes
     mgf_mapping: dict[str, list[int]] = {}
     if mgf_data is not None and len(mgf_data) > 0:
         for row in mgf_data.iter_rows(named=True):
-            feature_uid = row['feature_uid']
-            mgf_index = row['mgf_index']
+            feature_uid = row["feature_uid"]
+            mgf_index = row["mgf_index"]
             if feature_uid not in mgf_mapping:
                 mgf_mapping[feature_uid] = []
             mgf_mapping[feature_uid].append(mgf_index)
     # --- Prepare MTD (metadata) section ---
     mtd_lines = []
     mtd_lines.append(f"COM file generated by MASSter on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
     mtd_lines.append("\nMTD\tmzTab-version\t2.2.0-M")
     id = self.label if self.label else self.folder
     mtd_lines.append(f"MTD\tmzTab-id\t{id}")
-    mtd_lines.append('')
+    mtd_lines.append("")
     mtd_lines.append("MTD\tcv[1]-label\tMS")
     mtd_lines.append("MTD\tcv[1]-full_name\tPSI-MS controlled vocabulary")
     mtd_lines.append("MTD\tcv[1]-version\t4.1.199")
     mtd_lines.append("MTD\tcv[1]-uri\thttps://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo")
-    mtd_lines.append('')
+    mtd_lines.append("")
     mtd_lines.append("MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
     mtd_lines.append("MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
-    mtd_lines.append("MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]")
+    mtd_lines.append(
+        "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]"
+    )
     mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
-    mtd_lines.append('')
+    mtd_lines.append("")
     mtd_lines.append("MTD\tsoftware[1]\t[MS, MS:1003430, OpenMS, unknown]")
     mtd_lines.append(f"MTD\tsoftware[2]\t[MS, MS:1002878, MASSter, {get_version()}]")
     mtd_lines.append("MTD\tquantification_method\t[MS, MS:1001834, LC-MS label-free quantitation analysis, ]")
-    mtd_lines.append('')
-    mtd_lines.append("MTD\tdatabase[1]\t[, , \"no database\", null]")
+    mtd_lines.append("")
+    mtd_lines.append('MTD\tdatabase[1]\t[, , "no database", null]')
     mtd_lines.append("MTD\tdatabase[1]-prefix\tnull")
     mtd_lines.append("MTD\tdatabase[1]-version\tUnknown")
     mtd_lines.append("MTD\tdatabase[1]-uri\tnull")
-    #mtd_lines.append('')
+    # mtd_lines.append('')
     for i, row in enumerate(self.samples_df.iter_rows(named=True), 1):
         mtd_lines.append(f"\nMTD\tsample[{i}]\t{row.get('sample_uid', f'sample_{i}')}")
         mtd_lines.append(f"MTD\tsample[{i}]-description\t{row.get('sample_name', 'unknown')}")
@@ -460,15 +462,15 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         mtd_lines.append(f"MTD\tassay[{i}]\tAssay_{i}")
         mtd_lines.append(f"MTD\tassay[{i}]-sample_ref\tsample[{i}]")
         mtd_lines.append(f"MTD\tassay[{i}]-ms_run_ref\tms_run[{i}]")
-    mtd_lines.append('')
+    mtd_lines.append("")
     mtd_lines.append("MTD\tstudy_variable[1]\tundefined")
     mtd_lines.append("MTD\tstudy_variable[1]_refs\tundefined")
-    #assay_refs = '|'.join([f"assay[{i}]" for i in range(1, len(self.samples_df)+1)])
-    #mtd_lines.append(f"MTD\tstudy_variable[1]-assay_refs\t{assay_refs}")
-    #mtd_lines.append("MTD\tstudy_variable[1]-description\tAll assays grouped (default)")
-    with open(filename, 'w', encoding='utf-8') as f:
+    # assay_refs = '|'.join([f"assay[{i}]" for i in range(1, len(self.samples_df)+1)])
+    # mtd_lines.append(f"MTD\tstudy_variable[1]-assay_refs\t{assay_refs}")
+    # mtd_lines.append("MTD\tstudy_variable[1]-description\tAll assays grouped (default)")
+    with open(filename, "w", encoding="utf-8") as f:
         for line in mtd_lines:
-            f.write(line + '\n')
+            f.write(line + "\n")
     # --- SML (Small Molecule) table ---
     sml_lines = []
@@ -487,43 +489,54 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         "reliability",
         "best_id_confidence_measure",
         "best_id_confidence_value",
-        "opt_global_mgf_index",
+        "opt_global_mgf_index",
     ]
     abundance_matrix = self.get_consensus_matrix()
     # Use the matrix as-is since it already has the correct sample columns
     # The matrix columns are sample names, which is what we want for the assay columns
     # round to int
     abundance_matrix = abundance_matrix.round(0)
     # Use actual number of samples from the abundance matrix
     n_assays = len(abundance_matrix.columns)
-    sml_header += [f"abundance_assay[{i}]" for i in range(1, n_assays+1)]
+    sml_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
     sml_header += ["abundance_study_variable[1]", "abundance_variation_study_variable[1]"]
-    sml_lines.append('\t'.join(sml_header))
+    sml_lines.append("\t".join(sml_header))
     # get adducts from consensus_df['adducts']. If value is None or [], use 'null'. If there is, take the first element and the first string
     adduct_list = []
-    mapping = {'H1': '[M+H]+', 'H2': '[M+2H]2+',
-               'Na1': '[M+Na]+', 'Na2': '[M+2Na]2+',
-               'NH4': '[M+NH4]+', 'HCOO': '[M+HCOO]-',
-               'CH3COO': '[M+CH3COO]-', 'H2O': '[M+H2O]+',
-               'HCO2': '[M+HCO2]-', 'H3PO4': '[M+H3PO4]+',
-               'H3O1': '[M+H3O]+', 'K1': '[M+K]+',
-               'H4N1': '[M+NH4]+',
-               'H-1': '[M-H]-', 'Cl1': '[M+Cl]-',
-               'Br1': '[M+Br]-', 'I1': '[M+I]-',
-               'H2O2': '[M+H2O2]+', 'H3O2': '[M+H3O2]+',}
+    mapping = {
+        "H1": "[M+H]+",
+        "H2": "[M+2H]2+",
+        "Na1": "[M+Na]+",
+        "Na2": "[M+2Na]2+",
+        "NH4": "[M+NH4]+",
+        "HCOO": "[M+HCOO]-",
+        "CH3COO": "[M+CH3COO]-",
+        "H2O": "[M+H2O]+",
+        "HCO2": "[M+HCO2]-",
+        "H3PO4": "[M+H3PO4]+",
+        "H3O1": "[M+H3O]+",
+        "K1": "[M+K]+",
+        "H4N1": "[M+NH4]+",
+        "H-1": "[M-H]-",
+        "Cl1": "[M+Cl]-",
+        "Br1": "[M+Br]-",
+        "I1": "[M+I]-",
+        "H2O2": "[M+H2O2]+",
+        "H3O2": "[M+H3O2]+",
+    }
     for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
-        adduct = 'null'
-        if 'adducts' in row:
-            row_adducts = row['adducts']
+        adduct = "null"
+        if "adducts" in row:
+            row_adducts = row["adducts"]
             if isinstance(row_adducts, list) and row_adducts:
                 # Each adduct is a dictionary with 'adduct' key
                 first_adduct_dict = row_adducts[0]
-                if isinstance(first_adduct_dict, dict) and 'adduct' in first_adduct_dict:
-                    adduct_str = first_adduct_dict['adduct']
+                if isinstance(first_adduct_dict, dict) and "adduct" in first_adduct_dict:
+                    adduct_str = first_adduct_dict["adduct"]
                     if adduct_str in mapping:
                         adduct = mapping[adduct_str]
                     else:
@@ -533,46 +546,46 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
         # Get MGF indexes for this consensus feature
-        mgf_indexes = mgf_mapping.get(row['consensus_uid'], [])
+        mgf_indexes = mgf_mapping.get(row["consensus_uid"], [])
         sml_row = [
             "SML",
             str(idx),
             str(idx),
-            str(row.get('database_identifier', 'null')),
-            str(row.get('chemical_formula', 'null')),
-            str(row.get('smiles', 'null')),
-            str(row.get('inchi', 'null')),
-            str(row.get('chemical_name', 'null')),
-            str(row.get('uri', 'null')),
-            str(row.get('theoretical_neutral_mass', 'null')),
-            adduct_list[idx-1],
-            str(row.get('reliability', 'null')),
-            str(row.get('best_id_confidence_measure', 'null')),
-            str(row.get('best_id_confidence_value', 'null')),
-            ','.join(map(str, mgf_indexes)) if mgf_indexes else 'null',
+            str(row.get("database_identifier", "null")),
+            str(row.get("chemical_formula", "null")),
+            str(row.get("smiles", "null")),
+            str(row.get("inchi", "null")),
+            str(row.get("chemical_name", "null")),
+            str(row.get("uri", "null")),
+            str(row.get("theoretical_neutral_mass", "null")),
+            adduct_list[idx - 1],
+            str(row.get("reliability", "null")),
+            str(row.get("best_id_confidence_measure", "null")),
+            str(row.get("best_id_confidence_value", "null")),
+            ",".join(map(str, mgf_indexes)) if mgf_indexes else "null",
         ]
         # Add abundance values for each assay
-        consensus_uid = row['consensus_uid']
+        consensus_uid = row["consensus_uid"]
         if consensus_uid in abundance_matrix.index:
             abundance_values = abundance_matrix.loc[consensus_uid].tolist()
-            sml_row += [str(val) if pd.notna(val) else 'null' for val in abundance_values]
+            sml_row += [str(val) if pd.notna(val) else "null" for val in abundance_values]
         else:
-            sml_row += ['null'] * n_assays
-        sml_row += ['null', 'null']
-        sml_lines.append('\t'.join(sml_row))
-    with open(filename, 'a', encoding='utf-8') as f:
-        f.write('\n')
+            sml_row += ["null"] * n_assays
+        sml_row += ["null", "null"]
+        sml_lines.append("\t".join(sml_row))
+    with open(filename, "a", encoding="utf-8") as f:
+        f.write("\n")
         for line in sml_lines:
-            f.write(line + '\n')
+            f.write(line + "\n")
     # --- SMF (Small Molecule Feature) table ---
     smf_lines = []
     smf_header = [
         "SFH",
         "SMF_ID",
-        "SME_ID_REFS",
-        "SME_ID_REF_ambiguity_code",
+        "SOME_ID_REFS",
+        "SOME_ID_REF_ambiguity_code",
         "adduct_ion",
         "isotopomer",
         "exp_mass_to_charge",
@@ -581,9 +594,9 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         "retention_time_in_seconds_start",
         "retention_time_in_seconds_end",
     ]
-    smf_header += [f"abundance_assay[{i}]" for i in range(1, n_assays+1)]
-    smf_lines.append('\t'.join(smf_header))
+    smf_header += [f"abundance_assay[{i}]" for i in range(1, n_assays + 1)]
+    smf_lines.append("\t".join(smf_header))
     # SMF table uses the same consensus features as SML, just different metadata
     for idx, row in enumerate(self.consensus_df.iter_rows(named=True), 1):
         smf_row = [
@@ -591,26 +604,26 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
             str(idx),
             "null",
             "null",
-            adduct_list[idx-1],  # adduct_ion
-            str(row.get('isotopomer', 'null')),
-            str(row.get('mz', 'null')),  # exp_mass_to_charge
-            str(row.get('charge', 'null')),
-            str(row.get('rt', 'null')),  # retention_time_in_seconds
-            str(row.get('retention_time_in_seconds_start', 'null')),
-            str(row.get('retention_time_in_seconds_end', 'null')),
+            adduct_list[idx - 1],  # adduct_ion
+            str(row.get("isotopomer", "null")),
+            str(row.get("mz", "null")),  # exp_mass_to_charge
+            str(row.get("charge", "null")),
+            str(row.get("rt", "null")),  # retention_time_in_seconds
+            str(row.get("retention_time_in_seconds_start", "null")),
+            str(row.get("retention_time_in_seconds_end", "null")),
         ]
         # Add abundance values for each assay - same as SML
-        consensus_uid = row['consensus_uid']
+        consensus_uid = row["consensus_uid"]
         if consensus_uid in abundance_matrix.index:
             abundance_values = abundance_matrix.loc[consensus_uid].tolist()
-            smf_row += [str(val) if pd.notna(val) else 'null' for val in abundance_values]
+            smf_row += [str(val) if pd.notna(val) else "null" for val in abundance_values]
         else:
-            smf_row += ['null'] * n_assays
-        smf_lines.append('\t'.join(smf_row))
-    with open(filename, 'a', encoding='utf-8') as f:
-        f.write('\n')
+            smf_row += ["null"] * n_assays
+        smf_lines.append("\t".join(smf_row))
+    with open(filename, "a", encoding="utf-8") as f:
+        f.write("\n")
         for line in smf_lines:
-            f.write(line + '\n')
+            f.write(line + "\n")
     # --- MGF table ---
     if include_mgf and mgf_data is not None and len(mgf_data) > 0:
@@ -618,9 +631,9 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         # Header
         mgf_header = [
             "COM",
-            "MGH",
+            "MGH",
             "mgf_id",
-            "prec_id",
+            "prec_id",
             "prec_rt",
             "prec_mz",
             "prec_int",
@@ -630,10 +643,10 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
             "spec_tic",
             "spec_len",
             "spec_mz",
-            "spec_int"
+            "spec_int",
         ]
-        mgf_lines.append('\t'.join(mgf_header))
+        mgf_lines.append("\t".join(mgf_header))
         # Data rows
         for row in mgf_data.iter_rows(named=True):
             # Calculate spectrum TIC (total ion current) from the spectrum data
@@ -641,11 +654,11 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
             spectrum_inty = row["spec_int"]
             spec_tic = sum(spectrum_inty) if spectrum_inty else 0
             spec_len = row["spec_len"] if row["spec_len"] is not None else 0
             # Format spectrum data as pipe-separated strings
-            spec_mz_str = '|'.join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
-            spec_int_str = '|'.join([f"{int(inty)}" for inty in spectrum_inty]) if spectrum_inty else ""
+            spec_mz_str = "|".join([f"{mz:.4f}" for mz in spectrum_mz]) if spectrum_mz else ""
+            spec_int_str = "|".join([f"{int(inty)}" for inty in spectrum_inty]) if spectrum_inty else ""
             mgf_row = [
                 "COM",
                 "MGF",
@@ -660,15 +673,15 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
                 f"{int(spec_tic)}" if spec_tic > 0 else "null",
                 str(spec_len) if spec_len > 0 else "null",
                 spec_mz_str if spec_mz_str else "null",
-                spec_int_str if spec_int_str else "null"
+                spec_int_str if spec_int_str else "null",
             ]
-            mgf_lines.append('\t'.join(mgf_row))
+            mgf_lines.append("\t".join(mgf_row))
         # Write MGF table
-        with open(filename, 'a', encoding='utf-8') as f:
-            f.write('\n')
+        with open(filename, "a", encoding="utf-8") as f:
+            f.write("\n")
             for line in mgf_lines:
-                f.write(line + '\n')
+                f.write(line + "\n")
-    if include_mgf:
+    if include_mgf:
         self.logger.info(f"Exported mzTab-M to {filename}")

masster 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

Potentially problematic release.

masster 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl