PyPI - masster - Versions diffs - 0.2.3__tar.gz → 0.2.4__tar.gz - Mend

masster 0.2.3.tar.gz → 0.2.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (71) hide show

{masster-0.2.3 → masster-0.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.2.3
+Version: 0.2.4
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster

{masster-0.2.3 → masster-0.2.4}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [project]
 name = "masster"
-version = "0.2.3"
+version = "0.2.4"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }

{masster-0.2.3 → masster-0.2.4}/src/masster/_version.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
-__version__ = "0.2.3"
+__version__ = "0.2.4"
 def get_version():

{masster-0.2.3 → masster-0.2.4}/src/masster/sample/save.py RENAMED Viewed

@@ -71,6 +71,23 @@ from masster.spectrum import combine_peaks
 def save(self, filename=None):
+    """
+    Save the current object to a file in the '.sample5' format.
+    If `filename` is not provided, the method attempts to use `self.file_path` as the base name,
+    replacing its extension with '.sample5'. If neither `filename` nor `self.file_path` is available,
+    a ValueError is raised.
+    If `filename` is provided and `self.file_path` is an absolute path, the extension of `filename`
+    is replaced with '.sample5'. Otherwise, if `self.file_path` is available, its extension is replaced
+    with '.sample5'. If neither is available, a ValueError is raised.
+    Parameters:
+        filename (str, optional): The name of the file to save to. If not provided, uses `self.file_path`.
+    Returns:
+        None
+    """
     if filename is None:
         # save to default file name
         if self.file_path is not None:
@@ -98,20 +115,43 @@ def _save_featureXML(self, filename="features.featureXML"):
 def export_features(self, filename="features.csv"):
-    # COMMENT: cannot export lists to CSV. Could be exported to Parquet
-    # COMMENT: removing problematic columns for now
+    """
+    Export the features DataFrame to a CSV or Excel file.
+    This method clones the internal features DataFrame, adds a boolean column 'has_ms2' indicating
+    whether the 'ms2_scans' column is not null, and exports the resulting DataFrame to the specified file.
+    Columns with data types 'List' or 'Object' are excluded from the export.
+    Parameters:
+        filename (str): The path to the output file. If the filename ends with '.xls' or '.xlsx',
+                        the data is exported in Excel format; otherwise, it is exported as CSV.
+                        Defaults to 'features.csv'.
+    Side Effects:
+        Writes the exported data to the specified file and logs the export operation.
+    """
+    # clone df
+    clean_df = self.features_df.clone()
+    filename = os.path.abspath(filename)
+    # add a column has_ms2=True if colum ms2_scans is not None
+    if "ms2_scans" in clean_df.columns:
+        clean_df = clean_df.with_columns(
+            (pl.col("ms2_scans").is_not_null()).alias("has_ms2")
+        )
     clean_df = self.features_df.select([
-        col
-        for col in self.features_df.columns
-        if self.features_df[col].dtype not in (pl.List, pl.Object)
+        col for col in self.features_df.columns if self.features_df[col].dtype not in (pl.List, pl.Object)
     ])
-    clean_df.write_csv(filename)
-    self.logger.info(f"Features exported to {filename}")
+    if filename.lower().endswith((".xls", ".xlsx")):
+        clean_df.to_pandas().to_excel(filename, index=False)
+        self.logger.info(f"Features exported to {filename} (Excel format)")
+    else:
+        clean_df.write_csv(filename)
+        self.logger.info(f"Features exported to {filename}")
 def export_mgf(
     self,
-    filename:str="features.mgf",
+    filename: str = "features.mgf",
     use_cache=True,
     selection="best",
     split_energy=True,
@@ -128,7 +168,6 @@ def export_mgf(
     q1_ratio_max=None,
     eic_corr_min=None,
     deisotope=True,
-    verbose=False,
     precursor_trim=-(-10.0),
     centroid_algo=None,
 ):
@@ -175,14 +214,28 @@ def export_mgf(
             return
         else:
             self.features_df = self.features.get_df()
+    # Apply filtering at DataFrame level for better performance
     features = self.features_df
-    # iterate over all features
+    if mz_start is not None:
+        features = features.filter(pl.col("mz") >= mz_start)
+    if mz_end is not None:
+        features = features.filter(pl.col("mz") <= mz_end)
+    if rt_start is not None:
+        features = features.filter(pl.col("rt") >= rt_start)
+    if rt_end is not None:
+        features = features.filter(pl.col("rt") <= rt_end)
+    if not include_all_ms1:
+        features = features.filter(pl.col("ms2_scans").is_not_null())
+    # Convert to list of dictionaries for faster iteration
+    features_list = features.to_dicts()
     def filter_peaks(spec, inty_min=None, q1_min=None, eic_min=None, q1_max=None):
         # create a copy of the spectrum
         spec = spec.copy()
-        l = len(spec.mz)
-        mask = [True] * l
+        spec_len = len(spec.mz)
+        mask = [True] * spec_len
         if inty_min is not None and inty_min > 0:
             mask = np.array(mask) & (spec.inty >= inty_min)
         # check if q1_ratio is an attribute of spec
@@ -201,9 +254,9 @@ def export_mgf(
                 getattr(spec, attr),
                 np.ndarray,
             ):
-                # check if attr has attribute 0 and its length is equal to l:
+                # check if attr has attribute 0 and its length is equal to spec_len:
                 if hasattr(getattr(spec, attr), "__len__"):
-                    if len(getattr(spec, attr)) == l:
+                    if len(getattr(spec, attr)) == spec_len:
                         setattr(spec, attr, getattr(spec, attr)[mask])
         return spec
@@ -218,47 +271,54 @@ def export_mgf(
         else:
             f.write(f"MSLEVEL={spect.ms_level}\n")
         if spect.ms_level is not None:
-            if spect.ms_level > 1 and "energy" in spect.__dict__:
+            if spect.ms_level > 1 and hasattr(spect, "energy"):
                 f.write(f"ENERGY={spect.energy}\n")
-        for mz, inty in zip(spect.mz, spect.inty, strict=False):
-            f.write(f"{mz:.5f} {inty:.0f}\n")
+        # Use list comprehension for better performance
+        peak_lines = [f"{mz_val:.5f} {inty_val:.0f}\n" for mz_val, inty_val in zip(spect.mz, spect.inty, strict=False)]
+        f.writelines(peak_lines)
         f.write("END IONS\n\n")
     if centroid_algo is None:
-        if "centroid_algo" in self.parameters:
-            centroid_algo = self.parameters["centroid_algo"]
+        if hasattr(self.parameters, "centroid_algo"):
+            centroid_algo = self.parameters.centroid_algo
         else:
             centroid_algo = "cr"
+    # count how many features have charge < 0
+    if self.features_df.filter(pl.col("charge") < 0).shape[0]- self.features_df.filter(pl.col("charge") > 0).shape[0] > 0:
+        preferred_charge = -1
+    else:
+        preferred_charge = 1
     c = 0
     skip = 0
     # check if features is empty
-    if len(features) == 0:
+    if len(features_list) == 0:
         self.logger.warning("No features found.")
         return
+    filename = os.path.abspath(filename)
     with open(filename, "w", encoding="utf-8") as f:
         tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-        for i in tqdm(
-            range(len(features)),
-            total=len(features),
+        for row in tqdm(
+            features_list,
+            total=len(features_list),
             desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Export MGF",
             disable=tdqm_disable,
         ):
-            row = features.row(i, named=True)
-            if mz_start is not None and row["mz"] < mz_start:
-                continue
-            if mz_end is not None and row["mz"] > mz_end:
-                continue
-            if rt_start is not None and row["rt"] < rt_start:
-                continue
-            if rt_end is not None and row["rt"] > rt_end:
-                continue
+            # Pre-calculate common values
+            feature_uid = row["feature_uid"]
+            mz = row["mz"]
+            rt = row["rt"]
+            rt_str = f"{rt:.2f}"
+            mz_str = f"{mz:.4f}"
+            # Filtering is now done at DataFrame level, so we can skip these checks
             if row["ms2_scans"] is None and not include_all_ms1:
                 skip = skip + 1
                 continue
             # write MS1 spectrum
-            ms1_scan_uid = self.find_closest_scan(rt=row["rt"])["scan_uid"]
+            ms1_scan_uid = self.find_closest_scan(rt=rt)["scan_uid"]
             spect = self.get_spectrum(
                 ms1_scan_uid,
                 centroid=centroid,
@@ -271,17 +331,21 @@ def export_mgf(
             if not full_ms1:
                 # trim spectrum to region around the precursor, it's wide to potentially identify adducts
                 spect = spect.trim(
-                    mz_min=row["mz"] - 50,
-                    mz_max=row["mz"] + 50,
+                    mz_min=mz - 50,
+                    mz_max=mz + 50,
                 )
+            charge = preferred_charge
+            if row["charge"] is not None and row["charge"] != 0:
+                    charge = row["charge"]
             write_ion(
                 f,
-                f"fid:{row['feature_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}",
-                row["feature_uid"],
-                row["mz"],
-                row["rt"],
-                row["charge"],
+                f"feature_uid:{feature_uid}, rt:{rt_str}, mz:{mz_str}",
+                feature_uid,
+                mz,
+                rt,
+                charge,
                 spect,
             )
@@ -319,29 +383,24 @@ def export_mgf(
                                 q1_max=q1_ratio_max,
                             )
                             # Get the corresponding scan_uid from the list
-                            current_scan_uid = (
-                                scan_uids[i] if i < len(scan_uids) else "unknown"
-                            )
+                            current_scan_uid = scan_uids[i] if i < len(scan_uids) else "unknown"
                             write_ion(
                                 f,
-                                f"fid:{row['feature_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, scan_uid:{current_scan_uid}",
-                                row["feature_uid"],
-                                row["mz"],
-                                row["rt"],
-                                row["charge"],
+                                f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{current_scan_uid}",
+                                feature_uid,
+                                mz,
+                                rt,
+                                charge,
                                 s,
                             )
+                            c += 1
             elif split_energy:
                 # get energy of all scans with scan_uid in ms2_scans
                 energy = [s.energy for s in row["ms2_specs"]]
                 # find unique energies
                 unique_energies = list(set(energy))
                 for e in unique_energies:
-                    ms2_scans = [
-                        row["ms2_scans"][i]
-                        for i, s in enumerate(row["ms2_specs"])
-                        if s.energy == e
-                    ]
+                    ms2_scans = [row["ms2_scans"][i] for i, s in enumerate(row["ms2_specs"]) if s.energy == e]
                     if selection == "best":
                         # Keep as list with single element
                         ms2_scans = [ms2_scans[0]]
@@ -362,13 +421,14 @@ def export_mgf(
                         )
                         write_ion(
                             f,
-                            f"fid:{row['feature_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, scan_uid:{scan_uid}, energy:{e}",
-                            row["feature_uid"],
-                            row["mz"],
-                            row["rt"],
-                            row["charge"],
+                            f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{scan_uid}, energy:{e}",
+                            feature_uid,
+                            mz,
+                            rt,
+                            charge,
                             spect,
                         )
+                        c += 1
             else:
                 if selection == "best":
                     ms2_scans = row["ms2_scans"][0]
@@ -388,13 +448,14 @@ def export_mgf(
                     )
                     write_ion(
                         f,
-                        f"fid:{row['feature_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, scan_uid:{ms2_scans}",
-                        row["feature_uid"],
-                        row["mz"],
-                        row["rt"],
-                        row["charge"],
+                        f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
+                        feature_uid,
+                        mz,
+                        rt,
+                        charge,
                         spect,
                     )
+                    c += 1
                 elif selection == "all":
                     if merge:
                         specs = []
@@ -414,23 +475,19 @@ def export_mgf(
                                 spect = spect.centroid(
                                     tolerance=self.parameters["mz_tol_ms1_da"],
                                     ppm=self.parameters["mz_tol_ms1_ppm"],
-                                    min_points=self.parameters[
-                                        "centroid_min_points_ms1"
-                                    ],
+                                    min_points=self.parameters["centroid_min_points_ms1"],
                                     algo=centroid_algo,
                                 )
                             elif spect.ms_level == 2:
                                 spect = spect.centroid(
                                     tolerance=self.parameters["mz_tol_ms2_da"],
                                     ppm=self.parameters["mz_tol_ms2_ppm"],
-                                    min_points=self.parameters[
-                                        "centroid_min_points_ms2"
-                                    ],
+                                    min_points=self.parameters["centroid_min_points_ms2"],
                                     algo=centroid_algo,
                                 )
                         if deisotope:
                             spect = spect.deisotope()
-                        title = f"fid:{row['fid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, merged"
+                        title = f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, merged"
                         spect = filter_peaks(
                             spect,
                             inty_min=inty_min,
@@ -441,12 +498,13 @@ def export_mgf(
                         write_ion(
                             f,
                             title,
-                            row["feature_uid"],
-                            row["mz"],
-                            row["rt"],
-                            row["charge"],
+                            feature_uid,
+                            mz,
+                            rt,
+                            charge,
                             spect,
                         )
+                        c += 1
                     else:
                         for ms2_scans in row["ms2_scans"]:
                             spect = self.get_spectrum(
@@ -465,24 +523,30 @@ def export_mgf(
                             )
                             write_ion(
                                 f,
-                                f"fid:{row['feature_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, scan_uid:{ms2_scans}",
-                                row["feature_uid"],
-                                row["mz"],
-                                row["rt"],
-                                row["charge"],
+                                f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
+                                feature_uid,
+                                mz,
+                                rt,
+                                charge,
                                 spect,
                             )
+                            c += 1
-    self.logger.info(f"Exported {c - skip} features to {filename}")
+    self.logger.info(f"Exported {c} features to {filename}")
+    # Handle None values in logging
+    inty_min_str = f"{inty_min:.3f}" if inty_min != float("-inf") else "None"
+    q1_ratio_min_str = f"{q1_ratio_min:.3f}" if q1_ratio_min is not None else "None"
+    eic_corr_min_str = f"{eic_corr_min:.3f}" if eic_corr_min is not None else "None"
     self.logger.debug(
-        f"MGF created with int>{inty_min:.3f}, q1_ratio>{q1_ratio_min:.3f}, eic_corr>{eic_corr_min:.3f}",
+        f"MGF created with int>{inty_min_str}, q1_ratio>{q1_ratio_min_str}, eic_corr>{eic_corr_min_str}",
     )
     self.logger.debug(
-        f"- Exported {c} MS2 features for {len(features) - skip} precursors. Average peaks/feature is {c / (len(features) - skip + 0.000000001):.0f}",
+        f"- Exported {c} MS2 spectra for {len(features_list) - skip} precursors. Average spectra/feature is {c / (len(features_list) - skip + 0.000000001):.0f}",
     )
     self.logger.debug(
-        f"- Skipped {skip} features because no MS2 peaks were left after filtering.",
+        f"- Skipped {skip} features because no MS2 scans were available.",
     )
@@ -510,9 +574,7 @@ def export_dda_stats(self, filename="stats.csv"):
     ms2_count = len(self.scans_df.filter(pl.col("ms_level") == 2))
     features_count = len(self.features_df) if self.features_df is not None else 0
     features_with_ms2 = (
-        self.features_df.filter(pl.col("ms2_scans").is_not_null()).height
-        if self.features_df is not None
-        else 0
+        self.features_df.filter(pl.col("ms2_scans").is_not_null()).height if self.features_df is not None else 0
     )
     # Initialize a dictionary to hold statistics
@@ -527,9 +589,7 @@ def export_dda_stats(self, filename="stats.csv"):
     if "time_cycle" in self.scans_df.columns:
         ms1_df = self.scans_df.filter(pl.col("ms_level") == 1)
         avg_cycle_time = ms1_df["time_cycle"].mean()
-        stats["Average_cycle_time"] = (
-            avg_cycle_time if avg_cycle_time is not None else ""
-        )
+        stats["Average_cycle_time"] = avg_cycle_time if avg_cycle_time is not None else ""
     else:
         stats["Average_cycle_time"] = 0

{masster-0.2.3 → masster-0.2.4}/uv.lock RENAMED Viewed

@@ -1585,7 +1585,7 @@ wheels = [
 [[package]]
 name = "masster"
-version = "0.2.3"
+version = "0.2.4"
 source = { editable = "." }
 dependencies = [
     { name = "alphabase" },