PyPI - masster - Versions diffs - 0.4.20__py3-none-any.whl → 0.4.21__py3-none-any.whl - Mend

masster 0.4.20__py3-none-any.whl → 0.4.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (28) hide show

masster/__init__.py +6 -0
masster/_version.py +1 -1
masster/sample/h5.py +58 -1
masster/sample/load.py +7 -1
masster/sample/plot.py +56 -65
masster/sample/processing.py +158 -0
masster/sample/sample.py +2 -0
masster/sample/sample5_schema.json +3 -0
masster/sample/save.py +137 -59
masster/spectrum.py +58 -9
masster/study/export.py +238 -152
masster/study/h5.py +65 -1
masster/study/helpers.py +3 -3
masster/study/merge.py +25 -10
masster/study/plot.py +39 -2
masster/study/processing.py +257 -1
masster/study/save.py +48 -5
masster/study/study.py +16 -3
masster/study/study5_schema.json +3 -0
masster/wizard/__init__.py +5 -2
masster/wizard/wizard.py +430 -1866
{masster-0.4.20.dist-info → masster-0.4.21.dist-info}/METADATA +1 -1
{masster-0.4.20.dist-info → masster-0.4.21.dist-info}/RECORD +26 -28
masster/wizard/test_structure.py +0 -49
masster/wizard/test_wizard.py +0 -285
{masster-0.4.20.dist-info → masster-0.4.21.dist-info}/WHEEL +0 -0
{masster-0.4.20.dist-info → masster-0.4.21.dist-info}/entry_points.txt +0 -0
{masster-0.4.20.dist-info → masster-0.4.21.dist-info}/licenses/LICENSE +0 -0

masster/sample/save.py CHANGED Viewed

@@ -230,8 +230,9 @@ def export_mgf(
         features = features.filter(pl.col("rt") >= rt_start)
     if rt_end is not None:
         features = features.filter(pl.col("rt") <= rt_end)
-    if not include_all_ms1:
-        features = features.filter(pl.col("ms2_scans").is_not_null())
+    # Note: We no longer filter out features without MS2 data here since we want to export
+    # MS1 spectra for ALL features with isotope data. The MS2 filtering is done in the
+    # second pass where we specifically check for ms2_scans.
     # Convert to list of dictionaries for faster iteration
     features_list = features.to_dicts()
@@ -265,16 +266,42 @@ def export_mgf(
                         setattr(spec, attr, getattr(spec, attr)[mask])
         return spec
-    def write_ion(f, title, fid, mz, rt, charge, spect):
+    def write_ion(f, title, fuid, fid, mz, rt, charge, spect):
         if spect is None:
-            return
-        f.write(f"BEGIN IONS\nTITLE={title}\n")
+            return "none"
+        # For MSLEVEL=2 ions, don't write empty spectra
+        ms_level = spect.ms_level if spect.ms_level is not None else 1
+        if ms_level > 1 and (len(spect.mz) == 0 or len(spect.inty) == 0):
+            return "empty_ms2"
+        # Create dynamic title based on MS level
+        if ms_level == 1:
+            # MS1: uid, rt, mz
+            dynamic_title = f"uid:{fuid}, rt:{rt:.2f}, mz:{mz:.4f}"
+        else:
+            # MS2: uid, rt, mz, energy
+            energy = spect.energy if hasattr(spect, 'energy') else 0
+            dynamic_title = f"uid:{fuid}, rt:{rt:.2f}, mz:{mz:.4f}, energy:{energy}"
+        f.write(f"BEGIN IONS\nTITLE={dynamic_title}\n")
+        f.write(f"FEATURE_UID={fuid}\n")
         f.write(f"FEATURE_ID={fid}\n")
         f.write(f"CHARGE={charge}\nPEPMASS={mz}\nRTINSECONDS={rt}\n")
         if spect.ms_level is None:
             f.write("MSLEVEL=1\n")
+            # Add PRECURSORINTENSITY for MS1 spectra
+            if len(spect.inty) > 0:
+                precursor_intensity = max(spect.inty)
+                f.write(f"PRECURSORINTENSITY={precursor_intensity:.0f}\n")
         else:
             f.write(f"MSLEVEL={spect.ms_level}\n")
+            # Add PRECURSORINTENSITY for MS1 spectra
+            if spect.ms_level == 1 and len(spect.inty) > 0:
+                precursor_intensity = max(spect.inty)
+                f.write(f"PRECURSORINTENSITY={precursor_intensity:.0f}\n")
         if spect.ms_level is not None:
             if spect.ms_level > 1 and hasattr(spect, "energy"):
                 f.write(f"ENERGY={spect.energy}\n")
@@ -285,6 +312,7 @@ def export_mgf(
         ]
         f.writelines(peak_lines)
         f.write("END IONS\n\n")
+        return "written"
     if centroid_algo is None:
         if hasattr(self.parameters, "centroid_algo"):
@@ -304,6 +332,9 @@ def export_mgf(
     c = 0
     skip = 0
+    empty_ms2_count = 0
+    ms1_spec_used_count = 0
+    ms1_fallback_count = 0
     # check if features is empty
     if len(features_list) == 0:
         self.logger.warning("No features found.")
@@ -311,57 +342,80 @@ def export_mgf(
     filename = os.path.abspath(filename)
     with open(filename, "w", encoding="utf-8") as f:
         tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+        # First pass: Export MS1 spectra for ALL features with ms1_spec data
+        print("Exporting MS1 spectra...")
         for row in tqdm(
             features_list,
             total=len(features_list),
-            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Export MGF",
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Export MS1 spectra",
             disable=tdqm_disable,
         ):
             # Pre-calculate common values
             feature_uid = row["feature_uid"]
+            feature_id = row["feature_id"] if "feature_id" in row else feature_uid
             mz = row["mz"]
             rt = row["rt"]
             rt_str = f"{rt:.2f}"
             mz_str = f"{mz:.4f}"
-            # Filtering is now done at DataFrame level, so we can skip these checks
-            if row["ms2_scans"] is None and not include_all_ms1:
-                skip = skip + 1
-                continue
+            # Export MS1 spectrum for ALL features with ms1_spec data
+            if "ms1_spec" in row and row["ms1_spec"] is not None:
+                # Create spectrum from ms1_spec isotope pattern data
+                from masster.spectrum import Spectrum
+                iso_data = row["ms1_spec"]
+                if len(iso_data) >= 2:  # Ensure we have mz and intensity arrays
+                    ms1_mz = iso_data[0]
+                    ms1_inty = iso_data[1]
+                    # Create a Spectrum object from the isotope data
+                    spect = Spectrum(
+                        mz=np.array(ms1_mz),
+                        inty=np.array(ms1_inty),
+                        ms_level=1
+                    )
+                    charge = preferred_charge
+                    if row["charge"] is not None and row["charge"] != 0:
+                        charge = row["charge"]
-            # write MS1 spectrum
-            ms1_scan_uid = self.select_closest_scan(rt=rt)["scan_uid"][0]
-            spect = self.get_spectrum(
-                ms1_scan_uid,
-                centroid=centroid,
-                deisotope=deisotope,
-                centroid_algo=centroid_algo,
-            )
-            spect = filter_peaks(spect, inty_min=inty_min)
-            if not full_ms1:
-                # trim spectrum to region around the precursor, it's wide to potentially identify adducts
-                spect = spect.trim(
-                    mz_min=mz - 50,
-                    mz_max=mz + 50,
-                )
-            charge = preferred_charge
-            if row["charge"] is not None and row["charge"] != 0:
-                charge = row["charge"]
-            write_ion(
-                f,
-                f"feature_uid:{feature_uid}, rt:{rt_str}, mz:{mz_str}",
-                feature_uid,
-                mz,
-                rt,
-                charge,
-                spect,
-            )
+                    write_ion(
+                        f,
+                        f"uid:{feature_uid}",
+                        feature_uid,
+                        feature_id,
+                        mz,
+                        rt,
+                        charge,
+                        spect,
+                    )
+                    ms1_spec_used_count += 1
+                else:
+                    ms1_fallback_count += 1
+            else:
+                # No MS1 spectrum exported for features without ms1_spec data
+                ms1_fallback_count += 1
+        # Second pass: Export MS2 spectra for features with MS2 data
+        print("Exporting MS2 spectra...")
+        for row in tqdm(
+            features_list,
+            total=len(features_list),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Export MS2 spectra",
+            disable=tdqm_disable,
+        ):
+            # Pre-calculate common values
+            feature_uid = row["feature_uid"]
+            feature_id = row["feature_id"] if "feature_id" in row else feature_uid
+            mz = row["mz"]
+            rt = row["rt"]
+            rt_str = f"{rt:.2f}"
+            mz_str = f"{mz:.4f}"
+            # Skip features without MS2 data (unless include_all_ms1 is True, but we already handled MS1 above)
             if row["ms2_scans"] is None:
+                skip = skip + 1
                 continue
             elif use_cache:
                 spect = row["ms2_specs"]
@@ -399,16 +453,20 @@ def export_mgf(
                                 current_scan_uid = (
                                     scan_uids[i] if i < len(scan_uids) else "unknown"
                                 )
-                                write_ion(
+                                result = write_ion(
                                     f,
-                                    f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{current_scan_uid}",
+                                    f"uid:{feature_uid}",
                                     feature_uid,
+                                    feature_id,
                                     mz,
                                     rt,
                                     charge,
                                     s,
                                 )
-                                c += 1
+                                if result == "written":
+                                    c += 1
+                                elif result == "empty_ms2":
+                                    empty_ms2_count += 1
                         continue  # Skip the rest of the processing for this feature
             # If we reach here, either use_cache=False or no cached spectra were available
@@ -455,16 +513,20 @@ def export_mgf(
                                 eic_min=eic_corr_min,
                                 q1_max=q1_ratio_max,
                             )
-                            write_ion(
+                            result = write_ion(
                                 f,
-                                f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{scan_uid}, energy:{energy}",
+                                f"uid:{feature_uid}",
                                 feature_uid,
+                                feature_id,
                                 mz,
                                 rt,
                                 charge,
                                 spect,
                             )
-                            c += 1
+                            if result == "written":
+                                c += 1
+                            elif result == "empty_ms2":
+                                empty_ms2_count += 1
             else:
                 if selection == "best":
                     ms2_scans = row["ms2_scans"][0]
@@ -482,16 +544,20 @@ def export_mgf(
                         eic_min=eic_corr_min,
                         q1_max=q1_ratio_max,
                     )
-                    write_ion(
+                    result = write_ion(
                         f,
-                        f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
+                        f"uid:{feature_uid}",
                         feature_uid,
+                        feature_id,
                         mz,
                         rt,
                         charge,
                         spect,
                     )
-                    c += 1
+                    if result == "written":
+                        c += 1
+                    elif result == "empty_ms2":
+                        empty_ms2_count += 1
                 elif selection == "all":
                     if merge:
                         specs = []
@@ -527,7 +593,7 @@ def export_mgf(
                                 )
                         if deisotope:
                             spect = spect.deisotope()
-                        title = f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, merged"
+                        title = f"uid:{feature_uid}"
                         spect = filter_peaks(
                             spect,
                             inty_min=inty_min,
@@ -535,16 +601,20 @@ def export_mgf(
                             eic_min=eic_corr_min,
                             q1_max=q1_ratio_max,
                         )
-                        write_ion(
+                        result = write_ion(
                             f,
                             title,
                             feature_uid,
+                            feature_id,
                             mz,
                             rt,
                             charge,
                             spect,
                         )
-                        c += 1
+                        if result == "written":
+                            c += 1
+                        elif result == "empty_ms2":
+                            empty_ms2_count += 1
                     else:
                         for ms2_scans in row["ms2_scans"]:
                             spect = self.get_spectrum(
@@ -561,19 +631,27 @@ def export_mgf(
                                 eic_min=eic_corr_min,
                                 q1_max=q1_ratio_max,
                             )
-                            write_ion(
+                            result = write_ion(
                                 f,
-                                f"fid:{feature_uid}, rt:{rt_str}, mz:{mz_str}, scan_uid:{ms2_scans}",
+                                f"uid:{feature_uid}",
                                 feature_uid,
+                                feature_id,
                                 mz,
                                 rt,
                                 charge,
                                 spect,
                             )
-                            c += 1
-    self.logger.info(f"Exported {c} features to {filename}")
+                            if result == "written":
+                                c += 1
+                            elif result == "empty_ms2":
+                                empty_ms2_count += 1
+    self.logger.info(f"Exported {ms1_spec_used_count} MS1 spectra and {c} MS2 spectra to {filename}")
+    if empty_ms2_count > 0:
+        self.logger.info(f"Skipped {empty_ms2_count} empty MS2 spectra")
+    if ms1_fallback_count > 0:
+        self.logger.info(f"Skipped MS1 export for {ms1_fallback_count} features without isotope patterns")
     # Handle None values in logging
     inty_min_str = f"{inty_min:.3f}" if inty_min != float("-inf") else "None"
     q1_ratio_min_str = f"{q1_ratio_min:.3f}" if q1_ratio_min is not None else "None"

masster/spectrum.py CHANGED Viewed

@@ -197,16 +197,65 @@ class Spectrum:
             self.bl = None
     def check_if_centroided(self) -> bool:
-        if self.mz.size == 0:
-            return True
-        mzs = self.mz[self.mz < np.min(self.mz) + 0.4]
-        if len(mzs) < 20:
-            if len(mzs) < 3:
-                return True
-            min_distance = np.min(np.diff(mzs))
-            if min_distance > 0.003:
+        """
+        Fast determination if spectrum data is centroided or profile.
+        Uses optimized statistical approaches with early exits for speed:
+        1. Fast median difference check (most decisive)
+        2. Small gap ratio (profile characteristic)
+        3. Density check (fallback)
+        Returns:
+            bool: True if centroided, False if profile
+        """
+        if self.mz.size < 5:
+            return True  # Too few points to determine, assume centroided
+        # Fast path: check if mz is already sorted to avoid sorting cost
+        if np.all(self.mz[:-1] <= self.mz[1:]):
+            sorted_mz = self.mz
+        else:
+            sorted_mz = np.sort(self.mz)
+        # Calculate differences efficiently
+        mz_diffs = np.diff(sorted_mz)
+        # Remove zeros efficiently (keep positive differences)
+        mz_diffs = mz_diffs[mz_diffs > 0]
+        if mz_diffs.size == 0:
+            return True  # All identical m/z values
+        # Fast approach 1: Median difference (most decisive, compute once)
+        median_diff = np.median(mz_diffs)
+        # Early exits for clear cases (>90% of cases)
+        if median_diff > 0.02:
+            return True  # Clearly centroided
+        elif median_diff < 0.005:
+            return False  # Clearly profile
+        # Fast approach 2: Small gap ratio (for borderline cases)
+        # Use vectorized comparison instead of creating new array
+        small_gap_count = np.sum(mz_diffs < 0.005)
+        small_gap_ratio = small_gap_count / mz_diffs.size
+        if small_gap_ratio > 0.7:
+            return False  # High ratio of small gaps = profile
+        elif small_gap_ratio < 0.1:
+            return True   # Low ratio of small gaps = centroided
+        # Fast approach 3: Density check (final fallback)
+        mz_range = sorted_mz[-1] - sorted_mz[0]
+        if mz_range > 0:
+            density = sorted_mz.size / mz_range
+            if density > 100:  # High density = profile
+                return False
+            elif density < 10:  # Low density = centroided
                 return True
-        return False
+        # Final fallback: median threshold
+        return median_diff > 0.01
     def reload(self):
         modname = self.__class__.__module__

masster 0.4.20__py3-none-any.whl → 0.4.21__py3-none-any.whl

Potentially problematic release.

masster 0.4.20__py3-none-any.whl → 0.4.21__py3-none-any.whl