masster-0.4.20-py3-none-any.whl → masster-0.4.22-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

masster/study/export.py CHANGED
@@ -78,26 +78,31 @@ def _get_mgf_df(self, **kwargs):
     if self.consensus_df is None:
         self.logger.error("No consensus map found. Please run merge() first.")
         return None
-    if self.consensus_ms2 is None:
-        self.logger.error("No consensus MS2 data found. Please run link_ms2() first.")
-        return None
-
-    # Convert to pandas for merge operation since the result is used for groupby
-    consensus_df_pd = self.consensus_df.to_pandas()
-    consensus_ms2_pd = self.consensus_ms2.to_pandas()
-
-    features = pd.merge(
-        consensus_df_pd,
-        consensus_ms2_pd,
-        how="right",
-        on="consensus_uid",
-    )
-    if len(features) == 0:
-        self.logger.warning("No features found.")
-        return pl.DataFrame()
-
-    # Pre-group by consensus_uid for fast access
-    grouped = features.groupby("consensus_uid")
+
+    # MS2 data is optional - we can generate MS1 data without it
+    ms2_available = self.consensus_ms2 is not None and not self.consensus_ms2.is_empty()
+    if not ms2_available:
+        self.logger.info("No consensus MS2 data found. Generating MS1-only MGF data.")
+
+    # Convert to pandas for merge operation only if we have MS2 data
+    if ms2_available:
+        consensus_df_pd = self.consensus_df.to_pandas()
+        consensus_ms2_pd = self.consensus_ms2.to_pandas()
+
+        features = pd.merge(
+            consensus_df_pd,
+            consensus_ms2_pd,
+            how="right",
+            on="consensus_uid",
+        )
+        if len(features) == 0:
+            self.logger.warning("No MS2 features found.")
+            grouped = {}  # Empty groupby result
+        else:
+            # Pre-group by consensus_uid for fast access
+            grouped = features.groupby("consensus_uid")
+    else:
+        grouped = {}  # No MS2 data available
 
     def filter_peaks(spec, inty_min=None):
         spec = spec.copy()
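This hunk makes MS2 linking optional instead of a hard requirement: `_get_mgf_df()` previously bailed out with an error when `consensus_ms2` was missing, and now falls back to MS1-only output. A minimal sketch of the guard pattern, assuming only that `consensus_ms2` is a Polars DataFrame or `None` (the toy frames below are illustrative, not masster data):

```python
import polars as pl

def ms2_is_available(consensus_ms2: pl.DataFrame | None) -> bool:
    # Same test as the diff: both a missing and an empty frame disable MS2 export.
    return consensus_ms2 is not None and not consensus_ms2.is_empty()

print(ms2_is_available(None))                                  # False
print(ms2_is_available(pl.DataFrame()))                        # False
print(ms2_is_available(pl.DataFrame({"consensus_uid": [1]})))  # True
```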
@@ -115,6 +120,12 @@ def _get_mgf_df(self, **kwargs):
                 setattr(spec, attr, np.array(arr)[mask])
         return spec
 
+    def safe_charge(charge_value):
+        """Safely convert charge value to integer, handling NaN and None"""
+        if charge_value is None or (isinstance(charge_value, float) and np.isnan(charge_value)):
+            return 1
+        return int(round(charge_value))
+
     def create_ion_dict(title, id, uid, mz, rt, charge, spect, mgf_id):
         """Create a dictionary representing an ion for the DataFrame."""
         if spect is None:
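The new `safe_charge` helper guards the `round(...)` calls that previously ran directly on `charge_mean`: `round(float("nan"))` raises `ValueError`, so features with no assigned charge crashed the export. A quick standalone check of the helper's behavior (copied here for illustration):

```python
import numpy as np

def safe_charge(charge_value):
    """Safely convert charge value to integer, handling NaN and None."""
    if charge_value is None or (isinstance(charge_value, float) and np.isnan(charge_value)):
        return 1
    return int(round(charge_value))

assert safe_charge(None) == 1          # missing charge defaults to 1
assert safe_charge(float("nan")) == 1  # NaN (e.g. an all-null charge_mean) defaults to 1
assert safe_charge(1.6) == 2           # ordinary floats round to the nearest int
```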
@@ -160,65 +171,115 @@ def _get_mgf_df(self, **kwargs):
     ion_data = []
     skip = 0
     mgf_counter = 0
-    self.logger.info(f"Generating MGF data for {len(grouped)} consensus features...")
-
-    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-    for _consensus_uid, cons_ms2 in tqdm(
-        grouped,
-        total=len(grouped),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Feature",
-        disable=tdqm_disable,
-    ):
-        # Use the first row for feature-level info
-        row = cons_ms2.iloc[0]
-        if mz_start is not None and row["mz"] < mz_start:
-            continue
-        if mz_end is not None and row["mz"] > mz_end:
+    self.logger.debug(f"Generating MGF data for {len(self.consensus_df)} consensus features...")
+
+    # First, generate MS1 spectra for all consensus features using isotope data
+    self.logger.debug("Generating MS1 spectra from isotope data...")
+    for row in self.consensus_df.iter_rows(named=True):
+        # Apply filtering at individual feature level for MS1 data
+        consensus_uid = row["consensus_uid"]
+        consensus_mz = row["mz"]
+        consensus_rt = row["rt"]
+        consensus_inty_mean = row.get("inty_mean", 0)
+
+        if mz_start is not None and consensus_mz < mz_start:
             continue
-        if rt_start is not None and row["rt"] < rt_start:
+        if mz_end is not None and consensus_mz > mz_end:
             continue
-        if rt_end is not None and row["rt"] > rt_end:
+        if rt_start is not None and consensus_rt < rt_start:
             continue
-        if len(cons_ms2) == 0:
-            skip += 1
+        if rt_end is not None and consensus_rt > rt_end:
             continue
+
+        # Create MS1 spectrum using isotope data
+        iso_data = row.get("iso", None)
+
+        if iso_data is not None and len(iso_data) > 0:
+            # Use isotope data for spectrum
+            spectrum_mz = [float(peak[0]) for peak in iso_data]
+            spectrum_inty = [float(peak[1]) for peak in iso_data]
+        else:
+            # Use consensus mz and inty_mean as single peak
+            spectrum_mz = [float(consensus_mz)]
+            spectrum_inty = [float(consensus_inty_mean)]
+
+        # Apply intensity minimum filter if specified
+        if inty_min is not None and inty_min > 0:
+            filtered_pairs = [(mz, inty) for mz, inty in zip(spectrum_mz, spectrum_inty, strict=False) if inty >= inty_min]
+            if filtered_pairs:
+                spectrum_mz, spectrum_inty = zip(*filtered_pairs, strict=False)
+                spectrum_mz = list(spectrum_mz)
+                spectrum_inty = list(spectrum_inty)
+            else:
+                # If all peaks are below threshold, skip this feature
+                continue
+
+        mgf_counter += 1
+
+        # Create MS1 spectrum object to use with create_ion_dict
+        class SimpleSpectrum:
+            def __init__(self, mz_list, inty_list):
+                self.mz = np.array(mz_list)
+                self.inty = np.array(inty_list)
+                self.ms_level = 1
+                self.energy = None
+
+        ms1_spectrum = SimpleSpectrum(spectrum_mz, spectrum_inty)
+
+        # Use create_ion_dict to ensure consistent schema
+        ion_dict = create_ion_dict(
+            f"uid:{consensus_uid}, rt:{consensus_rt:.2f}, mz:{consensus_mz:.4f}, MS1",
+            row["consensus_id"],
+            consensus_uid,
+            consensus_mz,
+            consensus_rt,
+            safe_charge(row.get("charge_mean")),
+            ms1_spectrum,
+            mgf_counter,
+        )
+
+        if ion_dict is not None:
+            ion_data.append(ion_dict)
+
+    self.logger.debug(f"Generated {len(ion_data)} MS1 spectra from isotope data")
+
+    # Now generate MS2 spectra if available
+    if ms2_available and len(grouped) > 0:
+        self.logger.debug(f"Processing MS2 data for {len(grouped)} consensus features with MS2...")
+        tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+        for _consensus_uid, cons_ms2 in tqdm(
+            grouped,
+            total=len(grouped),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Feature",
+            disable=tdqm_disable,
+        ):
+            # Use the first row for feature-level info
+            row = cons_ms2.iloc[0]
+            if mz_start is not None and row["mz"] < mz_start:
+                continue
+            if mz_end is not None and row["mz"] > mz_end:
+                continue
+            if rt_start is not None and row["rt"] < rt_start:
+                continue
+            if rt_end is not None and row["rt"] > rt_end:
+                continue
+            if len(cons_ms2) == 0:
+                skip += 1
+                continue
 
-        if split_energy:
-            energies = cons_ms2["energy"].unique()
-            for e in energies:
-                cons_ms2_e = cons_ms2[cons_ms2["energy"] == e]
-                if selection == "best":
-                    # Check if the filtered DataFrame is empty
-                    if len(cons_ms2_e) == 0:
-                        continue
-                    idx = cons_ms2_e["prec_inty"].idxmax()
-                    cons_ms2_e_row = cons_ms2_e.loc[idx]
-                    spect = cons_ms2_e_row["spec"]
-                    if spect is None:
-                        skip += 1
-                        continue
-                    if centroid:
-                        spect = spect.centroid()
-                    if deisotope:
-                        spect = spect.deisotope()
-                    spect = filter_peaks(spect, inty_min=inty_min)
-                    mgf_counter += 1
-                    ion_dict = create_ion_dict(
-                        f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{e}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
-                        cons_ms2_e_row["consensus_id"],
-                        cons_ms2_e_row["consensus_uid"],
-                        cons_ms2_e_row["mz"],
-                        cons_ms2_e_row["rt"],
-                        round(cons_ms2_e_row["charge_mean"]),
-                        spect,
-                        mgf_counter,
-                    )
-                    if ion_dict is not None:
-                        ion_data.append(ion_dict)
-                else:
-                    for row_e in cons_ms2_e.iter_rows(named=True):
-                        spect = row_e["spec"]
+            if split_energy:
+                energies = cons_ms2["energy"].unique()
+                for e in energies:
+                    cons_ms2_e = cons_ms2[cons_ms2["energy"] == e]
+                    if selection == "best":
+                        # Check if the filtered DataFrame is empty
+                        if len(cons_ms2_e) == 0:
+                            continue
+                        idx = cons_ms2_e["prec_inty"].idxmax()
+                        cons_ms2_e_row = cons_ms2_e.loc[idx]
+                        spect = cons_ms2_e_row["spec"]
                         if spect is None:
+                            skip += 1
                             continue
                         if centroid:
                             spect = spect.centroid()
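The new MS1 branch wraps isotope peaks in a small duck-typed object so `create_ion_dict` can consume them exactly like real spectra. One thing worth noting: `SimpleSpectrum` is declared inside the per-feature loop, so the class object is rebuilt on every iteration; hoisting it out keeps the same interface (a sketch below, offered as a suggestion rather than what this release does):

```python
import numpy as np

class SimpleSpectrum:
    """Duck-typed stand-in exposing the attributes create_ion_dict reads."""
    def __init__(self, mz_list, inty_list):
        self.mz = np.array(mz_list)
        self.inty = np.array(inty_list)
        self.ms_level = 1   # MS1 pseudo-spectrum
        self.energy = None  # no collision energy at MS1

# One instance per consensus feature, built from (m/z, intensity) isotope pairs:
ms1_spectrum = SimpleSpectrum([301.1412, 302.1445], [1.0e6, 2.1e5])
```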
@@ -227,104 +288,129 @@ def _get_mgf_df(self, **kwargs):
                         spect = filter_peaks(spect, inty_min=inty_min)
                         mgf_counter += 1
                         ion_dict = create_ion_dict(
-                            f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{e}, sample_uid:{row_e['sample_uid']}, scanid:{row_e['scan_id']}",
-                            row_e["consensus_id"],
-                            row_e["consensus_uid"],
-                            row_e["mz"],
-                            row_e["rt"],
-                            round(row_e["charge_mean"]),
+                            f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{e}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
+                            cons_ms2_e_row["consensus_id"],
+                            cons_ms2_e_row["consensus_uid"],
+                            cons_ms2_e_row["mz"],
+                            cons_ms2_e_row["rt"],
+                            safe_charge(cons_ms2_e_row["charge_mean"]),
                             spect,
                             mgf_counter,
                         )
                         if ion_dict is not None:
                             ion_data.append(ion_dict)
-        else:
-            if selection == "best":
-                idx = cons_ms2["prec_inty"].idxmax()
-                cons_ms2_e_row = cons_ms2.loc[idx]
-                spect = cons_ms2_e_row["spec"]
-                if spect is None:
-                    continue
-                if centroid:
-                    spect = spect.centroid()
-                if deisotope:
-                    spect = spect.deisotope()
-                spect = filter_peaks(spect, inty_min=inty_min)
-                mgf_counter += 1
-                ion_dict = create_ion_dict(
-                    f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{cons_ms2_e_row['energy']}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
-                    cons_ms2_e_row["consensus_id"],
-                    cons_ms2_e_row["consensus_uid"],
-                    cons_ms2_e_row["mz"],
-                    cons_ms2_e_row["rt"],
-                    round(cons_ms2_e_row["charge_mean"]),
-                    spect,
-                    mgf_counter,
-                )
-                if ion_dict is not None:
-                    ion_data.append(ion_dict)
-
-            elif selection == "all":
-                if merge:
-                    specs = [
-                        row_e["spec"]
-                        for row_e in cons_ms2.iter_rows(named=True)
-                        if row_e["spec"] is not None
-                    ]
-                    if not specs:
+                    else:
+                        for row_e in cons_ms2_e.iter_rows(named=True):
+                            spect = row_e["spec"]
+                            if spect is None:
+                                continue
+                            if centroid:
+                                spect = spect.centroid()
+                            if deisotope:
+                                spect = spect.deisotope()
+                            spect = filter_peaks(spect, inty_min=inty_min)
+                            mgf_counter += 1
+                            ion_dict = create_ion_dict(
+                                f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{e}, sample_uid:{row_e['sample_uid']}, scanid:{row_e['scan_id']}",
+                                row_e["consensus_id"],
+                                row_e["consensus_uid"],
+                                row_e["mz"],
+                                row_e["rt"],
+                                safe_charge(row_e["charge_mean"]),
+                                spect,
+                                mgf_counter,
+                            )
+                            if ion_dict is not None:
+                                ion_data.append(ion_dict)
+            else:
+                if selection == "best":
+                    idx = cons_ms2["prec_inty"].idxmax()
+                    cons_ms2_e_row = cons_ms2.loc[idx]
+                    spect = cons_ms2_e_row["spec"]
+                    if spect is None:
                         continue
-                    spect = combine_peaks(specs)
                     if centroid:
-                        spect = spect.denoise()
                         spect = spect.centroid()
                     if deisotope:
                         spect = spect.deisotope()
                     spect = filter_peaks(spect, inty_min=inty_min)
                     mgf_counter += 1
                     ion_dict = create_ion_dict(
-                        f"uid:{row['consensus_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, sample_uid:{row['sample_uid']}, scan_id:{row['scan_id']}",
-                        row["consensus_id"],
-                        row["consensus_uid"],
-                        row["mz"],
-                        row["rt"],
-                        round(row["charge_mean"]),
+                        f"uid:{cons_ms2_e_row['consensus_uid']}, rt:{cons_ms2_e_row['rt']:.2f}, mz:{cons_ms2_e_row['mz']:.4f}, energy:{cons_ms2_e_row['energy']}, sample_uid:{cons_ms2_e_row['sample_uid']}, scan_id:{cons_ms2_e_row['scan_id']}",
+                        cons_ms2_e_row["consensus_id"],
+                        cons_ms2_e_row["consensus_uid"],
+                        cons_ms2_e_row["mz"],
+                        cons_ms2_e_row["rt"],
+                        safe_charge(cons_ms2_e_row["charge_mean"]),
                         spect,
                         mgf_counter,
                     )
                     if ion_dict is not None:
                         ion_data.append(ion_dict)
-            else:
-                for row_e in cons_ms2.iter_rows(named=True):
-                    spect = row_e["spec"]
-                    if spect is None:
+
+                elif selection == "all":
+                    if merge:
+                        specs = [
+                            row_e["spec"]
+                            for row_e in cons_ms2.iter_rows(named=True)
+                            if row_e["spec"] is not None
+                        ]
+                        if not specs:
                             continue
+                        spect = combine_peaks(specs)
                         if centroid:
+                            spect = spect.denoise()
                             spect = spect.centroid()
                         if deisotope:
                             spect = spect.deisotope()
                         spect = filter_peaks(spect, inty_min=inty_min)
                         mgf_counter += 1
                         ion_dict = create_ion_dict(
-                        f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{row_e['energy']}, sample_uid:{row_e['sample_uid']}, scan_id:{row_e['scan_id']}",
-                        row_e["consensus_id"],
-                        row_e["consensus_uid"],
-                        row_e["mz"],
-                        row_e["rt"],
-                        round(row_e["charge_mean"]),
+                            f"uid:{row['consensus_uid']}, rt:{row['rt']:.2f}, mz:{row['mz']:.4f}, sample_uid:{row['sample_uid']}, scan_id:{row['scan_id']}",
+                            row["consensus_id"],
+                            row["consensus_uid"],
+                            row["mz"],
+                            row["rt"],
+                            safe_charge(row["charge_mean"]),
                             spect,
                             mgf_counter,
                         )
                         if ion_dict is not None:
                             ion_data.append(ion_dict)
+                    else:
+                        for row_e in cons_ms2.iter_rows(named=True):
+                            spect = row_e["spec"]
+                            if spect is None:
+                                continue
+                            if centroid:
+                                spect = spect.centroid()
+                            if deisotope:
+                                spect = spect.deisotope()
+                            spect = filter_peaks(spect, inty_min=inty_min)
+                            mgf_counter += 1
+                            ion_dict = create_ion_dict(
+                                f"uid:{row_e['consensus_uid']}, rt:{row_e['rt']:.2f}, mz:{row_e['mz']:.4f}, energy:{row_e['energy']}, sample_uid:{row_e['sample_uid']}, scan_id:{row_e['scan_id']}",
+                                row_e["consensus_id"],
+                                row_e["consensus_uid"],
+                                row_e["mz"],
+                                row_e["rt"],
+                                safe_charge(row_e["charge_mean"]),
+                                spect,
+                                mgf_counter,
+                            )
+                            if ion_dict is not None:
+                                ion_data.append(ion_dict)
+    else:
+        self.logger.info("Skipping MS2 data generation - no MS2 data available")
 
-    self.logger.debug(f"Generated MGF data for {len(ion_data)} spectra")
-    self.logger.debug(f"Skipped {skip} features due to missing data.")
+    self.logger.debug(f"Generated MGF data for {len(ion_data)} spectra (MS1 + MS2)")
+    self.logger.debug(f"Skipped {skip} MS2 features due to missing data.")
 
     # Convert to Polars DataFrame
     if not ion_data:
         return pl.DataFrame()
 
-    return pl.DataFrame(ion_data)
+    return pl.DataFrame(ion_data, infer_schema_length=None)
 
 
 def export_mgf(self, **kwargs):
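The changed return statement is easy to miss but significant: `infer_schema_length=None` makes Polars scan every element of `ion_data` when inferring column dtypes rather than only the default first 100 entries. Since MS1 entries now precede MS2 entries and leave fields such as the collision energy as `None`, the default window can infer a Null dtype and then fail or mis-type the column when a later MS2 row supplies a float. A toy reproduction of the idea (not the real ion schema):

```python
import polars as pl

rows = [{"uid": i, "energy": None} for i in range(150)]  # MS1-style rows first
rows.append({"uid": 150, "energy": 35.0})                # MS2-style row past row 100

# With the default infer_schema_length=100, "energy" may be inferred as Null and
# the constructor can then choke on the float at row 150. Scanning all rows
# yields a Float64 column containing nulls.
df = pl.DataFrame(rows, infer_schema_length=None)
print(df.schema)  # {'uid': Int64, 'energy': Float64}
```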
@@ -412,7 +498,7 @@ def export_mgf(self, **kwargs):
     self.logger.info(f"Exported {len(mgf_data)} spectra to {filename}")
 
 
-def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None:
+def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs) -> None:
     """
     Export the study as a fully compliant mzTab-M file.
 
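This signature change (repeated below for `export_xlsx` and `export_parquet`) replaces the implicit-Optional annotation `filename: str = None`, which strict type checkers flag because `None` is not a `str`, with the explicit PEP 604 union. The two spellings in this illustrative stub are equivalent; the `|` form needs Python 3.10+ unless `from __future__ import annotations` is active:

```python
from typing import Optional

def export_old(filename: Optional[str] = None) -> None: ...  # pre-3.10 spelling
def export_new(filename: str | None = None) -> None: ...     # PEP 604, as in this diff
```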
@@ -1098,7 +1184,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     self.logger.info(f"Exported mzTab-M to {filename}")
 
 
-def export_xlsx(self, filename: str = None) -> None:
+def export_xlsx(self, filename: str | None = None) -> None:
     """
     Export the study data to an Excel workbook with multiple worksheets.
 
@@ -1209,38 +1295,38 @@ def export_xlsx(self, filename: str = None) -> None:
         self.logger.error(f"Error writing Excel file: {e}")
 
 
-def export_parquet(self, basename: str = None) -> None:
+def export_parquet(self, filename: str | None = None) -> None:
     """
     Export the study data to multiple Parquet files with different suffixes.
 
     The export creates separate Parquet files for each dataset:
-    - <basename>_samples.parquet: Samples dataframe
-    - <basename>_consensus.parquet: Consensus features dataframe
-    - <basename>_identification.parquet: Identification results with library annotations
-    - <basename>_matrix.parquet: Consensus matrix with samples as columns
+    - <filename>_samples.parquet: Samples dataframe
+    - <filename>_consensus.parquet: Consensus features dataframe
+    - <filename>_identification.parquet: Identification results with library annotations
+    - <filename>_matrix.parquet: Consensus matrix with samples as columns
 
     Args:
-        basename (str, optional): Base name for the output files. Defaults to "study"
+        filename (str, optional): Base name for the output files. Defaults to "study"
             in the study folder.
     """
-    # Set default basename
-    if basename is None:
-        basename = "study"
+    # Set default filename
+    if filename is None:
+        filename = "study"
 
-    # Make basename absolute path if not already (without extension)
-    if not os.path.isabs(basename):
+    # Make filename absolute path if not already (without extension)
+    if not os.path.isabs(filename):
         if self.folder is not None:
-            basename = os.path.join(self.folder, basename)
+            filename = os.path.join(self.folder, filename)
         else:
-            basename = os.path.join(os.getcwd(), basename)
+            filename = os.path.join(os.getcwd(), filename)
 
-    self.logger.debug(f"Exporting study to Parquet files with basename: {basename}")
+    self.logger.debug(f"Exporting study to Parquet files with filename: {filename}")
 
     exported_files = []
 
     # 1. Samples dataframe
     if self.samples_df is not None and not self.samples_df.is_empty():
-        samples_file = f"{basename}_samples.parquet"
+        samples_file = f"{filename}_samples.parquet"
         try:
             self.samples_df.write_parquet(samples_file)
             exported_files.append(samples_file)
@@ -1256,7 +1342,7 @@ def export_parquet(self, basename: str = None) -> None:
 
     # 2. Consensus dataframe
     if self.consensus_df is not None and not self.consensus_df.is_empty():
-        consensus_file = f"{basename}_consensus.parquet"
+        consensus_file = f"{filename}_consensus.parquet"
         try:
             self.consensus_df.write_parquet(consensus_file)
             exported_files.append(consensus_file)
1276
1362
 
1277
1363
  id_df = get_id(self)
1278
1364
  if id_df is not None and not id_df.is_empty():
1279
- identification_file = f"{basename}_identification.parquet"
1365
+ identification_file = f"{filename}_identification.parquet"
1280
1366
  try:
1281
1367
  id_df.write_parquet(identification_file)
1282
1368
  exported_files.append(identification_file)
@@ -1298,7 +1384,7 @@ def export_parquet(self, basename: str = None) -> None:
     try:
         matrix_df = self.get_consensus_matrix()
         if matrix_df is not None and not matrix_df.is_empty():
-            matrix_file = f"{basename}_matrix.parquet"
+            matrix_file = f"{filename}_matrix.parquet"
             try:
                 matrix_df.write_parquet(matrix_file)
                 exported_files.append(matrix_file)
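Beyond the rename from `basename` to `filename`, behavior is unchanged: the argument is still a base name that receives a per-dataset suffix. A hypothetical call (`study` stands for a loaded Study object) and the files it would produce under the rules in the docstring:

```python
# Hypothetical usage, assuming study.folder == "/data/run":
study.export_parquet("batch_01")
# -> /data/run/batch_01_samples.parquet
# -> /data/run/batch_01_consensus.parquet
# -> /data/run/batch_01_identification.parquet
# -> /data/run/batch_01_matrix.parquet
```

Keeping the parameter named `filename` is arguably a misnomer since it is a stem rather than a full file name, but it makes the signature consistent with `export_mztab` and `export_xlsx`.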
masster/study/h5.py CHANGED
@@ -974,7 +974,7 @@ def _load_dataframe_from_group(
 
     # Second pass: handle missing columns
     for col in missing_columns:
-        logger.warning(f"Column '{col}' not found in {df_name}.")
+        logger.info(f"Column '{col}' not found in {df_name}.")
         # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
@@ -2008,3 +2008,67 @@ def _load_study5(self, filename=None):
     )
 
     self.logger.debug("Study loaded")
+
+
+def _load_ms1(self, sample_path: str) -> pl.DataFrame:
+    """
+    Optimized method to load only MS1 data from a sample5 file for isotope detection.
+
+    This method efficiently loads only the ms1_df from a sample5 HDF5 file without
+    loading other potentially large datasets like features_df, scans_df, etc.
+
+    Args:
+        sample_path (str): Path to the sample5 HDF5 file
+
+    Returns:
+        pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
+            Returns empty DataFrame if no MS1 data found or file cannot be read
+
+    Note:
+        Used by find_iso() for efficient isotope pattern detection without full sample loading
+    """
+    try:
+        with h5py.File(sample_path, "r") as f:
+            # Check if ms1 group exists
+            if "ms1" not in f:
+                self.logger.debug(f"No MS1 data found in {sample_path}")
+                return pl.DataFrame()
+
+            ms1_group = f["ms1"]
+
+            # Load MS1 data efficiently
+            ms1_data = {}
+            for col in ms1_group.keys():
+                ms1_data[col] = ms1_group[col][:]
+
+            if not ms1_data:
+                self.logger.debug(f"Empty MS1 data in {sample_path}")
+                return pl.DataFrame()
+
+            # Create DataFrame with proper schema
+            ms1_df = pl.DataFrame(ms1_data)
+
+            # Apply expected schema for MS1 data
+            expected_schema = {
+                "cycle": pl.Int64,
+                "scan_uid": pl.Int64,
+                "rt": pl.Float64,
+                "mz": pl.Float64,
+                "inty": pl.Float64,
+            }
+
+            # Cast columns to expected types if they exist
+            cast_expressions = []
+            for col, dtype in expected_schema.items():
+                if col in ms1_df.columns:
+                    cast_expressions.append(pl.col(col).cast(dtype))
+
+            if cast_expressions:
+                ms1_df = ms1_df.with_columns(cast_expressions)
+
+            self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {sample_path}")
+            return ms1_df
+
+    except Exception as e:
+        self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+        return pl.DataFrame()
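`_load_ms1` is the new fast path for `find_iso()`: it opens the HDF5 file, reads only the column datasets under the `ms1` group, and never touches the larger feature or scan tables. A standalone sketch of the same access pattern, assuming a sample5 file whose `ms1` group stores one 1-D dataset per column:

```python
import h5py
import polars as pl

def peek_ms1(path: str) -> pl.DataFrame:
    """Read just the ms1 group of a sample5 file into a Polars frame."""
    with h5py.File(path, "r") as f:
        if "ms1" not in f:
            return pl.DataFrame()
        # Each key under /ms1 is one column stored as a 1-D dataset.
        return pl.DataFrame({col: f["ms1"][col][:] for col in f["ms1"].keys()})

# df = peek_ms1("sample.sample5")  # expected columns: cycle, scan_uid, rt, mz, inty
```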
masster/study/helpers.py CHANGED
@@ -509,8 +509,9 @@ def get_consensus(self, quant="chrom_area"):
     # Convert Polars DataFrame to pandas for this operation since the result is used for export
     df1 = self.consensus_df.to_pandas().copy()
 
-    # set consensus_id as uint64
-    df1["consensus_id"] = df1["consensus_id"].astype("uint64")
+    # Keep consensus_id as string (UUID format)
+    # Note: consensus_id is now a 16-character UUID string, not an integer
+    df1["consensus_id"] = df1["consensus_id"].astype("string")
     # set consensus_id as index
     df1.set_index("consensus_uid", inplace=True)
     # sort by consensus_id
@@ -640,7 +641,6 @@ def get_gaps_stats(self, uids=None):
     return gaps_stats
 
 
-# TODO is uid not supposed to be a list anymore?
 def get_consensus_matches(self, uids=None, filled=True):
     """
     Get feature matches for consensus UIDs with optimized join operation.
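Returning to the `consensus_id` cast in the first hunk above: the dtype change follows from the id format change. A 16-character UUID-style `consensus_id` cannot be coerced to `uint64`, so the old cast would raise, while `astype("string")` selects pandas' nullable StringDtype rather than generic `object`. A quick illustration with toy ids:

```python
import pandas as pd

df1 = pd.DataFrame({"consensus_id": ["a3f9c2d8e1b04f67", "0c2d8e1b04f67a3f"]})
df1["consensus_id"] = df1["consensus_id"].astype("string")  # nullable StringDtype
print(df1["consensus_id"].dtype)  # string

# The previous cast fails on non-numeric ids:
# df1["consensus_id"].astype("uint64")  # -> ValueError
```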