masster 0.3.18__py3-none-any.whl → 0.3.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +2 -0
- masster/_version.py +1 -1
- masster/data/libs/README.md +17 -0
- masster/data/libs/ccm.py +533 -0
- masster/data/libs/central_carbon_README.md +17 -0
- masster/data/libs/central_carbon_metabolites.csv +120 -0
- masster/data/libs/urine.py +333 -0
- masster/data/libs/urine_metabolites.csv +51 -0
- masster/sample/h5.py +1 -1
- masster/sample/helpers.py +3 -7
- masster/sample/lib.py +32 -25
- masster/sample/load.py +9 -3
- masster/sample/plot.py +113 -27
- masster/study/export.py +27 -10
- masster/study/h5.py +58 -40
- masster/study/helpers.py +450 -196
- masster/study/helpers_optimized.py +5 -5
- masster/study/load.py +144 -118
- masster/study/plot.py +691 -277
- masster/study/processing.py +9 -5
- masster/study/study.py +6 -6
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/METADATA +1 -1
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/RECORD +31 -25
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- /masster/data/{examples → wiff}/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/WHEEL +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/entry_points.txt +0 -0
- {masster-0.3.18.dist-info → masster-0.3.20.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
@@ -6,7 +6,7 @@ like data retrieval, filtering, compression, and utility functions.
 
 The functions are organized into the following sections:
 1. Chromatogram extraction functions (BPC, TIC, EIC, chrom matrix)
-2. Data retrieval helper functions (get_sample, get_consensus, etc.)
+2. Data retrieval helper functions (get_sample, get_consensus, etc.)
 3. UID helper functions (_get_*_uids)
 4. Data filtering and selection functions
 5. Data compression and restoration functions
@@ -150,9 +150,19 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     # build Chromatogram
     ycol = "inty"
     try:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].to_numpy(),
+            inty=bpc_pd[ycol].to_numpy(),
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
     except Exception:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].values,
+            inty=bpc_pd[ycol].values,
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
 
     return chrom
 
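Note: both `get_bpc` and `get_tic` wrap the `Chromatogram` construction in a try/except that prefers `Series.to_numpy()` and falls back to the legacy `.values` accessor. A minimal standalone sketch of that compatibility pattern (the `series` variable is illustrative):

```python
import pandas as pd

series = pd.Series([1.0, 2.0, 3.0])
try:
    arr = series.to_numpy()  # preferred API since pandas 0.24
except Exception:
    arr = series.values  # legacy accessor kept as a fallback
```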
@@ -204,13 +214,21 @@ def get_tic(owner, sample=None, label=None):
     tic_pd = tic_pd.rename(columns={tic_pd.columns[1]: "inty_tot"})
 
     try:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].to_numpy(),
+            inty=tic_pd["inty_tot"].to_numpy(),
+            label=label or "Total Ion Chromatogram",
+        )
     except Exception:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].values,
+            inty=tic_pd["inty_tot"].values,
+            label=label or "Total Ion Chromatogram",
+        )
 
     return chrom
 
-
+
 def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
@@ -223,7 +241,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
 
     Parameters:
         owner: Study or Sample instance
-        sample: Sample identifier (required if owner is Study)
+        sample: Sample identifier (required if owner is Study)
         mz (float): Target m/z value
         mz_tol (float): m/z tolerance. If None, uses owner.parameters.eic_mz_tol (for Study) or defaults to 0.01
         rt_unit (str): Retention time unit for the chromatogram
@@ -234,7 +252,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     # Use default mz_tol from study parameters if not provided
    if mz_tol is None:
-        if hasattr(owner,
+        if hasattr(owner, "parameters") and hasattr(owner.parameters, "eic_mz_tol"):
            mz_tol = owner.parameters.eic_mz_tol
        else:
            mz_tol = 0.01  # fallback default
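Note: based on the signature and the tolerance fallback shown in this hunk, a hedged usage sketch of `get_eic` (the `study` object and values are illustrative, and the import path assumes the helper is importable from `masster.study.helpers`):

```python
from masster.study.helpers import get_eic

# mz_tol=None falls back to study.parameters.eic_mz_tol, then to 0.01
chrom = get_eic(study, sample="QC_01", mz=180.0634, rt_unit="s")
```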
@@ -267,17 +285,18 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
|
|
|
267
285
|
mz_min = mz - mz_tol
|
|
268
286
|
mz_max = mz + mz_tol
|
|
269
287
|
eic_data = s.ms1_df.filter(
|
|
270
|
-
(pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
|
|
288
|
+
(pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max),
|
|
271
289
|
)
|
|
272
290
|
|
|
273
291
|
if eic_data.is_empty():
|
|
274
292
|
# Return empty chromatogram if no data found
|
|
275
293
|
import numpy as _np
|
|
294
|
+
|
|
276
295
|
return Chromatogram(
|
|
277
|
-
rt=_np.array([0.0]),
|
|
278
|
-
inty=_np.array([0.0]),
|
|
296
|
+
rt=_np.array([0.0]),
|
|
297
|
+
inty=_np.array([0.0]),
|
|
279
298
|
label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
|
|
280
|
-
rt_unit=rt_unit
|
|
299
|
+
rt_unit=rt_unit,
|
|
281
300
|
)
|
|
282
301
|
|
|
283
302
|
# Aggregate intensities per retention time (sum in case of multiple points per rt)
|
|
@@ -290,34 +309,35 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
|
|
|
290
309
|
if eic_pd.empty:
|
|
291
310
|
# Return empty chromatogram if no data found
|
|
292
311
|
import numpy as _np
|
|
312
|
+
|
|
293
313
|
return Chromatogram(
|
|
294
|
-
rt=_np.array([0.0]),
|
|
295
|
-
inty=_np.array([0.0]),
|
|
314
|
+
rt=_np.array([0.0]),
|
|
315
|
+
inty=_np.array([0.0]),
|
|
296
316
|
label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
|
|
297
|
-
rt_unit=rt_unit
|
|
317
|
+
rt_unit=rt_unit,
|
|
298
318
|
)
|
|
299
319
|
|
|
300
320
|
# build Chromatogram
|
|
301
321
|
try:
|
|
302
322
|
chrom = Chromatogram(
|
|
303
|
-
rt=eic_pd["rt"].to_numpy(),
|
|
304
|
-
inty=eic_pd["inty"].to_numpy(),
|
|
323
|
+
rt=eic_pd["rt"].to_numpy(),
|
|
324
|
+
inty=eic_pd["inty"].to_numpy(),
|
|
305
325
|
label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
|
|
306
|
-
rt_unit=rt_unit
|
|
326
|
+
rt_unit=rt_unit,
|
|
307
327
|
)
|
|
308
328
|
except Exception:
|
|
309
329
|
chrom = Chromatogram(
|
|
310
|
-
rt=eic_pd["rt"].values,
|
|
311
|
-
inty=eic_pd["inty"].values,
|
|
330
|
+
rt=eic_pd["rt"].values,
|
|
331
|
+
inty=eic_pd["inty"].values,
|
|
312
332
|
label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
|
|
313
|
-
rt_unit=rt_unit
|
|
333
|
+
rt_unit=rt_unit,
|
|
314
334
|
)
|
|
315
335
|
|
|
316
336
|
return chrom
|
|
317
337
|
|
|
318
338
|
|
|
319
339
|
# =====================================================================================
|
|
320
|
-
# DATA RETRIEVAL AND MATRIX FUNCTIONS
|
|
340
|
+
# DATA RETRIEVAL AND MATRIX FUNCTIONS
|
|
321
341
|
# =====================================================================================
|
|
322
342
|
|
|
323
343
|
|
|
@@ -451,9 +471,9 @@ def align_reset(self):
|
|
|
451
471
|
self.alignment_ref_index = None
|
|
452
472
|
# in self.features_df, set rt equal to rt_original
|
|
453
473
|
self.features_df = self.features_df.with_columns(
|
|
454
|
-
pl.col("rt_original").alias("rt")
|
|
474
|
+
pl.col("rt_original").alias("rt"),
|
|
455
475
|
)
|
|
456
|
-
|
|
476
|
+
|
|
457
477
|
# Ensure column order is maintained after with_columns operation
|
|
458
478
|
self._ensure_features_df_schema_order()
|
|
459
479
|
|
|
@@ -614,7 +634,7 @@ def get_consensus_matches(self, uids=None):
|
|
|
614
634
|
return matches
|
|
615
635
|
|
|
616
636
|
|
|
617
|
-
# =====================================================================================
|
|
637
|
+
# =====================================================================================
|
|
618
638
|
# UID HELPER FUNCTIONS
|
|
619
639
|
# =====================================================================================
|
|
620
640
|
|
|
@@ -796,7 +816,7 @@ def get_sample(self, sample):
|
|
|
796
816
|
return cache[sample_uid]
|
|
797
817
|
|
|
798
818
|
sample_path = row.get("sample_path", None)
|
|
799
|
-
s = Sample(log_level=
|
|
819
|
+
s = Sample(log_level="ERROR")
|
|
800
820
|
try:
|
|
801
821
|
if sample_path:
|
|
802
822
|
try:
|
|
@@ -816,13 +836,13 @@ def get_orphans(self):
|
|
|
816
836
|
Get all features that are not in the consensus mapping.
|
|
817
837
|
"""
|
|
818
838
|
not_in_consensus = self.features_df.filter(
|
|
819
|
-
~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list())
|
|
839
|
+
~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list()),
|
|
820
840
|
)
|
|
821
841
|
return not_in_consensus
|
|
822
842
|
|
|
823
843
|
|
|
824
844
|
# =====================================================================================
|
|
825
|
-
# DATA COMPRESSION AND RESTORATION FUNCTIONS
|
|
845
|
+
# DATA COMPRESSION AND RESTORATION FUNCTIONS
|
|
826
846
|
# =====================================================================================
|
|
827
847
|
|
|
828
848
|
|
|
@@ -878,7 +898,7 @@ def compress_features(self):
|
|
|
878
898
|
|
|
879
899
|
removed_count = initial_count - len(self.features_df)
|
|
880
900
|
self.logger.info(
|
|
881
|
-
f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column"
|
|
901
|
+
f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column",
|
|
882
902
|
)
|
|
883
903
|
|
|
884
904
|
|
|
@@ -949,13 +969,20 @@ def restore_features(self, samples=None, maps=False):
|
|
|
949
969
|
# Load sample to get its features_df
|
|
950
970
|
# Use a direct load call with map=False to prevent feature synchronization
|
|
951
971
|
# which would remove filled features that don't exist in the original FeatureMap
|
|
952
|
-
|
|
972
|
+
# Use ERROR log level to suppress info messages
|
|
973
|
+
sample = Sample(log_level="ERROR")
|
|
953
974
|
sample._load_sample5(sample_path, map=False)
|
|
954
975
|
|
|
955
976
|
if sample.features_df is None or sample.features_df.is_empty():
|
|
956
977
|
self.logger.warning(f"No features found in sample {sample_name}")
|
|
957
978
|
continue
|
|
958
979
|
|
|
980
|
+
# Check which columns are actually available in the sample
|
|
981
|
+
available_columns = [col for col in columns_to_update if col in sample.features_df.columns]
|
|
982
|
+
if not available_columns:
|
|
983
|
+
self.logger.debug(f"No target columns found in sample {sample_name}")
|
|
984
|
+
continue
|
|
985
|
+
|
|
959
986
|
# Create update data for this sample
|
|
960
987
|
updates_made = 0
|
|
961
988
|
for row in sample.features_df.iter_rows(named=True):
|
|
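Note: the new `available_columns` guard in `restore_features` only restores columns that actually exist in the reloaded sample. The same defensive pattern, reduced to a standalone sketch with hypothetical column names:

```python
import polars as pl

columns_to_update = ["chrom", "chrom_coherence"]  # hypothetical targets
sample_df = pl.DataFrame({"feature_id": [1, 2], "chrom": [None, None]})

# Keep only the target columns this sample actually carries.
available_columns = [c for c in columns_to_update if c in sample_df.columns]
if not available_columns:
    print("nothing to restore from this sample")  # mirrors the debug log above
```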
@@ -967,8 +994,8 @@ def restore_features(self, samples=None, maps=False):
             if key in study_feature_mapping:
                 feature_uid = study_feature_mapping[key]
 
-                # Update the
-                for col in
+                # Update only the available columns in study.features_df
+                for col in available_columns:
                     if col in row and col in self.features_df.columns:
                         # Get the original column dtype to preserve it
                         original_dtype = self.features_df[col].dtype
@@ -993,7 +1020,8 @@ def restore_features(self, samples=None, maps=False):
                         )
                         updates_made += 1
 
-
+        if updates_made > 0:
+            self.logger.debug(f"Updated {updates_made} features from sample {sample_name}")
 
     # If maps is True, load featureXML data
     if maps:
@@ -1076,13 +1104,18 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
 
         try:
             # Load sample (with map=False to prevent feature synchronization)
-
+            # Use ERROR log level to suppress info messages
+            sample = Sample(log_level="ERROR")
             sample._load_sample5(sample_path, map=False)
 
             if sample.features_df is None or sample.features_df.is_empty():
                 self.logger.warning(f"No features found in sample {sample_name}")
                 continue
 
+            # Check if chrom column exists in sample
+            if "chrom" not in sample.features_df.columns:
+                continue
+
             # Update chromatograms from this sample
             for row in sample.features_df.iter_rows(named=True):
                 feature_id = row.get("feature_id")
@@ -1119,7 +1152,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     total_chroms = len(self.features_df)
 
     self.logger.debug(
-        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)"
+        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)",
     )
 
     if empty_chroms == 0:
@@ -1163,7 +1196,8 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
 
         try:
             # Load sample for MS1 data extraction
-
+            # Use ERROR log level to suppress info messages
+            sample = Sample(log_level="ERROR")
             sample._load_sample5(sample_path, map=False)
 
             if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
@@ -1249,7 +1283,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     final_total = len(self.features_df)
 
     self.logger.info(
-        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)"
+        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
     )
     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
 
@@ -1290,7 +1324,7 @@ def compress_ms2(self, max_replicates=5):
 
     removed_count = initial_count - len(self.consensus_ms2)
     self.logger.info(
-        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair"
+        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair",
     )
 
 
@@ -1328,14 +1362,14 @@ def compress_chrom(self):
 def sample_name_replace(self, replace_dict):
     """
     Replace sample names in samples_df based on a dictionary mapping.
-
-    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
-    all keys with their corresponding values from replace_dict. Checks that all
+
+    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
+    all keys with their corresponding values from replace_dict. Checks that all
     resulting sample names are unique. If unique, replaces the values in self.samples_df.
 
     Parameters:
         replace_dict (dict): Dictionary mapping old names (keys) to new names (values).
-                             All keys found in sample names will be replaced with their
+                             All keys found in sample names will be replaced with their
                              corresponding values.
                              e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}
 
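Note: a minimal usage sketch taken directly from the docstring above (names are illustrative); the call raises ValueError if the resulting names are not unique:

```python
study.sample_name_replace({"old_name1": "new_name1", "old_name2": "new_name2"})
```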
@@ -1348,22 +1382,22 @@ def sample_name_replace(self, replace_dict):
     """
     if not isinstance(replace_dict, dict):
         raise ValueError("replace_dict must be a dictionary")
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     if not replace_dict:
         self.logger.warning("Empty replace_dict provided, no changes made.")
         return
 
     # Get current sample names
     current_names = self.samples_df.get_column("sample_name").to_list()
-
+
     # Create a copy and apply replacements
     new_names = []
     replaced_count = 0
-
+
     for name in current_names:
         if name in replace_dict:
             new_names.append(replace_dict[name])
@@ -1371,7 +1405,7 @@ def sample_name_replace(self, replace_dict):
             self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
         else:
             new_names.append(name)
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
@@ -1382,19 +1416,19 @@ def sample_name_replace(self, replace_dict):
         else:
             seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully replaced {replaced_count} sample names")
 
 
 def sample_name_reset(self):
     """
     Reset sample names to the basename of sample_path without extensions.
-
+
     Takes all paths in self.samples_df['sample_path'], extracts the basename,
     removes file extensions, and checks that all resulting names are unique.
     If unique, replaces the values in self.samples_df['sample_name'].
@@ -1407,31 +1441,31 @@ def sample_name_reset(self):
         RuntimeError: If any sample_path is None or empty
     """
     import os
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
 
     # Get current sample paths
     sample_paths = self.samples_df.get_column("sample_path").to_list()
-
+
     # Extract basenames without extensions
     new_names = []
-
+
     for i, path in enumerate(sample_paths):
         if path is None or path == "":
             raise RuntimeError(f"Sample at index {i} has no sample_path set")
-
+
         # Get basename and remove extension(s)
         basename = os.path.basename(path)
         # Remove all extensions (handles cases like .tar.gz, .sample5.gz, etc.)
         name_without_ext = basename
-        while
+        while "." in name_without_ext:
             name_without_ext = os.path.splitext(name_without_ext)[0]
-
+
         new_names.append(name_without_ext)
         self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
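Note: the `while "." in name` loop strips every suffix, so a multi-extension file such as `run1.sample5.gz` reduces to `run1`. A standalone sketch (the path is hypothetical):

```python
import os

path = "/data/run1.sample5.gz"
name = os.path.basename(path)
while "." in name:
    name = os.path.splitext(name)[0]
print(name)  # -> "run1"
```

One caveat of this approach: for a dot-leading basename such as `.hidden`, `os.path.splitext` returns the name unchanged, so the loop would not terminate; sample paths here are expected to have ordinary basenames.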
@@ -1442,12 +1476,12 @@ def sample_name_reset(self):
         else:
             seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
 
 
@@ -1704,7 +1738,7 @@ def features_select(
         if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
             min_coherence, max_coherence = chrom_coherence
             filter_conditions.append(
-                (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+                (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
             )
         else:
             filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1717,7 +1751,7 @@ def features_select(
         if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
             min_prominence, max_prominence = chrom_prominence
             filter_conditions.append(
-                (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+                (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
             )
         else:
             filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1731,7 +1765,7 @@ def features_select(
             min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
             filter_conditions.append(
                 (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
-                & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+                & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
             )
         else:
             filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
@@ -1745,7 +1779,7 @@ def features_select(
             min_height_scaled, max_height_scaled = chrom_height_scaled
             filter_conditions.append(
                 (pl.col("chrom_height_scaled") >= min_height_scaled)
-                & (pl.col("chrom_height_scaled") <= max_height_scaled)
+                & (pl.col("chrom_height_scaled") <= max_height_scaled),
             )
         else:
             filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
@@ -1852,7 +1886,7 @@ def features_filter(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features."
+            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
        )
    else:
        self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
@@ -1929,7 +1963,7 @@ def features_delete(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}"
+            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
         )
     else:
         self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
@@ -1994,7 +2028,7 @@ def consensus_select(
     # Filter by m/z
     if mz is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(mz, tuple) and len(mz) == 2:
             # Check if second value is smaller than first (indicating mz, mz_tol format)
             if mz[1] < mz[0]:
@@ -2008,18 +2042,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             # Single float value - use default mz tolerance from study parameters
-            default_mz_tol = getattr(self,
-            if default_mz_tol and hasattr(default_mz_tol,
+            default_mz_tol = getattr(self, "parameters", None)
+            if default_mz_tol and hasattr(default_mz_tol, "eic_mz_tol"):
                 default_mz_tol = default_mz_tol.eic_mz_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_mz_tol = align_defaults().mz_max_diff
-
+
             min_mz = mz - default_mz_tol
             max_mz = mz + default_mz_tol
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
-
+
         self.logger.debug(
             f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
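Note: the tolerance resolution above walks a fallback chain — explicit argument, then `study.parameters`, then `align_defaults()`. The same `getattr`/`hasattr` pattern in isolation (the `study` object is illustrative):

```python
# Resolve an m/z tolerance from optional study parameters, else a default.
params = getattr(study, "parameters", None)
if params is not None and hasattr(params, "eic_mz_tol"):
    mz_tol = params.eic_mz_tol
else:
    mz_tol = 0.01  # stand-in for the align_defaults() fallback
```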
@@ -2027,7 +2062,7 @@ def consensus_select(
     # Filter by retention time
     if rt is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(rt, tuple) and len(rt) == 2:
             # Check if second value is smaller than first (indicating rt, rt_tol format)
             if rt[1] < rt[0]:
@@ -2041,18 +2076,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             # Single float value - use default rt tolerance from study parameters
-            default_rt_tol = getattr(self,
-            if default_rt_tol and hasattr(default_rt_tol,
+            default_rt_tol = getattr(self, "parameters", None)
+            if default_rt_tol and hasattr(default_rt_tol, "eic_rt_tol"):
                 default_rt_tol = default_rt_tol.eic_rt_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_rt_tol = align_defaults().rt_max_diff
-
+
             min_rt = rt - default_rt_tol
             max_rt = rt + default_rt_tol
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
-
+
         self.logger.debug(
             f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2077,7 +2113,7 @@ def consensus_select(
                 # Treat as range
                 min_uid, max_uid = consensus_uid
                 consensus = consensus.filter(
-                    (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid)
+                    (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid),
                 )
             else:
                 # Treat as list
@@ -2105,7 +2141,7 @@ def consensus_select(
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_samples, max_samples = number_samples
             consensus = consensus.filter(
-                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples)
+                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples),
             )
         else:
             consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2163,7 +2199,7 @@ def consensus_select(
             min_coherence, max_coherence = chrom_coherence_mean
             consensus = consensus.filter(
                 (pl.col("chrom_coherence_mean") >= min_coherence)
-                & (pl.col("chrom_coherence_mean") <= max_coherence)
+                & (pl.col("chrom_coherence_mean") <= max_coherence),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
@@ -2181,7 +2217,7 @@ def consensus_select(
             min_prominence, max_prominence = chrom_prominence_mean
             consensus = consensus.filter(
                 (pl.col("chrom_prominence_mean") >= min_prominence)
-                & (pl.col("chrom_prominence_mean") <= max_prominence)
+                & (pl.col("chrom_prominence_mean") <= max_prominence),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
@@ -2199,7 +2235,7 @@ def consensus_select(
             min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
             consensus = consensus.filter(
                 (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
-                & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled)
+                & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
@@ -2217,7 +2253,7 @@ def consensus_select(
             min_height_scaled, max_height_scaled = chrom_height_scaled_mean
             consensus = consensus.filter(
                 (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
-                & (pl.col("chrom_height_scaled_mean") <= max_height_scaled)
+                & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
@@ -2234,7 +2270,7 @@ def consensus_select(
         if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
             min_rt_delta, max_rt_delta = rt_delta_mean
             consensus = consensus.filter(
-                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta)
+                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta),
             )
         else:
             consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2261,10 +2297,10 @@ def consensus_select(
             # Multiple columns
             valid_columns = [col for col in sortby if col in consensus.columns]
             invalid_columns = [col for col in sortby if col not in consensus.columns]
-
+
             if invalid_columns:
                 self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
-
+
             if valid_columns:
                 consensus = consensus.sort(valid_columns, descending=descending)
             else:
@@ -2355,7 +2391,7 @@ def consensus_filter(self, consensus):
 
     removed_consensus_count = initial_consensus_count - len(self.consensus_df)
     self.logger.info(
-        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}"
+        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}",
     )
 
@@ -2485,7 +2521,9 @@ def samples_select(
         if len(sample_batch) == 2 and not isinstance(sample_batch, list):
             # Treat as range
             min_batch, max_batch = sample_batch
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_batch").is_in(sample_batch))
@@ -2501,7 +2539,9 @@ def samples_select(
         if len(sample_sequence) == 2 and not isinstance(sample_sequence, list):
             # Treat as range
             min_seq, max_seq = sample_sequence
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
@@ -2515,7 +2555,9 @@ def samples_select(
     if "num_features" in available_columns:
         if isinstance(num_features, tuple) and len(num_features) == 2:
             min_features, max_features = num_features
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
+            )
         else:
             filter_conditions.append(pl.col("num_features") >= num_features)
     else:
@@ -2572,15 +2614,15 @@ def samples_select(
 def samples_delete(self, samples):
     """
     Delete samples and all related data from the study based on sample identifiers.
-
-    This function eliminates all data related to the specified samples (and their sample_uids)
+
+    This function eliminates all data related to the specified samples (and their sample_uids)
     from all dataframes including:
     - samples_df: Removes the sample rows
     - features_df: Removes all features belonging to these samples
     - consensus_mapping_df: Removes mappings for features from these samples
     - consensus_ms2: Removes MS2 spectra for features from these samples
     - feature_maps: Removes the corresponding feature maps
-
+
     Also updates map_id values to maintain sequential indices after deletion.
 
     Parameters:
@@ -2642,10 +2684,10 @@ def samples_delete(self, samples):
 
     # Get map_ids to remove from feature_maps (needed before samples_df deletion)
     map_ids_to_remove = []
-    if hasattr(self,
+    if hasattr(self, "feature_maps") and self.feature_maps is not None:
         # Get map_ids for samples to be deleted
         map_ids_df = self.samples_df.filter(
-            pl.col("sample_uid").is_in(sample_uids_to_remove)
+            pl.col("sample_uid").is_in(sample_uids_to_remove),
         ).select("map_id")
         if not map_ids_df.is_empty():
             map_ids_to_remove = map_ids_df["map_id"].to_list()
@@ -2683,7 +2725,7 @@ def samples_delete(self, samples):
 
     # 5. Remove from feature_maps and update map_id
     removed_maps_count = 0
-    if hasattr(self,
+    if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
         # Remove feature maps in reverse order to maintain indices
         for map_id in sorted(map_ids_to_remove, reverse=True):
             if 0 <= map_id < len(self.feature_maps):
@@ -2694,7 +2736,7 @@ def samples_delete(self, samples):
     if len(self.samples_df) > 0:
         new_map_ids = list(range(len(self.samples_df)))
         self.samples_df = self.samples_df.with_columns(
-            pl.lit(new_map_ids).alias("map_id")
+            pl.lit(new_map_ids).alias("map_id"),
         )
 
     # Calculate and log results
@@ -2705,16 +2747,16 @@ def samples_delete(self, samples):
     summary_parts = [
         f"Deleted {removed_sample_count} samples",
     ]
-
+
     if removed_features_count > 0:
         summary_parts.append(f"{removed_features_count} features")
-
+
     if removed_mapping_count > 0:
         summary_parts.append(f"{removed_mapping_count} consensus mappings")
-
+
     if removed_ms2_count > 0:
         summary_parts.append(f"{removed_ms2_count} MS2 spectra")
-
+
     if removed_maps_count > 0:
         summary_parts.append(f"{removed_maps_count} feature maps")
 
@@ -2735,14 +2777,14 @@ def samples_delete(self, samples):
|
|
|
2735
2777
|
def sample_color(self, by=None, palette="Turbo256"):
|
|
2736
2778
|
"""
|
|
2737
2779
|
Set sample colors in the sample_color column of samples_df.
|
|
2738
|
-
|
|
2780
|
+
|
|
2739
2781
|
When a new sample is added, this function resets all colors picking from the specified palette.
|
|
2740
2782
|
The default palette is Turbo256.
|
|
2741
2783
|
|
|
2742
2784
|
Parameters:
|
|
2743
2785
|
by (str or list, optional): Property to base colors on. Options:
|
|
2744
2786
|
- 'sample_uid': Use sample_uid values to assign colors
|
|
2745
|
-
- 'sample_index': Use sample index (position) to assign colors
|
|
2787
|
+
- 'sample_index': Use sample index (position) to assign colors
|
|
2746
2788
|
- 'sample_type': Use sample_type values to assign colors
|
|
2747
2789
|
- 'sample_name': Use sample_name values to assign colors
|
|
2748
2790
|
- list of colors: Use provided list of hex color codes
|
|
@@ -2755,7 +2797,7 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2755
2797
|
- 'Magma256': Magma colormap (256 colors, perceptually uniform)
|
|
2756
2798
|
- 'Cividis256': Cividis colormap (256 colors, colorblind-friendly)
|
|
2757
2799
|
- 'Set1': Qualitative palette (9 distinct colors)
|
|
2758
|
-
- 'Set2': Qualitative palette (8 distinct colors)
|
|
2800
|
+
- 'Set2': Qualitative palette (8 distinct colors)
|
|
2759
2801
|
- 'Set3': Qualitative palette (12 distinct colors)
|
|
2760
2802
|
- 'Tab10': Tableau 10 palette (10 distinct colors)
|
|
2761
2803
|
- 'Tab20': Tableau 20 palette (20 distinct colors)
|
|
@@ -2766,7 +2808,7 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2766
2808
|
- 'Coolwarm': Cool-warm diverging colormap
|
|
2767
2809
|
- 'Seismic': Seismic diverging colormap
|
|
2768
2810
|
- Any other colormap name supported by the cmap library
|
|
2769
|
-
|
|
2811
|
+
|
|
2770
2812
|
For a complete catalog of available colormaps, see:
|
|
2771
2813
|
https://cmap-docs.readthedocs.io/en/latest/catalog/
|
|
2772
2814
|
|
|
@@ -2776,10 +2818,10 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2776
2818
|
Example:
|
|
2777
2819
|
# Set colors based on sample type
|
|
2778
2820
|
study.sample_color(by='sample_type', palette='Set1')
|
|
2779
|
-
|
|
2821
|
+
|
|
2780
2822
|
# Set colors using a custom color list
|
|
2781
2823
|
study.sample_color(by=['#FF0000', '#00FF00', '#0000FF'])
|
|
2782
|
-
|
|
2824
|
+
|
|
2783
2825
|
# Reset to default Turbo256 sequential colors
|
|
2784
2826
|
study.sample_color()
|
|
2785
2827
|
"""
|
|
@@ -2788,11 +2830,13 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2788
2830
|
return
|
|
2789
2831
|
|
|
2790
2832
|
sample_count = len(self.samples_df)
|
|
2791
|
-
|
|
2833
|
+
|
|
2792
2834
|
# Handle custom color list
|
|
2793
2835
|
if isinstance(by, list):
|
|
2794
2836
|
if len(by) < sample_count:
|
|
2795
|
-
self.logger.warning(
|
|
2837
|
+
self.logger.warning(
|
|
2838
|
+
f"Provided color list has {len(by)} colors but {sample_count} samples. Repeating colors.",
|
|
2839
|
+
)
|
|
2796
2840
|
# Cycle through the provided colors if there aren't enough
|
|
2797
2841
|
colors = []
|
|
2798
2842
|
for i in range(sample_count):
|
|
@@ -2808,10 +2852,10 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2808
2852
|
except ValueError as e:
|
|
2809
2853
|
self.logger.error(f"Error sampling colors from colormap: {e}")
|
|
2810
2854
|
return
|
|
2811
|
-
|
|
2812
|
-
elif by ==
|
|
2855
|
+
|
|
2856
|
+
elif by == "sample_uid":
|
|
2813
2857
|
# Use sample_uid to determine position in evenly sampled colormap
|
|
2814
|
-
sample_uids = self.samples_df[
|
|
2858
|
+
sample_uids = self.samples_df["sample_uid"].to_list()
|
|
2815
2859
|
try:
|
|
2816
2860
|
# Sample colors evenly for the number of samples
|
|
2817
2861
|
palette_colors = _sample_colors_from_colormap(palette, sample_count)
|
|
@@ -2823,29 +2867,29 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2823
2867
|
except ValueError as e:
|
|
2824
2868
|
self.logger.error(f"Error sampling colors from colormap: {e}")
|
|
2825
2869
|
return
|
|
2826
|
-
|
|
2827
|
-
elif by ==
|
|
2870
|
+
|
|
2871
|
+
elif by == "sample_index":
|
|
2828
2872
|
# Use sample index (position in DataFrame) with evenly sampled colors
|
|
2829
2873
|
try:
|
|
2830
2874
|
colors = _sample_colors_from_colormap(palette, sample_count)
|
|
2831
2875
|
except ValueError as e:
|
|
2832
2876
|
self.logger.error(f"Error sampling colors from colormap: {e}")
|
|
2833
2877
|
return
|
|
2834
|
-
|
|
2835
|
-
elif by ==
|
|
2878
|
+
|
|
2879
|
+
elif by == "sample_type":
|
|
2836
2880
|
# Use sample_type to assign colors - same type gets same color
|
|
2837
2881
|
# Sample colors evenly across colormap for unique types
|
|
2838
|
-
sample_types = self.samples_df[
|
|
2839
|
-
unique_types = list(
|
|
2840
|
-
|
|
2882
|
+
sample_types = self.samples_df["sample_type"].to_list()
|
|
2883
|
+
unique_types = list({t for t in sample_types if t is not None})
|
|
2884
|
+
|
|
2841
2885
|
try:
|
|
2842
2886
|
# Sample colors evenly for unique types
|
|
2843
2887
|
type_colors = _sample_colors_from_colormap(palette, len(unique_types))
|
|
2844
2888
|
type_to_color = {}
|
|
2845
|
-
|
|
2889
|
+
|
|
2846
2890
|
for i, sample_type in enumerate(unique_types):
|
|
2847
2891
|
type_to_color[sample_type] = type_colors[i]
|
|
2848
|
-
|
|
2892
|
+
|
|
2849
2893
|
colors = []
|
|
2850
2894
|
for sample_type in sample_types:
|
|
2851
2895
|
if sample_type is None:
|
|
@@ -2856,21 +2900,21 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2856
2900
|
except ValueError as e:
|
|
2857
2901
|
self.logger.error(f"Error sampling colors from colormap: {e}")
|
|
2858
2902
|
return
|
|
2859
|
-
|
|
2860
|
-
elif by ==
|
|
2903
|
+
|
|
2904
|
+
elif by == "sample_name":
|
|
2861
2905
|
# Use sample_name to assign colors - same name gets same color (unlikely but possible)
|
|
2862
2906
|
# Sample colors evenly across colormap for unique names
|
|
2863
|
-
sample_names = self.samples_df[
|
|
2864
|
-
unique_names = list(
|
|
2865
|
-
|
|
2907
|
+
sample_names = self.samples_df["sample_name"].to_list()
|
|
2908
|
+
unique_names = list({n for n in sample_names if n is not None})
|
|
2909
|
+
|
|
2866
2910
|
try:
|
|
2867
2911
|
# Sample colors evenly for unique names
|
|
2868
2912
|
name_colors = _sample_colors_from_colormap(palette, len(unique_names))
|
|
2869
2913
|
name_to_color = {}
|
|
2870
|
-
|
|
2914
|
+
|
|
2871
2915
|
for i, sample_name in enumerate(unique_names):
|
|
2872
2916
|
name_to_color[sample_name] = name_colors[i]
|
|
2873
|
-
|
|
2917
|
+
|
|
2874
2918
|
colors = []
|
|
2875
2919
|
for sample_name in sample_names:
|
|
2876
2920
|
if sample_name is None:
|
|
@@ -2882,14 +2926,16 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2882
2926
|
self.logger.error(f"Error sampling colors from colormap: {e}")
|
|
2883
2927
|
return
|
|
2884
2928
|
else:
|
|
2885
|
-
self.logger.error(
|
|
2929
|
+
self.logger.error(
|
|
2930
|
+
f"Invalid by value: {by}. Must be 'sample_uid', 'sample_index', 'sample_type', 'sample_name', a list of colors, or None.",
|
|
2931
|
+
)
|
|
2886
2932
|
return
|
|
2887
2933
|
|
|
2888
2934
|
# Update the sample_color column
|
|
2889
2935
|
self.samples_df = self.samples_df.with_columns(
|
|
2890
|
-
pl.Series("sample_color", colors).alias("sample_color")
|
|
2936
|
+
pl.Series("sample_color", colors).alias("sample_color"),
|
|
2891
2937
|
)
|
|
2892
|
-
|
|
2938
|
+
|
|
2893
2939
|
if isinstance(by, list):
|
|
2894
2940
|
self.logger.debug(f"Set sample colors using provided color list ({len(by)} colors)")
|
|
2895
2941
|
elif by is None:
|
|
@@ -2901,28 +2947,28 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
2901
2947
|
def sample_color_reset(self):
|
|
2902
2948
|
"""
|
|
2903
2949
|
Reset sample colors to default coloring using the 'turbo' colormap.
|
|
2904
|
-
|
|
2950
|
+
|
|
2905
2951
|
This function assigns colors by distributing samples evenly across the full
|
|
2906
2952
|
turbo colormap range, ensuring maximum color diversity and visual distinction
|
|
2907
2953
|
between samples.
|
|
2908
|
-
|
|
2954
|
+
|
|
2909
2955
|
Returns:
|
|
2910
2956
|
None (modifies self.samples_df in place)
|
|
2911
2957
|
"""
|
|
2912
2958
|
if self.samples_df is None or len(self.samples_df) == 0:
|
|
2913
2959
|
self.logger.warning("No samples found in study.")
|
|
2914
2960
|
return
|
|
2915
|
-
|
|
2961
|
+
|
|
2916
2962
|
try:
|
|
2917
2963
|
from cmap import Colormap
|
|
2918
|
-
|
|
2964
|
+
|
|
2919
2965
|
# Use turbo colormap
|
|
2920
|
-
cm = Colormap(
|
|
2921
|
-
|
|
2966
|
+
cm = Colormap("turbo")
|
|
2967
|
+
|
|
2922
2968
|
# Get sample count and assign colors evenly distributed across colormap
|
|
2923
2969
|
n_samples = len(self.samples_df)
|
|
2924
2970
|
colors = []
|
|
2925
|
-
|
|
2971
|
+
|
|
2926
2972
|
# Distribute samples evenly across the full colormap range
|
|
2927
2973
|
for i in range(n_samples):
|
|
2928
2974
|
# Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
|
|
@@ -2930,9 +2976,9 @@ def sample_color_reset(self):
|
|
|
2930
2976
|
# Optionally, map to a subset of colormap to avoid extreme colors
|
|
2931
2977
|
# Use 10% to 90% of colormap range for better color diversity
|
|
2932
2978
|
normalized_value = 0.1 + (normalized_value * 0.8)
|
|
2933
|
-
|
|
2979
|
+
|
|
2934
2980
|
color_rgba = cm(normalized_value)
|
|
2935
|
-
|
|
2981
|
+
|
|
2936
2982
|
# Convert RGBA to hex
|
|
2937
2983
|
if len(color_rgba) >= 3:
|
|
2938
2984
|
r, g, b = color_rgba[:3]
|
|
@@ -2941,14 +2987,14 @@ def sample_color_reset(self):
|
|
|
2941
2987
|
r, g, b = int(r * 255), int(g * 255), int(b * 255)
|
|
2942
2988
|
hex_color = f"#{r:02x}{g:02x}{b:02x}"
|
|
2943
2989
|
colors.append(hex_color)
|
|
2944
|
-
|
|
2990
|
+
|
|
2945
2991
|
# Update the sample_color column
|
|
2946
2992
|
self.samples_df = self.samples_df.with_columns(
|
|
2947
|
-
pl.Series("sample_color", colors).alias("sample_color")
|
|
2993
|
+
pl.Series("sample_color", colors).alias("sample_color"),
|
|
2948
2994
|
)
|
|
2949
|
-
|
|
2995
|
+
|
|
2950
2996
|
self.logger.debug(f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)")
|
|
2951
|
-
|
|
2997
|
+
|
|
2952
2998
|
except ImportError:
|
|
2953
2999
|
self.logger.error("cmap library is required for sample color reset. Install with: pip install cmap")
|
|
2954
3000
|
except Exception as e:
|
|
@@ -2958,13 +3004,13 @@ def sample_color_reset(self):
 def _get_color_palette(palette_name):
     """
     Get color palette as a list of hex color codes using the cmap library.
-
+
     Parameters:
         palette_name (str): Name of the palette
-
+
     Returns:
         list: List of hex color codes
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -2972,40 +3018,38 @@ def _get_color_palette(palette_name):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         # Determine number of colors to generate
         if "256" in palette_name:
             n_colors = 256
@@ -3021,7 +3065,7 @@ def _get_color_palette(palette_name):
             n_colors = 20
         else:
             n_colors = 256  # Default for continuous colormaps
-
+
         # Generate colors
         if n_colors <= 20:
             # For discrete palettes, use evenly spaced indices
@@ -3029,11 +3073,11 @@ def _get_color_palette(palette_name):
         else:
             # For continuous palettes, use full range
             indices = [i / (n_colors - 1) for i in range(n_colors)]
-
+
         # Get colors as RGBA and convert to hex
         colors = cm(indices)
         hex_colors = []
-
+
         for color in colors:
             if len(color) >= 3:  # RGBA or RGB
                 r, g, b = color[:3]
@@ -3042,25 +3086,26 @@ def _get_color_palette(palette_name):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 hex_colors.append(hex_color)
-
+
         return hex_colors
-
+
     except Exception as e:
-        raise ValueError(
-
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _sample_colors_from_colormap(palette_name, n_colors):
     """
     Sample colors evenly from the whole colormap range, similar to sample_color_reset.
-
+
     Parameters:
         palette_name (str): Name of the palette/colormap
         n_colors (int): Number of colors to sample
-
+
     Returns:
         list: List of hex color codes sampled evenly from the colormap
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -3068,51 +3113,49 @@ def _sample_colors_from_colormap(palette_name, n_colors):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names (same as _get_color_palette)
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         colors = []
-
+
         # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
         for i in range(n_colors):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
             normalized_value = (i + 0.5) / n_colors  # +0.5 to center samples in their bins
             # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
@@ -3121,12 +3164,13 @@ def _sample_colors_from_colormap(palette_name, n_colors):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 colors.append(hex_color)
-
+
         return colors
-
+
     except Exception as e:
-        raise ValueError(
-
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
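Note: `_sample_colors_from_colormap` centers each of the n samples in its own bin of [0, 1] and then compresses the positions into the 10%-90% band to avoid near-white/near-black endpoints. The same arithmetic and RGBA-to-hex conversion as a self-contained sketch, using a grayscale ramp in place of the cmap dependency:

```python
def sample_positions(n_colors):
    # Center each sample in its bin, then map into the 10%-90% band.
    return [0.1 + ((i + 0.5) / n_colors) * 0.8 for i in range(n_colors)]

def to_hex(rgba):
    # Scale 0-1 floats to 0-255 ints and format as #rrggbb.
    r, g, b = (int(c * 255) for c in rgba[:3])
    return f"#{r:02x}{g:02x}{b:02x}"

gray = lambda v: (v, v, v, 1.0)  # stand-in for cmap's Colormap(name)
print([to_hex(gray(v)) for v in sample_positions(4)])
# four evenly spaced grays drawn from the 10%-90% band
```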
@@ -3135,32 +3179,32 @@ def _matplotlib_to_hex(color_dict):
 
 
 # =====================================================================================
-# SCHEMA AND DATA STRUCTURE FUNCTIONS
+# SCHEMA AND DATA STRUCTURE FUNCTIONS
 # =====================================================================================
 
 
 def _ensure_features_df_schema_order(self):
     """
     Ensure features_df columns are ordered according to study5_schema.json.
-
+
     This method should be called after operations that might scramble the column order.
     """
     if self.features_df is None or self.features_df.is_empty():
         return
-
+
     try:
         import os
         import json
         from masster.study.h5 import _reorder_columns_by_schema
-
+
         # Load schema
         schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
-        with open(schema_path
+        with open(schema_path) as f:
             schema = json.load(f)
-
+
         # Reorder columns to match schema
-        self.features_df = _reorder_columns_by_schema(self.features_df, schema,
-
+        self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")
+
     except Exception as e:
         self.logger.warning(f"Failed to reorder features_df columns: {e}")
 
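`_ensure_features_df_schema_order` delegates the actual ordering to `_reorder_columns_by_schema` from masster.study.h5, which this diff does not show. As a rough, self-contained illustration of schema-driven column ordering in polars (the function name and behavior here are assumptions for illustration, not masster's implementation):

```python
# Hypothetical sketch of schema-driven column reordering in polars.
import polars as pl

def reorder_columns_by_schema(df: pl.DataFrame, schema_cols: list[str]) -> pl.DataFrame:
    # Schema-listed columns come first, in schema order; any extra columns
    # keep their current relative order at the end.
    ordered = [c for c in schema_cols if c in df.columns]
    extras = [c for c in df.columns if c not in schema_cols]
    return df.select(ordered + extras)

df = pl.DataFrame({"rt": [12.3], "feature_uid": [7], "mz": [150.1]})
print(reorder_columns_by_schema(df, ["feature_uid", "mz", "rt"]).columns)
# ['feature_uid', 'mz', 'rt']
```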
@@ -3168,38 +3212,248 @@ def _ensure_features_df_schema_order(self):
 def migrate_map_id_to_index(self):
     """
     Migrate map_id from string-based OpenMS unique IDs to integer indices.
-
+
     This function converts the map_id column from string type (with OpenMS unique IDs)
     to integer type where each map_id corresponds to the index of the feature map
     in self.features_maps.
-
+
     This migration is needed for studies that were created before the map_id format
     change from OpenMS unique IDs to feature map indices.
     """
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.warning("No samples to migrate")
         return
-
+
     # Check if migration is needed
-    current_dtype = self.samples_df[
+    current_dtype = self.samples_df["map_id"].dtype
     if current_dtype == pl.Int64:
         self.logger.info("map_id column is already Int64 type - no migration needed")
         return
-
+
     self.logger.info("Migrating map_id from string-based OpenMS IDs to integer indices")
-
+
     # Create new map_id values based on sample order
     # Each sample gets a map_id that corresponds to its position in features_maps
     sample_count = len(self.samples_df)
     new_map_ids = list(range(sample_count))
-
+
     # Update the map_id column
     self.samples_df = self.samples_df.with_columns(
-        pl.lit(new_map_ids).alias("map_id")
+        pl.lit(new_map_ids).alias("map_id"),
     )
-
+
     # Ensure the column is Int64 type
     self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
-
+
     self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
     self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")
+
+
+def restore_ms2(self, samples=None, **kwargs):
+    """
+    Restore MS2 data by re-running find_ms2 on specified samples.
+
+    This function rebuilds the consensus_ms2 DataFrame by re-extracting MS2 spectra
+    from the original sample files. Use this to reverse the effects of compress_ms2().
+
+    Parameters:
+        samples (list, optional): List of sample_uids or sample_names to process.
+            If None, processes all samples.
+        **kwargs: Additional keyword arguments passed to find_ms2()
+            (e.g., mz_tol, centroid, deisotope, etc.)
+    """
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.error("No features_df found in study.")
+        return
+
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.error("No samples_df found in study.")
+        return
+
+    # Get sample_uids to process
+    sample_uids = self._get_sample_uids(samples)
+    if not sample_uids:
+        self.logger.warning("No valid samples specified.")
+        return
+
+    self.logger.info(f"Restoring MS2 data from {len(sample_uids)} samples...")
+
+    # Clear existing consensus_ms2 to rebuild from scratch
+    initial_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+    self.consensus_ms2 = pl.DataFrame()
+
+    # Re-run find_ms2 which will rebuild consensus_ms2
+    try:
+        self.find_ms2(**kwargs)
+
+        final_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+
+        self.logger.info(f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra")
+
+    except Exception as e:
+        self.logger.error(f"Failed to restore MS2 data: {e}")
+        raise
+
+
+def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs):
+    """
+    Reverse any compression effects by restoring compressed data adaptively.
+
+    This function restores data that was compressed using compress(), compress_features(),
+    compress_ms2(), compress_chrom(), or study.save(compress=True). It optimizes the
+    decompression process for speed by only processing what actually needs restoration.
+
+    Parameters:
+        features (bool): Restore features data (ms2_specs, ms2_scans, chrom_area)
+        ms2 (bool): Restore MS2 spectra by re-running find_ms2()
+        chrom (bool): Restore chromatogram objects
+        samples (list, optional): List of sample_uids or sample_names to process.
+            If None, processes all samples.
+        **kwargs: Additional keyword arguments for restoration functions:
+            - For restore_chrom: mz_tol (default: 0.010), rt_tol (default: 10.0)
+            - For restore_ms2/find_ms2: mz_tol, centroid, deisotope, etc.
+
+    Performance Optimizations:
+        - Adaptive processing: Only restores what actually needs restoration
+        - Processes features and chromatograms together when possible (shared file I/O)
+        - Uses cached sample instances to avoid repeated file loading
+        - Processes MS2 restoration last as it's the most computationally expensive
+        - Provides detailed progress information for long-running operations
+
+    Example:
+        # Restore everything (but only what needs restoration)
+        study.decompress()
+
+        # Restore only chromatograms with custom tolerances
+        study.decompress(features=False, ms2=False, chrom=True, mz_tol=0.005, rt_tol=5.0)
+
+        # Restore specific samples only
+        study.decompress(samples=["sample1", "sample2"])
+    """
+    if not any([features, ms2, chrom]):
+        self.logger.warning("No decompression operations specified.")
+        return
+
+    # Get sample_uids to process
+    sample_uids = self._get_sample_uids(samples)
+    if not sample_uids:
+        self.logger.warning("No valid samples specified.")
+        return
+
+    # Adaptively check what actually needs to be done
+    import polars as pl
+
+    # Check if features need restoration (more sophisticated logic)
+    features_need_restoration = False
+    if features and not self.features_df.is_empty():
+        # Check for completely missing columns that should exist after feature processing
+        missing_cols = []
+        for col in ["ms2_scans", "ms2_specs"]:
+            if col not in self.features_df.columns:
+                missing_cols.append(col)
+
+        # If columns are missing entirely, we likely need restoration
+        if missing_cols:
+            features_need_restoration = True
+        else:
+            # If columns exist, check if they're mostly null (indicating compression)
+            # But be smart about it - only check if we have consensus features with MS2
+            if not self.consensus_ms2.is_empty():
+                # We have MS2 data, so ms2_specs should have some content
+                null_ms2_specs = self.features_df.filter(pl.col("ms2_specs").is_null()).height
+                total_features = len(self.features_df)
+                # If more than 90% are null but we have MS2 data, likely compressed
+                if null_ms2_specs > (total_features * 0.9):
+                    features_need_restoration = True
+
+    # Check if chromatograms need restoration
+    chrom_need_restoration = False
+    if chrom and not self.features_df.is_empty():
+        if "chrom" not in self.features_df.columns:
+            # Column completely missing
+            chrom_need_restoration = True
+        else:
+            null_chroms = self.features_df.filter(pl.col("chrom").is_null()).height
+            total_features = len(self.features_df)
+            # If more than 50% are null, likely need restoration
+            chrom_need_restoration = null_chroms > (total_features * 0.5)
+
+    # Check if MS2 data might need restoration (compare expected vs actual)
+    ms2_need_restoration = False
+    if ms2:
+        current_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+        consensus_count = len(self.consensus_df) if not self.consensus_df.is_empty() else 0
+
+        if consensus_count > 0:
+            # Calculate expected MS2 count based on consensus features with MS2 potential
+            # This is a heuristic - if we have very few MS2 compared to consensus, likely compressed
+            expected_ratio = 3.0  # Expect at least 3 MS2 per consensus on average
+            expected_ms2 = consensus_count * expected_ratio
+
+            if current_ms2_count < min(expected_ms2 * 0.3, consensus_count * 0.8):
+                ms2_need_restoration = True
+
+    # Build list of operations that actually need to be done
+    operations_needed = []
+    if features and features_need_restoration:
+        operations_needed.append("features")
+    if chrom and chrom_need_restoration:
+        operations_needed.append("chromatograms")
+    if ms2 and ms2_need_restoration:
+        operations_needed.append("MS2 spectra")
+
+    # Early exit if nothing needs to be done
+    if not operations_needed:
+        self.logger.info("All data appears to be already decompressed. No operations needed.")
+        return
+
+    self.logger.info(f"Starting adaptive decompression: {', '.join(operations_needed)} from {len(sample_uids)} samples")
+
+    try:
+        # Phase 1: Restore features and chromatograms together (shared file I/O)
+        if ("features" in operations_needed and "chromatograms" in operations_needed):
+            self.logger.info("Phase 1: Restoring features and chromatograms together...")
+
+            # Extract relevant kwargs for restore_features and restore_chrom
+            restore_kwargs = {}
+            if 'mz_tol' in kwargs:
+                restore_kwargs['mz_tol'] = kwargs['mz_tol']
+            if 'rt_tol' in kwargs:
+                restore_kwargs['rt_tol'] = kwargs['rt_tol']
+
+            # Restore features first (includes chrom column)
+            self.restore_features(samples=samples)
+
+            # Then do additional chrom gap-filling if needed
+            self.restore_chrom(samples=samples, **restore_kwargs)
+
+        elif ("features" in operations_needed and "chromatograms" not in operations_needed):
+            self.logger.info("Phase 1: Restoring features data...")
+            self.restore_features(samples=samples)
+
+        elif ("chromatograms" in operations_needed and "features" not in operations_needed):
+            self.logger.info("Phase 1: Restoring chromatograms...")
+            restore_kwargs = {}
+            if 'mz_tol' in kwargs:
+                restore_kwargs['mz_tol'] = kwargs['mz_tol']
+            if 'rt_tol' in kwargs:
+                restore_kwargs['rt_tol'] = kwargs['rt_tol']
+            self.restore_chrom(samples=samples, **restore_kwargs)
+
+        # Phase 2: Restore MS2 data (most computationally expensive, done last)
+        if "MS2 spectra" in operations_needed:
+            self.logger.info("Phase 2: Restoring MS2 spectra...")
+
+            # Extract MS2-specific kwargs
+            ms2_kwargs = {}
+            for key, value in kwargs.items():
+                if key in ['mz_tol', 'centroid', 'deisotope', 'dia_stats', 'feature_uid']:
+                    ms2_kwargs[key] = value
+
+            self.restore_ms2(samples=samples, **ms2_kwargs)
+
+        self.logger.info("Adaptive decompression completed successfully")
+
+    except Exception as e:
+        self.logger.error(f"Decompression failed: {e}")
+        raise