masster 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged as potentially problematic.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +1 -1
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
- masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
- masster/data/libs/__pycache__/ccm.cpython-312.pyc +0 -0
- masster/data/libs/__pycache__/urine.cpython-312.pyc +0 -0
- masster/data/libs/ccm.csv +120 -0
- masster/data/libs/urine.csv +4693 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- masster/logger.py +11 -11
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +338 -264
- masster/sample/defaults/find_adducts_def.py +21 -8
- masster/sample/h5.py +561 -282
- masster/sample/helpers.py +131 -75
- masster/sample/lib.py +4 -4
- masster/sample/load.py +31 -17
- masster/sample/parameters.py +1 -1
- masster/sample/plot.py +7 -7
- masster/sample/processing.py +117 -87
- masster/sample/sample.py +103 -90
- masster/sample/sample5_schema.json +196 -0
- masster/sample/save.py +35 -12
- masster/spectrum.py +1 -1
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +5 -1
- masster/study/defaults/identify_def.py +3 -1
- masster/study/defaults/study_def.py +58 -25
- masster/study/export.py +360 -210
- masster/study/h5.py +560 -158
- masster/study/helpers.py +496 -203
- masster/study/helpers_optimized.py +1 -1
- masster/study/id.py +538 -349
- masster/study/load.py +233 -143
- masster/study/plot.py +71 -71
- masster/study/processing.py +456 -254
- masster/study/save.py +15 -5
- masster/study/study.py +213 -131
- masster/study/study5_schema.json +360 -0
- masster-0.4.5.dist-info/METADATA +131 -0
- masster-0.4.5.dist-info/RECORD +71 -0
- masster-0.4.3.dist-info/METADATA +0 -791
- masster-0.4.3.dist-info/RECORD +0 -56
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
masster/study/helpers.py
CHANGED
@@ -22,7 +22,7 @@ import pandas as pd
 import polars as pl
 
 from tqdm import tqdm
-from
+from masster.chromatogram import Chromatogram
 
 
 # =====================================================================================
@@ -71,7 +71,12 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     # fallback to pandas
     try:
         bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
-        bpc_pd =
+        bpc_pd = (
+            bpc_pd.groupby("rt")
+            .agg({"inty": "max"})
+            .reset_index()
+            .sort_values("rt")
+        )
     except Exception:
         raise
 
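The reflowed fallback above reduces raw MS1 points to a base peak chromatogram: one maximum intensity per retention time. A minimal standalone sketch of the same pandas pattern; the rt/inty column names match the diff, and the values are invented toy data:

import pandas as pd

# Toy MS1 table: several intensity readings per retention time (invented values).
ms1 = pd.DataFrame({
    "rt":   [0.1, 0.1, 0.2, 0.2, 0.3],
    "inty": [100.0, 250.0, 80.0, 90.0, 300.0],
})

# Base peak chromatogram: keep the maximum intensity at each rt, restore rt
# as a column, and sort so the trace runs monotonically in time.
bpc = (
    ms1.groupby("rt")
    .agg({"inty": "max"})
    .reset_index()
    .sort_values("rt")
)
print(bpc)  # rt 0.1 -> 250.0, rt 0.2 -> 90.0, rt 0.3 -> 300.0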
@@ -113,11 +118,16 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     mapping_rows = pl.DataFrame()
 
     # If we still have no sample selector, try to infer sample from the Sample object s
-    if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(
+    if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(
+        s,
+        "sample_path",
+    ):
         # attempt to match by sample_path or file name
         try:
             # find row where sample_path matches
-            mapping_rows = feats.filter(
+            mapping_rows = feats.filter(
+                pl.col("sample_path") == getattr(s, "file", None),
+            )
         except Exception:
             mapping_rows = pl.DataFrame()
 
@@ -204,7 +214,9 @@ def get_tic(owner, sample=None, label=None):
     except Exception:
         raise
     else:
-        raise ValueError(
+        raise ValueError(
+            "Neither ms1_df nor scans_df available for TIC computation",
+        )
 
     if tic_pd.empty:
         raise ValueError("Computed TIC is empty")
@@ -367,14 +379,17 @@ def get_chrom(self, uids=None, samples=None):
     )
     # Pre-filter features_df to only relevant features and samples
     filtered_features = self.features_df.filter(
-        pl.col("feature_uid").is_in(relevant_feature_uids)
-
-
-
-
-
-
-
+        pl.col("feature_uid").is_in(relevant_feature_uids)
+        & pl.col("sample_uid").is_in(sample_uids),
+    ).select(
+        [
+            "feature_uid",
+            "chrom",
+            "rt",
+            "rt_original",
+            "sample_uid",
+        ],
+    )
 
     # Pre-filter samples_df
     filtered_samples = self.samples_df.filter(
@@ -409,11 +424,13 @@ def get_chrom(self, uids=None, samples=None):
     # Create a mapping dictionary for O(1) lookup instead of O(n) filtering
     self.logger.debug("Creating lookup dictionary for chromatogram objects.")
     chrom_lookup = {}
-    for row in df_combined.select(
-
-
-
-
+    for row in df_combined.select(
+        [
+            "consensus_uid",
+            "sample_name",
+            "chrom",
+        ],
+    ).iter_rows():
         key = (row[0], row[1])  # (consensus_uid, sample_name)
         chrom_lookup[key] = row[2]  # chrom object
 
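The two hunks above pair one combined filter/select pass over features_df with a tuple-keyed dictionary built from iter_rows(), trading a per-query O(n) scan for O(1) lookups. A self-contained sketch of that pattern; the frame is toy data and strings stand in for Chromatogram objects:

import polars as pl

# Toy feature table; strings stand in for Chromatogram objects.
features = pl.DataFrame({
    "feature_uid": [1, 2, 3, 4],
    "sample_uid":  [10, 10, 11, 12],
    "chrom":       ["c1", "c2", "c3", "c4"],
})

relevant_feature_uids = [1, 2, 3]
sample_uids = [10, 11]

# One combined predicate plus a narrow select, as in the reflowed call above.
filtered = features.filter(
    pl.col("feature_uid").is_in(relevant_feature_uids)
    & pl.col("sample_uid").is_in(sample_uids),
).select(["feature_uid", "sample_uid", "chrom"])

# Tuple-keyed dict: O(1) lookups instead of filtering the frame per query.
chrom_lookup = {(row[0], row[1]): row[2] for row in filtered.iter_rows()}
print(chrom_lookup[(2, 10)])  # "c2"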
@@ -532,7 +549,9 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
     # Build consensus matrix directly using the consensus_mapping_df
     matrix_dict = {}
-    sample_mapping = dict(
+    sample_mapping = dict(
+        self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
+    )
 
     for row in self.consensus_mapping_df.iter_rows(named=True):
         consensus_uid = row["consensus_uid"]
@@ -550,7 +569,10 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
     # Take max if multiple features map to same consensus/sample combination
     if sample_name in matrix_dict[consensus_uid]:
-        matrix_dict[consensus_uid][sample_name] = max(
+        matrix_dict[consensus_uid][sample_name] = max(
+            matrix_dict[consensus_uid][sample_name],
+            value,
+        )
     else:
         matrix_dict[consensus_uid][sample_name] = value
 
@@ -569,10 +591,12 @@ def get_consensus_matrix(self, quant="chrom_area"):
 
     # Fill null values with 0 and round numeric columns
     numeric_cols = [col for col in df2.columns if col != "consensus_uid"]
-    df2 = df2.with_columns(
-
-
-
+    df2 = df2.with_columns(
+        [
+            pl.col("consensus_uid").cast(pl.UInt64),
+            *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
+        ],
+    )
 
     return df2
 
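The last hunk cleans the whole matrix in a single with_columns call: cast the key column, then null-fill and round every quantity column by splatting generated expressions into one list. A runnable sketch on invented data:

import polars as pl

df2 = pl.DataFrame({
    "consensus_uid": [1, 2, 3],
    "sampleA": [10.4, None, 7.6],
    "sampleB": [None, 3.2, 8.9],
})

numeric_cols = [col for col in df2.columns if col != "consensus_uid"]

# Single with_columns pass: cast the id column, then null-fill and round
# every quantity column, splatting the generated expressions into the list.
df2 = df2.with_columns(
    [
        pl.col("consensus_uid").cast(pl.UInt64),
        *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
    ],
)
print(df2)  # nulls become 0.0; values rounded to whole numbers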
@@ -792,7 +816,7 @@ def get_sample(self, sample):
 
     This helper mirrors the original Study.get_sample method but lives in helpers for reuse.
     """
-    from
+    from masster.sample.sample import Sample
 
     if isinstance(sample, Sample):
         return sample
@@ -802,7 +826,9 @@ def get_sample(self, sample):
     elif isinstance(sample, str):
         rows = self.samples_df.filter(pl.col("sample_name") == sample)
     else:
-        raise ValueError(
+        raise ValueError(
+            "sample must be an int (sample_uid), str (sample_name) or a Sample instance",
+        )
 
     if rows.is_empty():
         raise KeyError(f"Sample not found: {sample}")
@@ -836,7 +862,9 @@ def get_orphans(self):
     Get all features that are not in the consensus mapping.
     """
     not_in_consensus = self.features_df.filter(
-        ~self.features_df["feature_uid"].is_in(
+        ~self.features_df["feature_uid"].is_in(
+            self.consensus_mapping_df["feature_uid"].to_list(),
+        ),
     )
     return not_in_consensus
 
@@ -914,7 +942,7 @@ def restore_features(self, samples=None, maps=False):
     maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
     """
     import datetime
-    from
+    from masster.sample.sample import Sample
 
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
@@ -934,7 +962,9 @@ def restore_features(self, samples=None, maps=False):
     # Columns to update from sample data
     columns_to_update = ["chrom", "chrom_area", "ms2_scans", "ms2_specs"]
 
-    self.logger.info(
+    self.logger.info(
+        f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...",
+    )
 
     # Create a mapping of (sample_uid, feature_id) to feature_uid from study.features_df
     study_feature_mapping = {}
@@ -954,7 +984,9 @@ def restore_features(self, samples=None, maps=False):
     # Get sample info
     sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
     if sample_row.is_empty():
-        self.logger.warning(
+        self.logger.warning(
+            f"Sample with uid {sample_uid} not found in samples_df.",
+        )
         continue
 
     sample_info = sample_row.row(0, named=True)
@@ -962,7 +994,9 @@ def restore_features(self, samples=None, maps=False):
     sample_name = sample_info.get("sample_name")
 
     if not sample_path or not os.path.exists(sample_path):
-        self.logger.warning(
+        self.logger.warning(
+            f"Sample file not found for {sample_name}: {sample_path}",
+        )
         continue
 
     try:
@@ -978,7 +1012,9 @@ def restore_features(self, samples=None, maps=False):
         continue
 
     # Check which columns are actually available in the sample
-    available_columns = [
+    available_columns = [
+        col for col in columns_to_update if col in sample.features_df.columns
+    ]
     if not available_columns:
         self.logger.debug(f"No target columns found in sample {sample_name}")
         continue
@@ -1001,13 +1037,21 @@ def restore_features(self, samples=None, maps=False):
     original_dtype = self.features_df[col].dtype
 
     # Update the specific row and column, preserving dtype
-    mask = (pl.col("feature_uid") == feature_uid) & (
+    mask = (pl.col("feature_uid") == feature_uid) & (
+        pl.col("sample_uid") == sample_uid
+    )
 
     # Handle object columns (like Chromatogram) differently
     if original_dtype == pl.Object:
         self.features_df = self.features_df.with_columns(
             pl.when(mask)
-            .then(
+            .then(
+                pl.lit(
+                    row[col],
+                    dtype=original_dtype,
+                    allow_object=True,
+                ),
+            )
             .otherwise(pl.col(col))
             .alias(col),
         )
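The hunk above rewrites one cell of an Object-typed column via pl.when(mask).then(pl.lit(..., allow_object=True)).otherwise(...). A minimal sketch of the same call pattern, using a plain dict as a stand-in for a Chromatogram; Object-dtype handling like this assumes a reasonably recent polars:

import polars as pl

df = pl.DataFrame({"feature_uid": [1, 2], "sample_uid": [10, 10]})
# Object column holding arbitrary Python values (None placeholders here).
df = df.with_columns(pl.Series("chrom", [None, None], dtype=pl.Object))

new_chrom = {"rt": [0.1, 0.2], "inty": [5.0, 9.0]}  # stand-in for a Chromatogram

mask = (pl.col("feature_uid") == 1) & (pl.col("sample_uid") == 10)

# allow_object=True lets pl.lit wrap a non-primitive Python value, so the
# conditional update keeps the column's Object dtype intact.
df = df.with_columns(
    pl.when(mask)
    .then(pl.lit(new_chrom, dtype=pl.Object, allow_object=True))
    .otherwise(pl.col("chrom"))
    .alias("chrom"),
)
print(df)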
@@ -1021,7 +1065,9 @@ def restore_features(self, samples=None, maps=False):
             updates_made += 1
 
     if updates_made > 0:
-        self.logger.debug(
+        self.logger.debug(
+            f"Updated {updates_made} features from sample {sample_name}",
+        )
 
     # If maps is True, load featureXML data
     if maps:
@@ -1032,7 +1078,9 @@ def restore_features(self, samples=None, maps=False):
         self.logger.error(f"Failed to load sample {sample_name}: {e}")
         continue
 
-    self.logger.info(
+    self.logger.info(
+        f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples",
+    )
 
 
 def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
@@ -1052,8 +1100,8 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     """
     import datetime
    import numpy as np
-    from
-    from
+    from masster.sample.sample import Sample
+    from masster.chromatogram import Chromatogram
 
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
@@ -1129,7 +1177,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     feature_uid = study_feature_mapping[key]
 
     # Update only the chrom column
-    mask = (pl.col("feature_uid") == feature_uid) & (
+    mask = (pl.col("feature_uid") == feature_uid) & (
+        pl.col("sample_uid") == sample_uid
+    )
     self.features_df = self.features_df.with_columns(
         pl.when(mask)
         .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
@@ -1142,7 +1192,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
         self.logger.error(f"Failed to load sample {sample_name}: {e}")
         continue
 
-    self.logger.info(
+    self.logger.info(
+        f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files",
+    )
 
     # Phase 2: Gap-fill remaining empty chromatograms (like fill_chrom)
     self.logger.info("Phase 2: Gap-filling remaining empty chromatograms...")
@@ -1156,7 +1208,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     )
 
     if empty_chroms == 0:
-        self.logger.info(
+        self.logger.info(
+            "All chromatograms restored from .sample5 files. No gap-filling needed.",
+        )
         return
 
     # Get consensus info for gap filling
@@ -1200,7 +1254,11 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     sample = Sample(log_level="ERROR")
     sample._load_sample5(sample_path, map=False)
 
-    if
+    if (
+        not hasattr(sample, "ms1_df")
+        or sample.ms1_df is None
+        or sample.ms1_df.is_empty()
+    ):
         continue
 
     # Process each missing feature
@@ -1285,7 +1343,9 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     self.logger.info(
         f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
     )
-    self.logger.info(
+    self.logger.info(
+        f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}",
+    )
 
 
 def compress_ms2(self, max_replicates=5):
@@ -1305,17 +1365,28 @@ def compress_ms2(self, max_replicates=5):
 
     # Create a ranking score based on number_frags * prec_inty
     # Handle None values by treating them as 0
-    self.consensus_ms2 = self.consensus_ms2.with_columns(
-
-
+    self.consensus_ms2 = self.consensus_ms2.with_columns(
+        [
+            (
+                pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)
+            ).alias("ranking_score"),
+        ],
+    )
 
     # Group by consensus_uid and energy, then rank by score and keep top max_replicates
     compressed_ms2 = (
-        self.consensus_ms2.with_row_count(
-
-
-
-
+        self.consensus_ms2.with_row_count(
+            "row_id",
+        )  # Add row numbers for stable sorting
+        .sort(
+            ["consensus_uid", "energy", "ranking_score", "row_id"],
+            descending=[False, False, True, False],
+        )
+        .with_columns(
+            [
+                pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
+            ],
+        )
         .filter(pl.col("rank") < max_replicates)
         .drop(["ranking_score", "row_id", "rank"])
     )
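compress_ms2 above is a top-k-per-group pattern: score, sort with a stable tie-break, number the rows inside each (consensus_uid, energy) group, and keep ranks below max_replicates. A standalone sketch on toy data; it assumes a polars version with pl.int_range/pl.len, and with_row_count is the older spelling of with_row_index, as in the diff:

import polars as pl

ms2 = pl.DataFrame({
    "consensus_uid": [1, 1, 1, 2, 2],
    "energy":        [20, 20, 20, 20, 20],
    "number_frags":  [5, None, 12, 3, 8],
    "prec_inty":     [1e4, 2e4, 5e3, None, 1e3],
})
max_replicates = 2

compressed = (
    ms2.with_columns(
        (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0))
        .alias("ranking_score"),
    )
    .with_row_count("row_id")  # stable tie-break for equal scores
    .sort(
        ["consensus_uid", "energy", "ranking_score", "row_id"],
        descending=[False, False, True, False],
    )
    # 0-based position within each sorted group is the rank.
    .with_columns(
        pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
    )
    .filter(pl.col("rank") < max_replicates)
    .drop(["ranking_score", "row_id", "rank"])
)
print(compressed)  # at most 2 rows per (consensus_uid, energy)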
@@ -1351,7 +1422,9 @@ def compress_chrom(self):
         pl.lit(None, dtype=pl.Object).alias("chrom"),
     )
 
-    self.logger.info(
+    self.logger.info(
+        f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df",
+    )
 
 
 # =====================================================================================
@@ -1402,7 +1475,9 @@ def sample_name_replace(self, replace_dict):
         if name in replace_dict:
             new_names.append(replace_dict[name])
             replaced_count += 1
-            self.logger.debug(
+            self.logger.debug(
+                f"Replacing sample name: '{name}' -> '{replace_dict[name]}'",
+            )
         else:
             new_names.append(name)
 
@@ -1415,7 +1490,9 @@ def sample_name_replace(self, replace_dict):
                 duplicates.append(name)
             else:
                 seen.add(name)
-        raise ValueError(
+        raise ValueError(
+            f"Resulting sample names are not unique. Duplicates found: {duplicates}",
+        )
 
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
@@ -1464,7 +1541,9 @@ def sample_name_reset(self):
             name_without_ext = os.path.splitext(name_without_ext)[0]
 
         new_names.append(name_without_ext)
-        self.logger.debug(
+        self.logger.debug(
+            f"Resetting sample name from path: '{path}' -> '{name_without_ext}'",
+        )
 
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
@@ -1475,14 +1554,18 @@ def sample_name_reset(self):
                 duplicates.append(name)
             else:
                 seen.add(name)
-        raise ValueError(
+        raise ValueError(
+            f"Resulting sample names are not unique. Duplicates found: {duplicates}",
+        )
 
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
 
-    self.logger.info(
+    self.logger.info(
+        f"Successfully reset {len(new_names)} sample names from sample paths",
+    )
 
 
 def set_source(self, filename):
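Both renaming helpers above collect every would-be duplicate before raising, so the error names all offenders rather than just the first. The validation on its own, as a hypothetical standalone function:

def check_unique(new_names):
    # Collect every name that occurs more than once, in first-seen order.
    seen = set()
    duplicates = []
    for name in new_names:
        if name in seen:
            duplicates.append(name)
        else:
            seen.add(name)
    if duplicates:
        raise ValueError(
            f"Resulting sample names are not unique. Duplicates found: {duplicates}",
        )

check_unique(["QC_1", "QC_2", "blank"])   # passes silently
# check_unique(["QC_1", "QC_2", "QC_1"])  # would raise, naming ['QC_1']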
@@ -1512,11 +1595,15 @@ def set_source(self, filename):
 
     new_sources = []
 
-    for i, (current_source, sample_name) in enumerate(
+    for i, (current_source, sample_name) in enumerate(
+        zip(current_sources, sample_names),
+    ):
         # Check if filename is just a directory path
         if os.path.isdir(filename):
             if current_source is None or current_source == "":
-                self.logger.warning(
+                self.logger.warning(
+                    f"Cannot build path for sample '{sample_name}': no current file_source available",
+                )
                 new_sources.append(current_source)
                 failed_count += 1
                 continue
@@ -1531,7 +1618,9 @@ def set_source(self, filename):
 
         # Check if the new file exists
         if not os.path.exists(new_file_path):
-            self.logger.warning(
+            self.logger.warning(
+                f"File does not exist for sample '{sample_name}': {new_file_path}",
+            )
             new_sources.append(current_source)
             failed_count += 1
             continue
@@ -1541,7 +1630,9 @@ def set_source(self, filename):
         updated_count += 1
 
         # Log individual updates at debug level
-        self.logger.debug(
+        self.logger.debug(
+            f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}",
+        )
 
     # Update the samples_df with new file_source values
     self.samples_df = self.samples_df.with_columns(
@@ -1636,7 +1727,9 @@ def features_select(
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
             min_mz, max_mz = mz
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
+            )
         else:
             filter_conditions.append(pl.col("mz") >= mz)
 
@@ -1644,7 +1737,9 @@ def features_select(
     if rt is not None:
         if isinstance(rt, tuple) and len(rt) == 2:
             min_rt, max_rt = rt
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
+            )
         else:
             filter_conditions.append(pl.col("rt") >= rt)
 
@@ -1652,7 +1747,9 @@ def features_select(
     if inty is not None:
         if isinstance(inty, tuple) and len(inty) == 2:
             min_inty, max_inty = inty
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty),
+            )
         else:
             filter_conditions.append(pl.col("inty") >= inty)
 
@@ -1662,7 +1759,10 @@ def features_select(
         if len(sample_uid) == 2 and not isinstance(sample_uid, list):
             # Treat as range
             min_uid, max_uid = sample_uid
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_uid") >= min_uid)
+                & (pl.col("sample_uid") <= max_uid),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
@@ -1692,7 +1792,10 @@ def features_select(
         if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
             # Treat as range
             min_uid, max_uid = consensus_uid
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("consensus_uid") >= min_uid)
+                & (pl.col("consensus_uid") <= max_uid),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
@@ -1705,7 +1808,10 @@ def features_select(
         if len(feature_uid) == 2 and not isinstance(feature_uid, list):
             # Treat as range
             min_uid, max_uid = feature_uid
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("feature_uid") >= min_uid)
+                & (pl.col("feature_uid") <= max_uid),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
@@ -1727,7 +1833,10 @@ def features_select(
         if "quality" in available_columns:
             if isinstance(quality, tuple) and len(quality) == 2:
                 min_quality, max_quality = quality
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("quality") >= min_quality)
+                    & (pl.col("quality") <= max_quality),
+                )
             else:
                 filter_conditions.append(pl.col("quality") >= quality)
         else:
@@ -1739,7 +1848,8 @@ def features_select(
             if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
                 min_coherence, max_coherence = chrom_coherence
                 filter_conditions.append(
-                    (pl.col("chrom_coherence") >= min_coherence)
+                    (pl.col("chrom_coherence") >= min_coherence)
+                    & (pl.col("chrom_coherence") <= max_coherence),
                 )
             else:
                 filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1752,7 +1862,8 @@ def features_select(
             if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
                 min_prominence, max_prominence = chrom_prominence
                 filter_conditions.append(
-                    (pl.col("chrom_prominence") >= min_prominence)
+                    (pl.col("chrom_prominence") >= min_prominence)
+                    & (pl.col("chrom_prominence") <= max_prominence),
                 )
             else:
                 filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1762,14 +1873,19 @@ def features_select(
     # Filter by scaled chromatogram prominence
     if chrom_prominence_scaled is not None:
         if "chrom_prominence_scaled" in available_columns:
-            if
+            if (
+                isinstance(chrom_prominence_scaled, tuple)
+                and len(chrom_prominence_scaled) == 2
+            ):
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
                 filter_conditions.append(
                     (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
                     & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
                 )
             else:
-                filter_conditions.append(
+                filter_conditions.append(
+                    pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled,
+                )
         else:
             warnings.append("'chrom_prominence_scaled' column not found in features_df")
 
@@ -1783,7 +1899,9 @@ def features_select(
                     & (pl.col("chrom_height_scaled") <= max_height_scaled),
                 )
             else:
-                filter_conditions.append(
+                filter_conditions.append(
+                    pl.col("chrom_height_scaled") >= chrom_height_scaled,
+                )
         else:
             warnings.append("'chrom_height_scaled' column not found in features_df")
 
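Every filter above repeats one convention: a 2-tuple means an inclusive range, a scalar means a minimum. A hypothetical helper expressing that convention once (masster itself keeps the branches inline, as shown):

import polars as pl

def range_or_min(col, value):
    """2-tuple -> inclusive [lo, hi] range; scalar -> minimum threshold."""
    if isinstance(value, tuple) and len(value) == 2:
        lo, hi = value
        return (pl.col(col) >= lo) & (pl.col(col) <= hi)
    return pl.col(col) >= value

df = pl.DataFrame({"mz": [100.0, 250.0, 400.0], "rt": [5.0, 60.0, 300.0]})
# Range for mz, plain minimum for rt; conditions combine with &.
print(df.filter(range_or_min("mz", (200.0, 500.0)) & range_or_min("rt", 30.0)))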
@@ -1896,7 +2014,7 @@ def monkey_patch_study():
     as `features_select_original` if not already set, then replaces Study.features_select
     with the optimized `features_select` defined above. This function is idempotent.
     """
-    from
+    from masster.study.study import Study
 
     # Only set original if it doesn't exist yet
     if not hasattr(Study, "features_select_original"):
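monkey_patch_study stores the original method only on the first call, which is what makes the patch idempotent. A self-contained sketch with a dummy class standing in for Study:

class Study:
    def features_select(self):
        return "original"

def features_select(self):
    return "optimized"

def monkey_patch_study():
    # Save a handle to the original only on the first call, so repeated
    # patching can never overwrite it with the optimized replacement.
    if not hasattr(Study, "features_select_original"):
        Study.features_select_original = Study.features_select
    Study.features_select = features_select

monkey_patch_study()
monkey_patch_study()  # idempotent: the saved original survives
print(Study().features_select())           # optimized
print(Study().features_select_original())  # original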
@@ -1969,9 +2087,14 @@ def features_filter(self, features):
 
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
-    if
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df =
+        self.consensus_mapping_df = (
+            self.consensus_mapping_df.lazy().filter(filter_condition).collect()
+        )
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
 
     # Calculate results once and log efficiently
@@ -1984,7 +2107,9 @@ def features_filter(self, features):
             f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
         )
     else:
-        self.logger.info(
+        self.logger.info(
+            f"Kept {final_count} features. Filtered out {removed_count} features.",
+        )
 
 
 def features_delete(self, features):
@@ -2046,9 +2171,14 @@ def features_delete(self, features):
 
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
-    if
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df =
+        self.consensus_mapping_df = (
+            self.consensus_mapping_df.lazy().filter(filter_condition).collect()
+        )
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
 
     # Calculate results once and log efficiently
@@ -2061,7 +2191,9 @@ def features_delete(self, features):
             f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
         )
     else:
-        self.logger.info(
+        self.logger.info(
+            f"Deleted {removed_count} features. Remaining features: {final_count}",
+        )
 
 
 def consensus_select(
@@ -2134,7 +2266,9 @@ def consensus_select(
         else:
             # Standard (min_mz, max_mz) format
             min_mz, max_mz = mz
-            consensus = consensus.filter(
+            consensus = consensus.filter(
+                (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
+            )
     else:
         # Single float value - use default mz tolerance from study parameters
         default_mz_tol = getattr(self, "parameters", None)
@@ -2142,13 +2276,15 @@ def consensus_select(
             default_mz_tol = default_mz_tol.eic_mz_tol
         else:
             # Fallback to align_defaults if study parameters not available
-            from
+            from masster.study.defaults.align_def import align_defaults
 
             default_mz_tol = align_defaults().mz_max_diff
 
         min_mz = mz - default_mz_tol
         max_mz = mz + default_mz_tol
-        consensus = consensus.filter(
+        consensus = consensus.filter(
+            (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
+        )
 
     self.logger.debug(
         f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
@@ -2168,7 +2304,9 @@ def consensus_select(
         else:
             # Standard (min_rt, max_rt) format
             min_rt, max_rt = rt
-            consensus = consensus.filter(
+            consensus = consensus.filter(
+                (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
+            )
     else:
         # Single float value - use default rt tolerance from study parameters
         default_rt_tol = getattr(self, "parameters", None)
@@ -2176,13 +2314,15 @@ def consensus_select(
             default_rt_tol = default_rt_tol.eic_rt_tol
         else:
             # Fallback to align_defaults if study parameters not available
-            from
+            from masster.study.defaults.align_def import align_defaults
 
             default_rt_tol = align_defaults().rt_tol
 
         min_rt = rt - default_rt_tol
         max_rt = rt + default_rt_tol
-        consensus = consensus.filter(
+        consensus = consensus.filter(
+            (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
+        )
 
     self.logger.debug(
         f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
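When mz or rt arrives as a single value, consensus_select widens it into an inclusive window using the study tolerance (or the align_defaults fallback). The arithmetic on toy data; the 0.01 tolerance is an invented stand-in for parameters.eic_mz_tol / align_defaults().mz_max_diff:

import polars as pl

consensus = pl.DataFrame({"mz": [150.001, 150.020, 151.500]})

mz = 150.0
default_mz_tol = 0.01  # invented stand-in for the study/default tolerance

# Single value -> inclusive [mz - tol, mz + tol] window.
min_mz = mz - default_mz_tol
max_mz = mz + default_mz_tol
hits = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
print(hits)  # keeps only 150.001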
@@ -2193,7 +2333,9 @@ def consensus_select(
     consensus_len_before_filter = len(consensus)
     if isinstance(inty_mean, tuple) and len(inty_mean) == 2:
         min_inty, max_inty = inty_mean
-        consensus = consensus.filter(
+        consensus = consensus.filter(
+            (pl.col("inty_mean") >= min_inty) & (pl.col("inty_mean") <= max_inty),
+        )
     else:
         consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
     self.logger.debug(
@@ -2208,11 +2350,14 @@ def consensus_select(
             # Treat as range
             min_uid, max_uid = consensus_uid
             consensus = consensus.filter(
-                (pl.col("consensus_uid") >= min_uid)
+                (pl.col("consensus_uid") >= min_uid)
+                & (pl.col("consensus_uid") <= max_uid),
             )
         else:
             # Treat as list
-            consensus = consensus.filter(
+            consensus = consensus.filter(
+                pl.col("consensus_uid").is_in(consensus_uid),
+            )
     else:
         consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
     self.logger.debug(
@@ -2236,7 +2381,8 @@ def consensus_select(
     if isinstance(number_samples, tuple) and len(number_samples) == 2:
         min_samples, max_samples = number_samples
         consensus = consensus.filter(
-            (pl.col("number_samples") >= min_samples)
+            (pl.col("number_samples") >= min_samples)
+            & (pl.col("number_samples") <= max_samples),
         )
     else:
         consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2250,7 +2396,10 @@ def consensus_select(
     if "number_ms2" in consensus.columns:
         if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
             min_ms2, max_ms2 = number_ms2
-            consensus = consensus.filter(
+            consensus = consensus.filter(
+                (pl.col("number_ms2") >= min_ms2)
+                & (pl.col("number_ms2") <= max_ms2),
+            )
         else:
             consensus = consensus.filter(pl.col("number_ms2") >= number_ms2)
     else:
@@ -2264,7 +2413,9 @@ def consensus_select(
     consensus_len_before_filter = len(consensus)
     if isinstance(quality, tuple) and len(quality) == 2:
         min_quality, max_quality = quality
-        consensus = consensus.filter(
+        consensus = consensus.filter(
+            (pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality),
+        )
     else:
         consensus = consensus.filter(pl.col("quality") >= quality)
     self.logger.debug(
@@ -2277,7 +2428,9 @@ def consensus_select(
     if "bl" in consensus.columns:
         if isinstance(bl, tuple) and len(bl) == 2:
             min_bl, max_bl = bl
-            consensus = consensus.filter(
+            consensus = consensus.filter(
+                (pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl),
+            )
         else:
             consensus = consensus.filter(pl.col("bl") >= bl)
     else:
@@ -2290,16 +2443,23 @@ def consensus_select(
     if chrom_coherence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_coherence_mean" in consensus.columns:
-            if
+            if (
+                isinstance(chrom_coherence_mean, tuple)
+                and len(chrom_coherence_mean) == 2
+            ):
                 min_coherence, max_coherence = chrom_coherence_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_coherence_mean") >= min_coherence)
                     & (pl.col("chrom_coherence_mean") <= max_coherence),
                 )
             else:
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    pl.col("chrom_coherence_mean") >= chrom_coherence_mean,
+                )
         else:
-            self.logger.warning(
+            self.logger.warning(
+                "'chrom_coherence_mean' column not found in consensus_df",
+            )
         self.logger.debug(
             f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2308,16 +2468,23 @@ def consensus_select(
     if chrom_prominence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_mean" in consensus.columns:
-            if
+            if (
+                isinstance(chrom_prominence_mean, tuple)
+                and len(chrom_prominence_mean) == 2
+            ):
                 min_prominence, max_prominence = chrom_prominence_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_prominence_mean") >= min_prominence)
                     & (pl.col("chrom_prominence_mean") <= max_prominence),
                 )
             else:
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    pl.col("chrom_prominence_mean") >= chrom_prominence_mean,
+                )
         else:
-            self.logger.warning(
+            self.logger.warning(
+                "'chrom_prominence_mean' column not found in consensus_df",
+            )
         self.logger.debug(
             f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2326,16 +2493,26 @@ def consensus_select(
     if chrom_prominence_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_scaled_mean" in consensus.columns:
-            if
-
+            if (
+                isinstance(chrom_prominence_scaled_mean, tuple)
+                and len(chrom_prominence_scaled_mean) == 2
+            ):
+                min_prominence_scaled, max_prominence_scaled = (
+                    chrom_prominence_scaled_mean
+                )
                 consensus = consensus.filter(
                     (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
                     & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
                 )
             else:
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    pl.col("chrom_prominence_scaled_mean")
+                    >= chrom_prominence_scaled_mean,
+                )
        else:
-            self.logger.warning(
+            self.logger.warning(
+                "'chrom_prominence_scaled_mean' column not found in consensus_df",
+            )
         self.logger.debug(
             f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2344,16 +2521,23 @@ def consensus_select(
     if chrom_height_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_height_scaled_mean" in consensus.columns:
-            if
+            if (
+                isinstance(chrom_height_scaled_mean, tuple)
+                and len(chrom_height_scaled_mean) == 2
+            ):
                 min_height_scaled, max_height_scaled = chrom_height_scaled_mean
                 consensus = consensus.filter(
                     (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
                     & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
                 )
             else:
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean,
+                )
         else:
-            self.logger.warning(
+            self.logger.warning(
+                "'chrom_height_scaled_mean' column not found in consensus_df",
+            )
         self.logger.debug(
             f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2365,7 +2549,8 @@ def consensus_select(
     if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
         min_rt_delta, max_rt_delta = rt_delta_mean
         consensus = consensus.filter(
-            (pl.col("rt_delta_mean") >= min_rt_delta)
+            (pl.col("rt_delta_mean") >= min_rt_delta)
+            & (pl.col("rt_delta_mean") <= max_rt_delta),
         )
     else:
         consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2376,9 +2561,13 @@ def consensus_select(
         )
 
     if len(consensus) == 0:
-        self.logger.warning(
+        self.logger.warning(
+            "No consensus features remaining after applying selection criteria.",
+        )
     else:
-        self.logger.info(
+        self.logger.info(
+            f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})",
+        )
 
     # Sort the results if sortby is specified
     if sortby is not None:
@@ -2387,19 +2576,25 @@ def consensus_select(
         if sortby in consensus.columns:
             consensus = consensus.sort(sortby, descending=descending)
         else:
-            self.logger.warning(
+            self.logger.warning(
+                f"Sort column '{sortby}' not found in consensus DataFrame",
+            )
     elif isinstance(sortby, (list, tuple)):
         # Multiple columns
         valid_columns = [col for col in sortby if col in consensus.columns]
         invalid_columns = [col for col in sortby if col not in consensus.columns]
 
         if invalid_columns:
-            self.logger.warning(
+            self.logger.warning(
+                f"Sort columns not found in consensus DataFrame: {invalid_columns}",
+            )
 
         if valid_columns:
             consensus = consensus.sort(valid_columns, descending=descending)
     else:
-        self.logger.warning(
+        self.logger.warning(
+            f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.",
+        )
 
     return consensus
 
@@ -2444,7 +2639,10 @@ def consensus_filter(self, consensus):
 
     # Get feature_uids that need to be removed from features_df
     feature_uids_to_remove = []
-    if
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
         feature_uids_to_remove = self.consensus_mapping_df.filter(
             pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )["feature_uid"].to_list()
@@ -2455,27 +2653,42 @@ def consensus_filter(self, consensus):
     )
 
     # Remove from consensus_mapping_df
-    if
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
         initial_mapping_count = len(self.consensus_mapping_df)
         self.consensus_mapping_df = self.consensus_mapping_df.filter(
             ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )
         removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
         if removed_mapping_count > 0:
-            self.logger.debug(
+            self.logger.debug(
+                f"Removed {removed_mapping_count} entries from consensus_mapping_df",
+            )
 
     # Remove corresponding features from features_df
-    if
+    if (
+        feature_uids_to_remove
+        and self.features_df is not None
+        and not self.features_df.is_empty()
+    ):
         initial_features_count = len(self.features_df)
         self.features_df = self.features_df.filter(
             ~pl.col("feature_uid").is_in(feature_uids_to_remove),
         )
         removed_features_count = initial_features_count - len(self.features_df)
         if removed_features_count > 0:
-            self.logger.debug(
+            self.logger.debug(
+                f"Removed {removed_features_count} entries from features_df",
+            )
 
     # Remove from consensus_ms2 if it exists
-    if
+    if (
+        hasattr(self, "consensus_ms2")
+        and self.consensus_ms2 is not None
+        and not self.consensus_ms2.is_empty()
+    ):
         initial_ms2_count = len(self.consensus_ms2)
         self.consensus_ms2 = self.consensus_ms2.filter(
             ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
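consensus_filter cascades one removal through the linked frames: resolve the affected feature_uids via the mapping table, then anti-filter each DataFrame. A toy sketch of the cascade across a mapping and a feature table:

import polars as pl

consensus_mapping = pl.DataFrame(
    {"consensus_uid": [1, 1, 2, 3], "feature_uid": [11, 12, 21, 31]},
)
features = pl.DataFrame({"feature_uid": [11, 12, 21, 31, 99]})

consensus_uids_to_remove = [1, 3]

# Resolve which features are reachable through the removed consensus rows.
feature_uids_to_remove = consensus_mapping.filter(
    pl.col("consensus_uid").is_in(consensus_uids_to_remove),
)["feature_uid"].to_list()

# Cascade: drop the mapping rows, then drop the now-orphaned features.
consensus_mapping = consensus_mapping.filter(
    ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
)
features = features.filter(~pl.col("feature_uid").is_in(feature_uids_to_remove))

print(consensus_mapping)  # only consensus_uid 2 remains
print(features)           # feature_uids 21 and 99 remain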
@@ -2575,7 +2788,10 @@ def samples_select(
         if len(sample_uid) == 2 and not isinstance(sample_uid, list):
             # Treat as range
             min_uid, max_uid = sample_uid
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_uid") >= min_uid)
+                & (pl.col("sample_uid") <= max_uid),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
@@ -2617,7 +2833,8 @@ def samples_select(
             # Treat as range
             min_batch, max_batch = sample_batch
             filter_conditions.append(
-                (pl.col("sample_batch") >= min_batch)
+                (pl.col("sample_batch") >= min_batch)
+                & (pl.col("sample_batch") <= max_batch),
             )
         else:
             # Treat as list
@@ -2635,11 +2852,14 @@ def samples_select(
             # Treat as range
             min_seq, max_seq = sample_sequence
             filter_conditions.append(
-                (pl.col("sample_sequence") >= min_seq)
+                (pl.col("sample_sequence") >= min_seq)
+                & (pl.col("sample_sequence") <= max_seq),
             )
         else:
             # Treat as list
-            filter_conditions.append(
+            filter_conditions.append(
+                pl.col("sample_sequence").is_in(sample_sequence),
+            )
     else:
         filter_conditions.append(pl.col("sample_sequence") == sample_sequence)
 else:
@@ -2651,7 +2871,8 @@ def samples_select(
     if isinstance(num_features, tuple) and len(num_features) == 2:
         min_features, max_features = num_features
         filter_conditions.append(
-            (pl.col("num_features") >= min_features)
+            (pl.col("num_features") >= min_features)
+            & (pl.col("num_features") <= max_features),
         )
     else:
         filter_conditions.append(pl.col("num_features") >= num_features)
@@ -2663,7 +2884,9 @@ def samples_select(
     if "num_ms1" in available_columns:
         if isinstance(num_ms1, tuple) and len(num_ms1) == 2:
             min_ms1, max_ms1 = num_ms1
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("num_ms1") >= min_ms1) & (pl.col("num_ms1") <= max_ms1),
+            )
         else:
             filter_conditions.append(pl.col("num_ms1") >= num_ms1)
     else:
@@ -2674,7 +2897,9 @@ def samples_select(
     if "num_ms2" in available_columns:
         if isinstance(num_ms2, tuple) and len(num_ms2) == 2:
             min_ms2, max_ms2 = num_ms2
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("num_ms2") >= min_ms2) & (pl.col("num_ms2") <= max_ms2),
+            )
         else:
             filter_conditions.append(pl.col("num_ms2") >= num_ms2)
     else:
@@ -2766,7 +2991,9 @@ def samples_delete(self, samples):
     if len(sample_uids_set) < len(sample_uids_to_remove) * 0.8:
         sample_uids_to_remove = list(sample_uids_set)
 
-    self.logger.info(
+    self.logger.info(
+        f"Deleting {len(sample_uids_to_remove)} samples and all related data...",
+    )
 
     # Get feature_uids that need to be removed from features_df
     feature_uids_to_remove = []
@@ -2794,7 +3021,11 @@ def samples_delete(self, samples):
 
     # 2. Remove corresponding features from features_df
     removed_features_count = 0
-    if
+    if (
+        feature_uids_to_remove
+        and self.features_df is not None
+        and not self.features_df.is_empty()
+    ):
         self.features_df = self.features_df.filter(
             ~pl.col("sample_uid").is_in(sample_uids_to_remove),
         )
@@ -2802,7 +3033,11 @@ def samples_delete(self, samples):
 
     # 3. Remove from consensus_mapping_df
     removed_mapping_count = 0
-    if
+    if (
+        feature_uids_to_remove
+        and self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
         initial_mapping_count = len(self.consensus_mapping_df)
         self.consensus_mapping_df = self.consensus_mapping_df.filter(
             ~pl.col("feature_uid").is_in(feature_uids_to_remove),
@@ -2811,7 +3046,11 @@ def samples_delete(self, samples):
 
     # 4. Remove from consensus_ms2 if it exists
     removed_ms2_count = 0
-    if
+    if (
+        hasattr(self, "consensus_ms2")
+        and self.consensus_ms2 is not None
+        and not self.consensus_ms2.is_empty()
+    ):
         initial_ms2_count = len(self.consensus_ms2)
         self.consensus_ms2 = self.consensus_ms2.filter(
             ~pl.col("sample_uid").is_in(sample_uids_to_remove),
@@ -2820,7 +3059,11 @@ def samples_delete(self, samples):
 
     # 5. Remove from feature_maps and update map_id
     removed_maps_count = 0
-    if
+    if (
+        hasattr(self, "feature_maps")
+        and self.feature_maps is not None
+        and map_ids_to_remove
+    ):
         # Remove feature maps in reverse order to maintain indices
         for map_id in sorted(map_ids_to_remove, reverse=True):
             if 0 <= map_id < len(self.feature_maps):
@@ -2861,7 +3104,9 @@ def samples_delete(self, samples):
 
     # Update map_id indices if needed
     if removed_maps_count > 0 and final_sample_count > 0:
-        self.logger.debug(
+        self.logger.debug(
+            f"Updated map_id values to range from 0 to {final_sample_count - 1}",
+        )
 
 
 # =====================================================================================
@@ -3032,7 +3277,9 @@ def sample_color(self, by=None, palette="Turbo256"):
|
|
|
3032
3277
|
)
|
|
3033
3278
|
|
|
3034
3279
|
if isinstance(by, list):
|
|
3035
|
-
self.logger.debug(
|
|
3280
|
+
self.logger.debug(
|
|
3281
|
+
f"Set sample colors using provided color list ({len(by)} colors)",
|
|
3282
|
+
)
|
|
3036
3283
|
elif by is None:
|
|
3037
3284
|
self.logger.debug(f"Set sequential sample colors using {palette} palette")
|
|
3038
3285
|
else:
|
|
@@ -3067,7 +3314,9 @@ def sample_color_reset(self):
|
|
|
3067
3314
|
# Distribute samples evenly across the full colormap range
|
|
3068
3315
|
for i in range(n_samples):
|
|
3069
3316
|
# Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
|
|
3070
|
-
normalized_value = (
|
|
3317
|
+
normalized_value = (
|
|
3318
|
+
i + 0.5
|
|
3319
|
+
) / n_samples # +0.5 to center samples in their bins
|
|
3071
3320
|
# Optionally, map to a subset of colormap to avoid extreme colors
|
|
3072
3321
|
# Use 10% to 90% of colormap range for better color diversity
|
|
3073
3322
|
normalized_value = 0.1 + (normalized_value * 0.8)
|
|
@@ -3088,10 +3337,14 @@ def sample_color_reset(self):
|
|
|
3088
3337
|
pl.Series("sample_color", colors).alias("sample_color"),
|
|
3089
3338
|
)
|
|
3090
3339
|
|
|
3091
|
-
self.logger.debug(
|
|
3340
|
+
self.logger.debug(
|
|
3341
|
+
f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)",
|
|
3342
|
+
)
|
|
3092
3343
|
|
|
3093
3344
|
except ImportError:
|
|
3094
|
-
self.logger.error(
|
|
3345
|
+
self.logger.error(
|
|
3346
|
+
"cmap library is required for sample color reset. Install with: uv add cmap",
|
|
3347
|
+
)
|
|
3095
3348
|
except Exception as e:
|
|
3096
3349
|
self.logger.error(f"Failed to reset sample colors: {e}")
|
|
3097
3350
|
|
|
@@ -3112,7 +3365,9 @@ def _get_color_palette(palette_name):
|
|
|
3112
3365
|
try:
|
|
3113
3366
|
from cmap import Colormap
|
|
3114
3367
|
except ImportError:
|
|
3115
|
-
raise ValueError(
|
|
3368
|
+
raise ValueError(
|
|
3369
|
+
"cmap library is required for color palettes. Install with: pip install cmap",
|
|
3370
|
+
)
|
|
3116
3371
|
|
|
3117
3372
|
# Map common palette names to cmap names
|
|
3118
3373
|
palette_mapping = {
|
|
@@ -3207,7 +3462,9 @@ def _sample_colors_from_colormap(palette_name, n_colors):
     try:
         from cmap import Colormap
     except ImportError:
-        raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
+        raise ValueError(
+            "cmap library is required for color palettes. Install with: pip install cmap",
+        )
 
     # Map common palette names to cmap names (same as _get_color_palette)
     palette_mapping = {
@@ -3245,7 +3502,9 @@ def _sample_colors_from_colormap(palette_name, n_colors):
     # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
     for i in range(n_colors):
         # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
-        normalized_value = (i + 0.5) / n_colors  # +0.5 to center samples in their bins
+        normalized_value = (
+            i + 0.5
+        ) / n_colors  # +0.5 to center samples in their bins
         # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
         normalized_value = 0.1 + (normalized_value * 0.8)
 
@@ -3290,7 +3549,7 @@ def _ensure_features_df_schema_order(self):
     try:
         import os
         import json
-        from 
+        from masster.study.h5 import _reorder_columns_by_schema
 
         # Load schema
         schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
@@ -3298,7 +3557,11 @@ def _ensure_features_df_schema_order(self):
             schema = json.load(f)
 
         # Reorder columns to match schema
-        self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")
+        self.features_df = _reorder_columns_by_schema(
+            self.features_df,
+            schema,
+            "features_df",
+        )
 
     except Exception as e:
         self.logger.warning(f"Failed to reorder features_df columns: {e}")
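`_reorder_columns_by_schema` lives in masster/study/h5.py and is not shown in this diff; the following is a hedged sketch of what a schema-driven reorder plausibly looks like in polars. The schema layout and the helper's exact signature here are assumptions, not the package's actual ones:

import polars as pl

def reorder_columns_by_schema(df, schema, table_name):
    # Hypothetical: schema[table_name]["columns"] lists the canonical order;
    # known columns go first in that order, unknown columns stay at the end.
    wanted = [c for c in schema.get(table_name, {}).get("columns", []) if c in df.columns]
    extras = [c for c in df.columns if c not in wanted]
    return df.select(wanted + extras)

df = pl.DataFrame({"rt": [1.0], "mz": [100.0], "feature_uid": ["f1"]})
schema = {"features_df": {"columns": ["feature_uid", "mz", "rt"]}}
print(reorder_columns_by_schema(df, schema, "features_df").columns)  # ['feature_uid', 'mz', 'rt']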
@@ -3340,17 +3603,19 @@ def migrate_map_id_to_index(self):
         # Ensure the column is Int64 type
         self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
 
-        self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
+        self.logger.info(
+            f"Successfully migrated {sample_count} samples to indexed map_id format",
+        )
         self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")
 
 
 def restore_ms2(self, samples=None, **kwargs):
     """
     Restore MS2 data by re-running find_ms2 on specified samples.
-
+
     This function rebuilds the consensus_ms2 DataFrame by re-extracting MS2 spectra
     from the original sample files. Use this to reverse the effects of compress_ms2().
-
+
     Parameters:
         samples (list, optional): List of sample_uids or sample_names to process.
             If None, processes all samples.
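The migration above reports map_id running from 0 to sample_count - 1. A minimal sketch of producing such an indexed Int64 column with polars (illustrative only; masster's actual migration logic is in the surrounding function):

import polars as pl

samples_df = pl.DataFrame({"sample_name": ["s1", "s2", "s3"]})
samples_df = samples_df.with_columns(
    pl.int_range(0, pl.len()).cast(pl.Int64).alias("map_id"),
)
print(samples_df["map_id"].to_list())  # [0, 1, 2]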
@@ -3360,31 +3625,37 @@ def restore_ms2(self, samples=None, **kwargs):
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
         return
-
+
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.error("No samples_df found in study.")
         return
-
-    # Get sample_uids to process
+
+    # Get sample_uids to process
     sample_uids = self._get_sample_uids(samples)
     if not sample_uids:
         self.logger.warning("No valid samples specified.")
         return
-
+
     self.logger.info(f"Restoring MS2 data from {len(sample_uids)} samples...")
-
+
     # Clear existing consensus_ms2 to rebuild from scratch
-    initial_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+    initial_ms2_count = (
+        len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+    )
     self.consensus_ms2 = pl.DataFrame()
-
+
     # Re-run find_ms2 which will rebuild consensus_ms2
     try:
         self.find_ms2(**kwargs)
-
-        final_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
-
-        self.logger.info(f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra")
-
+
+        final_ms2_count = (
+            len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+        )
+
+        self.logger.info(
+            f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra",
+        )
+
     except Exception as e:
         self.logger.error(f"Failed to restore MS2 data: {e}")
         raise
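Usage sketch for the function above (assuming a loaded Study instance named study; keyword names taken from the docstrings in this file):

# Rebuild consensus_ms2 for two samples, forwarding find_ms2 options.
study.restore_ms2(samples=["sample1", "sample2"], mz_tol=0.01)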
@@ -3393,51 +3664,51 @@ def restore_ms2(self, samples=None, **kwargs):
 def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs):
     """
     Reverse any compression effects by restoring compressed data adaptively.
-
+
     This function restores data that was compressed using compress(), compress_features(),
     compress_ms2(), compress_chrom(), or study.save(compress=True). It optimizes the
     decompression process for speed by only processing what actually needs restoration.
-
+
     Parameters:
         features (bool): Restore features data (ms2_specs, ms2_scans, chrom_area)
-        ms2 (bool): Restore MS2 spectra by re-running find_ms2()
+        ms2 (bool): Restore MS2 spectra by re-running find_ms2()
         chrom (bool): Restore chromatogram objects
         samples (list, optional): List of sample_uids or sample_names to process.
             If None, processes all samples.
         **kwargs: Additional keyword arguments for restoration functions:
             - For restore_chrom: mz_tol (default: 0.010), rt_tol (default: 10.0)
            - For restore_ms2/find_ms2: mz_tol, centroid, deisotope, etc.
-
+
     Performance Optimizations:
         - Adaptive processing: Only restores what actually needs restoration
         - Processes features and chromatograms together when possible (shared file I/O)
         - Uses cached sample instances to avoid repeated file loading
        - Processes MS2 restoration last as it's the most computationally expensive
        - Provides detailed progress information for long-running operations
-
+
     Example:
         # Restore everything (but only what needs restoration)
         study.decompress()
-
+
         # Restore only chromatograms with custom tolerances
         study.decompress(features=False, ms2=False, chrom=True, mz_tol=0.005, rt_tol=5.0)
-
+
         # Restore specific samples only
         study.decompress(samples=["sample1", "sample2"])
     """
     if not any([features, ms2, chrom]):
         self.logger.warning("No decompression operations specified.")
         return
-
+
     # Get sample_uids to process
     sample_uids = self._get_sample_uids(samples)
     if not sample_uids:
         self.logger.warning("No valid samples specified.")
         return
-
+
     # Adaptively check what actually needs to be done
     import polars as pl
-
+
     # Check if features need restoration (more sophisticated logic)
     features_need_restoration = False
     if features and not self.features_df.is_empty():
@@ -3446,7 +3717,7 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
         for col in ["ms2_scans", "ms2_specs"]:
             if col not in self.features_df.columns:
                 missing_cols.append(col)
-
+
         # If columns are missing entirely, we likely need restoration
         if missing_cols:
             features_need_restoration = True
@@ -3455,13 +3726,15 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
         # But be smart about it - only check if we have consensus features with MS2
         if not self.consensus_ms2.is_empty():
             # We have MS2 data, so ms2_specs should have some content
-            null_ms2_specs = self.features_df.filter(pl.col("ms2_specs").is_null()).height
+            null_ms2_specs = self.features_df.filter(
+                pl.col("ms2_specs").is_null(),
+            ).height
             total_features = len(self.features_df)
             # If more than 90% are null but we have MS2 data, likely compressed
             if null_ms2_specs > (total_features * 0.9):
                 features_need_restoration = True
-
-    # Check if chromatograms need restoration
+
+    # Check if chromatograms need restoration
     chrom_need_restoration = False
     if chrom and not self.features_df.is_empty():
         if "chrom" not in self.features_df.columns:
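The 90%-null heuristic above can be checked in isolation; a minimal sketch:

import polars as pl

features_df = pl.DataFrame({"ms2_specs": [None] * 9 + ["spec"]})
null_ms2_specs = features_df.filter(pl.col("ms2_specs").is_null()).height
total_features = len(features_df)
print(null_ms2_specs > total_features * 0.9)  # False: 9 nulls of 10 is not strictly > 90%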
@@ -3472,22 +3745,26 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
             total_features = len(self.features_df)
             # If more than 50% are null, likely need restoration
             chrom_need_restoration = null_chroms > (total_features * 0.5)
-
+
     # Check if MS2 data might need restoration (compare expected vs actual)
     ms2_need_restoration = False
     if ms2:
-        current_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
-        consensus_count = len(self.consensus_df) if not self.consensus_df.is_empty() else 0
-
+        current_ms2_count = (
+            len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+        )
+        consensus_count = (
+            len(self.consensus_df) if not self.consensus_df.is_empty() else 0
+        )
+
         if consensus_count > 0:
             # Calculate expected MS2 count based on consensus features with MS2 potential
             # This is a heuristic - if we have very few MS2 compared to consensus, likely compressed
             expected_ratio = 3.0  # Expect at least 3 MS2 per consensus on average
             expected_ms2 = consensus_count * expected_ratio
-
+
             if current_ms2_count < min(expected_ms2 * 0.3, consensus_count * 0.8):
                 ms2_need_restoration = True
-
+
     # Build list of operations that actually need to be done
     operations_needed = []
     if features and features_need_restoration:
|
|
|
3496
3773
|
operations_needed.append("chromatograms")
|
|
3497
3774
|
if ms2 and ms2_need_restoration:
|
|
3498
3775
|
operations_needed.append("MS2 spectra")
|
|
3499
|
-
|
|
3776
|
+
|
|
3500
3777
|
# Early exit if nothing needs to be done
|
|
3501
3778
|
if not operations_needed:
|
|
3502
|
-
self.logger.info(
|
|
3779
|
+
self.logger.info(
|
|
3780
|
+
"All data appears to be already decompressed. No operations needed.",
|
|
3781
|
+
)
|
|
3503
3782
|
return
|
|
3504
|
-
|
|
3505
|
-
self.logger.info(
|
|
3506
|
-
|
|
3783
|
+
|
|
3784
|
+
self.logger.info(
|
|
3785
|
+
f"Starting adaptive decompression: {', '.join(operations_needed)} from {len(sample_uids)} samples",
|
|
3786
|
+
)
|
|
3787
|
+
|
|
3507
3788
|
try:
|
|
3508
3789
|
# Phase 1: Restore features and chromatograms together (shared file I/O)
|
|
3509
|
-
if
|
|
3510
|
-
self.logger.info(
|
|
3511
|
-
|
|
3790
|
+
if "features" in operations_needed and "chromatograms" in operations_needed:
|
|
3791
|
+
self.logger.info(
|
|
3792
|
+
"Phase 1: Restoring features and chromatograms together...",
|
|
3793
|
+
)
|
|
3794
|
+
|
|
3512
3795
|
# Extract relevant kwargs for restore_features and restore_chrom
|
|
3513
3796
|
restore_kwargs = {}
|
|
3514
|
-
if
|
|
3515
|
-
restore_kwargs[
|
|
3516
|
-
if
|
|
3517
|
-
restore_kwargs[
|
|
3518
|
-
|
|
3797
|
+
if "mz_tol" in kwargs:
|
|
3798
|
+
restore_kwargs["mz_tol"] = kwargs["mz_tol"]
|
|
3799
|
+
if "rt_tol" in kwargs:
|
|
3800
|
+
restore_kwargs["rt_tol"] = kwargs["rt_tol"]
|
|
3801
|
+
|
|
3519
3802
|
# Restore features first (includes chrom column)
|
|
3520
3803
|
self.restore_features(samples=samples)
|
|
3521
|
-
|
|
3804
|
+
|
|
3522
3805
|
# Then do additional chrom gap-filling if needed
|
|
3523
3806
|
self.restore_chrom(samples=samples, **restore_kwargs)
|
|
3524
|
-
|
|
3525
|
-
elif (
|
|
3807
|
+
|
|
3808
|
+
elif (
|
|
3809
|
+
"features" in operations_needed and "chromatograms" not in operations_needed
|
|
3810
|
+
):
|
|
3526
3811
|
self.logger.info("Phase 1: Restoring features data...")
|
|
3527
3812
|
self.restore_features(samples=samples)
|
|
3528
|
-
|
|
3529
|
-
elif (
|
|
3813
|
+
|
|
3814
|
+
elif (
|
|
3815
|
+
"chromatograms" in operations_needed and "features" not in operations_needed
|
|
3816
|
+
):
|
|
3530
3817
|
self.logger.info("Phase 1: Restoring chromatograms...")
|
|
3531
3818
|
restore_kwargs = {}
|
|
3532
|
-
if
|
|
3533
|
-
restore_kwargs[
|
|
3534
|
-
if
|
|
3535
|
-
restore_kwargs[
|
|
3819
|
+
if "mz_tol" in kwargs:
|
|
3820
|
+
restore_kwargs["mz_tol"] = kwargs["mz_tol"]
|
|
3821
|
+
if "rt_tol" in kwargs:
|
|
3822
|
+
restore_kwargs["rt_tol"] = kwargs["rt_tol"]
|
|
3536
3823
|
self.restore_chrom(samples=samples, **restore_kwargs)
|
|
3537
|
-
|
|
3824
|
+
|
|
3538
3825
|
# Phase 2: Restore MS2 data (most computationally expensive, done last)
|
|
3539
3826
|
if "MS2 spectra" in operations_needed:
|
|
3540
3827
|
self.logger.info("Phase 2: Restoring MS2 spectra...")
|
|
3541
|
-
|
|
3828
|
+
|
|
3542
3829
|
# Extract MS2-specific kwargs
|
|
3543
3830
|
ms2_kwargs = {}
|
|
3544
3831
|
for key, value in kwargs.items():
|
|
3545
|
-
if key in [
|
|
3832
|
+
if key in [
|
|
3833
|
+
"mz_tol",
|
|
3834
|
+
"centroid",
|
|
3835
|
+
"deisotope",
|
|
3836
|
+
"dia_stats",
|
|
3837
|
+
"feature_uid",
|
|
3838
|
+
]:
|
|
3546
3839
|
ms2_kwargs[key] = value
|
|
3547
|
-
|
|
3840
|
+
|
|
3548
3841
|
self.restore_ms2(samples=samples, **ms2_kwargs)
|
|
3549
|
-
|
|
3842
|
+
|
|
3550
3843
|
self.logger.info("Adaptive decompression completed successfully")
|
|
3551
|
-
|
|
3844
|
+
|
|
3552
3845
|
except Exception as e:
|
|
3553
3846
|
self.logger.error(f"Decompression failed: {e}")
|
|
3554
3847
|
raise
|
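The kwargs routing in Phase 2 above is a plain allow-list filter; the same thing as a one-line dict comprehension (equivalent sketch, key list copied from the diff):

kwargs = {"mz_tol": 0.01, "rt_tol": 10.0, "verbose": True}
allowed = ["mz_tol", "centroid", "deisotope", "dia_stats", "feature_uid"]
ms2_kwargs = {k: v for k, v in kwargs.items() if k in allowed}
print(ms2_kwargs)  # {'mz_tol': 0.01}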