masster-0.3.18-py3-none-any.whl → masster-0.3.19-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/h5.py +1 -1
- masster/sample/helpers.py +3 -7
- masster/sample/load.py +2 -2
- masster/sample/plot.py +2 -1
- masster/study/export.py +27 -10
- masster/study/h5.py +58 -40
- masster/study/helpers.py +220 -190
- masster/study/helpers_optimized.py +5 -5
- masster/study/load.py +144 -118
- masster/study/plot.py +240 -101
- masster/study/processing.py +9 -5
- masster/study/study.py +2 -6
- {masster-0.3.18.dist-info → masster-0.3.19.dist-info}/METADATA +1 -1
- {masster-0.3.18.dist-info → masster-0.3.19.dist-info}/RECORD +18 -18
- {masster-0.3.18.dist-info → masster-0.3.19.dist-info}/WHEEL +0 -0
- {masster-0.3.18.dist-info → masster-0.3.19.dist-info}/entry_points.txt +0 -0
- {masster-0.3.18.dist-info → masster-0.3.19.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
@@ -6,7 +6,7 @@ like data retrieval, filtering, compression, and utility functions.
 
 The functions are organized into the following sections:
 1. Chromatogram extraction functions (BPC, TIC, EIC, chrom matrix)
-2. Data retrieval helper functions (get_sample, get_consensus, etc.)
+2. Data retrieval helper functions (get_sample, get_consensus, etc.)
 3. UID helper functions (_get_*_uids)
 4. Data filtering and selection functions
 5. Data compression and restoration functions
@@ -150,9 +150,19 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
     # build Chromatogram
     ycol = "inty"
     try:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].to_numpy(),
+            inty=bpc_pd[ycol].to_numpy(),
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
     except Exception:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=bpc_pd["rt"].values,
+            inty=bpc_pd[ycol].values,
+            label=label or "Base Peak Chromatogram",
+            rt_unit=rt_unit,
+        )
 
     return chrom
 
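Most of the get_bpc hunk above (and the get_tic hunk below) is mechanical formatting: a single long `Chromatogram(...)` call, truncated in this view, is expanded across lines with trailing commas. The try/except pair is the substantive pattern: it prefers `Series.to_numpy()` and falls back to the legacy `.values` accessor. A minimal sketch of that fallback, with a plain helper standing in for the `Chromatogram` constructor (names here are illustrative, not masster API):

    import pandas as pd

    def series_to_array(s: pd.Series):
        # Prefer the modern accessor; fall back to .values for objects
        # that predate to_numpy().
        try:
            return s.to_numpy()
        except AttributeError:
            return s.values

    bpc_pd = pd.DataFrame({"rt": [0.1, 0.2], "inty": [10.0, 20.0]})
    print(series_to_array(bpc_pd["rt"]))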
@@ -204,13 +214,21 @@ def get_tic(owner, sample=None, label=None):
     tic_pd = tic_pd.rename(columns={tic_pd.columns[1]: "inty_tot"})
 
     try:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].to_numpy(),
+            inty=tic_pd["inty_tot"].to_numpy(),
+            label=label or "Total Ion Chromatogram",
+        )
     except Exception:
-        chrom = Chromatogram(
+        chrom = Chromatogram(
+            rt=tic_pd["rt"].values,
+            inty=tic_pd["inty_tot"].values,
+            label=label or "Total Ion Chromatogram",
+        )
 
     return chrom
 
-
+
 def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
@@ -223,7 +241,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
 
     Parameters:
         owner: Study or Sample instance
-        sample: Sample identifier (required if owner is Study)
+        sample: Sample identifier (required if owner is Study)
         mz (float): Target m/z value
         mz_tol (float): m/z tolerance. If None, uses owner.parameters.eic_mz_tol (for Study) or defaults to 0.01
         rt_unit (str): Retention time unit for the chromatogram
@@ -234,7 +252,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     """
     # Use default mz_tol from study parameters if not provided
     if mz_tol is None:
-        if hasattr(owner,
+        if hasattr(owner, "parameters") and hasattr(owner.parameters, "eic_mz_tol"):
            mz_tol = owner.parameters.eic_mz_tol
         else:
             mz_tol = 0.01  # fallback default
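The completed condition above documents the tolerance lookup order in `get_eic`: an explicit `mz_tol` wins, then `owner.parameters.eic_mz_tol`, then a hard-coded 0.01. A self-contained sketch of that resolution logic (the dummy classes are stand-ins for a masster Study):

    class _Params:
        eic_mz_tol = 0.005

    class _Owner:
        parameters = _Params()

    def resolve_mz_tol(owner, mz_tol=None):
        # Mirrors get_eic: explicit value > study parameter > 0.01 default.
        if mz_tol is None:
            if hasattr(owner, "parameters") and hasattr(owner.parameters, "eic_mz_tol"):
                return owner.parameters.eic_mz_tol
            return 0.01  # fallback default
        return mz_tol

    assert resolve_mz_tol(_Owner()) == 0.005
    assert resolve_mz_tol(object()) == 0.01
    assert resolve_mz_tol(_Owner(), mz_tol=0.02) == 0.02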
@@ -267,17 +285,18 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
         mz_min = mz - mz_tol
         mz_max = mz + mz_tol
         eic_data = s.ms1_df.filter(
-            (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+            (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max),
         )
 
         if eic_data.is_empty():
             # Return empty chromatogram if no data found
             import numpy as _np
+
             return Chromatogram(
-                rt=_np.array([0.0]),
-                inty=_np.array([0.0]),
+                rt=_np.array([0.0]),
+                inty=_np.array([0.0]),
                 label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
-                rt_unit=rt_unit
+                rt_unit=rt_unit,
             )
 
     # Aggregate intensities per retention time (sum in case of multiple points per rt)
@@ -290,34 +309,35 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
     if eic_pd.empty:
         # Return empty chromatogram if no data found
         import numpy as _np
+
         return Chromatogram(
-            rt=_np.array([0.0]),
-            inty=_np.array([0.0]),
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
 
     # build Chromatogram
     try:
         chrom = Chromatogram(
-            rt=eic_pd["rt"].to_numpy(),
-            inty=eic_pd["inty"].to_numpy(),
+            rt=eic_pd["rt"].to_numpy(),
+            inty=eic_pd["inty"].to_numpy(),
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
         )
     except Exception:
         chrom = Chromatogram(
-            rt=eic_pd["rt"].values,
-            inty=eic_pd["inty"].values,
+            rt=eic_pd["rt"].values,
+            inty=eic_pd["inty"].values,
             label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
-            rt_unit=rt_unit
+            rt_unit=rt_unit,
        )
 
     return chrom
 
 
 # =====================================================================================
-# DATA RETRIEVAL AND MATRIX FUNCTIONS
+# DATA RETRIEVAL AND MATRIX FUNCTIONS
 # =====================================================================================
 
 
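Functionally these get_eic changes are cosmetic (trailing commas, blank lines after local imports), so the extraction logic is unchanged: filter MS1 points to an m/z window, then sum intensities per retention time, as the comment in the hunk states. A standalone polars sketch of that pipeline on toy data (masster's `ms1_df` carries the same three columns among others):

    import polars as pl

    ms1_df = pl.DataFrame({
        "rt":   [1.0, 1.0, 2.0, 3.0],
        "mz":   [100.001, 100.002, 100.001, 250.0],
        "inty": [5.0, 7.0, 3.0, 9.0],
    })
    mz, mz_tol = 100.0, 0.01
    eic = (
        ms1_df
        .filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
        .group_by("rt")                     # sum duplicates at the same rt
        .agg(pl.col("inty").sum())
        .sort("rt")
    )
    print(eic)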
@@ -451,9 +471,9 @@ def align_reset(self):
     self.alignment_ref_index = None
     # in self.features_df, set rt equal to rt_original
     self.features_df = self.features_df.with_columns(
-        pl.col("rt_original").alias("rt")
+        pl.col("rt_original").alias("rt"),
     )
-
+
     # Ensure column order is maintained after with_columns operation
     self._ensure_features_df_schema_order()
 
@@ -614,7 +634,7 @@ def get_consensus_matches(self, uids=None):
     return matches
 
 
-# =====================================================================================
+# =====================================================================================
 # UID HELPER FUNCTIONS
 # =====================================================================================
 
@@ -796,7 +816,7 @@ def get_sample(self, sample):
         return cache[sample_uid]
 
     sample_path = row.get("sample_path", None)
-    s = Sample(log_level=
+    s = Sample(log_level="ERROR")
     try:
         if sample_path:
             try:
@@ -816,13 +836,13 @@ def get_orphans(self):
     Get all features that are not in the consensus mapping.
     """
     not_in_consensus = self.features_df.filter(
-        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list())
+        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list()),
     )
     return not_in_consensus
 
 
 # =====================================================================================
-# DATA COMPRESSION AND RESTORATION FUNCTIONS
+# DATA COMPRESSION AND RESTORATION FUNCTIONS
 # =====================================================================================
 
 
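`get_orphans` keeps every feature whose `feature_uid` is absent from the consensus mapping. The `~is_in(...)` filter shown above is equivalent to a polars anti-join; a small sketch of both forms:

    import polars as pl

    features_df = pl.DataFrame({"feature_uid": [1, 2, 3, 4]})
    consensus_mapping_df = pl.DataFrame({"feature_uid": [1, 3]})

    # Form used in get_orphans: negated membership test.
    orphans = features_df.filter(
        ~pl.col("feature_uid").is_in(consensus_mapping_df["feature_uid"].to_list()),
    )
    # Equivalent anti-join formulation.
    orphans_join = features_df.join(consensus_mapping_df, on="feature_uid", how="anti")

    assert orphans["feature_uid"].to_list() == [2, 4]
    assert sorted(orphans_join["feature_uid"].to_list()) == [2, 4]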
@@ -878,7 +898,7 @@ def compress_features(self):
 
     removed_count = initial_count - len(self.features_df)
     self.logger.info(
-        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column"
+        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column",
     )
 
 
@@ -1119,7 +1139,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     total_chroms = len(self.features_df)
 
     self.logger.debug(
-        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)"
+        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)",
     )
 
     if empty_chroms == 0:
@@ -1249,7 +1269,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     final_total = len(self.features_df)
 
     self.logger.info(
-        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)"
+        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)",
     )
     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
 
@@ -1290,7 +1310,7 @@ def compress_ms2(self, max_replicates=5):
 
     removed_count = initial_count - len(self.consensus_ms2)
     self.logger.info(
-        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair"
+        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair",
     )
 
 
@@ -1328,14 +1348,14 @@ def compress_chrom(self):
 def sample_name_replace(self, replace_dict):
     """
     Replace sample names in samples_df based on a dictionary mapping.
-
-    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
-    all keys with their corresponding values from replace_dict. Checks that all
+
+    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
+    all keys with their corresponding values from replace_dict. Checks that all
     resulting sample names are unique. If unique, replaces the values in self.samples_df.
 
     Parameters:
         replace_dict (dict): Dictionary mapping old names (keys) to new names (values).
-            All keys found in sample names will be replaced with their
+            All keys found in sample names will be replaced with their
             corresponding values.
             e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}
 
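These docstring hunks are whitespace-only, but the behavior they describe is easy to pin down: map names through the dict, then refuse to commit unless the results are unique. A pure-Python sketch of that core (the real method also logs and writes samples_df back):

    def replace_names(current_names, replace_dict):
        # Map each name through the dict, leaving unmatched names as-is,
        # then require uniqueness before committing.
        new_names = [replace_dict.get(n, n) for n in current_names]
        if len(set(new_names)) != len(new_names):
            raise ValueError("Resulting sample names are not unique.")
        return new_names

    assert replace_names(["old_name1", "keep"], {"old_name1": "new_name1"}) == ["new_name1", "keep"]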
@@ -1348,22 +1368,22 @@ def sample_name_replace(self, replace_dict):
     """
     if not isinstance(replace_dict, dict):
         raise ValueError("replace_dict must be a dictionary")
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     if not replace_dict:
         self.logger.warning("Empty replace_dict provided, no changes made.")
         return
 
     # Get current sample names
     current_names = self.samples_df.get_column("sample_name").to_list()
-
+
     # Create a copy and apply replacements
     new_names = []
     replaced_count = 0
-
+
     for name in current_names:
         if name in replace_dict:
             new_names.append(replace_dict[name])
@@ -1371,7 +1391,7 @@ def sample_name_replace(self, replace_dict):
             self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
         else:
             new_names.append(name)
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
@@ -1382,19 +1402,19 @@ def sample_name_replace(self, replace_dict):
         else:
             seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully replaced {replaced_count} sample names")
 
 
 def sample_name_reset(self):
     """
     Reset sample names to the basename of sample_path without extensions.
-
+
     Takes all paths in self.samples_df['sample_path'], extracts the basename,
     removes file extensions, and checks that all resulting names are unique.
     If unique, replaces the values in self.samples_df['sample_name'].
@@ -1407,31 +1427,31 @@ def sample_name_reset(self):
         RuntimeError: If any sample_path is None or empty
     """
     import os
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
 
     # Get current sample paths
     sample_paths = self.samples_df.get_column("sample_path").to_list()
-
+
     # Extract basenames without extensions
     new_names = []
-
+
     for i, path in enumerate(sample_paths):
         if path is None or path == "":
             raise RuntimeError(f"Sample at index {i} has no sample_path set")
-
+
         # Get basename and remove extension(s)
         basename = os.path.basename(path)
         # Remove all extensions (handles cases like .tar.gz, .sample5.gz, etc.)
         name_without_ext = basename
-        while
+        while "." in name_without_ext:
             name_without_ext = os.path.splitext(name_without_ext)[0]
-
+
         new_names.append(name_without_ext)
         self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
-
+
     # Check that all new names are unique
     if len(set(new_names)) != len(new_names):
         duplicates = []
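The completed `while` condition above is the whole trick of `sample_name_reset`: peel extensions until none remain, so multi-suffix files like `.sample5.gz` reduce to a clean stem. A runnable sketch (note it also strips any dots that are part of the stem itself):

    import os

    def strip_all_extensions(path):
        # Mirrors sample_name_reset: basename first, then peel
        # extensions one at a time (.tar.gz, .sample5.gz, ...).
        name = os.path.basename(path)
        while "." in name:
            name = os.path.splitext(name)[0]
        return name

    assert strip_all_extensions("/data/run01.sample5.gz") == "run01"
    assert strip_all_extensions("blank.mzML") == "blank"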
@@ -1442,12 +1462,12 @@ def sample_name_reset(self):
         else:
             seen.add(name)
         raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
-
+
     # If we get here, all names are unique - apply the changes
     self.samples_df = self.samples_df.with_columns(
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-
+
     self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
 
 
@@ -1704,7 +1724,7 @@ def features_select(
     if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
         min_coherence, max_coherence = chrom_coherence
         filter_conditions.append(
-            (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+            (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
         )
     else:
         filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
@@ -1717,7 +1737,7 @@ def features_select(
     if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
         min_prominence, max_prominence = chrom_prominence
         filter_conditions.append(
-            (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+            (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
         )
     else:
         filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
@@ -1731,7 +1751,7 @@ def features_select(
         min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
         filter_conditions.append(
             (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
-            & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+            & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
         )
     else:
         filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
@@ -1745,7 +1765,7 @@ def features_select(
         min_height_scaled, max_height_scaled = chrom_height_scaled
         filter_conditions.append(
             (pl.col("chrom_height_scaled") >= min_height_scaled)
-            & (pl.col("chrom_height_scaled") <= max_height_scaled)
+            & (pl.col("chrom_height_scaled") <= max_height_scaled),
         )
     else:
         filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
@@ -1852,7 +1872,7 @@ def features_filter(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features."
+            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.",
         )
     else:
         self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
@@ -1929,7 +1949,7 @@ def features_delete(self, features):
     # Single comprehensive log message
     if mapping_removed_count > 0:
         self.logger.info(
-            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}"
+            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}",
         )
     else:
         self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
@@ -1994,7 +2014,7 @@ def consensus_select(
     # Filter by m/z
     if mz is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(mz, tuple) and len(mz) == 2:
             # Check if second value is smaller than first (indicating mz, mz_tol format)
             if mz[1] < mz[0]:
@@ -2008,18 +2028,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             # Single float value - use default mz tolerance from study parameters
-            default_mz_tol = getattr(self,
-            if default_mz_tol and hasattr(default_mz_tol,
+            default_mz_tol = getattr(self, "parameters", None)
+            if default_mz_tol and hasattr(default_mz_tol, "eic_mz_tol"):
                 default_mz_tol = default_mz_tol.eic_mz_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_mz_tol = align_defaults().mz_max_diff
-
+
             min_mz = mz - default_mz_tol
             max_mz = mz + default_mz_tol
             consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
-
+
         self.logger.debug(
             f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
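The restored lines make the m/z fallback chain in `consensus_select` explicit: `self.parameters.eic_mz_tol` when present, otherwise `align_defaults().mz_max_diff`. The tuple convention in this branch is also worth a sketch, since it is easy to misread: a 2-tuple whose second element is smaller than its first means (center, tolerance), otherwise it is a (min, max) range:

    def mz_window(mz, default_tol=0.01):
        # Sketch of consensus_select's m/z interpretation (default_tol
        # stands in for the study-parameter/align_defaults fallback).
        if isinstance(mz, tuple) and len(mz) == 2:
            if mz[1] < mz[0]:                 # (mz, mz_tol) form
                return mz[0] - mz[1], mz[0] + mz[1]
            return mz                          # (min_mz, max_mz) form
        return mz - default_tol, mz + default_tol  # bare float

    assert mz_window((150.0, 0.5)) == (149.5, 150.5)
    assert mz_window((100.0, 200.0)) == (100.0, 200.0)
    assert mz_window(300.0, default_tol=1.0) == (299.0, 301.0)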
@@ -2027,7 +2048,7 @@ def consensus_select(
     # Filter by retention time
     if rt is not None:
         consensus_len_before_filter = len(consensus)
-
+
         if isinstance(rt, tuple) and len(rt) == 2:
             # Check if second value is smaller than first (indicating rt, rt_tol format)
             if rt[1] < rt[0]:
@@ -2041,18 +2062,19 @@ def consensus_select(
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             # Single float value - use default rt tolerance from study parameters
-            default_rt_tol = getattr(self,
-            if default_rt_tol and hasattr(default_rt_tol,
+            default_rt_tol = getattr(self, "parameters", None)
+            if default_rt_tol and hasattr(default_rt_tol, "eic_rt_tol"):
                 default_rt_tol = default_rt_tol.eic_rt_tol
             else:
                 # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
+
                 default_rt_tol = align_defaults().rt_max_diff
-
+
             min_rt = rt - default_rt_tol
             max_rt = rt + default_rt_tol
             consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
-
+
         self.logger.debug(
             f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
@@ -2077,7 +2099,7 @@ def consensus_select(
             # Treat as range
             min_uid, max_uid = consensus_uid
             consensus = consensus.filter(
-                (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid)
+                (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid),
             )
         else:
             # Treat as list
@@ -2105,7 +2127,7 @@ def consensus_select(
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_samples, max_samples = number_samples
             consensus = consensus.filter(
-                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples)
+                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples),
             )
         else:
             consensus = consensus.filter(pl.col("number_samples") >= number_samples)
@@ -2163,7 +2185,7 @@ def consensus_select(
             min_coherence, max_coherence = chrom_coherence_mean
             consensus = consensus.filter(
                 (pl.col("chrom_coherence_mean") >= min_coherence)
-                & (pl.col("chrom_coherence_mean") <= max_coherence)
+                & (pl.col("chrom_coherence_mean") <= max_coherence),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
@@ -2181,7 +2203,7 @@ def consensus_select(
             min_prominence, max_prominence = chrom_prominence_mean
             consensus = consensus.filter(
                 (pl.col("chrom_prominence_mean") >= min_prominence)
-                & (pl.col("chrom_prominence_mean") <= max_prominence)
+                & (pl.col("chrom_prominence_mean") <= max_prominence),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
@@ -2199,7 +2221,7 @@ def consensus_select(
             min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
             consensus = consensus.filter(
                 (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
-                & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled)
+                & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
@@ -2217,7 +2239,7 @@ def consensus_select(
             min_height_scaled, max_height_scaled = chrom_height_scaled_mean
             consensus = consensus.filter(
                 (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
-                & (pl.col("chrom_height_scaled_mean") <= max_height_scaled)
+                & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
             )
         else:
             consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
@@ -2234,7 +2256,7 @@ def consensus_select(
         if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
             min_rt_delta, max_rt_delta = rt_delta_mean
             consensus = consensus.filter(
-                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta)
+                (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta),
             )
         else:
             consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
@@ -2261,10 +2283,10 @@ def consensus_select(
         # Multiple columns
         valid_columns = [col for col in sortby if col in consensus.columns]
         invalid_columns = [col for col in sortby if col not in consensus.columns]
-
+
         if invalid_columns:
             self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
-
+
         if valid_columns:
             consensus = consensus.sort(valid_columns, descending=descending)
         else:
@@ -2355,7 +2377,7 @@ def consensus_filter(self, consensus):
 
     removed_consensus_count = initial_consensus_count - len(self.consensus_df)
     self.logger.info(
-        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}"
+        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}",
     )
 
 
@@ -2485,7 +2507,9 @@ def samples_select(
         if len(sample_batch) == 2 and not isinstance(sample_batch, list):
             # Treat as range
             min_batch, max_batch = sample_batch
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_batch").is_in(sample_batch))
@@ -2501,7 +2525,9 @@ def samples_select(
         if len(sample_sequence) == 2 and not isinstance(sample_sequence, list):
             # Treat as range
             min_seq, max_seq = sample_sequence
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
+            )
         else:
             # Treat as list
             filter_conditions.append(pl.col("sample_sequence").is_in(sample_sequence))
@@ -2515,7 +2541,9 @@ def samples_select(
     if "num_features" in available_columns:
         if isinstance(num_features, tuple) and len(num_features) == 2:
             min_features, max_features = num_features
-            filter_conditions.append(
+            filter_conditions.append(
+                (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
+            )
         else:
             filter_conditions.append(pl.col("num_features") >= num_features)
     else:
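The three `samples_select` hunks above restore range conditions that were previously single long lines (truncated in this view); each appends a polars expression to `filter_conditions`. The combination step is outside these hunks, but such a list is conventionally AND-reduced into one filter; a sketch under that assumption:

    from functools import reduce
    import polars as pl

    df = pl.DataFrame({"sample_batch": [1, 2, 3], "num_features": [10, 50, 5]})
    filter_conditions = [
        (pl.col("sample_batch") >= 1) & (pl.col("sample_batch") <= 2),
        pl.col("num_features") >= 10,
    ]
    # AND-combine all accumulated conditions in one filter call.
    selected = df.filter(reduce(lambda a, b: a & b, filter_conditions))
    print(selected)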
@@ -2572,15 +2600,15 @@ def samples_select(
 def samples_delete(self, samples):
     """
     Delete samples and all related data from the study based on sample identifiers.
-
-    This function eliminates all data related to the specified samples (and their sample_uids)
+
+    This function eliminates all data related to the specified samples (and their sample_uids)
     from all dataframes including:
     - samples_df: Removes the sample rows
     - features_df: Removes all features belonging to these samples
     - consensus_mapping_df: Removes mappings for features from these samples
     - consensus_ms2: Removes MS2 spectra for features from these samples
     - feature_maps: Removes the corresponding feature maps
-
+
     Also updates map_id values to maintain sequential indices after deletion.
 
     Parameters:
@@ -2642,10 +2670,10 @@ def samples_delete(self, samples):
 
     # Get map_ids to remove from feature_maps (needed before samples_df deletion)
     map_ids_to_remove = []
-    if hasattr(self,
+    if hasattr(self, "feature_maps") and self.feature_maps is not None:
         # Get map_ids for samples to be deleted
         map_ids_df = self.samples_df.filter(
-            pl.col("sample_uid").is_in(sample_uids_to_remove)
+            pl.col("sample_uid").is_in(sample_uids_to_remove),
         ).select("map_id")
         if not map_ids_df.is_empty():
             map_ids_to_remove = map_ids_df["map_id"].to_list()
@@ -2683,7 +2711,7 @@ def samples_delete(self, samples):
 
     # 5. Remove from feature_maps and update map_id
     removed_maps_count = 0
-    if hasattr(self,
+    if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
         # Remove feature maps in reverse order to maintain indices
         for map_id in sorted(map_ids_to_remove, reverse=True):
             if 0 <= map_id < len(self.feature_maps):
@@ -2694,7 +2722,7 @@ def samples_delete(self, samples):
     if len(self.samples_df) > 0:
         new_map_ids = list(range(len(self.samples_df)))
         self.samples_df = self.samples_df.with_columns(
-            pl.lit(new_map_ids).alias("map_id")
+            pl.lit(new_map_ids).alias("map_id"),
         )
 
     # Calculate and log results
@@ -2705,16 +2733,16 @@ def samples_delete(self, samples):
     summary_parts = [
         f"Deleted {removed_sample_count} samples",
     ]
-
+
     if removed_features_count > 0:
         summary_parts.append(f"{removed_features_count} features")
-
+
     if removed_mapping_count > 0:
         summary_parts.append(f"{removed_mapping_count} consensus mappings")
-
+
     if removed_ms2_count > 0:
         summary_parts.append(f"{removed_ms2_count} MS2 spectra")
-
+
     if removed_maps_count > 0:
         summary_parts.append(f"{removed_maps_count} feature maps")
 
@@ -2735,14 +2763,14 @@ def samples_delete(self, samples):
 def sample_color(self, by=None, palette="Turbo256"):
     """
     Set sample colors in the sample_color column of samples_df.
-
+
     When a new sample is added, this function resets all colors picking from the specified palette.
     The default palette is Turbo256.
 
     Parameters:
         by (str or list, optional): Property to base colors on. Options:
             - 'sample_uid': Use sample_uid values to assign colors
-            - 'sample_index': Use sample index (position) to assign colors
+            - 'sample_index': Use sample index (position) to assign colors
             - 'sample_type': Use sample_type values to assign colors
             - 'sample_name': Use sample_name values to assign colors
             - list of colors: Use provided list of hex color codes
@@ -2755,7 +2783,7 @@ def sample_color(self, by=None, palette="Turbo256"):
             - 'Magma256': Magma colormap (256 colors, perceptually uniform)
             - 'Cividis256': Cividis colormap (256 colors, colorblind-friendly)
             - 'Set1': Qualitative palette (9 distinct colors)
-            - 'Set2': Qualitative palette (8 distinct colors)
+            - 'Set2': Qualitative palette (8 distinct colors)
             - 'Set3': Qualitative palette (12 distinct colors)
             - 'Tab10': Tableau 10 palette (10 distinct colors)
             - 'Tab20': Tableau 20 palette (20 distinct colors)
@@ -2766,7 +2794,7 @@ def sample_color(self, by=None, palette="Turbo256"):
             - 'Coolwarm': Cool-warm diverging colormap
             - 'Seismic': Seismic diverging colormap
             - Any other colormap name supported by the cmap library
-
+
             For a complete catalog of available colormaps, see:
             https://cmap-docs.readthedocs.io/en/latest/catalog/
 
@@ -2776,10 +2804,10 @@ def sample_color(self, by=None, palette="Turbo256"):
     Example:
         # Set colors based on sample type
        study.sample_color(by='sample_type', palette='Set1')
-
+
         # Set colors using a custom color list
         study.sample_color(by=['#FF0000', '#00FF00', '#0000FF'])
-
+
         # Reset to default Turbo256 sequential colors
         study.sample_color()
     """
@@ -2788,11 +2816,13 @@ def sample_color(self, by=None, palette="Turbo256"):
         return
 
     sample_count = len(self.samples_df)
-
+
     # Handle custom color list
     if isinstance(by, list):
         if len(by) < sample_count:
-            self.logger.warning(
+            self.logger.warning(
+                f"Provided color list has {len(by)} colors but {sample_count} samples. Repeating colors.",
+            )
         # Cycle through the provided colors if there aren't enough
         colors = []
         for i in range(sample_count):
@@ -2808,10 +2838,10 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==
+
+    elif by == "sample_uid":
         # Use sample_uid to determine position in evenly sampled colormap
-        sample_uids = self.samples_df[
+        sample_uids = self.samples_df["sample_uid"].to_list()
         try:
             # Sample colors evenly for the number of samples
             palette_colors = _sample_colors_from_colormap(palette, sample_count)
@@ -2823,29 +2853,29 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==
+
+    elif by == "sample_index":
         # Use sample index (position in DataFrame) with evenly sampled colors
         try:
             colors = _sample_colors_from_colormap(palette, sample_count)
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==
+
+    elif by == "sample_type":
         # Use sample_type to assign colors - same type gets same color
         # Sample colors evenly across colormap for unique types
-        sample_types = self.samples_df[
-        unique_types = list(
-
+        sample_types = self.samples_df["sample_type"].to_list()
+        unique_types = list({t for t in sample_types if t is not None})
+
         try:
             # Sample colors evenly for unique types
             type_colors = _sample_colors_from_colormap(palette, len(unique_types))
             type_to_color = {}
-
+
             for i, sample_type in enumerate(unique_types):
                 type_to_color[sample_type] = type_colors[i]
-
+
             colors = []
             for sample_type in sample_types:
                 if sample_type is None:
@@ -2856,21 +2886,21 @@ def sample_color(self, by=None, palette="Turbo256"):
         except ValueError as e:
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
-
-    elif by ==
+
+    elif by == "sample_name":
         # Use sample_name to assign colors - same name gets same color (unlikely but possible)
         # Sample colors evenly across colormap for unique names
-        sample_names = self.samples_df[
-        unique_names = list(
-
+        sample_names = self.samples_df["sample_name"].to_list()
+        unique_names = list({n for n in sample_names if n is not None})
+
         try:
             # Sample colors evenly for unique names
             name_colors = _sample_colors_from_colormap(palette, len(unique_names))
             name_to_color = {}
-
+
             for i, sample_name in enumerate(unique_names):
                 name_to_color[sample_name] = name_colors[i]
-
+
             colors = []
             for sample_name in sample_names:
                 if sample_name is None:
@@ -2882,14 +2912,16 @@ def sample_color(self, by=None, palette="Turbo256"):
             self.logger.error(f"Error sampling colors from colormap: {e}")
             return
     else:
-        self.logger.error(
+        self.logger.error(
+            f"Invalid by value: {by}. Must be 'sample_uid', 'sample_index', 'sample_type', 'sample_name', a list of colors, or None.",
+        )
         return
 
     # Update the sample_color column
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("sample_color", colors).alias("sample_color")
+        pl.Series("sample_color", colors).alias("sample_color"),
     )
-
+
     if isinstance(by, list):
         self.logger.debug(f"Set sample colors using provided color list ({len(by)} colors)")
     elif by is None:
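The restored `elif` branches all share one pattern: collect the column, take its distinct non-None values, assign each a palette color, then map rows through that lookup. A compact sketch of the per-category assignment (set iteration order is arbitrary, as in the restored code; the modulo cycling and the None placeholder are simplifications of the real method's handling):

    def colors_for_categories(values, palette_colors):
        # One color per distinct non-None value; rows look up by value.
        unique = list({v for v in values if v is not None})
        lookup = {v: palette_colors[i % len(palette_colors)] for i, v in enumerate(unique)}
        return [lookup.get(v) for v in values]

    print(colors_for_categories(["QC", "blank", "QC", None], ["#ff0000", "#00ff00"]))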
@@ -2901,28 +2933,28 @@ def sample_color(self, by=None, palette="Turbo256"):
 def sample_color_reset(self):
     """
     Reset sample colors to default coloring using the 'turbo' colormap.
-
+
     This function assigns colors by distributing samples evenly across the full
     turbo colormap range, ensuring maximum color diversity and visual distinction
     between samples.
-
+
     Returns:
         None (modifies self.samples_df in place)
     """
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     try:
         from cmap import Colormap
-
+
         # Use turbo colormap
-        cm = Colormap(
-
+        cm = Colormap("turbo")
+
         # Get sample count and assign colors evenly distributed across colormap
         n_samples = len(self.samples_df)
         colors = []
-
+
         # Distribute samples evenly across the full colormap range
         for i in range(n_samples):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
@@ -2930,9 +2962,9 @@ def sample_color_reset(self):
             # Optionally, map to a subset of colormap to avoid extreme colors
             # Use 10% to 90% of colormap range for better color diversity
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
@@ -2941,14 +2973,14 @@ def sample_color_reset(self):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 colors.append(hex_color)
-
+
         # Update the sample_color column
         self.samples_df = self.samples_df.with_columns(
-            pl.Series("sample_color", colors).alias("sample_color")
+            pl.Series("sample_color", colors).alias("sample_color"),
         )
-
+
         self.logger.debug(f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)")
-
+
     except ImportError:
         self.logger.error("cmap library is required for sample color reset. Install with: pip install cmap")
     except Exception as e:
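`sample_color_reset` spaces samples at bin centers of the turbo colormap and then squeezes them into the 10%-90% range so no sample lands on a near-black or near-white endpoint; the restored lines show the same RGBA-to-hex conversion used throughout this file. A dependency-free sketch (any callable mapping [0, 1] to RGBA floats can stand in for `cmap.Colormap`):

    def sample_hex_colors(cmap_fn, n):
        colors = []
        for i in range(n):
            t = (i + 0.5) / n          # center each sample in its bin
            t = 0.1 + t * 0.8          # avoid colormap endpoints
            r, g, b = cmap_fn(t)[:3]   # RGBA floats in [0, 1]
            colors.append(f"#{int(r * 255):02x}{int(g * 255):02x}{int(b * 255):02x}")
        return colors

    # Toy linear "colormap" so the sketch runs without the cmap package.
    print(sample_hex_colors(lambda t: (t, 0.5, 1.0 - t, 1.0), 3))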
@@ -2958,13 +2990,13 @@ def sample_color_reset(self):
 def _get_color_palette(palette_name):
     """
     Get color palette as a list of hex color codes using the cmap library.
-
+
     Parameters:
         palette_name (str): Name of the palette
-
+
     Returns:
         list: List of hex color codes
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -2972,40 +3004,38 @@ def _get_color_palette(palette_name):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         # Determine number of colors to generate
         if "256" in palette_name:
             n_colors = 256
@@ -3021,7 +3051,7 @@ def _get_color_palette(palette_name):
             n_colors = 20
         else:
             n_colors = 256  # Default for continuous colormaps
-
+
         # Generate colors
         if n_colors <= 20:
             # For discrete palettes, use evenly spaced indices
@@ -3029,11 +3059,11 @@ def _get_color_palette(palette_name):
         else:
             # For continuous palettes, use full range
             indices = [i / (n_colors - 1) for i in range(n_colors)]
-
+
         # Get colors as RGBA and convert to hex
         colors = cm(indices)
         hex_colors = []
-
+
         for color in colors:
             if len(color) >= 3:  # RGBA or RGB
                 r, g, b = color[:3]
@@ -3042,25 +3072,26 @@ def _get_color_palette(palette_name):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 hex_colors.append(hex_color)
-
+
         return hex_colors
-
+
     except Exception as e:
-        raise ValueError(
-
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _sample_colors_from_colormap(palette_name, n_colors):
     """
     Sample colors evenly from the whole colormap range, similar to sample_color_reset.
-
+
     Parameters:
         palette_name (str): Name of the palette/colormap
         n_colors (int): Number of colors to sample
-
+
     Returns:
         list: List of hex color codes sampled evenly from the colormap
-
+
     Raises:
         ValueError: If palette_name is not supported
     """
@@ -3068,51 +3099,49 @@ def _sample_colors_from_colormap(palette_name, n_colors):
         from cmap import Colormap
     except ImportError:
         raise ValueError("cmap library is required for color palettes. Install with: pip install cmap")
-
+
     # Map common palette names to cmap names (same as _get_color_palette)
     palette_mapping = {
         # Scientific colormaps
         "Turbo256": "turbo",
-        "Viridis256": "viridis",
+        "Viridis256": "viridis",
         "Plasma256": "plasma",
         "Inferno256": "inferno",
         "Magma256": "magma",
         "Cividis256": "cividis",
-
         # Qualitative palettes
         "Set1": "Set1",
-        "Set2": "Set2",
+        "Set2": "Set2",
         "Set3": "Set3",
         "Tab10": "tab10",
         "Tab20": "tab20",
         "Dark2": "Dark2",
         "Paired": "Paired",
-
         # Additional useful palettes
         "Spectral": "Spectral",
         "Rainbow": "rainbow",
         "Coolwarm": "coolwarm",
         "Seismic": "seismic",
     }
-
+
     # Get the cmap name
     cmap_name = palette_mapping.get(palette_name, palette_name.lower())
-
+
     try:
         # Create colormap
         cm = Colormap(cmap_name)
-
+
         colors = []
-
+
         # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
         for i in range(n_colors):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
             normalized_value = (i + 0.5) / n_colors  # +0.5 to center samples in their bins
             # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
             normalized_value = 0.1 + (normalized_value * 0.8)
-
+
             color_rgba = cm(normalized_value)
-
+
             # Convert RGBA to hex
             if len(color_rgba) >= 3:
                 r, g, b = color_rgba[:3]
@@ -3121,12 +3150,13 @@ def _sample_colors_from_colormap(palette_name, n_colors):
                 r, g, b = int(r * 255), int(g * 255), int(b * 255)
                 hex_color = f"#{r:02x}{g:02x}{b:02x}"
                 colors.append(hex_color)
-
+
         return colors
-
+
     except Exception as e:
-        raise ValueError(
-
+        raise ValueError(
+            f"Failed to create colormap '{cmap_name}': {e}. Available palettes: {list(palette_mapping.keys())}",
+        )
 
 
 def _matplotlib_to_hex(color_dict):
@@ -3135,32 +3165,32 @@ def _matplotlib_to_hex(color_dict):
 
 
 # =====================================================================================
-# SCHEMA AND DATA STRUCTURE FUNCTIONS
+# SCHEMA AND DATA STRUCTURE FUNCTIONS
 # =====================================================================================
 
 
 def _ensure_features_df_schema_order(self):
     """
     Ensure features_df columns are ordered according to study5_schema.json.
-
+
     This method should be called after operations that might scramble the column order.
     """
     if self.features_df is None or self.features_df.is_empty():
         return
-
+
     try:
         import os
         import json
         from masster.study.h5 import _reorder_columns_by_schema
-
+
         # Load schema
         schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
-        with open(schema_path
+        with open(schema_path) as f:
             schema = json.load(f)
-
+
         # Reorder columns to match schema
-        self.features_df = _reorder_columns_by_schema(self.features_df, schema,
-
+        self.features_df = _reorder_columns_by_schema(self.features_df, schema, "features_df")
+
     except Exception as e:
         self.logger.warning(f"Failed to reorder features_df columns: {e}")
 
@@ -3168,38 +3198,38 @@ def _ensure_features_df_schema_order(self):
 def migrate_map_id_to_index(self):
     """
     Migrate map_id from string-based OpenMS unique IDs to integer indices.
-
+
     This function converts the map_id column from string type (with OpenMS unique IDs)
     to integer type where each map_id corresponds to the index of the feature map
     in self.features_maps.
-
+
     This migration is needed for studies that were created before the map_id format
     change from OpenMS unique IDs to feature map indices.
     """
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.warning("No samples to migrate")
         return
-
+
     # Check if migration is needed
-    current_dtype = self.samples_df[
+    current_dtype = self.samples_df["map_id"].dtype
     if current_dtype == pl.Int64:
         self.logger.info("map_id column is already Int64 type - no migration needed")
         return
-
+
     self.logger.info("Migrating map_id from string-based OpenMS IDs to integer indices")
-
+
     # Create new map_id values based on sample order
     # Each sample gets a map_id that corresponds to its position in features_maps
     sample_count = len(self.samples_df)
     new_map_ids = list(range(sample_count))
-
+
     # Update the map_id column
     self.samples_df = self.samples_df.with_columns(
-        pl.lit(new_map_ids).alias("map_id")
+        pl.lit(new_map_ids).alias("map_id"),
     )
-
+
     # Ensure the column is Int64 type
     self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
-
+
     self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format")
     self.logger.info(f"map_id now ranges from 0 to {sample_count - 1}")