masster-0.5.22-py3-none-any.whl → masster-0.5.24-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/study/load.py
CHANGED
@@ -34,13 +34,7 @@ except ImportError:
 import glob
 
 
-def add(
-    self,
-    folder=None,
-    reset=False,
-    adducts=None,
-    max_files=None
-):
+def add(self, folder=None, reset=False, adducts=None, max_files=None):
     """Add samples from a folder to the study.
 
     Args:
@@ -91,9 +85,7 @@ def add(
 
         if len(files) > 0:
             # Limit files if max_files is specified
-            remaining_slots = (
-                max_files - counter if max_files is not None else len(files)
-            )
+            remaining_slots = max_files - counter if max_files is not None else len(files)
             files = files[:remaining_slots]
 
             self.logger.debug(f"Found {len(files)} {ext} files")
@@ -119,11 +111,8 @@ def add(
             self.logger.debug(
                 f"Batch processing {len(files_to_process)} {ext} files",
             )
-            successful = _add_samples_batch(
-                files_to_process,
-                reset=reset,
-                adducts=adducts,
-                blacklist=blacklist
+            successful = _add_samples_batch(
+                self, files_to_process, reset=reset, adducts=adducts, blacklist=blacklist
             )
             counter += successful
             if successful > 0:
@@ -171,7 +160,6 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         skip_schema_check=True,  # Skip schema check for performance (safe with diagonal concat)
     )
 
-
     return success
 
 
@@ -200,19 +188,20 @@ def load(self, filename=None):
 
     # self.logger.info(f"Loading study from {filename}")
     from masster.study.h5 import _load_study5
+
     _load_study5(self, filename)
-
+
     # After loading the study, check if we have consensus features before loading consensus XML
-    #if (self.consensus_df is not None and not self.consensus_df.is_empty()):
+    # if (self.consensus_df is not None and not self.consensus_df.is_empty()):
     #     consensus_xml_path = filename.replace(".study5", ".consensusXML")
     #     if os.path.exists(consensus_xml_path):
     #         self._load_consensusXML(filename=consensus_xml_path)
-
+    #         self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
     #     else:
     #         self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
-    #else:
+    # else:
     #     self.logger.debug("No consensus features found, skipping consensusXML loading")
-
+
     self.filename = filename
 
 
@@ -250,25 +239,24 @@ def _fill_chrom_single_impl(
     if isinstance(min_samples_abs, int) and min_samples_abs >= 0:
         min_number_abs = int(min_samples_abs) if min_samples_abs > 0 else 0
         min_number = max(min_number_rel, min_number_abs)
-
+
     # Special case: if min_samples_abs is explicitly 0, allow 0-sample features (like library features)
     if isinstance(min_samples_abs, int) and min_samples_abs == 0:
         min_number = 0
-
+
     self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
 
     if min_number > 0:
         original_count = len(uids)
         uids = self.consensus_df.filter(
-            (pl.col("number_samples") >= min_number)
-            & (pl.col("consensus_uid").is_in(uids)),
+            (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
        )["consensus_uid"].to_list()
         self.logger.debug(
             f"Features to fill: {original_count} -> {len(uids)}",
         )
     self.logger.debug("Identifying missing features...")
     # Instead of building full chromatogram matrix, identify missing consensus/sample combinations directly
-    missing_combinations = _get_missing_consensus_sample_combinations(self,uids)
+    missing_combinations = _get_missing_consensus_sample_combinations(self, uids)
     if not missing_combinations:
         self.logger.info("No missing features found to fill.")
         return
@@ -335,12 +323,12 @@ def _fill_chrom_single_impl(
             if ms1_data is None or ms1_data.is_empty():
                 self.logger.warning(f"No MS1 data found for sample {sample_name}")
                 continue
-
+
             # Create a temporary object to hold the MS1 data for processing
             class TempSample:
                 def __init__(self, ms1_df):
                     self.ms1_df = ms1_df
-
+
             file = TempSample(ms1_data)
         except Exception as e:
             self.logger.warning(f"Failed to load sample {sample_name}: {e}")
@@ -363,31 +351,25 @@ def _fill_chrom_single_impl(
             # Special handling for RT=0 (library-derived features)
             if rt == 0.0:
                 # Step 1: Retrieve full chromatogram for the m/z
-                d_full = file.ms1_df.filter(
-                    (pl.col("mz") >= mz - mz_tol)
-                    & (pl.col("mz") <= mz + mz_tol)
-                )
-
+                d_full = file.ms1_df.filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
+
                 if not d_full.is_empty():
                     # Step 2: Find maximum intensity and its RT
-                    max_inty_row = d_full.filter(
-                        pl.col("inty") == d_full["inty"].max()
-                    ).head(1)
-
+                    max_inty_row = d_full.filter(pl.col("inty") == d_full["inty"].max()).head(1)
+
                     if not max_inty_row.is_empty():
                         max_rt = max_inty_row["rt"].item()
-
+
                         # Get eic_rt_tol from sample parameters if available
                         eic_rt_tol = rt_tol  # Default fallback
-                        if hasattr(file, 'parameters') and hasattr(file.parameters, 'eic_rt_tol'):
+                        if hasattr(file, "parameters") and hasattr(file.parameters, "eic_rt_tol"):
                             eic_rt_tol = file.parameters.eic_rt_tol
-
+
                         # Step 3: Trim around max intensity using eic_rt_tol
                         d = d_full.filter(
-                            (pl.col("rt") >= max_rt - eic_rt_tol)
-                            & (pl.col("rt") <= max_rt + eic_rt_tol)
+                            (pl.col("rt") >= max_rt - eic_rt_tol) & (pl.col("rt") <= max_rt + eic_rt_tol)
                         )
-
+
                         # Update consensus RT info based on discovered peak
                         rt = max_rt
                         rt_start_mean = max_rt - eic_rt_tol
@@ -529,10 +511,7 @@ def _fill_chrom_single_impl(
     for row in rows_to_add:
         # Cast numeric columns to ensure consistency
        for key, value in row.items():
-            if (
-                key in ["mz", "rt", "intensity", "area", "height"]
-                and value is not None
-            ):
+            if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
                 row[key] = float(value)
             elif key in ["sample_id", "feature_id"] and value is not None:
                 row[key] = int(value)
@@ -618,67 +597,64 @@ def _build_rt_correction_mapping_per_sample(self, sample_uid):
     """
     Pre-compute RT correction mapping for a sample by getting all non-filled features.
     This avoids repeated DataFrame filtering for each feature.
-
+
     Args:
         sample_uid: Sample UID to build mapping for
-
+
     Returns:
         Polars DataFrame with rt, rt_original, and rt_delta columns, sorted by rt
         Returns empty DataFrame if no reference features found
     """
     # Get non-filled features from the same sample
-    if 'filled' in self.features_df.columns:
+    if "filled" in self.features_df.columns:
         sample_features = self.features_df.filter(
-            (pl.col('sample_uid') == sample_uid) &
-            (pl.col('filled') == False) &
-            (pl.col('rt_original').is_not_null()) &
-            (pl.col('rt').is_not_null())
+            (pl.col("sample_uid") == sample_uid)
+            & (pl.col("filled") == False)
+            & (pl.col("rt_original").is_not_null())
+            & (pl.col("rt").is_not_null())
         )
     else:
         # If no filled column, assume all existing features are non-filled
         sample_features = self.features_df.filter(
-            (pl.col('sample_uid') == sample_uid) &
-            (pl.col('rt_original').is_not_null()) &
-            (pl.col('rt').is_not_null())
+            (pl.col("sample_uid") == sample_uid) & (pl.col("rt_original").is_not_null()) & (pl.col("rt").is_not_null())
         )
-
+
     if sample_features.is_empty():
-        return pl.DataFrame(schema={
-            'rt': pl.Float64, 'rt_original': pl.Float64, 'rt_delta': pl.Float64})
+        return pl.DataFrame(schema={"rt": pl.Float64, "rt_original": pl.Float64, "rt_delta": pl.Float64})
+
     # Pre-compute RT deltas and sort by RT for efficient lookup
     rt_mapping = sample_features.select([
-        pl.col('rt'),
-        pl.col('rt_original'),
-        (pl.col('rt') - pl.col('rt_original')).alias('rt_delta'),
-    ]).sort('rt')
-
+        pl.col("rt"),
+        pl.col("rt_original"),
+        (pl.col("rt") - pl.col("rt_original")).alias("rt_delta"),
+    ]).sort("rt")
+
     return rt_mapping
 
+
 def _estimate_rt_original_from_mapping(self, rt_mapping, target_rt):
     """
     Fast RT original estimation using pre-computed mapping.
-
+
     Args:
         rt_mapping: Pre-computed RT mapping DataFrame from _build_rt_correction_mapping_per_sample
         target_rt: Target aligned RT for the filled feature
-
+
     Returns:
         Estimated rt_original value, or None if no mapping available
     """
     if rt_mapping.is_empty():
         return None
-
+
     # Find closest RT using vectorized operations
-    rt_mapping_with_diff = rt_mapping.with_columns([
-        (pl.col('rt') - target_rt).abs().alias('rt_diff')
-    ])
-
+    rt_mapping_with_diff = rt_mapping.with_columns([(pl.col("rt") - target_rt).abs().alias("rt_diff")])
+
     # Get the RT delta from the closest feature
-    closest_row = rt_mapping_with_diff.sort('rt_diff').head(1)
+    closest_row = rt_mapping_with_diff.sort("rt_diff").head(1)
     if closest_row.is_empty():
         return None
-
-    closest_rt_delta = closest_row['rt_delta'].item()
+
+    closest_rt_delta = closest_row["rt_delta"].item()
     return target_rt - closest_rt_delta
 
 
@@ -686,59 +662,59 @@ def _estimate_rt_original_for_filled_feature(self, sample_uid, target_rt, logger=None):
     """
     Estimate rt_original for a filled feature by finding the closest non-filled feature
     from the same sample and using its RT delta (rt - rt_original).
-
+
     Args:
         sample_uid: Sample UID to search within
         target_rt: Target aligned RT for the filled feature
         logger: Optional logger for debug messages
-
+
     Returns:
         Estimated rt_original value, or None if no suitable reference found
     """
     # Get non-filled features from the same sample
-    if 'filled' in self.features_df.columns:
+    if "filled" in self.features_df.columns:
         sample_features = self.features_df.filter(
-            (pl.col('sample_uid') == sample_uid) &
-            (pl.col('filled') == False) &
-            (pl.col('rt_original').is_not_null()) &
-            (pl.col('rt').is_not_null())
+            (pl.col("sample_uid") == sample_uid)
+            & (pl.col("filled") == False)
+            & (pl.col("rt_original").is_not_null())
+            & (pl.col("rt").is_not_null())
         )
     else:
         # If no filled column, assume all existing features are non-filled
         sample_features = self.features_df.filter(
-            (pl.col('sample_uid') == sample_uid) &
-            (pl.col('rt_original').is_not_null()) &
-            (pl.col('rt').is_not_null())
+            (pl.col("sample_uid") == sample_uid) & (pl.col("rt_original").is_not_null()) & (pl.col("rt").is_not_null())
         )
-
+
     if sample_features.is_empty():
         if logger:
             logger.debug(f"No reference features found for sample {sample_uid} to estimate rt_original")
         return None
-
+
     # Calculate RT differences and find the closest feature
     sample_features_with_diff = sample_features.with_columns([
-        (pl.col('rt') - target_rt).abs().alias('rt_diff'),
-        (pl.col('rt') - pl.col('rt_original')).alias('rt_delta'),
+        (pl.col("rt") - target_rt).abs().alias("rt_diff"),
+        (pl.col("rt") - pl.col("rt_original")).alias("rt_delta"),
     ])
-
+
     # Find the feature with minimum RT difference
-    closest_feature = sample_features_with_diff.sort(
-        'rt_diff').head(1)
+    closest_feature = sample_features_with_diff.sort("rt_diff").head(1)
+
     if closest_feature.is_empty():
         return None
-
+
     # Get the RT delta from the closest feature
-    closest_rt_diff = closest_feature['rt_diff'].item()
-    closest_rt_delta = closest_feature['rt_delta'].item()
-
+    closest_rt_diff = closest_feature["rt_diff"].item()
+    closest_rt_delta = closest_feature["rt_delta"].item()
+
     # Estimate rt_original using the same delta: rt_original = rt - rt_delta
     estimated_rt_original = target_rt - closest_rt_delta
-
+
     if self.logger:
-        self.logger.debug(
-            f"Estimated rt_original={estimated_rt_original:.3f} for sample {sample_uid}, rt={target_rt:.3f} "
-            f"using closest feature (rt_diff={closest_rt_diff:.3f}, rt_delta={closest_rt_delta:.3f})")
+        self.logger.debug(
+            f"Estimated rt_original={estimated_rt_original:.3f} for sample {sample_uid}, rt={target_rt:.3f} "
+            f"using closest feature (rt_diff={closest_rt_diff:.3f}, rt_delta={closest_rt_delta:.3f})"
+        )
+
     return estimated_rt_original
 
 
@@ -763,7 +739,7 @@ def _process_sample_for_parallel_fill(
     # Get missing features for this sample from precomputed combinations
     sample_missing_df = missing_combinations_df.filter(pl.col("sample_uid") == sample_uid)
     sample_consensus_uids = sample_missing_df["consensus_uid"].to_list()
-
+
     if not sample_consensus_uids:
         return new_features, new_mapping, counter
 
@@ -787,13 +763,15 @@ def _process_sample_for_parallel_fill(
                     feature_end=info["rt_end_mean"],
                     feature_apex=info["rt"],
                 )
-
+
                 new_feature = {
                     "uid": features_df_max_uid + counter,
                     "sample_uid": sample_uid,
                     "mz": info["mz"],
                     "rt": info["rt"],
-                    "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                    "rt_original": 0.0
+                    if info["rt"] == 0.0
+                    else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
                     "mz_centroid": None,
                     "rt_centroid": None,
                     "iso": None,
@@ -811,7 +789,7 @@ def _process_sample_for_parallel_fill(
                     "ms2_scans": None,
                     "ms2_specs": None,
                 }
-
+
                 new_features.append(new_feature)
                 new_mapping.append({
                     "consensus_uid": consensus_uid,
@@ -820,7 +798,7 @@ def _process_sample_for_parallel_fill(
                 })
                 counter += 1
             return new_features, new_mapping, counter
-
+
         except Exception as e:
             # If MS1 loading fails, create empty features
             self.logger.debug(f"Failed to load MS1 data from {sample_path}: {e}")
@@ -836,13 +814,15 @@ def _process_sample_for_parallel_fill(
                     feature_end=info["rt_end_mean"],
                     feature_apex=info["rt"],
                 )
-
+
                 new_feature = {
                     "uid": features_df_max_uid + counter,
                     "sample_uid": sample_uid,
                     "mz": info["mz"],
                     "rt": info["rt"],
-                    "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                    "rt_original": 0.0
+                    if info["rt"] == 0.0
+                    else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
                     "mz_centroid": None,
                     "rt_centroid": None,
                     "iso": None,
@@ -860,7 +840,7 @@ def _process_sample_for_parallel_fill(
                     "ms2_scans": None,
                     "ms2_specs": None,
                 }
-
+
                 new_features.append(new_feature)
                 new_mapping.append({
                     "consensus_uid": consensus_uid,
@@ -874,12 +854,10 @@ def _process_sample_for_parallel_fill(
         all_mzs = [consensus_info[uid]["mz"] for uid in sample_consensus_uids]
         mz_min = min(all_mzs) - mz_tol
         mz_max = max(all_mzs) + mz_tol
-
+
         # Pre-filter by broad m/z range
-        ms1_filtered = ms1_data.filter(
-            (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
-        )
-
+        ms1_filtered = ms1_data.filter((pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max))
+
         # Early exit if no data in m/z range
         if ms1_filtered.is_empty():
             for i, consensus_uid in enumerate(sample_consensus_uids):
@@ -894,13 +872,15 @@ def _process_sample_for_parallel_fill(
                     feature_end=info["rt_end_mean"],
                     feature_apex=info["rt"],
                 )
-
+
                 new_feature = {
                     "uid": features_df_max_uid + counter,
                     "sample_uid": sample_uid,
                     "mz": info["mz"],
                     "rt": info["rt"],
-                    "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                    "rt_original": 0.0
+                    if info["rt"] == 0.0
+                    else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
                     "mz_centroid": None,
                     "rt_centroid": None,
                     "iso": None,
@@ -918,7 +898,7 @@ def _process_sample_for_parallel_fill(
                     "ms2_scans": None,
                     "ms2_specs": None,
                 }
-
+
                 new_features.append(new_feature)
                 new_mapping.append({
                     "consensus_uid": consensus_uid,
@@ -932,7 +912,7 @@ def _process_sample_for_parallel_fill(
         for consensus_uid in sample_consensus_uids:
            info = consensus_info[consensus_uid]
             mz, rt = info["mz"], info["rt"]
-
+
             try:
                 if rt == 0.0:
                     # Handle RT=0 features - create empty chromatogram
@@ -951,10 +931,12 @@ def _process_sample_for_parallel_fill(
                 else:
                     # Extract real chromatogram using pre-filtered MS1 data
                     d = ms1_filtered.filter(
-                        (pl.col("mz") >= mz - mz_tol)
-                        & (pl.col("mz") <= mz + mz_tol) & (pl.col("rt") >= rt - rt_tol) & (pl.col("rt") <= rt + rt_tol)
+                        (pl.col("mz") >= mz - mz_tol)
+                        & (pl.col("mz") <= mz + mz_tol)
+                        & (pl.col("rt") >= rt - rt_tol)
+                        & (pl.col("rt") <= rt + rt_tol)
                     )
-
+
                     # Create chromatogram from filtered data
                     if d.is_empty():
                         # No MS1 data found - create empty chromatogram
@@ -972,7 +954,7 @@ def _process_sample_for_parallel_fill(
                     else:
                         # Aggregate intensities per retention time (get max inty per RT)
                         eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
-
+
                         # Create chromatogram with real data and find peaks
                         eic = Chromatogram(
                             eic_rt["rt"].to_numpy(),
@@ -984,15 +966,19 @@ def _process_sample_for_parallel_fill(
                             feature_end=info["rt_end_mean"],
                             feature_apex=rt,
                         ).find_peaks()
-                        best_peak = self._find_best_peak_in_eic(eic, rt, rt_tol) if hasattr(self, "_find_best_peak_in_eic") else None
-
+                        best_peak = (
+                            self._find_best_peak_in_eic(eic, rt, rt_tol)
+                            if hasattr(self, "_find_best_peak_in_eic")
+                            else None
+                        )
+
                         # Create feature with optimized RT original estimation
                         rt_original_estimated = None
                         if rt == 0.0:
                             rt_original_estimated = 0.0  # RT=0 features
                         else:
                             rt_original_estimated = _estimate_rt_original_from_mapping(self, rt_mapping, rt)
-
+
                         new_feature = {
                             "uid": features_df_max_uid + counter,
                             "sample_uid": sample_uid,
@@ -1016,7 +1002,7 @@ def _process_sample_for_parallel_fill(
                             "ms2_scans": None,
                             "ms2_specs": None,
                         }
-
+
                         new_features.append(new_feature)
                         new_mapping.append({
                             "consensus_uid": consensus_uid,
@@ -1024,7 +1010,7 @@ def _process_sample_for_parallel_fill(
                             "feature_uid": features_df_max_uid + counter,
                         })
                         counter += 1
-
+
             except Exception as e:
                 # Skip this feature if extraction fails but log the error
                 self.logger.debug(f"Failed to extract feature {consensus_uid} from {sample_path}: {e}")
@@ -1032,6 +1018,7 @@ def _process_sample_for_parallel_fill(
 
     return new_features, new_mapping, counter
 
+
 def _fill_chrom_impl(
     self,
     uids=None,
@@ -1076,8 +1063,7 @@ def _fill_chrom_impl(
     if min_number > 0:
         original_count = len(uids)
         uids = self.consensus_df.filter(
-            (pl.col("number_samples") >= min_number)
-            & (pl.col("consensus_uid").is_in(uids)),
+            (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
         )["consensus_uid"].to_list()
         self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")
 
@@ -1145,9 +1131,7 @@ def _fill_chrom_impl(
     )
 
     # Calculate current max feature_uid to avoid conflicts
-    features_df_max_uid = (
-        self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
-    )
+    features_df_max_uid = self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
 
     # Process samples in parallel
     all_new_features: list[dict] = []
@@ -1161,7 +1145,8 @@ def _fill_chrom_impl(
         future_to_sample = {}
         for sample_info in samples_to_process:
             future = executor.submit(
-                _process_sample_for_parallel_fill,
+                _process_sample_for_parallel_fill,
+                self,
                 sample_info,
                 consensus_info,
                 uids,
@@ -1223,10 +1208,7 @@ def _fill_chrom_impl(
     for row in rows_to_add:
         # Cast numeric columns to ensure consistency
        for key, value in row.items():
-            if (
-                key in ["mz", "rt", "intensity", "area", "height"]
-                and value is not None
-            ):
+            if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
                 row[key] = float(value)
             elif key in ["sample_id", "feature_id"] and value is not None:
                 row[key] = int(value)
@@ -1254,8 +1236,8 @@ def _fill_chrom_impl(
 
     # Log statistics about rt_original estimation
     if all_new_features:
-        estimated_count = sum(1 for feature in all_new_features if feature.get('rt_original') is not None)
-        none_count = sum(1 for feature in all_new_features if feature.get('rt_original') is None)
+        estimated_count = sum(1 for feature in all_new_features if feature.get("rt_original") is not None)
+        none_count = sum(1 for feature in all_new_features if feature.get("rt_original") is None)
         self.logger.debug(f"Features with estimated rt_original: {estimated_count}")
         self.logger.debug(f"Features with None rt_original: {none_count}")
 
@@ -1288,7 +1270,7 @@ def fill(self, **kwargs):
         kwargs["threads"] = kwargs.pop("workers")
         self.logger.debug("Converted 'workers' parameter to 'threads' for backward compatibility")
     if "num_workers" in kwargs:
-        kwargs["threads"] = kwargs.pop("num_workers")
+        kwargs["threads"] = kwargs.pop("num_workers")
         self.logger.debug("Converted 'num_workers' parameter to 'threads' for backward compatibility")
 
     for key, value in kwargs.items():
@@ -1347,24 +1329,24 @@ def _get_missing_consensus_sample_combinations(self, uids):
         self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))["consensus_uid"].to_list()
     )
     unmapped_consensus_uids = uids_set - mapped_consensus_uids
-
+
     # Get all sample info once for efficiency
     all_samples = list(
         self.samples_df.select(
             ["sample_uid", "sample_name", "sample_path", "sample_source"],
        ).iter_rows(),
     )
-
+
     missing_combinations = []
-
+
     # For unmapped consensus features (e.g., RT=0), ALL samples are missing
     if unmapped_consensus_uids:
-        self.logger.debug(f"Found {len(unmapped_consensus_uids)} consensus features with no mappings (e.g., RT=0 library features)")
+        self.logger.debug(
+            f"Found {len(unmapped_consensus_uids)} consensus features with no mappings (e.g., RT=0 library features)"
+        )
         for consensus_uid in unmapped_consensus_uids:
             for sample_uid, sample_name, sample_path, sample_source in all_samples:
-                missing_combinations.append(
-                    (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
-                )
+                missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path, sample_source))
 
     # If all consensus features are unmapped, return early
     if len(mapped_consensus_uids) == 0:
@@ -1372,7 +1354,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
 
     # Continue with existing logic for mapped consensus features
     mapped_uids_list = list(mapped_consensus_uids)
-
+
     # Quick early termination check for fully/nearly filled studies
     # This handles the common case where fill() is run on an already-filled study
     consensus_counts = (
@@ -1381,9 +1363,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
         .agg(pl.count("feature_uid").alias("count"))
     )
 
-    total_existing = (
-        consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
-    )
+    total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
 
     # Calculate total possible for mapped features only
     mapped_total_possible = len(mapped_uids_list) * n_samples
@@ -1451,9 +1431,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
     for consensus_uid in mapped_uids_list:
         for sample_uid, sample_name, sample_path, sample_source in all_samples:
             if (consensus_uid, sample_uid) not in existing_combinations:
-                missing_combinations.append(
-                    (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
-                )
+                missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path, sample_source))
 
     return missing_combinations
 
@@ -1551,13 +1529,8 @@ def _sanitize(self):
     except Exception as e:
         self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")
 
-
-def _add_samples_batch(
-    files,
-    reset=False,
-    adducts=None,
-    blacklist=None
-):
+
+def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None):
     """
     Optimized batch addition of samples.
 
@@ -1599,7 +1572,8 @@ def _add_samples_batch(
     ):
         try:
             # Choose between optimized and standard loading
-            success = _add_sample_noms1(
+            success = _add_sample_noms1(
+                self,
                 file,
                 reset=reset,
                 adducts=adducts,
@@ -1695,20 +1669,33 @@ def _add_sample_noms1(
         return False
 
     # Check polarity compatibility
-    sample_polarity = getattr(ddaobj, 'polarity', None)
-    study_polarity = getattr(self, 'polarity', None)
-
+    sample_polarity = getattr(ddaobj, "polarity", None)
+    study_polarity = getattr(self, "polarity", None)
+
     if sample_polarity is not None and study_polarity is not None:
         # Normalize polarity names for comparison
-        sample_pol_norm = 'positive' if sample_polarity in ['pos', 'positive'] else 'negative' if sample_polarity in ['neg', 'negative'] else sample_polarity
-        study_pol_norm = 'positive' if study_polarity in ['pos', 'positive'] else 'negative' if study_polarity in ['neg', 'negative'] else study_polarity
-
+        sample_pol_norm = (
+            "positive"
+            if sample_polarity in ["pos", "positive"]
+            else "negative"
+            if sample_polarity in ["neg", "negative"]
+            else sample_polarity
+        )
+        study_pol_norm = (
+            "positive"
+            if study_polarity in ["pos", "positive"]
+            else "negative"
+            if study_polarity in ["neg", "negative"]
+            else study_polarity
+        )
+
         if sample_pol_norm != study_pol_norm:
-            self.logger.warning(f"Sample {sample_name} polarity ({sample_polarity}) differs from study polarity ({study_polarity}). Skipping sample.")
+            self.logger.warning(
+                f"Sample {sample_name} polarity ({sample_polarity}) differs from study polarity ({study_polarity}). Skipping sample."
+            )
             return False
 
-
-    #self.features_maps.append(ddaobj._oms_features_map)
+    # self.features_maps.append(ddaobj._oms_features_map)
 
     # Determine sample type
     sample_type = "sample" if type is None else type
@@ -1735,14 +1722,8 @@ def _add_sample_noms1(
 
     # Efficient scan counting
     ms1_count = ms2_count = 0
-    if (
-        hasattr(ddaobj, "scans_df")
-        and ddaobj.scans_df is not None
-        and not ddaobj.scans_df.is_empty()
-    ):
-        scan_counts = (
-            ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
-        )
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
         ms_levels = scan_counts.get("ms_level", [])
         counts = scan_counts.get("len", [])
        for level, count in zip(ms_levels, counts):
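The main algorithmic change in this file is the gap-filling RT correction: `_build_rt_correction_mapping_per_sample` precomputes, per sample, the drift `rt_delta = rt - rt_original` of every non-filled feature, and `_estimate_rt_original_from_mapping` assigns a filled feature the drift of its nearest-RT neighbour. The following is a minimal, self-contained sketch of that lookup; the toy feature table and the standalone `estimate_rt_original` helper are invented for illustration and are not part of the masster API.

```python
import polars as pl

# Toy non-filled features: aligned rt vs. raw rt_original (invented values).
features = pl.DataFrame({
    "rt":          [60.0, 120.0, 180.0],
    "rt_original": [58.5, 119.0, 181.2],
})

# Mapping as in _build_rt_correction_mapping_per_sample: rt, rt_original,
# and the per-feature drift rt_delta = rt - rt_original, sorted by rt.
rt_mapping = features.select([
    pl.col("rt"),
    pl.col("rt_original"),
    (pl.col("rt") - pl.col("rt_original")).alias("rt_delta"),
]).sort("rt")

def estimate_rt_original(rt_mapping: pl.DataFrame, target_rt: float):
    """Nearest-neighbour lookup mirroring _estimate_rt_original_from_mapping."""
    if rt_mapping.is_empty():
        return None
    closest = (
        rt_mapping
        .with_columns((pl.col("rt") - target_rt).abs().alias("rt_diff"))
        .sort("rt_diff")
        .head(1)
    )
    # Apply the neighbour's drift to the target: rt_original = rt - rt_delta
    return target_rt - closest["rt_delta"].item()

print(estimate_rt_original(rt_mapping, 125.0))  # 124.0: drift of the 120 s neighbour
```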
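`_get_missing_consensus_sample_combinations` decides what to fill with set arithmetic over `(consensus_uid, sample_uid)` pairs instead of materializing a full chromatogram matrix: pairs present in `consensus_mapping_df` already have a feature, every other pair is missing, and consensus features with no mappings at all (e.g., RT=0 library features) are missing in every sample. A hedged sketch of that logic with invented toy data:

```python
import polars as pl

consensus_uids = [1, 2, 3]            # uids selected for gap filling
samples = [(10, "A"), (20, "B")]      # (sample_uid, sample_name)
consensus_mapping_df = pl.DataFrame({
    "consensus_uid": [1, 1, 2],
    "sample_uid":    [10, 20, 10],
    "feature_uid":   [100, 101, 102],
})

# Pairs that already have a detected feature.
existing = set(
    consensus_mapping_df.select(["consensus_uid", "sample_uid"]).iter_rows()
)

# Everything else needs filling; uid 3 has no mappings, so it is
# missing in every sample.
missing = [
    (c_uid, s_uid, s_name)
    for c_uid in consensus_uids
    for s_uid, s_name in samples
    if (c_uid, s_uid) not in existing
]
print(missing)  # [(2, 20, 'B'), (3, 10, 'A'), (3, 20, 'B')]
```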
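The polarity check in `_add_sample_noms1` normalizes "pos"/"positive" and "neg"/"negative" spellings before comparing sample and study polarity, letting unknown values pass through unchanged. The nested conditional expression in the diff is equivalent to this hypothetical helper (not part of the package):

```python
def normalize_polarity(polarity):
    """Map 'pos'/'positive' -> 'positive', 'neg'/'negative' -> 'negative'."""
    if polarity in ["pos", "positive"]:
        return "positive"
    if polarity in ["neg", "negative"]:
        return "negative"
    return polarity  # unknown values are compared as-is

assert normalize_polarity("pos") == normalize_polarity("positive")
assert normalize_polarity("neg") != normalize_polarity("pos")
```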
|