masster 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/load.py +5 -4
- masster/study/defaults/align_def.py +0 -204
- masster/study/defaults/fill_def.py +9 -1
- masster/study/defaults/merge_def.py +20 -69
- masster/study/export.py +25 -5
- masster/study/h5.py +160 -42
- masster/study/helpers.py +430 -53
- masster/study/load.py +986 -158
- masster/study/merge.py +683 -1076
- masster/study/plot.py +43 -38
- masster/study/processing.py +337 -280
- masster/study/study.py +58 -135
- masster/wizard/wizard.py +20 -6
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/METADATA +1 -1
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/RECORD +19 -20
- masster/study/defaults/fill_chrom_def.py +0 -260
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/WHEEL +0 -0
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/entry_points.txt +0 -0
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/licenses/LICENSE +0 -0
masster/study/load.py
CHANGED
@@ -39,8 +39,7 @@ def add(
     folder=None,
     reset=False,
     adducts=None,
-    max_files=None
-    fast=True,
+    max_files=None
 ):
     """Add samples from a folder to the study.
 
@@ -52,8 +51,6 @@ def add(
         adducts (optional): Adducts to use for sample loading. Defaults to None.
         max_files (int, optional): Maximum number of files to process.
             Defaults to None (no limit).
-        fast (bool, optional): Whether to use optimized loading that skips ms1_df
-            for better performance. Defaults to True.
     """
     if folder is None:
         if self.folder is not None:
@@ -122,12 +119,11 @@ def add(
             self.logger.debug(
                 f"Batch processing {len(files_to_process)} {ext} files",
             )
-            successful =
+            successful = _add_samples_batch(self,
                 files_to_process,
                 reset=reset,
                 adducts=adducts,
-                blacklist=blacklist
-                fast=fast,
+                blacklist=blacklist
             )
             counter += successful
             if successful > 0:
@@ -149,8 +145,7 @@ def add(
     return f"Added {counter} samples to study"
 
 
-
-def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
+def add_sample(self, file, type=None, reset=False, adducts=None):
     """
     Add a single sample to the study.
 
@@ -165,26 +160,16 @@ def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
     Returns:
         bool: True if successful, False otherwise.
     """
-
-
-
-
-
-
-
-
-
-
-    else:
-        # Use standard method with full ms1_df loading
-        success = self._add_sample_standard(
-            file,
-            type=type,
-            reset=reset,
-            adducts=adducts,
-            skip_color_reset=False,  # Do color reset for individual calls
-            skip_schema_check=True,  # Skip schema check for performance
-        )
+
+    success = self._add_sample_optimized(
+        file,
+        type=type,
+        reset=reset,
+        adducts=adducts,
+        skip_color_reset=False,  # Do color reset for individual calls
+        skip_schema_check=True,  # Skip schema check for performance (safe with diagonal concat)
+    )
+
 
     return success
 
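Note: the hunks above change the public API of add() and add_sample() by dropping the fast flag; the optimized path is now the only one. A minimal usage sketch, assuming an existing masster Study instance named study (not part of the diff):

    # 0.5.1 accepted fast=True/False; in 0.5.3 passing fast= raises TypeError.
    study.add(folder="data/raw", max_files=10)   # was: study.add(..., fast=True)
    study.add_sample("data/raw/sample01.mzML")   # was: study.add_sample(..., fast=True)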
@@ -311,11 +296,12 @@ def _fill_chrom_single_impl(
     # Process each sample individually
     # Group missing combinations by sample for efficient processing
     missing_by_sample = {}
-    for consensus_uid, sample_uid, sample_name, sample_path in missing_combinations:
+    for consensus_uid, sample_uid, sample_name, sample_path, sample_source in missing_combinations:
         if sample_name not in missing_by_sample:
             missing_by_sample[sample_name] = {
                 "sample_uid": sample_uid,
                 "sample_path": sample_path,
+                "sample_source": sample_source,
                 "missing_consensus_uids": [],
             }
         missing_by_sample[sample_name]["missing_consensus_uids"].append(consensus_uid)
@@ -338,13 +324,23 @@ def _fill_chrom_single_impl(
         # Load this sample
         sample_uid = sample_info["sample_uid"]
         sample_path = sample_info["sample_path"]
+        sample_source = sample_info["sample_source"]
        missing_consensus_uids = sample_info["missing_consensus_uids"]
 
         try:
-            #
-
-
-
+            # Load this sample using study._load_ms1() as suggested by user
+            # Use sample_path (points to .sample5 files) not sample_source (points to .raw files)
+            ms1_data = self._load_ms1(filename=sample_path)
+            if ms1_data is None or ms1_data.is_empty():
+                self.logger.warning(f"No MS1 data found for sample {sample_name}")
+                continue
+
+            # Create a temporary object to hold the MS1 data for processing
+            class TempSample:
+                def __init__(self, ms1_df):
+                    self.ms1_df = ms1_df
+
+            file = TempSample(ms1_data)
         except Exception as e:
             self.logger.warning(f"Failed to load sample {sample_name}: {e}")
             continue
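The TempSample wrapper added above relies on duck typing: downstream code only reads file.ms1_df, so any object exposing that attribute can stand in for a fully loaded sample. A self-contained sketch of the pattern (column names assumed from the diff):

    import polars as pl

    class TempSample:
        def __init__(self, ms1_df: pl.DataFrame):
            self.ms1_df = ms1_df

    ms1 = pl.DataFrame({"rt": [10.0, 11.0], "mz": [200.1, 200.1], "inty": [5e4, 7e4]})
    file = TempSample(ms1)
    assert hasattr(file, "ms1_df") and not file.ms1_df.is_empty()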
@@ -363,12 +359,50 @@ def _fill_chrom_single_impl(
 
             # Filter MS1 data for this feature
             if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
-
-
-
-
-
-
+                # Special handling for RT=0 (library-derived features)
+                if rt == 0.0:
+                    # Step 1: Retrieve full chromatogram for the m/z
+                    d_full = file.ms1_df.filter(
+                        (pl.col("mz") >= mz - mz_tol)
+                        & (pl.col("mz") <= mz + mz_tol)
+                    )
+
+                    if not d_full.is_empty():
+                        # Step 2: Find maximum intensity and its RT
+                        max_inty_row = d_full.filter(
+                            pl.col("inty") == d_full["inty"].max()
+                        ).head(1)
+
+                        if not max_inty_row.is_empty():
+                            max_rt = max_inty_row["rt"].item()
+
+                            # Get eic_rt_tol from sample parameters if available
+                            eic_rt_tol = rt_tol  # Default fallback
+                            if hasattr(file, 'parameters') and hasattr(file.parameters, 'eic_rt_tol'):
+                                eic_rt_tol = file.parameters.eic_rt_tol
+
+                            # Step 3: Trim around max intensity using eic_rt_tol
+                            d = d_full.filter(
+                                (pl.col("rt") >= max_rt - eic_rt_tol)
+                                & (pl.col("rt") <= max_rt + eic_rt_tol)
+                            )
+
+                            # Update consensus RT info based on discovered peak
+                            rt = max_rt
+                            rt_start_mean = max_rt - eic_rt_tol
+                            rt_end_mean = max_rt + eic_rt_tol
+                        else:
+                            d = pl.DataFrame()
+                    else:
+                        d = pl.DataFrame()
+                else:
+                    # Normal RT-based filtering for non-zero RT
+                    d = file.ms1_df.filter(
+                        (pl.col("mz") >= mz - mz_tol)
+                        & (pl.col("mz") <= mz + mz_tol)
+                        & (pl.col("rt") >= rt_start_mean - rt_tol)
+                        & (pl.col("rt") <= rt_end_mean + rt_tol),
+                    )
             else:
                 d = pl.DataFrame()
 
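The RT=0 branch added above implements apex discovery for library-derived features that carry no retention time: filter by m/z only, locate the intensity maximum, then trim the chromatogram to apex +/- tolerance. A standalone sketch with synthetic numbers (not from the package):

    import polars as pl

    ms1_df = pl.DataFrame({
        "mz":   [150.05, 150.05, 150.05, 150.05, 350.20],
        "rt":   [30.0, 60.0, 90.0, 120.0, 60.0],
        "inty": [1e3, 8e4, 2e3, 5e2, 9e5],
    })
    mz, mz_tol, eic_rt_tol = 150.05, 0.01, 15.0

    # Step 1: m/z-only filter, since rt == 0 carries no RT information
    d_full = ms1_df.filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
    # Step 2: RT of the intensity apex
    max_rt = d_full.filter(pl.col("inty") == d_full["inty"].max()).head(1)["rt"].item()
    # Step 3: trim to apex +/- tolerance
    d = d_full.filter((pl.col("rt") >= max_rt - eic_rt_tol) & (pl.col("rt") <= max_rt + eic_rt_tol))
    print(max_rt, d.height)  # 60.0 1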
@@ -579,6 +613,134 @@ def fill_single(self, **kwargs):
     )
 
 
+def _build_rt_correction_mapping_per_sample(self, sample_uid):
+    """
+    Pre-compute RT correction mapping for a sample by getting all non-filled features.
+    This avoids repeated DataFrame filtering for each feature.
+
+    Args:
+        sample_uid: Sample UID to build mapping for
+
+    Returns:
+        Polars DataFrame with rt, rt_original, and rt_delta columns, sorted by rt
+        Returns empty DataFrame if no reference features found
+    """
+    # Get non-filled features from the same sample
+    if 'filled' in self.features_df.columns:
+        sample_features = self.features_df.filter(
+            (pl.col('sample_uid') == sample_uid) &
+            (pl.col('filled') == False) &
+            (pl.col('rt_original').is_not_null()) &
+            (pl.col('rt').is_not_null())
+        )
+    else:
+        # If no filled column, assume all existing features are non-filled
+        sample_features = self.features_df.filter(
+            (pl.col('sample_uid') == sample_uid) &
+            (pl.col('rt_original').is_not_null()) &
+            (pl.col('rt').is_not_null())
+        )
+
+    if sample_features.is_empty():
+        return pl.DataFrame(schema={'rt': pl.Float64, 'rt_original': pl.Float64, 'rt_delta': pl.Float64})
+
+    # Pre-compute RT deltas and sort by RT for efficient lookup
+    rt_mapping = sample_features.select([
+        pl.col('rt'),
+        pl.col('rt_original'),
+        (pl.col('rt') - pl.col('rt_original')).alias('rt_delta')
+    ]).sort('rt')
+
+    return rt_mapping
+
+def _estimate_rt_original_from_mapping(self, rt_mapping, target_rt):
+    """
+    Fast RT original estimation using pre-computed mapping.
+
+    Args:
+        rt_mapping: Pre-computed RT mapping DataFrame from _build_rt_correction_mapping_per_sample
+        target_rt: Target aligned RT for the filled feature
+
+    Returns:
+        Estimated rt_original value, or None if no mapping available
+    """
+    if rt_mapping.is_empty():
+        return None
+
+    # Find closest RT using vectorized operations
+    rt_mapping_with_diff = rt_mapping.with_columns([
+        (pl.col('rt') - target_rt).abs().alias('rt_diff')
+    ])
+
+    # Get the RT delta from the closest feature
+    closest_row = rt_mapping_with_diff.sort('rt_diff').head(1)
+    if closest_row.is_empty():
+        return None
+
+    closest_rt_delta = closest_row['rt_delta'].item()
+    return target_rt - closest_rt_delta
+
+
+def _estimate_rt_original_for_filled_feature(self, sample_uid, target_rt, logger=None):
+    """
+    Estimate rt_original for a filled feature by finding the closest non-filled feature
+    from the same sample and using its RT delta (rt - rt_original).
+
+    Args:
+        sample_uid: Sample UID to search within
+        target_rt: Target aligned RT for the filled feature
+        logger: Optional logger for debug messages
+
+    Returns:
+        Estimated rt_original value, or None if no suitable reference found
+    """
+    # Get non-filled features from the same sample
+    if 'filled' in self.features_df.columns:
+        sample_features = self.features_df.filter(
+            (pl.col('sample_uid') == sample_uid) &
+            (pl.col('filled') == False) &
+            (pl.col('rt_original').is_not_null()) &
+            (pl.col('rt').is_not_null())
+        )
+    else:
+        # If no filled column, assume all existing features are non-filled
+        sample_features = self.features_df.filter(
+            (pl.col('sample_uid') == sample_uid) &
+            (pl.col('rt_original').is_not_null()) &
+            (pl.col('rt').is_not_null())
+        )
+
+    if sample_features.is_empty():
+        if logger:
+            logger.debug(f"No reference features found for sample {sample_uid} to estimate rt_original")
+        return None
+
+    # Calculate RT differences and find the closest feature
+    sample_features_with_diff = sample_features.with_columns([
+        (pl.col('rt') - target_rt).abs().alias('rt_diff'),
+        (pl.col('rt') - pl.col('rt_original')).alias('rt_delta')
+    ])
+
+    # Find the feature with minimum RT difference
+    closest_feature = sample_features_with_diff.sort('rt_diff').head(1)
+
+    if closest_feature.is_empty():
+        return None
+
+    # Get the RT delta from the closest feature
+    closest_rt_diff = closest_feature['rt_diff'].item()
+    closest_rt_delta = closest_feature['rt_delta'].item()
+
+    # Estimate rt_original using the same delta: rt_original = rt - rt_delta
+    estimated_rt_original = target_rt - closest_rt_delta
+
+    if self.logger:
+        self.logger.debug(f"Estimated rt_original={estimated_rt_original:.3f} for sample {sample_uid}, rt={target_rt:.3f} "
+                          f"using closest feature (rt_diff={closest_rt_diff:.3f}, rt_delta={closest_rt_delta:.3f})")
+
+    return estimated_rt_original
+
+
 def _process_sample_for_parallel_fill(
     self,
     sample_info,
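The helpers added above estimate rt_original for gap-filled features via a nearest-neighbour lookup: take the RT shift (rt - rt_original) of the closest non-filled feature in the same sample and apply the same shift. A self-contained sketch of the arithmetic (synthetic values):

    import polars as pl

    rt_mapping = pl.DataFrame({
        "rt":          [100.0, 200.0, 300.0],
        "rt_original": [ 98.0, 197.5, 296.0],
    }).with_columns((pl.col("rt") - pl.col("rt_original")).alias("rt_delta")).sort("rt")

    target_rt = 210.0
    closest = rt_mapping.with_columns(
        (pl.col("rt") - target_rt).abs().alias("rt_diff")
    ).sort("rt_diff").head(1)
    estimated_rt_original = target_rt - closest["rt_delta"].item()
    print(estimated_rt_original)  # 207.5, via the 2.5 s delta of the feature at rt=200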
@@ -589,31 +751,606 @@ def _process_sample_for_parallel_fill(
     missing_combinations_df,
     features_df_max_uid,
 ):
-    """Process a single sample for parallel gap filling."""
     sample_uid = sample_info["sample_uid"]
     sample_path = sample_info["sample_path"]
+    sample_source = sample_info["sample_source"]
 
     new_features: list[dict] = []
     new_mapping: list[dict] = []
     counter = 0
 
-
-
-
-
-
-    except Exception:
-        # Skip this sample if loading fails
+    # Get missing features for this sample from precomputed combinations
+    sample_missing_df = missing_combinations_df.filter(pl.col("sample_uid") == sample_uid)
+    sample_consensus_uids = sample_missing_df["consensus_uid"].to_list()
+
+    if not sample_consensus_uids:
         return new_features, new_mapping, counter
 
-    #
-
-        pl.col("sample_uid") == sample_uid,
-    )["consensus_uid"].to_list()
+    # OPTIMIZATION: Pre-compute RT correction mapping per sample to avoid repeated DataFrame filtering
+    rt_mapping = _build_rt_correction_mapping_per_sample(self, sample_uid)
 
-
+    # OPTIMIZATION 1: Load MS1 data ONCE per sample instead of per feature
+    try:
+        ms1_data = self._load_ms1(filename=sample_path)
+        if ms1_data is None or ms1_data.is_empty():
+            # Create empty features for all missing consensus UIDs
+            for i, consensus_uid in enumerate(sample_consensus_uids):
+                info = consensus_info[consensus_uid]
+                empty_eic = Chromatogram(
+                    rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
+                    inty=np.array([0.0, 0.0]),
+                    label=f"EIC mz={info['mz']:.4f}",
+                    file=sample_path,
+                    mz=info["mz"],
+                    feature_start=info["rt_start_mean"],
+                    feature_end=info["rt_end_mean"],
+                    feature_apex=info["rt"],
+                )
+
+                new_feature = {
+                    "uid": features_df_max_uid + counter,
+                    "sample_uid": sample_uid,
+                    "mz": info["mz"],
+                    "rt": info["rt"],
+                    "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                    "mz_centroid": None,
+                    "rt_centroid": None,
+                    "iso": None,
+                    "iso_of": None,
+                    "adduct": None,
+                    "adduct_mass": None,
+                    "adduct_group": None,
+                    "chrom": empty_eic,
+                    "filled": True,
+                    "chrom_area": 0.0,
+                    "chrom_coherence": None,
+                    "chrom_prominence": None,
+                    "chrom_prominence_scaled": None,
+                    "chrom_height_scaled": None,
+                    "ms2_scans": None,
+                    "ms2_specs": None,
+                }
+
+                new_features.append(new_feature)
+                new_mapping.append({
+                    "consensus_uid": consensus_uid,
+                    "sample_uid": sample_uid,
+                    "feature_uid": features_df_max_uid + counter,
+                })
+                counter += 1
+            return new_features, new_mapping, counter
+
+    except Exception as e:
+        # If MS1 loading fails, create empty features
+        self.logger.debug(f"Failed to load MS1 data from {sample_path}: {e}")
+        for i, consensus_uid in enumerate(sample_consensus_uids):
+            info = consensus_info[consensus_uid]
+            empty_eic = Chromatogram(
+                rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
+                inty=np.array([0.0, 0.0]),
+                label=f"EIC mz={info['mz']:.4f}",
+                file=sample_path,
+                mz=info["mz"],
+                feature_start=info["rt_start_mean"],
+                feature_end=info["rt_end_mean"],
+                feature_apex=info["rt"],
+            )
+
+            new_feature = {
+                "uid": features_df_max_uid + counter,
+                "sample_uid": sample_uid,
+                "mz": info["mz"],
+                "rt": info["rt"],
+                "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                "mz_centroid": None,
+                "rt_centroid": None,
+                "iso": None,
+                "iso_of": None,
+                "adduct": None,
+                "adduct_mass": None,
+                "adduct_group": None,
+                "chrom": empty_eic,
+                "filled": True,
+                "chrom_area": 0.0,
+                "chrom_coherence": None,
+                "chrom_prominence": None,
+                "chrom_prominence_scaled": None,
+                "chrom_height_scaled": None,
+                "ms2_scans": None,
+                "ms2_specs": None,
+            }
+
+            new_features.append(new_feature)
+            new_mapping.append({
+                "consensus_uid": consensus_uid,
+                "sample_uid": sample_uid,
+                "feature_uid": features_df_max_uid + counter,
+            })
+            counter += 1
+        return new_features, new_mapping, counter
+
+    # OPTIMIZATION 2: Pre-filter MS1 data by m/z ranges to reduce memory and processing
+    all_mzs = [consensus_info[uid]["mz"] for uid in sample_consensus_uids]
+    mz_min = min(all_mzs) - mz_tol
+    mz_max = max(all_mzs) + mz_tol
+
+    # Pre-filter by broad m/z range
+    ms1_filtered = ms1_data.filter(
+        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+    )
+
+    # Early exit if no data in m/z range
+    if ms1_filtered.is_empty():
+        for i, consensus_uid in enumerate(sample_consensus_uids):
+            info = consensus_info[consensus_uid]
+            empty_eic = Chromatogram(
+                rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
+                inty=np.array([0.0, 0.0]),
+                label=f"EIC mz={info['mz']:.4f}",
+                file=sample_path,
+                mz=info["mz"],
+                feature_start=info["rt_start_mean"],
+                feature_end=info["rt_end_mean"],
+                feature_apex=info["rt"],
+            )
+
+            new_feature = {
+                "uid": features_df_max_uid + counter,
+                "sample_uid": sample_uid,
+                "mz": info["mz"],
+                "rt": info["rt"],
+                "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                "mz_centroid": None,
+                "rt_centroid": None,
+                "iso": None,
+                "iso_of": None,
+                "adduct": None,
+                "adduct_mass": None,
+                "adduct_group": None,
+                "chrom": empty_eic,
+                "filled": True,
+                "chrom_area": 0.0,
+                "chrom_coherence": None,
+                "chrom_prominence": None,
+                "chrom_prominence_scaled": None,
+                "chrom_height_scaled": None,
+                "ms2_scans": None,
+                "ms2_specs": None,
+            }
+
+            new_features.append(new_feature)
+            new_mapping.append({
+                "consensus_uid": consensus_uid,
+                "sample_uid": sample_uid,
+                "feature_uid": features_df_max_uid + counter,
+            })
+            counter += 1
         return new_features, new_mapping, counter
 
+    # OPTIMIZATION 3: Process all features using the pre-loaded and filtered MS1 data
+    for consensus_uid in sample_consensus_uids:
+        info = consensus_info[consensus_uid]
+        mz, rt = info["mz"], info["rt"]
+
+        try:
+            if rt == 0.0:
+                # Handle RT=0 features - create empty chromatogram
+                empty_eic = Chromatogram(
+                    rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
+                    inty=np.array([0.0, 0.0]),
+                    label=f"EIC mz={mz:.4f}",
+                    file=sample_path,
+                    mz=mz,
+                    feature_start=info["rt_start_mean"],
+                    feature_end=info["rt_end_mean"],
+                    feature_apex=rt,
+                )
+                eic = empty_eic
+                best_peak = None
+            else:
+                # Extract real chromatogram using pre-filtered MS1 data
+                d = ms1_filtered.filter(
+                    (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol) &
+                    (pl.col("rt") >= rt - rt_tol) & (pl.col("rt") <= rt + rt_tol)
+                )
+
+                # Create chromatogram from filtered data
+                if d.is_empty():
+                    # No MS1 data found - create empty chromatogram
+                    eic = Chromatogram(
+                        rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
+                        inty=np.array([0.0, 0.0]),
+                        label=f"EIC mz={mz:.4f}",
+                        file=sample_path,
+                        mz=mz,
+                        feature_start=info["rt_start_mean"],
+                        feature_end=info["rt_end_mean"],
+                        feature_apex=rt,
+                    )
+                    best_peak = None
+                else:
+                    # Aggregate intensities per retention time (get max inty per RT)
+                    eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
+
+                    # Create chromatogram with real data and find peaks
+                    eic = Chromatogram(
+                        eic_rt["rt"].to_numpy(),
+                        eic_rt["inty"].to_numpy(),
+                        label=f"EIC mz={mz:.4f}",
+                        file=sample_path,
+                        mz=mz,
+                        feature_start=info["rt_start_mean"],
+                        feature_end=info["rt_end_mean"],
+                        feature_apex=rt,
+                    ).find_peaks()
+                    best_peak = self._find_best_peak_in_eic(eic, rt, rt_tol) if hasattr(self, '_find_best_peak_in_eic') else None
+
+            # Create feature with optimized RT original estimation
+            rt_original_estimated = None
+            if rt == 0.0:
+                rt_original_estimated = 0.0  # RT=0 features
+            else:
+                rt_original_estimated = _estimate_rt_original_from_mapping(self, rt_mapping, rt)
+
+            new_feature = {
+                "uid": features_df_max_uid + counter,
+                "sample_uid": sample_uid,
+                "mz": mz,
+                "rt": rt,
+                "rt_original": rt_original_estimated,
+                "mz_centroid": None,
+                "rt_centroid": None,
+                "iso": None,
+                "iso_of": None,
+                "adduct": None,
+                "adduct_mass": None,
+                "adduct_group": None,
+                "chrom": eic,
+                "filled": True,
+                "chrom_area": best_peak.get("area", 0.0) if best_peak else 0.0,
+                "chrom_coherence": best_peak.get("coherence") if best_peak else None,
+                "chrom_prominence": best_peak.get("prominence") if best_peak else None,
+                "chrom_prominence_scaled": best_peak.get("prominence_scaled") if best_peak else None,
+                "chrom_height_scaled": best_peak.get("height_scaled") if best_peak else None,
+                "ms2_scans": None,
+                "ms2_specs": None,
+            }
+
+            new_features.append(new_feature)
+            new_mapping.append({
+                "consensus_uid": consensus_uid,
+                "sample_uid": sample_uid,
+                "feature_uid": features_df_max_uid + counter,
+            })
+            counter += 1
+
+        except Exception as e:
+            # Skip this feature if extraction fails but log the error
+            self.logger.debug(f"Failed to extract feature {consensus_uid} from {sample_path}: {e}")
+            continue
+
+    return new_features, new_mapping, counter
+
+'''
+def _load_ms1_optimized(self, sample_path, mz_ranges, rt_ranges):
+    """
+    OPTIMIZED: Load only the MS1 data we actually need instead of the entire file.
+    Pre-filter by m/z and RT ranges to reduce memory usage and processing time.
+    """
+    try:
+        # Load full MS1 data (we'll optimize this further later)
+        ms1_data = self._load_ms1(filename=sample_path)
+        if ms1_data is None or ms1_data.is_empty():
+            return ms1_data
+
+        # OPTIMIZATION: Pre-filter to only relevant m/z ranges to reduce data size
+        if mz_ranges:
+            # Build comprehensive m/z filter covering all ranges
+            mz_min = min(r[0] for r in mz_ranges)
+            mz_max = max(r[1] for r in mz_ranges)
+
+            # Pre-filter by broad m/z range first (much faster than multiple OR conditions)
+            ms1_filtered = ms1_data.filter(
+                (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+            )
+
+            # If we have RT ranges, also pre-filter by RT
+            if rt_ranges and len(rt_ranges) > 0:
+                rt_min = min(r[0] for r in rt_ranges)
+                rt_max = max(r[1] for r in rt_ranges)
+                ms1_filtered = ms1_filtered.filter(
+                    (pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max)
+                )
+
+            return ms1_filtered
+
+        return ms1_data
+
+    except Exception:
+        return pl.DataFrame()
+'''
+
+'''
+def _create_empty_features(self, consensus_uids, consensus_info, sample_uid, features_df_max_uid):
+    """Create empty features for consensus UIDs when no MS1 data is available."""
+    new_features = []
+    new_mapping = []
+
+    for i, consensus_uid in enumerate(consensus_uids):
+        cons = consensus_info[consensus_uid]
+        feature_uid = features_df_max_uid + i + 1
+
+        # Create minimal empty feature
+        empty_eic = Chromatogram(
+            rt=np.array([cons["rt_start_mean"], cons["rt_end_mean"]]),
+            inty=np.array([0.0, 0.0]),
+            label=f"EIC mz={cons['mz']:.4f}",
+            file="",
+            mz=cons["mz"],
+            feature_start=cons["rt_start_mean"],
+            feature_end=cons["rt_end_mean"],
+            feature_apex=cons["rt"],
+        )
+
+        new_feature = {
+            "sample_uid": sample_uid,
+            "feature_uid": feature_uid,
+            "feature_id": None,
+            "mz": cons["mz"],
+            "rt": cons["rt"],
+            "rt_original": 0.0 if cons["rt"] == 0.0 else None,
+            "rt_start": cons["rt_start_mean"],
+            "rt_end": cons["rt_end_mean"],
+            "rt_delta": cons["rt_end_mean"] - cons["rt_start_mean"],
+            "mz_start": None,
+            "mz_end": None,
+            "inty": 0.0,
+            "quality": None,
+            "charge": None,
+            "iso": None,
+            "iso_of": None,
+            "adduct": None,
+            "adduct_mass": None,
+            "adduct_group": None,
+            "chrom": empty_eic,
+            "filled": True,
+            "chrom_area": 0.0,
+            "chrom_coherence": None,
+            "chrom_prominence": None,
+            "chrom_prominence_scaled": None,
+            "chrom_height_scaled": None,
+            "ms2_scans": None,
+            "ms2_specs": None,
+        }
+
+        new_features.append(new_feature)
+        new_mapping.append({
+            "consensus_uid": consensus_uid,
+            "sample_uid": sample_uid,
+            "feature_uid": feature_uid,
+        })
+
+    return new_features, new_mapping, len(new_features)
+'''
+
+'''
+def _create_feature_fast(self, consensus_uid, sample_uid, features_df_max_uid, consensus_info):
+    """
+    OPTIMIZED: Create a minimal empty feature quickly without expensive operations.
+    Used for RT=0 features and other cases where we just need a placeholder feature.
+    """
+    cons = consensus_info[consensus_uid]
+    feature_uid = features_df_max_uid
+
+    # Create minimal empty feature
+    empty_eic = Chromatogram(
+        rt=np.array([cons["rt_start_mean"], cons["rt_end_mean"]]),
+        inty=np.array([0.0, 0.0]),
+        label=f"EIC mz={cons['mz']:.4f}",
+        file="",
+        mz=cons["mz"],
+        feature_start=cons["rt_start_mean"],
+        feature_end=cons["rt_end_mean"]
+    )
+
+    new_feature = {
+        "uid": feature_uid,
+        "sample_uid": sample_uid,
+        "mz": cons["mz"],
+        "rt": cons["rt"],
+        "mz_centroid": None,
+        "rt_centroid": None,
+        "iso": None,
+        "iso_of": None,
+        "adduct": None,
+        "adduct_mass": None,
+        "adduct_group": None,
+        "chrom": empty_eic,
+        "filled": True,
+        "chrom_area": 0.0,
+        "chrom_coherence": None,
+        "chrom_prominence": None,
+        "chrom_prominence_scaled": None,
+        "chrom_height_scaled": None,
+        "ms2_scans": None,
+        "ms2_specs": None,
+    }
+
+    new_features = [new_feature]
+    new_mapping = [{
+        "consensus_uid": consensus_uid,
+        "sample_uid": sample_uid,
+        "feature_uid": feature_uid,
+    }]
+
+    return new_features, new_mapping, 1
+'''
+
+'''
+def _process_rt_zero_features_batch(self, rt_zero_consensus_uids, consensus_info, sample_uid,
+                                    features_df_max_uid, rt_zero_features):
+    """
+    OPTIMIZED: Process all RT=0 features in a batch since they share similar characteristics.
+    RT=0 features are typically not real peaks but artifacts or noise.
+    """
+    new_features = []
+    new_mapping = []
+
+    for consensus_uid in rt_zero_consensus_uids:
+        new_features_batch, new_mapping_batch, _ = self._create_feature_fast(
+            consensus_uid, sample_uid, features_df_max_uid, consensus_info
+        )
+        new_features.extend(new_features_batch)
+        new_mapping.extend(new_mapping_batch)
+        features_df_max_uid += 1
+
+        # Track RT=0 features for statistics
+        rt_zero_features.append(1)
+
+    return new_features, new_mapping, features_df_max_uid
+'''
+
+'''
+def _process_normal_rt_features_batch(self, normal_rt_consensus_uids, consensus_info, ms1_data,
+                                      sample_uid, sample_path, mz_tol, rt_tol, features_df_max_uid):
+    """
+    OPTIMIZED: Process normal RT features in batch with pre-filtered MS1 data.
+    Only loads chromatograms once per batch instead of per feature.
+    """
+    new_features = []
+    new_mapping = []
+
+    if len(normal_rt_consensus_uids) == 0:
+        return new_features, new_mapping, features_df_max_uid
+
+    # OPTIMIZATION: Pre-filter MS1 data by m/z range to reduce data size
+    all_mzs = [consensus_info[cuid]["mz"] for cuid in normal_rt_consensus_uids]
+    mz_min = min(all_mzs) - max(0.01, min(all_mzs) * mz_tol / 1e6)
+    mz_max = max(all_mzs) + max(0.01, max(all_mzs) * mz_tol / 1e6)
+
+    # Pre-filter MS1 data once for all features
+    ms1_filtered = ms1_data.filter(
+        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+    )
+
+    # Early exit if no data in m/z range
+    if ms1_filtered.shape[0] == 0:
+        # Create empty features for all consensus UIDs
+        for consensus_uid in normal_rt_consensus_uids:
+            new_features_batch, new_mapping_batch, _ = self._create_feature_fast(
+                consensus_uid, sample_uid, features_df_max_uid, consensus_info
+            )
+            new_features.extend(new_features_batch)
+            new_mapping.extend(new_mapping_batch)
+            features_df_max_uid += 1
+        return new_features, new_mapping, features_df_max_uid
+
+    # Process each feature with pre-filtered data
+    for consensus_uid in normal_rt_consensus_uids:
+        info = consensus_info[consensus_uid]
+        mz, rt = info["mz"], info["rt"]
+
+        # Extract chromatogram using pre-loaded MS1 data (FIXED!)
+        sample_obj = self._load_ms1(sample_path)  # Get the sample object for extract_eic method
+        eic = sample_obj.extract_eic(
+            mz, mz_tol, rt, rt_tol, ms1_data=ms1_filtered  # Use the pre-filtered data!
+        )
+
+        # Find best peak
+        best_peak = self._find_best_peak_in_eic(eic, rt, rt_tol)
+
+        # Create feature
+        new_feature = {
+            "uid": features_df_max_uid,
+            "sample_uid": sample_uid,
+            "mz": mz,
+            "rt": rt,
+            "mz_centroid": None,
+            "rt_centroid": None,
+            "iso": None,
+            "iso_of": None,
+            "adduct": None,
+            "adduct_mass": None,
+            "adduct_group": None,
+            "chrom": eic if best_peak else Chromatogram(
+                rt=np.array([rt, rt]),
+                inty=np.array([0.0, 0.0]),
+                label=f"EIC mz={mz:.4f}",
+                file="",
+                mz=mz,
+                feature_start=rt,
+                feature_end=rt
+            ),
+            "filled": True,
+            "chrom_area": best_peak.get("area", 0.0) if best_peak else 0.0,
+            "chrom_coherence": best_peak.get("coherence") if best_peak else None,
+            "chrom_prominence": best_peak.get("prominence") if best_peak else None,
+            "chrom_prominence_scaled": best_peak.get("prominence_scaled") if best_peak else None,
+            "chrom_height_scaled": best_peak.get("height_scaled") if best_peak else None,
+            "ms2_scans": None,
+            "ms2_specs": None,
+        }
+
+        new_features.append(new_feature)
+        new_mapping.append({
+            "consensus_uid": consensus_uid,
+            "sample_uid": sample_uid,
+            "feature_uid": features_df_max_uid,
+        })
+        features_df_max_uid += 1
+
+    return new_features, new_mapping, features_df_max_uid
+'''
+
+'''def _batch_process_features(self, consensus_uids, consensus_info, ms1_data, sample_uid, sample_path,
+                            mz_tol, rt_tol, features_df_max_uid, rt_zero_features):
+    """
+    OPTIMIZED: Process all missing features for a sample in a single batch operation.
+    This avoids repeated filtering of the MS1 dataframe.
+    """
+    new_features = []
+    new_mapping = []
+
+    # OPTIMIZATION: Process RT=0 features separately (they need special handling)
+    rt_zero_data = {}
+    if rt_zero_features:
+        rt_zero_data = self._process_rt_zero_features_batch(
+            rt_zero_features, consensus_info, ms1_data, mz_tol, rt_tol
+        )
+
+    # OPTIMIZATION: Build comprehensive filter for all normal RT features at once
+    normal_rt_features = [uid for uid in consensus_uids if uid not in rt_zero_features]
+    normal_rt_data = {}
+    if normal_rt_features:
+        normal_rt_data = self._process_normal_rt_features_batch(
+            normal_rt_features, consensus_info, ms1_data, mz_tol, rt_tol
+        )
+
+    # Combine results and create features
+    all_feature_data = {**rt_zero_data, **normal_rt_data}
+
+    for i, consensus_uid in enumerate(consensus_uids):
+        feature_uid = features_df_max_uid + i + 1
+        cons = consensus_info[consensus_uid]
+
+        # Get pre-processed data for this feature
+        feature_ms1_data = all_feature_data.get(consensus_uid, pl.DataFrame())
+
+        # Create feature using optimized chromatogram creation
+        new_feature, area = self._create_feature_fast(
+            consensus_uid, cons, feature_ms1_data, sample_uid, sample_path,
+            feature_uid, mz_tol, rt_tol
+        )
+
+        new_features.append(new_feature)
+        new_mapping.append({
+            "consensus_uid": consensus_uid,
+            "sample_uid": sample_uid,
+            "feature_uid": feature_uid,
+        })
+
+    return new_features, new_mapping, len(new_features)
+
     # Process each missing feature
     for consensus_uid in sample_missing:
         cons = consensus_info[consensus_uid]
@@ -624,12 +1361,43 @@ def _process_sample_for_parallel_fill(
 
         # Filter MS1 data for this feature
         if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
-
-
-
-
-
-
+            # Special handling for RT=0 (library-derived features)
+            if rt == 0.0:
+                # Simple RT=0 processing: find max intensity across full m/z range
+                d_full = file.ms1_df.filter(
+                    (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol)
+                )
+
+                if not d_full.is_empty():
+                    max_inty = d_full["inty"].max()
+                    if max_inty > 0:
+                        max_rt = d_full.filter(pl.col("inty") == max_inty)["rt"].min()
+
+                        # Use default rt_tol for RT=0 features
+                        eic_rt_tol = rt_tol
+
+                        # Filter around max RT
+                        d = d_full.filter(
+                            (pl.col("rt") >= max_rt - eic_rt_tol) &
+                            (pl.col("rt") <= max_rt + eic_rt_tol)
+                        )
+
+                        # Update consensus RT info
+                        rt = max_rt
+                        rt_start_mean = max_rt - eic_rt_tol
+                        rt_end_mean = max_rt + eic_rt_tol
+                    else:
+                        d = pl.DataFrame()
+                else:
+                    d = pl.DataFrame()
+            else:
+                # Normal RT-based filtering for non-zero RT
+                d = file.ms1_df.filter(
+                    (pl.col("mz") >= mz - mz_tol)
+                    & (pl.col("mz") <= mz + mz_tol)
+                    & (pl.col("rt") >= rt_start_mean - rt_tol)
+                    & (pl.col("rt") <= rt_end_mean + rt_tol),
+                )
         else:
             d = pl.DataFrame()
 
@@ -648,6 +1416,13 @@ def _process_sample_for_parallel_fill(
             )
             max_inty = 0.0
             area = 0.0
+            chrom_coherence = None
+            chrom_prominence = None
+            chrom_prominence_scaled = None
+            chrom_height_scaled = None
+            peak_rt_start = rt_start_mean
+            peak_rt_end = rt_end_mean
+            peak_rt_delta = rt_end_mean - rt_start_mean
         else:
             eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
 
@@ -665,6 +1440,24 @@ def _process_sample_for_parallel_fill(
                 ).find_peaks()
                 max_inty = np.max(eic.inty)
                 area = eic.feature_area
+
+                # Extract chromatogram peak properties from first peak (if available)
+                if len(eic.peak_rts) > 0 and eic.feature_start is not None and eic.feature_end is not None:
+                    chrom_coherence = round(eic.feature_coherence, 3) if eic.feature_coherence is not None else None
+                    chrom_prominence = round(eic.peak_prominences[0], 3) if len(eic.peak_prominences) > 0 else None
+                    chrom_prominence_scaled = round(eic.peak_prominences[0] / (np.mean(eic.inty) + 1e-10), 3) if len(eic.peak_prominences) > 0 else None
+                    chrom_height_scaled = round(eic.peak_heights[0] / (np.mean(eic.inty) + 1e-10), 3) if len(eic.peak_heights) > 0 else None
+                    peak_rt_start = eic.feature_start
+                    peak_rt_end = eic.feature_end
+                    peak_rt_delta = peak_rt_end - peak_rt_start
+                else:
+                    chrom_coherence = None
+                    chrom_prominence = None
+                    chrom_prominence_scaled = None
+                    chrom_height_scaled = None
+                    peak_rt_start = rt_start_mean
+                    peak_rt_end = rt_end_mean
+                    peak_rt_delta = rt_end_mean - rt_start_mean
             else:
                 eic = Chromatogram(
                     eic_rt["rt"].to_numpy(),
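The block above derives scaled peak metrics from the first detected peak: prominence and height normalised by the mean EIC intensity, with a small epsilon guarding against division by zero. A standalone arithmetic sketch (synthetic arrays; peak_prominences/peak_heights stand in for the Chromatogram attributes):

    import numpy as np

    inty = np.array([100.0, 900.0, 150.0, 120.0])
    peak_prominences = np.array([780.0])
    peak_heights = np.array([900.0])

    mean_inty = np.mean(inty) + 1e-10  # epsilon avoids division by zero on flat traces
    prominence_scaled = round(float(peak_prominences[0] / mean_inty), 3)
    height_scaled = round(float(peak_heights[0] / mean_inty), 3)
    print(prominence_scaled, height_scaled)  # 2.457 2.835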
@@ -679,21 +1472,36 @@ def _process_sample_for_parallel_fill(
                 )
                 max_inty = 0.0
                 area = 0.0
+                chrom_coherence = None
+                chrom_prominence = None
+                chrom_prominence_scaled = None
+                chrom_height_scaled = None
+                peak_rt_start = rt_start_mean
+                peak_rt_end = rt_end_mean
+                peak_rt_delta = rt_end_mean - rt_start_mean
 
         # Generate feature UID (will be adjusted later to ensure global uniqueness)
         feature_uid = features_df_max_uid + len(new_features) + 1
 
-        #
+        # Handle rt_original: for RT=0 features, set to 0; otherwise estimate from closest feature
+        if rt == 0.0 or (hasattr(cons, 'get') and cons.get("rt") == 0.0):
+            estimated_rt_original = 0.0
+        else:
+            estimated_rt_original = _estimate_rt_original_for_filled_feature(
+                self, sample_uid, rt, logger=self.logger if hasattr(self, 'logger') else None
+            )
+
+        # Create new feature entry with updated chromatogram properties
         new_feature = {
             "sample_uid": sample_uid,
             "feature_uid": feature_uid,
             "feature_id": None,
             "mz": mz,
             "rt": rt,
-            "rt_original":
-            "rt_start":
-            "rt_end":
-            "rt_delta":
+            "rt_original": estimated_rt_original,
+            "rt_start": peak_rt_start,
+            "rt_end": peak_rt_end,
+            "rt_delta": peak_rt_delta,
             "mz_start": None,
             "mz_end": None,
             "inty": max_inty,
@@ -707,10 +1515,10 @@ def _process_sample_for_parallel_fill(
             "chrom": eic,
             "filled": True,
             "chrom_area": area,
-            "chrom_coherence":
-            "chrom_prominence":
-            "chrom_prominence_scaled":
-            "chrom_height_scaled":
+            "chrom_coherence": chrom_coherence,
+            "chrom_prominence": chrom_prominence,
+            "chrom_prominence_scaled": chrom_prominence_scaled,
+            "chrom_height_scaled": chrom_height_scaled,
             "ms2_scans": None,
             "ms2_specs": None,
         }
@@ -726,7 +1534,7 @@ def _process_sample_for_parallel_fill(
         counter += 1
 
     return new_features, new_mapping, counter
-
+'''
 
 def _fill_chrom_impl(
     self,
@@ -735,7 +1543,7 @@ def _fill_chrom_impl(
     rt_tol: float = 10.0,
     min_samples_rel: float = 0.0,
     min_samples_abs: int = 2,
-
+    threads=6,
 ):
     """Fill missing chromatograms by extracting from raw data using parallel processing.
 
@@ -745,13 +1553,13 @@ def _fill_chrom_impl(
         rt_tol: RT tolerance for extraction (default: 10.0 seconds)
         min_samples_rel: Relative minimum sample threshold (default: 0.0)
         min_samples_abs: Absolute minimum sample threshold (default: 2)
-
+        threads: Number of parallel threads (default: 6)
     """
     uids = self._get_consensus_uids(uids)
 
-    self.logger.info(f"Gap filling with {
+    self.logger.info(f"Gap filling with {threads} threads...")
     self.logger.debug(
-        f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs},
+        f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, threads={threads}",
     )
 
     # Apply minimum sample filters
@@ -793,6 +1601,7 @@ def _fill_chrom_impl(
             "sample_uid": pl.Int64,
             "sample_name": pl.Utf8,
             "sample_path": pl.Utf8,
+            "sample_source": pl.Utf8,
         },
         orient="row",
     )
@@ -830,12 +1639,13 @@ def _fill_chrom_impl(
                 "sample_name": row["sample_name"],
                 "sample_uid": row["sample_uid"],
                 "sample_path": row["sample_path"],
+                "sample_source": row["sample_source"],
             },
         )
 
     total_missing = len(missing_combinations_df)
     self.logger.debug(
-        f"Gap filling for {total_missing} missing features...",
+        f"Gap filling for {total_missing} missing features across {len(samples_to_process)} samples...",
     )
 
     # Calculate current max feature_uid to avoid conflicts
@@ -850,7 +1660,7 @@ def _fill_chrom_impl(
 
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
 
-    with concurrent.futures.ThreadPoolExecutor(max_workers=
+    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
         # Submit all samples for processing
         future_to_sample = {}
         for sample_info in samples_to_process:
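The executor above fans samples out to a bounded thread pool; the new threads argument caps max_workers. A minimal self-contained sketch of the submit/as_completed pattern (the worker is a stand-in, not the real one):

    import concurrent.futures

    def process_sample(sample_info):  # stand-in worker
        return sample_info["sample_uid"], []

    samples_to_process = [{"sample_uid": i} for i in range(8)]
    threads = 6

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        future_to_sample = {
            executor.submit(process_sample, info): info for info in samples_to_process
        }
        for future in concurrent.futures.as_completed(future_to_sample):
            sample_uid, new_features = future.result()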
@@ -883,6 +1693,8 @@ def _fill_chrom_impl(
             for i, mapping in enumerate(new_mapping):
                 mapping["feature_uid"] = uid_offset + i + 1
 
+            # RT original estimation is now done inside parallel processing - PERFORMANCE OPTIMIZED!
+
             all_new_features.extend(new_features)
             all_new_mapping.extend(new_mapping)
             total_counter += counter
@@ -944,8 +1756,15 @@ def _fill_chrom_impl(
         how="diagonal",
     )
 
+    # Log statistics about rt_original estimation
+    if all_new_features:
+        estimated_count = sum(1 for feature in all_new_features if feature.get('rt_original') is not None)
+        none_count = sum(1 for feature in all_new_features if feature.get('rt_original') is None)
+        self.logger.debug(f"Features with estimated rt_original: {estimated_count}")
+        self.logger.debug(f"Features with None rt_original: {none_count}")
+
     self.logger.info(
-        f"Filled {total_counter} chromatograms from raw data
+        f"Filled {total_counter} chromatograms from raw data.",
     )
 
 
@@ -963,14 +1782,18 @@ def fill(self, **kwargs):
         rt_tol: RT tolerance for extraction (default: 10.0 seconds)
         min_samples_rel: Relative minimum sample threshold (default: 0.05)
         min_samples_abs: Absolute minimum sample threshold (default: 5)
-
+        threads: Number of parallel threads (default: 6)
     """
     # parameters initialization
     params = fill_defaults()
-
-
-
-
+
+    # Handle backward compatibility for old parameter names
+    if "workers" in kwargs:
+        kwargs["threads"] = kwargs.pop("workers")
+        self.logger.debug("Converted 'workers' parameter to 'threads' for backward compatibility")
+    if "num_workers" in kwargs:
+        kwargs["threads"] = kwargs.pop("num_workers")
+        self.logger.debug("Converted 'num_workers' parameter to 'threads' for backward compatibility")
 
     for key, value in kwargs.items():
         if isinstance(value, fill_defaults):
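The compatibility shim above renames deprecated keyword arguments before parameter validation. Shown standalone (a sketch of the pattern, not masster's actual function):

    def fill(**kwargs):
        for old in ("workers", "num_workers"):  # deprecated spellings
            if old in kwargs:
                kwargs["threads"] = kwargs.pop(old)
        return kwargs.get("threads", 6)

    assert fill(workers=4) == 4
    assert fill(num_workers=2) == 2
    assert fill() == 6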
@@ -984,7 +1807,7 @@ def fill(self, **kwargs):
                 self.logger.warning(
                     f"Failed to set parameter {key} = {value} (validation failed)",
                 )
-
+        else:
             self.logger.debug(f"Unknown parameter {key} ignored")
     # end of parameter initialization
 
@@ -1000,14 +1823,10 @@ def fill(self, **kwargs):
         rt_tol=params.get("rt_tol"),
         min_samples_rel=params.get("min_samples_rel"),
         min_samples_abs=params.get("min_samples_abs"),
-
+        threads=params.get("threads"),
     )
 
 
-# Backward compatibility alias
-fill_chrom = fill
-
-
 def _get_missing_consensus_sample_combinations(self, uids):
     """
     Efficiently identify which consensus_uid/sample combinations are missing.
@@ -1017,6 +1836,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
     - Early termination for fully-filled studies
     - Efficient dictionary lookups instead of expensive DataFrame joins
     - Smart handling of sparse vs dense missing data patterns
+    - Special handling for consensus features with no mappings (e.g., library-derived RT=0 features)
     """
     if not uids:
         return []
@@ -1025,10 +1845,42 @@ def _get_missing_consensus_sample_combinations(self, uids):
     n_samples = len(self.samples_df)
     total_possible = n_consensus * n_samples
 
+    # Identify consensus features that have NO mappings at all (e.g., library-derived RT=0 features)
+    uids_set = set(uids)
+    mapped_consensus_uids = set(
+        self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))["consensus_uid"].to_list()
+    )
+    unmapped_consensus_uids = uids_set - mapped_consensus_uids
+
+    # Get all sample info once for efficiency
+    all_samples = list(
+        self.samples_df.select(
+            ["sample_uid", "sample_name", "sample_path", "sample_source"],
+        ).iter_rows(),
+    )
+
+    missing_combinations = []
+
+    # For unmapped consensus features (e.g., RT=0), ALL samples are missing
+    if unmapped_consensus_uids:
+        self.logger.debug(f"Found {len(unmapped_consensus_uids)} consensus features with no mappings (e.g., RT=0 library features)")
+        for consensus_uid in unmapped_consensus_uids:
+            for sample_uid, sample_name, sample_path, sample_source in all_samples:
+                missing_combinations.append(
+                    (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
+                )
+
+    # If all consensus features are unmapped, return early
+    if len(mapped_consensus_uids) == 0:
+        return missing_combinations
+
+    # Continue with existing logic for mapped consensus features
+    mapped_uids_list = list(mapped_consensus_uids)
+
     # Quick early termination check for fully/nearly filled studies
     # This handles the common case where fill() is run on an already-filled study
     consensus_counts = (
-        self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(
+        self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(mapped_uids_list))
         .group_by("consensus_uid")
         .agg(pl.count("feature_uid").alias("count"))
     )
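The new logic above first separates consensus features that never appear in the mapping table (e.g., RT=0 library entries); for those, every sample is by definition missing. A standalone sketch of the set arithmetic (toy data):

    import polars as pl

    uids = [1, 2, 3]
    consensus_mapping_df = pl.DataFrame(
        {"consensus_uid": [1, 1, 2], "feature_uid": [10, 11, 12]}
    )

    mapped = set(
        consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))["consensus_uid"].to_list()
    )
    unmapped = set(uids) - mapped
    print(unmapped)  # {3}: gap-fill this feature in every sample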
@@ -1037,22 +1889,22 @@ def _get_missing_consensus_sample_combinations(self, uids):
         consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
     )
 
+    # Calculate total possible for mapped features only
+    mapped_total_possible = len(mapped_uids_list) * n_samples
+
     # If >95% filled, likely no gaps (common case)
-    if total_existing >=
+    if total_existing >= mapped_total_possible * 0.95:
         self.logger.debug(
-            f"Study appears {total_existing /
+            f"Study appears {total_existing / mapped_total_possible * 100:.1f}% filled, using sparse optimization",
         )
 
         # For sparse missing data, check each consensus feature individually
-        missing_combinations = []
-        uids_set = set(uids)
-
         # Build efficient lookups
         feature_to_sample = dict(
             self.features_df.select(["feature_uid", "sample_uid"]).iter_rows(),
         )
 
-        # Get existing combinations for target UIDs only
+        # Get existing combinations for target UIDs only (mapped features)
         existing_by_consensus = {}
         for consensus_uid, feature_uid in self.consensus_mapping_df.select(
             [
@@ -1060,25 +1912,18 @@ def _get_missing_consensus_sample_combinations(self, uids):
                 "feature_uid",
             ],
         ).iter_rows():
-            if consensus_uid in uids_set and feature_uid in feature_to_sample:
+            if consensus_uid in mapped_consensus_uids and feature_uid in feature_to_sample:
                 if consensus_uid not in existing_by_consensus:
                     existing_by_consensus[consensus_uid] = set()
                 existing_by_consensus[consensus_uid].add(feature_to_sample[feature_uid])
 
-        # Get all samples
-        all_samples = list(
-            self.samples_df.select(
-                ["sample_uid", "sample_name", "sample_path"],
-            ).iter_rows(),
-        )
-
-        # Check for missing combinations
-        for consensus_uid in uids:
+        # Check for missing combinations for mapped features
+        for consensus_uid in mapped_uids_list:
             existing_samples = existing_by_consensus.get(consensus_uid, set())
-            for sample_uid, sample_name, sample_path in all_samples:
+            for sample_uid, sample_name, sample_path, sample_source in all_samples:
                 if sample_uid not in existing_samples:
                     missing_combinations.append(
-                        (consensus_uid, sample_uid, sample_name, sample_path),
+                        (consensus_uid, sample_uid, sample_name, sample_path, sample_source),
                     )
 
         return missing_combinations
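
The sparse path builds a per-consensus set of sample_uids that already have a feature, then scans each requested consensus feature against the sample list. A self-contained sketch with plain-Python stand-ins (`setdefault` compresses the diff's explicit membership check; the data is illustrative):

```python
feature_to_sample = {"f1": "s1", "f2": "s2"}                  # feature_uid -> sample_uid
consensus_mapping = [("c1", "f1"), ("c1", "f2"), ("c2", "f1")]
all_samples = [("s1", "A", "/a", "src"), ("s2", "B", "/b", "src")]
mapped_uids_list = ["c1", "c2"]

existing_by_consensus = {}
for consensus_uid, feature_uid in consensus_mapping:
    if feature_uid in feature_to_sample:
        existing_by_consensus.setdefault(consensus_uid, set()).add(
            feature_to_sample[feature_uid]
        )

missing_combinations = [
    (c, s_uid, s_name, s_path, s_source)
    for c in mapped_uids_list
    for s_uid, s_name, s_path, s_source in all_samples
    if s_uid not in existing_by_consensus.get(c, set())
]
print(missing_combinations)  # [('c2', 's2', 'B', '/b', 'src')]
```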
@@ -1086,16 +1931,15 @@ def _get_missing_consensus_sample_combinations(self, uids):
     else:
         # For studies with many gaps, use bulk operations
         self.logger.debug(
-            f"Study {total_existing / total_possible * 100:.1f}% filled, using bulk optimization",
+            f"Study {total_existing / mapped_total_possible * 100:.1f}% filled, using bulk optimization",
         )
 
         # Build efficient lookups
-        uids_set = set(uids)
         feature_to_sample = dict(
             self.features_df.select(["feature_uid", "sample_uid"]).iter_rows(),
         )
 
-        # Build existing combinations set
+        # Build existing combinations set for mapped features only
         existing_combinations = {
             (consensus_uid, feature_to_sample[feature_uid])
             for consensus_uid, feature_uid in self.consensus_mapping_df.select(
@@ -1104,23 +1948,16 @@ def _get_missing_consensus_sample_combinations(self, uids):
                 "feature_uid",
             ],
         ).iter_rows()
-            if consensus_uid in uids_set and feature_uid in feature_to_sample
+            if consensus_uid in mapped_consensus_uids and feature_uid in feature_to_sample
         }
 
-        # Get all samples
-        all_samples = list(
-            self.samples_df.select(
-                ["sample_uid", "sample_name", "sample_path"],
-            ).iter_rows(),
-        )
-
-        # Generate all missing combinations
-        missing_combinations = [
-            (consensus_uid, sample_uid, sample_name, sample_path)
-            for consensus_uid in uids
-            for sample_uid, sample_name, sample_path in all_samples
-            if (consensus_uid, sample_uid) not in existing_combinations
-        ]
+        # Generate missing combinations for mapped features
+        for consensus_uid in mapped_uids_list:
+            for sample_uid, sample_name, sample_path, sample_source in all_samples:
+                if (consensus_uid, sample_uid) not in existing_combinations:
+                    missing_combinations.append(
+                        (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
+                    )
 
     return missing_combinations
 
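
The bulk path trades memory for speed: one flat set of every existing (consensus_uid, sample_uid) pair, then a full cross-product scan against it. The diff appends in a loop; the comprehension below is equivalent. A self-contained sketch with illustrative stand-ins:

```python
feature_to_sample = {"f1": "s1"}                  # feature_uid -> sample_uid
consensus_mapping = [("c1", "f1")]
all_samples = [("s1", "A", "/a", "src"), ("s2", "B", "/b", "src")]
mapped_uids_list = ["c1"]
mapped_consensus_uids = set(mapped_uids_list)

existing_combinations = {
    (c, feature_to_sample[f])
    for c, f in consensus_mapping
    if c in mapped_consensus_uids and f in feature_to_sample
}
missing_combinations = [
    (c, s_uid, s_name, s_path, s_source)
    for c in mapped_uids_list
    for s_uid, s_name, s_path, s_source in all_samples
    if (c, s_uid) not in existing_combinations
]
print(missing_combinations)  # [('c1', 's2', 'B', '/b', 'src')]
```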
@@ -1218,7 +2055,7 @@ def _sanitize(self):
         except Exception as e:
             self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")
 
-
+'''
 def _load_features(self):
     """
     Load features by reconstructing FeatureMaps from the processed features_df data.
@@ -1326,8 +2163,9 @@ def _load_features(self):
     self.logger.debug(
         f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.",
     )
+'''
 
-
+'''
 def _load_features_from_xml(self):
     """
     Original load_features method that loads from .featureXML files.
@@ -1365,8 +2203,8 @@ def _load_features_from_xml(self):
         fh.load(filename, fm)
         self.features_maps.append(fm)
     self.logger.debug("Features loaded successfully.")
-
-
+'''
+'''
 def _load_consensusXML(self, filename="alignment.consensusXML"):
     """
     Load a consensus map from a file.
@@ -1378,15 +2216,14 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     self.consensus_map = oms.ConsensusMap()
     fh.load(filename, self.consensus_map)
     self.logger.debug(f"Loaded consensus map from {filename}.")
-
+'''
 
 def _add_samples_batch(
     self,
     files,
     reset=False,
     adducts=None,
-    blacklist=None,
-    fast=True,
+    blacklist=None
 ):
     """
     Optimized batch addition of samples.
@@ -1396,7 +2233,6 @@ def _add_samples_batch(
         reset (bool): Whether to reset features before processing
         adducts: Adducts to use for sample loading
         blacklist (set): Set of filenames already processed
-        fast (bool): Whether to use optimized loading (skips ms1_df) or standard loading
 
     Performance optimizations:
     1. No per-sample color reset
@@ -1411,7 +2247,7 @@ def _add_samples_batch(
         blacklist = set()
 
     self.logger.debug(
-        f"Starting batch addition of {len(files)} samples",
+        f"Starting batch addition of {len(files)} samples...",
     )
 
     successful_additions = 0
@@ -1430,22 +2266,13 @@ def _add_samples_batch(
     ):
         try:
             # Choose between optimized and standard loading
-            if fast:
-                success = self._add_sample_optimized(
-                    file,
-                    reset=reset,
-                    adducts=adducts,
-                    skip_color_reset=True,  # Skip color reset during batch
-                    skip_schema_check=True,  # Skip schema enforcement
-                )
-            else:
-                success = self._add_sample_standard(
-                    file,
-                    reset=reset,
-                    adducts=adducts,
-                    skip_color_reset=True,  # Skip color reset during batch
-                    skip_schema_check=True,  # Skip schema enforcement
-                )
+            success = _add_sample_noms1(self,
+                file,
+                reset=reset,
+                adducts=adducts,
+                skip_color_reset=True,  # Skip color reset during batch
+                skip_schema_check=True,  # Skip schema enforcement
+            )
 
             if success:
                 # Add to blacklist for filename tracking
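
After this hunk there is a single loading path: every file goes through the module-level `_add_sample_noms1`, and the old `fast`/standard fork is gone. A condensed sketch of the resulting batch loop; the helper body and the exception handling are stand-ins, only the call shape and the deferred color pass come from this diff:

```python
def _add_sample_noms1(study, file, reset=False, adducts=None,
                      skip_color_reset=False, skip_schema_check=False):
    # Stand-in body: the real helper loads the sample without building ms1_df.
    return True

def add_samples_batch_sketch(study, files, reset=False, adducts=None, blacklist=None):
    blacklist = set() if blacklist is None else blacklist
    successful = 0
    for file in files:
        try:
            ok = _add_sample_noms1(study, file, reset=reset, adducts=adducts,
                                   skip_color_reset=True,   # defer coloring
                                   skip_schema_check=True)  # defer schema checks
            if ok:
                blacklist.add(file)  # filename tracking, as in the diff
                successful += 1
        except Exception:
            continue  # one bad file should not abort the batch
    study.set_samples_color()  # single color pass for the whole batch
    return successful
```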
@@ -1467,7 +2294,7 @@ def _add_samples_batch(
     # self._ensure_features_df_schema_order()
 
     # Color assignment done once for all samples
-    self._sample_color_reset_optimized()
+    self.set_samples_color()
 
     self.logger.debug(
         f"Add samples complete: {successful_additions} successful, {failed_additions} failed",
@@ -1476,7 +2303,7 @@ def _add_samples_batch(
     return successful_additions
 
 
-def _add_sample_optimized(
+def _add_sample_noms1(
     self,
     file,
     type=None,
@@ -1535,11 +2362,11 @@ def _add_sample_optimized(
         return False
 
     # Check if features map was created successfully
-    if ddaobj._oms_features_map is None:
-        self.logger.warning(f"Failed to add sample {file}: No features map created")
-        return False
+    #if ddaobj._oms_features_map is None:
+    #    self.logger.warning(f"Failed to add sample {file}: No features map created")
+    #    return False
 
-    self.features_maps.append(ddaobj._oms_features_map)
+    #self.features_maps.append(ddaobj._oms_features_map)
 
     # Determine sample type
     sample_type = "sample" if type is None else type
@@ -1647,7 +2474,7 @@ def _add_sample_optimized(
         )
         return True
 
-
+'''
 def _add_sample_standard(
     self,
     file,
@@ -1921,9 +2748,10 @@ def _add_sample_standard(
         )
         return True
 
-
-def _sample_color_reset_optimized(self):
+'''
+'''def _sample_color_reset_optimized(self):
     """
     Optimized version of sample color reset using set_samples_color.
     """
     return self.set_samples_color(by=None)
+'''
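
These last hunks retire the `_sample_color_reset_optimized` wrapper (now fenced off in a `'''` block, like the legacy loaders above) and call `set_samples_color()` directly, once per batch. A toy illustration of why one deferred pass beats recoloring after every addition; the helper below is a stand-in, not masster's implementation:

```python
def assign_colors(samples):
    # Stand-in for Study.set_samples_color(): one color per sample.
    for i, s in enumerate(samples):
        s["color"] = f"C{i % 10}"

# Per-sample recoloring (the old pattern): n passes over a growing list,
# O(n^2) total work.
samples = []
for name in ["a", "b", "c"]:
    samples.append({"name": name})
    assign_colors(samples)

# Deferred recoloring (this diff): one pass after the whole batch, O(n).
batch = [{"name": n} for n in ["a", "b", "c"]]
assign_colors(batch)
```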