masster 0.4.16__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

masster/_version.py CHANGED
@@ -1,7 +1,7 @@
  from __future__ import annotations
 
 
- __version__ = "0.4.16"
+ __version__ = "0.4.17"
 
 
  def get_version():
masster/study/defaults/merge_def.py CHANGED
@@ -25,7 +25,7 @@ class merge_defaults:
  link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
  """
 
- method: str = "kd"
+ method: str = "quality"
  min_samples: int = 10
  rt_tol: float = 5.0
  mz_tol: float = 0.01
@@ -35,14 +35,25 @@ class merge_defaults:
  max_pairwise_log_fc: float = -1.0
  max_nr_conflicts: int = 0
  link_ms2: bool = True
+
+ # KD-Strict specific parameters
+ optimize_rt_tol: bool = False
+ rt_tol_range: tuple = (0.8, 2.0)
+ rt_tol_steps: int = 5
+ secondary_merge_rt_tol: float = 0.5
+ secondary_merge_mz_tol: float = 0.005
+ min_sample_overlap: float = 0.8
+ max_rt_spread: float = None # Will default to 2x rt_tol
+ min_coherence: float = 0.0
 
  _param_metadata: dict[str, dict[str, Any]] = field(
  default_factory=lambda: {
  "method": {
  "dtype": str,
  "description": "Merge method (algorithm) to use",
- "default": "kd",
- "allowed_values": ["kd", "qt", "kd-nowarp", "chunked"],
+ "default": "quality",
+ "allowed_values": ["sensitivity", "qt", "nowarp", "chunked", "quality",
+ "kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict"],
  },
  "min_samples": {
  "dtype": int,
@@ -103,6 +114,58 @@ class merge_defaults:
  "description": "Whether to link MS2 spectra to consensus features",
  "default": True,
  },
+ # KD-Strict specific parameters
+ "optimize_rt_tol": {
+ "dtype": bool,
+ "description": "Enable RT tolerance optimization for kd-strict method",
+ "default": False,
+ },
+ "rt_tol_range": {
+ "dtype": tuple,
+ "description": "RT tolerance range for optimization (min, max) in seconds",
+ "default": (0.8, 2.0),
+ },
+ "rt_tol_steps": {
+ "dtype": int,
+ "description": "Number of steps for RT tolerance optimization",
+ "default": 5,
+ "min_value": 3,
+ "max_value": 20,
+ },
+ "secondary_merge_rt_tol": {
+ "dtype": float,
+ "description": "RT tolerance for secondary clustering in kd-strict (seconds)",
+ "default": 0.5,
+ "min_value": 0.1,
+ "max_value": 5.0,
+ },
+ "secondary_merge_mz_tol": {
+ "dtype": float,
+ "description": "m/z tolerance for secondary clustering in kd-strict (Da)",
+ "default": 0.005,
+ "min_value": 0.001,
+ "max_value": 0.1,
+ },
+ "min_sample_overlap": {
+ "dtype": float,
+ "description": "Minimum sample overlap ratio for merging features (0.0-1.0)",
+ "default": 0.8,
+ "min_value": 0.0,
+ "max_value": 1.0,
+ },
+ "max_rt_spread": {
+ "dtype": float,
+ "description": "Maximum allowed RT spread in seconds (None = 3x rt_tol)",
+ "default": None,
+ "min_value": 0.1,
+ },
+ "min_coherence": {
+ "dtype": float,
+ "description": "Minimum chromatographic coherence score (0.0 = disabled)",
+ "default": 0.0,
+ "min_value": 0.0,
+ "max_value": 1.0,
+ },
  },
  repr=False,
  )
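Illustrative usage sketch (not part of the published diff): assuming a Study object named `study`, as in the README example quoted near the end of this diff, the new quality-control parameters introduced above can be passed to merge() as keyword arguments, since merge(**kwargs) consumes fields from merge_defaults.

study.merge(
    method="quality",             # new default, replaces the old "kd"
    min_samples=10,
    rt_tol=5.0,
    mz_tol=0.01,
    optimize_rt_tol=True,         # scan rt_tol_range in rt_tol_steps steps
    rt_tol_range=(0.8, 2.0),
    rt_tol_steps=5,
    secondary_merge_rt_tol=0.5,   # seconds, secondary clustering of near-duplicates
    secondary_merge_mz_tol=0.005, # Da
    min_sample_overlap=0.8,
    max_rt_spread=None,           # None: derived from rt_tol during post-processing
    min_coherence=0.0,            # 0.0 disables coherence filtering
)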
masster/study/merge.py CHANGED
@@ -24,8 +24,8 @@ def merge(self, **kwargs) -> None:
  ----------
  **kwargs : dict
  Parameters from merge_defaults class:
- - method : str, default 'kd'
- Merge algorithm: 'kd', 'qt', 'kd-nowarp', 'chunked'
+ - method : str, default 'quality'
+ Merge algorithm: 'sensitivity', 'qt', 'nowarp', 'chunked', 'quality'
  - min_samples : int, default 10
  Minimum number of samples for consensus feature
  - rt_tol : float, default 2.0
@@ -47,9 +47,11 @@ def merge(self, **kwargs) -> None:
 
  Algorithm Guidelines
  -------------------
- - KD: Best general purpose, O(n log n), recommended default
+ - Quality: KD with post-processing quality control to reduce oversegmentation (RECOMMENDED DEFAULT)
+ Includes RT tolerance optimization, secondary clustering, and quality filtering
+ - Sensitivity: Best raw sensitivity, O(n log n), maximum feature detection
  - QT: Thorough but slow O(n²), good for <1000 samples
- - KD-NoWarp: Memory efficient KD without RT warping for large datasets
+ - NoWarp: Memory efficient KD without RT warping for large datasets
  - Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
  Uses optimized partitioning for better memory management while maintaining
  full cross-sample consensus feature detection.
@@ -67,9 +69,24 @@ def merge(self, **kwargs) -> None:
  else:
  self.logger.warning(f"Unknown parameter '{key}' ignored")
 
+ # Backward compatibility: Map old method names to new names
+ method_mapping = {
+ 'kd': 'sensitivity',
+ 'kd-nowarp': 'nowarp',
+ 'kd_nowarp': 'nowarp',
+ 'kd-strict': 'quality',
+ 'kd_strict': 'quality',
+ 'kdstrict': 'quality'
+ }
+
+ if params.method in method_mapping:
+ old_method = params.method
+ params.method = method_mapping[old_method]
+ self.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
+
  # Validate method
- if params.method not in ['kd', 'qt', 'kd-nowarp', 'chunked']:
- raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['kd', 'qt', 'kd-nowarp', 'chunked']")
+ if params.method not in ['sensitivity', 'qt', 'nowarp', 'chunked', 'quality']:
+ raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'chunked', 'quality']")
 
  # Persist last used params for diagnostics
  try:
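For orientation, a minimal standalone sketch of the deprecation mapping added above (illustrative only, not part of the diff): legacy method names resolve to their new equivalents, while new names pass through unchanged.

method_mapping = {
    'kd': 'sensitivity',
    'kd-nowarp': 'nowarp',
    'kd_nowarp': 'nowarp',
    'kd-strict': 'quality',
    'kd_strict': 'quality',
    'kdstrict': 'quality',
}

def resolve_method(name: str) -> str:
    # Map a legacy merge method name to its new equivalent; unknown names are left as-is
    return method_mapping.get(name, name)

assert resolve_method('kd') == 'sensitivity'   # deprecated alias
assert resolve_method('quality') == 'quality'  # already a new name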
@@ -77,6 +94,15 @@ def merge(self, **kwargs) -> None:
  except Exception:
  self._merge_params_last = {}
 
+ # Store merge parameters in history
+ try:
+ if hasattr(self, 'store_history'):
+ self.store_history(['merge'], params.to_dict())
+ else:
+ self.logger.warning("History storage not available - parameters not saved to history")
+ except Exception as e:
+ self.logger.warning(f"Failed to store merge parameters in history: {e}")
+
  # Ensure feature maps are available for merging (regenerate if needed)
  if len(self.features_maps) < len(self.samples_df):
  self.features_maps = []
@@ -106,7 +132,7 @@ def merge(self, **kwargs) -> None:
  cached_valid_adducts.add("?")
 
  # Route to algorithm implementation
- if params.method == 'kd':
+ if params.method == 'sensitivity':
  consensus_map = _merge_kd(self, params)
  # Extract consensus features
  self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
@@ -114,10 +140,13 @@ def merge(self, **kwargs) -> None:
  consensus_map = _merge_qt(self, params)
  # Extract consensus features
  self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
- elif params.method == 'kd-nowarp':
+ elif params.method == 'nowarp':
  consensus_map = _merge_kd_nowarp(self, params)
  # Extract consensus features
  self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
+ elif params.method == 'quality':
+ consensus_map = _merge_kd_strict(self, params)
+ # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
  elif params.method == 'chunked':
  consensus_map = _merge_chunked(self, params, cached_adducts_df, cached_valid_adducts)
  # Note: _merge_chunked populates consensus_df directly, no need to extract
@@ -209,6 +238,496 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
  return consensus_map
 
 
+ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
+ """
+ Quality merge: Standard KD algorithm with post-processing quality control.
+
+ This method combines the sensitivity of KD clustering with post-processing steps
+ to reduce oversegmentation while maintaining high-quality consensus features.
+ This is the recommended default method.
+
+ Post-processing features:
+ 1. RT tolerance optimization (optional)
+ 2. Secondary clustering for close features
+ 3. Sample overlap validation
+ 4. RT spread quality filtering
+ 5. Chromatographic coherence validation
+
+ Additional parameters supported in params:
+ - optimize_rt_tol: bool - Enable RT tolerance optimization
+ - rt_tol_range: tuple - RT tolerance range for optimization (min, max)
+ - secondary_merge_rt_tol: float - Secondary merge RT tolerance (default: 0.5s)
+ - secondary_merge_mz_tol: float - Secondary merge m/z tolerance (default: 0.005)
+ - min_sample_overlap: float - Minimum sample overlap for merging (0.0-1.0, default: 0.8)
+ - max_rt_spread: float - Maximum RT spread allowed (default: 2x rt_tol)
+ - min_coherence: float - Minimum chromatographic coherence (default: 0.0, disabled)
+ """
+
+ # Check for RT tolerance optimization
+ optimize_rt_tol = getattr(params, 'optimize_rt_tol', False)
+
+ if optimize_rt_tol:
+ # Optimize RT tolerance first
+ optimal_rt_tol = _optimize_rt_tolerance(self, params)
+ self.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
+ # Create modified params with optimal RT tolerance
+ import copy
+ optimized_params = copy.deepcopy(params)
+ optimized_params.rt_tol = optimal_rt_tol
+ else:
+ optimized_params = params
+
+ # Phase 1: Standard KD clustering
+ self.logger.info("Initial KD clustering")
+ consensus_map = _merge_kd(self, optimized_params)
+
+ # Phase 2: Post-processing quality control
+ self.logger.info("Post-processing quality control")
+ consensus_map = _apply_kd_strict_postprocessing(self, consensus_map, optimized_params)
+
+ return consensus_map
+
+
+ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
+ """
+ Optimize RT tolerance by testing different values and measuring oversegmentation.
+
+ Args:
+ self: Study object
+ params: Merge parameters
+
+ Returns:
+ Optimal RT tolerance value
+ """
+ rt_tol_range = getattr(params, 'rt_tol_range', (0.8, 2.0))
+ rt_tol_steps = getattr(params, 'rt_tol_steps', 5)
+
+ self.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")
+
+ # Generate test values
+ test_rt_tols = [rt_tol_range[0] + i * (rt_tol_range[1] - rt_tol_range[0]) / (rt_tol_steps - 1)
+ for i in range(rt_tol_steps)]
+
+ best_rt_tol = params.rt_tol
+ best_score = float('inf')
+
+ # Store original features for restoration
+ original_consensus_df = getattr(self, 'consensus_df', pl.DataFrame())
+ original_consensus_mapping_df = getattr(self, 'consensus_mapping_df', pl.DataFrame())
+
+ for test_rt_tol in test_rt_tols:
+ try:
+ # Create test parameters
+ import copy
+ test_params = copy.deepcopy(params)
+ test_params.rt_tol = test_rt_tol
+
+ # Run KD merge with test parameters
+ test_consensus_map = _merge_kd(self, test_params)
+
+ # Extract consensus features temporarily for analysis
+ self._extract_consensus_features(test_consensus_map, test_params.min_samples)
+
+ if len(self.consensus_df) == 0:
+ continue
+
+ # Calculate oversegmentation metrics
+ oversegmentation_score = _calculate_oversegmentation_score(self, test_rt_tol)
+
+ self.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(self.consensus_df)} features, score: {oversegmentation_score:.3f}")
+
+ # Lower score is better (less oversegmentation)
+ if oversegmentation_score < best_score:
+ best_score = oversegmentation_score
+ best_rt_tol = test_rt_tol
+
+ except Exception as e:
+ self.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
+ continue
+
+ # Restore original consensus data
+ self.consensus_df = original_consensus_df
+ self.consensus_mapping_df = original_consensus_mapping_df
+
+ self.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
+ return best_rt_tol
+
+
+ def _calculate_oversegmentation_score(self, rt_tol: float) -> float:
+ """
+ Calculate oversegmentation score based on feature density and RT spread metrics.
+ Lower scores indicate less oversegmentation.
+
+ Args:
+ self: Study object
+ rt_tol: RT tolerance used
+
+ Returns:
+ Oversegmentation score (lower = better)
+ """
+ if len(self.consensus_df) == 0:
+ return float('inf')
+
+ # Metric 1: Feature density (features per RT second)
+ rt_range = self.consensus_df['rt'].max() - self.consensus_df['rt'].min()
+ if rt_range <= 0:
+ return float('inf')
+
+ feature_density = len(self.consensus_df) / rt_range
+
+ # Metric 2: Average RT spread relative to tolerance
+ rt_spreads = (self.consensus_df['rt_max'] - self.consensus_df['rt_min'])
+ avg_rt_spread_ratio = rt_spreads.mean() / rt_tol if rt_tol > 0 else float('inf')
+
+ # Metric 3: Proportion of features with low sample counts (indicates fragmentation)
+ low_sample_features = len(self.consensus_df.filter(pl.col('number_samples') <= 5))
+ low_sample_ratio = low_sample_features / len(self.consensus_df)
+
+ # Metric 4: Number of features with excessive RT spread
+ excessive_spread_features = len(rt_spreads.filter(rt_spreads > rt_tol * 2))
+ excessive_spread_ratio = excessive_spread_features / len(self.consensus_df)
+
+ # Combined score (weighted combination)
+ oversegmentation_score = (
+ 0.4 * (feature_density / 10.0) + # Normalize to reasonable scale
+ 0.3 * avg_rt_spread_ratio +
+ 0.2 * low_sample_ratio +
+ 0.1 * excessive_spread_ratio
+ )
+
+ return oversegmentation_score
+
+
+ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
+ """
+ Apply post-processing quality control to KD consensus map.
+
+ Args:
+ consensus_map: Initial consensus map from KD
+ params: Merge parameters with kd-strict options
+
+ Returns:
+ Processed consensus map with reduced oversegmentation
+ """
+ if consensus_map.size() == 0:
+ self.logger.warning("Empty consensus map provided to post-processing")
+ return consensus_map
+
+ self.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")
+
+ # Step 1: Extract initial consensus features
+ original_min_samples = params.min_samples
+ params.min_samples = 1 # Extract all features initially
+
+ self._extract_consensus_features(consensus_map, params.min_samples)
+ initial_feature_count = len(self.consensus_df)
+
+ if initial_feature_count == 0:
+ self.logger.warning("No consensus features extracted for post-processing")
+ params.min_samples = original_min_samples
+ return consensus_map
+
+ # Step 2: Secondary clustering for close features
+ secondary_merge_rt_tol = getattr(params, 'secondary_merge_rt_tol', 0.5)
+ secondary_merge_mz_tol = getattr(params, 'secondary_merge_mz_tol', 0.005)
+
+ self.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
+ merged_features = _perform_secondary_clustering(self, secondary_merge_rt_tol, secondary_merge_mz_tol)
+
+ # Step 3: Sample overlap validation
+ min_sample_overlap = getattr(params, 'min_sample_overlap', 0.8)
+ if min_sample_overlap > 0:
+ self.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
+ merged_features = _validate_sample_overlap(self, merged_features, min_sample_overlap)
+
+ # Step 4: RT spread quality filtering
+ if params.rt_tol is not None:
+ max_rt_spread = getattr(params, 'max_rt_spread', params.rt_tol * 2)
+ if max_rt_spread is not None:
+ self.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
+ merged_features = _filter_rt_spread(self, merged_features, max_rt_spread)
+ else:
+ self.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
+ else:
+ self.logger.debug("Skipping RT spread filtering - rt_tol is None")
+
+ # Step 5: Chromatographic coherence filtering (optional)
+ min_coherence = getattr(params, 'min_coherence', 0.0)
+ if min_coherence > 0:
+ self.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
+ merged_features = _filter_coherence(self, merged_features, min_coherence)
+
+ # Step 6: Rebuild consensus_df with filtered features and preserve mapping
+ original_mapping_df = self.consensus_mapping_df.clone() # Save original mapping
+ self.consensus_df = pl.DataFrame(merged_features, strict=False)
+
+ # Step 7: Apply original min_samples filter
+ params.min_samples = original_min_samples
+ if params.min_samples > 1:
+ l1 = len(self.consensus_df)
+ self.consensus_df = self.consensus_df.filter(
+ pl.col("number_samples") >= params.min_samples
+ )
+ filtered_count = l1 - len(self.consensus_df)
+ if filtered_count > 0:
+ self.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")
+
+ # Step 8: Update consensus_mapping_df to match final consensus_df
+ if len(self.consensus_df) > 0 and len(original_mapping_df) > 0:
+ valid_consensus_ids = set(self.consensus_df['consensus_uid'].to_list())
+ self.consensus_mapping_df = original_mapping_df.filter(
+ pl.col('consensus_uid').is_in(list(valid_consensus_ids))
+ )
+ else:
+ self.consensus_mapping_df = pl.DataFrame()
+
+ final_feature_count = len(self.consensus_df)
+ reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0
+
+ self.logger.info(f"Post-processing complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
+
+ # Create a new consensus map for compatibility (the processed data is in consensus_df)
+ processed_consensus_map = oms.ConsensusMap()
+ return processed_consensus_map
+
+
+ def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
+ """
+ Perform secondary clustering to merge very close features.
+
+ Args:
+ rt_tol: RT tolerance for secondary clustering
+ mz_tol: m/z tolerance for secondary clustering
+
+ Returns:
+ List of merged consensus feature dictionaries
+ """
+ if len(self.consensus_df) == 0:
+ return []
+
+ # Convert consensus_df to list of dictionaries for clustering
+ consensus_features = []
+ for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
+ consensus_features.append(dict(row))
+
+ # Use Union-Find for efficient clustering
+ class UnionFind:
+ def __init__(self, n):
+ self.parent = list(range(n))
+ self.rank = [0] * n
+
+ def find(self, x):
+ if self.parent[x] != x:
+ self.parent[x] = self.find(self.parent[x])
+ return self.parent[x]
+
+ def union(self, x, y):
+ px, py = self.find(x), self.find(y)
+ if px == py:
+ return
+ if self.rank[px] < self.rank[py]:
+ px, py = py, px
+ self.parent[py] = px
+ if self.rank[px] == self.rank[py]:
+ self.rank[px] += 1
+
+ n_features = len(consensus_features)
+ uf = UnionFind(n_features)
+
+ # Find features to merge based on proximity
+ merge_count = 0
+ for i in range(n_features):
+ for j in range(i + 1, n_features):
+ feat_i = consensus_features[i]
+ feat_j = consensus_features[j]
+
+ rt_diff = abs(feat_i['rt'] - feat_j['rt'])
+ mz_diff = abs(feat_i['mz'] - feat_j['mz'])
+
+ if rt_diff <= rt_tol and mz_diff <= mz_tol:
+ uf.union(i, j)
+ merge_count += 1
+
+ # Group features by their root
+ groups_by_root = defaultdict(list)
+ for i in range(n_features):
+ root = uf.find(i)
+ groups_by_root[root].append(consensus_features[i])
+
+ # Merge features within each group
+ merged_features = []
+ for group in groups_by_root.values():
+ if len(group) == 1:
+ # Single feature - keep as is
+ merged_features.append(group[0])
+ else:
+ # Multiple features - merge them
+ merged_feature = _merge_feature_group(group)
+ merged_features.append(merged_feature)
+
+ self.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
+ return merged_features
+
+
+ def _merge_feature_group(feature_group: list) -> dict:
+ """
+ Merge a group of similar consensus features into one.
+
+ Args:
+ feature_group: List of consensus feature dictionaries to merge
+
+ Returns:
+ Merged consensus feature dictionary
+ """
+ if not feature_group:
+ return {}
+
+ if len(feature_group) == 1:
+ return feature_group[0]
+
+ # Use the feature with highest sample count as base
+ base_feature = max(feature_group, key=lambda f: f.get('number_samples', 0))
+ merged = base_feature.copy()
+
+ # Aggregate numeric statistics
+ rt_values = [f['rt'] for f in feature_group if f.get('rt') is not None]
+ mz_values = [f['mz'] for f in feature_group if f.get('mz') is not None]
+ sample_counts = [f.get('number_samples', 0) for f in feature_group]
+ intensities = [f.get('inty_mean', 0) for f in feature_group if f.get('inty_mean') is not None]
+
+ # Update merged feature statistics
+ if rt_values:
+ merged['rt'] = float(np.mean(rt_values))
+ merged['rt_min'] = min([f.get('rt_min', f['rt']) for f in feature_group])
+ merged['rt_max'] = max([f.get('rt_max', f['rt']) for f in feature_group])
+ merged['rt_mean'] = float(np.mean(rt_values))
+
+ if mz_values:
+ merged['mz'] = float(np.mean(mz_values))
+ merged['mz_min'] = min([f.get('mz_min', f['mz']) for f in feature_group])
+ merged['mz_max'] = max([f.get('mz_max', f['mz']) for f in feature_group])
+ merged['mz_mean'] = float(np.mean(mz_values))
+
+ # Use maximum sample count (features might be detected in overlapping but different samples)
+ merged['number_samples'] = max(sample_counts)
+
+ # Use weighted average intensity (by sample count)
+ if intensities and sample_counts:
+ total_weight = sum(sample_counts)
+ if total_weight > 0:
+ weighted_intensity = sum(inty * count for inty, count in zip(intensities, sample_counts)) / total_weight
+ merged['inty_mean'] = float(weighted_intensity)
+
+ # Aggregate chromatographic quality metrics if available
+ coherence_values = [f.get('chrom_coherence_mean', 0) for f in feature_group if f.get('chrom_coherence_mean') is not None]
+ prominence_values = [f.get('chrom_prominence_mean', 0) for f in feature_group if f.get('chrom_prominence_mean') is not None]
+
+ if coherence_values:
+ merged['chrom_coherence_mean'] = float(np.mean(coherence_values))
+ if prominence_values:
+ merged['chrom_prominence_mean'] = float(np.mean(prominence_values))
+
+ # Merge MS2 counts
+ ms2_counts = [f.get('number_ms2', 0) for f in feature_group]
+ merged['number_ms2'] = sum(ms2_counts)
+
+ # Keep the best quality score
+ quality_scores = [f.get('quality', 1.0) for f in feature_group if f.get('quality') is not None]
+ if quality_scores:
+ merged['quality'] = max(quality_scores)
+
+ return merged
+
+
+ def _validate_sample_overlap(self, features: list, min_overlap: float) -> list:
+ """
+ Validate that merged features have sufficient sample overlap.
+
+ Args:
+ features: List of consensus feature dictionaries
+ min_overlap: Minimum sample overlap ratio (0.0-1.0)
+
+ Returns:
+ List of validated features
+ """
+ # This is a placeholder for sample overlap validation
+ # Implementation would require access to which samples each feature appears in
+ # For now, we'll use a simple heuristic based on feature statistics
+
+ validated_features = []
+ for feature in features:
+ # Simple validation based on RT spread and sample count ratio
+ rt_spread = feature.get('rt_max', feature['rt']) - feature.get('rt_min', feature['rt'])
+ sample_count = feature.get('number_samples', 1)
+
+ # Features with very tight RT spread and high sample counts are more reliable
+ if rt_spread <= 2.0 or sample_count >= 10: # More permissive validation
+ validated_features.append(feature)
+ else:
+ # Could implement more sophisticated sample overlap checking here
+ validated_features.append(feature) # Keep for now
+
+ return validated_features
+
+
+ def _filter_rt_spread(self, features: list, max_rt_spread: float) -> list:
+ """
+ Filter out features with excessive RT spread.
+
+ Args:
+ features: List of consensus feature dictionaries
+ max_rt_spread: Maximum allowed RT spread in seconds
+
+ Returns:
+ List of filtered features
+ """
+ filtered_features = []
+ filtered_count = 0
+
+ for feature in features:
+ rt_min = feature.get('rt_min', feature['rt'])
+ rt_max = feature.get('rt_max', feature['rt'])
+ rt_spread = rt_max - rt_min
+
+ if rt_spread <= max_rt_spread:
+ filtered_features.append(feature)
+ else:
+ filtered_count += 1
+
+ if filtered_count > 0:
+ self.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")
+
+ return filtered_features
+
+
+ def _filter_coherence(self, features: list, min_coherence: float) -> list:
+ """
+ Filter out features with low chromatographic coherence.
+
+ Args:
+ features: List of consensus feature dictionaries
+ min_coherence: Minimum chromatographic coherence score
+
+ Returns:
+ List of filtered features
+ """
+ filtered_features = []
+ filtered_count = 0
+
+ for feature in features:
+ coherence = feature.get('chrom_coherence_mean', 1.0) # Default to high coherence if missing
+
+ if coherence >= min_coherence:
+ filtered_features.append(feature)
+ else:
+ filtered_count += 1
+
+ if filtered_count > 0:
+ self.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")
+
+ return filtered_features
+
+
  def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
  """KD-tree based merge without RT warping"""
 
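As a hedged, standalone restatement (illustrative only, not part of the diff) of how _calculate_oversegmentation_score above combines its four metrics, with made-up example numbers:

def oversegmentation_score(n_features, rt_range, mean_rt_spread, rt_tol,
                           low_sample_ratio, excessive_spread_ratio):
    # Same weighted combination as _calculate_oversegmentation_score above
    feature_density = n_features / rt_range          # features per RT second
    avg_rt_spread_ratio = mean_rt_spread / rt_tol    # spread relative to tolerance
    return (0.4 * (feature_density / 10.0)
            + 0.3 * avg_rt_spread_ratio
            + 0.2 * low_sample_ratio
            + 0.1 * excessive_spread_ratio)

# Hypothetical run: 6000 features over 600 s of RT, mean spread 1.2 s at rt_tol = 1.4 s,
# 30% low-sample features, 5% with excessive spread:
score = oversegmentation_score(6000, 600.0, 1.2, 1.4, 0.30, 0.05)
# 0.4*1.0 + 0.3*0.857 + 0.2*0.30 + 0.1*0.05 ≈ 0.72 (lower is better)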
@@ -470,11 +989,19 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
  b = chunk_consensus_list[j]
  if a['chunk_idx'] == b['chunk_idx']:
  continue
- # Centroid checks
+
+ # Primary check: centroid distance (strict)
  centroid_close = (abs(a['rt']-b['rt']) <= rt_tol and abs(a['mz']-b['mz']) <= mz_tol)
- # Interval overlap checks (expanded by tolerance)
- rt_overlap = (a['rt_min'] - rt_tol) <= (b['rt_max'] + rt_tol) and (b['rt_min'] - rt_tol) <= (a['rt_max'] + rt_tol)
- mz_overlap = (a['mz_min'] - mz_tol) <= (b['mz_max'] + mz_tol) and (b['mz_min'] - mz_tol) <= (a['mz_max'] + mz_tol)
+
+ # Secondary check: interval overlap (more conservative)
+ # Only allow interval overlap if centroids are reasonably close (within 2x tolerance)
+ centroids_reasonable = (abs(a['rt']-b['rt']) <= 2 * rt_tol and abs(a['mz']-b['mz']) <= 2 * mz_tol)
+ if centroids_reasonable:
+ rt_overlap = (a['rt_min'] - rt_tol/2) <= (b['rt_max'] + rt_tol/2) and (b['rt_min'] - rt_tol/2) <= (a['rt_max'] + rt_tol/2)
+ mz_overlap = (a['mz_min'] - mz_tol/2) <= (b['mz_max'] + mz_tol/2) and (b['mz_min'] - mz_tol/2) <= (a['mz_max'] + mz_tol/2)
+ else:
+ rt_overlap = mz_overlap = False
+
  if centroid_close or (rt_overlap and mz_overlap):
  uf.union(i,j)
 
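A hedged standalone restatement of the tightened cross-chunk linking rule above (illustrative, not part of the diff); a and b are chunk-level consensus dicts with 'rt'/'mz' centroids and 'rt_min'/'rt_max', 'mz_min'/'mz_max' bounds, as in _merge_chunk_results:

def should_link(a, b, rt_tol, mz_tol):
    # Strict primary check: centroids within one tolerance in both dimensions
    centroid_close = abs(a['rt'] - b['rt']) <= rt_tol and abs(a['mz'] - b['mz']) <= mz_tol
    # Interval overlap only counts when centroids are within 2x tolerance,
    # and intervals are expanded by half a tolerance instead of a full one
    centroids_reasonable = abs(a['rt'] - b['rt']) <= 2 * rt_tol and abs(a['mz'] - b['mz']) <= 2 * mz_tol
    if centroids_reasonable:
        rt_overlap = (a['rt_min'] - rt_tol / 2) <= (b['rt_max'] + rt_tol / 2) and \
                     (b['rt_min'] - rt_tol / 2) <= (a['rt_max'] + rt_tol / 2)
        mz_overlap = (a['mz_min'] - mz_tol / 2) <= (b['mz_max'] + mz_tol / 2) and \
                     (b['mz_min'] - mz_tol / 2) <= (a['mz_max'] + mz_tol / 2)
    else:
        rt_overlap = mz_overlap = False
    return centroid_close or (rt_overlap and mz_overlap)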
@@ -611,6 +1138,17 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
  cached_adducts_df=cached_adducts_df,
  cached_valid_adducts=cached_valid_adducts,
  )
+
+ # Validate RT spread doesn't exceed tolerance (with some flexibility for chunked merge)
+ rt_spread = metadata.get('rt_max', 0) - metadata.get('rt_min', 0)
+ max_allowed_spread = params.rt_tol * 2 # Allow 2x tolerance for chunked method
+
+ if rt_spread > max_allowed_spread:
+ # Skip consensus features with excessive RT spread
+ self.logger.debug(f"Skipping consensus feature {consensus_uid_counter} with RT spread {rt_spread:.3f}s > {max_allowed_spread:.3f}s")
+ consensus_uid_counter += 1
+ continue
+
  consensus_metadata.append(metadata)
 
  # Build mapping rows (deduplicated)
@@ -689,8 +1227,8 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
  inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
  coherence_values = np.array([fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None])
  prominence_values = np.array([fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None])
- prominence_scaled_values = np.array([fd.get("chrom_prominence_scaled", 0) for fd in feature_data_list if fd.get("chrom_prominence_scaled") is not None])
- height_scaled_values = np.array([fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None])
+ prominence_scaled_values = np.array([fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None])
+ height_scaled_values = np.array([fd.get("chrom_prominence_scaled", 0) for fd in feature_data_list if fd.get("chrom_prominence_scaled") is not None])
  iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
  charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])
 
@@ -1006,16 +1544,16 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
  )
  prominence_scaled_values = np.array(
  [
- fd.get("chrom_prominence_scaled", 0)
+ fd.get("chrom_height_scaled", 0)
  for fd in feature_data_list
- if fd.get("chrom_prominence_scaled") is not None
+ if fd.get("chrom_height_scaled") is not None
  ],
  )
  height_scaled_values = np.array(
  [
- fd.get("chrom_height_scaled", 0)
+ fd.get("chrom_prominence_scaled", 0)
  for fd in feature_data_list
- if fd.get("chrom_height_scaled") is not None
+ if fd.get("chrom_prominence_scaled") is not None
  ],
  )
  iso_values = np.array(
masster/study/plot.py CHANGED
@@ -310,8 +310,22 @@ def plot_alignment(
  max_inty = sample_data.select(pl.col("inty").max()).item() or 1
 
  # Get sample information
- sample_name = str(sample)
  sample_uid = sample if sample_col == "sample_uid" else sample_data.select(pl.col("sample_uid")).item() if "sample_uid" in sample_data.columns else sample
+
+ # Try to get actual sample name from samples_df if available
+ sample_name = str(sample) # fallback
+ if hasattr(self, "samples_df") and self.samples_df is not None and sample_uid is not None:
+ try:
+ sample_name_result = (
+ self.samples_df.filter(pl.col("sample_uid") == sample_uid)
+ .select("sample_name")
+ .to_series()
+ )
+ if len(sample_name_result) > 0 and sample_name_result[0] is not None:
+ sample_name = str(sample_name_result[0])
+ except Exception:
+ # Keep the fallback value
+ pass
 
  # Select columns to process
  cols_to_select = ["rt", "mz", "inty"]
masster-0.4.16.dist-info/METADATA → masster-0.4.17.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: masster
- Version: 0.4.16
+ Version: 0.4.17
  Summary: Mass spectrometry data analysis package
  Project-URL: homepage, https://github.com/zamboni-lab/masster
  Project-URL: repository, https://github.com/zamboni-lab/masster
@@ -767,7 +767,8 @@ study.integrate()
  # export results
  study.export_mgf()
  study.export_mztab()
- study.export_consensus()
+ study.export_xlsx()
+ study.export_parquet()
 
  # Save the study to .study5
  study.save()
masster-0.4.16.dist-info/RECORD → masster-0.4.17.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  masster/__init__.py,sha256=HHjKhCjkAc98LhoQfu4C6L-W2vfTEc1iXaPTxxcl_4A,800
- masster/_version.py,sha256=zMjCN14DFC1TYYvoTFcnuHINoESJ3g5QeRaN-wLn-U0,257
+ masster/_version.py,sha256=A-Vx5wjFdgUfquBN1kWTW90q7wTOwZx-uonA2Xl-IWc,257
  masster/chromatogram.py,sha256=iYpdv8C17zVnlWvOFgAn9ns2uFGiF-GgoYf5QVVAbHs,19319
  masster/logger.py,sha256=W50V_uh8RSYwGxDrDFhOuj5jpu2tKJyt_16lMw9kQwA,14755
  masster/spectrum.py,sha256=_upC_g2N9gwTaflXAugs9pSXpKUmzbIehofDordk7WI,47718
@@ -43,9 +43,9 @@ masster/study/h5.py,sha256=LiVGUAtULyPpZIUmKVJSaV38huJb8FsKOUWBOqiv0QU,82363
  masster/study/helpers.py,sha256=M5_q8O5tuFchKPW04PTuj3X335lDA2VZqcs4D8ZQJEk,158604
  masster/study/id.py,sha256=6NUBBKZCFOU1wlDKM0eXQeOIStSZCRNJ_3x7ZaIHzmM,55263
  masster/study/load.py,sha256=CQQY_7BzagE3oQTdDlqNyfuMdVWIAft-M4a2WCFnxp0,70695
- masster/study/merge.py,sha256=7ezv9GauDCw3M4wcskjQnQ3zszWap-5MvDUR4nSa6EM,69628
+ masster/study/merge.py,sha256=-gc-255NTKxkJZcIRl1wqQsMMi0m8zoZ10BkGsINFDc,92012
  masster/study/parameters.py,sha256=0elaF7YspTsB7qyajWAbRNL2VfKlGz5GJLifmO8IGkk,3276
- masster/study/plot.py,sha256=Wp48DH5x1t8w6R67AMjxLaUIKZpDa82fnUoAgEeNY5E,87564
+ masster/study/plot.py,sha256=SimX-IlqISEItAnTBsx4xsdYHRAevfN41cCENVns1lw,88236
  masster/study/processing.py,sha256=pm98FrQHoM3ov6qmjKuVN9h2KBhGgCLEZCRS7zpmJFM,41104
  masster/study/save.py,sha256=YCvp4xhnG16sNXaT2mFDBoCrIMub0Es61B97qLo0maw,6705
  masster/study/study.py,sha256=LO_hbJOOCZzeA3uterPKImFgPG6fCNQKMSVMtEwW3DU,38815
@@ -60,7 +60,7 @@ masster/study/defaults/find_ms2_def.py,sha256=RL0DFG41wQ05U8UQKUGr3vzSl3mU0m0knQ
  masster/study/defaults/identify_def.py,sha256=96rxoCAPQj_yX-3mRoD2LTkTLJgG27eJQqwarLv5jL0,10580
  masster/study/defaults/integrate_chrom_def.py,sha256=0MNIWGTjty-Zu-NTQsIweuj3UVqEY3x1x8pK0mPwYak,7264
  masster/study/defaults/integrate_def.py,sha256=Vf4SAzdBfnsSZ3IRaF0qZvWu3gMDPHdgPfMYoPKeWv8,7246
- masster/study/defaults/merge_def.py,sha256=R-BbhfgThjOwb2QEZKYO2jdhDxxTaSDau-NXkWRO3-U,10609
+ masster/study/defaults/merge_def.py,sha256=Q31JwAaVGgVPEVIsiyeiOsF97c48IKe48HXuqh-sA_k,13189
  masster/study/defaults/study_def.py,sha256=h8dYbi9xv0sesCSQik49Z53IkskMmNtW6ixl7it5pL0,16033
  masster/wizard/README.md,sha256=mL1A3YWJZOefpJ6D0-HqGLkVRmUlOpwyVFdvJBeeoZM,14149
  masster/wizard/__init__.py,sha256=A9GHQvkq4lSRIA8V6AKB-TJy8s_npH8i1baUGdkw_is,364
@@ -68,8 +68,8 @@ masster/wizard/example.py,sha256=xEZFTH9UZ8HKOm6s3JL8Js0Uw5ChnISWBHSZCL32vsM,798
  masster/wizard/test_structure.py,sha256=h88gsYYCG6iDRjqPZC_r1H1T8y79j0E-K6OrwuHaSCU,1586
  masster/wizard/test_wizard.py,sha256=CMp1cpjH3iYYC5Fy6puF_K0kfwwk3bgOsSbUGW-t7Xk,8986
  masster/wizard/wizard.py,sha256=jMLHy4cXgNEE_-vshFmA7BNEByhfA6tV7O91jhiMYuw,48054
- masster-0.4.16.dist-info/METADATA,sha256=gNDP1Gnpz65g1WR0OGzazi2ikrRngHlIBvReOHlxYiQ,44189
- masster-0.4.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- masster-0.4.16.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
- masster-0.4.16.dist-info/licenses/LICENSE,sha256=bx5iLIKjgAdYQ7sISn7DsfHRKkoCUm1154sJJKhgqnU,35184
- masster-0.4.16.dist-info/RECORD,,
+ masster-0.4.17.dist-info/METADATA,sha256=uIdQNkAXQQzMkcVM53y_pUBZPzwqOx0lxGW8nmB1lz8,44207
+ masster-0.4.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ masster-0.4.17.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
+ masster-0.4.17.dist-info/licenses/LICENSE,sha256=bx5iLIKjgAdYQ7sISn7DsfHRKkoCUm1154sJJKhgqnU,35184
+ masster-0.4.17.dist-info/RECORD,,