masster 0.4.22__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/_version.py +1 -1
- masster/sample/adducts.py +1 -1
- masster/sample/load.py +10 -9
- masster/sample/plot.py +1 -1
- masster/sample/processing.py +4 -4
- masster/sample/sample.py +29 -32
- masster/study/analysis.py +1762 -0
- masster/study/export.py +5 -3
- masster/study/helpers.py +153 -80
- masster/study/id.py +3 -3
- masster/study/load.py +17 -52
- masster/study/merge.py +316 -313
- masster/study/parameters.py +3 -3
- masster/study/plot.py +398 -43
- masster/study/processing.py +4 -4
- masster/study/save.py +8 -4
- masster/study/study.py +97 -139
- {masster-0.4.22.dist-info → masster-0.5.0.dist-info}/METADATA +54 -14
- {masster-0.4.22.dist-info → masster-0.5.0.dist-info}/RECORD +22 -21
- {masster-0.4.22.dist-info → masster-0.5.0.dist-info}/WHEEL +0 -0
- {masster-0.4.22.dist-info → masster-0.5.0.dist-info}/entry_points.txt +0 -0
- {masster-0.4.22.dist-info → masster-0.5.0.dist-info}/licenses/LICENSE +0 -0
masster/study/merge.py
CHANGED
@@ -274,7 +274,7 @@ def _serialize_feature_map(feature_map):
 return features_data


-def merge(
+def merge(study, **kwargs) -> None:
 """
 Group features across samples into consensus features using various algorithms.

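The hunk above is representative of the rest of this file: every former Study method in merge.py becomes a module-level function that takes the study object as its first argument. A minimal usage sketch under that reading; the keyword names are inferred from the parameters handled later in this diff (method, min_samples, rt_tol, mz_tol, chunk_size, threads, link_ms2) and the values are illustrative assumptions, not documented defaults:

from masster.study.merge import merge

# `study` is assumed to be an already-loaded Study object (see masster/study/study.py).
merge(
    study,
    method="kd_chunked",   # one of: sensitivity, qt, nowarp, kd_chunked, qt_chunked, quality
    min_samples=3,
    rt_tol=2.0,            # seconds
    mz_tol=0.01,           # Da
    chunk_size=200,        # only used by the chunked methods
    threads=None,          # None = sequential chunk processing
    link_ms2=True,
)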
@@ -342,7 +342,7 @@ def merge(self, **kwargs) -> None:
 if key in valid_params:
 setattr(params, key, value)
 else:
-
+study.logger.warning(f"Unknown parameter '{key}' ignored")

 # Backward compatibility: Map old method names to new names
 method_mapping = {
@@ -362,18 +362,18 @@ def merge(self, **kwargs) -> None:
 if params.method in method_mapping:
 old_method = params.method
 params.method = method_mapping[old_method]
-
+study.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")

 # Validate method
 if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
 raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")

 # Check if chunked method is advisable for large datasets
-num_samples = len(
+num_samples = len(study.samples_df) if hasattr(study, 'samples_df') and study.samples_df is not None else 0
 if num_samples > 500:
 chunked_methods = {'kd_chunked', 'qt_chunked'}
 if params.method not in chunked_methods:
-
+study.logger.warning(
 f"Large dataset detected ({num_samples} samples > 500). "
 f"For better performance and memory efficiency, consider using a chunked method: "
 f"'kd_chunked' or 'qt_chunked' instead of '{params.method}'"
@@ -381,42 +381,43 @@ def merge(self, **kwargs) -> None:

 # Persist last used params for diagnostics
 try:
-
+study._merge_params_last = params.to_dict()
 except Exception:
-
+study._merge_params_last = {}

 # Store merge parameters in history
 try:
-if hasattr(
-
+if hasattr(study, 'store_history'):
+study.update_history(['merge'], params.to_dict())
 else:
-
+study.logger.warning("History storage not available - parameters not saved to history")
 except Exception as e:
-
+study.logger.warning(f"Failed to store merge parameters in history: {e}")

 # Ensure feature maps are available for merging (regenerate if needed)
-if len(
-
+if len(study.features_maps) < len(study.samples_df):
+study.features_maps = []
 # Feature maps will be generated on-demand within each merge method

-
+study.logger.info(
 f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
 )

 # Initialize
-
+_reset_consensus_data(study)

 # Cache adducts for performance (avoid repeated _get_adducts() calls)
 cached_adducts_df = None
 cached_valid_adducts = None
 try:
-
+from masster.study.id import _get_adducts
+cached_adducts_df = _get_adducts(study)
 if not cached_adducts_df.is_empty():
 cached_valid_adducts = set(cached_adducts_df["name"].to_list())
 else:
 cached_valid_adducts = set()
 except Exception as e:
-
+study.logger.warning(f"Could not retrieve study adducts: {e}")
 cached_valid_adducts = set()

 # Always allow '?' adducts
@@ -424,58 +425,58 @@ def merge(self, **kwargs) -> None:

 # Route to algorithm implementation
 if params.method == 'sensitivity':
-consensus_map = _merge_kd(
+consensus_map = _merge_kd(study, params)
 # Extract consensus features
-
+_extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
 elif params.method == 'qt':
-consensus_map = _merge_qt(
+consensus_map = _merge_qt(study, params)
 # Extract consensus features
-
+_extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
 elif params.method == 'nowarp':
-consensus_map = _merge_kd_nowarp(
+consensus_map = _merge_kd_nowarp(study, params)
 # Extract consensus features
-
+_extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
 elif params.method == 'quality':
-consensus_map = _merge_kd_strict(
+consensus_map = _merge_kd_strict(study, params)
 # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
 elif params.method == 'kd_chunked':
-consensus_map = _merge_kd_chunked(
+consensus_map = _merge_kd_chunked(study, params, cached_adducts_df, cached_valid_adducts)
 # Note: _merge_kd_chunked populates consensus_df directly, no need to extract
 elif params.method == 'qt_chunked':
-consensus_map = _merge_qt_chunked(
+consensus_map = _merge_qt_chunked(study, params, cached_adducts_df, cached_valid_adducts)
 # Note: _merge_qt_chunked populates consensus_df directly, no need to extract

 # Enhanced post-clustering to merge over-segmented features (for qt and kd methods)
 if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked', 'quality']:
-
+_consensus_cleanup(study, params.rt_tol, params.mz_tol)

 # Perform adduct grouping
-
+_perform_adduct_grouping(study, params.rt_tol, params.mz_tol)

 # Identify coeluting consensus features by mass shifts and update adduct information
-
+_identify_adduct_by_mass_shift(study, params.rt_tol, cached_adducts_df)

 # Link MS2 if requested
 if params.link_ms2:
-
+_finalize_merge(study, params.link_ms2, params.min_samples)

 # Log completion without the misleading feature count
 elapsed = time.time() - start_time
-
+study.logger.debug(f"Merge process completed in {elapsed:.1f}s")


-def _merge_kd(
+def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
 """KD-tree based merge (fast, recommended)"""

 # Generate temporary feature maps on-demand from features_df
-temp_feature_maps = _generate_feature_maps_on_demand(
+temp_feature_maps = _generate_feature_maps_on_demand(study)

 consensus_map = oms.ConsensusMap()
 file_descriptions = consensus_map.getColumnHeaders()

 for i, feature_map in enumerate(temp_feature_maps):
 file_description = file_descriptions.get(i, oms.ColumnHeader())
-file_description.filename =
+file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
 file_description.size = feature_map.size()
 file_description.unique_id = feature_map.getUniqueId()
 file_descriptions[i] = file_description
@@ -624,22 +625,22 @@ def _generate_feature_maps_on_demand(study):
 return temp_feature_maps


-def _merge_qt(
+def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
 """QT (Quality Threshold) based merge"""

 # Generate temporary feature maps on-demand from features_df
-temp_feature_maps = _generate_feature_maps_on_demand(
+temp_feature_maps = _generate_feature_maps_on_demand(study)

 n_samples = len(temp_feature_maps)
 if n_samples > 1000:
-
+study.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")

 consensus_map = oms.ConsensusMap()
 file_descriptions = consensus_map.getColumnHeaders()

 for i, feature_map in enumerate(temp_feature_maps):
 file_description = file_descriptions.get(i, oms.ColumnHeader())
-file_description.filename =
+file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
 file_description.size = feature_map.size()
 file_description.unique_id = feature_map.getUniqueId()
 file_descriptions[i] = file_description
@@ -665,7 +666,7 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
 return consensus_map


-def _merge_kd_strict(
+def _merge_kd_strict(study, params: merge_defaults) -> oms.ConsensusMap:
 """
 Quality merge: Standard KD algorithm with post-processing quality control.

@@ -695,8 +696,8 @@ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:

 if optimize_rt_tol:
 # Optimize RT tolerance first
-optimal_rt_tol = _optimize_rt_tolerance(
-
+optimal_rt_tol = _optimize_rt_tolerance(study, params)
+study.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
 # Create modified params with optimal RT tolerance
 import copy
 optimized_params = copy.deepcopy(params)
@@ -705,22 +706,22 @@ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
 optimized_params = params

 # Phase 1: Standard KD clustering
-
-consensus_map = _merge_kd(
+study.logger.debug("Initial KD clustering")
+consensus_map = _merge_kd(study, optimized_params)

 # Phase 2: Post-processing quality control
-
-consensus_map = _apply_kd_strict_postprocessing(
+study.logger.debug("Post-processing quality control")
+consensus_map = _apply_kd_strict_postprocessing(study, consensus_map, optimized_params)

 return consensus_map


-def _optimize_rt_tolerance(
+def _optimize_rt_tolerance(study, params: merge_defaults) -> float:
 """
 Optimize RT tolerance by testing different values and measuring oversegmentation.

 Args:
-
+study: Study object
 params: Merge parameters

 Returns:
@@ -729,7 +730,7 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
 rt_tol_range = getattr(params, 'rt_tol_range', (0.8, 2.0))
 rt_tol_steps = getattr(params, 'rt_tol_steps', 5)

-
+study.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")

 # Generate test values
 test_rt_tols = [rt_tol_range[0] + i * (rt_tol_range[1] - rt_tol_range[0]) / (rt_tol_steps - 1)
@@ -739,8 +740,8 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
 best_score = float('inf')

 # Store original features for restoration
-original_consensus_df = getattr(
-original_consensus_mapping_df = getattr(
+original_consensus_df = getattr(study, 'consensus_df', pl.DataFrame())
+original_consensus_mapping_df = getattr(study, 'consensus_mapping_df', pl.DataFrame())

 for test_rt_tol in test_rt_tols:
 try:
@@ -750,18 +751,18 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
 test_params.rt_tol = test_rt_tol

 # Run KD merge with test parameters
-test_consensus_map = _merge_kd(
+test_consensus_map = _merge_kd(study, test_params)

 # Extract consensus features temporarily for analysis
-
+_extract_consensus_features(study, test_consensus_map, test_params.min_samples)

-if len(
+if len(study.consensus_df) == 0:
 continue

 # Calculate oversegmentation metrics
-oversegmentation_score = _calculate_oversegmentation_score(
+oversegmentation_score = _calculate_oversegmentation_score(study, test_rt_tol)

-
+study.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(study.consensus_df)} features, score: {oversegmentation_score:.3f}")

 # Lower score is better (less oversegmentation)
 if oversegmentation_score < best_score:
@@ -769,50 +770,50 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
 best_rt_tol = test_rt_tol

 except Exception as e:
-
+study.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
 continue

 # Restore original consensus data
-
-
+study.consensus_df = original_consensus_df
+study.consensus_mapping_df = original_consensus_mapping_df

-
+study.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
 return best_rt_tol


-def _calculate_oversegmentation_score(
+def _calculate_oversegmentation_score(study, rt_tol: float) -> float:
 """
 Calculate oversegmentation score based on feature density and RT spread metrics.
 Lower scores indicate less oversegmentation.

 Args:
-
+study: Study object
 rt_tol: RT tolerance used

 Returns:
 Oversegmentation score (lower = better)
 """
-if len(
+if len(study.consensus_df) == 0:
 return float('inf')

 # Metric 1: Feature density (features per RT second)
-rt_range =
+rt_range = study.consensus_df['rt'].max() - study.consensus_df['rt'].min()
 if rt_range <= 0:
 return float('inf')

-feature_density = len(
+feature_density = len(study.consensus_df) / rt_range

 # Metric 2: Average RT spread relative to tolerance
-rt_spreads = (
+rt_spreads = (study.consensus_df['rt_max'] - study.consensus_df['rt_min'])
 avg_rt_spread_ratio = rt_spreads.mean() / rt_tol if rt_tol > 0 else float('inf')

 # Metric 3: Proportion of features with low sample counts (indicates fragmentation)
-low_sample_features = len(
-low_sample_ratio = low_sample_features / len(
+low_sample_features = len(study.consensus_df.filter(pl.col('number_samples') <= 5))
+low_sample_ratio = low_sample_features / len(study.consensus_df)

 # Metric 4: Number of features with excessive RT spread
 excessive_spread_features = len(rt_spreads.filter(rt_spreads > rt_tol * 2))
-excessive_spread_ratio = excessive_spread_features / len(
+excessive_spread_ratio = excessive_spread_features / len(study.consensus_df)

 # Combined score (weighted combination)
 oversegmentation_score = (
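The score assembled above combines four diagnostics (feature density, mean RT spread relative to the tolerance, the fraction of low-sample features, and the fraction of features with excessive RT spread) into a single number where lower means less oversegmentation. The actual weights are not visible in this diff, so the sketch below uses equal, purely illustrative weights on plain numbers rather than masster's consensus_df:

def oversegmentation_score(feature_density, avg_rt_spread_ratio,
                           low_sample_ratio, excessive_spread_ratio,
                           weights=(0.25, 0.25, 0.25, 0.25)):
    # Weighted sum of the four diagnostics; the real weighting in masster is not
    # shown in this diff, so equal weights are an assumption for illustration.
    metrics = (feature_density, avg_rt_spread_ratio,
               low_sample_ratio, excessive_spread_ratio)
    return sum(w * m for w, m in zip(weights, metrics))

# Lower is better: a denser, more fragmented result scores higher.
print(oversegmentation_score(1.8, 0.9, 0.40, 0.10))
print(oversegmentation_score(0.6, 0.4, 0.05, 0.01))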
@@ -825,7 +826,7 @@ def _calculate_oversegmentation_score(self, rt_tol: float) -> float:
 return oversegmentation_score


-def _apply_kd_strict_postprocessing(
+def _apply_kd_strict_postprocessing(study, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
 """
 Apply post-processing quality control to KD consensus map.

@@ -837,20 +838,20 @@ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, param
 Processed consensus map with reduced oversegmentation
 """
 if consensus_map.size() == 0:
-
+study.logger.warning("Empty consensus map provided to post-processing")
 return consensus_map

-
+study.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")

 # Step 1: Extract initial consensus features
 original_min_samples = params.min_samples
 params.min_samples = 1 # Extract all features initially

-
-initial_feature_count = len(
+_extract_consensus_features(study, consensus_map, params.min_samples)
+initial_feature_count = len(study.consensus_df)

 if initial_feature_count == 0:
-
+study.logger.warning("No consensus features extracted for post-processing")
 params.min_samples = original_min_samples
 return consensus_map

@@ -858,67 +859,67 @@ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, param
 secondary_merge_rt_tol = getattr(params, 'secondary_merge_rt_tol', 0.5)
 secondary_merge_mz_tol = getattr(params, 'secondary_merge_mz_tol', 0.005)

-
-merged_features = _perform_secondary_clustering(
+study.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
+merged_features = _perform_secondary_clustering(study, secondary_merge_rt_tol, secondary_merge_mz_tol)

 # Step 3: Sample overlap validation
 min_sample_overlap = getattr(params, 'min_sample_overlap', 0.8)
 if min_sample_overlap > 0:
-
-merged_features = _validate_sample_overlap(
+study.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
+merged_features = _validate_sample_overlap(study, merged_features, min_sample_overlap)

 # Step 4: RT spread quality filtering
 if params.rt_tol is not None:
 max_rt_spread = getattr(params, 'max_rt_spread', params.rt_tol * 2)
 if max_rt_spread is not None:
-
-merged_features = _filter_rt_spread(
+study.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
+merged_features = _filter_rt_spread(study, merged_features, max_rt_spread)
 else:
-
+study.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
 else:
-
+study.logger.debug("Skipping RT spread filtering - rt_tol is None")

 # Step 5: Chromatographic coherence filtering (optional)
 min_coherence = getattr(params, 'min_coherence', 0.0)
 if min_coherence > 0:
-
-merged_features = _filter_coherence(
+study.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
+merged_features = _filter_coherence(study, merged_features, min_coherence)

 # Step 6: Rebuild consensus_df with filtered features and preserve mapping
-original_mapping_df =
-
+original_mapping_df = study.consensus_mapping_df.clone() # Save original mapping
+study.consensus_df = pl.DataFrame(merged_features, strict=False)

 # Step 7: Apply original min_samples filter
 params.min_samples = original_min_samples
 if params.min_samples > 1:
-l1 = len(
-
+l1 = len(study.consensus_df)
+study.consensus_df = study.consensus_df.filter(
 pl.col("number_samples") >= params.min_samples
 )
-filtered_count = l1 - len(
+filtered_count = l1 - len(study.consensus_df)
 if filtered_count > 0:
-
+study.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")

 # Step 8: Update consensus_mapping_df to match final consensus_df
-if len(
-valid_consensus_ids = set(
-
+if len(study.consensus_df) > 0 and len(original_mapping_df) > 0:
+valid_consensus_ids = set(study.consensus_df['consensus_uid'].to_list())
+study.consensus_mapping_df = original_mapping_df.filter(
 pl.col('consensus_uid').is_in(list(valid_consensus_ids))
 )
 else:
-
+study.consensus_mapping_df = pl.DataFrame()

-final_feature_count = len(
+final_feature_count = len(study.consensus_df)
 reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0

-
+study.logger.info(f"Consensus cleanup complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")

 # Create a new consensus map for compatibility (the processed data is in consensus_df)
 processed_consensus_map = oms.ConsensusMap()
 return processed_consensus_map


-def _perform_secondary_clustering(
+def _perform_secondary_clustering(study, rt_tol: float, mz_tol: float) -> list:
 """
 Perform secondary clustering to merge very close features.

@@ -929,34 +930,34 @@ def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
 Returns:
 List of merged consensus feature dictionaries
 """
-if len(
+if len(study.consensus_df) == 0:
 return []

 # Convert consensus_df to list of dictionaries for clustering
 consensus_features = []
-for i, row in enumerate(
+for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
 consensus_features.append(dict(row))

 # Use Union-Find for efficient clustering
 class UnionFind:
-def __init__(
-
-
+def __init__(study, n):
+study.parent = list(range(n))
+study.rank = [0] * n

-def find(
-if
-
-return
+def find(study, x):
+if study.parent[x] != x:
+study.parent[x] = study.find(study.parent[x])
+return study.parent[x]

-def union(
-px, py =
+def union(study, x, y):
+px, py = study.find(x), study.find(y)
 if px == py:
 return
-if
+if study.rank[px] < study.rank[py]:
 px, py = py, px
-
-if
-
+study.parent[py] = px
+if study.rank[px] == study.rank[py]:
+study.rank[px] += 1

 n_features = len(consensus_features)
 uf = UnionFind(n_features)
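The secondary clustering above groups nearby features with a disjoint-set (Union-Find) structure that uses path compression and union by rank; in this release the refactor passes `study` where `self` used to be, so inside the nested helper class it simply plays the role of the instance. For reference, a self-contained sketch of the same technique with conventional naming (illustrative only, not masster's code):

class UnionFind:
    def __init__(self, n: int):
        self.parent = list(range(n))  # each element starts as its own root
        self.rank = [0] * n           # tree-depth bound used by union by rank

    def find(self, x: int) -> int:
        # Path compression: point x directly at its root.
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x: int, y: int) -> None:
        px, py = self.find(x), self.find(y)
        if px == py:
            return
        if self.rank[px] < self.rank[py]:
            px, py = py, px
        self.parent[py] = px          # attach the shallower tree under the deeper one
        if self.rank[px] == self.rank[py]:
            self.rank[px] += 1

# Pairs whose RT and m/z fall within tolerance get unioned; each root then
# identifies one merged cluster.
uf = UnionFind(4)
uf.union(0, 1)
uf.union(2, 3)
assert uf.find(0) == uf.find(1) and uf.find(0) != uf.find(2)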
@@ -992,7 +993,7 @@ def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
 merged_feature = _merge_feature_group(group)
 merged_features.append(merged_feature)

-
+study.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
 return merged_features


@@ -1066,7 +1067,7 @@ def _merge_feature_group(feature_group: list) -> dict:
 return merged


-def _validate_sample_overlap(
+def _validate_sample_overlap(study, features: list, min_overlap: float) -> list:
 """
 Validate that merged features have sufficient sample overlap.

@@ -1097,7 +1098,7 @@ def _validate_sample_overlap(self, features: list, min_overlap: float) -> list:
 return validated_features


-def _filter_rt_spread(
+def _filter_rt_spread(study, features: list, max_rt_spread: float) -> list:
 """
 Filter out features with excessive RT spread.

@@ -1122,12 +1123,12 @@ def _filter_rt_spread(self, features: list, max_rt_spread: float) -> list:
 filtered_count += 1

 if filtered_count > 0:
-
+study.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")

 return filtered_features


-def _filter_coherence(
+def _filter_coherence(study, features: list, min_coherence: float) -> list:
 """
 Filter out features with low chromatographic coherence.

@@ -1150,23 +1151,23 @@ def _filter_coherence(self, features: list, min_coherence: float) -> list:
 filtered_count += 1

 if filtered_count > 0:
-
+study.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")

 return filtered_features


-def _merge_kd_nowarp(
+def _merge_kd_nowarp(study, params: merge_defaults) -> oms.ConsensusMap:
 """KD-tree based merge without RT warping"""

 # Generate temporary feature maps on-demand from features_df
-temp_feature_maps = _generate_feature_maps_on_demand(
+temp_feature_maps = _generate_feature_maps_on_demand(study)

 consensus_map = oms.ConsensusMap()
 file_descriptions = consensus_map.getColumnHeaders()

 for i, feature_map in enumerate(temp_feature_maps):
 file_description = file_descriptions.get(i, oms.ColumnHeader())
-file_description.filename =
+file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
 file_description.size = feature_map.size()
 file_description.unique_id = feature_map.getUniqueId()
 file_descriptions[i] = file_description
@@ -1193,18 +1194,18 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
 return consensus_map


-def _merge_kd_chunked(
+def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
 """KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""

 # Generate temporary feature maps on-demand from features_df
-temp_feature_maps = _generate_feature_maps_on_demand(
+temp_feature_maps = _generate_feature_maps_on_demand(study)

 n_samples = len(temp_feature_maps)
 if n_samples <= params.chunk_size:
-
-consensus_map = _merge_kd(
+study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
+consensus_map = _merge_kd(study, params)
 # Extract consensus features to populate consensus_df for chunked method consistency
-
+_extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
 return consensus_map

 # Process in chunks
@@ -1213,21 +1214,21 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
 chunk_end = min(i + params.chunk_size, n_samples)
 chunks.append((i, temp_feature_maps[i:chunk_end]))

-
+study.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")

 # Process each chunk to create chunk consensus maps
 chunk_consensus_maps = []

 if params.threads is None:
 # Sequential processing (original behavior)
-for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {
+for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}KD Chunk", disable=study.log_level not in ["TRACE", "DEBUG", "INFO"])):
 chunk_consensus_map = oms.ConsensusMap()

 # Set up file descriptions for chunk
 file_descriptions = chunk_consensus_map.getColumnHeaders()
 for j, feature_map in enumerate(chunk_maps):
 file_description = file_descriptions.get(j, oms.ColumnHeader())
-file_description.filename =
+file_description.filename = study.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
 file_description.size = feature_map.size()
 file_description.unique_id = feature_map.getUniqueId()
 file_descriptions[j] = file_description
@@ -1255,7 +1256,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach

 else:
 # Parallel processing
-
+study.logger.info(f"Processing chunks in parallel using {params.threads} processes")

 # Prepare chunk data for parallel processing using features_df slices
 chunk_data_list = []
@@ -1264,7 +1265,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
 chunk_sample_uids = []
 chunk_samples_df_rows = []
 for j in range(len(chunk_maps)):
-sample_row =
+sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
 chunk_sample_uids.append(sample_row['sample_uid'])
 chunk_samples_df_rows.append(sample_row)

@@ -1272,7 +1273,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
 chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)

 # Filter features_df for this chunk's samples and select only necessary columns
-chunk_features_df =
+chunk_features_df = study.features_df.filter(
 pl.col('sample_uid').is_in(chunk_sample_uids)
 ).select([
 'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
@@ -1316,22 +1317,22 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
 serialized_chunk_results.append((chunk_start_idx, consensus_features))
 completed_chunks += 1
 n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
-
+study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
 except Exception as exc:
 # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
 if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
 # Convert to RuntimeError so outer except block can catch it for fallback
 raise RuntimeError(f"Windows multiprocessing failure: {exc}")
 else:
-
+study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
 raise exc

 except (RuntimeError, OSError, BrokenProcessPool) as e:
 # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
 if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
 "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
-
-
+study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
+study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")

 with ThreadPoolExecutor(max_workers=params.threads) as executor:
 # Submit all chunk processing tasks
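The chunked paths above submit each chunk to a ProcessPoolExecutor and, when the pool breaks (the BrokenProcessPool and freeze_support failures called out for Windows), rerun the same work on a ThreadPoolExecutor. A generic, self-contained sketch of that fallback pattern; process_chunk and the chunk payloads are placeholders, not masster functions:

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from concurrent.futures.process import BrokenProcessPool


def process_chunk(chunk):
    return sum(chunk)  # stand-in for per-chunk consensus building


def run_chunks(chunks, threads=4):
    try:
        with ProcessPoolExecutor(max_workers=threads) as executor:
            return list(executor.map(process_chunk, chunks))
    except (BrokenProcessPool, OSError, RuntimeError):
        # Fall back to threads when the process pool cannot be bootstrapped
        # (e.g. the Windows spawn/freeze_support issues noted in the diff).
        with ThreadPoolExecutor(max_workers=threads) as executor:
            return list(executor.map(process_chunk, chunks))


if __name__ == "__main__":
    print(run_chunks([[1, 2], [3, 4]], threads=2))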
@@ -1350,9 +1351,9 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
1350
1351
|
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
1351
1352
|
completed_chunks += 1
|
|
1352
1353
|
n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
|
|
1353
|
-
|
|
1354
|
+
study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
|
|
1354
1355
|
except Exception as exc:
|
|
1355
|
-
|
|
1356
|
+
study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
1356
1357
|
raise exc
|
|
1357
1358
|
else:
|
|
1358
1359
|
# Re-raise other exceptions
|
|
@@ -1366,25 +1367,25 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
1366
1367
|
|
|
1367
1368
|
# Merge chunk results with proper cross-chunk consensus building
|
|
1368
1369
|
# _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
|
|
1369
|
-
_merge_chunk_results(
|
|
1370
|
+
_merge_chunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
|
|
1370
1371
|
|
|
1371
|
-
# Return a dummy consensus map for compatibility (consensus features are stored in
|
|
1372
|
+
# Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
|
|
1372
1373
|
consensus_map = oms.ConsensusMap()
|
|
1373
1374
|
return consensus_map
|
|
1374
1375
|
|
|
1375
1376
|
|
|
1376
|
-
def _merge_qt_chunked(
|
|
1377
|
+
def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
|
|
1377
1378
|
"""QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
|
|
1378
1379
|
|
|
1379
1380
|
# Generate temporary feature maps on-demand from features_df
|
|
1380
|
-
temp_feature_maps = _generate_feature_maps_on_demand(
|
|
1381
|
+
temp_feature_maps = _generate_feature_maps_on_demand(study)
|
|
1381
1382
|
|
|
1382
1383
|
n_samples = len(temp_feature_maps)
|
|
1383
1384
|
if n_samples <= params.chunk_size:
|
|
1384
|
-
|
|
1385
|
-
consensus_map = _merge_qt(
|
|
1385
|
+
study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
|
|
1386
|
+
consensus_map = _merge_qt(study, params)
|
|
1386
1387
|
# Extract consensus features to populate consensus_df for chunked method consistency
|
|
1387
|
-
|
|
1388
|
+
_extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
|
|
1388
1389
|
return consensus_map
|
|
1389
1390
|
|
|
1390
1391
|
# Process in chunks
|
|
@@ -1393,21 +1394,21 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
1393
1394
|
chunk_end = min(i + params.chunk_size, n_samples)
|
|
1394
1395
|
chunks.append((i, temp_feature_maps[i:chunk_end]))
|
|
1395
1396
|
|
|
1396
|
-
|
|
1397
|
+
study.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
|
|
1397
1398
|
|
|
1398
1399
|
# Process each chunk to create chunk consensus maps
|
|
1399
1400
|
chunk_consensus_maps = []
|
|
1400
1401
|
|
|
1401
1402
|
if params.threads is None:
|
|
1402
1403
|
# Sequential processing (original behavior)
|
|
1403
|
-
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {
|
|
1404
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}QT Chunk", disable=study.log_level not in ["TRACE", "DEBUG", "INFO"])):
|
|
1404
1405
|
chunk_consensus_map = oms.ConsensusMap()
|
|
1405
1406
|
|
|
1406
1407
|
# Set up file descriptions for chunk
|
|
1407
1408
|
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
1408
1409
|
for j, feature_map in enumerate(chunk_maps):
|
|
1409
1410
|
file_description = file_descriptions.get(j, oms.ColumnHeader())
|
|
1410
|
-
file_description.filename =
|
|
1411
|
+
file_description.filename = study.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
|
|
1411
1412
|
file_description.size = feature_map.size()
|
|
1412
1413
|
file_description.unique_id = feature_map.getUniqueId()
|
|
1413
1414
|
file_descriptions[j] = file_description
|
|
@@ -1430,7 +1431,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
1430
1431
|
|
|
1431
1432
|
else:
|
|
1432
1433
|
# Parallel processing
|
|
1433
|
-
|
|
1434
|
+
study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
|
|
1434
1435
|
|
|
1435
1436
|
# Prepare chunk data for parallel processing using features_df slices
|
|
1436
1437
|
chunk_data_list = []
|
|
@@ -1439,7 +1440,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
1439
1440
|
chunk_sample_uids = []
|
|
1440
1441
|
chunk_samples_df_rows = []
|
|
1441
1442
|
for j in range(len(chunk_maps)):
|
|
1442
|
-
sample_row =
|
|
1443
|
+
sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
|
|
1443
1444
|
chunk_sample_uids.append(sample_row['sample_uid'])
|
|
1444
1445
|
chunk_samples_df_rows.append(sample_row)
|
|
1445
1446
|
|
|
@@ -1447,7 +1448,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
1447
1448
|
chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
|
|
1448
1449
|
|
|
1449
1450
|
# Filter features_df for this chunk's samples and select only necessary columns
|
|
1450
|
-
chunk_features_df =
|
|
1451
|
+
chunk_features_df = study.features_df.filter(
|
|
1451
1452
|
pl.col('sample_uid').is_in(chunk_sample_uids)
|
|
1452
1453
|
).select([
|
|
1453
1454
|
'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
|
|
@@ -1491,22 +1492,22 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
1491
1492
|
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
1492
1493
|
completed_chunks += 1
|
|
1493
1494
|
n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
|
|
1494
|
-
|
|
1495
|
+
study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
|
|
1495
1496
|
except Exception as exc:
|
|
1496
1497
|
# Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
|
|
1497
1498
|
if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
|
|
1498
1499
|
# Convert to RuntimeError so outer except block can catch it for fallback
|
|
1499
1500
|
raise RuntimeError(f"Windows multiprocessing failure: {exc}")
|
|
1500
1501
|
else:
|
|
1501
|
-
|
|
1502
|
+
study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
1502
1503
|
raise exc
|
|
1503
1504
|
|
|
1504
1505
|
except (RuntimeError, OSError, BrokenProcessPool) as e:
|
|
1505
1506
|
# Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
|
|
1506
1507
|
if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
|
|
1507
1508
|
"process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
|
|
1508
|
-
|
|
1509
|
-
|
|
1509
|
+
study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
|
|
1510
|
+
study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
|
|
1510
1511
|
|
|
1511
1512
|
with ThreadPoolExecutor(max_workers=params.threads) as executor:
|
|
1512
1513
|
# Submit all chunk processing tasks
|
|
@@ -1525,9 +1526,9 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
1525
1526
|
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
1526
1527
|
completed_chunks += 1
|
|
1527
1528
|
n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
|
|
1528
|
-
|
|
1529
|
+
study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
|
|
1529
1530
|
except Exception as exc:
|
|
1530
|
-
|
|
1531
|
+
study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
1531
1532
|
raise exc
|
|
1532
1533
|
else:
|
|
1533
1534
|
# Re-raise other exceptions
|
|
@@ -1541,14 +1542,14 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
1541
1542
|
|
|
1542
1543
|
# Merge chunk results with proper cross-chunk consensus building
|
|
1543
1544
|
# _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
|
|
1544
|
-
_merge_chunk_results(
|
|
1545
|
+
_merge_chunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
|
|
1545
1546
|
|
|
1546
|
-
# Return a dummy consensus map for compatibility (consensus features are stored in
|
|
1547
|
+
# Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
|
|
1547
1548
|
consensus_map = oms.ConsensusMap()
|
|
1548
1549
|
return consensus_map
|
|
1549
1550
|
|
|
1550
1551
|
|
|
1551
|
-
def _merge_chunk_results(
|
|
1552
|
+
def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
|
|
1552
1553
|
"""
|
|
1553
1554
|
Scalable aggregation of chunk consensus maps into final consensus_df.
|
|
1554
1555
|
|
|
@@ -1561,7 +1562,8 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
|
|
|
1561
1562
|
if len(chunk_consensus_maps) == 1:
|
|
1562
1563
|
# Single chunk case - just extract using the true global min_samples.
|
|
1563
1564
|
# No need for permissive threshold because we are not discarding singletons pre-aggregation.
|
|
1564
|
-
|
|
1565
|
+
_extract_consensus_features(
|
|
1566
|
+
study,
|
|
1565
1567
|
chunk_consensus_maps[0][1],
|
|
1566
1568
|
params.min_samples,
|
|
1567
1569
|
cached_adducts_df,
|
|
@@ -1572,10 +1574,10 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
|
|
|
1572
1574
|
# Build feature_uid to feature_data lookup for fast access
|
|
1573
1575
|
feature_uid_map = {
|
|
1574
1576
|
row["feature_id"]: row["feature_uid"]
|
|
1575
|
-
for row in
|
|
1577
|
+
for row in study.features_df.iter_rows(named=True)
|
|
1576
1578
|
}
|
|
1577
1579
|
|
|
1578
|
-
features_lookup = _optimized_feature_lookup(
|
|
1580
|
+
features_lookup = _optimized_feature_lookup(study, study.features_df)
|
|
1579
1581
|
|
|
1580
1582
|
# Extract all consensus features from chunks with their feature_uids
|
|
1581
1583
|
all_chunk_consensus = []
|
|
@@ -1717,8 +1719,8 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
|
|
|
1717
1719
|
|
|
1718
1720
|
if not all_chunk_consensus:
|
|
1719
1721
|
# No valid consensus features found
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
+
study.consensus_df = pl.DataFrame()
|
|
1723
|
+
study.consensus_mapping_df = pl.DataFrame()
|
|
1722
1724
|
return
|
|
1723
1725
|
|
|
1724
1726
|
# Perform cross-chunk clustering using optimized spatial indexing
|
|
@@ -1744,22 +1746,22 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
|
|
|
1744
1746
|
features_by_bin[(rt_bin, mz_bin)].append(i)
|
|
1745
1747
|
|
|
1746
1748
|
class UF:
|
|
1747
|
-
def __init__(
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
def find(
|
|
1751
|
-
if
|
|
1752
|
-
|
|
1753
|
-
return
|
|
1754
|
-
def union(
|
|
1755
|
-
pa, pb =
|
|
1749
|
+
def __init__(study, n):
|
|
1750
|
+
study.p = list(range(n))
|
|
1751
|
+
study.r = [0]*n
|
|
1752
|
+
def find(study, x):
|
|
1753
|
+
if study.p[x] != x:
|
|
1754
|
+
study.p[x] = study.find(study.p[x])
|
|
1755
|
+
return study.p[x]
|
|
1756
|
+
def union(study, a,b):
|
|
1757
|
+
pa, pb = study.find(a), study.find(b)
|
|
1756
1758
|
if pa == pb:
|
|
1757
1759
|
return
|
|
1758
|
-
if
|
|
1760
|
+
if study.r[pa] < study.r[pb]:
|
|
1759
1761
|
pa, pb = pb, pa
|
|
1760
|
-
|
|
1761
|
-
if
|
|
1762
|
-
|
|
1762
|
+
study.p[pb] = pa
|
|
1763
|
+
if study.r[pa] == study.r[pb]:
|
|
1764
|
+
study.r[pa] += 1
|
|
1763
1765
|
|
|
1764
1766
|
uf = UF(n_features)
|
|
1765
1767
|
checked = set()
|
|
@@ -1918,7 +1920,7 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
|
|
|
1918
1920
|
# This allows proper cross-chunk consensus building before final filtering
|
|
1919
1921
|
|
|
1920
1922
|
metadata = _calculate_consensus_statistics(
|
|
1921
|
-
|
|
1923
|
+
study,
|
|
1922
1924
|
consensus_uid_counter,
|
|
1923
1925
|
list(feature_data_acc.values()),
|
|
1924
1926
|
rt_values_chunk,
|
|
@@ -1937,7 +1939,7 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
|
|
|
1937
1939
|
|
|
1938
1940
|
if rt_spread > max_allowed_spread:
|
|
1939
1941
|
# Skip consensus features with excessive RT spread
|
|
1940
|
-
|
|
1942
|
+
study.logger.debug(f"Skipping consensus feature {consensus_uid_counter} with RT spread {rt_spread:.3f}s > {max_allowed_spread:.3f}s")
|
|
1941
1943
|
consensus_uid_counter += 1
|
|
1942
1944
|
continue
|
|
1943
1945
|
|
|
@@ -1969,27 +1971,27 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
|
|
|
1969
1971
|
consensus_uid_counter += 1
|
|
1970
1972
|
|
|
1971
1973
|
# Assign DataFrames
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
+
study.consensus_df = pl.DataFrame(consensus_metadata, strict=False)
|
|
1975
|
+
study.consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)
|
|
1974
1976
|
|
|
1975
1977
|
# Ensure mapping only contains features from retained consensus_df
|
|
1976
|
-
if len(
|
|
1977
|
-
valid_consensus_ids = set(
|
|
1978
|
-
|
|
1978
|
+
if len(study.consensus_df) > 0:
|
|
1979
|
+
valid_consensus_ids = set(study.consensus_df['consensus_uid'].to_list())
|
|
1980
|
+
study.consensus_mapping_df = study.consensus_mapping_df.filter(
|
|
1979
1981
|
pl.col('consensus_uid').is_in(list(valid_consensus_ids))
|
|
1980
1982
|
)
|
|
1981
1983
|
else:
|
|
1982
|
-
|
|
1984
|
+
study.consensus_mapping_df = pl.DataFrame()
|
|
1983
1985
|
|
|
1984
1986
|
# Attach empty consensus_map placeholder for downstream compatibility
|
|
1985
|
-
|
|
1987
|
+
study.consensus_map = oms.ConsensusMap()
|
|
1986
1988
|
return
|
|
1987
1989
|
|
|
1988
1990
|
|
|
1989
1991
|
def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_list: list,
|
|
1990
1992
|
rt_values: list, mz_values: list,
|
|
1991
1993
|
intensity_values: list, quality_values: list,
|
|
1992
|
-
number_features: int = None, number_samples: int = None,
|
|
1994
|
+
number_features: int | None = None, number_samples: int | None = None,
|
|
1993
1995
|
cached_adducts_df=None, cached_valid_adducts=None) -> dict:
|
|
1994
1996
|
"""
|
|
1995
1997
|
Calculate comprehensive statistics for a consensus feature from aggregated feature data.
|
|
@@ -2158,24 +2160,24 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
|
|
|
2158
2160
|
|
|
2159
2161
|
# Use Union-Find for efficient clustering
|
|
2160
2162
|
class UnionFind:
|
|
2161
|
-
def __init__(
|
|
2162
|
-
|
|
2163
|
-
|
|
2163
|
+
def __init__(study, n):
|
|
2164
|
+
study.parent = list(range(n))
|
|
2165
|
+
study.rank = [0] * n
|
|
2164
2166
|
|
|
2165
|
-
def find(
|
|
2166
|
-
if
|
|
2167
|
-
|
|
2168
|
-
return
|
|
2167
|
+
def find(study, x):
|
|
2168
|
+
if study.parent[x] != x:
|
|
2169
|
+
study.parent[x] = study.find(study.parent[x])
|
|
2170
|
+
return study.parent[x]
|
|
2169
2171
|
|
|
2170
|
-
def union(
|
|
2171
|
-
px, py =
|
|
2172
|
+
def union(study, x, y):
|
|
2173
|
+
px, py = study.find(x), study.find(y)
|
|
2172
2174
|
if px == py:
|
|
2173
2175
|
return
|
|
2174
|
-
if
|
|
2176
|
+
if study.rank[px] < study.rank[py]:
|
|
2175
2177
|
px, py = py, px
|
|
2176
|
-
|
|
2177
|
-
if
|
|
2178
|
-
|
|
2178
|
+
study.parent[py] = px
|
|
2179
|
+
if study.rank[px] == study.rank[py]:
|
|
2180
|
+
study.rank[px] += 1
|
|
2179
2181
|
|
|
2180
2182
|
n_features = len(features)
|
|
2181
2183
|
uf = UnionFind(n_features)
|
|
@@ -2208,39 +2210,39 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
|
|
|
2208
2210
|
return list(groups_by_root.values())
|
|
2209
2211
|
|
|
2210
2212
|
|
|
2211
|
-
def _reset_consensus_data(
|
|
2213
|
+
def _reset_consensus_data(study):
|
|
2212
2214
|
"""Reset consensus-related DataFrames at the start of merge."""
|
|
2213
|
-
|
|
2214
|
-
|
|
2215
|
-
|
|
2215
|
+
study.consensus_df = pl.DataFrame()
|
|
2216
|
+
study.consensus_ms2 = pl.DataFrame()
|
|
2217
|
+
study.consensus_mapping_df = pl.DataFrame()
|
|
2216
2218
|
|
|
2217
2219
|
|
|
2218
|
-
def _extract_consensus_features(
|
|
2220
|
+
def _extract_consensus_features(study, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
|
|
2219
2221
|
"""Extract consensus features and build metadata."""
|
|
2220
|
-
# create a dict to map uid to feature_uid using
|
|
2222
|
+
# create a dict to map uid to feature_uid using study.features_df
|
|
2221
2223
|
feature_uid_map = {
|
|
2222
2224
|
row["feature_id"]: row["feature_uid"]
|
|
2223
|
-
for row in
|
|
2225
|
+
for row in study.features_df.iter_rows(named=True)
|
|
2224
2226
|
}
|
|
2225
2227
|
imax = consensus_map.size()
|
|
2226
2228
|
|
|
2227
|
-
|
|
2229
|
+
study.logger.debug(f"Found {imax} feature groups by clustering.")
|
|
2228
2230
|
|
|
2229
2231
|
# Pre-build fast lookup tables for features_df data using optimized approach
|
|
2230
|
-
features_lookup = _optimized_feature_lookup(
|
|
2232
|
+
features_lookup = _optimized_feature_lookup(study, study.features_df)
|
|
2231
2233
|
|
|
2232
2234
|
# create a list to store the consensus mapping
|
|
2233
2235
|
consensus_mapping = []
|
|
2234
2236
|
metadata_list = []
|
|
2235
2237
|
|
|
2236
|
-
tqdm_disable =
|
|
2238
|
+
tqdm_disable = study.log_level not in ["TRACE", "DEBUG"]
|
|
2237
2239
|
|
|
2238
2240
|
for i, feature in enumerate(
|
|
2239
2241
|
tqdm(
|
|
2240
2242
|
consensus_map,
|
|
2241
2243
|
total=imax,
|
|
2242
2244
|
disable=tqdm_disable,
|
|
2243
|
-
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {
|
|
2245
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}Extract metadata",
|
|
2244
2246
|
),
|
|
2245
2247
|
):
|
|
2246
2248
|
# get all features in the feature map with the same unique id as the consensus feature
|
|
@@ -2486,7 +2488,7 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
|
|
|
2486
2488
|
adduct_mass_shift_top = 1.007825
|
|
2487
2489
|
else:
|
|
2488
2490
|
# No valid adducts found - assign default based on study polarity
|
|
2489
|
-
study_polarity = getattr(
|
|
2491
|
+
study_polarity = getattr(study, "polarity", "positive")
|
|
2490
2492
|
if study_polarity in ["negative", "neg"]:
|
|
2491
2493
|
# Negative mode default
|
|
2492
2494
|
adduct_top = "[M-?]1-"
|
|
@@ -2618,55 +2620,55 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
     )

     consensus_mapping_df = pl.DataFrame(consensus_mapping)
-    # remove all rows in consensus_mapping_df where consensus_id is not in
+    # remove all rows in consensus_mapping_df where consensus_id is not in study.featured_df['uid']
     l1 = len(consensus_mapping_df)
     consensus_mapping_df = consensus_mapping_df.filter(
-        pl.col("feature_uid").is_in(
+        pl.col("feature_uid").is_in(study.features_df["feature_uid"].to_list()),
     )
-
+    study.logger.debug(
         f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
     )
-
-
+    study.consensus_mapping_df = consensus_mapping_df
+    study.consensus_df = pl.DataFrame(metadata_list, strict=False)

     if min_samples is None:
         min_samples = 1
     if min_samples < 1:
-        min_samples = int(min_samples * len(
+        min_samples = int(min_samples * len(study.samples_df))

     # Validate that min_samples doesn't exceed the number of samples
-    if min_samples > len(
-
-    f"min_samples ({min_samples}) exceeds the number of samples ({len(
-    f"Setting min_samples to {len(
+    if min_samples > len(study.samples_df):
+        study.logger.warning(
+            f"min_samples ({min_samples}) exceeds the number of samples ({len(study.samples_df)}). "
+            f"Setting min_samples to {len(study.samples_df)}.",
         )
-        min_samples = len(
+        min_samples = len(study.samples_df)

     # filter out consensus features with less than min_samples features
-    l1 = len(
-
+    l1 = len(study.consensus_df)
+    study.consensus_df = study.consensus_df.filter(
         pl.col("number_samples") >= min_samples,
     )
-
-    f"Filtered {l1 - len(
+    study.logger.debug(
+        f"Filtered {l1 - len(study.consensus_df)} consensus features with less than {min_samples} samples.",
     )
     # filter out consensus mapping with less than min_samples features
-
-    pl.col("consensus_uid").is_in(
+    study.consensus_mapping_df = study.consensus_mapping_df.filter(
+        pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
     )

-
+    study.consensus_map = consensus_map


-def _perform_adduct_grouping(
+def _perform_adduct_grouping(study, rt_tol, mz_tol):
     """Perform adduct grouping on consensus features."""
     import polars as pl

     # Add adduct grouping and adduct_of assignment
-    if len(
+    if len(study.consensus_df) > 0:
         # Get relevant columns for grouping
         consensus_data = []
-        for row in
+        for row in study.consensus_df.iter_rows(named=True):
             consensus_data.append(
                 {
                     "consensus_uid": row["consensus_uid"],
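The min_samples logic above accepts either an absolute count or a fraction of the sample set, then drops consensus features (and their mapping rows) below the threshold. A small self-contained sketch of that normalization and filtering with polars; the helper name and toy data are made up, while the column names (number_samples, consensus_uid, feature_uid) come from the diff:

import polars as pl


def normalize_min_samples(min_samples, n_samples: int) -> int:
    # Fractions below 1 are read as a share of all samples; the result is
    # clamped to the [1, n_samples] range, as in the hunk above.
    if min_samples is None:
        min_samples = 1
    if min_samples < 1:
        min_samples = int(min_samples * n_samples)
    return min(max(min_samples, 1), n_samples)


# Toy tables with the column names used in the diff.
consensus_df = pl.DataFrame({"consensus_uid": [1, 2, 3], "number_samples": [5, 2, 9]})
consensus_mapping_df = pl.DataFrame(
    {"consensus_uid": [1, 1, 2, 3], "feature_uid": [10, 11, 12, 13]}
)

min_samples = normalize_min_samples(0.5, n_samples=10)  # -> 5
consensus_df = consensus_df.filter(pl.col("number_samples") >= min_samples)
consensus_mapping_df = consensus_mapping_df.filter(
    pl.col("consensus_uid").is_in(consensus_df["consensus_uid"].to_list())
)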
@@ -2679,11 +2681,11 @@ def _perform_adduct_grouping(self, rt_tol, mz_tol):

         # Use optimized adduct grouping
         adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
-
+            study, consensus_data, rt_tol, mz_tol
         )

         # Add the new columns to consensus_df
-
+        study.consensus_df = study.consensus_df.with_columns(
             [
                 pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
                 pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
@@ -2691,7 +2693,7 @@ def _perform_adduct_grouping(self, rt_tol, mz_tol):
         )


-def _count_tight_clusters(
+def _count_tight_clusters(study, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
     """
     Count consensus features grouped in tight clusters.

@@ -2702,12 +2704,12 @@ def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> in
     Returns:
         Number of tight clusters found
     """
-    if len(
+    if len(study.consensus_df) < 2:
         return 0

     # Extract consensus feature data
     consensus_data = []
-    for row in
+    for row in study.consensus_df.iter_rows(named=True):
         consensus_data.append({
             'consensus_uid': row['consensus_uid'],
             'mz': row['mz'],
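_count_tight_clusters now takes the study object and counts groups of consensus features packed within roughly 0.04 m/z and 0.3 s RT. The sketch below counts such groups with a simple sort-and-sweep; it is only an illustration of the idea, not the module's actual binning implementation:

def count_tight_clusters(features, mz_tol=0.04, rt_tol=0.3) -> int:
    """Count groups of >=2 (mz, rt) points closer than the given tolerances.

    Greedy single-linkage sweep over m/z-sorted points, for illustration only.
    """
    if len(features) < 2:
        return 0
    ordered = sorted(features)  # sort by m/z, then RT
    clusters = 0
    current = [ordered[0]]
    for mz, rt in ordered[1:]:
        last_mz, last_rt = current[-1]
        if abs(mz - last_mz) <= mz_tol and abs(rt - last_rt) <= rt_tol:
            current.append((mz, rt))
        else:
            if len(current) >= 2:
                clusters += 1
            current = [(mz, rt)]
    if len(current) >= 2:
        clusters += 1
    return clusters


print(count_tight_clusters([(100.0, 5.0), (100.02, 5.1), (200.0, 7.0)]))  # -> 1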
@@ -2768,7 +2770,7 @@ def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> in
     return tight_clusters_count


-def _consensus_cleanup(
+def _consensus_cleanup(study, rt_tol, mz_tol):
     """
     Consensus cleanup to merge over-segmented consensus features and remove isotopic features.

@@ -2777,20 +2779,20 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
        (too many features in very tight m/z and RT windows)
     2. Performs deisotoping to remove +1 and +2 isotopic features
     """
-    if len(
+    if len(study.consensus_df) == 0:
         return

-    initial_count = len(
+    initial_count = len(study.consensus_df)

     # Only perform enhanced post-clustering if there are many features
     if initial_count < 50:
         return

-
+    study.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")

     # Find tight clusters using spatial binning
     consensus_data = []
-    for row in
+    for row in study.consensus_df.iter_rows(named=True):
         consensus_data.append({
             'consensus_uid': row['consensus_uid'],
             'mz': row['mz'],
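The cleanup pass above collects consensus features and looks for over-segmented clusters via spatial binning. A hedged sketch of that binning idea, hashing features into an m/z x RT grid and flagging crowded cells (the tolerances and min_count threshold here are placeholders, not the package's values):

from collections import defaultdict


def find_overcrowded_cells(features, mz_tol=0.01, rt_tol=0.1, min_count=3):
    """Bin (uid, mz, rt) records into an m/z x RT grid and return crowded cells."""
    bins = defaultdict(list)
    for uid, mz, rt in features:
        key = (int(mz / mz_tol), int(rt / rt_tol))
        bins[key].append(uid)
    return [uids for uids in bins.values() if len(uids) >= min_count]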
@@ -2873,7 +2875,7 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
     if not merge_groups:
         return

-
+    study.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")

     # Merge clusters by keeping the most representative feature
     uids_to_remove = set()
@@ -2892,25 +2894,25 @@ def _consensus_cleanup(self, rt_tol, mz_tol):

     if uids_to_remove:
         # Remove merged features from consensus_df
-
+        study.consensus_df = study.consensus_df.filter(
             ~pl.col('consensus_uid').is_in(list(uids_to_remove))
         )

         # Also update consensus_mapping_df if it exists
-        if hasattr(
-
+        if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
+            study.consensus_mapping_df = study.consensus_mapping_df.filter(
                 ~pl.col('consensus_uid').is_in(list(uids_to_remove))
             )

-        final_count = len(
+        final_count = len(study.consensus_df)
         reduction = initial_count - final_count
         reduction_pct = (reduction / initial_count) * 100

         if reduction > 0:
-
+            study.logger.debug(f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)")

     # Step 2: Deisotoping - Remove +1 and +2 isotopic consensus features
-    pre_deisotoping_count = len(
+    pre_deisotoping_count = len(study.consensus_df)
     isotope_uids_to_remove = set()

     # Use strict tolerances for deisotoping (same as declustering)
@@ -2919,7 +2921,7 @@ def _consensus_cleanup(self, rt_tol, mz_tol):

     # Get current consensus data for isotope detection
     current_consensus_data = []
-    for row in
+    for row in study.consensus_df.iter_rows(named=True):
         current_consensus_data.append({
             'consensus_uid': row['consensus_uid'],
             'mz': row['mz'],
@@ -2970,31 +2972,31 @@ def _consensus_cleanup(self, rt_tol, mz_tol):

     # Remove isotopic features
     if isotope_uids_to_remove:
-
+        study.consensus_df = study.consensus_df.filter(
             ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
         )

         # Also update consensus_mapping_df if it exists
-        if hasattr(
-
+        if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
+            study.consensus_mapping_df = study.consensus_mapping_df.filter(
                 ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
             )

-        post_deisotoping_count = len(
+        post_deisotoping_count = len(study.consensus_df)
         isotope_reduction = pre_deisotoping_count - post_deisotoping_count

         if isotope_reduction > 0:
-
+            study.logger.debug(f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)")

     # Final summary
-    final_count = len(
+    final_count = len(study.consensus_df)
     total_reduction = initial_count - final_count
     if total_reduction > 0:
         total_reduction_pct = (total_reduction / initial_count) * 100
-
+        study.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")


-def _identify_adduct_by_mass_shift(
+def _identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
     """
     Identify coeluting consensus features by characteristic mass shifts between adducts
     and update their adduct information accordingly.
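Step 2 of the cleanup removes +1 and +2 isotopic consensus features. A simplified sketch of how such isotopes can be flagged by the ~1.00336 Da carbon isotope spacing at matching retention time; the tolerances and the intensity rule here are assumptions, not the package's exact criteria:

ISOTOPE_SPACING = 1.003355  # approximate 13C-12C mass difference


def flag_isotope_uids(features, mz_tol=0.01, rt_tol=0.1):
    """Return uids that look like +1/+2 isotopes of a more intense feature.

    `features` is a list of dicts with 'consensus_uid', 'mz', 'rt', 'intensity'.
    """
    flagged = set()
    for a in features:
        for b in features:
            if a is b or b["intensity"] >= a["intensity"]:
                continue
            if abs(a["rt"] - b["rt"]) > rt_tol:
                continue
            for n in (1, 2):  # +1 and +2 isotopes, singly charged
                if abs(b["mz"] - a["mz"] - n * ISOTOPE_SPACING) <= mz_tol:
                    flagged.add(b["consensus_uid"])
    return flagged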
@@ -3014,23 +3016,24 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
     from collections import defaultdict

     # Check if consensus_df exists and has features
-    if len(
-
+    if len(study.consensus_df) == 0:
+        study.logger.debug("No consensus features for adduct identification by mass shift")
         return

-
+    study.logger.info(f"Identifying coeluting adducts by mass shifts in {len(study.consensus_df)} consensus features...")

     # Get adducts DataFrame if not provided
     if cached_adducts_df is None or cached_adducts_df.is_empty():
         try:
             # Use lower min_probability for better adduct coverage in mass shift identification
-
+            from masster.study.id import _get_adducts
+            cached_adducts_df = _get_adducts(study, min_probability=0.01)
         except Exception as e:
-
+            study.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
             return

     if cached_adducts_df.is_empty():
-
+        study.logger.debug("No adducts available for mass shift identification")
         return

     # Build catalogue of mass shifts between adducts
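_identify_adduct_by_mass_shift builds a catalogue of characteristic mass differences between adduct forms and then searches for coeluting feature pairs separated by one of those shifts. A hedged sketch of the catalogue-building step for a few singly charged adducts (the adduct masses are standard values; the dictionary structure is illustrative, not the module's):

from itertools import combinations

# m/z shifts relative to the neutral molecule for common positive-mode adducts (z = 1).
ADDUCT_SHIFTS = {
    "[M+H]1+": 1.007276,
    "[M+NH4]1+": 18.033823,
    "[M+Na]1+": 22.989218,
}


def build_mass_shift_catalog(adduct_shifts):
    """Map a rounded pairwise m/z difference to the (lighter, heavier) adduct pair."""
    catalog = {}
    for (name1, m1), (name2, m2) in combinations(adduct_shifts.items(), 2):
        shift = round(abs(m2 - m1), 4)
        catalog[shift] = (name1, name2) if m1 < m2 else (name2, name1)
    return catalog


print(build_mass_shift_catalog(ADDUCT_SHIFTS))
# e.g. {17.0265: ('[M+H]1+', '[M+NH4]1+'), 21.9819: ('[M+H]1+', '[M+Na]1+'), 4.9554: ('[M+NH4]1+', '[M+Na]1+')}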
@@ -3081,11 +3084,11 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
                 "to_charge": charge2
             })

-
+    study.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")

     # Get consensus features data
     consensus_data = []
-    for i, row in enumerate(
+    for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
         consensus_data.append({
             "index": i,
             "consensus_uid": row["consensus_uid"],
@@ -3234,7 +3237,7 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
                 }

                 updated_count += 2
-
+                study.logger.debug(
                     f"Identified adduct pair: {from_adduct_name} (m/z {from_feature['mz']:.4f}) "
                     f"<-> {to_adduct_name} (m/z {to_feature['mz']:.4f}), "
                     f"RT {rt1:.2f}s, Δm/z {mz_diff:.4f}"
@@ -3244,7 +3247,7 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
     # Apply updates to consensus_df
     if adduct_updates:
         # Prepare update data
-        consensus_uids =
+        consensus_uids = study.consensus_df["consensus_uid"].to_list()

         new_adduct_top = []
         new_adduct_charge_top = []
@@ -3261,88 +3264,88 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
             else:
                 # Keep existing values
                 row_idx = consensus_uids.index(uid)
-                row =
+                row = study.consensus_df.row(row_idx, named=True)
                 new_adduct_top.append(row.get("adduct_top"))
                 new_adduct_charge_top.append(row.get("adduct_charge_top"))
                 new_adduct_mass_neutral_top.append(row.get("adduct_mass_neutral_top"))
                 new_adduct_mass_shift_top.append(row.get("adduct_mass_shift_top"))

         # Update the DataFrame
-
+        study.consensus_df = study.consensus_df.with_columns([
             pl.Series("adduct_top", new_adduct_top),
             pl.Series("adduct_charge_top", new_adduct_charge_top),
             pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
             pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
         ])

-
+        study.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
     else:
-
+        study.logger.debug("No consensus features updated based on mass shift analysis")


-def _finalize_merge(
+def _finalize_merge(study, link_ms2, min_samples):
     """Complete the merge process with final calculations and cleanup."""
     import polars as pl

     # Check if consensus_df is empty or missing required columns
-    if len(
-
+    if len(study.consensus_df) == 0 or "number_samples" not in study.consensus_df.columns:
+        study.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
         return

     # Validate min_samples parameter
     if min_samples is None:
         min_samples = 1
     if min_samples < 1:
-        min_samples = int(min_samples * len(
+        min_samples = int(min_samples * len(study.samples_df))

     # Validate that min_samples doesn't exceed the number of samples
-    if min_samples > len(
-
-    f"min_samples ({min_samples}) exceeds the number of samples ({len(
-    f"Setting min_samples to {len(
+    if min_samples > len(study.samples_df):
+        study.logger.warning(
+            f"min_samples ({min_samples}) exceeds the number of samples ({len(study.samples_df)}). "
+            f"Setting min_samples to {len(study.samples_df)}.",
         )
-        min_samples = len(
+        min_samples = len(study.samples_df)

     # Filter out consensus features with less than min_samples features
-    l1 = len(
-
+    l1 = len(study.consensus_df)
+    study.consensus_df = study.consensus_df.filter(
         pl.col("number_samples") >= min_samples,
     )
-
-    f"Filtered {l1 - len(
+    study.logger.debug(
+        f"Filtered {l1 - len(study.consensus_df)} consensus features with less than {min_samples} samples.",
     )

     # Filter out consensus mapping with less than min_samples features
-
-    pl.col("consensus_uid").is_in(
+    study.consensus_mapping_df = study.consensus_mapping_df.filter(
+        pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
     )

     # Calculate the completeness of the consensus map
     # Log completion with tight cluster metrics
-    if len(
+    if len(study.consensus_df) > 0 and len(study.samples_df) > 0:
         c = (
-            len(
-            / len(
-            / len(
+            len(study.consensus_mapping_df)
+            / len(study.consensus_df)
+            / len(study.samples_df)
         )

         # Count tight clusters with specified thresholds
-        tight_clusters = _count_tight_clusters(
+        tight_clusters = _count_tight_clusters(study,mz_tol=0.04, rt_tol=0.3)

-
-        f"Merging completed. Consensus features: {len(
+        study.logger.info(
+            f"Merging completed. Consensus features: {len(study.consensus_df)}. "
             f"Completeness: {c:.2f}. Tight clusters left: {tight_clusters}.",
         )
     else:
-
-        f"Merging completed with empty result. Consensus features: {len(
+        study.logger.warning(
+            f"Merging completed with empty result. Consensus features: {len(study.consensus_df)}. "
             f"This may be due to min_samples ({min_samples}) being too high for the available data.",
         )

     # add iso data from raw files.
-
+    study.find_iso()
     if link_ms2:
-
+        study.find_ms2()


 def _optimized_feature_lookup(study_obj, features_df):
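The completeness value logged by _finalize_merge is the number of feature-to-consensus links divided by the number of consensus features and by the number of samples, i.e. the average fraction of samples contributing to each consensus feature. A quick worked example with made-up counts:

n_links = 1800      # rows in consensus_mapping_df (feature-to-consensus links)
n_consensus = 500   # rows in consensus_df
n_samples = 6       # rows in samples_df
completeness = n_links / n_consensus / n_samples
print(f"{completeness:.2f}")  # 0.60 -> each consensus feature was found in 60% of samples on average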
@@ -3419,24 +3422,24 @@ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):

     # Union-Find for efficient grouping
     class UnionFind:
-        def __init__(
-
-
+        def __init__(study, n):
+            study.parent = list(range(n))
+            study.rank = [0] * n

-        def find(
-        if
-
-        return
+        def find(study, x):
+            if study.parent[x] != x:
+                study.parent[x] = study.find(study.parent[x])
+            return study.parent[x]

-        def union(
-        px, py =
+        def union(study, x, y):
+            px, py = study.find(x), study.find(y)
             if px == py:
                 return
-            if
+            if study.rank[px] < study.rank[py]:
                 px, py = py, px
-
-            if
-
+            study.parent[py] = px
+            if study.rank[px] == study.rank[py]:
+                study.rank[px] += 1

     uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
     uf = UnionFind(len(valid_features))