masster 0.4.22__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of masster might be problematic.
- masster/_version.py +1 -1
- masster/data/libs/aa.csv +22 -0
- masster/lib/lib.py +6 -0
- masster/sample/adducts.py +1 -1
- masster/sample/load.py +10 -9
- masster/sample/plot.py +1 -1
- masster/sample/processing.py +4 -4
- masster/sample/sample.py +29 -32
- masster/study/analysis.py +1762 -0
- masster/study/defaults/fill_def.py +1 -1
- masster/study/export.py +5 -3
- masster/study/h5.py +3 -0
- masster/study/helpers.py +153 -80
- masster/study/id.py +545 -4
- masster/study/load.py +33 -59
- masster/study/merge.py +413 -315
- masster/study/parameters.py +3 -3
- masster/study/plot.py +398 -43
- masster/study/processing.py +6 -14
- masster/study/save.py +8 -4
- masster/study/study.py +179 -139
- masster/study/study5_schema.json +9 -0
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/METADATA +54 -14
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/RECORD +27 -25
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/WHEEL +0 -0
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.22.dist-info → masster-0.5.1.dist-info}/licenses/LICENSE +0 -0
masster/study/merge.py
CHANGED
@@ -274,7 +274,7 @@ def _serialize_feature_map(feature_map):
     return features_data


-def merge(
+def merge(study, **kwargs) -> None:
     """
     Group features across samples into consensus features using various algorithms.

@@ -342,7 +342,7 @@ def merge(self, **kwargs) -> None:
         if key in valid_params:
             setattr(params, key, value)
         else:
-
+            study.logger.warning(f"Unknown parameter '{key}' ignored")

     # Backward compatibility: Map old method names to new names
     method_mapping = {
@@ -362,18 +362,18 @@ def merge(self, **kwargs) -> None:
     if params.method in method_mapping:
         old_method = params.method
         params.method = method_mapping[old_method]
-
+        study.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")

     # Validate method
     if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
         raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")

     # Check if chunked method is advisable for large datasets
-    num_samples = len(
+    num_samples = len(study.samples_df) if hasattr(study, 'samples_df') and study.samples_df is not None else 0
     if num_samples > 500:
         chunked_methods = {'kd_chunked', 'qt_chunked'}
         if params.method not in chunked_methods:
-
+            study.logger.warning(
                 f"Large dataset detected ({num_samples} samples > 500). "
                 f"For better performance and memory efficiency, consider using a chunked method: "
                 f"'kd_chunked' or 'qt_chunked' instead of '{params.method}'"
@@ -381,42 +381,43 @@ def merge(self, **kwargs) -> None:

     # Persist last used params for diagnostics
     try:
-
+        study._merge_params_last = params.to_dict()
     except Exception:
-
+        study._merge_params_last = {}

     # Store merge parameters in history
     try:
-        if hasattr(
-
+        if hasattr(study, 'store_history'):
+            study.update_history(['merge'], params.to_dict())
         else:
-
+            study.logger.warning("History storage not available - parameters not saved to history")
     except Exception as e:
-
+        study.logger.warning(f"Failed to store merge parameters in history: {e}")

     # Ensure feature maps are available for merging (regenerate if needed)
-    if len(
-
+    if len(study.features_maps) < len(study.samples_df):
+        study.features_maps = []
         # Feature maps will be generated on-demand within each merge method

-
+    study.logger.info(
         f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
     )

     # Initialize
-
+    _reset_consensus_data(study)

     # Cache adducts for performance (avoid repeated _get_adducts() calls)
     cached_adducts_df = None
     cached_valid_adducts = None
     try:
-
+        from masster.study.id import _get_adducts
+        cached_adducts_df = _get_adducts(study)
         if not cached_adducts_df.is_empty():
             cached_valid_adducts = set(cached_adducts_df["name"].to_list())
         else:
             cached_valid_adducts = set()
     except Exception as e:
-
+        study.logger.warning(f"Could not retrieve study adducts: {e}")
         cached_valid_adducts = set()

     # Always allow '?' adducts
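The changes above follow one mechanical pattern: methods that previously took `self` become module-level functions whose first parameter is the `Study` object (named `study`). A minimal, self-contained sketch of that refactoring pattern in general follows; the `Pipeline`/`run` names are hypothetical and not part of masster.

# Before: behavior lives in a method and is called as obj.run(...)
class Pipeline:
    def __init__(self, name: str):
        self.name = name
        self.results = []

    def run(self, value: int) -> None:
        self.results.append(value * 2)


# After: the same behavior as a module-level function that receives the object
# explicitly, mirroring merge(study, **kwargs) replacing a bound method.
def run(pipeline: Pipeline, value: int) -> None:
    pipeline.results.append(value * 2)


if __name__ == "__main__":
    p = Pipeline("demo")
    run(p, 21)          # module-function style, as in the new merge.py
    print(p.results)    # [42]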
@@ -424,58 +425,58 @@ def merge(self, **kwargs) -> None:

     # Route to algorithm implementation
     if params.method == 'sensitivity':
-        consensus_map = _merge_kd(
+        consensus_map = _merge_kd(study, params)
         # Extract consensus features
-
+        _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
     elif params.method == 'qt':
-        consensus_map = _merge_qt(
+        consensus_map = _merge_qt(study, params)
         # Extract consensus features
-
+        _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
     elif params.method == 'nowarp':
-        consensus_map = _merge_kd_nowarp(
+        consensus_map = _merge_kd_nowarp(study, params)
         # Extract consensus features
-
+        _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
     elif params.method == 'quality':
-        consensus_map = _merge_kd_strict(
+        consensus_map = _merge_kd_strict(study, params)
         # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
     elif params.method == 'kd_chunked':
-        consensus_map = _merge_kd_chunked(
+        consensus_map = _merge_kd_chunked(study, params, cached_adducts_df, cached_valid_adducts)
         # Note: _merge_kd_chunked populates consensus_df directly, no need to extract
     elif params.method == 'qt_chunked':
-        consensus_map = _merge_qt_chunked(
+        consensus_map = _merge_qt_chunked(study, params, cached_adducts_df, cached_valid_adducts)
         # Note: _merge_qt_chunked populates consensus_df directly, no need to extract

     # Enhanced post-clustering to merge over-segmented features (for qt and kd methods)
     if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked', 'quality']:
-
+        _consensus_cleanup(study, params.rt_tol, params.mz_tol)

     # Perform adduct grouping
-
+    _perform_adduct_grouping(study, params.rt_tol, params.mz_tol)

     # Identify coeluting consensus features by mass shifts and update adduct information
-
+    _identify_adduct_by_mass_shift(study, params.rt_tol, cached_adducts_df)

     # Link MS2 if requested
     if params.link_ms2:
-
+        _finalize_merge(study, params.link_ms2, params.min_samples)

     # Log completion without the misleading feature count
     elapsed = time.time() - start_time
-
+    study.logger.debug(f"Merge process completed in {elapsed:.1f}s")


-def _merge_kd(
+def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
     """KD-tree based merge (fast, recommended)"""

     # Generate temporary feature maps on-demand from features_df
-    temp_feature_maps = _generate_feature_maps_on_demand(
+    temp_feature_maps = _generate_feature_maps_on_demand(study)

     consensus_map = oms.ConsensusMap()
     file_descriptions = consensus_map.getColumnHeaders()

     for i, feature_map in enumerate(temp_feature_maps):
         file_description = file_descriptions.get(i, oms.ColumnHeader())
-        file_description.filename =
+        file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
         file_description.size = feature_map.size()
         file_description.unique_id = feature_map.getUniqueId()
         file_descriptions[i] = file_description
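The hunk above only shows the column-header bookkeeping inside _merge_kd; the grouping call itself sits in unchanged lines. A rough, hedged sketch of how pyopenms feature grouping is typically driven once per-sample FeatureMaps and headers exist is shown below; the sample values and file names are made up, and the default FeatureGroupingAlgorithmKD parameters are assumed rather than taken from masster.

# Hedged sketch: driving pyopenms feature grouping over tiny in-memory maps.
import pyopenms as oms

def tiny_feature_map(rt_mz_pairs):
    fm = oms.FeatureMap()
    for rt, mz in rt_mz_pairs:
        f = oms.Feature()
        f.setRT(rt)
        f.setMZ(mz)
        f.setIntensity(1000.0)
        fm.push_back(f)
    return fm

maps = [tiny_feature_map([(100.0, 300.1)]), tiny_feature_map([(100.5, 300.1)])]

consensus_map = oms.ConsensusMap()
headers = consensus_map.getColumnHeaders()
for i, fm in enumerate(maps):
    h = headers.get(i, oms.ColumnHeader())
    h.filename = f"sample_{i}"          # the real code uses samples_df["sample_name"]
    h.size = fm.size()
    h.unique_id = fm.getUniqueId()
    headers[i] = h
consensus_map.setColumnHeaders(headers)

# Grouping step (outside the changed lines in merge.py); defaults assumed here.
algo = oms.FeatureGroupingAlgorithmKD()
algo.group(maps, consensus_map)
print(consensus_map.size())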
@@ -504,13 +505,99 @@ def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
     return consensus_map


+def _generate_feature_maps_from_samples(study):
+    """
+    Generate feature maps using Study-level features_df instead of Sample-level loading.
+    This uses the study's existing features_df which is already loaded.
+
+    Args:
+        study: Study object containing features_df
+
+    Returns:
+        list: List of temporary FeatureMap objects built from Study-level data
+    """
+    import pyopenms as oms
+
+    temp_feature_maps = []
+
+    study.logger.info(f"Building feature maps using Study-level features_df from {len(study.samples_df)} samples")
+
+    # Use the features_df from the study that's already loaded
+    if not hasattr(study, 'features_df') or study.features_df is None or study.features_df.is_empty():
+        study.logger.warning("No features_df available - features must be loaded first")
+        return temp_feature_maps
+
+    # Group features by sample
+    study.logger.info(f"Processing {len(study.features_df)} features grouped by sample")
+
+    # Get unique sample names/indices
+    if 'sample_uid' in study.features_df.columns:
+        sample_groups = study.features_df.group_by('sample_uid')
+        study.logger.debug("Grouping features by 'sample_uid' column")
+    elif 'sample_id' in study.features_df.columns:
+        sample_groups = study.features_df.group_by('sample_id')
+        study.logger.debug("Grouping features by 'sample_id' column")
+    elif 'sample' in study.features_df.columns:
+        sample_groups = study.features_df.group_by('sample')
+        study.logger.debug("Grouping features by 'sample' column")
+    else:
+        study.logger.warning("No sample grouping column found in features_df")
+        study.logger.info(f"Available columns: {study.features_df.columns}")
+        return temp_feature_maps
+
+    # Process each sample group
+    processed_samples = 0
+    for sample_key, sample_features in sample_groups:
+        try:
+            feature_map = oms.FeatureMap()
+            feature_count = 0
+
+            # Build features from this sample's features
+            for row in sample_features.iter_rows(named=True):
+                try:
+                    feature = oms.Feature()
+
+                    # Set feature properties
+                    if row.get("feature_id") is not None:
+                        feature.setUniqueId(int(row["feature_id"]))
+                    if row.get("mz") is not None:
+                        feature.setMZ(float(row["mz"]))
+                    if row.get("rt") is not None:
+                        feature.setRT(float(row["rt"]))
+                    if row.get("inty") is not None:
+                        feature.setIntensity(float(row["inty"]))
+                    if row.get("quality") is not None:
+                        feature.setOverallQuality(float(row["quality"]))
+                    if row.get("charge") is not None:
+                        feature.setCharge(int(row["charge"]))
+
+                    feature_map.push_back(feature)
+                    feature_count += 1
+
+                except (ValueError, TypeError) as e:
+                    study.logger.warning(f"Skipping feature in sample {sample_key} due to conversion error: {e}")
+                    continue
+
+            temp_feature_maps.append(feature_map)
+            processed_samples += 1
+            study.logger.debug(f"Built feature map for sample {sample_key} with {feature_count} features")
+
+        except Exception as e:
+            study.logger.warning(f"Failed to process sample group {sample_key}: {e}")
+            # Add empty feature map for failed samples to maintain sample order
+            temp_feature_maps.append(oms.FeatureMap())
+
+    study.logger.info(f"Generated {len(temp_feature_maps)} feature maps from {processed_samples} samples using Study-level features_df")
+    return temp_feature_maps
+
+
 def _generate_feature_maps_on_demand(study):
     """
-    Generate feature maps on-demand
+    Generate feature maps on-demand using Sample-level _load_ms1() for merge operations.
     Returns temporary feature maps that are not cached in the study.

     Args:
-        study: Study object containing
+        study: Study object containing samples

     Returns:
         list: List of temporary FeatureMap objects
@@ -519,6 +606,15 @@ def _generate_feature_maps_on_demand(study):
     import pyopenms as oms
     import numpy as np

+    # Check if we should use Sample-level loading instead of features_df
+    use_sample_loading = True  # Default to Sample-level loading as requested
+
+    # Use Sample-level loading if requested and samples_df is available
+    if use_sample_loading and hasattr(study, 'samples_df') and study.samples_df is not None and len(study.samples_df) > 0:
+        study.logger.debug("Building feature maps using Sample-level _load_ms1() instead of features_df")
+        return _generate_feature_maps_from_samples(study)
+
+    # Fallback to original features_df approach
     if study.features_df is None or len(study.features_df) == 0:
         study.logger.error("No features_df available for generating feature maps")
         return []
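The new helper above leans on two polars idioms: iterating a group_by() yields (key, sub-DataFrame) pairs, and iter_rows(named=True) yields plain dicts. A tiny hedged sketch with illustrative column names and values:

# Hedged sketch of the polars idioms used by _generate_feature_maps_from_samples.
import polars as pl

features_df = pl.DataFrame({
    "sample_uid": ["s1", "s1", "s2"],
    "mz":         [300.1, 450.2, 300.1],
    "rt":         [100.0, 210.5, 100.4],
    "inty":       [1.0e5, 3.2e4, 9.8e4],
})

for sample_key, sample_features in features_df.group_by("sample_uid"):
    # Depending on the polars version, sample_key is a scalar or a 1-tuple.
    rows = [row for row in sample_features.iter_rows(named=True)]
    print(sample_key, len(rows), rows[0]["mz"])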
@@ -624,22 +720,22 @@ def _generate_feature_maps_on_demand(study):
     return temp_feature_maps


-def _merge_qt(
+def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
     """QT (Quality Threshold) based merge"""

     # Generate temporary feature maps on-demand from features_df
-    temp_feature_maps = _generate_feature_maps_on_demand(
+    temp_feature_maps = _generate_feature_maps_on_demand(study)

     n_samples = len(temp_feature_maps)
     if n_samples > 1000:
-
+        study.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")

     consensus_map = oms.ConsensusMap()
     file_descriptions = consensus_map.getColumnHeaders()

     for i, feature_map in enumerate(temp_feature_maps):
         file_description = file_descriptions.get(i, oms.ColumnHeader())
-        file_description.filename =
+        file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
         file_description.size = feature_map.size()
         file_description.unique_id = feature_map.getUniqueId()
         file_descriptions[i] = file_description
@@ -665,7 +761,7 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
     return consensus_map


-def _merge_kd_strict(
+def _merge_kd_strict(study, params: merge_defaults) -> oms.ConsensusMap:
     """
     Quality merge: Standard KD algorithm with post-processing quality control.

@@ -695,8 +791,8 @@ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:

     if optimize_rt_tol:
         # Optimize RT tolerance first
-        optimal_rt_tol = _optimize_rt_tolerance(
-
+        optimal_rt_tol = _optimize_rt_tolerance(study, params)
+        study.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
         # Create modified params with optimal RT tolerance
         import copy
         optimized_params = copy.deepcopy(params)
@@ -705,22 +801,22 @@ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
         optimized_params = params

     # Phase 1: Standard KD clustering
-
-    consensus_map = _merge_kd(
+    study.logger.debug("Initial KD clustering")
+    consensus_map = _merge_kd(study, optimized_params)

     # Phase 2: Post-processing quality control
-
-    consensus_map = _apply_kd_strict_postprocessing(
+    study.logger.debug("Post-processing quality control")
+    consensus_map = _apply_kd_strict_postprocessing(study, consensus_map, optimized_params)

     return consensus_map


-def _optimize_rt_tolerance(
+def _optimize_rt_tolerance(study, params: merge_defaults) -> float:
     """
     Optimize RT tolerance by testing different values and measuring oversegmentation.

     Args:
-
+        study: Study object
         params: Merge parameters

     Returns:
@@ -729,7 +825,7 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
     rt_tol_range = getattr(params, 'rt_tol_range', (0.8, 2.0))
     rt_tol_steps = getattr(params, 'rt_tol_steps', 5)

-
+    study.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")

     # Generate test values
     test_rt_tols = [rt_tol_range[0] + i * (rt_tol_range[1] - rt_tol_range[0]) / (rt_tol_steps - 1)
@@ -739,8 +835,8 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
     best_score = float('inf')

     # Store original features for restoration
-    original_consensus_df = getattr(
-    original_consensus_mapping_df = getattr(
+    original_consensus_df = getattr(study, 'consensus_df', pl.DataFrame())
+    original_consensus_mapping_df = getattr(study, 'consensus_mapping_df', pl.DataFrame())

     for test_rt_tol in test_rt_tols:
         try:
@@ -750,18 +846,18 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
             test_params.rt_tol = test_rt_tol

             # Run KD merge with test parameters
-            test_consensus_map = _merge_kd(
+            test_consensus_map = _merge_kd(study, test_params)

             # Extract consensus features temporarily for analysis
-
+            _extract_consensus_features(study, test_consensus_map, test_params.min_samples)

-            if len(
+            if len(study.consensus_df) == 0:
                 continue

             # Calculate oversegmentation metrics
-            oversegmentation_score = _calculate_oversegmentation_score(
+            oversegmentation_score = _calculate_oversegmentation_score(study, test_rt_tol)

-
+            study.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(study.consensus_df)} features, score: {oversegmentation_score:.3f}")

             # Lower score is better (less oversegmentation)
             if oversegmentation_score < best_score:
@@ -769,50 +865,50 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
                 best_rt_tol = test_rt_tol

         except Exception as e:
-
+            study.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
             continue

     # Restore original consensus data
-
-
+    study.consensus_df = original_consensus_df
+    study.consensus_mapping_df = original_consensus_mapping_df

-
+    study.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
     return best_rt_tol


-def _calculate_oversegmentation_score(
+def _calculate_oversegmentation_score(study, rt_tol: float) -> float:
     """
     Calculate oversegmentation score based on feature density and RT spread metrics.
     Lower scores indicate less oversegmentation.

     Args:
-
+        study: Study object
         rt_tol: RT tolerance used

     Returns:
         Oversegmentation score (lower = better)
     """
-    if len(
+    if len(study.consensus_df) == 0:
         return float('inf')

     # Metric 1: Feature density (features per RT second)
-    rt_range =
+    rt_range = study.consensus_df['rt'].max() - study.consensus_df['rt'].min()
     if rt_range <= 0:
         return float('inf')

-    feature_density = len(
+    feature_density = len(study.consensus_df) / rt_range

     # Metric 2: Average RT spread relative to tolerance
-    rt_spreads = (
+    rt_spreads = (study.consensus_df['rt_max'] - study.consensus_df['rt_min'])
     avg_rt_spread_ratio = rt_spreads.mean() / rt_tol if rt_tol > 0 else float('inf')

     # Metric 3: Proportion of features with low sample counts (indicates fragmentation)
-    low_sample_features = len(
-    low_sample_ratio = low_sample_features / len(
+    low_sample_features = len(study.consensus_df.filter(pl.col('number_samples') <= 5))
+    low_sample_ratio = low_sample_features / len(study.consensus_df)

     # Metric 4: Number of features with excessive RT spread
     excessive_spread_features = len(rt_spreads.filter(rt_spreads > rt_tol * 2))
-    excessive_spread_ratio = excessive_spread_features / len(
+    excessive_spread_ratio = excessive_spread_features / len(study.consensus_df)

     # Combined score (weighted combination)
     oversegmentation_score = (
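The four metrics of the oversegmentation score appear in the hunk above, but the weighted combination itself falls in unchanged lines that the viewer does not show. The sketch below reassembles the scoring function with assumed weights purely for illustration; those weights are not taken from masster.

# Hedged sketch of the oversegmentation score; weights are assumptions.
import polars as pl

def oversegmentation_score(consensus_df: pl.DataFrame, rt_tol: float) -> float:
    if len(consensus_df) == 0 or rt_tol <= 0:
        return float("inf")

    rt_range = consensus_df["rt"].max() - consensus_df["rt"].min()
    if rt_range <= 0:
        return float("inf")

    feature_density = len(consensus_df) / rt_range                      # metric 1
    rt_spreads = consensus_df["rt_max"] - consensus_df["rt_min"]
    avg_rt_spread_ratio = rt_spreads.mean() / rt_tol                    # metric 2
    low_sample_ratio = (
        len(consensus_df.filter(pl.col("number_samples") <= 5)) / len(consensus_df)
    )                                                                   # metric 3
    excessive_spread_ratio = (
        len(rt_spreads.filter(rt_spreads > rt_tol * 2)) / len(consensus_df)
    )                                                                   # metric 4

    # Assumed weighting; lower score = less oversegmentation.
    return (
        0.1 * feature_density
        + 1.0 * avg_rt_spread_ratio
        + 2.0 * low_sample_ratio
        + 2.0 * excessive_spread_ratio
    )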
@@ -825,7 +921,7 @@ def _calculate_oversegmentation_score(self, rt_tol: float) -> float:
     return oversegmentation_score


-def _apply_kd_strict_postprocessing(
+def _apply_kd_strict_postprocessing(study, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
     """
     Apply post-processing quality control to KD consensus map.

@@ -837,20 +933,20 @@ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, param
         Processed consensus map with reduced oversegmentation
     """
     if consensus_map.size() == 0:
-
+        study.logger.warning("Empty consensus map provided to post-processing")
         return consensus_map

-
+    study.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")

     # Step 1: Extract initial consensus features
     original_min_samples = params.min_samples
     params.min_samples = 1  # Extract all features initially

-
-    initial_feature_count = len(
+    _extract_consensus_features(study, consensus_map, params.min_samples)
+    initial_feature_count = len(study.consensus_df)

     if initial_feature_count == 0:
-
+        study.logger.warning("No consensus features extracted for post-processing")
         params.min_samples = original_min_samples
         return consensus_map

@@ -858,67 +954,67 @@ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, param
     secondary_merge_rt_tol = getattr(params, 'secondary_merge_rt_tol', 0.5)
     secondary_merge_mz_tol = getattr(params, 'secondary_merge_mz_tol', 0.005)

-
-    merged_features = _perform_secondary_clustering(
+    study.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
+    merged_features = _perform_secondary_clustering(study, secondary_merge_rt_tol, secondary_merge_mz_tol)

     # Step 3: Sample overlap validation
     min_sample_overlap = getattr(params, 'min_sample_overlap', 0.8)
     if min_sample_overlap > 0:
-
-        merged_features = _validate_sample_overlap(
+        study.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
+        merged_features = _validate_sample_overlap(study, merged_features, min_sample_overlap)

     # Step 4: RT spread quality filtering
     if params.rt_tol is not None:
         max_rt_spread = getattr(params, 'max_rt_spread', params.rt_tol * 2)
         if max_rt_spread is not None:
-
-            merged_features = _filter_rt_spread(
+            study.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
+            merged_features = _filter_rt_spread(study, merged_features, max_rt_spread)
         else:
-
+            study.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
     else:
-
+        study.logger.debug("Skipping RT spread filtering - rt_tol is None")

     # Step 5: Chromatographic coherence filtering (optional)
     min_coherence = getattr(params, 'min_coherence', 0.0)
     if min_coherence > 0:
-
-        merged_features = _filter_coherence(
+        study.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
+        merged_features = _filter_coherence(study, merged_features, min_coherence)

     # Step 6: Rebuild consensus_df with filtered features and preserve mapping
-    original_mapping_df =
-
+    original_mapping_df = study.consensus_mapping_df.clone()  # Save original mapping
+    study.consensus_df = pl.DataFrame(merged_features, strict=False)

     # Step 7: Apply original min_samples filter
     params.min_samples = original_min_samples
     if params.min_samples > 1:
-        l1 = len(
-
+        l1 = len(study.consensus_df)
+        study.consensus_df = study.consensus_df.filter(
             pl.col("number_samples") >= params.min_samples
         )
-        filtered_count = l1 - len(
+        filtered_count = l1 - len(study.consensus_df)
         if filtered_count > 0:
-
+            study.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")

     # Step 8: Update consensus_mapping_df to match final consensus_df
-    if len(
-        valid_consensus_ids = set(
-
+    if len(study.consensus_df) > 0 and len(original_mapping_df) > 0:
+        valid_consensus_ids = set(study.consensus_df['consensus_uid'].to_list())
+        study.consensus_mapping_df = original_mapping_df.filter(
             pl.col('consensus_uid').is_in(list(valid_consensus_ids))
         )
     else:
-
+        study.consensus_mapping_df = pl.DataFrame()

-    final_feature_count = len(
+    final_feature_count = len(study.consensus_df)
     reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0

-
+    study.logger.info(f"Consensus cleanup complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")

     # Create a new consensus map for compatibility (the processed data is in consensus_df)
     processed_consensus_map = oms.ConsensusMap()
     return processed_consensus_map


-def _perform_secondary_clustering(
+def _perform_secondary_clustering(study, rt_tol: float, mz_tol: float) -> list:
     """
     Perform secondary clustering to merge very close features.

@@ -929,34 +1025,34 @@ def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
     Returns:
         List of merged consensus feature dictionaries
     """
-    if len(
+    if len(study.consensus_df) == 0:
         return []

     # Convert consensus_df to list of dictionaries for clustering
     consensus_features = []
-    for i, row in enumerate(
+    for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
         consensus_features.append(dict(row))

     # Use Union-Find for efficient clustering
     class UnionFind:
-        def __init__(
-
-
+        def __init__(study, n):
+            study.parent = list(range(n))
+            study.rank = [0] * n

-        def find(
-            if
-
-            return
+        def find(study, x):
+            if study.parent[x] != x:
+                study.parent[x] = study.find(study.parent[x])
+            return study.parent[x]

-        def union(
-            px, py =
+        def union(study, x, y):
+            px, py = study.find(x), study.find(y)
             if px == py:
                 return
-            if
+            if study.rank[px] < study.rank[py]:
                 px, py = py, px
-
-            if
-
+            study.parent[py] = px
+            if study.rank[px] == study.rank[py]:
+                study.rank[px] += 1

     n_features = len(consensus_features)
     uf = UnionFind(n_features)
@@ -992,7 +1088,7 @@ def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
         merged_feature = _merge_feature_group(group)
         merged_features.append(merged_feature)

-
+    study.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
     return merged_features

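Both the secondary clustering above and the chunk aggregation further down rely on the same union-find (disjoint-set) structure. Note that in the released code the first parameter of the nested class methods is literally named `study` as a side effect of the mechanical self→study rename; that is legal but unconventional Python. A conventional standalone version of the same structure, with path compression and union by rank, reads:

# Standalone union-find equivalent to the nested UnionFind/UF helpers in merge.py.
class UnionFind:
    def __init__(self, n: int):
        self.parent = list(range(n))
        self.rank = [0] * n

    def find(self, x: int) -> int:
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])  # path compression
        return self.parent[x]

    def union(self, x: int, y: int) -> None:
        px, py = self.find(x), self.find(y)
        if px == py:
            return
        if self.rank[px] < self.rank[py]:
            px, py = py, px
        self.parent[py] = px
        if self.rank[px] == self.rank[py]:
            self.rank[px] += 1


uf = UnionFind(5)
uf.union(0, 1)
uf.union(1, 2)
print(uf.find(2) == uf.find(0))  # True: 0, 1 and 2 share one cluster root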
@@ -1066,7 +1162,7 @@ def _merge_feature_group(feature_group: list) -> dict:
     return merged


-def _validate_sample_overlap(
+def _validate_sample_overlap(study, features: list, min_overlap: float) -> list:
     """
     Validate that merged features have sufficient sample overlap.

@@ -1097,7 +1193,7 @@ def _validate_sample_overlap(self, features: list, min_overlap: float) -> list:
     return validated_features


-def _filter_rt_spread(
+def _filter_rt_spread(study, features: list, max_rt_spread: float) -> list:
     """
     Filter out features with excessive RT spread.

@@ -1122,12 +1218,12 @@ def _filter_rt_spread(self, features: list, max_rt_spread: float) -> list:
             filtered_count += 1

     if filtered_count > 0:
-
+        study.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")

     return filtered_features


-def _filter_coherence(
+def _filter_coherence(study, features: list, min_coherence: float) -> list:
     """
     Filter out features with low chromatographic coherence.

@@ -1150,23 +1246,23 @@ def _filter_coherence(self, features: list, min_coherence: float) -> list:
             filtered_count += 1

     if filtered_count > 0:
-
+        study.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")

     return filtered_features


-def _merge_kd_nowarp(
+def _merge_kd_nowarp(study, params: merge_defaults) -> oms.ConsensusMap:
     """KD-tree based merge without RT warping"""

     # Generate temporary feature maps on-demand from features_df
-    temp_feature_maps = _generate_feature_maps_on_demand(
+    temp_feature_maps = _generate_feature_maps_on_demand(study)

     consensus_map = oms.ConsensusMap()
     file_descriptions = consensus_map.getColumnHeaders()

     for i, feature_map in enumerate(temp_feature_maps):
         file_description = file_descriptions.get(i, oms.ColumnHeader())
-        file_description.filename =
+        file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
         file_description.size = feature_map.size()
         file_description.unique_id = feature_map.getUniqueId()
         file_descriptions[i] = file_description
@@ -1193,18 +1289,18 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
     return consensus_map


-def _merge_kd_chunked(
+def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
     """KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""

     # Generate temporary feature maps on-demand from features_df
-    temp_feature_maps = _generate_feature_maps_on_demand(
+    temp_feature_maps = _generate_feature_maps_on_demand(study)

     n_samples = len(temp_feature_maps)
     if n_samples <= params.chunk_size:
-
-        consensus_map = _merge_kd(
+        study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
+        consensus_map = _merge_kd(study, params)
         # Extract consensus features to populate consensus_df for chunked method consistency
-
+        _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
         return consensus_map

     # Process in chunks
@@ -1213,21 +1309,21 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
         chunk_end = min(i + params.chunk_size, n_samples)
         chunks.append((i, temp_feature_maps[i:chunk_end]))

-
+    study.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")

     # Process each chunk to create chunk consensus maps
     chunk_consensus_maps = []

     if params.threads is None:
         # Sequential processing (original behavior)
-        for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {
+        for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}KD Chunk", disable=study.log_level not in ["TRACE", "DEBUG", "INFO"])):
             chunk_consensus_map = oms.ConsensusMap()

             # Set up file descriptions for chunk
             file_descriptions = chunk_consensus_map.getColumnHeaders()
             for j, feature_map in enumerate(chunk_maps):
                 file_description = file_descriptions.get(j, oms.ColumnHeader())
-                file_description.filename =
+                file_description.filename = study.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
                 file_description.size = feature_map.size()
                 file_description.unique_id = feature_map.getUniqueId()
                 file_descriptions[j] = file_description
@@ -1255,7 +1351,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach

     else:
         # Parallel processing
-
+        study.logger.info(f"Processing chunks in parallel using {params.threads} processes")

         # Prepare chunk data for parallel processing using features_df slices
         chunk_data_list = []
@@ -1264,7 +1360,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
             chunk_sample_uids = []
             chunk_samples_df_rows = []
             for j in range(len(chunk_maps)):
-                sample_row =
+                sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
                 chunk_sample_uids.append(sample_row['sample_uid'])
                 chunk_samples_df_rows.append(sample_row)

@@ -1272,7 +1368,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
             chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)

             # Filter features_df for this chunk's samples and select only necessary columns
-            chunk_features_df =
+            chunk_features_df = study.features_df.filter(
                 pl.col('sample_uid').is_in(chunk_sample_uids)
             ).select([
                 'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
@@ -1316,22 +1412,22 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
                     serialized_chunk_results.append((chunk_start_idx, consensus_features))
                     completed_chunks += 1
                     n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
-
+                    study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
                 except Exception as exc:
                     # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
                     if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
                         # Convert to RuntimeError so outer except block can catch it for fallback
                         raise RuntimeError(f"Windows multiprocessing failure: {exc}")
                     else:
-
+                        study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
                         raise exc

         except (RuntimeError, OSError, BrokenProcessPool) as e:
             # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
             if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
                 "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
-
-
+                study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
+                study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")

                 with ThreadPoolExecutor(max_workers=params.threads) as executor:
                     # Submit all chunk processing tasks
@@ -1350,9 +1446,9 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
                             serialized_chunk_results.append((chunk_start_idx, consensus_features))
                             completed_chunks += 1
                             n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
-
+                            study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
                         except Exception as exc:
-
+                            study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
                             raise exc
             else:
                 # Re-raise other exceptions
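The chunked methods guard against Windows multiprocessing failures by catching BrokenProcessPool (and related spawn/freeze_support errors) and retrying the same work on a ThreadPoolExecutor. A minimal, generic sketch of that fallback pattern follows; `work` is a placeholder, not masster's chunk worker.

# Generic sketch of the ProcessPoolExecutor -> ThreadPoolExecutor fallback.
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from concurrent.futures.process import BrokenProcessPool


def work(chunk: list[int]) -> int:
    return sum(chunk)


def run_chunks(chunks: list[list[int]], threads: int = 2) -> list[int]:
    try:
        with ProcessPoolExecutor(max_workers=threads) as executor:
            futures = {executor.submit(work, c): i for i, c in enumerate(chunks)}
            return [f.result() for f in as_completed(futures)]
    except (BrokenProcessPool, RuntimeError, OSError):
        # e.g. missing freeze_support() under the Windows spawn start method:
        # fall back to threads, which need no process bootstrapping.
        with ThreadPoolExecutor(max_workers=threads) as executor:
            return list(executor.map(work, chunks))


if __name__ == "__main__":
    print(sorted(run_chunks([[1, 2], [3, 4], [5]])))  # [3, 5, 7]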
@@ -1366,25 +1462,25 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach

     # Merge chunk results with proper cross-chunk consensus building
     # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
-    _merge_chunk_results(
+    _merge_chunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)

-    # Return a dummy consensus map for compatibility (consensus features are stored in
+    # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
     consensus_map = oms.ConsensusMap()
     return consensus_map


-def _merge_qt_chunked(
+def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
     """QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""

     # Generate temporary feature maps on-demand from features_df
-    temp_feature_maps = _generate_feature_maps_on_demand(
+    temp_feature_maps = _generate_feature_maps_on_demand(study)

     n_samples = len(temp_feature_maps)
     if n_samples <= params.chunk_size:
-
-        consensus_map = _merge_qt(
+        study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
+        consensus_map = _merge_qt(study, params)
         # Extract consensus features to populate consensus_df for chunked method consistency
-
+        _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
         return consensus_map

     # Process in chunks
@@ -1393,21 +1489,21 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
         chunk_end = min(i + params.chunk_size, n_samples)
         chunks.append((i, temp_feature_maps[i:chunk_end]))

-
+    study.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")

     # Process each chunk to create chunk consensus maps
     chunk_consensus_maps = []

     if params.threads is None:
         # Sequential processing (original behavior)
-        for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {
+        for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}QT Chunk", disable=study.log_level not in ["TRACE", "DEBUG", "INFO"])):
             chunk_consensus_map = oms.ConsensusMap()

             # Set up file descriptions for chunk
             file_descriptions = chunk_consensus_map.getColumnHeaders()
             for j, feature_map in enumerate(chunk_maps):
                 file_description = file_descriptions.get(j, oms.ColumnHeader())
-                file_description.filename =
+                file_description.filename = study.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
                 file_description.size = feature_map.size()
                 file_description.unique_id = feature_map.getUniqueId()
                 file_descriptions[j] = file_description
@@ -1430,7 +1526,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach

     else:
         # Parallel processing
-
+        study.logger.info(f"Processing chunks in parallel using {params.threads} processes")

         # Prepare chunk data for parallel processing using features_df slices
         chunk_data_list = []
@@ -1439,7 +1535,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
             chunk_sample_uids = []
             chunk_samples_df_rows = []
             for j in range(len(chunk_maps)):
-                sample_row =
+                sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
                 chunk_sample_uids.append(sample_row['sample_uid'])
                 chunk_samples_df_rows.append(sample_row)

@@ -1447,7 +1543,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
             chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)

             # Filter features_df for this chunk's samples and select only necessary columns
-            chunk_features_df =
+            chunk_features_df = study.features_df.filter(
                 pl.col('sample_uid').is_in(chunk_sample_uids)
             ).select([
                 'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
@@ -1491,22 +1587,22 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
                     serialized_chunk_results.append((chunk_start_idx, consensus_features))
                     completed_chunks += 1
                     n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
-
+                    study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
                 except Exception as exc:
                     # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
                     if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
                         # Convert to RuntimeError so outer except block can catch it for fallback
                         raise RuntimeError(f"Windows multiprocessing failure: {exc}")
                     else:
-
+                        study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
                         raise exc

         except (RuntimeError, OSError, BrokenProcessPool) as e:
             # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
             if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
                 "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
-
-
+                study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
+                study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")

                 with ThreadPoolExecutor(max_workers=params.threads) as executor:
                     # Submit all chunk processing tasks
@@ -1525,9 +1621,9 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
                             serialized_chunk_results.append((chunk_start_idx, consensus_features))
                             completed_chunks += 1
                             n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
-
+                            study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
                         except Exception as exc:
-
+                            study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
                             raise exc
             else:
                 # Re-raise other exceptions
@@ -1541,14 +1637,14 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach

     # Merge chunk results with proper cross-chunk consensus building
     # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
-    _merge_chunk_results(
+    _merge_chunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)

-    # Return a dummy consensus map for compatibility (consensus features are stored in
+    # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
     consensus_map = oms.ConsensusMap()
     return consensus_map


-def _merge_chunk_results(
+def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
     """
     Scalable aggregation of chunk consensus maps into final consensus_df.

@@ -1561,7 +1657,8 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
     if len(chunk_consensus_maps) == 1:
         # Single chunk case - just extract using the true global min_samples.
         # No need for permissive threshold because we are not discarding singletons pre-aggregation.
-
+        _extract_consensus_features(
+            study,
             chunk_consensus_maps[0][1],
             params.min_samples,
             cached_adducts_df,
@@ -1572,10 +1669,10 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
     # Build feature_uid to feature_data lookup for fast access
     feature_uid_map = {
         row["feature_id"]: row["feature_uid"]
-        for row in
+        for row in study.features_df.iter_rows(named=True)
     }

-    features_lookup = _optimized_feature_lookup(
+    features_lookup = _optimized_feature_lookup(study, study.features_df)

     # Extract all consensus features from chunks with their feature_uids
     all_chunk_consensus = []
@@ -1717,8 +1814,8 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default

     if not all_chunk_consensus:
         # No valid consensus features found
-
-
+        study.consensus_df = pl.DataFrame()
+        study.consensus_mapping_df = pl.DataFrame()
         return

     # Perform cross-chunk clustering using optimized spatial indexing
@@ -1744,22 +1841,22 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
         features_by_bin[(rt_bin, mz_bin)].append(i)

     class UF:
-        def __init__(
-
-
-        def find(
-            if
-
-            return
-        def union(
-            pa, pb =
+        def __init__(study, n):
+            study.p = list(range(n))
+            study.r = [0]*n
+        def find(study, x):
+            if study.p[x] != x:
+                study.p[x] = study.find(study.p[x])
+            return study.p[x]
+        def union(study, a,b):
+            pa, pb = study.find(a), study.find(b)
             if pa == pb:
                 return
-            if
+            if study.r[pa] < study.r[pb]:
                 pa, pb = pb, pa
-
-            if
-
+            study.p[pb] = pa
+            if study.r[pa] == study.r[pb]:
+                study.r[pa] += 1

     uf = UF(n_features)
     checked = set()
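Cross-chunk clustering limits pair comparisons by bucketing consensus candidates into an (rt, m/z) grid, as the `features_by_bin[(rt_bin, mz_bin)].append(i)` context line above shows; the bin computation itself lies in unchanged lines. A hedged sketch of that tolerance-grid indexing, with made-up tolerances and features:

# Hedged sketch: bucket features by (rt // rt_tol, mz // mz_tol) so union-find
# only needs to check pairs within a bin and its 8 neighbours.
from collections import defaultdict

rt_tol, mz_tol = 2.0, 0.01
features = [(100.1, 300.101), (100.9, 300.104), (250.0, 512.207)]  # (rt, mz)

features_by_bin: dict[tuple[int, int], list[int]] = defaultdict(list)
for i, (rt, mz) in enumerate(features):
    rt_bin = int(rt // rt_tol)
    mz_bin = int(mz // mz_tol)
    features_by_bin[(rt_bin, mz_bin)].append(i)

def neighbour_candidates(i: int) -> list[int]:
    rt, mz = features[i]
    rt_bin, mz_bin = int(rt // rt_tol), int(mz // mz_tol)
    out = []
    for dr in (-1, 0, 1):
        for dm in (-1, 0, 1):
            out.extend(j for j in features_by_bin[(rt_bin + dr, mz_bin + dm)] if j != i)
    return out

print(neighbour_candidates(0))  # feature 1 is a candidate; feature 2 is far away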
@@ -1918,7 +2015,7 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
     # This allows proper cross-chunk consensus building before final filtering

     metadata = _calculate_consensus_statistics(
-
+        study,
         consensus_uid_counter,
         list(feature_data_acc.values()),
         rt_values_chunk,
@@ -1937,7 +2034,7 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default

     if rt_spread > max_allowed_spread:
         # Skip consensus features with excessive RT spread
-
+        study.logger.debug(f"Skipping consensus feature {consensus_uid_counter} with RT spread {rt_spread:.3f}s > {max_allowed_spread:.3f}s")
         consensus_uid_counter += 1
         continue

@@ -1969,27 +2066,27 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
         consensus_uid_counter += 1

     # Assign DataFrames
-
-
+    study.consensus_df = pl.DataFrame(consensus_metadata, strict=False)
+    study.consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)

     # Ensure mapping only contains features from retained consensus_df
-    if len(
-        valid_consensus_ids = set(
-
+    if len(study.consensus_df) > 0:
+        valid_consensus_ids = set(study.consensus_df['consensus_uid'].to_list())
+        study.consensus_mapping_df = study.consensus_mapping_df.filter(
             pl.col('consensus_uid').is_in(list(valid_consensus_ids))
         )
     else:
-
+        study.consensus_mapping_df = pl.DataFrame()

     # Attach empty consensus_map placeholder for downstream compatibility
-
+    study.consensus_map = oms.ConsensusMap()
     return


 def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_list: list,
                                     rt_values: list, mz_values: list,
                                     intensity_values: list, quality_values: list,
-                                    number_features: int = None, number_samples: int = None,
+                                    number_features: int | None = None, number_samples: int | None = None,
                                     cached_adducts_df=None, cached_valid_adducts=None) -> dict:
     """
     Calculate comprehensive statistics for a consensus feature from aggregated feature data.
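The signature change from `int = None` to `int | None = None` makes the optional defaults explicit instead of pairing a non-optional annotation with a None default. A small sketch of the same pattern (PEP 604 unions in annotations require Python 3.10+ unless postponed evaluation is enabled); the function name is illustrative only:

# Illustrative only: explicit optional parameter with a PEP 604 union.
from __future__ import annotations


def count_or_default(number_features: int | None = None) -> int:
    # Treat None as "not yet computed" rather than relying on an implicit 0.
    return 0 if number_features is None else number_features


print(count_or_default(), count_or_default(7))  # 0 7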
@@ -2158,24 +2255,24 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
 
     # Use Union-Find for efficient clustering
     class UnionFind:
-        def __init__(self, n):
-            self.parent = list(range(n))
-            self.rank = [0] * n
+        def __init__(study, n):
+            study.parent = list(range(n))
+            study.rank = [0] * n
 
-        def find(self, x):
-            if self.parent[x] != x:
-                self.parent[x] = self.find(self.parent[x])
-            return self.parent[x]
+        def find(study, x):
+            if study.parent[x] != x:
+                study.parent[x] = study.find(study.parent[x])
+            return study.parent[x]
 
-        def union(self, x, y):
-            px, py = self.find(x), self.find(y)
+        def union(study, x, y):
+            px, py = study.find(x), study.find(y)
             if px == py:
                 return
-            if self.rank[px] < self.rank[py]:
+            if study.rank[px] < study.rank[py]:
                 px, py = py, px
-            self.parent[py] = px
-            if self.rank[px] == self.rank[py]:
-                self.rank[px] += 1
+            study.parent[py] = px
+            if study.rank[px] == study.rank[py]:
+                study.rank[px] += 1
 
     n_features = len(features)
     uf = UnionFind(n_features)
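`_cluster_consensus_features` unions any two features whose RT and m/z differences fall within `rt_tol` and `mz_tol`, then reads the clusters off the connected components. A rough sketch of that pairing step on toy `(rt, mz)` tuples; the real code avoids the quadratic scan, so treat this as illustration only:

    def cluster_by_tolerance(features, rt_tol, mz_tol):
        # features: list of (rt, mz) tuples; returns lists of indices per cluster.
        parent = list(range(len(features)))

        def find(x):
            while parent[x] != x:
                parent[x] = parent[parent[x]]  # path halving
                x = parent[x]
            return x

        def union(a, b):
            ra, rb = find(a), find(b)
            if ra != rb:
                parent[rb] = ra

        for i in range(len(features)):
            for j in range(i + 1, len(features)):
                if (abs(features[i][0] - features[j][0]) <= rt_tol
                        and abs(features[i][1] - features[j][1]) <= mz_tol):
                    union(i, j)

        groups = {}
        for i in range(len(features)):
            groups.setdefault(find(i), []).append(i)
        return list(groups.values())

    print(cluster_by_tolerance([(100.0, 300.10), (100.2, 300.11), (250.0, 450.2)],
                               rt_tol=0.5, mz_tol=0.02))
    # [[0, 1], [2]] -- only the first two features fall within both tolerances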
@@ -2208,39 +2305,39 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
     return list(groups_by_root.values())
 
 
-def _reset_consensus_data(self):
+def _reset_consensus_data(study):
     """Reset consensus-related DataFrames at the start of merge."""
-    self.consensus_df = pl.DataFrame()
-    self.consensus_ms2 = pl.DataFrame()
-    self.consensus_mapping_df = pl.DataFrame()
+    study.consensus_df = pl.DataFrame()
+    study.consensus_ms2 = pl.DataFrame()
+    study.consensus_mapping_df = pl.DataFrame()
 
 
-def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
+def _extract_consensus_features(study, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
     """Extract consensus features and build metadata."""
-    # create a dict to map uid to feature_uid using self.features_df
+    # create a dict to map uid to feature_uid using study.features_df
     feature_uid_map = {
         row["feature_id"]: row["feature_uid"]
-        for row in self.features_df.iter_rows(named=True)
+        for row in study.features_df.iter_rows(named=True)
     }
     imax = consensus_map.size()
 
-    self.logger.debug(f"Found {imax} feature groups by clustering.")
+    study.logger.debug(f"Found {imax} feature groups by clustering.")
 
     # Pre-build fast lookup tables for features_df data using optimized approach
-    features_lookup = _optimized_feature_lookup(self, self.features_df)
+    features_lookup = _optimized_feature_lookup(study, study.features_df)
 
     # create a list to store the consensus mapping
     consensus_mapping = []
     metadata_list = []
 
-    tqdm_disable = self.log_level not in ["TRACE", "DEBUG"]
+    tqdm_disable = study.log_level not in ["TRACE", "DEBUG"]
 
     for i, feature in enumerate(
         tqdm(
             consensus_map,
             total=imax,
             disable=tqdm_disable,
-            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}Extract metadata",
         ),
     ):
         # get all features in the feature map with the same unique id as the consensus feature
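The extraction loop wraps the OpenMS consensus map in `tqdm`, hides the bar unless the study log level is TRACE or DEBUG, and formats `desc` to look like the logger's timestamped prefix. The same configuration in isolation (the log level and label below are stand-ins for the study's attributes):

    from datetime import datetime
    from tqdm import tqdm

    log_level = "INFO"       # assumed study setting
    log_label = "[study] "   # placeholder for study.log_label
    tqdm_disable = log_level not in ["TRACE", "DEBUG"]

    for item in tqdm(
        range(1000),
        total=1000,
        disable=tqdm_disable,  # keep ordinary runs quiet
        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {log_label}Extract metadata",
    ):
        pass  # per-consensus-feature work would happen here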
@@ -2486,7 +2583,7 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
             adduct_mass_shift_top = 1.007825
         else:
             # No valid adducts found - assign default based on study polarity
-            study_polarity = getattr(self, "polarity", "positive")
+            study_polarity = getattr(study, "polarity", "positive")
             if study_polarity in ["negative", "neg"]:
                 # Negative mode default
                 adduct_top = "[M-?]1-"
@@ -2618,55 +2715,55 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
     )
 
     consensus_mapping_df = pl.DataFrame(consensus_mapping)
-    # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
+    # remove all rows in consensus_mapping_df where consensus_id is not in study.featured_df['uid']
     l1 = len(consensus_mapping_df)
     consensus_mapping_df = consensus_mapping_df.filter(
-        pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
+        pl.col("feature_uid").is_in(study.features_df["feature_uid"].to_list()),
     )
-    self.logger.debug(
+    study.logger.debug(
         f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
     )
-    self.consensus_mapping_df = consensus_mapping_df
-    self.consensus_df = pl.DataFrame(metadata_list, strict=False)
+    study.consensus_mapping_df = consensus_mapping_df
+    study.consensus_df = pl.DataFrame(metadata_list, strict=False)
 
     if min_samples is None:
         min_samples = 1
     if min_samples < 1:
-        min_samples = int(min_samples * len(self.samples_df))
+        min_samples = int(min_samples * len(study.samples_df))
 
     # Validate that min_samples doesn't exceed the number of samples
-    if min_samples > len(self.samples_df):
-        self.logger.warning(
-            f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
-            f"Setting min_samples to {len(self.samples_df)}.",
+    if min_samples > len(study.samples_df):
+        study.logger.warning(
+            f"min_samples ({min_samples}) exceeds the number of samples ({len(study.samples_df)}). "
+            f"Setting min_samples to {len(study.samples_df)}.",
         )
-        min_samples = len(self.samples_df)
+        min_samples = len(study.samples_df)
 
     # filter out consensus features with less than min_samples features
-    l1 = len(self.consensus_df)
-    self.consensus_df = self.consensus_df.filter(
+    l1 = len(study.consensus_df)
+    study.consensus_df = study.consensus_df.filter(
         pl.col("number_samples") >= min_samples,
     )
-    self.logger.debug(
-        f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
+    study.logger.debug(
+        f"Filtered {l1 - len(study.consensus_df)} consensus features with less than {min_samples} samples.",
    )
     # filter out consensus mapping with less than min_samples features
-    self.consensus_mapping_df = self.consensus_mapping_df.filter(
-        pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
+    study.consensus_mapping_df = study.consensus_mapping_df.filter(
+        pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
     )
 
-    self.consensus_map = consensus_map
+    study.consensus_map = consensus_map
 
 
-def _perform_adduct_grouping(self, rt_tol, mz_tol):
+def _perform_adduct_grouping(study, rt_tol, mz_tol):
     """Perform adduct grouping on consensus features."""
     import polars as pl
 
     # Add adduct grouping and adduct_of assignment
-    if len(self.consensus_df) > 0:
+    if len(study.consensus_df) > 0:
         # Get relevant columns for grouping
         consensus_data = []
-        for row in self.consensus_df.iter_rows(named=True):
+        for row in study.consensus_df.iter_rows(named=True):
             consensus_data.append(
                 {
                     "consensus_uid": row["consensus_uid"],
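Both here and in `_finalize_merge`, `min_samples` may be given as an absolute count or as a fraction below 1; fractions are converted against the sample count and the result is clamped to the number of samples. The rule on its own:

    def resolve_min_samples(min_samples, n_samples):
        # Mirrors the validation shown in the diff, outside the Study object.
        if min_samples is None:
            min_samples = 1
        if min_samples < 1:                  # values below 1 are treated as a fraction of samples
            min_samples = int(min_samples * n_samples)
        if min_samples > n_samples:          # clamp to the available samples
            min_samples = n_samples
        return min_samples

    print(resolve_min_samples(0.5, 120))   # 60
    print(resolve_min_samples(500, 120))   # 120
    print(resolve_min_samples(None, 120))  # 1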
@@ -2679,11 +2776,11 @@ def _perform_adduct_grouping(self, rt_tol, mz_tol):
 
         # Use optimized adduct grouping
         adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
-            self, consensus_data, rt_tol, mz_tol
+            study, consensus_data, rt_tol, mz_tol
         )
 
         # Add the new columns to consensus_df
-        self.consensus_df = self.consensus_df.with_columns(
+        study.consensus_df = study.consensus_df.with_columns(
             [
                 pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
                 pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
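The grouping results are attached to `consensus_df` as two new integer columns through `with_columns` and `pl.Series`. The same polars pattern on a toy frame (values are made up):

    import polars as pl

    consensus_df = pl.DataFrame({"consensus_uid": [1, 2, 3], "mz": [301.14, 323.12, 150.05]})
    adduct_group_list = [0, 0, 1]  # e.g. output of a grouping step
    adduct_of_list = [1, 1, 3]     # uid of each group's reference feature

    consensus_df = consensus_df.with_columns(
        [
            pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
            pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
        ]
    )
    print(consensus_df)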
@@ -2691,7 +2788,7 @@ def _perform_adduct_grouping(self, rt_tol, mz_tol):
         )
 
 
-def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
+def _count_tight_clusters(study, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
     """
     Count consensus features grouped in tight clusters.
 
@@ -2702,12 +2799,12 @@ def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> in
     Returns:
         Number of tight clusters found
     """
-    if len(self.consensus_df) < 2:
+    if len(study.consensus_df) < 2:
         return 0
 
     # Extract consensus feature data
     consensus_data = []
-    for row in self.consensus_df.iter_rows(named=True):
+    for row in study.consensus_df.iter_rows(named=True):
         consensus_data.append({
             'consensus_uid': row['consensus_uid'],
             'mz': row['mz'],
@@ -2768,7 +2865,7 @@ def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> in
     return tight_clusters_count
 
 
-def _consensus_cleanup(self, rt_tol, mz_tol):
+def _consensus_cleanup(study, rt_tol, mz_tol):
     """
     Consensus cleanup to merge over-segmented consensus features and remove isotopic features.
 
@@ -2777,20 +2874,20 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
     (too many features in very tight m/z and RT windows)
     2. Performs deisotoping to remove +1 and +2 isotopic features
     """
-    if len(self.consensus_df) == 0:
+    if len(study.consensus_df) == 0:
         return
 
-    initial_count = len(self.consensus_df)
+    initial_count = len(study.consensus_df)
 
     # Only perform enhanced post-clustering if there are many features
     if initial_count < 50:
         return
 
-    self.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")
+    study.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")
 
     # Find tight clusters using spatial binning
     consensus_data = []
-    for row in self.consensus_df.iter_rows(named=True):
+    for row in study.consensus_df.iter_rows(named=True):
         consensus_data.append({
             'consensus_uid': row['consensus_uid'],
             'mz': row['mz'],
@@ -2873,7 +2970,7 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
     if not merge_groups:
         return
 
-    self.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")
+    study.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")
 
     # Merge clusters by keeping the most representative feature
     uids_to_remove = set()
@@ -2892,25 +2989,25 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
 
     if uids_to_remove:
         # Remove merged features from consensus_df
-        self.consensus_df = self.consensus_df.filter(
+        study.consensus_df = study.consensus_df.filter(
             ~pl.col('consensus_uid').is_in(list(uids_to_remove))
         )
 
         # Also update consensus_mapping_df if it exists
-        if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
-            self.consensus_mapping_df = self.consensus_mapping_df.filter(
+        if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
+            study.consensus_mapping_df = study.consensus_mapping_df.filter(
                 ~pl.col('consensus_uid').is_in(list(uids_to_remove))
             )
 
-    final_count = len(self.consensus_df)
+    final_count = len(study.consensus_df)
     reduction = initial_count - final_count
     reduction_pct = (reduction / initial_count) * 100
 
     if reduction > 0:
-        self.logger.debug(f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)")
+        study.logger.debug(f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)")
 
     # Step 2: Deisotoping - Remove +1 and +2 isotopic consensus features
-    pre_deisotoping_count = len(self.consensus_df)
+    pre_deisotoping_count = len(study.consensus_df)
     isotope_uids_to_remove = set()
 
     # Use strict tolerances for deisotoping (same as declustering)
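Dropping the merged (and later the isotopic) consensus features hinges on negating an `is_in` mask so that `consensus_df` and `consensus_mapping_df` are filtered with the same set of uids. In isolation, with toy data:

    import polars as pl

    consensus_df = pl.DataFrame({"consensus_uid": [1, 2, 3, 4], "mz": [300.1, 300.1, 410.2, 520.3]})
    consensus_mapping_df = pl.DataFrame({"consensus_uid": [1, 1, 2, 3, 4], "feature_uid": [10, 11, 12, 13, 14]})

    uids_to_remove = {2, 4}  # features judged redundant by the cleanup step

    consensus_df = consensus_df.filter(~pl.col("consensus_uid").is_in(list(uids_to_remove)))
    consensus_mapping_df = consensus_mapping_df.filter(~pl.col("consensus_uid").is_in(list(uids_to_remove)))

    print(consensus_df["consensus_uid"].to_list())          # [1, 3]
    print(consensus_mapping_df["consensus_uid"].to_list())  # [1, 1, 3]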
@@ -2919,7 +3016,7 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
 
     # Get current consensus data for isotope detection
     current_consensus_data = []
-    for row in self.consensus_df.iter_rows(named=True):
+    for row in study.consensus_df.iter_rows(named=True):
         current_consensus_data.append({
             'consensus_uid': row['consensus_uid'],
             'mz': row['mz'],
@@ -2970,31 +3067,31 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
 
     # Remove isotopic features
     if isotope_uids_to_remove:
-        self.consensus_df = self.consensus_df.filter(
+        study.consensus_df = study.consensus_df.filter(
             ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
         )
 
         # Also update consensus_mapping_df if it exists
-        if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
-            self.consensus_mapping_df = self.consensus_mapping_df.filter(
+        if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
+            study.consensus_mapping_df = study.consensus_mapping_df.filter(
                 ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
             )
 
-    post_deisotoping_count = len(self.consensus_df)
+    post_deisotoping_count = len(study.consensus_df)
     isotope_reduction = pre_deisotoping_count - post_deisotoping_count
 
     if isotope_reduction > 0:
-        self.logger.debug(f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)")
+        study.logger.debug(f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)")
 
     # Final summary
-    final_count = len(self.consensus_df)
+    final_count = len(study.consensus_df)
     total_reduction = initial_count - final_count
     if total_reduction > 0:
         total_reduction_pct = (total_reduction / initial_count) * 100
-        self.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")
+        study.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")
 
 
-def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
+def _identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
     """
     Identify coeluting consensus features by characteristic mass shifts between adducts
     and update their adduct information accordingly.
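Deisotoping drops consensus features that sit one or two ¹³C spacings (≈1.0034 Da divided by the charge) above a coeluting feature. The hunks above only show the bookkeeping, so the m/z test such a step can apply looks roughly like the sketch below; the tolerance and rules are illustrative, not masster's exact criteria:

    C13_SPACING = 1.0033548  # mass difference between 13C and 12C, in Da

    def looks_isotopic(mz_candidate, mz_base, charge=1, mz_tol=0.01):
        """True if mz_candidate is about 1 or 2 isotope spacings above mz_base."""
        delta = mz_candidate - mz_base
        for k in (1, 2):  # +1 and +2 isotopologues
            if abs(delta - k * C13_SPACING / charge) <= mz_tol:
                return True
        return False

    print(looks_isotopic(302.1445, 301.1411))             # True  (+1 isotope, z=1)
    print(looks_isotopic(301.6428, 301.1411, charge=2))   # True  (+1 isotope, z=2)
    print(looks_isotopic(303.5000, 301.1411))             # False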
@@ -3014,23 +3111,24 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
     from collections import defaultdict
 
     # Check if consensus_df exists and has features
-    if len(self.consensus_df) == 0:
-        self.logger.debug("No consensus features for adduct identification by mass shift")
+    if len(study.consensus_df) == 0:
+        study.logger.debug("No consensus features for adduct identification by mass shift")
         return
 
-    self.logger.info(f"Identifying coeluting adducts by mass shifts in {len(self.consensus_df)} consensus features...")
+    study.logger.info(f"Identifying coeluting adducts by mass shifts in {len(study.consensus_df)} consensus features...")
 
     # Get adducts DataFrame if not provided
     if cached_adducts_df is None or cached_adducts_df.is_empty():
         try:
             # Use lower min_probability for better adduct coverage in mass shift identification
-            cached_adducts_df = self._get_adducts(min_probability=0.01)
+            from masster.study.id import _get_adducts
+            cached_adducts_df = _get_adducts(study, min_probability=0.01)
         except Exception as e:
-            self.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
+            study.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
             return
 
     if cached_adducts_df.is_empty():
-        self.logger.debug("No adducts available for mass shift identification")
+        study.logger.debug("No adducts available for mass shift identification")
         return
 
     # Build catalogue of mass shifts between adducts
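The identification step builds a catalogue of characteristic m/z differences between adduct forms and then looks for coeluting feature pairs whose Δm/z matches an entry within tolerance. A simplified sketch of that matching idea; the adduct shift values are the usual proton/sodium/ammonium ones, while the helper and tolerances are made up:

    # Singly charged positive-mode adduct mass shifts relative to the neutral molecule (Da).
    ADDUCT_SHIFTS = {"[M+H]1+": 1.007276, "[M+Na]1+": 22.989218, "[M+NH4]1+": 18.033823}

    # Catalogue of pairwise m/z differences between adduct forms.
    mass_shift_catalog = {}
    names = list(ADDUCT_SHIFTS)
    for i, a in enumerate(names):
        for b in names[i + 1:]:
            mass_shift_catalog[round(abs(ADDUCT_SHIFTS[a] - ADDUCT_SHIFTS[b]), 4)] = (a, b)

    def match_pair(mz1, mz2, rt1, rt2, rt_tol=2.0, mz_tol=0.005):
        """Return the adduct pair whose catalogued shift explains two coeluting features, if any."""
        if abs(rt1 - rt2) > rt_tol:
            return None  # not coeluting
        dmz = abs(mz1 - mz2)
        for shift, pair in mass_shift_catalog.items():
            if abs(dmz - shift) <= mz_tol:
                return pair
        return None

    # A 180.0634 Da neutral seen as [M+H]+ and [M+Na]+ at the same retention time:
    print(match_pair(181.0707, 203.0526, rt1=301.2, rt2=301.6))  # ('[M+H]1+', '[M+Na]1+')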
@@ -3081,11 +3179,11 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
                 "to_charge": charge2
             })
 
-    self.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")
+    study.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")
 
     # Get consensus features data
     consensus_data = []
-    for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
+    for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
         consensus_data.append({
             "index": i,
             "consensus_uid": row["consensus_uid"],
@@ -3234,7 +3332,7 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
                         }
 
                         updated_count += 2
-                        self.logger.debug(
+                        study.logger.debug(
                             f"Identified adduct pair: {from_adduct_name} (m/z {from_feature['mz']:.4f}) "
                             f"<-> {to_adduct_name} (m/z {to_feature['mz']:.4f}), "
                             f"RT {rt1:.2f}s, Δm/z {mz_diff:.4f}"
@@ -3244,7 +3342,7 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
     # Apply updates to consensus_df
     if adduct_updates:
         # Prepare update data
-        consensus_uids = self.consensus_df["consensus_uid"].to_list()
+        consensus_uids = study.consensus_df["consensus_uid"].to_list()
 
         new_adduct_top = []
         new_adduct_charge_top = []
@@ -3261,88 +3359,88 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
             else:
                 # Keep existing values
                 row_idx = consensus_uids.index(uid)
-                row = self.consensus_df.row(row_idx, named=True)
+                row = study.consensus_df.row(row_idx, named=True)
                 new_adduct_top.append(row.get("adduct_top"))
                 new_adduct_charge_top.append(row.get("adduct_charge_top"))
                 new_adduct_mass_neutral_top.append(row.get("adduct_mass_neutral_top"))
                 new_adduct_mass_shift_top.append(row.get("adduct_mass_shift_top"))
 
         # Update the DataFrame
-        self.consensus_df = self.consensus_df.with_columns([
+        study.consensus_df = study.consensus_df.with_columns([
             pl.Series("adduct_top", new_adduct_top),
             pl.Series("adduct_charge_top", new_adduct_charge_top),
             pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
             pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
         ])
 
-        self.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
+        study.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
     else:
-        self.logger.debug("No consensus features updated based on mass shift analysis")
+        study.logger.debug("No consensus features updated based on mass shift analysis")
 
 
-def _finalize_merge(self, link_ms2, min_samples):
+def _finalize_merge(study, link_ms2, min_samples):
     """Complete the merge process with final calculations and cleanup."""
     import polars as pl
 
     # Check if consensus_df is empty or missing required columns
-    if len(self.consensus_df) == 0 or "number_samples" not in self.consensus_df.columns:
-        self.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
+    if len(study.consensus_df) == 0 or "number_samples" not in study.consensus_df.columns:
+        study.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
         return
 
     # Validate min_samples parameter
     if min_samples is None:
         min_samples = 1
     if min_samples < 1:
-        min_samples = int(min_samples * len(self.samples_df))
+        min_samples = int(min_samples * len(study.samples_df))
 
     # Validate that min_samples doesn't exceed the number of samples
-    if min_samples > len(self.samples_df):
-        self.logger.warning(
-            f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
-            f"Setting min_samples to {len(self.samples_df)}.",
+    if min_samples > len(study.samples_df):
+        study.logger.warning(
+            f"min_samples ({min_samples}) exceeds the number of samples ({len(study.samples_df)}). "
+            f"Setting min_samples to {len(study.samples_df)}.",
         )
-        min_samples = len(self.samples_df)
+        min_samples = len(study.samples_df)
 
     # Filter out consensus features with less than min_samples features
-    l1 = len(self.consensus_df)
-    self.consensus_df = self.consensus_df.filter(
+    l1 = len(study.consensus_df)
+    study.consensus_df = study.consensus_df.filter(
         pl.col("number_samples") >= min_samples,
     )
-    self.logger.debug(
-        f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
+    study.logger.debug(
+        f"Filtered {l1 - len(study.consensus_df)} consensus features with less than {min_samples} samples.",
     )
 
     # Filter out consensus mapping with less than min_samples features
-    self.consensus_mapping_df = self.consensus_mapping_df.filter(
-        pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
+    study.consensus_mapping_df = study.consensus_mapping_df.filter(
+        pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
     )
 
     # Calculate the completeness of the consensus map
     # Log completion with tight cluster metrics
-    if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
+    if len(study.consensus_df) > 0 and len(study.samples_df) > 0:
         c = (
-            len(self.consensus_mapping_df)
-            / len(self.consensus_df)
-            / len(self.samples_df)
+            len(study.consensus_mapping_df)
+            / len(study.consensus_df)
+            / len(study.samples_df)
         )
 
         # Count tight clusters with specified thresholds
-        tight_clusters = _count_tight_clusters(self,mz_tol=0.04, rt_tol=0.3)
+        tight_clusters = _count_tight_clusters(study,mz_tol=0.04, rt_tol=0.3)
 
-        self.logger.info(
-            f"Merging completed. Consensus features: {len(self.consensus_df)}. "
+        study.logger.info(
+            f"Merging completed. Consensus features: {len(study.consensus_df)}. "
             f"Completeness: {c:.2f}. Tight clusters left: {tight_clusters}.",
         )
     else:
-        self.logger.warning(
-            f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
+        study.logger.warning(
+            f"Merging completed with empty result. Consensus features: {len(study.consensus_df)}. "
            f"This may be due to min_samples ({min_samples}) being too high for the available data.",
         )
 
     # add iso data from raw files.
-    self.find_iso()
+    study.find_iso()
     if link_ms2:
-        self.find_ms2()
+        study.find_ms2()
 
 
 def _optimized_feature_lookup(study_obj, features_df):
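The completeness value logged above is the number of feature-to-consensus links divided by (consensus features × samples), so 1.0 means every consensus feature was detected in every sample. With assumed numbers:

    # Toy counts, plugged into the same formula used in _finalize_merge:
    n_links = 1800      # rows in consensus_mapping_df
    n_consensus = 500   # rows in consensus_df
    n_samples = 6       # rows in samples_df

    completeness = n_links / n_consensus / n_samples
    print(f"Completeness: {completeness:.2f}")  # 0.60 -> features seen in 60% of samples on average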
@@ -3419,24 +3517,24 @@ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
 
     # Union-Find for efficient grouping
     class UnionFind:
-        def __init__(self, n):
-            self.parent = list(range(n))
-            self.rank = [0] * n
+        def __init__(study, n):
+            study.parent = list(range(n))
+            study.rank = [0] * n
 
-        def find(self, x):
-            if self.parent[x] != x:
-                self.parent[x] = self.find(self.parent[x])
-            return self.parent[x]
+        def find(study, x):
+            if study.parent[x] != x:
+                study.parent[x] = study.find(study.parent[x])
+            return study.parent[x]
 
-        def union(self, x, y):
-            px, py = self.find(x), self.find(y)
+        def union(study, x, y):
+            px, py = study.find(x), study.find(y)
             if px == py:
                 return
-            if self.rank[px] < self.rank[py]:
+            if study.rank[px] < study.rank[py]:
                 px, py = py, px
-            self.parent[py] = px
-            if self.rank[px] == self.rank[py]:
-                self.rank[px] += 1
+            study.parent[py] = px
+            if study.rank[px] == study.rank[py]:
+                study.rank[px] += 1
 
     uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
     uf = UnionFind(len(valid_features))