masster-0.3.10-py3-none-any.whl → masster-0.3.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/docs/SCX_API_Documentation.md +0 -0
- masster/docs/SCX_DLL_Analysis.md +0 -0
- masster/logger.py +92 -78
- masster/sample/defaults/find_features_def.py +16 -6
- masster/sample/defaults/sample_def.py +1 -1
- masster/sample/h5.py +2 -2
- masster/sample/helpers.py +137 -136
- masster/sample/load.py +13 -9
- masster/sample/plot.py +156 -131
- masster/sample/processing.py +18 -12
- masster/sample/sample.py +4 -4
- masster/sample/sample5_schema.json +62 -62
- masster/sample/save.py +16 -13
- masster/sample/sciex.py +187 -176
- masster/study/defaults/align_def.py +224 -6
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/study_def.py +2 -2
- masster/study/export.py +144 -131
- masster/study/h5.py +193 -133
- masster/study/helpers.py +293 -245
- masster/study/helpers_optimized.py +99 -57
- masster/study/load.py +51 -25
- masster/study/plot.py +453 -17
- masster/study/processing.py +159 -76
- masster/study/save.py +7 -7
- masster/study/study.py +97 -88
- masster/study/study5_schema.json +82 -82
- {masster-0.3.10.dist-info → masster-0.3.11.dist-info}/METADATA +1 -1
- {masster-0.3.10.dist-info → masster-0.3.11.dist-info}/RECORD +33 -31
- {masster-0.3.10.dist-info → masster-0.3.11.dist-info}/WHEEL +0 -0
- {masster-0.3.10.dist-info → masster-0.3.11.dist-info}/entry_points.txt +0 -0
- {masster-0.3.10.dist-info → masster-0.3.11.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py (CHANGED)
```diff
@@ -7,7 +7,7 @@ import pandas as pd
 import polars as pl
 
 from tqdm import tqdm
-
+
 
 def get_chrom(self, uids=None, samples=None):
     # Check if consensus_df is empty or doesn't have required columns
@@ -113,6 +113,7 @@ def get_chrom(self, uids=None, samples=None):
     # Return as Polars DataFrame (can handle complex objects like Chromatogram)
     return df2_pivoted
 
+
 def set_folder(self, folder):
     """
     Set the folder for saving and loading files.
@@ -408,17 +409,21 @@ def _get_sample_uids(self, samples=None, seed=42):
     sample_uids = list(set(sample_uids))
     return sample_uids
 
+
 def get_orphans(self):
-    """
+    """
     Get all features that are not in the consensus mapping.
     """
-    not_in_consensus = self.features_df.filter(
+    not_in_consensus = self.features_df.filter(
+        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list())
+    )
     return not_in_consensus
 
+
 def compress(self, features=True, ms2=True, chrom=False, ms2_max=5):
     """
     Perform compress_features, compress_ms2, and compress_chrom operations.
-
+
     Parameters:
         max_replicates (int): Maximum number of MS2 replicates to keep per consensus_uid and energy combination
     """
```
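The new `get_orphans` body in the hunk above is a Polars anti-join: keep the rows of `features_df` whose `feature_uid` does not appear in `consensus_mapping_df`. A minimal, self-contained sketch of the same pattern, with invented toy data:

```python
import polars as pl

# Toy stand-ins for study.features_df and study.consensus_mapping_df
features_df = pl.DataFrame({"feature_uid": [1, 2, 3, 4], "mz": [100.1, 200.2, 300.3, 400.4]})
consensus_mapping_df = pl.DataFrame({"feature_uid": [1, 3], "consensus_uid": [10, 11]})

# Anti-join: keep features whose feature_uid is NOT in the consensus mapping
orphans = features_df.filter(
    ~pl.col("feature_uid").is_in(consensus_mapping_df["feature_uid"].to_list())
)
print(orphans)  # feature_uids 2 and 4 remain
```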
```diff
@@ -441,48 +446,50 @@ def compress_features(self):
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features_df found.")
         return
-
+
     if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
         self.logger.warning("No consensus_mapping_df found.")
         return
-
+
     initial_count = len(self.features_df)
-
+
     # Get feature_uids that are associated with consensus features
     consensus_feature_uids = self.consensus_mapping_df["feature_uid"].to_list()
-
+
     # Filter features_df to keep only features associated with consensus
     self.features_df = self.features_df.filter(
-        pl.col("feature_uid").is_in(consensus_feature_uids)
+        pl.col("feature_uid").is_in(consensus_feature_uids),
     )
-
+
     # Set ms2_specs column to None if it exists
     if "ms2_specs" in self.features_df.columns:
         # Create a list of None values with the same length as the dataframe
         # This preserves the Object dtype instead of converting to Null
         none_values = [None] * len(self.features_df)
         self.features_df = self.features_df.with_columns(
-            pl.Series("ms2_specs", none_values, dtype=pl.Object)
+            pl.Series("ms2_specs", none_values, dtype=pl.Object),
        )
-
+
     removed_count = initial_count - len(self.features_df)
-    self.logger.info(
+    self.logger.info(
+        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column"
+    )
 
 
 def restore_features(self, samples=None, maps=False):
     """
-    Update specific columns (chrom, chrom_area, ms2_scans, ms2_specs) in features_df
+    Update specific columns (chrom, chrom_area, ms2_scans, ms2_specs) in features_df
     from the corresponding samples by reading features_df from the sample5 file.
     Use the feature_id for matching.
 
     Parameters:
-        samples (list, optional): List of sample_uids or sample_names to restore.
+        samples (list, optional): List of sample_uids or sample_names to restore.
             If None, restores all samples.
         maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
     """
     import datetime
     from masster.sample.sample import Sample
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
         return
```
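The `compress_features` hunk leans on a Polars detail its comments call out: nulling an Object column with a bare `pl.lit(None)` would collapse the column's dtype to Null, so the code builds a None-filled `pl.Series` with an explicit `dtype=pl.Object` instead. A small sketch of the difference, on toy data:

```python
import polars as pl

# Toy frame with an Object column (stand-in for ms2_specs holding spectrum objects)
df = pl.DataFrame({"feature_uid": [1, 2]}).with_columns(
    pl.Series("ms2_specs", [object(), object()], dtype=pl.Object)
)

# Naive nulling would change the column's dtype to Null:
# df.with_columns(pl.lit(None).alias("ms2_specs"))

# A None-filled Series with an explicit dtype keeps pl.Object:
df = df.with_columns(pl.Series("ms2_specs", [None] * len(df), dtype=pl.Object))
print(df.schema)  # ms2_specs stays Object
```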
```diff
@@ -499,8 +506,8 @@ def restore_features(self, samples=None, maps=False):
         return
 
     # Columns to update from sample data
-    columns_to_update = [
-
+    columns_to_update = ["chrom", "chrom_area", "ms2_scans", "ms2_specs"]
+
     self.logger.info(f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...")
 
     # Create a mapping of (sample_uid, feature_id) to feature_uid from study.features_df
@@ -512,10 +519,12 @@ def restore_features(self, samples=None, maps=False):
 
     # Process each sample
     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-    for sample_uid in tqdm(
-
-
-
+    for sample_uid in tqdm(
+        sample_uids,
+        unit="sample",
+        disable=tqdm_disable,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring samples",
+    ):
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         if sample_row.is_empty():
@@ -534,7 +543,7 @@ def restore_features(self, samples=None, maps=False):
             # Load sample to get its features_df
             # Use a direct load call with map=False to prevent feature synchronization
             # which would remove filled features that don't exist in the original FeatureMap
-            sample = Sample(log_level=
+            sample = Sample(log_level="DEBUG")
             sample._load_sample5(sample_path, map=False)
 
             if sample.features_df is None or sample.features_df.is_empty():
@@ -547,34 +556,34 @@ def restore_features(self, samples=None, maps=False):
                 feature_id = row.get("feature_id")
                 if feature_id is None:
                     continue
-
+
                 key = (sample_uid, feature_id)
                 if key in study_feature_mapping:
                     feature_uid = study_feature_mapping[key]
-
+
                     # Update the specific columns in study.features_df
                     for col in columns_to_update:
                         if col in row and col in self.features_df.columns:
                             # Get the original column dtype to preserve it
                             original_dtype = self.features_df[col].dtype
-
+
                             # Update the specific row and column, preserving dtype
                             mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
-
+
                             # Handle object columns (like Chromatogram) differently
                             if original_dtype == pl.Object:
                                 self.features_df = self.features_df.with_columns(
                                     pl.when(mask)
                                     .then(pl.lit(row[col], dtype=original_dtype, allow_object=True))
                                     .otherwise(pl.col(col))
-                                    .alias(col)
+                                    .alias(col),
                                 )
                             else:
                                 self.features_df = self.features_df.with_columns(
                                     pl.when(mask)
                                     .then(pl.lit(row[col], dtype=original_dtype))
                                     .otherwise(pl.col(col))
-                                    .alias(col)
+                                    .alias(col),
                                 )
                             updates_made += 1
 
@@ -582,7 +591,7 @@ def restore_features(self, samples=None, maps=False):
 
             # If maps is True, load featureXML data
             if maps:
-                if hasattr(sample,
+                if hasattr(sample, "feature_maps"):
                     self.feature_maps.extend(sample.feature_maps)
 
         except Exception as e:
@@ -595,14 +604,14 @@ def restore_features(self, samples=None, maps=False):
 def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     """
     Restore chromatograms from individual .sample5 files and gap-fill missing ones.
-
+
     This function combines the functionality of restore_features() and fill_chrom():
     1. First restores chromatograms from individual .sample5 files (like restore_features)
     2. Then gap-fills any remaining empty chromatograms (like fill_chrom)
     3. ONLY updates the 'chrom' column, not chrom_area or other derived values
-
+
     Parameters:
-        samples (list, optional): List of sample_uids or sample_names to process.
+        samples (list, optional): List of sample_uids or sample_names to process.
            If None, processes all samples.
         mz_tol (float): m/z tolerance for gap filling (default: 0.010)
         rt_tol (float): RT tolerance for gap filling (default: 10.0)
@@ -611,7 +620,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     import numpy as np
     from masster.sample.sample import Sample
     from masster.chromatogram import Chromatogram
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
         return
@@ -627,7 +636,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
         return
 
     self.logger.info(f"Restoring chromatograms from {len(sample_uids)} samples...")
-
+
     # Create mapping of (sample_uid, feature_id) to feature_uid
     study_feature_mapping = {}
     for row in self.features_df.iter_rows(named=True):
@@ -638,12 +647,13 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     # Phase 1: Restore from individual .sample5 files (like restore_features)
     restored_count = 0
     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
     self.logger.info("Phase 1: Restoring chromatograms from .sample5 files...")
-    for sample_uid in tqdm(
-
-
-
+    for sample_uid in tqdm(
+        sample_uids,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring from samples",
+        disable=tqdm_disable,
+    ):
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         if sample_row.is_empty():
@@ -660,7 +670,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
 
         try:
             # Load sample (with map=False to prevent feature synchronization)
-            sample = Sample(log_level=
+            sample = Sample(log_level="WARNING")
             sample._load_sample5(sample_path, map=False)
 
             if sample.features_df is None or sample.features_df.is_empty():
@@ -671,21 +681,21 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             for row in sample.features_df.iter_rows(named=True):
                 feature_id = row.get("feature_id")
                 chrom = row.get("chrom")
-
+
                 if feature_id is None or chrom is None:
                     continue
-
+
                 key = (sample_uid, feature_id)
                 if key in study_feature_mapping:
                     feature_uid = study_feature_mapping[key]
-
+
                     # Update only the chrom column
                     mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
                     self.features_df = self.features_df.with_columns(
                         pl.when(mask)
                         .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
                        .otherwise(pl.col("chrom"))
-                        .alias("chrom")
+                        .alias("chrom"),
                     )
                     restored_count += 1
 
@@ -694,20 +704,22 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             continue
 
     self.logger.info(f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files")
-
+
     # Phase 2: Gap-fill remaining empty chromatograms (like fill_chrom)
     self.logger.info("Phase 2: Gap-filling remaining empty chromatograms...")
-
+
     # Count how many chromatograms are still missing
     empty_chroms = self.features_df.filter(pl.col("chrom").is_null()).height
     total_chroms = len(self.features_df)
-
-    self.logger.debug(
-
+
+    self.logger.debug(
+        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)"
+    )
+
     if empty_chroms == 0:
         self.logger.info("All chromatograms restored from .sample5 files. No gap-filling needed.")
         return
-
+
     # Get consensus info for gap filling
     consensus_info = {}
     for row in self.consensus_df.iter_rows(named=True):
@@ -717,23 +729,23 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             "mz": row["mz"],
             "rt": row["rt"],
         }
-
+
     filled_count = 0
-
+
     # Process each sample that has missing chromatograms
-    for sample_uid in tqdm(
-
-
-
+    for sample_uid in tqdm(
+        sample_uids,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Gap-filling missing chromatograms",
+        disable=tqdm_disable,
+    ):
         # Get features with missing chromatograms for this sample
         missing_features = self.features_df.filter(
-            (pl.col("sample_uid") == sample_uid) &
-            (pl.col("chrom").is_null())
+            (pl.col("sample_uid") == sample_uid) & (pl.col("chrom").is_null()),
         )
-
+
         if missing_features.is_empty():
             continue
-
+
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         sample_info = sample_row.row(0, named=True)
@@ -745,10 +757,10 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
 
         try:
             # Load sample for MS1 data extraction
-            sample = Sample(log_level=
+            sample = Sample(log_level="WARNING")
             sample._load_sample5(sample_path, map=False)
 
-            if not hasattr(sample,
+            if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
                 continue
 
             # Process each missing feature
@@ -758,15 +770,15 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                 rt = feature_row["rt"]
                 rt_start = feature_row.get("rt_start", rt - rt_tol)
                 rt_end = feature_row.get("rt_end", rt + rt_tol)
-
+
                 # Extract EIC from MS1 data
                 d = sample.ms1_df.filter(
-                    (pl.col("mz") >= mz - mz_tol)
-                    (pl.col("mz") <= mz + mz_tol)
-                    (pl.col("rt") >= rt_start - rt_tol)
-                    (pl.col("rt") <= rt_end + rt_tol)
+                    (pl.col("mz") >= mz - mz_tol)
+                    & (pl.col("mz") <= mz + mz_tol)
+                    & (pl.col("rt") >= rt_start - rt_tol)
+                    & (pl.col("rt") <= rt_end + rt_tol),
                 )
-
+
                 # Create chromatogram
                 if d.is_empty():
                     # Create empty chromatogram
```
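The gap-filling phase above extracts an EIC by filtering MS1 points to an m/z window and a tolerance-padded RT window, then collapsing to one intensity per retention time. A standalone sketch of that extraction, with invented points and tolerances:

```python
import polars as pl

# Toy MS1 point list: one row per (rt, mz, intensity) centroid
ms1_df = pl.DataFrame({
    "rt":   [10.0, 10.0, 10.5, 11.0, 11.0, 11.5],
    "mz":   [200.05, 350.20, 200.06, 200.04, 500.90, 200.05],
    "inty": [1e4, 2e3, 3e4, 2.5e4, 9e2, 8e3],
})

mz, mz_tol = 200.05, 0.010              # target feature m/z and tolerance
rt_start, rt_end, rt_tol = 10.0, 11.5, 0.5

# Select MS1 points inside the m/z and padded RT window (same shape as the new filter)
d = ms1_df.filter(
    (pl.col("mz") >= mz - mz_tol)
    & (pl.col("mz") <= mz + mz_tol)
    & (pl.col("rt") >= rt_start - rt_tol)
    & (pl.col("rt") <= rt_end + rt_tol)
)

# Collapse to one intensity per RT to obtain the EIC trace
eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
print(eic_rt)
```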
```diff
@@ -784,7 +796,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                 else:
                     # Create real chromatogram from data
                     eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
-
+
                     if len(eic_rt) > 4:
                         eic = Chromatogram(
                             eic_rt["rt"].to_numpy(),
@@ -809,14 +821,14 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                             feature_end=rt_end,
                             feature_apex=rt,
                         )
-
+
                 # Update the chromatogram in the study
                 mask = pl.col("feature_uid") == feature_uid
                 self.features_df = self.features_df.with_columns(
                     pl.when(mask)
                     .then(pl.lit(eic, dtype=pl.Object, allow_object=True))
                     .otherwise(pl.col("chrom"))
-                    .alias("chrom")
+                    .alias("chrom"),
                 )
                 filled_count += 1
 
@@ -825,12 +837,14 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             continue
 
     self.logger.info(f"Phase 2 complete: Gap-filled {filled_count} chromatograms")
-
+
     # Final summary
     final_non_null = self.features_df.filter(pl.col("chrom").is_not_null()).height
     final_total = len(self.features_df)
-
-    self.logger.info(
+
+    self.logger.info(
+        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)"
+    )
     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
 
 
@@ -839,41 +853,39 @@ def compress_ms2(self, max_replicates=5):
     Reduce the number of entries matching any pair of (consensus and energy) to max XY rows.
     Groups all rows by consensus_uid and energy. For each group, sort by number_frags * prec_inty,
     and then pick the top XY rows. Discard the others.
-
+
     Parameters:
         max_replicates (int): Maximum number of replicates to keep per consensus_uid and energy combination
     """
     if self.consensus_ms2 is None or self.consensus_ms2.is_empty():
         self.logger.warning("No consensus_ms2 found.")
         return
-
+
     initial_count = len(self.consensus_ms2)
-
+
     # Create a ranking score based on number_frags * prec_inty
     # Handle None values by treating them as 0
     self.consensus_ms2 = self.consensus_ms2.with_columns([
-        (
-            pl.col("number_frags").fill_null(0) *
-            pl.col("prec_inty").fill_null(0)
-        ).alias("ranking_score")
+        (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)).alias("ranking_score"),
     ])
-
+
     # Group by consensus_uid and energy, then rank by score and keep top max_replicates
     compressed_ms2 = (
-        self.consensus_ms2
-        .with_row_count("row_id")  # Add row numbers for stable sorting
+        self.consensus_ms2.with_row_count("row_id")  # Add row numbers for stable sorting
         .sort(["consensus_uid", "energy", "ranking_score", "row_id"], descending=[False, False, True, False])
         .with_columns([
-            pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank")
+            pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
         ])
         .filter(pl.col("rank") < max_replicates)
         .drop(["ranking_score", "row_id", "rank"])
     )
-
+
     self.consensus_ms2 = compressed_ms2
-
+
     removed_count = initial_count - len(self.consensus_ms2)
-    self.logger.info(
+    self.logger.info(
+        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair"
+    )
 
 
 def compress_chrom(self):
```
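The reworked `compress_ms2` above is a top-N-per-group pattern: score each row, sort with a row counter as a stable tie-break, rank within each `(consensus_uid, energy)` group via `pl.int_range(pl.len()).over(...)`, and keep ranks below `max_replicates`. A self-contained sketch of the same pattern on toy data:

```python
import polars as pl

max_replicates = 2
ms2 = pl.DataFrame({
    "consensus_uid": [1, 1, 1, 2, 2],
    "energy":        [20, 20, 20, 40, 40],
    "number_frags":  [5, None, 12, 3, 8],
    "prec_inty":     [1e5, 2e5, 1e4, None, 5e4],
})

kept = (
    ms2.with_columns(
        (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)).alias("ranking_score")
    )
    .with_row_count("row_id")  # stable tie-break for equal scores
    .sort(
        ["consensus_uid", "energy", "ranking_score", "row_id"],
        descending=[False, False, True, False],
    )
    .with_columns(pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"))
    .filter(pl.col("rank") < max_replicates)  # top-N rows per group
    .drop(["ranking_score", "row_id", "rank"])
)
print(kept)
```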
```diff
@@ -886,49 +898,49 @@ def compress_chrom(self):
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features_df found.")
         return
-
+
     if "chrom" not in self.features_df.columns:
         self.logger.warning("No 'chrom' column found in features_df.")
         return
-
+
     # Count non-null chromatograms before compression
     non_null_count = self.features_df.filter(pl.col("chrom").is_not_null()).height
-
+
     # Set chrom column to None while keeping dtype as object
     self.features_df = self.features_df.with_columns(
-        pl.lit(None, dtype=pl.Object).alias("chrom")
+        pl.lit(None, dtype=pl.Object).alias("chrom"),
     )
-
+
     self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")
 
 
 def set_source(self, filename):
     """
-    Reassign file_source for all samples in samples_df. If filename contains only a path,
-    keep the current basename and build an absolute path. Check that the new file exists
+    Reassign file_source for all samples in samples_df. If filename contains only a path,
+    keep the current basename and build an absolute path. Check that the new file exists
     before overwriting the old file_source.
-
+
     Parameters:
         filename (str): New file path or directory path for all samples
-
+
     Returns:
         None
     """
     import os
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     updated_count = 0
     failed_count = 0
-
+
     # Get all current file_source values
     current_sources = self.samples_df.get_column("file_source").to_list()
     sample_names = self.samples_df.get_column("sample_name").to_list()
-
+
     new_sources = []
-
+
     for i, (current_source, sample_name) in enumerate(zip(current_sources, sample_names)):
         # Check if filename is just a directory path
         if os.path.isdir(filename):
@@ -937,7 +949,7 @@ def set_source(self, filename):
             new_sources.append(current_source)
             failed_count += 1
             continue
-
+
         # Get the basename from current file_source
         current_basename = os.path.basename(current_source)
         # Build new absolute path
@@ -945,26 +957,26 @@ def set_source(self, filename):
         else:
             # filename is a full path, make it absolute
             new_file_path = os.path.abspath(filename)
-
+
         # Check if the new file exists
         if not os.path.exists(new_file_path):
             self.logger.warning(f"File does not exist for sample '{sample_name}': {new_file_path}")
             new_sources.append(current_source)
             failed_count += 1
             continue
-
+
         # File exists, update source
         new_sources.append(new_file_path)
         updated_count += 1
-
+
         # Log individual updates at debug level
         self.logger.debug(f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}")
-
+
     # Update the samples_df with new file_source values
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("file_source", new_sources).alias("file_source")
+        pl.Series("file_source", new_sources).alias("file_source"),
     )
-
+
     # Log summary
     if updated_count > 0:
         self.logger.info(f"Updated file_source for {updated_count} samples")
```
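Per its docstring, `set_source` rebases each sample's `file_source`: a directory argument keeps the sample's current basename and joins it onto the new directory, a file argument is made absolute, and the old value is kept whenever the candidate path does not exist. A minimal sketch of that decision, using a hypothetical helper and paths (not the library's API):

```python
import os

def rebase_source(current_source: str, filename: str) -> str | None:
    """Sketch of the set_source path logic: return the new path, or None if it doesn't exist."""
    if os.path.isdir(filename):
        # Directory given: keep the sample's current basename, rebase onto the new directory
        candidate = os.path.abspath(os.path.join(filename, os.path.basename(current_source)))
    else:
        # Full path given: just make it absolute
        candidate = os.path.abspath(filename)
    return candidate if os.path.exists(candidate) else None

# Hypothetical usage: move a raw file reference to a new data folder
print(rebase_source("/old/run01.wiff", "/data/new_location"))
```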
```diff
@@ -990,9 +1002,9 @@ def features_select(
 ):
     """
     Select features from features_df based on specified criteria and return the filtered DataFrame.
-
+
     OPTIMIZED VERSION: Combines all filters into a single operation for better performance.
-
+
     Parameters:
         mz: m/z range filter (tuple for range, single value for minimum)
         rt: retention time range filter (tuple for range, single value for minimum)
@@ -1007,30 +1019,42 @@ def features_select(
         chrom_prominence: chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_prominence_scaled: scaled chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_height_scaled: scaled chromatogram height filter (tuple for range, single value for minimum)
-
+
     Returns:
         polars.DataFrame: Filtered features DataFrame
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return pl.DataFrame()
-
+
     # Early return if no filters provided - performance optimization
-    filter_params = [
-
-
+    filter_params = [
+        mz,
+        rt,
+        inty,
+        sample_uid,
+        sample_name,
+        consensus_uid,
+        feature_uid,
+        filled,
+        quality,
+        chrom_coherence,
+        chrom_prominence,
+        chrom_prominence_scaled,
+        chrom_height_scaled,
+    ]
     if all(param is None for param in filter_params):
         return self.features_df.clone()
-
+
     initial_count = len(self.features_df)
-
+
     # Pre-check available columns once for efficiency
     available_columns = set(self.features_df.columns)
-
+
     # Build all filter conditions first, then apply them all at once
     filter_conditions = []
     warnings = []
-
+
     # Filter by m/z
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
@@ -1038,7 +1062,7 @@ def features_select(
             filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             filter_conditions.append(pl.col("mz") >= mz)
-
+
     # Filter by retention time
     if rt is not None:
         if isinstance(rt, tuple) and len(rt) == 2:
@@ -1046,7 +1070,7 @@ def features_select(
             filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             filter_conditions.append(pl.col("rt") >= rt)
-
+
     # Filter by intensity
     if inty is not None:
         if isinstance(inty, tuple) and len(inty) == 2:
@@ -1054,7 +1078,7 @@ def features_select(
             filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
         else:
             filter_conditions.append(pl.col("inty") >= inty)
-
+
     # Filter by sample_uid
     if sample_uid is not None:
         if isinstance(sample_uid, (list, tuple)):
@@ -1067,24 +1091,24 @@ def features_select(
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
         else:
             filter_conditions.append(pl.col("sample_uid") == sample_uid)
-
+
     # Filter by sample_name (requires pre-processing)
     if sample_name is not None:
         # Get sample_uids for the given sample names
         if isinstance(sample_name, list):
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name").is_in(sample_name)
+                pl.col("sample_name").is_in(sample_name),
             )["sample_uid"].to_list()
         else:
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name") == sample_name
+                pl.col("sample_name") == sample_name,
             )["sample_uid"].to_list()
-
+
         if sample_uids_for_names:
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uids_for_names))
         else:
             filter_conditions.append(pl.lit(False))  # No matching samples
-
+
     # Filter by consensus_uid
     if consensus_uid is not None:
         if isinstance(consensus_uid, (list, tuple)):
@@ -1097,7 +1121,7 @@ def features_select(
             filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
         else:
             filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
-
+
     # Filter by feature_uid
     if feature_uid is not None:
         if isinstance(feature_uid, (list, tuple)):
@@ -1110,7 +1134,7 @@ def features_select(
             filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
         else:
             filter_conditions.append(pl.col("feature_uid") == feature_uid)
-
+
     # Filter by filled status
     if filled is not None:
         if "filled" in available_columns:
@@ -1120,7 +1144,7 @@ def features_select(
             filter_conditions.append(~pl.col("filled") | pl.col("filled").is_null())
         else:
             warnings.append("'filled' column not found in features_df")
-
+
     # Filter by quality
     if quality is not None:
         if "quality" in available_columns:
@@ -1131,73 +1155,83 @@ def features_select(
             filter_conditions.append(pl.col("quality") >= quality)
         else:
             warnings.append("'quality' column not found in features_df")
-
+
     # Filter by chromatogram coherence
     if chrom_coherence is not None:
         if "chrom_coherence" in available_columns:
             if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
                 min_coherence, max_coherence = chrom_coherence
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
         else:
             warnings.append("'chrom_coherence' column not found in features_df")
-
+
     # Filter by chromatogram prominence
     if chrom_prominence is not None:
         if "chrom_prominence" in available_columns:
             if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
                 min_prominence, max_prominence = chrom_prominence
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
         else:
             warnings.append("'chrom_prominence' column not found in features_df")
-
+
     # Filter by scaled chromatogram prominence
     if chrom_prominence_scaled is not None:
         if "chrom_prominence_scaled" in available_columns:
             if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
+                    & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
         else:
             warnings.append("'chrom_prominence_scaled' column not found in features_df")
-
+
     # Filter by scaled chromatogram height
     if chrom_height_scaled is not None:
         if "chrom_height_scaled" in available_columns:
             if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled
-                filter_conditions.append(
+                filter_conditions.append(
+                    (pl.col("chrom_height_scaled") >= min_height_scaled)
+                    & (pl.col("chrom_height_scaled") <= max_height_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
         else:
             warnings.append("'chrom_height_scaled' column not found in features_df")
-
+
     # Log all warnings once at the end for efficiency
     for warning in warnings:
         self.logger.warning(warning)
-
+
     # Apply all filters at once using lazy evaluation for optimal performance
     if filter_conditions:
         # Combine all conditions with AND
         combined_filter = filter_conditions[0]
         for condition in filter_conditions[1:]:
             combined_filter = combined_filter & condition
-
+
         # Apply the combined filter using lazy evaluation
         feats = self.features_df.lazy().filter(combined_filter).collect()
     else:
         feats = self.features_df.clone()
-
+
     final_count = len(feats)
-
+
     if final_count == 0:
         self.logger.warning("No features remaining after applying selection criteria.")
     else:
-        #removed_count = initial_count - final_count
+        # removed_count = initial_count - final_count
         self.logger.info(f"Features selected: {final_count} (out of {initial_count})")
 
     return feats
```
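`features_select` builds one Polars expression per supplied criterion, folds the list into a single predicate with `&`, and applies it in one lazy `filter`/`collect` pass. The same pattern in isolation, on toy data:

```python
import polars as pl
from functools import reduce

df = pl.DataFrame({
    "mz":   [100.00, 250.30, 250.31, 900.00],
    "rt":   [12.0, 65.0, 66.0, 300.0],
    "inty": [1e3, 5e5, 2e5, 1e2],
})

# Build conditions only for the filters the caller supplied
conditions = [
    (pl.col("mz") >= 250.0) & (pl.col("mz") <= 251.0),  # mz range
    pl.col("rt") >= 60.0,                               # rt minimum
    pl.col("inty") >= 1e5,                              # intensity minimum
]

# Fold into a single predicate and apply it in one lazy pass
combined = reduce(lambda a, b: a & b, conditions)
selected = df.lazy().filter(combined).collect()
print(selected)
```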
@@ -1207,29 +1241,29 @@ def features_filter(self, features):
|
|
|
1207
1241
|
"""
|
|
1208
1242
|
Filter features_df by keeping only features that match the given criteria.
|
|
1209
1243
|
This keeps only the specified features and removes all others.
|
|
1210
|
-
|
|
1244
|
+
|
|
1211
1245
|
OPTIMIZED VERSION: Batch operations and reduced overhead for better performance.
|
|
1212
|
-
|
|
1246
|
+
|
|
1213
1247
|
Parameters:
|
|
1214
1248
|
features: Features to keep. Can be:
|
|
1215
1249
|
- polars.DataFrame: Features DataFrame (will use feature_uid column)
|
|
1216
1250
|
- list: List of feature_uids to keep
|
|
1217
1251
|
- int: Single feature_uid to keep
|
|
1218
|
-
|
|
1252
|
+
|
|
1219
1253
|
Returns:
|
|
1220
1254
|
None (modifies self.features_df in place)
|
|
1221
1255
|
"""
|
|
1222
1256
|
if self.features_df is None or self.features_df.is_empty():
|
|
1223
1257
|
self.logger.warning("No features found in study.")
|
|
1224
1258
|
return
|
|
1225
|
-
|
|
1259
|
+
|
|
1226
1260
|
# Early return if no features provided
|
|
1227
1261
|
if features is None:
|
|
1228
1262
|
self.logger.warning("No features provided for filtering.")
|
|
1229
1263
|
return
|
|
1230
|
-
|
|
1264
|
+
|
|
1231
1265
|
initial_count = len(self.features_df)
|
|
1232
|
-
|
|
1266
|
+
|
|
1233
1267
|
# Determine feature_uids to keep - optimized type checking
|
|
1234
1268
|
if isinstance(features, pl.DataFrame):
|
|
1235
1269
|
if "feature_uid" not in features.columns:
|
|
@@ -1243,44 +1277,41 @@ def features_filter(self, features):
|
|
|
1243
1277
|
else:
|
|
1244
1278
|
self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
|
|
1245
1279
|
return
|
|
1246
|
-
|
|
1280
|
+
|
|
1247
1281
|
# Early return if no UIDs to keep
|
|
1248
1282
|
if not feature_uids_to_keep:
|
|
1249
1283
|
self.logger.warning("No feature UIDs provided for filtering.")
|
|
1250
1284
|
return
|
|
1251
|
-
|
|
1285
|
+
|
|
1252
1286
|
# Convert to set for faster lookup if list is large
|
|
1253
1287
|
if len(feature_uids_to_keep) > 100:
|
|
1254
1288
|
feature_uids_set = set(feature_uids_to_keep)
|
|
1255
1289
|
# Use the set for filtering if it's significantly smaller
|
|
1256
1290
|
if len(feature_uids_set) < len(feature_uids_to_keep) * 0.8:
|
|
1257
1291
|
feature_uids_to_keep = list(feature_uids_set)
|
|
1258
|
-
|
|
1292
|
+
|
|
1259
1293
|
# Create filter condition once - keep only the specified features
|
|
1260
1294
|
filter_condition = pl.col("feature_uid").is_in(feature_uids_to_keep)
|
|
1261
|
-
|
|
1295
|
+
|
|
1262
1296
|
# Apply filter to features_df using lazy evaluation for better performance
|
|
1263
1297
|
self.features_df = self.features_df.lazy().filter(filter_condition).collect()
|
|
1264
|
-
|
|
1298
|
+
|
|
1265
1299
|
# Apply filter to consensus_mapping_df if it exists - batch operation
|
|
1266
1300
|
mapping_removed_count = 0
|
|
1267
1301
|
if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
|
|
1268
1302
|
initial_mapping_count = len(self.consensus_mapping_df)
|
|
1269
|
-
self.consensus_mapping_df = (
|
|
1270
|
-
self.consensus_mapping_df
|
|
1271
|
-
.lazy()
|
|
1272
|
-
.filter(filter_condition)
|
|
1273
|
-
.collect()
|
|
1274
|
-
)
|
|
1303
|
+
self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
|
|
1275
1304
|
mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
|
|
1276
|
-
|
|
1305
|
+
|
|
1277
1306
|
# Calculate results once and log efficiently
|
|
1278
1307
|
final_count = len(self.features_df)
|
|
1279
1308
|
removed_count = initial_count - final_count
|
|
1280
|
-
|
|
1309
|
+
|
|
1281
1310
|
# Single comprehensive log message
|
|
1282
1311
|
if mapping_removed_count > 0:
|
|
1283
|
-
self.logger.info(
|
|
1312
|
+
self.logger.info(
|
|
1313
|
+
f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features."
|
|
1314
|
+
)
|
|
1284
1315
|
else:
|
|
1285
1316
|
self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
|
|
1286
1317
|
|
|
@@ -1289,27 +1320,27 @@ def features_delete(self, features):
|
|
|
1289
1320
|
"""
|
|
1290
1321
|
Delete features from features_df based on feature identifiers.
|
|
1291
1322
|
This removes the specified features and keeps all others (opposite of features_filter).
|
|
1292
|
-
|
|
1323
|
+
|
|
1293
1324
|
Parameters:
|
|
1294
1325
|
features: Features to delete. Can be:
|
|
1295
1326
|
- polars.DataFrame: Features DataFrame (will use feature_uid column)
|
|
1296
1327
|
- list: List of feature_uids to delete
|
|
1297
1328
|
- int: Single feature_uid to delete
|
|
1298
|
-
|
|
1329
|
+
|
|
1299
1330
|
Returns:
|
|
1300
1331
|
None (modifies self.features_df in place)
|
|
1301
1332
|
"""
|
|
1302
1333
|
if self.features_df is None or self.features_df.is_empty():
|
|
1303
1334
|
self.logger.warning("No features found in study.")
|
|
1304
1335
|
return
|
|
1305
|
-
|
|
1336
|
+
|
|
1306
1337
|
# Early return if no features provided
|
|
1307
1338
|
if features is None:
|
|
1308
1339
|
self.logger.warning("No features provided for deletion.")
|
|
1309
1340
|
return
|
|
1310
|
-
|
|
1341
|
+
|
|
1311
1342
|
initial_count = len(self.features_df)
|
|
1312
|
-
|
|
1343
|
+
|
|
1313
1344
|
# Determine feature_uids to remove - optimized type checking
|
|
1314
1345
|
if isinstance(features, pl.DataFrame):
|
|
1315
1346
|
if "feature_uid" not in features.columns:
|
|
@@ -1323,44 +1354,41 @@ def features_delete(self, features):
|
|
|
1323
1354
|
else:
|
|
1324
1355
|
self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
|
|
1325
1356
|
return
|
|
1326
|
-
|
|
1357
|
+
|
|
1327
1358
|
# Early return if no UIDs to remove
|
|
1328
1359
|
if not feature_uids_to_remove:
|
|
1329
1360
|
self.logger.warning("No feature UIDs provided for deletion.")
|
|
1330
1361
|
return
|
|
1331
|
-
|
|
1362
|
+
|
|
1332
1363
|
# Convert to set for faster lookup if list is large
|
|
1333
1364
|
if len(feature_uids_to_remove) > 100:
|
|
1334
1365
|
feature_uids_set = set(feature_uids_to_remove)
|
|
1335
1366
|
# Use the set for filtering if it's significantly smaller
|
|
1336
1367
|
if len(feature_uids_set) < len(feature_uids_to_remove) * 0.8:
|
|
1337
1368
|
feature_uids_to_remove = list(feature_uids_set)
|
|
1338
|
-
|
|
1369
|
+
|
|
1339
1370
|
# Create filter condition - remove specified features
|
|
1340
1371
|
filter_condition = ~pl.col("feature_uid").is_in(feature_uids_to_remove)
|
|
1341
|
-
|
|
1372
|
+
|
|
1342
1373
|
# Apply filter to features_df using lazy evaluation for better performance
|
|
1343
1374
|
self.features_df = self.features_df.lazy().filter(filter_condition).collect()
|
|
1344
|
-
|
|
1375
|
+
|
|
1345
1376
|
# Apply filter to consensus_mapping_df if it exists - batch operation
|
|
1346
1377
|
mapping_removed_count = 0
|
|
1347
1378
|
if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
|
|
1348
1379
|
initial_mapping_count = len(self.consensus_mapping_df)
|
|
1349
|
-
self.consensus_mapping_df = (
|
|
1350
|
-
self.consensus_mapping_df
|
|
1351
|
-
.lazy()
|
|
1352
|
-
.filter(filter_condition)
|
|
1353
|
-
.collect()
|
|
1354
|
-
)
|
|
1380
|
+
self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
|
|
1355
1381
|
mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
|
|
1356
|
-
|
|
1382
|
+
|
|
1357
1383
|
# Calculate results once and log efficiently
|
|
1358
1384
|
final_count = len(self.features_df)
|
|
1359
1385
|
removed_count = initial_count - final_count
|
|
1360
|
-
|
|
1386
|
+
|
|
1361
1387
|
# Single comprehensive log message
|
|
1362
1388
|
if mapping_removed_count > 0:
|
|
1363
|
-
self.logger.info(
|
|
1389
|
+
self.logger.info(
|
|
1390
|
+
f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}"
|
|
1391
|
+
)
|
|
1364
1392
|
else:
|
|
1365
1393
|
self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
|
|
1366
1394
|
|
|
@@ -1384,7 +1412,7 @@ def consensus_select(
|
|
|
1384
1412
|
):
|
|
1385
1413
|
"""
|
|
1386
1414
|
Select consensus features from consensus_df based on specified criteria and return the filtered DataFrame.
|
|
1387
|
-
|
|
1415
|
+
|
|
1388
1416
|
Parameters:
|
|
1389
1417
|
mz: m/z range filter (tuple for range, single value for minimum)
|
|
1390
1418
|
rt: retention time range filter (tuple for range, single value for minimum)
|
|
@@ -1400,17 +1428,17 @@ def consensus_select(
|
|
|
1400
1428
|
chrom_prominence_scaled_mean: mean scaled chromatogram prominence filter (tuple for range, single value for minimum)
|
|
1401
1429
|
chrom_height_scaled_mean: mean scaled chromatogram height filter (tuple for range, single value for minimum)
|
|
1402
1430
|
rt_delta_mean: mean RT delta filter (tuple for range, single value for minimum)
|
|
1403
|
-
|
|
1431
|
+
|
|
1404
1432
|
Returns:
|
|
1405
1433
|
polars.DataFrame: Filtered consensus DataFrame
|
|
1406
1434
|
"""
|
|
1407
1435
|
if self.consensus_df is None or self.consensus_df.is_empty():
|
|
1408
1436
|
self.logger.warning("No consensus features found in study.")
|
|
1409
1437
|
return pl.DataFrame()
|
|
1410
|
-
|
|
1438
|
+
|
|
1411
1439
|
consensus = self.consensus_df.clone()
|
|
1412
1440
|
initial_count = len(consensus)
|
|
1413
|
-
|
|
1441
|
+
|
|
1414
1442
|
# Filter by m/z
|
|
1415
1443
|
if mz is not None:
|
|
1416
1444
|
consensus_len_before_filter = len(consensus)
|
|
@@ -1420,9 +1448,9 @@ def consensus_select(
|
|
|
1420
1448
|
else:
|
|
1421
1449
|
consensus = consensus.filter(pl.col("mz") >= mz)
|
|
1422
1450
|
self.logger.debug(
|
|
1423
|
-
f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}"
|
|
1451
|
+
f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
1424
1452
|
)
|
|
1425
|
-
|
|
1453
|
+
|
|
1426
1454
|
# Filter by retention time
|
|
1427
1455
|
if rt is not None:
|
|
1428
1456
|
consensus_len_before_filter = len(consensus)
|
|
@@ -1432,9 +1460,9 @@ def consensus_select(
|
|
|
1432
1460
|
else:
|
|
1433
1461
|
consensus = consensus.filter(pl.col("rt") >= rt)
|
|
1434
1462
|
self.logger.debug(
|
|
1435
|
-
f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}"
|
|
1463
|
+
f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
1436
1464
|
)
|
|
1437
|
-
|
|
1465
|
+
|
|
1438
1466
|
# Filter by mean intensity
|
|
1439
1467
|
if inty_mean is not None:
|
|
1440
1468
|
consensus_len_before_filter = len(consensus)
|
|
@@ -1444,9 +1472,9 @@ def consensus_select(
|
|
|
1444
1472
|
else:
|
|
1445
1473
|
consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
|
|
1446
1474
|
self.logger.debug(
|
|
1447
|
-
f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
|
|
1475
|
+
f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
1448
1476
|
)
|
|
1449
|
-
|
|
1477
|
+
|
|
1450
1478
|
# Filter by consensus_uid
|
|
1451
1479
|
if consensus_uid is not None:
|
|
1452
1480
|
consensus_len_before_filter = len(consensus)
|
|
@@ -1454,16 +1482,18 @@ def consensus_select(
|
|
|
1454
1482
|
if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
|
|
1455
1483
|
# Treat as range
|
|
1456
1484
|
min_uid, max_uid = consensus_uid
|
|
1457
|
-
consensus = consensus.filter(
|
|
1485
|
+
consensus = consensus.filter(
|
|
1486
|
+
(pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid)
|
|
1487
|
+
)
|
|
1458
1488
|
else:
|
|
1459
1489
|
# Treat as list
|
|
1460
1490
|
consensus = consensus.filter(pl.col("consensus_uid").is_in(consensus_uid))
|
|
1461
1491
|
else:
|
|
1462
1492
|
consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
|
|
1463
1493
|
self.logger.debug(
|
|
1464
|
-
f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}"
|
|
1494
|
+
f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
1465
1495
|
)
|
|
1466
|
-
|
|
1496
|
+
|
|
1467
1497
|
# Filter by consensus_id
|
|
1468
1498
|
if consensus_id is not None:
|
|
1469
1499
|
consensus_len_before_filter = len(consensus)
|
|
@@ -1472,21 +1502,23 @@ def consensus_select(
|
|
|
1472
1502
|
else:
|
|
1473
1503
|
consensus = consensus.filter(pl.col("consensus_id") == consensus_id)
|
|
1474
1504
|
self.logger.debug(
|
|
1475
|
-
f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}"
|
|
1505
|
+
f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
1476
1506
|
)
|
|
1477
|
-
|
|
1507
|
+
|
|
1478
1508
|
# Filter by number of samples
|
|
1479
1509
|
if number_samples is not None:
|
|
1480
1510
|
consensus_len_before_filter = len(consensus)
|
|
1481
1511
|
if isinstance(number_samples, tuple) and len(number_samples) == 2:
|
|
1482
1512
|
min_samples, max_samples = number_samples
|
|
1483
|
-
consensus = consensus.filter(
|
|
1513
|
+
consensus = consensus.filter(
|
|
1514
|
+
(pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples)
|
|
1515
|
+
)
|
|
1484
1516
|
else:
|
|
1485
1517
|
consensus = consensus.filter(pl.col("number_samples") >= number_samples)
|
|
1486
1518
|
self.logger.debug(
|
|
1487
|
-
f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}"
|
|
1519
|
+
f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
1488
1520
|
)
|
|
1489
|
-
|
|
1521
|
+
|
|
1490
1522
|
# Filter by number of MS2 spectra
|
|
1491
1523
|
if number_ms2 is not None:
|
|
1492
1524
|
consensus_len_before_filter = len(consensus)
|
|
@@ -1499,9 +1531,9 @@ def consensus_select(
|
|
|
1499
1531
|
else:
|
|
1500
1532
|
self.logger.warning("'number_ms2' column not found in consensus_df")
|
|
1501
1533
|
self.logger.debug(
|
|
1502
|
-
f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}"
|
|
1534
|
+
f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
1503
1535
|
)
|
|
1504
|
-
|
|
1536
|
+
|
|
1505
1537
|
# Filter by quality
|
|
1506
1538
|
if quality is not None:
|
|
1507
1539
|
consensus_len_before_filter = len(consensus)
|
|
@@ -1511,9 +1543,9 @@ def consensus_select(
|
|
|
1511
1543
|
else:
|
|
1512
1544
|
consensus = consensus.filter(pl.col("quality") >= quality)
|
|
1513
1545
|
self.logger.debug(
|
|
1514
|
-
f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}"
|
|
1546
|
+
f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
1515
1547
|
)
|
|
1516
|
-
|
|
1548
|
+
|
|
1517
1549
|
# Filter by baseline
|
|
1518
1550
|
if bl is not None:
|
|
1519
1551
|
consensus_len_before_filter = len(consensus)
|
|
@@ -1526,89 +1558,103 @@ def consensus_select(
|
|
|
1526
1558
|
else:
|
|
1527
1559
|
self.logger.warning("'bl' column not found in consensus_df")
|
|
1528
1560
         self.logger.debug(
-            f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean chromatogram coherence
     if chrom_coherence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_coherence_mean" in consensus.columns:
             if isinstance(chrom_coherence_mean, tuple) and len(chrom_coherence_mean) == 2:
                 min_coherence, max_coherence = chrom_coherence_mean
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    (pl.col("chrom_coherence_mean") >= min_coherence)
+                    & (pl.col("chrom_coherence_mean") <= max_coherence)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
         else:
             self.logger.warning("'chrom_coherence_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean chromatogram prominence
     if chrom_prominence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_mean" in consensus.columns:
             if isinstance(chrom_prominence_mean, tuple) and len(chrom_prominence_mean) == 2:
                 min_prominence, max_prominence = chrom_prominence_mean
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    (pl.col("chrom_prominence_mean") >= min_prominence)
+                    & (pl.col("chrom_prominence_mean") <= max_prominence)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
         else:
             self.logger.warning("'chrom_prominence_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean scaled chromatogram prominence
     if chrom_prominence_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_scaled_mean" in consensus.columns:
             if isinstance(chrom_prominence_scaled_mean, tuple) and len(chrom_prominence_scaled_mean) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
+                    & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
         else:
             self.logger.warning("'chrom_prominence_scaled_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean scaled chromatogram height
     if chrom_height_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_height_scaled_mean" in consensus.columns:
             if isinstance(chrom_height_scaled_mean, tuple) and len(chrom_height_scaled_mean) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled_mean
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
+                    & (pl.col("chrom_height_scaled_mean") <= max_height_scaled)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
         else:
             self.logger.warning("'chrom_height_scaled_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean RT delta
     if rt_delta_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "rt_delta_mean" in consensus.columns:
             if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
                 min_rt_delta, max_rt_delta = rt_delta_mean
-                consensus = consensus.filter(
+                consensus = consensus.filter(
+                    (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta)
+                )
             else:
                 consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
         else:
             self.logger.warning("'rt_delta_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     if len(consensus) == 0:
         self.logger.warning("No consensus features remaining after applying selection criteria.")
     else:
         self.logger.info(f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})")
-
+
     return consensus
 
 
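Every `*_mean` criterion in the hunk above follows the same convention: a 2-tuple is treated as an inclusive (min, max) range, while a scalar is treated as a lower bound (note that this holds even for rt_delta_mean). A minimal standalone sketch of that dispatch, using polars and a toy frame; the helper name, column values, and thresholds here are illustrative only, not part of masster's API:

import polars as pl

# Toy stand-in for consensus_df; values are made up for illustration.
consensus = pl.DataFrame({"chrom_coherence_mean": [0.2, 0.55, 0.8, 0.95]})

def select_by_threshold(df: pl.DataFrame, column: str, threshold) -> pl.DataFrame:
    # Mirrors the isinstance(..., tuple) dispatch in the hunk above:
    # a (min, max) tuple selects an inclusive range, a scalar a lower bound.
    if isinstance(threshold, tuple) and len(threshold) == 2:
        lo, hi = threshold
        return df.filter((pl.col(column) >= lo) & (pl.col(column) <= hi))
    return df.filter(pl.col(column) >= threshold)

print(select_by_threshold(consensus, "chrom_coherence_mean", 0.5))         # lower bound: keeps 0.55, 0.8, 0.95
print(select_by_threshold(consensus, "chrom_coherence_mean", (0.5, 0.9)))  # inclusive range: keeps 0.55, 0.8
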
@@ -1616,22 +1662,22 @@ def consensus_filter(self, consensus):
     """
     Filter consensus_df by removing all consensus features that match the given criteria.
     This also removes related entries from consensus_mapping_df, features_df, and consensus_ms2.
-
+
     Parameters:
         consensus: Consensus features to remove. Can be:
             - polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
             - list: List of consensus_uids to remove
             - int: Single consensus_uid to remove
-
+
     Returns:
         None (modifies self.consensus_df and related DataFrames in place)
     """
     if self.consensus_df is None or self.consensus_df.is_empty():
         self.logger.warning("No consensus features found in study.")
         return
-
+
     initial_consensus_count = len(self.consensus_df)
-
+
     # Determine consensus_uids to remove
     if isinstance(consensus, pl.DataFrame):
         if "consensus_uid" not in consensus.columns:
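The docstring above documents the three accepted input types for consensus_filter. A short usage sketch; the study object and the UID values here are hypothetical, and pl is polars:

import polars as pl

# Remove a single consensus feature by UID.
study.consensus_filter(42)

# Remove several consensus features at once.
study.consensus_filter([42, 117, 3051])

# Remove everything matched by a prior selection; the consensus_uid
# column of the passed DataFrame decides what is dropped.
low_coherence = study.consensus_df.filter(pl.col("chrom_coherence_mean") < 0.3)
study.consensus_filter(low_coherence)

Because the removal cascades into consensus_mapping_df, features_df, and consensus_ms2 (see the hunk below), the study's tables stay mutually consistent after the call.
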
@@ -1645,68 +1691,70 @@ def consensus_filter(self, consensus):
     else:
         self.logger.error("consensus parameter must be a DataFrame, list, or int")
         return
-
+
     if not consensus_uids_to_remove:
         self.logger.warning("No consensus UIDs provided for filtering.")
         return
-
+
     # Get feature_uids that need to be removed from features_df
     feature_uids_to_remove = []
     if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         feature_uids_to_remove = self.consensus_mapping_df.filter(
-            pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )["feature_uid"].to_list()
-
+
     # Remove consensus features from consensus_df
     self.consensus_df = self.consensus_df.filter(
-        ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+        ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
     )
-
+
     # Remove from consensus_mapping_df
     if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
         self.consensus_mapping_df = self.consensus_mapping_df.filter(
-            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )
         removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
         if removed_mapping_count > 0:
             self.logger.debug(f"Removed {removed_mapping_count} entries from consensus_mapping_df")
-
+
     # Remove corresponding features from features_df
     if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
         initial_features_count = len(self.features_df)
         self.features_df = self.features_df.filter(
-            ~pl.col("feature_uid").is_in(feature_uids_to_remove)
+            ~pl.col("feature_uid").is_in(feature_uids_to_remove),
         )
         removed_features_count = initial_features_count - len(self.features_df)
         if removed_features_count > 0:
             self.logger.debug(f"Removed {removed_features_count} entries from features_df")
-
+
     # Remove from consensus_ms2 if it exists
-    if hasattr(self,
+    if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         initial_ms2_count = len(self.consensus_ms2)
         self.consensus_ms2 = self.consensus_ms2.filter(
-            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )
         removed_ms2_count = initial_ms2_count - len(self.consensus_ms2)
         if removed_ms2_count > 0:
             self.logger.debug(f"Removed {removed_ms2_count} entries from consensus_ms2")
-
+
     removed_consensus_count = initial_consensus_count - len(self.consensus_df)
-    self.logger.info(
+    self.logger.info(
+        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}"
+    )
 
 
 def consensus_delete(self, consensus):
     """
     Delete consensus features from consensus_df based on consensus identifiers.
     This is an alias for consensus_filter for consistency with other delete methods.
-
+
     Parameters:
         consensus: Consensus features to delete. Can be:
             - polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
             - list: List of consensus_uids to delete
             - int: Single consensus_uid to delete
-
+
     Returns:
         None (modifies self.consensus_df and related DataFrames in place)
     """