masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
@@ -71,12 +71,7 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
         # fallback to pandas
         try:
             bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
-            bpc_pd = (
-                bpc_pd.groupby("rt")
-                .agg({"inty": "max"})
-                .reset_index()
-                .sort_values("rt")
-            )
+            bpc_pd = bpc_pd.groupby("rt").agg({"inty": "max"}).reset_index().sort_values("rt")
         except Exception:
             raise
 
@@ -375,8 +370,7 @@ def get_chrom(self, uids=None, samples=None):
     )
     # Pre-filter features_df to only relevant features and samples
    filtered_features = self.features_df.filter(
-        pl.col("feature_uid").is_in(relevant_feature_uids)
-        & pl.col("sample_uid").is_in(sample_uids),
+        pl.col("feature_uid").is_in(relevant_feature_uids) & pl.col("sample_uid").is_in(sample_uids),
     ).select(
         [
             "feature_uid",
@@ -489,6 +483,7 @@ def align_reset(self):
 
     # Ensure column order is maintained after with_columns operation
     from masster.study.helpers import _ensure_features_df_schema_order
+
     _ensure_features_df_schema_order(self)
     self.logger.info("Alignment reset: all feature RTs set to original_RT.")
 
@@ -530,24 +525,24 @@ def get_consensus_matrix(self, quant="chrom_area", samples=None):
     """
     Get a matrix of consensus features with samples as columns and consensus features as rows.
     Highly optimized implementation using vectorized Polars operations.
-
+
     Parameters:
         quant (str): Quantification method column name (default: "chrom_area")
         samples: Sample identifier(s) to include. Can be:
            - None: include all samples (default)
-           - int: single sample_uid
+           - int: single sample_uid
            - str: single sample_name
            - list: multiple sample_uids or sample_names
     """
     import polars as pl
-
+
     if quant not in self.features_df.columns:
         self.logger.error(f"Quantification method {quant} not found in features_df.")
         return None
 
     # Get sample_uids to include in the matrix
     sample_uids = self._get_samples_uids(samples) if samples is not None else self.samples_df["sample_uid"].to_list()
-
+
     if not sample_uids:
         self.logger.warning("No valid samples found for consensus matrix")
         return pl.DataFrame()
@@ -556,44 +551,31 @@ def get_consensus_matrix(self, quant="chrom_area", samples=None):
     features_filtered = self.features_df.filter(pl.col("sample_uid").is_in(sample_uids))
     samples_filtered = self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids))
     consensus_mapping_filtered = self.consensus_mapping_df.filter(pl.col("sample_uid").is_in(sample_uids))
-
+
     # Join operations to combine data efficiently
     # 1. Join consensus mapping with features to get quantification values
-    consensus_with_values = (
-        consensus_mapping_filtered
-        .join(features_filtered.select(["feature_uid", "sample_uid", quant]),
-              on=["feature_uid", "sample_uid"], how="left")
-        .with_columns(pl.col(quant).fill_null(0))
-    )
-
+    consensus_with_values = consensus_mapping_filtered.join(
+        features_filtered.select(["feature_uid", "sample_uid", quant]), on=["feature_uid", "sample_uid"], how="left"
+    ).with_columns(pl.col(quant).fill_null(0))
+
     # 2. Join with samples to get sample names
-    consensus_with_names = (
-        consensus_with_values
-        .join(samples_filtered.select(["sample_uid", "sample_name"]),
-              on="sample_uid", how="left")
+    consensus_with_names = consensus_with_values.join(
+        samples_filtered.select(["sample_uid", "sample_name"]), on="sample_uid", how="left"
     )
-
+
     # 3. Group by consensus_uid and sample_name, taking max value per group
-    aggregated = (
-        consensus_with_names
-        .group_by(["consensus_uid", "sample_name"])
-        .agg(pl.col(quant).max().alias("value"))
-    )
-
+    aggregated = consensus_with_names.group_by(["consensus_uid", "sample_name"]).agg(pl.col(quant).max().alias("value"))
+
     # 4. Pivot to create the matrix format
-    matrix_df = (
-        aggregated
-        .pivot(on="sample_name", index="consensus_uid", values="value")
-        .fill_null(0)
-    )
-
+    matrix_df = aggregated.pivot(on="sample_name", index="consensus_uid", values="value").fill_null(0)
+
     # 5. Round numeric columns and ensure proper types
     numeric_cols = [col for col in matrix_df.columns if col != "consensus_uid"]
     matrix_df = matrix_df.with_columns([
         pl.col("consensus_uid").cast(pl.UInt64),
-        *[pl.col(col).round(0) for col in numeric_cols]
+        *[pl.col(col).round(0) for col in numeric_cols],
     ])
-
+
     return matrix_df
 
 
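The hunk above only reflows `get_consensus_matrix`; the underlying pattern (left-join the mapping to quantified features, attach sample names, aggregate, then pivot samples into columns) is unchanged. A minimal self-contained sketch of that pattern, using illustrative toy frames rather than masster's real schema:

```python
import polars as pl

# Toy stand-ins for consensus_mapping_df, features_df, and samples_df.
mapping = pl.DataFrame({"consensus_uid": [1, 1, 2], "feature_uid": [10, 11, 12], "sample_uid": [1, 2, 1]})
features = pl.DataFrame({"feature_uid": [10, 11, 12], "sample_uid": [1, 2, 1], "chrom_area": [100.0, None, 250.0]})
samples = pl.DataFrame({"sample_uid": [1, 2], "sample_name": ["A", "B"]})

matrix = (
    mapping.join(features, on=["feature_uid", "sample_uid"], how="left")
    .with_columns(pl.col("chrom_area").fill_null(0))  # missing quantification -> 0
    .join(samples, on="sample_uid", how="left")       # attach sample names
    .group_by(["consensus_uid", "sample_name"])
    .agg(pl.col("chrom_area").max().alias("value"))   # max value per group
    .pivot(on="sample_name", index="consensus_uid", values="value")
    .fill_null(0)
)
print(matrix)  # one row per consensus_uid, one column per sample
```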
@@ -601,26 +583,26 @@ def get_gaps_matrix(self, uids=None, samples=None):
     """
     Get a matrix of gaps between consensus features with samples as columns and consensus features as rows.
     Optimized implementation that builds the gaps matrix directly without calling get_consensus_matrix().
-
+
     Parameters:
         uids: Consensus UID(s) to include. If None, includes all consensus features.
         samples: Sample identifier(s) to include. If None, includes all samples.
                  Can be int (sample_uid), str (sample_name), or list of either.
-
+
     Returns:
         pl.DataFrame: Gaps matrix with consensus_uid as first column and samples as other columns.
                       Values are 1 (detected) or 0 (missing/gap).
     """
     import polars as pl
-
+
     if self.consensus_df is None or self.consensus_df.is_empty():
         self.logger.error("No consensus found.")
         return None
-
+
     if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
         self.logger.error("No consensus mapping found.")
         return None
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features found.")
         return None
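Per the docstring above, `get_gaps_matrix` returns a 1/0 detection matrix. masster builds it by iterating the mapping rows (see the hunks below); a pivot-based alternative sketch of the same result, on toy frames with an assumed `detected` helper column, is:

```python
import polars as pl

# Toy stand-ins; not masster's internal schema.
mapping = pl.DataFrame({"consensus_uid": [1, 1, 2], "sample_uid": [1, 2, 2]})
samples = pl.DataFrame({"sample_uid": [1, 2], "sample_name": ["A", "B"]})

gaps = (
    mapping.join(samples, on="sample_uid", how="left")
    .with_columns(pl.lit(1).alias("detected"))  # every mapped pair counts as detected
    .pivot(on="sample_name", index="consensus_uid", values="detected", aggregate_function="max")
    .fill_null(0)  # combinations absent from the mapping become gaps (0)
)
print(gaps)
```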
@@ -628,7 +610,7 @@ def get_gaps_matrix(self, uids=None, samples=None):
     # Get consensus UIDs and sample UIDs to include
     uids = self._get_consensus_uids(uids)
     sample_uids = self._get_samples_uids(samples) if samples is not None else self.samples_df["sample_uid"].to_list()
-
+
     if not uids or not sample_uids:
         self.logger.warning("No valid consensus features or samples found for gaps matrix")
         return pl.DataFrame()
@@ -642,7 +624,7 @@ def get_gaps_matrix(self, uids=None, samples=None):
             # Skip filled features (gaps should only show original detections)
             if row.get("filled", False):
                 continue
-
+
             feature_uid = row["feature_uid"]
             # If feature exists and is not filled, it's detected (1)
             feature_detection[(feature_uid, sample_uid)] = 1
@@ -651,7 +633,8 @@ def get_gaps_matrix(self, uids=None, samples=None):
     matrix_dict = {}
     sample_mapping = dict(
         self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids))
-        .select(["sample_uid", "sample_name"]).iter_rows(),
+        .select(["sample_uid", "sample_name"])
+        .iter_rows(),
     )
 
     for row in self.consensus_mapping_df.iter_rows(named=True):
@@ -732,7 +715,7 @@ def get_gaps_stats(self, uids=None):
 def get_consensus_matches(self, uids=None, filled=True):
     """
     Get feature matches for consensus UIDs with optimized join operation.
-
+
     Parameters:
         uids: Consensus UID(s) to get matches for. Can be:
             - None: get matches for all consensus features
@@ -740,50 +723,47 @@ def get_consensus_matches(self, uids=None, filled=True):
             - list: multiple consensus UIDs
         filled (bool): Whether to include filled rows (True) or exclude them (False).
                        Default is True to maintain backward compatibility.
-
+
     Returns:
         pl.DataFrame: Feature matches for the specified consensus UIDs
     """
     # Handle single int by converting to list
     if isinstance(uids, int):
         uids = [uids]
-
+
     uids = self._get_consensus_uids(uids)
-
+
     if not uids:
         return pl.DataFrame()
-
+
     # Early validation checks
     if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
         self.logger.warning("No consensus mapping data available")
         return pl.DataFrame()
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No feature data available")
         return pl.DataFrame()
-
+
     # Build the query with optional filled filter
     features_query = self.features_df.lazy()
-
+
     # Apply filled filter if specified
     if not filled and "filled" in self.features_df.columns:
         features_query = features_query.filter(~pl.col("filled"))
-
+
     # Optimized single-pass operation using join instead of two separate filters
     # This avoids creating intermediate Python lists and leverages Polars' optimized joins
     matches = (
-        features_query
-        .join(
-            self.consensus_mapping_df
-            .lazy()
+        features_query.join(
+            self.consensus_mapping_df.lazy()
             .filter(pl.col("consensus_uid").is_in(uids))
             .select("feature_uid"),  # Only select what we need for the join
             on="feature_uid",
-            how="inner"
-        )
-        .collect(streaming=True)  # Use streaming for memory efficiency with large datasets
+            how="inner",
+        ).collect(streaming=True)  # Use streaming for memory efficiency with large datasets
     )
-
+
     return matches
 
 
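The reflowed `get_consensus_matches` keeps the lazy inner-join strategy its comments describe: push the consensus filter into the mapping frame, join on `feature_uid`, and collect with the streaming engine. A minimal sketch of that strategy on toy frames (newer Polars releases prefer `collect(engine="streaming")` over the `streaming=True` flag used here):

```python
import polars as pl

features = pl.DataFrame({"feature_uid": [10, 11, 12], "inty": [1.0, 2.0, 3.0]}).lazy()
mapping = pl.DataFrame({"consensus_uid": [1, 2], "feature_uid": [10, 12]}).lazy()

# Keep only features whose feature_uid maps to the requested consensus UIDs.
matches = features.join(
    mapping.filter(pl.col("consensus_uid").is_in([1])).select("feature_uid"),
    on="feature_uid",
    how="inner",
).collect(streaming=True)  # streaming keeps peak memory low on large inputs
print(matches)
```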
@@ -795,34 +775,34 @@ def get_consensus_matches(self, uids=None, filled=True):
 def consensus_reset(self):
     """
     Reset consensus data by clearing consensus DataFrames and removing filled features.
-
+
     This function:
     1. Sets consensus_df, consensus_ms2, consensus_mapping_df, id_df to empty pl.DataFrame()
     2. Removes all filled features from features_df
     3. Removes relevant operations from history (merge, integrate, find_ms2, fill, identify)
     4. Logs the number of features removed
-
+
     This effectively undoes the merge() operation and any gap-filling.
     """
     self.logger.debug("Resetting consensus data.")
-
+
     # Reset consensus DataFrames to empty
     self.consensus_df = pl.DataFrame()
-    self.consensus_ms2 = pl.DataFrame()
+    self.consensus_ms2 = pl.DataFrame()
     self.consensus_mapping_df = pl.DataFrame()
     self.id_df = pl.DataFrame()
-
+
     # Remove filled features from features_df
     if self.features_df is None:
         self.logger.warning("No features found.")
         return
-
+
     l1 = len(self.features_df)
-
+
     # Filter out filled features (keep only non-filled features)
     if "filled" in self.features_df.columns:
         self.features_df = self.features_df.filter(~pl.col("filled") | pl.col("filled").is_null())
-
+
     # Remove consensus-related operations from history
     keys_to_remove = ["merge", "integrate", "integrate_chrom", "find_ms2", "fill", "fill_single", "identify"]
     history_removed_count = 0
@@ -832,7 +812,7 @@ def consensus_reset(self):
             del self.history[key]
             history_removed_count += 1
             self.logger.debug(f"Removed '{key}' from history")
-
+
     removed_count = l1 - len(self.features_df)
     self.logger.info(
         f"Reset consensus data. Consensus DataFrames cleared. Features removed: {removed_count}. History entries removed: {history_removed_count}",
@@ -1049,13 +1029,13 @@ def get_orphans(self):
 def get_sample_stats(self):
     """
     Get statistics for all samples in the study.
-
+
     Returns:
         pl.DataFrame: DataFrame with the following columns:
            - sample_uid: Sample unique identifier
            - num_features: Total number of features per sample
            - num_ms1: Number of MS1 features per sample
-           - num_ms2: Number of MS2 features per sample
+           - num_ms2: Number of MS2 features per sample
            - num_linked_ms1: Number of non-filled features present in consensus_mapping_df
            - num_orphans: Number of non-filled features not present in consensus_mapping_df
            - max_rt_correction: Maximum RT correction applied
@@ -1065,19 +1045,19 @@ def get_sample_stats(self):
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.warning("No samples found in study.")
         return pl.DataFrame()
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return pl.DataFrame()
-
+
     # Get base sample information
     sample_uids = self.samples_df["sample_uid"].to_list()
     stats_data = []
-
+
     for sample_uid in sample_uids:
         # Filter features for this sample
         sample_features = self.features_df.filter(pl.col("sample_uid") == sample_uid)
-
+
         if sample_features.is_empty():
             # Sample has no features
             stats_data.append({
@@ -1089,66 +1069,60 @@ def get_sample_stats(self):
                 "num_orphans": 0,
                 "max_rt_correction": None,
                 "average_rt_correction": None,
-                "num_linked_ms2": 0
+                "num_linked_ms2": 0,
             })
             continue
-
+
         # Basic feature counts
         num_features = len(sample_features)
-
+
         # Count MS1 and MS2 features
         # Assume features with ms_level=1 or missing ms_level are MS1
-        num_ms1 = sample_features.filter(
-            pl.col("ms_level").is_null() | (pl.col("ms_level") == 1)
-        ).height if "ms_level" in sample_features.columns else 0
-
-        num_ms2 = sample_features.filter(
-            pl.col("ms_level") == 2
-        ).height if "ms_level" in sample_features.columns else 0
-
+        num_ms1 = (
+            sample_features.filter(pl.col("ms_level").is_null() | (pl.col("ms_level") == 1)).height
+            if "ms_level" in sample_features.columns
+            else num_features
+        )
+
+        num_ms2 = sample_features.filter(pl.col("ms_level") == 2).height if "ms_level" in sample_features.columns else 0
+
         # Get non-filled features for this sample
         if "filled" in sample_features.columns:
             non_filled_features = sample_features.filter(~pl.col("filled") | pl.col("filled").is_null())
         else:
             non_filled_features = sample_features
-
+
         # Count linked MS1 features (non-filled and present in consensus_mapping_df)
         num_linked_ms1 = 0
         if not self.consensus_mapping_df.is_empty() and not non_filled_features.is_empty():
-            linked_feature_uids = self.consensus_mapping_df.filter(
-                pl.col("sample_uid") == sample_uid
-            )["feature_uid"].to_list()
-
-            num_linked_ms1 = non_filled_features.filter(
-                pl.col("feature_uid").is_in(linked_feature_uids)
-            ).height
-
+            linked_feature_uids = self.consensus_mapping_df.filter(pl.col("sample_uid") == sample_uid)[
+                "feature_uid"
+            ].to_list()
+
+            num_linked_ms1 = non_filled_features.filter(pl.col("feature_uid").is_in(linked_feature_uids)).height
+
         # Count orphan features (non-filled and NOT present in consensus_mapping_df)
         num_orphans = len(non_filled_features) - num_linked_ms1
-
+
         # Calculate RT correction statistics
         max_rt_correction = None
         average_rt_correction = None
-
+
         if "rt" in sample_features.columns and "rt_original" in sample_features.columns:
             rt_corrections = sample_features.with_columns(
                 (pl.col("rt") - pl.col("rt_original")).alias("rt_correction")
-            ).filter(
-                pl.col("rt_correction").is_not_null()
-            )["rt_correction"]
-
+            ).filter(pl.col("rt_correction").is_not_null())["rt_correction"]
+
             if not rt_corrections.is_empty():
                 max_rt_correction = rt_corrections.abs().max()
                 average_rt_correction = rt_corrections.abs().mean()
-
+
         # Count linked MS2 spectra from consensus_ms2_df
         num_linked_ms2 = 0
-        if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
+        if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
             if "sample_uid" in self.consensus_ms2.columns:
-                num_linked_ms2 = self.consensus_ms2.filter(
-                    pl.col("sample_uid") == sample_uid
-                ).height
-
+                num_linked_ms2 = self.consensus_ms2.filter(pl.col("sample_uid") == sample_uid).height
+
         stats_data.append({
             "sample_uid": sample_uid,
             "num_features": num_features,
@@ -1158,9 +1132,9 @@ def get_sample_stats(self):
             "num_orphans": num_orphans,
             "max_rt_correction": max_rt_correction,
             "average_rt_correction": average_rt_correction,
-            "num_linked_ms2": num_linked_ms2
+            "num_linked_ms2": num_linked_ms2,
         })
-
+
     # Create DataFrame with proper schema
     return pl.DataFrame(
         stats_data,
@@ -1173,15 +1147,15 @@ def get_sample_stats(self):
             "num_orphans": pl.UInt32,
             "max_rt_correction": pl.Float64,
             "average_rt_correction": pl.Float64,
-            "num_linked_ms2": pl.UInt32
-        }
+            "num_linked_ms2": pl.UInt32,
+        },
     )
 
 
 def get_consensus_stats(self):
     """
     Get key performance indicators for each consensus feature.
-
+
     Returns:
         pl.DataFrame: DataFrame with the following columns:
             - consensus_uid: Consensus unique identifier
@@ -1203,7 +1177,7 @@ def get_consensus_stats(self):
     """
     import polars as pl
     import numpy as np
-
+
     # Check if consensus_df exists and has data
     if self.consensus_df is None or self.consensus_df.is_empty():
         self.logger.error("No consensus data available. Run merge/find_consensus first.")
@@ -1215,134 +1189,146 @@ def get_consensus_stats(self):
     # Define specific columns to include in the exact order requested
     desired_columns = [
         "consensus_uid",  # Include consensus_uid for identification
-        "rt",
-        "rt_delta_mean",
-        "mz",
+        "rt",
+        "rt_delta_mean",
+        "mz",
         "mz_range",  # mz_max-mz_min (will be calculated)
         "log10_inty_mean",  # log10(inty_mean) (will be calculated)
-        "number_samples",
-        "number_ms2",
-        "charge_mean",
-        "quality",
-        "chrom_coherence_mean",
-        "chrom_height_scaled_mean",
-        "chrom_prominence_scaled_mean"
+        "number_samples",
+        "number_ms2",
+        "charge_mean",
+        "quality",
+        "chrom_coherence_mean",
+        "chrom_height_scaled_mean",
+        "chrom_prominence_scaled_mean",
     ]
-
+
     # Calculate derived columns if they don't exist
     if "mz_range" not in data_df.columns and "mz_max" in data_df.columns and "mz_min" in data_df.columns:
         data_df = data_df.with_columns((pl.col("mz_max") - pl.col("mz_min")).alias("mz_range"))
-
+
     if "log10_inty_mean" not in data_df.columns and "inty_mean" in data_df.columns:
         data_df = data_df.with_columns(pl.col("inty_mean").log10().alias("log10_inty_mean"))
-
+
     # Filter to only include columns that exist in the dataframe, preserving order
     available_columns = [col for col in desired_columns if col in data_df.columns]
-
+
     if len(available_columns) <= 1:  # Only consensus_uid would be 1
-        self.logger.error(f"None of the requested consensus statistics columns were found. Available columns: {list(data_df.columns)}")
+        self.logger.error(
+            f"None of the requested consensus statistics columns were found. Available columns: {list(data_df.columns)}"
+        )
         return pl.DataFrame()
 
     self.logger.debug(f"Creating consensus stats DataFrame with {len(available_columns)} columns: {available_columns}")
 
     # Get base result DataFrame with selected columns
     result_df = data_df.select(available_columns)
-
+
     # Add QC-related columns
     try:
         # Identify QC and blank samples based on naming patterns
         all_sample_names = self.samples_df["sample_name"].to_list()
-
+
         # Define patterns for QC and blank identification
         qc_patterns = ["qc", "QC", "quality", "Quality", "control", "Control"]
         blank_patterns = ["blank", "Blank", "BLANK", "blk", "BLK"]
-
+
         # Get QC and blank sample names
         qc_sample_names = [name for name in all_sample_names if any(pattern in name for pattern in qc_patterns)]
         blank_sample_names = [name for name in all_sample_names if any(pattern in name for pattern in blank_patterns)]
-
+
         self.logger.debug(f"Found {len(qc_sample_names)} QC samples and {len(blank_sample_names)} blank samples")
-
+
         # Initialize QC columns with null values
         qc_ratio_values = [None] * len(result_df)
-        qc_cv_values = [None] * len(result_df)
+        qc_cv_values = [None] * len(result_df)
         qc_to_blank_values = [None] * len(result_df)
-
+
         if len(qc_sample_names) > 0:
             # Calculate QC metrics using optimized approach - get only QC+blank data
             self.logger.debug("Fetching optimized consensus matrices for QC calculations...")
-
+
             # Get QC consensus matrix (only QC samples)
             qc_consensus_matrix = self.get_consensus_matrix(samples=qc_sample_names)
-
+
             # Get blank consensus matrix (only blank samples) if blanks exist
             blank_consensus_matrix = None
             if len(blank_sample_names) > 0:
                 blank_consensus_matrix = self.get_consensus_matrix(samples=blank_sample_names)
-
+
             if qc_consensus_matrix is not None and not qc_consensus_matrix.is_empty():
                 available_qc_cols = [col for col in qc_consensus_matrix.columns if col != "consensus_uid"]
                 self.logger.debug(f"Found {len(available_qc_cols)} QC columns in optimized QC matrix")
-
+
                 # 2. QC CV: Calculate CV for QC samples
                 if len(available_qc_cols) > 0:
                     self.logger.debug("Calculating QC CV...")
                     try:
                         # Calculate CV (coefficient of variation) for QC samples
                         qc_data = qc_consensus_matrix.select(["consensus_uid"] + available_qc_cols)
-
+
                         # Calculate mean and std for each row across QC columns
-                        qc_stats = qc_data.with_columns([
-                            pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
-                        ]).with_columns([
-                            pl.col("qc_values").list.mean().alias("qc_mean"),
-                            pl.col("qc_values").list.std().alias("qc_std")
-                        ]).with_columns(
-                            # CV = std / mean (NOT multiplied by 100 to keep between 0-1)
-                            pl.when(pl.col("qc_mean") > 0)
-                            .then(pl.col("qc_std") / pl.col("qc_mean"))
-                            .otherwise(None)
-                            .alias("qc_cv")
+                        qc_stats = (
+                            qc_data.with_columns([
+                                pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
+                            ])
+                            .with_columns([
+                                pl.col("qc_values").list.mean().alias("qc_mean"),
+                                pl.col("qc_values").list.std().alias("qc_std"),
+                            ])
+                            .with_columns(
+                                # CV = std / mean (NOT multiplied by 100 to keep between 0-1)
+                                pl.when(pl.col("qc_mean") > 0)
+                                .then(pl.col("qc_std") / pl.col("qc_mean"))
+                                .otherwise(None)
+                                .alias("qc_cv")
+                            )
                         )
-
+
                         # Join with result DataFrame
                         result_df = result_df.join(
-                            qc_stats.select(["consensus_uid", "qc_cv"]),
-                            on="consensus_uid",
-                            how="left"
+                            qc_stats.select(["consensus_uid", "qc_cv"]), on="consensus_uid", how="left"
                         )
                         qc_cv_values = None  # Indicate we successfully added the column
-
+
                     except Exception as e:
                         self.logger.debug(f"Could not calculate QC CV: {e}")
-
+
                 # 3. QC to blank ratio: Compare average QC to average blank intensity
-                if len(available_qc_cols) > 0 and blank_consensus_matrix is not None and not blank_consensus_matrix.is_empty():
+                if (
+                    len(available_qc_cols) > 0
+                    and blank_consensus_matrix is not None
+                    and not blank_consensus_matrix.is_empty()
+                ):
                     available_blank_cols = [col for col in blank_consensus_matrix.columns if col != "consensus_uid"]
-                    self.logger.debug(f"Calculating QC to blank ratio with {len(available_blank_cols)} blank columns...")
-
+                    self.logger.debug(
+                        f"Calculating QC to blank ratio with {len(available_blank_cols)} blank columns..."
+                    )
+
                     if len(available_blank_cols) > 0:
                         try:
                             # Calculate average intensity for QC samples
-                            qc_averages = qc_data.with_columns([
-                                pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
-                            ]).with_columns(
-                                pl.col("qc_values").list.mean().alias("qc_avg")
-                            ).select(["consensus_uid", "qc_avg"])
-
+                            qc_averages = (
+                                qc_data.with_columns([
+                                    pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
+                                ])
+                                .with_columns(pl.col("qc_values").list.mean().alias("qc_avg"))
+                                .select(["consensus_uid", "qc_avg"])
+                            )
+
                             # Calculate average intensity for blank samples
                             blank_data = blank_consensus_matrix.select(["consensus_uid"] + available_blank_cols)
-                            blank_averages = blank_data.with_columns([
-                                pl.concat_list([pl.col(col) for col in available_blank_cols]).alias("blank_values")
-                            ]).with_columns(
-                                pl.col("blank_values").list.mean().alias("blank_avg")
-                            ).select(["consensus_uid", "blank_avg"])
-
+                            blank_averages = (
+                                blank_data.with_columns([
+                                    pl.concat_list([pl.col(col) for col in available_blank_cols]).alias("blank_values")
+                                ])
+                                .with_columns(pl.col("blank_values").list.mean().alias("blank_avg"))
+                                .select(["consensus_uid", "blank_avg"])
+                            )
+
                             # Join QC and blank averages and calculate ratio
                             qc_blank_ratios = qc_averages.join(
-                                blank_averages,
-                                on="consensus_uid",
-                                how="left"
+                                blank_averages, on="consensus_uid", how="left"
                             ).with_columns(
                                 # Ratio = qc_avg / blank_avg, but only where blank_avg > 0
                                 pl.when(pl.col("blank_avg") > 0)
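The QC CV block reconstructed above computes a row-wise coefficient of variation by packing the per-sample columns into a list column and taking `list.mean()`/`list.std()` per row. A minimal sketch of the same idea on a toy frame (column names assumed, not masster's schema):

```python
import polars as pl

qc = pl.DataFrame({"consensus_uid": [1, 2], "QC1": [100.0, 10.0], "QC2": [110.0, 30.0]})
qc_cols = ["QC1", "QC2"]

qc_cv = (
    qc.with_columns(pl.concat_list([pl.col(c) for c in qc_cols]).alias("vals"))
    .with_columns(
        pl.col("vals").list.mean().alias("mean"),  # per-row mean across QC columns
        pl.col("vals").list.std().alias("std"),    # per-row std across QC columns
    )
    .with_columns(
        # CV kept as a 0-1 fraction, null where the mean is not positive
        pl.when(pl.col("mean") > 0).then(pl.col("std") / pl.col("mean")).otherwise(None).alias("qc_cv")
    )
)
print(qc_cv.select(["consensus_uid", "qc_cv"]))
```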
@@ -1350,57 +1336,53 @@ def get_consensus_stats(self):
                                 .otherwise(None)
                                 .alias("qc_to_blank")
                             )
-
+
                             # Join with result DataFrame
                             result_df = result_df.join(
-                                qc_blank_ratios.select(["consensus_uid", "qc_to_blank"]),
-                                on="consensus_uid",
-                                how="left"
+                                qc_blank_ratios.select(["consensus_uid", "qc_to_blank"]), on="consensus_uid", how="left"
                             )
                             qc_to_blank_values = None  # Indicate we successfully added the column
-
+
                         except Exception as e:
                             self.logger.debug(f"Could not calculate QC to blank ratio: {e}")
-
+
             # 1. QC ratio: Get optimized gaps matrix for QC samples only
             self.logger.debug("Calculating QC detection ratio with optimized gaps matrix...")
             try:
                 # Use optimized get_gaps_matrix with QC samples filtering for faster performance
                 qc_gaps_matrix = self.get_gaps_matrix(samples=qc_sample_names)
-
+
                 if qc_gaps_matrix is not None and not qc_gaps_matrix.is_empty():
                     # Get QC columns (should be all columns except consensus_uid since we filtered)
                     available_qc_cols_gaps = [col for col in qc_gaps_matrix.columns if col != "consensus_uid"]
                     self.logger.debug(f"Found {len(available_qc_cols_gaps)} QC columns in optimized gaps matrix")
-
+
                     if len(available_qc_cols_gaps) > 0:
                         # Calculate QC detection ratio for each consensus feature
                         qc_detection = qc_gaps_matrix.select(["consensus_uid"] + available_qc_cols_gaps)
-
+
                         # Data should already be properly typed from get_gaps_matrix, but ensure consistency
                         for col in available_qc_cols_gaps:
-                            qc_detection = qc_detection.with_columns(
-                                pl.col(col).fill_null(0).cast(pl.Int8).alias(col)
-                            )
-
+                            qc_detection = qc_detection.with_columns(pl.col(col).fill_null(0).cast(pl.Int8).alias(col))
+
                         # Calculate ratio (sum of detections / number of QC samples)
                         qc_ratios = qc_detection.with_columns(
                             pl.concat_list([pl.col(col) for col in available_qc_cols_gaps]).alias("qc_detections")
                         ).with_columns(
-                            (pl.col("qc_detections").list.sum().cast(pl.Float64) / len(available_qc_cols_gaps)).alias("qc_ratio")
+                            (pl.col("qc_detections").list.sum().cast(pl.Float64) / len(available_qc_cols_gaps)).alias(
+                                "qc_ratio"
+                            )
                         )
-
+
                         # Join with result DataFrame
                         result_df = result_df.join(
-                            qc_ratios.select(["consensus_uid", "qc_ratio"]),
-                            on="consensus_uid",
-                            how="left"
+                            qc_ratios.select(["consensus_uid", "qc_ratio"]), on="consensus_uid", how="left"
                         )
                         qc_ratio_values = None  # Indicate we successfully added the column
-
+
             except Exception as e:
                 self.logger.debug(f"Could not calculate QC ratio: {e}")
-
+
             # Add null columns for any QC metrics that couldn't be calculated
             # Add null columns for any QC metrics that couldn't be calculated
             if qc_ratio_values is not None:
@@ -1409,16 +1391,16 @@ def get_consensus_stats(self):
             result_df = result_df.with_columns(pl.lit(None, dtype=pl.Float64).alias("qc_cv"))
         if qc_to_blank_values is not None:
             result_df = result_df.with_columns(pl.lit(None, dtype=pl.Float64).alias("qc_to_blank"))
-
+
     except Exception as e:
         self.logger.warning(f"Error calculating QC metrics: {e}")
         # Add null columns if QC calculation fails
         result_df = result_df.with_columns([
             pl.lit(None, dtype=pl.Float64).alias("qc_ratio"),
             pl.lit(None, dtype=pl.Float64).alias("qc_cv"),
-            pl.lit(None, dtype=pl.Float64).alias("qc_to_blank")
+            pl.lit(None, dtype=pl.Float64).alias("qc_to_blank"),
         ])
-
+
     return result_df
 
 
@@ -1565,9 +1547,7 @@ def restore_features(self, samples=None, maps=False):
             continue
 
         # Check which columns are actually available in the sample
-        available_columns = [
-            col for col in columns_to_update if col in sample.features_df.columns
-        ]
+        available_columns = [col for col in columns_to_update if col in sample.features_df.columns]
         if not available_columns:
             self.logger.debug(f"No target columns found in sample {sample_name}")
             continue
@@ -1590,9 +1570,7 @@ def restore_features(self, samples=None, maps=False):
                 original_dtype = self.features_df[col].dtype
 
                 # Update the specific row and column, preserving dtype
-                mask = (pl.col("feature_uid") == feature_uid) & (
-                    pl.col("sample_uid") == sample_uid
-                )
+                mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
 
                 # Handle object columns (like Chromatogram) differently
                 if original_dtype == pl.Object:
@@ -1730,9 +1708,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                 feature_uid = study_feature_mapping[key]
 
                 # Update only the chrom column
-                mask = (pl.col("feature_uid") == feature_uid) & (
-                    pl.col("sample_uid") == sample_uid
-                )
+                mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
                 self.features_df = self.features_df.with_columns(
                     pl.when(mask)
                     .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
@@ -1807,11 +1783,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             sample = Sample(log_level="ERROR")
             sample._load_sample5(sample_path, map=False)
 
-            if (
-                not hasattr(sample, "ms1_df")
-                or sample.ms1_df is None
-                or sample.ms1_df.is_empty()
-            ):
+            if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
                 continue
 
             # Process each missing feature
@@ -1920,9 +1892,7 @@ def compress_ms2(self, max_replicates=5):
     # Handle None values by treating them as 0
     self.consensus_ms2 = self.consensus_ms2.with_columns(
         [
-            (
-                pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)
-            ).alias("ranking_score"),
+            (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)).alias("ranking_score"),
         ],
     )
@@ -2259,57 +2229,86 @@ def features_select(
         return pl.DataFrame()
 
     # Early return optimization
-    filter_params = [
-        mz, rt, inty, sample_uid, sample_name, consensus_uid, feature_uid,
-        filled, quality, chrom_coherence, chrom_prominence,
-        chrom_prominence_scaled, chrom_height_scaled]
+    filter_params = [
+        mz,
+        rt,
+        inty,
+        sample_uid,
+        sample_name,
+        consensus_uid,
+        feature_uid,
+        filled,
+        quality,
+        chrom_coherence,
+        chrom_prominence,
+        chrom_prominence_scaled,
+        chrom_height_scaled,
+    ]
+
     if all(param is None for param in filter_params):
         return self.features_df.clone()
 
     import time
+
     start_time = time.perf_counter()
     initial_count = len(self.features_df)
 
     # Build optimized filter expression
     filter_expr = _build_optimized_filter_expression(
-        self,
-        mz, rt, inty, sample_uid, sample_name, consensus_uid, feature_uid, filled,
-        quality, chrom_coherence, chrom_prominence, chrom_prominence_scaled, chrom_height_scaled,
+        self,
+        mz,
+        rt,
+        inty,
+        sample_uid,
+        sample_name,
+        consensus_uid,
+        feature_uid,
+        filled,
+        quality,
+        chrom_coherence,
+        chrom_prominence,
+        chrom_prominence_scaled,
+        chrom_height_scaled,
     )
-
+
     if filter_expr is None:
         return pl.DataFrame()
-
+
     # Apply filter with optimized execution strategy
     if use_lazy_streaming and initial_count > chunk_size:
         result = _apply_chunked_select(self, filter_expr, chunk_size)
     else:
-        result = (
-            self.features_df
-            .lazy()
-            .filter(filter_expr)
-            .collect(streaming=use_lazy_streaming)
-        )
-
+        result = self.features_df.lazy().filter(filter_expr).collect(streaming=use_lazy_streaming)
+
     # Log performance
     elapsed_time = time.perf_counter() - start_time
     final_count = len(result)
     removed_count = initial_count - final_count
-
+
     if final_count == 0:
         self.logger.warning("No features remaining after applying selection criteria.")
     else:
-        self.logger.debug(
-            f"Selected features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s"
-        )
-
+        self.logger.debug(f"Selected features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s")
+
     return result
 
 
-def _build_optimized_filter_expression(
-    self, mz, rt, inty, sample_uid, sample_name, consensus_uid, feature_uid, filled,
-    quality, chrom_coherence, chrom_prominence, chrom_prominence_scaled, chrom_height_scaled
-):
+def _build_optimized_filter_expression(
+    self,
+    mz,
+    rt,
+    inty,
+    sample_uid,
+    sample_name,
+    consensus_uid,
+    feature_uid,
+    filled,
+    quality,
+    chrom_coherence,
+    chrom_prominence,
+    chrom_prominence_scaled,
+    chrom_height_scaled,
+):
     """
     Build optimized filter expression with efficient column checking and expression combining.
     """
@@ -2317,7 +2316,7 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
     available_columns = set(self.features_df.columns)
     filter_conditions = []
     warnings = []
-
+
     # Build filter conditions with optimized expressions
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
@@ -2422,7 +2421,9 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
     if "chrom_coherence" in available_columns:
         if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
             min_coherence, max_coherence = chrom_coherence
-            filter_conditions.append(pl.col("chrom_coherence").is_between(min_coherence, max_coherence, closed="both"))
+            filter_conditions.append(
+                pl.col("chrom_coherence").is_between(min_coherence, max_coherence, closed="both")
+            )
         else:
             filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
     else:
@@ -2433,7 +2434,9 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
     if "chrom_prominence" in available_columns:
         if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
             min_prominence, max_prominence = chrom_prominence
-            filter_conditions.append(pl.col("chrom_prominence").is_between(min_prominence, max_prominence, closed="both"))
+            filter_conditions.append(
+                pl.col("chrom_prominence").is_between(min_prominence, max_prominence, closed="both")
+            )
         else:
             filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
     else:
@@ -2445,7 +2448,10 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
         if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
             min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
             filter_conditions.append(
-                pl.col("chrom_prominence_scaled").is_between(min_prominence_scaled, max_prominence_scaled, closed="both"))
+                pl.col("chrom_prominence_scaled").is_between(
+                    min_prominence_scaled, max_prominence_scaled, closed="both"
+                )
+            )
         else:
             filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
     else:
@@ -2457,7 +2463,8 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
         if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
             min_height_scaled, max_height_scaled = chrom_height_scaled
             filter_conditions.append(
-                pl.col("chrom_height_scaled").is_between(min_height_scaled, max_height_scaled, closed="both"))
+                pl.col("chrom_height_scaled").is_between(min_height_scaled, max_height_scaled, closed="both")
+            )
         else:
             filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
     else:
@@ -2470,12 +2477,13 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
     # Combine all conditions efficiently
     if not filter_conditions:
         return None
-
+
     # Use reduce for efficient expression combination
     from functools import reduce
     import operator
+
     combined_expr = reduce(operator.and_, filter_conditions)
-
+
     return combined_expr
 
 
@@ -2485,30 +2493,27 @@ def _apply_chunked_select(self, filter_expr, chunk_size: int):
     """
     total_features = len(self.features_df)
     num_chunks = (total_features + chunk_size - 1) // chunk_size
-
+
     self.logger.debug(f"Using chunked select with {num_chunks} chunks")
-
+
     filtered_chunks = []
     for i in range(num_chunks):
         start_idx = i * chunk_size
         end_idx = min((i + 1) * chunk_size, total_features)
-
+
         chunk_result = (
-            self.features_df
-            .lazy()
-            .slice(start_idx, end_idx - start_idx)
-            .filter(filter_expr)
-            .collect(streaming=True)
+            self.features_df.lazy().slice(start_idx, end_idx - start_idx).filter(filter_expr).collect(streaming=True)
         )
-
+
         if not chunk_result.is_empty():
             filtered_chunks.append(chunk_result)
-
+
     if filtered_chunks:
         return pl.concat(filtered_chunks, how="vertical")
     else:
         return pl.DataFrame()
 
+
 '''
 def features_select_benchmarked(
     self,
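`_apply_chunked_select` slices the frame into fixed-size chunks and filters each one lazily, so peak memory stays bounded on very large studies. A minimal sketch of that loop, with an illustrative predicate and chunk size:

```python
import polars as pl

df = pl.DataFrame({"feature_uid": range(10), "inty": [float(i) for i in range(10)]})
filter_expr = pl.col("inty") >= 5.0  # illustrative predicate
chunk_size = 4

chunks = []
for start in range(0, len(df), chunk_size):
    # slice -> filter -> collect per chunk keeps memory use proportional to chunk_size
    part = df.lazy().slice(start, chunk_size).filter(filter_expr).collect(streaming=True)
    if not part.is_empty():
        chunks.append(part)

result = pl.concat(chunks, how="vertical") if chunks else pl.DataFrame()
print(result)
```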
@@ -2604,19 +2609,14 @@ def monkey_patch_study():
     print("Patched Study.features_select with consolidated optimized implementation")
 '''
 
-def features_filter(
-    self,
-    features,
-    chunk_size: int = 50000,
-    use_index_based: bool = True,
-    parallel: bool = True
-):
+
+def features_filter(self, features, chunk_size: int = 50000, use_index_based: bool = True, parallel: bool = True):
     """
     Filter features_df by keeping only features that match the given criteria.
     This keeps only the specified features and removes all others.
 
     FULLY OPTIMIZED VERSION: Index-based filtering, chunked processing, and lazy evaluation.
-
+
     Performance improvements:
     - Index-based filtering using sorted arrays (O(n log n) instead of O(n²))
     - Chunked processing to handle large datasets without memory issues
@@ -2646,26 +2646,24 @@ def features_filter(
         return
 
     initial_count = len(self.features_df)
-
+
     # Extract feature UIDs efficiently
     feature_uids_to_keep = _extract_feature_uids_optimized(self, features)
     if not feature_uids_to_keep:
         self.logger.warning("No feature UIDs provided for filtering.")
         return
-
+
     # Choose optimal filtering strategy based on data size and characteristics
     if use_index_based and len(self.features_df) > 10000:
         _apply_index_based_filter(self, feature_uids_to_keep, chunk_size, parallel)
     else:
         _apply_standard_filter(self, feature_uids_to_keep)
-
+
     # Calculate results and log performance
     final_count = len(self.features_df)
     removed_count = initial_count - final_count
-
-    self.logger.info(
-        f"Filtered features. Kept: {final_count:,}. Removed: {removed_count:,}."
-    )
+
+    self.logger.info(f"Filtered features. Kept: {final_count:,}. Removed: {removed_count:,}.")
 
 
 def _extract_feature_uids_optimized(self, features):
@@ -2679,13 +2677,13 @@ def _extract_feature_uids_optimized(self, features):
             return set()
         # Use polars native operations for efficiency
         return set(features.select("feature_uid").to_series().to_list())
-
+
     elif isinstance(features, (list, tuple)):
         return set(features)  # Convert to set immediately for O(1) lookups
-
+
     elif isinstance(features, int):
         return {features}
-
+
     else:
         self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
         return set()
@@ -2694,7 +2692,7 @@ def _extract_feature_uids_optimized(self, features):
 def _apply_index_based_filter(self, feature_uids_to_keep, chunk_size: int, parallel: bool):
     """
     Apply index-based filtering with chunked processing and lazy evaluation.
-
+
     This method uses:
     1. Sorted arrays and binary search for O(log n) lookups
     2. Chunked processing to manage memory usage
@@ -2702,9 +2700,9 @@ def _apply_index_based_filter(self, feature_uids_to_keep, chunk_size: int, paral
     4. Hash-based set operations for optimal performance
     """
     self.logger.debug(f"Using index-based filtering with chunks of {chunk_size:,}")
-
+
     total_features = len(self.features_df)
-
+
     if total_features <= chunk_size:
         # Small dataset - process in single chunk with optimized operations
         _filter_single_chunk_optimized(self, feature_uids_to_keep)
@@ -2720,30 +2718,21 @@ def _filter_single_chunk_optimized(self, feature_uids_to_keep):
     """
     # Create boolean mask using hash-based set lookup (O(1) per element)
     filter_expr = pl.col("feature_uid").is_in(list(feature_uids_to_keep))
-
+
     # Apply filter using lazy evaluation with optimized execution
     self.features_df = (
-        self.features_df
-        .lazy()
-        .filter(filter_expr)
-        .collect(streaming=True)  # Use streaming for memory efficiency
+        self.features_df.lazy().filter(filter_expr).collect(streaming=True)  # Use streaming for memory efficiency
    )
-
+
     # Apply same filter to consensus_mapping_df if it exists
-    if (self.consensus_mapping_df is not None
-            and not self.consensus_mapping_df.is_empty()):
-        self.consensus_mapping_df = (
-            self.consensus_mapping_df
-            .lazy()
-            .filter(filter_expr)
-            .collect(streaming=True)
-        )
+    if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+        self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_expr).collect(streaming=True)
 
 
 def _filter_chunked_lazy(self, feature_uids_to_keep, chunk_size: int, parallel: bool):
     """
     Chunked processing with lazy evaluation for large datasets.
-
+
     This approach:
     1. Processes data in manageable chunks to control memory usage
     2. Uses lazy evaluation to optimize query execution
@@ -2752,35 +2741,34 @@ def _filter_chunked_lazy(self, feature_uids_to_keep, chunk_size: int, parallel:
     """
     total_features = len(self.features_df)
     num_chunks = (total_features + chunk_size - 1) // chunk_size
-
+
     self.logger.debug(f"Processing {total_features:,} features in {num_chunks} chunks")
-
+
     # Process features_df in chunks using lazy evaluation
     filtered_chunks = []
-
+
     for i in range(num_chunks):
         start_idx = i * chunk_size
         end_idx = min((i + 1) * chunk_size, total_features)
-
+
         # Create lazy query for this chunk
         chunk_query = (
-            self.features_df
-            .lazy()
+            self.features_df.lazy()
             .slice(start_idx, end_idx - start_idx)
             .filter(pl.col("feature_uid").is_in(list(feature_uids_to_keep)))
         )
-
+
         # Collect chunk with streaming for memory efficiency
         chunk_result = chunk_query.collect(streaming=True)
         if not chunk_result.is_empty():
             filtered_chunks.append(chunk_result)
-
+
     # Combine all filtered chunks efficiently
     if filtered_chunks:
         self.features_df = pl.concat(filtered_chunks, how="vertical")
     else:
         self.features_df = pl.DataFrame()  # No features remain
-
+
     # Apply same chunked processing to consensus_mapping_df
     _filter_consensus_mapping_chunked(self, feature_uids_to_keep, chunk_size)
@@ -2789,17 +2777,15 @@ def _filter_consensus_mapping_chunked(self, feature_uids_to_keep, chunk_size: in
     """
     Apply chunked filtering to consensus_mapping_df with same optimization strategy.
     """
-    if (self.consensus_mapping_df is None or
-            self.consensus_mapping_df.is_empty()):
+    if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
         return
-
+
     total_mappings = len(self.consensus_mapping_df)
-
+
     if total_mappings <= chunk_size:
         # Single chunk processing
         self.consensus_mapping_df = (
-            self.consensus_mapping_df
-            .lazy()
+            self.consensus_mapping_df.lazy()
             .filter(pl.col("feature_uid").is_in(list(feature_uids_to_keep)))
             .collect(streaming=True)
         )
@@ -2807,22 +2793,21 @@ def _filter_consensus_mapping_chunked(self, feature_uids_to_keep, chunk_size: in
         # Multi-chunk processing
         num_chunks = (total_mappings + chunk_size - 1) // chunk_size
         filtered_chunks = []
-
+
         for i in range(num_chunks):
             start_idx = i * chunk_size
             end_idx = min((i + 1) * chunk_size, total_mappings)
-
+
             chunk_query = (
-                self.consensus_mapping_df
-                .lazy()
+                self.consensus_mapping_df.lazy()
                 .slice(start_idx, end_idx - start_idx)
                 .filter(pl.col("feature_uid").is_in(list(feature_uids_to_keep)))
             )
-
+
             chunk_result = chunk_query.collect(streaming=True)
             if not chunk_result.is_empty():
                 filtered_chunks.append(chunk_result)
-
+
         if filtered_chunks:
             self.consensus_mapping_df = pl.concat(filtered_chunks, how="vertical")
         else:
@@ -2835,24 +2820,13 @@ def _apply_standard_filter(self, feature_uids_to_keep):
Still uses optimized set operations and lazy evaluation.
"""
filter_expr = pl.col("feature_uid").is_in(list(feature_uids_to_keep))
-
+
# Apply filter with lazy evaluation
- self.features_df = (
- self.features_df
- .lazy()
- .filter(filter_expr)
- .collect(streaming=True)
- )
-
+ self.features_df = self.features_df.lazy().filter(filter_expr).collect(streaming=True)
+
# Apply to consensus_mapping_df
- if (self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()):
- self.consensus_mapping_df = (
- self.consensus_mapping_df
- .lazy()
- .filter(filter_expr)
- .collect(streaming=True)
- )
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+ self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_expr).collect(streaming=True)


def features_delete(self, features):

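
The non-chunked fallback reduces to one lazy filter per table, guarded by the collapsed `is not None and not is_empty()` check. A standalone rendition of that single-pass variant (a toy sketch under assumed table shapes, not the masster method itself):

import polars as pl

def apply_standard_filter(features_df, mapping_df, feature_uids_to_keep):
    # One lazy pass per table; the guard mirrors the condition above.
    filter_expr = pl.col("feature_uid").is_in(list(feature_uids_to_keep))
    features_df = features_df.lazy().filter(filter_expr).collect()
    if mapping_df is not None and not mapping_df.is_empty():
        mapping_df = mapping_df.lazy().filter(filter_expr).collect()
    return features_df, mapping_df

features = pl.DataFrame({"feature_uid": [1, 2, 3]})
mapping = pl.DataFrame({"feature_uid": [1, 3], "consensus_uid": [7, 8]})
f, m = apply_standard_filter(features, mapping, {1, 3})
print(len(f), len(m))  # 2 2
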
@@ -2914,14 +2888,9 @@ def features_delete(self, features):

# Apply filter to consensus_mapping_df if it exists - batch operation
mapping_removed_count = 0
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
initial_mapping_count = len(self.consensus_mapping_df)
- self.consensus_mapping_df = (
- self.consensus_mapping_df.lazy().filter(filter_condition).collect()
- )
+ self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)

# Calculate results once and log efficiently
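
The before/after length bookkeeping used for the removal count is easy to replicate; a toy sketch of the same counting pattern (the condition here is hypothetical):

import polars as pl

mapping = pl.DataFrame({"feature_uid": [1, 2, 3, 4]})
filter_condition = pl.col("feature_uid") != 2  # stand-in condition

initial_count = len(mapping)
mapping = mapping.lazy().filter(filter_condition).collect()
removed = initial_count - len(mapping)
print(f"removed {removed} mapping rows")  # removed 1 mapping rows
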
@@ -3028,18 +2997,41 @@ def consensus_select(
return pl.DataFrame()

# Early return optimization - check if any filters are provided
- filter_params = [
-
-
-
-
-
-
-
+ filter_params = [
+ uid,
+ mz,
+ rt,
+ inty_mean,
+ consensus_uid,
+ consensus_id,
+ number_samples,
+ number_ms2,
+ quality,
+ bl,
+ chrom_coherence_mean,
+ chrom_prominence_mean,
+ chrom_prominence_scaled_mean,
+ chrom_height_scaled_mean,
+ rt_delta_mean,
+ id_top_score,
+ identified,
+ # New adduct and identification parameters
+ adduct_top,
+ adduct_charge_top,
+ adduct_mass_neutral_top,
+ adduct_mass_shift_top,
+ adduct_group,
+ adduct_of,
+ id_top_name,
+ id_top_class,
+ id_top_adduct,
+ ]
+
if all(param is None for param in filter_params) and sortby is None:
return self.consensus_df.clone()

import time
+
start_time = time.perf_counter()
initial_count = len(self.consensus_df)

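
The expanded filter_params list feeds an early return: when no filter argument is set and no sort is requested, the method hands back a clone without building any expressions. A reduced, hypothetical signature showing the same gate:

import polars as pl

def select(df: pl.DataFrame, mz=None, rt=None, quality=None, sortby=None):
    # No filters and no sort requested: return a defensive copy.
    filter_params = [mz, rt, quality]
    if all(param is None for param in filter_params) and sortby is None:
        return df.clone()
    # ... filter construction would follow here ...
    return df

df = pl.DataFrame({"mz": [100.0], "rt": [60.0], "quality": [0.9]})
print(select(df) is df)  # False: a clone, not the original frame
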
@@ -3082,8 +3074,9 @@ def consensus_select(
default_mz_tol = default_mz_tol.eic_mz_tol
else:
from masster.study.defaults.align_def import align_defaults
+
default_mz_tol = align_defaults().mz_max_diff
-
+
min_mz = mz - default_mz_tol
max_mz = mz + default_mz_tol
filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
@@ -3106,8 +3099,9 @@ def consensus_select(
default_rt_tol = default_rt_tol.eic_rt_tol
else:
from masster.study.defaults.align_def import align_defaults
+
default_rt_tol = align_defaults().rt_tol
-
+
min_rt = rt - default_rt_tol
max_rt = rt + default_rt_tol
filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
@@ -3192,8 +3186,8 @@ def consensus_select(
if "adduct_charge_top" in available_columns:
if isinstance(adduct_charge_top, tuple) and len(adduct_charge_top) == 2:
filter_conditions.append(
- (pl.col("adduct_charge_top") >= adduct_charge_top[0]) &
- (pl.col("adduct_charge_top") <= adduct_charge_top[1])
+ (pl.col("adduct_charge_top") >= adduct_charge_top[0])
+ & (pl.col("adduct_charge_top") <= adduct_charge_top[1])
)
elif isinstance(adduct_charge_top, list):
filter_conditions.append(pl.col("adduct_charge_top").is_in(adduct_charge_top))
@@ -3207,8 +3201,8 @@ def consensus_select(
if "adduct_mass_neutral_top" in available_columns:
if isinstance(adduct_mass_neutral_top, tuple) and len(adduct_mass_neutral_top) == 2:
filter_conditions.append(
- (pl.col("adduct_mass_neutral_top") >= adduct_mass_neutral_top[0]) &
- (pl.col("adduct_mass_neutral_top") <= adduct_mass_neutral_top[1])
+ (pl.col("adduct_mass_neutral_top") >= adduct_mass_neutral_top[0])
+ & (pl.col("adduct_mass_neutral_top") <= adduct_mass_neutral_top[1])
)
elif isinstance(adduct_mass_neutral_top, list):
filter_conditions.append(pl.col("adduct_mass_neutral_top").is_in(adduct_mass_neutral_top))
@@ -3222,8 +3216,8 @@ def consensus_select(
if "adduct_mass_shift_top" in available_columns:
if isinstance(adduct_mass_shift_top, tuple) and len(adduct_mass_shift_top) == 2:
filter_conditions.append(
- (pl.col("adduct_mass_shift_top") >= adduct_mass_shift_top[0]) &
- (pl.col("adduct_mass_shift_top") <= adduct_mass_shift_top[1])
+ (pl.col("adduct_mass_shift_top") >= adduct_mass_shift_top[0])
+ & (pl.col("adduct_mass_shift_top") <= adduct_mass_shift_top[1])
)
elif isinstance(adduct_mass_shift_top, list):
filter_conditions.append(pl.col("adduct_mass_shift_top").is_in(adduct_mass_shift_top))
@@ -3287,8 +3281,7 @@ def consensus_select(
if "id_top_score" in available_columns:
if isinstance(id_top_score, tuple) and len(id_top_score) == 2:
filter_conditions.append(
- (pl.col("id_top_score") >= id_top_score[0]) &
- (pl.col("id_top_score") <= id_top_score[1])
+ (pl.col("id_top_score") >= id_top_score[0]) & (pl.col("id_top_score") <= id_top_score[1])
)
elif isinstance(id_top_score, list):
filter_conditions.append(pl.col("id_top_score").is_in(id_top_score))
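
All of these parameter blocks share one dispatch rule: a 2-tuple is treated as an inclusive range, a list as set membership. A generic helper capturing that convention (a sketch of the convention, not a masster API):

import polars as pl

def build_condition(column: str, value):
    # 2-tuple -> inclusive range; list -> membership; scalar -> equality.
    if isinstance(value, tuple) and len(value) == 2:
        return (pl.col(column) >= value[0]) & (pl.col(column) <= value[1])
    if isinstance(value, list):
        return pl.col(column).is_in(value)
    return pl.col(column) == value

df = pl.DataFrame({"id_top_score": [0.2, 0.6, 0.9]})
print(df.filter(build_condition("id_top_score", (0.5, 1.0)))["id_top_score"].to_list())
# [0.6, 0.9]
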
@@ -3306,14 +3299,10 @@ def consensus_select(
# Combine all conditions efficiently using reduce
from functools import reduce
import operator
+
combined_filter = reduce(operator.and_, filter_conditions)
-
- consensus = (
- self.consensus_df
- .lazy()
- .filter(combined_filter)
- .collect(streaming=True)
- )
+
+ consensus = self.consensus_df.lazy().filter(combined_filter).collect(streaming=True)
else:
consensus = self.consensus_df.clone()

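
Folding the accumulated conditions with reduce(operator.and_, ...) yields a single boolean expression, so polars can evaluate every constraint in one scan. A self-contained sketch with illustrative conditions:

from functools import reduce
import operator

import polars as pl

conditions = [
    pl.col("mz") >= 100.0,
    pl.col("mz") <= 500.0,
    pl.col("quality") > 0.5,
]
combined = reduce(operator.and_, conditions)  # one expression, one scan

df = pl.DataFrame({"mz": [99.0, 250.0], "quality": [0.9, 0.8]})
print(df.lazy().filter(combined).collect()["mz"].to_list())  # [250.0]
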
@@ -3334,10 +3323,10 @@ def consensus_select(
elif isinstance(sortby, (list, tuple)):
valid_columns = [col for col in sortby if col in consensus.columns]
invalid_columns = [col for col in sortby if col not in consensus.columns]
-
+
if invalid_columns:
self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
-
+
if valid_columns:
consensus = consensus.sort(valid_columns, descending=descending)
else:
@@ -3346,8 +3335,10 @@ def consensus_select(
# Log performance metrics
elapsed_time = time.perf_counter() - start_time
removed_count = initial_count - final_count
-
- self.logger.info(f"Selected consensus features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s")
+
+ self.logger.info(
+ f"Selected consensus features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s"
+ )

return consensus

@@ -3393,10 +3384,7 @@ def consensus_filter(self, consensus):

# Get feature_uids that need to be kept in features_df
feature_uids_to_keep = []
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
feature_uids_to_keep = self.consensus_mapping_df.filter(
pl.col("consensus_uid").is_in(consensus_uids_to_keep),
)["feature_uid"].to_list()
@@ -3407,10 +3395,7 @@ def consensus_filter(self, consensus):
)

# Keep only relevant entries in consensus_mapping_df
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
initial_mapping_count = len(self.consensus_mapping_df)
self.consensus_mapping_df = self.consensus_mapping_df.filter(
pl.col("consensus_uid").is_in(consensus_uids_to_keep),
@@ -3423,11 +3408,7 @@ def consensus_filter(self, consensus):
)

# Keep only corresponding features in features_df
- if (
- feature_uids_to_keep
- and self.features_df is not None
- and not self.features_df.is_empty()
- ):
+ if feature_uids_to_keep and self.features_df is not None and not self.features_df.is_empty():
initial_features_count = len(self.features_df)
self.features_df = self.features_df.filter(
pl.col("feature_uid").is_in(feature_uids_to_keep),
@@ -3440,11 +3421,7 @@ def consensus_filter(self, consensus):
)

# Keep only relevant entries in consensus_ms2 if it exists
- if (
- hasattr(self, "consensus_ms2")
- and self.consensus_ms2 is not None
- and not self.consensus_ms2.is_empty()
- ):
+ if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
initial_ms2_count = len(self.consensus_ms2)
self.consensus_ms2 = self.consensus_ms2.filter(
pl.col("consensus_uid").is_in(consensus_uids_to_keep),
@@ -3514,10 +3491,7 @@ def consensus_delete(self, consensus):

# Get feature_uids that need to be removed from features_df
feature_uids_to_remove = []
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
feature_uids_to_remove = self.consensus_mapping_df.filter(
pl.col("consensus_uid").is_in(consensus_uids_to_remove),
)["feature_uid"].to_list()
@@ -3529,10 +3503,7 @@ def consensus_delete(self, consensus):

# Remove from consensus_mapping_df
mapping_removed_count = 0
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
initial_mapping_count = len(self.consensus_mapping_df)
self.consensus_mapping_df = self.consensus_mapping_df.filter(
~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
@@ -3541,11 +3512,7 @@ def consensus_delete(self, consensus):

# Remove corresponding features from features_df
features_removed_count = 0
- if (
- feature_uids_to_remove
- and self.features_df is not None
- and not self.features_df.is_empty()
- ):
+ if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
initial_features_count = len(self.features_df)
self.features_df = self.features_df.filter(
~pl.col("feature_uid").is_in(feature_uids_to_remove),
@@ -3554,11 +3521,7 @@ def consensus_delete(self, consensus):

# Remove from consensus_ms2 if it exists
ms2_removed_count = 0
- if (
- hasattr(self, "consensus_ms2")
- and self.consensus_ms2 is not None
- and not self.consensus_ms2.is_empty()
- ):
+ if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
initial_ms2_count = len(self.consensus_ms2)
self.consensus_ms2 = self.consensus_ms2.filter(
~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
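
Across these hunks, consensus_delete walks the linked tables in a fixed order: resolve the affected feature_uids through the mapping first, then drop rows from each table with a negated is_in. A toy reproduction of that cascade (column names follow the diff; the data is invented):

import polars as pl

mapping = pl.DataFrame({"consensus_uid": [1, 1, 2], "feature_uid": [10, 11, 12]})
features = pl.DataFrame({"feature_uid": [10, 11, 12], "mz": [100.0, 101.0, 102.0]})
to_remove = [1]

# Resolve dependent feature_uids before the mapping rows are dropped.
feature_uids = mapping.filter(
    pl.col("consensus_uid").is_in(to_remove),
)["feature_uid"].to_list()

mapping = mapping.filter(~pl.col("consensus_uid").is_in(to_remove))
features = features.filter(~pl.col("feature_uid").is_in(feature_uids))
print(mapping["feature_uid"].to_list(), features["feature_uid"].to_list())
# [12] [12]
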
@@ -3577,7 +3540,7 @@ def consensus_delete(self, consensus):
log_parts.append(f"{features_removed_count} features")
if ms2_removed_count > 0:
log_parts.append(f"{ms2_removed_count} MS2 spectra")
-
+
log_message = ". ".join(log_parts) + f". Remaining consensus: {final_consensus_count}"
self.logger.info(log_message)

@@ -3651,8 +3614,7 @@ def samples_select(
# Treat as range
min_uid, max_uid = sample_uid
filter_conditions.append(
- (pl.col("sample_uid") >= min_uid)
- & (pl.col("sample_uid") <= max_uid),
+ (pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid),
)
else:
# Treat as list
@@ -3695,8 +3657,7 @@ def samples_select(
# Treat as range
min_batch, max_batch = sample_batch
filter_conditions.append(
- (pl.col("sample_batch") >= min_batch)
- & (pl.col("sample_batch") <= max_batch),
+ (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
)
else:
# Treat as list
@@ -3714,8 +3675,7 @@ def samples_select(
# Treat as range
min_seq, max_seq = sample_sequence
filter_conditions.append(
- (pl.col("sample_sequence") >= min_seq)
- & (pl.col("sample_sequence") <= max_seq),
+ (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
)
else:
# Treat as list
@@ -3733,8 +3693,7 @@ def samples_select(
if isinstance(num_features, tuple) and len(num_features) == 2:
min_features, max_features = num_features
filter_conditions.append(
- (pl.col("num_features") >= min_features)
- & (pl.col("num_features") <= max_features),
+ (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
)
else:
filter_conditions.append(pl.col("num_features") >= num_features)
@@ -3883,11 +3842,7 @@ def samples_delete(self, samples):

# 2. Remove corresponding features from features_df
removed_features_count = 0
- if (
- feature_uids_to_remove
- and self.features_df is not None
- and not self.features_df.is_empty()
- ):
+ if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
self.features_df = self.features_df.filter(
~pl.col("sample_uid").is_in(sample_uids_to_remove),
)
@@ -3895,11 +3850,7 @@ def samples_delete(self, samples):

# 3. Remove from consensus_mapping_df
removed_mapping_count = 0
- if (
- feature_uids_to_remove
- and self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if feature_uids_to_remove and self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
initial_mapping_count = len(self.consensus_mapping_df)
self.consensus_mapping_df = self.consensus_mapping_df.filter(
~pl.col("feature_uid").is_in(feature_uids_to_remove),
@@ -3908,11 +3859,7 @@ def samples_delete(self, samples):

# 4. Remove from consensus_ms2 if it exists
removed_ms2_count = 0
- if (
- hasattr(self, "consensus_ms2")
- and self.consensus_ms2 is not None
- and not self.consensus_ms2.is_empty()
- ):
+ if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
initial_ms2_count = len(self.consensus_ms2)
self.consensus_ms2 = self.consensus_ms2.filter(
~pl.col("sample_uid").is_in(sample_uids_to_remove),
@@ -3921,11 +3868,7 @@ def samples_delete(self, samples):

# 5. Remove from feature_maps and update map_id
removed_maps_count = 0
- if (
- hasattr(self, "feature_maps")
- and self.feature_maps is not None
- and map_ids_to_remove
- ):
+ if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
# Remove feature maps in reverse order to maintain indices
for map_id in sorted(map_ids_to_remove, reverse=True):
if 0 <= map_id < len(self.feature_maps):
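
The "reverse order to maintain indices" comment is the crux of this block: deleting by ascending index would shift every later element one slot left and invalidate the remaining indices. A minimal demonstration with a stand-in list:

# Stand-in for self.feature_maps; remove indices 0 and 2.
maps = ["map_a", "map_b", "map_c", "map_d"]
for map_id in sorted([0, 2], reverse=True):
    # Highest index first, so earlier deletions cannot shift later ones.
    if 0 <= map_id < len(maps):
        del maps[map_id]
print(maps)  # ['map_b', 'map_d']
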
@@ -4148,9 +4091,6 @@ def set_samples_color(self, by=None, palette="Turbo256"):
self.logger.debug(f"Set sample colors based on {by} using {palette} palette")


-
-
-
def _get_color_palette(palette_name):
"""
Get color palette as a list of hex color codes using the cmap library.
@@ -4304,9 +4244,7 @@ def _sample_colors_from_colormap(palette_name, n_colors):
# Distribute samples evenly across the full colormap range (same approach as set_samples_color(by=None))
for i in range(n_colors):
# Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
- normalized_value = (
- i + 0.5
- ) / n_colors  # +0.5 to center samples in their bins
+ normalized_value = (i + 0.5) / n_colors  # +0.5 to center samples in their bins
# Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
normalized_value = 0.1 + (normalized_value * 0.8)

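
The two normalization steps compose to 0.1 + 0.8 * (i + 0.5) / n: each sample sits at the center of one of n equal bins, and the whole range is then squeezed into the 10-90% band of the colormap so no sample lands on an extreme near-white or near-black color. Worked out for four samples:

n_colors = 4
positions = []
for i in range(n_colors):
    normalized = (i + 0.5) / n_colors   # bin centers: 0.125, 0.375, 0.625, 0.875
    positions.append(0.1 + normalized * 0.8)
print(positions)  # ~ [0.2, 0.4, 0.6, 0.8]
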
@@ -4441,18 +4379,14 @@ def restore_ms2(self, samples=None, **kwargs):
self.logger.info(f"Restoring MS2 data from {len(sample_uids)} samples...")

# Clear existing consensus_ms2 to rebuild from scratch
- initial_ms2_count = (
- len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
- )
+ initial_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
self.consensus_ms2 = pl.DataFrame()

# Re-run find_ms2 which will rebuild consensus_ms2
try:
self.find_ms2(**kwargs)

- final_ms2_count = (
- len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
- )
+ final_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0

self.logger.info(
f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra",
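
The repeated one-liner len(df) if not df.is_empty() else 0 is defensive counting. Note that for a polars DataFrame, len() of an empty frame is already 0, so the guard spells out the expectation rather than changing the result; a sketch (not a masster utility) making the idiom explicit:

import polars as pl

def safe_len(df: pl.DataFrame) -> int:
    # Equivalent to len(df): an empty frame has length 0 anyway.
    return len(df) if not df.is_empty() else 0

print(safe_len(pl.DataFrame()))            # 0
print(safe_len(pl.DataFrame({"a": [1]})))  # 1
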
@@ -4551,12 +4485,8 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
# Check if MS2 data might need restoration (compare expected vs actual)
ms2_need_restoration = False
if ms2:
- current_ms2_count = (
- len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
- )
- consensus_count = (
- len(self.consensus_df) if not self.consensus_df.is_empty() else 0
- )
+ current_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
+ consensus_count = len(self.consensus_df) if not self.consensus_df.is_empty() else 0

if consensus_count > 0:
# Calculate expected MS2 count based on consensus features with MS2 potential
@@ -4607,15 +4537,11 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
# Then do additional chrom gap-filling if needed
self.restore_chrom(samples=samples, **restore_kwargs)

- elif (
- "features" in operations_needed and "chromatograms" not in operations_needed
- ):
+ elif "features" in operations_needed and "chromatograms" not in operations_needed:
self.logger.info("Phase 1: Restoring features data...")
self.restore_features(samples=samples)

- elif (
- "chromatograms" in operations_needed and "features" not in operations_needed
- ):
+ elif "chromatograms" in operations_needed and "features" not in operations_needed:
self.logger.info("Phase 1: Restoring chromatograms...")
restore_kwargs = {}
if "mz_tol" in kwargs: