masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/sample/adducts.py +1 -1
- masster/sample/h5.py +11 -11
- masster/sample/helpers.py +2 -2
- masster/sample/load.py +10 -8
- masster/sample/processing.py +1 -1
- masster/sample/sample.py +7 -3
- masster/study/defaults/align_def.py +0 -204
- masster/study/defaults/fill_def.py +9 -1
- masster/study/defaults/merge_def.py +20 -69
- masster/study/export.py +25 -5
- masster/study/h5.py +230 -42
- masster/study/helpers.py +430 -53
- masster/study/load.py +986 -158
- masster/study/merge.py +683 -1076
- masster/study/plot.py +95 -73
- masster/study/processing.py +337 -280
- masster/study/study.py +58 -135
- masster/wizard/wizard.py +20 -6
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/METADATA +1 -1
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/RECORD +24 -25
- masster/study/defaults/fill_chrom_def.py +0 -260
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/WHEEL +0 -0
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/entry_points.txt +0 -0
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
|
@@ -362,10 +362,6 @@ def get_chrom(self, uids=None, samples=None):
|
|
|
362
362
|
ids = self._get_consensus_uids(uids)
|
|
363
363
|
sample_uids = self._get_samples_uids(samples)
|
|
364
364
|
|
|
365
|
-
if self.consensus_map is None:
|
|
366
|
-
self.logger.error("No consensus map found.")
|
|
367
|
-
return None
|
|
368
|
-
|
|
369
365
|
# Pre-filter all DataFrames to reduce join sizes
|
|
370
366
|
filtered_consensus_mapping = self.consensus_mapping_df.filter(
|
|
371
367
|
pl.col("consensus_uid").is_in(ids),
|
|
@@ -529,30 +525,132 @@ def get_consensus(self, quant="chrom_area"):
|
|
|
529
525
|
return df
|
|
530
526
|
|
|
531
527
|
|
|
532
|
-
|
|
533
|
-
def get_consensus_matrix(self, quant="chrom_area"):
|
|
528
|
+
def get_consensus_matrix(self, quant="chrom_area", samples=None):
|
|
534
529
|
"""
|
|
535
530
|
Get a matrix of consensus features with samples as columns and consensus features as rows.
|
|
536
|
-
|
|
531
|
+
Highly optimized implementation using vectorized Polars operations.
|
|
532
|
+
|
|
533
|
+
Parameters:
|
|
534
|
+
quant (str): Quantification method column name (default: "chrom_area")
|
|
535
|
+
samples: Sample identifier(s) to include. Can be:
|
|
536
|
+
- None: include all samples (default)
|
|
537
|
+
- int: single sample_uid
|
|
538
|
+
- str: single sample_name
|
|
539
|
+
- list: multiple sample_uids or sample_names
|
|
537
540
|
"""
|
|
541
|
+
import polars as pl
|
|
542
|
+
|
|
538
543
|
if quant not in self.features_df.columns:
|
|
539
|
-
self.logger.error(
|
|
540
|
-
f"Quantification method {quant} not found in features_df.",
|
|
541
|
-
)
|
|
544
|
+
self.logger.error(f"Quantification method {quant} not found in features_df.")
|
|
542
545
|
return None
|
|
543
546
|
|
|
544
|
-
#
|
|
545
|
-
|
|
547
|
+
# Get sample_uids to include in the matrix
|
|
548
|
+
sample_uids = self._get_samples_uids(samples) if samples is not None else self.samples_df["sample_uid"].to_list()
|
|
549
|
+
|
|
550
|
+
if not sample_uids:
|
|
551
|
+
self.logger.warning("No valid samples found for consensus matrix")
|
|
552
|
+
return pl.DataFrame()
|
|
553
|
+
|
|
554
|
+
# Filter datasets upfront to reduce processing load
|
|
555
|
+
features_filtered = self.features_df.filter(pl.col("sample_uid").is_in(sample_uids))
|
|
556
|
+
samples_filtered = self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids))
|
|
557
|
+
consensus_mapping_filtered = self.consensus_mapping_df.filter(pl.col("sample_uid").is_in(sample_uids))
|
|
558
|
+
|
|
559
|
+
# Join operations to combine data efficiently
|
|
560
|
+
# 1. Join consensus mapping with features to get quantification values
|
|
561
|
+
consensus_with_values = (
|
|
562
|
+
consensus_mapping_filtered
|
|
563
|
+
.join(features_filtered.select(["feature_uid", "sample_uid", quant]),
|
|
564
|
+
on=["feature_uid", "sample_uid"], how="left")
|
|
565
|
+
.with_columns(pl.col(quant).fill_null(0))
|
|
566
|
+
)
|
|
567
|
+
|
|
568
|
+
# 2. Join with samples to get sample names
|
|
569
|
+
consensus_with_names = (
|
|
570
|
+
consensus_with_values
|
|
571
|
+
.join(samples_filtered.select(["sample_uid", "sample_name"]),
|
|
572
|
+
on="sample_uid", how="left")
|
|
573
|
+
)
|
|
574
|
+
|
|
575
|
+
# 3. Group by consensus_uid and sample_name, taking max value per group
|
|
576
|
+
aggregated = (
|
|
577
|
+
consensus_with_names
|
|
578
|
+
.group_by(["consensus_uid", "sample_name"])
|
|
579
|
+
.agg(pl.col(quant).max().alias("value"))
|
|
580
|
+
)
|
|
581
|
+
|
|
582
|
+
# 4. Pivot to create the matrix format
|
|
583
|
+
matrix_df = (
|
|
584
|
+
aggregated
|
|
585
|
+
.pivot(on="sample_name", index="consensus_uid", values="value")
|
|
586
|
+
.fill_null(0)
|
|
587
|
+
)
|
|
588
|
+
|
|
589
|
+
# 5. Round numeric columns and ensure proper types
|
|
590
|
+
numeric_cols = [col for col in matrix_df.columns if col != "consensus_uid"]
|
|
591
|
+
matrix_df = matrix_df.with_columns([
|
|
592
|
+
pl.col("consensus_uid").cast(pl.UInt64),
|
|
593
|
+
*[pl.col(col).round(0) for col in numeric_cols]
|
|
594
|
+
])
|
|
595
|
+
|
|
596
|
+
return matrix_df
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
def get_gaps_matrix(self, uids=None, samples=None):
|
|
600
|
+
"""
|
|
601
|
+
Get a matrix of gaps between consensus features with samples as columns and consensus features as rows.
|
|
602
|
+
Optimized implementation that builds the gaps matrix directly without calling get_consensus_matrix().
|
|
603
|
+
|
|
604
|
+
Parameters:
|
|
605
|
+
uids: Consensus UID(s) to include. If None, includes all consensus features.
|
|
606
|
+
samples: Sample identifier(s) to include. If None, includes all samples.
|
|
607
|
+
Can be int (sample_uid), str (sample_name), or list of either.
|
|
608
|
+
|
|
609
|
+
Returns:
|
|
610
|
+
pl.DataFrame: Gaps matrix with consensus_uid as first column and samples as other columns.
|
|
611
|
+
Values are 1 (detected) or 0 (missing/gap).
|
|
612
|
+
"""
|
|
613
|
+
import polars as pl
|
|
614
|
+
|
|
615
|
+
if self.consensus_df is None or self.consensus_df.is_empty():
|
|
616
|
+
self.logger.error("No consensus map found.")
|
|
617
|
+
return None
|
|
618
|
+
|
|
619
|
+
if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
|
|
620
|
+
self.logger.error("No consensus mapping found.")
|
|
621
|
+
return None
|
|
622
|
+
|
|
623
|
+
if self.features_df is None or self.features_df.is_empty():
|
|
624
|
+
self.logger.error("No features found.")
|
|
625
|
+
return None
|
|
626
|
+
|
|
627
|
+
# Get consensus UIDs and sample UIDs to include
|
|
628
|
+
uids = self._get_consensus_uids(uids)
|
|
629
|
+
sample_uids = self._get_samples_uids(samples) if samples is not None else self.samples_df["sample_uid"].to_list()
|
|
630
|
+
|
|
631
|
+
if not uids or not sample_uids:
|
|
632
|
+
self.logger.warning("No valid consensus features or samples found for gaps matrix")
|
|
633
|
+
return pl.DataFrame()
|
|
634
|
+
|
|
635
|
+
# Create a lookup dictionary from features_df for gap detection (exclude filled features)
|
|
636
|
+
# Key: (feature_uid, sample_uid) -> Value: 1 (detected)
|
|
637
|
+
feature_detection = {}
|
|
546
638
|
for row in self.features_df.iter_rows(named=True):
|
|
547
|
-
feature_uid = row["feature_uid"]
|
|
548
639
|
sample_uid = row["sample_uid"]
|
|
549
|
-
|
|
550
|
-
|
|
640
|
+
if sample_uid in sample_uids: # Only include specified samples
|
|
641
|
+
# Skip filled features (gaps should only show original detections)
|
|
642
|
+
if row.get("filled", False):
|
|
643
|
+
continue
|
|
644
|
+
|
|
645
|
+
feature_uid = row["feature_uid"]
|
|
646
|
+
# If feature exists and is not filled, it's detected (1)
|
|
647
|
+
feature_detection[(feature_uid, sample_uid)] = 1
|
|
551
648
|
|
|
552
|
-
# Build
|
|
649
|
+
# Build gaps matrix directly using the consensus_mapping_df
|
|
553
650
|
matrix_dict = {}
|
|
554
651
|
sample_mapping = dict(
|
|
555
|
-
self.samples_df.
|
|
652
|
+
self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids))
|
|
653
|
+
.select(["sample_uid", "sample_name"]).iter_rows(),
|
|
556
654
|
)
|
|
557
655
|
|
|
558
656
|
for row in self.consensus_mapping_df.iter_rows(named=True):
|
|
@@ -560,65 +658,53 @@ def get_consensus_matrix(self, quant="chrom_area"):
|
|
|
560
658
|
sample_uid = row["sample_uid"]
|
|
561
659
|
feature_uid = row["feature_uid"]
|
|
562
660
|
|
|
563
|
-
#
|
|
661
|
+
# Only process samples and consensus features in our filtered lists
|
|
662
|
+
if sample_uid not in sample_uids or consensus_uid not in uids:
|
|
663
|
+
continue
|
|
664
|
+
|
|
665
|
+
# Check if feature was detected (not filled)
|
|
564
666
|
key = (feature_uid, sample_uid)
|
|
565
|
-
|
|
667
|
+
detected = feature_detection.get(key, 0) # 0 if not found (gap), 1 if detected
|
|
566
668
|
|
|
567
669
|
if consensus_uid not in matrix_dict:
|
|
568
670
|
matrix_dict[consensus_uid] = {}
|
|
569
671
|
|
|
570
672
|
sample_name = sample_mapping.get(sample_uid, f"sample_{sample_uid}")
|
|
571
673
|
|
|
572
|
-
#
|
|
674
|
+
# For gaps matrix, we want to know if ANY feature was detected for this consensus/sample
|
|
675
|
+
# So we take max (if any feature is detected, the consensus feature is detected)
|
|
573
676
|
if sample_name in matrix_dict[consensus_uid]:
|
|
574
677
|
matrix_dict[consensus_uid][sample_name] = max(
|
|
575
678
|
matrix_dict[consensus_uid][sample_name],
|
|
576
|
-
|
|
679
|
+
detected,
|
|
577
680
|
)
|
|
578
681
|
else:
|
|
579
|
-
matrix_dict[consensus_uid][sample_name] =
|
|
580
|
-
|
|
581
|
-
# Convert to Polars DataFrame with proper formatting
|
|
582
|
-
import polars as pl
|
|
682
|
+
matrix_dict[consensus_uid][sample_name] = detected
|
|
583
683
|
|
|
584
|
-
# Convert
|
|
684
|
+
# Convert to Polars DataFrame
|
|
585
685
|
records = []
|
|
586
686
|
for consensus_uid, sample_values in matrix_dict.items():
|
|
587
687
|
record = {"consensus_uid": consensus_uid}
|
|
588
688
|
record.update(sample_values)
|
|
589
689
|
records.append(record)
|
|
590
690
|
|
|
691
|
+
if not records:
|
|
692
|
+
self.logger.warning("No gaps data found for specified consensus features and samples")
|
|
693
|
+
return pl.DataFrame()
|
|
694
|
+
|
|
591
695
|
# Create Polars DataFrame and set proper data types
|
|
592
|
-
|
|
696
|
+
df_gaps = pl.DataFrame(records)
|
|
593
697
|
|
|
594
|
-
# Fill null values with 0 and
|
|
595
|
-
numeric_cols = [col for col in
|
|
596
|
-
|
|
698
|
+
# Fill null values with 0 (gaps) and ensure integer type for gap indicators
|
|
699
|
+
numeric_cols = [col for col in df_gaps.columns if col != "consensus_uid"]
|
|
700
|
+
df_gaps = df_gaps.with_columns(
|
|
597
701
|
[
|
|
598
702
|
pl.col("consensus_uid").cast(pl.UInt64),
|
|
599
|
-
*[pl.col(col).fill_null(0).
|
|
703
|
+
*[pl.col(col).fill_null(0).cast(pl.Int8) for col in numeric_cols],
|
|
600
704
|
],
|
|
601
705
|
)
|
|
602
706
|
|
|
603
|
-
return
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
def get_gaps_matrix(self, uids=None):
|
|
607
|
-
"""
|
|
608
|
-
Get a matrix of gaps between consensus features with samples as columns and consensus features as rows.
|
|
609
|
-
"""
|
|
610
|
-
if self.consensus_df is None:
|
|
611
|
-
self.logger.error("No consensus map found.")
|
|
612
|
-
return None
|
|
613
|
-
uids = self._get_consensus_uids(uids)
|
|
614
|
-
|
|
615
|
-
df1 = self.get_consensus_matrix(quant="filled")
|
|
616
|
-
if df1 is None or df1.empty:
|
|
617
|
-
self.logger.warning("No gap data found.")
|
|
618
|
-
return None
|
|
619
|
-
# keep only rows where consensus_id is in ids - use pandas indexing since df1 is already pandas
|
|
620
|
-
df1 = df1[df1.index.isin(uids)]
|
|
621
|
-
return df1
|
|
707
|
+
return df_gaps
|
|
622
708
|
|
|
623
709
|
|
|
624
710
|
def get_gaps_stats(self, uids=None):
|
|
@@ -705,6 +791,53 @@ def get_consensus_matches(self, uids=None, filled=True):
|
|
|
705
791
|
# =====================================================================================
|
|
706
792
|
|
|
707
793
|
|
|
794
|
+
def consensus_reset(self):
|
|
795
|
+
"""
|
|
796
|
+
Reset consensus data by clearing consensus DataFrames and removing filled features.
|
|
797
|
+
|
|
798
|
+
This function:
|
|
799
|
+
1. Sets consensus_df, consensus_ms2, consensus_mapping_df, id_df to empty pl.DataFrame()
|
|
800
|
+
2. Removes all filled features from features_df
|
|
801
|
+
3. Removes relevant operations from history (merge, integrate, find_ms2, fill, identify)
|
|
802
|
+
4. Logs the number of features removed
|
|
803
|
+
|
|
804
|
+
This effectively undoes the merge() operation and any gap-filling.
|
|
805
|
+
"""
|
|
806
|
+
self.logger.debug("Resetting consensus data.")
|
|
807
|
+
|
|
808
|
+
# Reset consensus DataFrames to empty
|
|
809
|
+
self.consensus_df = pl.DataFrame()
|
|
810
|
+
self.consensus_ms2 = pl.DataFrame()
|
|
811
|
+
self.consensus_mapping_df = pl.DataFrame()
|
|
812
|
+
self.id_df = pl.DataFrame()
|
|
813
|
+
|
|
814
|
+
# Remove filled features from features_df
|
|
815
|
+
if self.features_df is None:
|
|
816
|
+
self.logger.warning("No features found.")
|
|
817
|
+
return
|
|
818
|
+
|
|
819
|
+
l1 = len(self.features_df)
|
|
820
|
+
|
|
821
|
+
# Filter out filled features (keep only non-filled features)
|
|
822
|
+
if "filled" in self.features_df.columns:
|
|
823
|
+
self.features_df = self.features_df.filter(~pl.col("filled") | pl.col("filled").is_null())
|
|
824
|
+
|
|
825
|
+
# Remove consensus-related operations from history
|
|
826
|
+
keys_to_remove = ["merge", "integrate", "integrate_chrom", "find_ms2", "fill", "fill_single", "identify"]
|
|
827
|
+
history_removed_count = 0
|
|
828
|
+
if hasattr(self, "history") and self.history:
|
|
829
|
+
for key in keys_to_remove:
|
|
830
|
+
if key in self.history:
|
|
831
|
+
del self.history[key]
|
|
832
|
+
history_removed_count += 1
|
|
833
|
+
self.logger.debug(f"Removed '{key}' from history")
|
|
834
|
+
|
|
835
|
+
removed_count = l1 - len(self.features_df)
|
|
836
|
+
self.logger.info(
|
|
837
|
+
f"Reset consensus data. Consensus DataFrames cleared. Features removed: {removed_count}. History entries removed: {history_removed_count}",
|
|
838
|
+
)
|
|
839
|
+
|
|
840
|
+
|
|
708
841
|
def fill_reset(self):
|
|
709
842
|
# remove all features with filled=True
|
|
710
843
|
if self.features_df is None:
|
|
@@ -719,7 +852,7 @@ def fill_reset(self):
|
|
|
719
852
|
pl.col("feature_uid").is_in(feature_uids_to_keep),
|
|
720
853
|
)
|
|
721
854
|
self.logger.info(
|
|
722
|
-
f"
|
|
855
|
+
f"Removed {l1 - len(self.features_df)} gap-filled features",
|
|
723
856
|
)
|
|
724
857
|
|
|
725
858
|
|
|
@@ -1044,6 +1177,250 @@ def get_sample_stats(self):
|
|
|
1044
1177
|
)
|
|
1045
1178
|
|
|
1046
1179
|
|
|
1180
|
+
def get_consensus_stats(self):
|
|
1181
|
+
"""
|
|
1182
|
+
Get key performance indicators for each consensus feature.
|
|
1183
|
+
|
|
1184
|
+
Returns:
|
|
1185
|
+
pl.DataFrame: DataFrame with the following columns:
|
|
1186
|
+
- consensus_uid: Consensus unique identifier
|
|
1187
|
+
- rt: Retention time
|
|
1188
|
+
- rt_delta_mean: Mean retention time delta
|
|
1189
|
+
- mz: Mass-to-charge ratio
|
|
1190
|
+
- mz_range: Mass range (mz_max - mz_min)
|
|
1191
|
+
- log10_inty_mean: Log10 of mean intensity
|
|
1192
|
+
- number_samples: Number of samples
|
|
1193
|
+
- number_ms2: Number of MS2 spectra
|
|
1194
|
+
- charge_mean: Mean charge
|
|
1195
|
+
- quality: Feature quality
|
|
1196
|
+
- chrom_coherence_mean: Mean chromatographic coherence
|
|
1197
|
+
- chrom_height_scaled_mean: Mean scaled chromatographic height
|
|
1198
|
+
- chrom_prominence_scaled_mean: Mean scaled chromatographic prominence
|
|
1199
|
+
- qc_ratio: Ratio of QC samples where feature was detected
|
|
1200
|
+
- qc_cv: RSD (relative standard deviation) of intensity for QC samples
|
|
1201
|
+
- qc_to_blank: Ratio of average QC intensity to average blank intensity
|
|
1202
|
+
"""
|
|
1203
|
+
import polars as pl
|
|
1204
|
+
import numpy as np
|
|
1205
|
+
|
|
1206
|
+
# Check if consensus_df exists and has data
|
|
1207
|
+
if self.consensus_df is None or self.consensus_df.is_empty():
|
|
1208
|
+
self.logger.error("No consensus data available. Run merge/find_consensus first.")
|
|
1209
|
+
return pl.DataFrame()
|
|
1210
|
+
|
|
1211
|
+
# Get all columns and their data types - work with original dataframe
|
|
1212
|
+
data_df = self.consensus_df.clone()
|
|
1213
|
+
|
|
1214
|
+
# Define specific columns to include in the exact order requested
|
|
1215
|
+
desired_columns = [
|
|
1216
|
+
"consensus_uid", # Include consensus_uid for identification
|
|
1217
|
+
"rt",
|
|
1218
|
+
"rt_delta_mean",
|
|
1219
|
+
"mz",
|
|
1220
|
+
"mz_range", # mz_max-mz_min (will be calculated)
|
|
1221
|
+
"log10_inty_mean", # log10(inty_mean) (will be calculated)
|
|
1222
|
+
"number_samples",
|
|
1223
|
+
"number_ms2",
|
|
1224
|
+
"charge_mean",
|
|
1225
|
+
"quality",
|
|
1226
|
+
"chrom_coherence_mean",
|
|
1227
|
+
"chrom_height_scaled_mean",
|
|
1228
|
+
"chrom_prominence_scaled_mean"
|
|
1229
|
+
]
|
|
1230
|
+
|
|
1231
|
+
# Calculate derived columns if they don't exist
|
|
1232
|
+
if "mz_range" not in data_df.columns and "mz_max" in data_df.columns and "mz_min" in data_df.columns:
|
|
1233
|
+
data_df = data_df.with_columns((pl.col("mz_max") - pl.col("mz_min")).alias("mz_range"))
|
|
1234
|
+
|
|
1235
|
+
if "log10_inty_mean" not in data_df.columns and "inty_mean" in data_df.columns:
|
|
1236
|
+
data_df = data_df.with_columns(pl.col("inty_mean").log10().alias("log10_inty_mean"))
|
|
1237
|
+
|
|
1238
|
+
# Filter to only include columns that exist in the dataframe, preserving order
|
|
1239
|
+
available_columns = [col for col in desired_columns if col in data_df.columns]
|
|
1240
|
+
|
|
1241
|
+
if len(available_columns) <= 1: # Only consensus_uid would be 1
|
|
1242
|
+
self.logger.error(f"None of the requested consensus statistics columns were found. Available columns: {list(data_df.columns)}")
|
|
1243
|
+
return pl.DataFrame()
|
|
1244
|
+
|
|
1245
|
+
self.logger.debug(f"Creating consensus stats DataFrame with {len(available_columns)} columns: {available_columns}")
|
|
1246
|
+
|
|
1247
|
+
# Get base result DataFrame with selected columns
|
|
1248
|
+
result_df = data_df.select(available_columns)
|
|
1249
|
+
|
|
1250
|
+
# Add QC-related columns
|
|
1251
|
+
try:
|
|
1252
|
+
# Identify QC and blank samples based on naming patterns
|
|
1253
|
+
all_sample_names = self.samples_df["sample_name"].to_list()
|
|
1254
|
+
|
|
1255
|
+
# Define patterns for QC and blank identification
|
|
1256
|
+
qc_patterns = ["qc", "QC", "quality", "Quality", "control", "Control"]
|
|
1257
|
+
blank_patterns = ["blank", "Blank", "BLANK", "blk", "BLK"]
|
|
1258
|
+
|
|
1259
|
+
# Get QC and blank sample names
|
|
1260
|
+
qc_sample_names = [name for name in all_sample_names if any(pattern in name for pattern in qc_patterns)]
|
|
1261
|
+
blank_sample_names = [name for name in all_sample_names if any(pattern in name for pattern in blank_patterns)]
|
|
1262
|
+
|
|
1263
|
+
self.logger.debug(f"Found {len(qc_sample_names)} QC samples and {len(blank_sample_names)} blank samples")
|
|
1264
|
+
|
|
1265
|
+
# Initialize QC columns with null values
|
|
1266
|
+
qc_ratio_values = [None] * len(result_df)
|
|
1267
|
+
qc_cv_values = [None] * len(result_df)
|
|
1268
|
+
qc_to_blank_values = [None] * len(result_df)
|
|
1269
|
+
|
|
1270
|
+
if len(qc_sample_names) > 0:
|
|
1271
|
+
# Calculate QC metrics using optimized approach - get only QC+blank data
|
|
1272
|
+
self.logger.debug("Fetching optimized consensus matrices for QC calculations...")
|
|
1273
|
+
|
|
1274
|
+
# Get QC consensus matrix (only QC samples)
|
|
1275
|
+
qc_consensus_matrix = self.get_consensus_matrix(samples=qc_sample_names)
|
|
1276
|
+
|
|
1277
|
+
# Get blank consensus matrix (only blank samples) if blanks exist
|
|
1278
|
+
blank_consensus_matrix = None
|
|
1279
|
+
if len(blank_sample_names) > 0:
|
|
1280
|
+
blank_consensus_matrix = self.get_consensus_matrix(samples=blank_sample_names)
|
|
1281
|
+
|
|
1282
|
+
if qc_consensus_matrix is not None and not qc_consensus_matrix.is_empty():
|
|
1283
|
+
available_qc_cols = [col for col in qc_consensus_matrix.columns if col != "consensus_uid"]
|
|
1284
|
+
self.logger.debug(f"Found {len(available_qc_cols)} QC columns in optimized QC matrix")
|
|
1285
|
+
|
|
1286
|
+
# 2. QC CV: Calculate CV for QC samples
|
|
1287
|
+
if len(available_qc_cols) > 0:
|
|
1288
|
+
self.logger.debug("Calculating QC CV...")
|
|
1289
|
+
try:
|
|
1290
|
+
# Calculate CV (coefficient of variation) for QC samples
|
|
1291
|
+
qc_data = qc_consensus_matrix.select(["consensus_uid"] + available_qc_cols)
|
|
1292
|
+
|
|
1293
|
+
# Calculate mean and std for each row across QC columns
|
|
1294
|
+
qc_stats = qc_data.with_columns([
|
|
1295
|
+
pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
|
|
1296
|
+
]).with_columns([
|
|
1297
|
+
pl.col("qc_values").list.mean().alias("qc_mean"),
|
|
1298
|
+
pl.col("qc_values").list.std().alias("qc_std")
|
|
1299
|
+
]).with_columns(
|
|
1300
|
+
# CV = std / mean (NOT multiplied by 100 to keep between 0-1)
|
|
1301
|
+
pl.when(pl.col("qc_mean") > 0)
|
|
1302
|
+
.then(pl.col("qc_std") / pl.col("qc_mean"))
|
|
1303
|
+
.otherwise(None)
|
|
1304
|
+
.alias("qc_cv")
|
|
1305
|
+
)
|
|
1306
|
+
|
|
1307
|
+
# Join with result DataFrame
|
|
1308
|
+
result_df = result_df.join(
|
|
1309
|
+
qc_stats.select(["consensus_uid", "qc_cv"]),
|
|
1310
|
+
on="consensus_uid",
|
|
1311
|
+
how="left"
|
|
1312
|
+
)
|
|
1313
|
+
qc_cv_values = None # Indicate we successfully added the column
|
|
1314
|
+
|
|
1315
|
+
except Exception as e:
|
|
1316
|
+
self.logger.debug(f"Could not calculate QC CV: {e}")
|
|
1317
|
+
|
|
1318
|
+
# 3. QC to blank ratio: Compare average QC to average blank intensity
|
|
1319
|
+
if len(available_qc_cols) > 0 and blank_consensus_matrix is not None and not blank_consensus_matrix.is_empty():
|
|
1320
|
+
available_blank_cols = [col for col in blank_consensus_matrix.columns if col != "consensus_uid"]
|
|
1321
|
+
self.logger.debug(f"Calculating QC to blank ratio with {len(available_blank_cols)} blank columns...")
|
|
1322
|
+
|
|
1323
|
+
if len(available_blank_cols) > 0:
|
|
1324
|
+
try:
|
|
1325
|
+
# Calculate average intensity for QC samples
|
|
1326
|
+
qc_averages = qc_data.with_columns([
|
|
1327
|
+
pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
|
|
1328
|
+
]).with_columns(
|
|
1329
|
+
pl.col("qc_values").list.mean().alias("qc_avg")
|
|
1330
|
+
).select(["consensus_uid", "qc_avg"])
|
|
1331
|
+
|
|
1332
|
+
# Calculate average intensity for blank samples
|
|
1333
|
+
blank_data = blank_consensus_matrix.select(["consensus_uid"] + available_blank_cols)
|
|
1334
|
+
blank_averages = blank_data.with_columns([
|
|
1335
|
+
pl.concat_list([pl.col(col) for col in available_blank_cols]).alias("blank_values")
|
|
1336
|
+
]).with_columns(
|
|
1337
|
+
pl.col("blank_values").list.mean().alias("blank_avg")
|
|
1338
|
+
).select(["consensus_uid", "blank_avg"])
|
|
1339
|
+
|
|
1340
|
+
# Join QC and blank averages and calculate ratio
|
|
1341
|
+
qc_blank_ratios = qc_averages.join(
|
|
1342
|
+
blank_averages,
|
|
1343
|
+
on="consensus_uid",
|
|
1344
|
+
how="left"
|
|
1345
|
+
).with_columns(
|
|
1346
|
+
# Ratio = qc_avg / blank_avg, but only where blank_avg > 0
|
|
1347
|
+
pl.when(pl.col("blank_avg") > 0)
|
|
1348
|
+
.then(pl.col("qc_avg") / pl.col("blank_avg"))
|
|
1349
|
+
.otherwise(None)
|
|
1350
|
+
.alias("qc_to_blank")
|
|
1351
|
+
)
|
|
1352
|
+
|
|
1353
|
+
# Join with result DataFrame
|
|
1354
|
+
result_df = result_df.join(
|
|
1355
|
+
qc_blank_ratios.select(["consensus_uid", "qc_to_blank"]),
|
|
1356
|
+
on="consensus_uid",
|
|
1357
|
+
how="left"
|
|
1358
|
+
)
|
|
1359
|
+
qc_to_blank_values = None # Indicate we successfully added the column
|
|
1360
|
+
|
|
1361
|
+
except Exception as e:
|
|
1362
|
+
self.logger.debug(f"Could not calculate QC to blank ratio: {e}")
|
|
1363
|
+
|
|
1364
|
+
# 1. QC ratio: Get optimized gaps matrix for QC samples only
|
|
1365
|
+
self.logger.debug("Calculating QC detection ratio with optimized gaps matrix...")
|
|
1366
|
+
try:
|
|
1367
|
+
# Use optimized get_gaps_matrix with QC samples filtering for faster performance
|
|
1368
|
+
qc_gaps_matrix = self.get_gaps_matrix(samples=qc_sample_names)
|
|
1369
|
+
|
|
1370
|
+
if qc_gaps_matrix is not None and not qc_gaps_matrix.is_empty():
|
|
1371
|
+
# Get QC columns (should be all columns except consensus_uid since we filtered)
|
|
1372
|
+
available_qc_cols_gaps = [col for col in qc_gaps_matrix.columns if col != "consensus_uid"]
|
|
1373
|
+
self.logger.debug(f"Found {len(available_qc_cols_gaps)} QC columns in optimized gaps matrix")
|
|
1374
|
+
|
|
1375
|
+
if len(available_qc_cols_gaps) > 0:
|
|
1376
|
+
# Calculate QC detection ratio for each consensus feature
|
|
1377
|
+
qc_detection = qc_gaps_matrix.select(["consensus_uid"] + available_qc_cols_gaps)
|
|
1378
|
+
|
|
1379
|
+
# Data should already be properly typed from get_gaps_matrix, but ensure consistency
|
|
1380
|
+
for col in available_qc_cols_gaps:
|
|
1381
|
+
qc_detection = qc_detection.with_columns(
|
|
1382
|
+
pl.col(col).fill_null(0).cast(pl.Int8).alias(col)
|
|
1383
|
+
)
|
|
1384
|
+
|
|
1385
|
+
# Calculate ratio (sum of detections / number of QC samples)
|
|
1386
|
+
qc_ratios = qc_detection.with_columns(
|
|
1387
|
+
pl.concat_list([pl.col(col) for col in available_qc_cols_gaps]).alias("qc_detections")
|
|
1388
|
+
).with_columns(
|
|
1389
|
+
(pl.col("qc_detections").list.sum().cast(pl.Float64) / len(available_qc_cols_gaps)).alias("qc_ratio")
|
|
1390
|
+
)
|
|
1391
|
+
|
|
1392
|
+
# Join with result DataFrame
|
|
1393
|
+
result_df = result_df.join(
|
|
1394
|
+
qc_ratios.select(["consensus_uid", "qc_ratio"]),
|
|
1395
|
+
on="consensus_uid",
|
|
1396
|
+
how="left"
|
|
1397
|
+
)
|
|
1398
|
+
qc_ratio_values = None # Indicate we successfully added the column
|
|
1399
|
+
|
|
1400
|
+
except Exception as e:
|
|
1401
|
+
self.logger.debug(f"Could not calculate QC ratio: {e}")
|
|
1402
|
+
|
|
1403
|
+
# Add null columns for any QC metrics that couldn't be calculated
|
|
1404
|
+
# Add null columns for any QC metrics that couldn't be calculated
|
|
1405
|
+
if qc_ratio_values is not None:
|
|
1406
|
+
result_df = result_df.with_columns(pl.lit(None, dtype=pl.Float64).alias("qc_ratio"))
|
|
1407
|
+
if qc_cv_values is not None:
|
|
1408
|
+
result_df = result_df.with_columns(pl.lit(None, dtype=pl.Float64).alias("qc_cv"))
|
|
1409
|
+
if qc_to_blank_values is not None:
|
|
1410
|
+
result_df = result_df.with_columns(pl.lit(None, dtype=pl.Float64).alias("qc_to_blank"))
|
|
1411
|
+
|
|
1412
|
+
except Exception as e:
|
|
1413
|
+
self.logger.warning(f"Error calculating QC metrics: {e}")
|
|
1414
|
+
# Add null columns if QC calculation fails
|
|
1415
|
+
result_df = result_df.with_columns([
|
|
1416
|
+
pl.lit(None, dtype=pl.Float64).alias("qc_ratio"),
|
|
1417
|
+
pl.lit(None, dtype=pl.Float64).alias("qc_cv"),
|
|
1418
|
+
pl.lit(None, dtype=pl.Float64).alias("qc_to_blank")
|
|
1419
|
+
])
|
|
1420
|
+
|
|
1421
|
+
return result_df
|
|
1422
|
+
|
|
1423
|
+
|
|
1047
1424
|
# =====================================================================================
|
|
1048
1425
|
# DATA COMPRESSION AND RESTORATION FUNCTIONS
|
|
1049
1426
|
# =====================================================================================
|
|
@@ -2131,7 +2508,7 @@ def _apply_chunked_select(self, filter_expr, chunk_size: int):
|
|
|
2131
2508
|
else:
|
|
2132
2509
|
return pl.DataFrame()
|
|
2133
2510
|
|
|
2134
|
-
|
|
2511
|
+
'''
|
|
2135
2512
|
def features_select_benchmarked(
|
|
2136
2513
|
self,
|
|
2137
2514
|
mz=None,
|
|
@@ -2224,7 +2601,7 @@ def monkey_patch_study():
|
|
|
2224
2601
|
Study.features_select_benchmarked = features_select_benchmarked
|
|
2225
2602
|
|
|
2226
2603
|
print("Patched Study.features_select with consolidated optimized implementation")
|
|
2227
|
-
|
|
2604
|
+
'''
|
|
2228
2605
|
|
|
2229
2606
|
def features_filter(
|
|
2230
2607
|
self,
|