masster 0.5.22-py3-none-any.whl → 0.5.24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

masster/study/helpers.py CHANGED
@@ -71,12 +71,7 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
  # fallback to pandas
  try:
  bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
- bpc_pd = (
- bpc_pd.groupby("rt")
- .agg({"inty": "max"})
- .reset_index()
- .sort_values("rt")
- )
+ bpc_pd = bpc_pd.groupby("rt").agg({"inty": "max"}).reset_index().sort_values("rt")
  except Exception:
  raise
 
@@ -375,8 +370,7 @@ def get_chrom(self, uids=None, samples=None):
  )
  # Pre-filter features_df to only relevant features and samples
  filtered_features = self.features_df.filter(
- pl.col("feature_uid").is_in(relevant_feature_uids)
- & pl.col("sample_uid").is_in(sample_uids),
+ pl.col("feature_uid").is_in(relevant_feature_uids) & pl.col("sample_uid").is_in(sample_uids),
  ).select(
  [
  "feature_uid",
@@ -489,6 +483,7 @@ def align_reset(self):
 
  # Ensure column order is maintained after with_columns operation
  from masster.study.helpers import _ensure_features_df_schema_order
+
  _ensure_features_df_schema_order(self)
  self.logger.info("Alignment reset: all feature RTs set to original_RT.")
 
@@ -530,24 +525,24 @@ def get_consensus_matrix(self, quant="chrom_area", samples=None):
  """
  Get a matrix of consensus features with samples as columns and consensus features as rows.
  Highly optimized implementation using vectorized Polars operations.
-
+
  Parameters:
  quant (str): Quantification method column name (default: "chrom_area")
  samples: Sample identifier(s) to include. Can be:
  - None: include all samples (default)
- - int: single sample_uid
+ - int: single sample_uid
  - str: single sample_name
  - list: multiple sample_uids or sample_names
  """
  import polars as pl
-
+
  if quant not in self.features_df.columns:
  self.logger.error(f"Quantification method {quant} not found in features_df.")
  return None
 
  # Get sample_uids to include in the matrix
  sample_uids = self._get_samples_uids(samples) if samples is not None else self.samples_df["sample_uid"].to_list()
-
+
  if not sample_uids:
  self.logger.warning("No valid samples found for consensus matrix")
  return pl.DataFrame()
@@ -556,44 +551,31 @@ def get_consensus_matrix(self, quant="chrom_area", samples=None):
  features_filtered = self.features_df.filter(pl.col("sample_uid").is_in(sample_uids))
  samples_filtered = self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids))
  consensus_mapping_filtered = self.consensus_mapping_df.filter(pl.col("sample_uid").is_in(sample_uids))
-
+
  # Join operations to combine data efficiently
  # 1. Join consensus mapping with features to get quantification values
- consensus_with_values = (
- consensus_mapping_filtered
- .join(features_filtered.select(["feature_uid", "sample_uid", quant]),
- on=["feature_uid", "sample_uid"], how="left")
- .with_columns(pl.col(quant).fill_null(0))
- )
-
+ consensus_with_values = consensus_mapping_filtered.join(
+ features_filtered.select(["feature_uid", "sample_uid", quant]), on=["feature_uid", "sample_uid"], how="left"
+ ).with_columns(pl.col(quant).fill_null(0))
+
  # 2. Join with samples to get sample names
- consensus_with_names = (
- consensus_with_values
- .join(samples_filtered.select(["sample_uid", "sample_name"]),
- on="sample_uid", how="left")
+ consensus_with_names = consensus_with_values.join(
+ samples_filtered.select(["sample_uid", "sample_name"]), on="sample_uid", how="left"
  )
-
+
  # 3. Group by consensus_uid and sample_name, taking max value per group
- aggregated = (
- consensus_with_names
- .group_by(["consensus_uid", "sample_name"])
- .agg(pl.col(quant).max().alias("value"))
- )
-
+ aggregated = consensus_with_names.group_by(["consensus_uid", "sample_name"]).agg(pl.col(quant).max().alias("value"))
+
  # 4. Pivot to create the matrix format
- matrix_df = (
- aggregated
- .pivot(on="sample_name", index="consensus_uid", values="value")
- .fill_null(0)
- )
-
+ matrix_df = aggregated.pivot(on="sample_name", index="consensus_uid", values="value").fill_null(0)
+
  # 5. Round numeric columns and ensure proper types
  numeric_cols = [col for col in matrix_df.columns if col != "consensus_uid"]
  matrix_df = matrix_df.with_columns([
  pl.col("consensus_uid").cast(pl.UInt64),
- *[pl.col(col).round(0) for col in numeric_cols]
+ *[pl.col(col).round(0) for col in numeric_cols],
  ])
-
+
  return matrix_df
 
 
@@ -601,26 +583,26 @@ def get_gaps_matrix(self, uids=None, samples=None):
  """
  Get a matrix of gaps between consensus features with samples as columns and consensus features as rows.
  Optimized implementation that builds the gaps matrix directly without calling get_consensus_matrix().
-
+
  Parameters:
  uids: Consensus UID(s) to include. If None, includes all consensus features.
  samples: Sample identifier(s) to include. If None, includes all samples.
  Can be int (sample_uid), str (sample_name), or list of either.
-
+
  Returns:
  pl.DataFrame: Gaps matrix with consensus_uid as first column and samples as other columns.
  Values are 1 (detected) or 0 (missing/gap).
  """
  import polars as pl
-
+
  if self.consensus_df is None or self.consensus_df.is_empty():
  self.logger.error("No consensus found.")
  return None
-
+
  if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
  self.logger.error("No consensus mapping found.")
  return None
-
+
  if self.features_df is None or self.features_df.is_empty():
  self.logger.error("No features found.")
  return None
@@ -628,7 +610,7 @@ def get_gaps_matrix(self, uids=None, samples=None):
  # Get consensus UIDs and sample UIDs to include
  uids = self._get_consensus_uids(uids)
  sample_uids = self._get_samples_uids(samples) if samples is not None else self.samples_df["sample_uid"].to_list()
-
+
  if not uids or not sample_uids:
  self.logger.warning("No valid consensus features or samples found for gaps matrix")
  return pl.DataFrame()
@@ -642,7 +624,7 @@ def get_gaps_matrix(self, uids=None, samples=None):
  # Skip filled features (gaps should only show original detections)
  if row.get("filled", False):
  continue
-
+
  feature_uid = row["feature_uid"]
  # If feature exists and is not filled, it's detected (1)
  feature_detection[(feature_uid, sample_uid)] = 1
@@ -651,7 +633,8 @@ def get_gaps_matrix(self, uids=None, samples=None):
  matrix_dict = {}
  sample_mapping = dict(
  self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids))
- .select(["sample_uid", "sample_name"]).iter_rows(),
+ .select(["sample_uid", "sample_name"])
+ .iter_rows(),
  )
 
  for row in self.consensus_mapping_df.iter_rows(named=True):
@@ -732,7 +715,7 @@ def get_gaps_stats(self, uids=None):
  def get_consensus_matches(self, uids=None, filled=True):
  """
  Get feature matches for consensus UIDs with optimized join operation.
-
+
  Parameters:
  uids: Consensus UID(s) to get matches for. Can be:
  - None: get matches for all consensus features
@@ -740,50 +723,47 @@ def get_consensus_matches(self, uids=None, filled=True):
  - list: multiple consensus UIDs
  filled (bool): Whether to include filled rows (True) or exclude them (False).
  Default is True to maintain backward compatibility.
-
+
  Returns:
  pl.DataFrame: Feature matches for the specified consensus UIDs
  """
  # Handle single int by converting to list
  if isinstance(uids, int):
  uids = [uids]
-
+
  uids = self._get_consensus_uids(uids)
-
+
  if not uids:
  return pl.DataFrame()
-
+
  # Early validation checks
  if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
  self.logger.warning("No consensus mapping data available")
  return pl.DataFrame()
-
+
  if self.features_df is None or self.features_df.is_empty():
  self.logger.warning("No feature data available")
  return pl.DataFrame()
-
+
  # Build the query with optional filled filter
  features_query = self.features_df.lazy()
-
+
  # Apply filled filter if specified
  if not filled and "filled" in self.features_df.columns:
  features_query = features_query.filter(~pl.col("filled"))
-
+
  # Optimized single-pass operation using join instead of two separate filters
  # This avoids creating intermediate Python lists and leverages Polars' optimized joins
  matches = (
- features_query
- .join(
- self.consensus_mapping_df
- .lazy()
+ features_query.join(
+ self.consensus_mapping_df.lazy()
  .filter(pl.col("consensus_uid").is_in(uids))
  .select("feature_uid"), # Only select what we need for the join
  on="feature_uid",
- how="inner"
- )
- .collect(streaming=True) # Use streaming for memory efficiency with large datasets
+ how="inner",
+ ).collect(streaming=True) # Use streaming for memory efficiency with large datasets
  )
-
+
  return matches
 
 
@@ -795,34 +775,34 @@ def get_consensus_matches(self, uids=None, filled=True):
  def consensus_reset(self):
  """
  Reset consensus data by clearing consensus DataFrames and removing filled features.
-
+
  This function:
  1. Sets consensus_df, consensus_ms2, consensus_mapping_df, id_df to empty pl.DataFrame()
  2. Removes all filled features from features_df
  3. Removes relevant operations from history (merge, integrate, find_ms2, fill, identify)
  4. Logs the number of features removed
-
+
  This effectively undoes the merge() operation and any gap-filling.
  """
  self.logger.debug("Resetting consensus data.")
-
+
  # Reset consensus DataFrames to empty
  self.consensus_df = pl.DataFrame()
- self.consensus_ms2 = pl.DataFrame()
+ self.consensus_ms2 = pl.DataFrame()
  self.consensus_mapping_df = pl.DataFrame()
  self.id_df = pl.DataFrame()
-
+
  # Remove filled features from features_df
  if self.features_df is None:
  self.logger.warning("No features found.")
  return
-
+
  l1 = len(self.features_df)
-
+
  # Filter out filled features (keep only non-filled features)
  if "filled" in self.features_df.columns:
  self.features_df = self.features_df.filter(~pl.col("filled") | pl.col("filled").is_null())
-
+
  # Remove consensus-related operations from history
  keys_to_remove = ["merge", "integrate", "integrate_chrom", "find_ms2", "fill", "fill_single", "identify"]
  history_removed_count = 0
@@ -832,7 +812,7 @@ def consensus_reset(self):
  del self.history[key]
  history_removed_count += 1
  self.logger.debug(f"Removed '{key}' from history")
-
+
  removed_count = l1 - len(self.features_df)
  self.logger.info(
  f"Reset consensus data. Consensus DataFrames cleared. Features removed: {removed_count}. History entries removed: {history_removed_count}",
@@ -1049,13 +1029,13 @@ def get_orphans(self):
  def get_sample_stats(self):
  """
  Get statistics for all samples in the study.
-
+
  Returns:
  pl.DataFrame: DataFrame with the following columns:
  - sample_uid: Sample unique identifier
  - num_features: Total number of features per sample
  - num_ms1: Number of MS1 features per sample
- - num_ms2: Number of MS2 features per sample
+ - num_ms2: Number of MS2 features per sample
  - num_linked_ms1: Number of non-filled features present in consensus_mapping_df
  - num_orphans: Number of non-filled features not present in consensus_mapping_df
  - max_rt_correction: Maximum RT correction applied
@@ -1065,19 +1045,19 @@ def get_sample_stats(self):
  if self.samples_df is None or self.samples_df.is_empty():
  self.logger.warning("No samples found in study.")
  return pl.DataFrame()
-
+
  if self.features_df is None or self.features_df.is_empty():
  self.logger.warning("No features found in study.")
  return pl.DataFrame()
-
+
  # Get base sample information
  sample_uids = self.samples_df["sample_uid"].to_list()
  stats_data = []
-
+
  for sample_uid in sample_uids:
  # Filter features for this sample
  sample_features = self.features_df.filter(pl.col("sample_uid") == sample_uid)
-
+
  if sample_features.is_empty():
  # Sample has no features
  stats_data.append({
@@ -1089,66 +1069,60 @@ def get_sample_stats(self):
  "num_orphans": 0,
  "max_rt_correction": None,
  "average_rt_correction": None,
- "num_linked_ms2": 0
+ "num_linked_ms2": 0,
  })
  continue
-
+
  # Basic feature counts
  num_features = len(sample_features)
-
+
  # Count MS1 and MS2 features
  # Assume features with ms_level=1 or missing ms_level are MS1
- num_ms1 = sample_features.filter(
- pl.col("ms_level").is_null() | (pl.col("ms_level") == 1)
- ).height if "ms_level" in sample_features.columns else num_features
-
- num_ms2 = sample_features.filter(
- pl.col("ms_level") == 2
- ).height if "ms_level" in sample_features.columns else 0
-
+ num_ms1 = (
+ sample_features.filter(pl.col("ms_level").is_null() | (pl.col("ms_level") == 1)).height
+ if "ms_level" in sample_features.columns
+ else num_features
+ )
+
+ num_ms2 = sample_features.filter(pl.col("ms_level") == 2).height if "ms_level" in sample_features.columns else 0
+
  # Get non-filled features for this sample
  if "filled" in sample_features.columns:
  non_filled_features = sample_features.filter(~pl.col("filled") | pl.col("filled").is_null())
  else:
  non_filled_features = sample_features
-
+
  # Count linked MS1 features (non-filled and present in consensus_mapping_df)
  num_linked_ms1 = 0
  if not self.consensus_mapping_df.is_empty() and not non_filled_features.is_empty():
- linked_feature_uids = self.consensus_mapping_df.filter(
- pl.col("sample_uid") == sample_uid
- )["feature_uid"].to_list()
-
- num_linked_ms1 = non_filled_features.filter(
- pl.col("feature_uid").is_in(linked_feature_uids)
- ).height
-
+ linked_feature_uids = self.consensus_mapping_df.filter(pl.col("sample_uid") == sample_uid)[
+ "feature_uid"
+ ].to_list()
+
+ num_linked_ms1 = non_filled_features.filter(pl.col("feature_uid").is_in(linked_feature_uids)).height
+
  # Count orphan features (non-filled and NOT present in consensus_mapping_df)
  num_orphans = len(non_filled_features) - num_linked_ms1
-
+
  # Calculate RT correction statistics
  max_rt_correction = None
  average_rt_correction = None
-
+
  if "rt" in sample_features.columns and "rt_original" in sample_features.columns:
  rt_corrections = sample_features.with_columns(
  (pl.col("rt") - pl.col("rt_original")).alias("rt_correction")
- ).filter(
- pl.col("rt_correction").is_not_null()
- )["rt_correction"]
-
+ ).filter(pl.col("rt_correction").is_not_null())["rt_correction"]
+
  if not rt_corrections.is_empty():
  max_rt_correction = rt_corrections.abs().max()
  average_rt_correction = rt_corrections.abs().mean()
-
+
  # Count linked MS2 spectra from consensus_ms2_df
  num_linked_ms2 = 0
- if hasattr(self, 'consensus_ms2') and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
+ if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
  if "sample_uid" in self.consensus_ms2.columns:
- num_linked_ms2 = self.consensus_ms2.filter(
- pl.col("sample_uid") == sample_uid
- ).height
-
+ num_linked_ms2 = self.consensus_ms2.filter(pl.col("sample_uid") == sample_uid).height
+
  stats_data.append({
  "sample_uid": sample_uid,
  "num_features": num_features,
@@ -1158,9 +1132,9 @@ def get_sample_stats(self):
  "num_orphans": num_orphans,
  "max_rt_correction": max_rt_correction,
  "average_rt_correction": average_rt_correction,
- "num_linked_ms2": num_linked_ms2
+ "num_linked_ms2": num_linked_ms2,
  })
-
+
  # Create DataFrame with proper schema
  return pl.DataFrame(
  stats_data,
@@ -1173,15 +1147,15 @@ def get_sample_stats(self):
  "num_orphans": pl.UInt32,
  "max_rt_correction": pl.Float64,
  "average_rt_correction": pl.Float64,
- "num_linked_ms2": pl.UInt32
- }
+ "num_linked_ms2": pl.UInt32,
+ },
  )
 
 
  def get_consensus_stats(self):
  """
  Get key performance indicators for each consensus feature.
-
+
  Returns:
  pl.DataFrame: DataFrame with the following columns:
  - consensus_uid: Consensus unique identifier
@@ -1203,7 +1177,7 @@ def get_consensus_stats(self):
  """
  import polars as pl
  import numpy as np
-
+
  # Check if consensus_df exists and has data
  if self.consensus_df is None or self.consensus_df.is_empty():
  self.logger.error("No consensus data available. Run merge/find_consensus first.")
@@ -1215,134 +1189,146 @@ def get_consensus_stats(self):
1215
1189
  # Define specific columns to include in the exact order requested
1216
1190
  desired_columns = [
1217
1191
  "consensus_uid", # Include consensus_uid for identification
1218
- "rt",
1219
- "rt_delta_mean",
1220
- "mz",
1192
+ "rt",
1193
+ "rt_delta_mean",
1194
+ "mz",
1221
1195
  "mz_range", # mz_max-mz_min (will be calculated)
1222
1196
  "log10_inty_mean", # log10(inty_mean) (will be calculated)
1223
- "number_samples",
1224
- "number_ms2",
1225
- "charge_mean",
1226
- "quality",
1227
- "chrom_coherence_mean",
1228
- "chrom_height_scaled_mean",
1229
- "chrom_prominence_scaled_mean"
1197
+ "number_samples",
1198
+ "number_ms2",
1199
+ "charge_mean",
1200
+ "quality",
1201
+ "chrom_coherence_mean",
1202
+ "chrom_height_scaled_mean",
1203
+ "chrom_prominence_scaled_mean",
1230
1204
  ]
1231
-
1205
+
1232
1206
  # Calculate derived columns if they don't exist
1233
1207
  if "mz_range" not in data_df.columns and "mz_max" in data_df.columns and "mz_min" in data_df.columns:
1234
1208
  data_df = data_df.with_columns((pl.col("mz_max") - pl.col("mz_min")).alias("mz_range"))
1235
-
1209
+
1236
1210
  if "log10_inty_mean" not in data_df.columns and "inty_mean" in data_df.columns:
1237
1211
  data_df = data_df.with_columns(pl.col("inty_mean").log10().alias("log10_inty_mean"))
1238
-
1212
+
1239
1213
  # Filter to only include columns that exist in the dataframe, preserving order
1240
1214
  available_columns = [col for col in desired_columns if col in data_df.columns]
1241
-
1215
+
1242
1216
  if len(available_columns) <= 1: # Only consensus_uid would be 1
1243
- self.logger.error(f"None of the requested consensus statistics columns were found. Available columns: {list(data_df.columns)}")
1217
+ self.logger.error(
1218
+ f"None of the requested consensus statistics columns were found. Available columns: {list(data_df.columns)}"
1219
+ )
1244
1220
  return pl.DataFrame()
1245
1221
 
1246
1222
  self.logger.debug(f"Creating consensus stats DataFrame with {len(available_columns)} columns: {available_columns}")
1247
1223
 
1248
1224
  # Get base result DataFrame with selected columns
1249
1225
  result_df = data_df.select(available_columns)
1250
-
1226
+
1251
1227
  # Add QC-related columns
1252
1228
  try:
1253
1229
  # Identify QC and blank samples based on naming patterns
1254
1230
  all_sample_names = self.samples_df["sample_name"].to_list()
1255
-
1231
+
1256
1232
  # Define patterns for QC and blank identification
1257
1233
  qc_patterns = ["qc", "QC", "quality", "Quality", "control", "Control"]
1258
1234
  blank_patterns = ["blank", "Blank", "BLANK", "blk", "BLK"]
1259
-
1235
+
1260
1236
  # Get QC and blank sample names
1261
1237
  qc_sample_names = [name for name in all_sample_names if any(pattern in name for pattern in qc_patterns)]
1262
1238
  blank_sample_names = [name for name in all_sample_names if any(pattern in name for pattern in blank_patterns)]
1263
-
1239
+
1264
1240
  self.logger.debug(f"Found {len(qc_sample_names)} QC samples and {len(blank_sample_names)} blank samples")
1265
-
1241
+
1266
1242
  # Initialize QC columns with null values
1267
1243
  qc_ratio_values = [None] * len(result_df)
1268
- qc_cv_values = [None] * len(result_df)
1244
+ qc_cv_values = [None] * len(result_df)
1269
1245
  qc_to_blank_values = [None] * len(result_df)
1270
-
1246
+
1271
1247
  if len(qc_sample_names) > 0:
1272
1248
  # Calculate QC metrics using optimized approach - get only QC+blank data
1273
1249
  self.logger.debug("Fetching optimized consensus matrices for QC calculations...")
1274
-
1250
+
1275
1251
  # Get QC consensus matrix (only QC samples)
1276
1252
  qc_consensus_matrix = self.get_consensus_matrix(samples=qc_sample_names)
1277
-
1253
+
1278
1254
  # Get blank consensus matrix (only blank samples) if blanks exist
1279
1255
  blank_consensus_matrix = None
1280
1256
  if len(blank_sample_names) > 0:
1281
1257
  blank_consensus_matrix = self.get_consensus_matrix(samples=blank_sample_names)
1282
-
1258
+
1283
1259
  if qc_consensus_matrix is not None and not qc_consensus_matrix.is_empty():
1284
1260
  available_qc_cols = [col for col in qc_consensus_matrix.columns if col != "consensus_uid"]
1285
1261
  self.logger.debug(f"Found {len(available_qc_cols)} QC columns in optimized QC matrix")
1286
-
1262
+
1287
1263
  # 2. QC CV: Calculate CV for QC samples
1288
1264
  if len(available_qc_cols) > 0:
1289
1265
  self.logger.debug("Calculating QC CV...")
1290
1266
  try:
1291
1267
  # Calculate CV (coefficient of variation) for QC samples
1292
1268
  qc_data = qc_consensus_matrix.select(["consensus_uid"] + available_qc_cols)
1293
-
1269
+
1294
1270
  # Calculate mean and std for each row across QC columns
1295
- qc_stats = qc_data.with_columns([
1296
- pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
1297
- ]).with_columns([
1298
- pl.col("qc_values").list.mean().alias("qc_mean"),
1299
- pl.col("qc_values").list.std().alias("qc_std")
1300
- ]).with_columns(
1301
- # CV = std / mean (NOT multiplied by 100 to keep between 0-1)
1302
- pl.when(pl.col("qc_mean") > 0)
1303
- .then(pl.col("qc_std") / pl.col("qc_mean"))
1304
- .otherwise(None)
1305
- .alias("qc_cv")
1271
+ qc_stats = (
1272
+ qc_data.with_columns([
1273
+ pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
1274
+ ])
1275
+ .with_columns([
1276
+ pl.col("qc_values").list.mean().alias("qc_mean"),
1277
+ pl.col("qc_values").list.std().alias("qc_std"),
1278
+ ])
1279
+ .with_columns(
1280
+ # CV = std / mean (NOT multiplied by 100 to keep between 0-1)
1281
+ pl.when(pl.col("qc_mean") > 0)
1282
+ .then(pl.col("qc_std") / pl.col("qc_mean"))
1283
+ .otherwise(None)
1284
+ .alias("qc_cv")
1285
+ )
1306
1286
  )
1307
-
1287
+
1308
1288
  # Join with result DataFrame
1309
1289
  result_df = result_df.join(
1310
- qc_stats.select(["consensus_uid", "qc_cv"]),
1311
- on="consensus_uid",
1312
- how="left"
1290
+ qc_stats.select(["consensus_uid", "qc_cv"]), on="consensus_uid", how="left"
1313
1291
  )
1314
1292
  qc_cv_values = None # Indicate we successfully added the column
1315
-
1293
+
1316
1294
  except Exception as e:
1317
1295
  self.logger.debug(f"Could not calculate QC CV: {e}")
1318
-
1296
+
1319
1297
  # 3. QC to blank ratio: Compare average QC to average blank intensity
1320
- if len(available_qc_cols) > 0 and blank_consensus_matrix is not None and not blank_consensus_matrix.is_empty():
1298
+ if (
1299
+ len(available_qc_cols) > 0
1300
+ and blank_consensus_matrix is not None
1301
+ and not blank_consensus_matrix.is_empty()
1302
+ ):
1321
1303
  available_blank_cols = [col for col in blank_consensus_matrix.columns if col != "consensus_uid"]
1322
- self.logger.debug(f"Calculating QC to blank ratio with {len(available_blank_cols)} blank columns...")
1323
-
1304
+ self.logger.debug(
1305
+ f"Calculating QC to blank ratio with {len(available_blank_cols)} blank columns..."
1306
+ )
1307
+
1324
1308
  if len(available_blank_cols) > 0:
1325
1309
  try:
1326
1310
  # Calculate average intensity for QC samples
1327
- qc_averages = qc_data.with_columns([
1328
- pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
1329
- ]).with_columns(
1330
- pl.col("qc_values").list.mean().alias("qc_avg")
1331
- ).select(["consensus_uid", "qc_avg"])
1332
-
1311
+ qc_averages = (
1312
+ qc_data.with_columns([
1313
+ pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
1314
+ ])
1315
+ .with_columns(pl.col("qc_values").list.mean().alias("qc_avg"))
1316
+ .select(["consensus_uid", "qc_avg"])
1317
+ )
1318
+
1333
1319
  # Calculate average intensity for blank samples
1334
1320
  blank_data = blank_consensus_matrix.select(["consensus_uid"] + available_blank_cols)
1335
- blank_averages = blank_data.with_columns([
1336
- pl.concat_list([pl.col(col) for col in available_blank_cols]).alias("blank_values")
1337
- ]).with_columns(
1338
- pl.col("blank_values").list.mean().alias("blank_avg")
1339
- ).select(["consensus_uid", "blank_avg"])
1340
-
1321
+ blank_averages = (
1322
+ blank_data.with_columns([
1323
+ pl.concat_list([pl.col(col) for col in available_blank_cols]).alias("blank_values")
1324
+ ])
1325
+ .with_columns(pl.col("blank_values").list.mean().alias("blank_avg"))
1326
+ .select(["consensus_uid", "blank_avg"])
1327
+ )
1328
+
1341
1329
  # Join QC and blank averages and calculate ratio
1342
1330
  qc_blank_ratios = qc_averages.join(
1343
- blank_averages,
1344
- on="consensus_uid",
1345
- how="left"
1331
+ blank_averages, on="consensus_uid", how="left"
1346
1332
  ).with_columns(
1347
1333
  # Ratio = qc_avg / blank_avg, but only where blank_avg > 0
1348
1334
  pl.when(pl.col("blank_avg") > 0)
@@ -1350,57 +1336,53 @@ def get_consensus_stats(self):
1350
1336
  .otherwise(None)
1351
1337
  .alias("qc_to_blank")
1352
1338
  )
1353
-
1339
+
1354
1340
  # Join with result DataFrame
1355
1341
  result_df = result_df.join(
1356
- qc_blank_ratios.select(["consensus_uid", "qc_to_blank"]),
1357
- on="consensus_uid",
1358
- how="left"
1342
+ qc_blank_ratios.select(["consensus_uid", "qc_to_blank"]), on="consensus_uid", how="left"
1359
1343
  )
1360
1344
  qc_to_blank_values = None # Indicate we successfully added the column
1361
-
1345
+
1362
1346
  except Exception as e:
1363
1347
  self.logger.debug(f"Could not calculate QC to blank ratio: {e}")
1364
-
1348
+
1365
1349
  # 1. QC ratio: Get optimized gaps matrix for QC samples only
1366
1350
  self.logger.debug("Calculating QC detection ratio with optimized gaps matrix...")
1367
1351
  try:
1368
1352
  # Use optimized get_gaps_matrix with QC samples filtering for faster performance
1369
1353
  qc_gaps_matrix = self.get_gaps_matrix(samples=qc_sample_names)
1370
-
1354
+
1371
1355
  if qc_gaps_matrix is not None and not qc_gaps_matrix.is_empty():
1372
1356
  # Get QC columns (should be all columns except consensus_uid since we filtered)
1373
1357
  available_qc_cols_gaps = [col for col in qc_gaps_matrix.columns if col != "consensus_uid"]
1374
1358
  self.logger.debug(f"Found {len(available_qc_cols_gaps)} QC columns in optimized gaps matrix")
1375
-
1359
+
1376
1360
  if len(available_qc_cols_gaps) > 0:
1377
1361
  # Calculate QC detection ratio for each consensus feature
1378
1362
  qc_detection = qc_gaps_matrix.select(["consensus_uid"] + available_qc_cols_gaps)
1379
-
1363
+
1380
1364
  # Data should already be properly typed from get_gaps_matrix, but ensure consistency
1381
1365
  for col in available_qc_cols_gaps:
1382
- qc_detection = qc_detection.with_columns(
1383
- pl.col(col).fill_null(0).cast(pl.Int8).alias(col)
1384
- )
1385
-
1366
+ qc_detection = qc_detection.with_columns(pl.col(col).fill_null(0).cast(pl.Int8).alias(col))
1367
+
1386
1368
  # Calculate ratio (sum of detections / number of QC samples)
1387
1369
  qc_ratios = qc_detection.with_columns(
1388
1370
  pl.concat_list([pl.col(col) for col in available_qc_cols_gaps]).alias("qc_detections")
1389
1371
  ).with_columns(
1390
- (pl.col("qc_detections").list.sum().cast(pl.Float64) / len(available_qc_cols_gaps)).alias("qc_ratio")
1372
+ (pl.col("qc_detections").list.sum().cast(pl.Float64) / len(available_qc_cols_gaps)).alias(
1373
+ "qc_ratio"
1374
+ )
1391
1375
  )
1392
-
1376
+
1393
1377
  # Join with result DataFrame
1394
1378
  result_df = result_df.join(
1395
- qc_ratios.select(["consensus_uid", "qc_ratio"]),
1396
- on="consensus_uid",
1397
- how="left"
1379
+ qc_ratios.select(["consensus_uid", "qc_ratio"]), on="consensus_uid", how="left"
1398
1380
  )
1399
1381
  qc_ratio_values = None # Indicate we successfully added the column
1400
-
1382
+
1401
1383
  except Exception as e:
1402
1384
  self.logger.debug(f"Could not calculate QC ratio: {e}")
1403
-
1385
+
1404
1386
  # Add null columns for any QC metrics that couldn't be calculated
1405
1387
  # Add null columns for any QC metrics that couldn't be calculated
1406
1388
  if qc_ratio_values is not None:
@@ -1409,16 +1391,16 @@ def get_consensus_stats(self):
  result_df = result_df.with_columns(pl.lit(None, dtype=pl.Float64).alias("qc_cv"))
  if qc_to_blank_values is not None:
  result_df = result_df.with_columns(pl.lit(None, dtype=pl.Float64).alias("qc_to_blank"))
-
+
  except Exception as e:
  self.logger.warning(f"Error calculating QC metrics: {e}")
  # Add null columns if QC calculation fails
  result_df = result_df.with_columns([
  pl.lit(None, dtype=pl.Float64).alias("qc_ratio"),
  pl.lit(None, dtype=pl.Float64).alias("qc_cv"),
- pl.lit(None, dtype=pl.Float64).alias("qc_to_blank")
+ pl.lit(None, dtype=pl.Float64).alias("qc_to_blank"),
  ])
-
+
  return result_df
 
 
@@ -1565,9 +1547,7 @@ def restore_features(self, samples=None, maps=False):
  continue
 
  # Check which columns are actually available in the sample
- available_columns = [
- col for col in columns_to_update if col in sample.features_df.columns
- ]
+ available_columns = [col for col in columns_to_update if col in sample.features_df.columns]
  if not available_columns:
  self.logger.debug(f"No target columns found in sample {sample_name}")
  continue
@@ -1590,9 +1570,7 @@ def restore_features(self, samples=None, maps=False):
  original_dtype = self.features_df[col].dtype
 
  # Update the specific row and column, preserving dtype
- mask = (pl.col("feature_uid") == feature_uid) & (
- pl.col("sample_uid") == sample_uid
- )
+ mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
 
  # Handle object columns (like Chromatogram) differently
  if original_dtype == pl.Object:
@@ -1730,9 +1708,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
  feature_uid = study_feature_mapping[key]
 
  # Update only the chrom column
- mask = (pl.col("feature_uid") == feature_uid) & (
- pl.col("sample_uid") == sample_uid
- )
+ mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
  self.features_df = self.features_df.with_columns(
  pl.when(mask)
  .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
@@ -1807,11 +1783,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
  sample = Sample(log_level="ERROR")
  sample._load_sample5(sample_path, map=False)
 
- if (
- not hasattr(sample, "ms1_df")
- or sample.ms1_df is None
- or sample.ms1_df.is_empty()
- ):
+ if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
  continue
 
  # Process each missing feature
@@ -1920,9 +1892,7 @@ def compress_ms2(self, max_replicates=5):
  # Handle None values by treating them as 0
  self.consensus_ms2 = self.consensus_ms2.with_columns(
  [
- (
- pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)
- ).alias("ranking_score"),
+ (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)).alias("ranking_score"),
  ],
  )
 
@@ -2259,57 +2229,86 @@ def features_select(
2259
2229
  return pl.DataFrame()
2260
2230
 
2261
2231
  # Early return optimization
2262
- filter_params = [mz, rt, inty, sample_uid, sample_name, consensus_uid,
2263
- feature_uid, filled, quality, chrom_coherence,
2264
- chrom_prominence, chrom_prominence_scaled, chrom_height_scaled]
2265
-
2232
+ filter_params = [
2233
+ mz,
2234
+ rt,
2235
+ inty,
2236
+ sample_uid,
2237
+ sample_name,
2238
+ consensus_uid,
2239
+ feature_uid,
2240
+ filled,
2241
+ quality,
2242
+ chrom_coherence,
2243
+ chrom_prominence,
2244
+ chrom_prominence_scaled,
2245
+ chrom_height_scaled,
2246
+ ]
2247
+
2266
2248
  if all(param is None for param in filter_params):
2267
2249
  return self.features_df.clone()
2268
2250
 
2269
2251
  import time
2252
+
2270
2253
  start_time = time.perf_counter()
2271
2254
  initial_count = len(self.features_df)
2272
2255
 
2273
2256
  # Build optimized filter expression
2274
2257
  filter_expr = _build_optimized_filter_expression(
2275
- self, mz, rt, inty, sample_uid, sample_name, consensus_uid,
2276
- feature_uid, filled, quality, chrom_coherence,
2277
- chrom_prominence, chrom_prominence_scaled, chrom_height_scaled
2258
+ self,
2259
+ mz,
2260
+ rt,
2261
+ inty,
2262
+ sample_uid,
2263
+ sample_name,
2264
+ consensus_uid,
2265
+ feature_uid,
2266
+ filled,
2267
+ quality,
2268
+ chrom_coherence,
2269
+ chrom_prominence,
2270
+ chrom_prominence_scaled,
2271
+ chrom_height_scaled,
2278
2272
  )
2279
-
2273
+
2280
2274
  if filter_expr is None:
2281
2275
  return pl.DataFrame()
2282
-
2276
+
2283
2277
  # Apply filter with optimized execution strategy
2284
2278
  if use_lazy_streaming and initial_count > chunk_size:
2285
2279
  result = _apply_chunked_select(self, filter_expr, chunk_size)
2286
2280
  else:
2287
- result = (
2288
- self.features_df
2289
- .lazy()
2290
- .filter(filter_expr)
2291
- .collect(streaming=use_lazy_streaming)
2292
- )
2293
-
2281
+ result = self.features_df.lazy().filter(filter_expr).collect(streaming=use_lazy_streaming)
2282
+
2294
2283
  # Log performance
2295
2284
  elapsed_time = time.perf_counter() - start_time
2296
2285
  final_count = len(result)
2297
2286
  removed_count = initial_count - final_count
2298
-
2287
+
2299
2288
  if final_count == 0:
2300
2289
  self.logger.warning("No features remaining after applying selection criteria.")
2301
2290
  else:
2302
- self.logger.debug(
2303
- f"Selected features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s"
2304
- )
2305
-
2291
+ self.logger.debug(f"Selected features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s")
2292
+
2306
2293
  return result
2307
2294
 
2308
2295
 
2309
- def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_name,
2310
- consensus_uid, feature_uid, filled, quality,
2311
- chrom_coherence, chrom_prominence,
2312
- chrom_prominence_scaled, chrom_height_scaled):
2296
+ def _build_optimized_filter_expression(
2297
+ self,
2298
+ mz,
2299
+ rt,
2300
+ inty,
2301
+ sample_uid,
2302
+ sample_name,
2303
+ consensus_uid,
2304
+ feature_uid,
2305
+ filled,
2306
+ quality,
2307
+ chrom_coherence,
2308
+ chrom_prominence,
2309
+ chrom_prominence_scaled,
2310
+ chrom_height_scaled,
2311
+ ):
2313
2312
  """
2314
2313
  Build optimized filter expression with efficient column checking and expression combining.
2315
2314
  """
@@ -2317,7 +2316,7 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
2317
2316
  available_columns = set(self.features_df.columns)
2318
2317
  filter_conditions = []
2319
2318
  warnings = []
2320
-
2319
+
2321
2320
  # Build filter conditions with optimized expressions
2322
2321
  if mz is not None:
2323
2322
  if isinstance(mz, tuple) and len(mz) == 2:
@@ -2422,7 +2421,9 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
2422
2421
  if "chrom_coherence" in available_columns:
2423
2422
  if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
2424
2423
  min_coherence, max_coherence = chrom_coherence
2425
- filter_conditions.append(pl.col("chrom_coherence").is_between(min_coherence, max_coherence, closed="both"))
2424
+ filter_conditions.append(
2425
+ pl.col("chrom_coherence").is_between(min_coherence, max_coherence, closed="both")
2426
+ )
2426
2427
  else:
2427
2428
  filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
2428
2429
  else:
@@ -2433,7 +2434,9 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
2433
2434
  if "chrom_prominence" in available_columns:
2434
2435
  if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
2435
2436
  min_prominence, max_prominence = chrom_prominence
2436
- filter_conditions.append(pl.col("chrom_prominence").is_between(min_prominence, max_prominence, closed="both"))
2437
+ filter_conditions.append(
2438
+ pl.col("chrom_prominence").is_between(min_prominence, max_prominence, closed="both")
2439
+ )
2437
2440
  else:
2438
2441
  filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
2439
2442
  else:
@@ -2445,7 +2448,10 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
2445
2448
  if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
2446
2449
  min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
2447
2450
  filter_conditions.append(
2448
- pl.col("chrom_prominence_scaled").is_between(min_prominence_scaled, max_prominence_scaled, closed="both"))
2451
+ pl.col("chrom_prominence_scaled").is_between(
2452
+ min_prominence_scaled, max_prominence_scaled, closed="both"
2453
+ )
2454
+ )
2449
2455
  else:
2450
2456
  filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
2451
2457
  else:
@@ -2457,7 +2463,8 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
2457
2463
  if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
2458
2464
  min_height_scaled, max_height_scaled = chrom_height_scaled
2459
2465
  filter_conditions.append(
2460
- pl.col("chrom_height_scaled").is_between(min_height_scaled, max_height_scaled, closed="both"))
2466
+ pl.col("chrom_height_scaled").is_between(min_height_scaled, max_height_scaled, closed="both")
2467
+ )
2461
2468
  else:
2462
2469
  filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
2463
2470
  else:
@@ -2470,12 +2477,13 @@ def _build_optimized_filter_expression(self, mz, rt, inty, sample_uid, sample_na
2470
2477
  # Combine all conditions efficiently
2471
2478
  if not filter_conditions:
2472
2479
  return None
2473
-
2480
+
2474
2481
  # Use reduce for efficient expression combination
2475
2482
  from functools import reduce
2476
2483
  import operator
2484
+
2477
2485
  combined_expr = reduce(operator.and_, filter_conditions)
2478
-
2486
+
2479
2487
  return combined_expr
2480
2488
 
2481
2489
 
@@ -2485,30 +2493,27 @@ def _apply_chunked_select(self, filter_expr, chunk_size: int):
2485
2493
  """
2486
2494
  total_features = len(self.features_df)
2487
2495
  num_chunks = (total_features + chunk_size - 1) // chunk_size
2488
-
2496
+
2489
2497
  self.logger.debug(f"Using chunked select with {num_chunks} chunks")
2490
-
2498
+
2491
2499
  filtered_chunks = []
2492
2500
  for i in range(num_chunks):
2493
2501
  start_idx = i * chunk_size
2494
2502
  end_idx = min((i + 1) * chunk_size, total_features)
2495
-
2503
+
2496
2504
  chunk_result = (
2497
- self.features_df
2498
- .lazy()
2499
- .slice(start_idx, end_idx - start_idx)
2500
- .filter(filter_expr)
2501
- .collect(streaming=True)
2505
+ self.features_df.lazy().slice(start_idx, end_idx - start_idx).filter(filter_expr).collect(streaming=True)
2502
2506
  )
2503
-
2507
+
2504
2508
  if not chunk_result.is_empty():
2505
2509
  filtered_chunks.append(chunk_result)
2506
-
2510
+
2507
2511
  if filtered_chunks:
2508
2512
  return pl.concat(filtered_chunks, how="vertical")
2509
2513
  else:
2510
2514
  return pl.DataFrame()
2511
2515
 
2516
+
2512
2517
  '''
2513
2518
  def features_select_benchmarked(
2514
2519
  self,
@@ -2604,19 +2609,14 @@ def monkey_patch_study():
2604
2609
  print("Patched Study.features_select with consolidated optimized implementation")
2605
2610
  '''
2606
2611
 
2607
- def features_filter(
2608
- self,
2609
- features,
2610
- chunk_size: int = 50000,
2611
- use_index_based: bool = True,
2612
- parallel: bool = True
2613
- ):
2612
+
2613
+ def features_filter(self, features, chunk_size: int = 50000, use_index_based: bool = True, parallel: bool = True):
2614
2614
  """
2615
2615
  Filter features_df by keeping only features that match the given criteria.
2616
2616
  This keeps only the specified features and removes all others.
2617
2617
 
2618
2618
  FULLY OPTIMIZED VERSION: Index-based filtering, chunked processing, and lazy evaluation.
2619
-
2619
+
2620
2620
  Performance improvements:
2621
2621
  - Index-based filtering using sorted arrays (O(n log n) instead of O(n²))
2622
2622
  - Chunked processing to handle large datasets without memory issues
@@ -2646,26 +2646,24 @@ def features_filter(
2646
2646
  return
2647
2647
 
2648
2648
  initial_count = len(self.features_df)
2649
-
2649
+
2650
2650
  # Extract feature UIDs efficiently
2651
2651
  feature_uids_to_keep = _extract_feature_uids_optimized(self, features)
2652
2652
  if not feature_uids_to_keep:
2653
2653
  self.logger.warning("No feature UIDs provided for filtering.")
2654
2654
  return
2655
-
2655
+
2656
2656
  # Choose optimal filtering strategy based on data size and characteristics
2657
2657
  if use_index_based and len(self.features_df) > 10000:
2658
2658
  _apply_index_based_filter(self, feature_uids_to_keep, chunk_size, parallel)
2659
2659
  else:
2660
2660
  _apply_standard_filter(self, feature_uids_to_keep)
2661
-
2661
+
2662
2662
  # Calculate results and log performance
2663
2663
  final_count = len(self.features_df)
2664
2664
  removed_count = initial_count - final_count
2665
-
2666
- self.logger.info(
2667
- f"Filtered features. Kept: {final_count:,}. Removed: {removed_count:,}."
2668
- )
2665
+
2666
+ self.logger.info(f"Filtered features. Kept: {final_count:,}. Removed: {removed_count:,}.")
2669
2667
 
2670
2668
 
2671
2669
  def _extract_feature_uids_optimized(self, features):
@@ -2679,13 +2677,13 @@ def _extract_feature_uids_optimized(self, features):
2679
2677
  return set()
2680
2678
  # Use polars native operations for efficiency
2681
2679
  return set(features.select("feature_uid").to_series().to_list())
2682
-
2680
+
2683
2681
  elif isinstance(features, (list, tuple)):
2684
2682
  return set(features) # Convert to set immediately for O(1) lookups
2685
-
2683
+
2686
2684
  elif isinstance(features, int):
2687
2685
  return {features}
2688
-
2686
+
2689
2687
  else:
2690
2688
  self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
2691
2689
  return set()
@@ -2694,7 +2692,7 @@ def _extract_feature_uids_optimized(self, features):
2694
2692
  def _apply_index_based_filter(self, feature_uids_to_keep, chunk_size: int, parallel: bool):
2695
2693
  """
2696
2694
  Apply index-based filtering with chunked processing and lazy evaluation.
2697
-
2695
+
2698
2696
  This method uses:
2699
2697
  1. Sorted arrays and binary search for O(log n) lookups
2700
2698
  2. Chunked processing to manage memory usage
@@ -2702,9 +2700,9 @@ def _apply_index_based_filter(self, feature_uids_to_keep, chunk_size: int, paral
2702
2700
  4. Hash-based set operations for optimal performance
2703
2701
  """
2704
2702
  self.logger.debug(f"Using index-based filtering with chunks of {chunk_size:,}")
2705
-
2703
+
2706
2704
  total_features = len(self.features_df)
2707
-
2705
+
2708
2706
  if total_features <= chunk_size:
2709
2707
  # Small dataset - process in single chunk with optimized operations
2710
2708
  _filter_single_chunk_optimized(self, feature_uids_to_keep)
@@ -2720,30 +2718,21 @@ def _filter_single_chunk_optimized(self, feature_uids_to_keep):
2720
2718
  """
2721
2719
  # Create boolean mask using hash-based set lookup (O(1) per element)
2722
2720
  filter_expr = pl.col("feature_uid").is_in(list(feature_uids_to_keep))
2723
-
2721
+
2724
2722
  # Apply filter using lazy evaluation with optimized execution
2725
2723
  self.features_df = (
2726
- self.features_df
2727
- .lazy()
2728
- .filter(filter_expr)
2729
- .collect(streaming=True) # Use streaming for memory efficiency
2724
+ self.features_df.lazy().filter(filter_expr).collect(streaming=True) # Use streaming for memory efficiency
2730
2725
  )
2731
-
2726
+
2732
2727
  # Apply same filter to consensus_mapping_df if it exists
2733
- if (self.consensus_mapping_df is not None and
2734
- not self.consensus_mapping_df.is_empty()):
2735
- self.consensus_mapping_df = (
2736
- self.consensus_mapping_df
2737
- .lazy()
2738
- .filter(filter_expr)
2739
- .collect(streaming=True)
2740
- )
2728
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
2729
+ self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_expr).collect(streaming=True)
2741
2730
 
2742
2731
 
2743
2732
  def _filter_chunked_lazy(self, feature_uids_to_keep, chunk_size: int, parallel: bool):
2744
2733
  """
2745
2734
  Chunked processing with lazy evaluation for large datasets.
2746
-
2735
+
2747
2736
  This approach:
2748
2737
  1. Processes data in manageable chunks to control memory usage
2749
2738
  2. Uses lazy evaluation to optimize query execution
@@ -2752,35 +2741,34 @@ def _filter_chunked_lazy(self, feature_uids_to_keep, chunk_size: int, parallel:
2752
2741
  """
2753
2742
  total_features = len(self.features_df)
2754
2743
  num_chunks = (total_features + chunk_size - 1) // chunk_size
2755
-
2744
+
2756
2745
  self.logger.debug(f"Processing {total_features:,} features in {num_chunks} chunks")
2757
-
2746
+
2758
2747
  # Process features_df in chunks using lazy evaluation
2759
2748
  filtered_chunks = []
2760
-
2749
+
2761
2750
  for i in range(num_chunks):
2762
2751
  start_idx = i * chunk_size
2763
2752
  end_idx = min((i + 1) * chunk_size, total_features)
2764
-
2753
+
2765
2754
  # Create lazy query for this chunk
2766
2755
  chunk_query = (
2767
- self.features_df
2768
- .lazy()
2756
+ self.features_df.lazy()
2769
2757
  .slice(start_idx, end_idx - start_idx)
2770
2758
  .filter(pl.col("feature_uid").is_in(list(feature_uids_to_keep)))
2771
2759
  )
2772
-
2760
+
2773
2761
  # Collect chunk with streaming for memory efficiency
2774
2762
  chunk_result = chunk_query.collect(streaming=True)
2775
2763
  if not chunk_result.is_empty():
2776
2764
  filtered_chunks.append(chunk_result)
2777
-
2765
+
2778
2766
  # Combine all filtered chunks efficiently
2779
2767
  if filtered_chunks:
2780
2768
  self.features_df = pl.concat(filtered_chunks, how="vertical")
2781
2769
  else:
2782
2770
  self.features_df = pl.DataFrame() # No features remain
2783
-
2771
+
2784
2772
  # Apply same chunked processing to consensus_mapping_df
2785
2773
  _filter_consensus_mapping_chunked(self, feature_uids_to_keep, chunk_size)
2786
2774
 
@@ -2789,17 +2777,15 @@ def _filter_consensus_mapping_chunked(self, feature_uids_to_keep, chunk_size: in
2789
2777
  """
2790
2778
  Apply chunked filtering to consensus_mapping_df with same optimization strategy.
2791
2779
  """
2792
- if (self.consensus_mapping_df is None or
2793
- self.consensus_mapping_df.is_empty()):
2780
+ if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
2794
2781
  return
2795
-
2782
+
2796
2783
  total_mappings = len(self.consensus_mapping_df)
2797
-
2784
+
2798
2785
  if total_mappings <= chunk_size:
2799
2786
  # Single chunk processing
2800
2787
  self.consensus_mapping_df = (
2801
- self.consensus_mapping_df
2802
- .lazy()
2788
+ self.consensus_mapping_df.lazy()
2803
2789
  .filter(pl.col("feature_uid").is_in(list(feature_uids_to_keep)))
2804
2790
  .collect(streaming=True)
2805
2791
  )
@@ -2807,22 +2793,21 @@ def _filter_consensus_mapping_chunked(self, feature_uids_to_keep, chunk_size: in
2807
2793
  # Multi-chunk processing
2808
2794
  num_chunks = (total_mappings + chunk_size - 1) // chunk_size
2809
2795
  filtered_chunks = []
2810
-
2796
+
2811
2797
  for i in range(num_chunks):
2812
2798
  start_idx = i * chunk_size
2813
2799
  end_idx = min((i + 1) * chunk_size, total_mappings)
2814
-
2800
+
2815
2801
  chunk_query = (
2816
- self.consensus_mapping_df
2817
- .lazy()
2802
+ self.consensus_mapping_df.lazy()
2818
2803
  .slice(start_idx, end_idx - start_idx)
2819
2804
  .filter(pl.col("feature_uid").is_in(list(feature_uids_to_keep)))
2820
2805
  )
2821
-
2806
+
2822
2807
  chunk_result = chunk_query.collect(streaming=True)
2823
2808
  if not chunk_result.is_empty():
2824
2809
  filtered_chunks.append(chunk_result)
2825
-
2810
+
2826
2811
  if filtered_chunks:
2827
2812
  self.consensus_mapping_df = pl.concat(filtered_chunks, how="vertical")
2828
2813
  else:
@@ -2835,24 +2820,13 @@ def _apply_standard_filter(self, feature_uids_to_keep):
2835
2820
  Still uses optimized set operations and lazy evaluation.
2836
2821
  """
2837
2822
  filter_expr = pl.col("feature_uid").is_in(list(feature_uids_to_keep))
2838
-
2823
+
2839
2824
  # Apply filter with lazy evaluation
2840
- self.features_df = (
2841
- self.features_df
2842
- .lazy()
2843
- .filter(filter_expr)
2844
- .collect(streaming=True)
2845
- )
2846
-
2825
+ self.features_df = self.features_df.lazy().filter(filter_expr).collect(streaming=True)
2826
+
2847
2827
  # Apply to consensus_mapping_df
2848
- if (self.consensus_mapping_df is not None and
2849
- not self.consensus_mapping_df.is_empty()):
2850
- self.consensus_mapping_df = (
2851
- self.consensus_mapping_df
2852
- .lazy()
2853
- .filter(filter_expr)
2854
- .collect(streaming=True)
2855
- )
2828
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
2829
+ self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_expr).collect(streaming=True)
2856
2830
 
2857
2831
 
2858
2832
  def features_delete(self, features):
@@ -2914,14 +2888,9 @@ def features_delete(self, features):
2914
2888
 
2915
2889
  # Apply filter to consensus_mapping_df if it exists - batch operation
2916
2890
  mapping_removed_count = 0
2917
- if (
2918
- self.consensus_mapping_df is not None
2919
- and not self.consensus_mapping_df.is_empty()
2920
- ):
2891
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
2921
2892
  initial_mapping_count = len(self.consensus_mapping_df)
2922
- self.consensus_mapping_df = (
2923
- self.consensus_mapping_df.lazy().filter(filter_condition).collect()
2924
- )
2893
+ self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
2925
2894
  mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
2926
2895
 
2927
2896
  # Calculate results once and log efficiently
@@ -3028,18 +2997,41 @@ def consensus_select(
3028
2997
  return pl.DataFrame()
3029
2998
 
3030
2999
  # Early return optimization - check if any filters are provided
3031
- filter_params = [uid, mz, rt, inty_mean, consensus_uid, consensus_id, number_samples,
3032
- number_ms2, quality, bl, chrom_coherence_mean, chrom_prominence_mean,
3033
- chrom_prominence_scaled_mean, chrom_height_scaled_mean,
3034
- rt_delta_mean, id_top_score, identified,
3035
- # New adduct and identification parameters
3036
- adduct_top, adduct_charge_top, adduct_mass_neutral_top, adduct_mass_shift_top,
3037
- adduct_group, adduct_of, id_top_name, id_top_class, id_top_adduct]
3038
-
3000
+ filter_params = [
3001
+ uid,
3002
+ mz,
3003
+ rt,
3004
+ inty_mean,
3005
+ consensus_uid,
3006
+ consensus_id,
3007
+ number_samples,
3008
+ number_ms2,
3009
+ quality,
3010
+ bl,
3011
+ chrom_coherence_mean,
3012
+ chrom_prominence_mean,
3013
+ chrom_prominence_scaled_mean,
3014
+ chrom_height_scaled_mean,
3015
+ rt_delta_mean,
3016
+ id_top_score,
3017
+ identified,
3018
+ # New adduct and identification parameters
3019
+ adduct_top,
3020
+ adduct_charge_top,
3021
+ adduct_mass_neutral_top,
3022
+ adduct_mass_shift_top,
3023
+ adduct_group,
3024
+ adduct_of,
3025
+ id_top_name,
3026
+ id_top_class,
3027
+ id_top_adduct,
3028
+ ]
3029
+
3039
3030
  if all(param is None for param in filter_params) and sortby is None:
3040
3031
  return self.consensus_df.clone()
3041
3032
 
3042
3033
  import time
3034
+
3043
3035
  start_time = time.perf_counter()
3044
3036
  initial_count = len(self.consensus_df)
3045
3037
 
@@ -3082,8 +3074,9 @@ def consensus_select(
3082
3074
  default_mz_tol = default_mz_tol.eic_mz_tol
3083
3075
  else:
3084
3076
  from masster.study.defaults.align_def import align_defaults
3077
+
3085
3078
  default_mz_tol = align_defaults().mz_max_diff
3086
-
3079
+
3087
3080
  min_mz = mz - default_mz_tol
3088
3081
  max_mz = mz + default_mz_tol
3089
3082
  filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
@@ -3106,8 +3099,9 @@ def consensus_select(
3106
3099
  default_rt_tol = default_rt_tol.eic_rt_tol
3107
3100
  else:
3108
3101
  from masster.study.defaults.align_def import align_defaults
3102
+
3109
3103
  default_rt_tol = align_defaults().rt_tol
3110
-
3104
+
3111
3105
  min_rt = rt - default_rt_tol
3112
3106
  max_rt = rt + default_rt_tol
3113
3107
  filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
@@ -3192,8 +3186,8 @@ def consensus_select(
3192
3186
  if "adduct_charge_top" in available_columns:
3193
3187
  if isinstance(adduct_charge_top, tuple) and len(adduct_charge_top) == 2:
3194
3188
  filter_conditions.append(
3195
- (pl.col("adduct_charge_top") >= adduct_charge_top[0]) &
3196
- (pl.col("adduct_charge_top") <= adduct_charge_top[1])
3189
+ (pl.col("adduct_charge_top") >= adduct_charge_top[0])
3190
+ & (pl.col("adduct_charge_top") <= adduct_charge_top[1])
3197
3191
  )
3198
3192
  elif isinstance(adduct_charge_top, list):
3199
3193
  filter_conditions.append(pl.col("adduct_charge_top").is_in(adduct_charge_top))
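
The adduct and identification filters dispatch on the argument type: a 2-tuple is read as an inclusive range and a list as a set of allowed values. A minimal sketch of that dispatch; treating any other scalar as an exact match is an assumption of this sketch, not something visible in the hunk:

    import polars as pl

    def build_condition(column, value):
        # 2-tuple -> inclusive range, list -> membership, anything else -> exact match (assumed).
        if isinstance(value, tuple) and len(value) == 2:
            return (pl.col(column) >= value[0]) & (pl.col(column) <= value[1])
        if isinstance(value, list):
            return pl.col(column).is_in(value)
        return pl.col(column) == value

    range_condition = build_condition("adduct_charge_top", (1, 2))            # charges 1..2
    membership_condition = build_condition("adduct_top", ["[M+H]+", "[M+Na]+"])  # example labels
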
@@ -3207,8 +3201,8 @@ def consensus_select(
3207
3201
  if "adduct_mass_neutral_top" in available_columns:
3208
3202
  if isinstance(adduct_mass_neutral_top, tuple) and len(adduct_mass_neutral_top) == 2:
3209
3203
  filter_conditions.append(
3210
- (pl.col("adduct_mass_neutral_top") >= adduct_mass_neutral_top[0]) &
3211
- (pl.col("adduct_mass_neutral_top") <= adduct_mass_neutral_top[1])
3204
+ (pl.col("adduct_mass_neutral_top") >= adduct_mass_neutral_top[0])
3205
+ & (pl.col("adduct_mass_neutral_top") <= adduct_mass_neutral_top[1])
3212
3206
  )
3213
3207
  elif isinstance(adduct_mass_neutral_top, list):
3214
3208
  filter_conditions.append(pl.col("adduct_mass_neutral_top").is_in(adduct_mass_neutral_top))
@@ -3222,8 +3216,8 @@ def consensus_select(
3222
3216
  if "adduct_mass_shift_top" in available_columns:
3223
3217
  if isinstance(adduct_mass_shift_top, tuple) and len(adduct_mass_shift_top) == 2:
3224
3218
  filter_conditions.append(
3225
- (pl.col("adduct_mass_shift_top") >= adduct_mass_shift_top[0]) &
3226
- (pl.col("adduct_mass_shift_top") <= adduct_mass_shift_top[1])
3219
+ (pl.col("adduct_mass_shift_top") >= adduct_mass_shift_top[0])
3220
+ & (pl.col("adduct_mass_shift_top") <= adduct_mass_shift_top[1])
3227
3221
  )
3228
3222
  elif isinstance(adduct_mass_shift_top, list):
3229
3223
  filter_conditions.append(pl.col("adduct_mass_shift_top").is_in(adduct_mass_shift_top))
@@ -3287,8 +3281,7 @@ def consensus_select(
3287
3281
  if "id_top_score" in available_columns:
3288
3282
  if isinstance(id_top_score, tuple) and len(id_top_score) == 2:
3289
3283
  filter_conditions.append(
3290
- (pl.col("id_top_score") >= id_top_score[0]) &
3291
- (pl.col("id_top_score") <= id_top_score[1])
3284
+ (pl.col("id_top_score") >= id_top_score[0]) & (pl.col("id_top_score") <= id_top_score[1])
3292
3285
  )
3293
3286
  elif isinstance(id_top_score, list):
3294
3287
  filter_conditions.append(pl.col("id_top_score").is_in(id_top_score))
@@ -3306,14 +3299,10 @@ def consensus_select(
3306
3299
  # Combine all conditions efficiently using reduce
3307
3300
  from functools import reduce
3308
3301
  import operator
3302
+
3309
3303
  combined_filter = reduce(operator.and_, filter_conditions)
3310
-
3311
- consensus = (
3312
- self.consensus_df
3313
- .lazy()
3314
- .filter(combined_filter)
3315
- .collect(streaming=True)
3316
- )
3304
+
3305
+ consensus = self.consensus_df.lazy().filter(combined_filter).collect(streaming=True)
3317
3306
  else:
3318
3307
  consensus = self.consensus_df.clone()
3319
3308
 
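
Once all conditions are collected, they are folded into a single expression with functools.reduce and operator.and_ and applied in one lazy pass with a streaming collect. A self-contained sketch of the same pattern on a toy frame:

    from functools import reduce
    import operator

    import polars as pl

    consensus_df = pl.DataFrame({"mz": [100.0, 250.5, 410.2], "quality": [0.2, 0.9, 0.7]})

    filter_conditions = [
        pl.col("mz") >= 200.0,
        pl.col("quality") >= 0.5,
    ]

    # AND all conditions together, then evaluate them in a single lazy pass.
    combined_filter = reduce(operator.and_, filter_conditions)
    consensus = consensus_df.lazy().filter(combined_filter).collect(streaming=True)
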
@@ -3334,10 +3323,10 @@ def consensus_select(
3334
3323
  elif isinstance(sortby, (list, tuple)):
3335
3324
  valid_columns = [col for col in sortby if col in consensus.columns]
3336
3325
  invalid_columns = [col for col in sortby if col not in consensus.columns]
3337
-
3326
+
3338
3327
  if invalid_columns:
3339
3328
  self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
3340
-
3329
+
3341
3330
  if valid_columns:
3342
3331
  consensus = consensus.sort(valid_columns, descending=descending)
3343
3332
  else:
@@ -3346,8 +3335,10 @@ def consensus_select(
3346
3335
  # Log performance metrics
3347
3336
  elapsed_time = time.perf_counter() - start_time
3348
3337
  removed_count = initial_count - final_count
3349
-
3350
- self.logger.info(f"Selected consensus features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s")
3338
+
3339
+ self.logger.info(
3340
+ f"Selected consensus features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s"
3341
+ )
3351
3342
 
3352
3343
  return consensus
3353
3344
 
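
When sortby is a list or tuple, only the columns that actually exist in the selected frame are used; the missing ones are reported with a warning. A short sketch of that validation step, using a plain logging logger in place of self.logger:

    import logging

    logger = logging.getLogger("consensus_select_sketch")

    def sort_consensus(consensus, sortby, descending=False):
        # Keep only sort keys that exist in the frame and warn about the rest.
        valid_columns = [col for col in sortby if col in consensus.columns]
        invalid_columns = [col for col in sortby if col not in consensus.columns]
        if invalid_columns:
            logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
        if valid_columns:
            consensus = consensus.sort(valid_columns, descending=descending)
        return consensus
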
@@ -3393,10 +3384,7 @@ def consensus_filter(self, consensus):
3393
3384
 
3394
3385
  # Get feature_uids that need to be kept in features_df
3395
3386
  feature_uids_to_keep = []
3396
- if (
3397
- self.consensus_mapping_df is not None
3398
- and not self.consensus_mapping_df.is_empty()
3399
- ):
3387
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
3400
3388
  feature_uids_to_keep = self.consensus_mapping_df.filter(
3401
3389
  pl.col("consensus_uid").is_in(consensus_uids_to_keep),
3402
3390
  )["feature_uid"].to_list()
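
consensus_filter keeps a set of consensus_uids and propagates the selection: the mapping table is reduced first, the surviving feature_uids are collected from it, and features_df and consensus_ms2 are filtered to match. A condensed sketch of the keep-cascade, assuming the uid columns shown in the diff:

    import polars as pl

    def keep_consensus(consensus_mapping_df, features_df, consensus_ms2, consensus_uids_to_keep):
        # 1. Restrict the mapping table to the consensus features being kept.
        consensus_mapping_df = consensus_mapping_df.filter(
            pl.col("consensus_uid").is_in(consensus_uids_to_keep)
        )
        # 2. The surviving mapping rows define which features stay.
        feature_uids_to_keep = consensus_mapping_df["feature_uid"].to_list()
        features_df = features_df.filter(pl.col("feature_uid").is_in(feature_uids_to_keep))
        # 3. MS2 spectra follow the same consensus selection.
        consensus_ms2 = consensus_ms2.filter(pl.col("consensus_uid").is_in(consensus_uids_to_keep))
        return consensus_mapping_df, features_df, consensus_ms2
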
@@ -3407,10 +3395,7 @@ def consensus_filter(self, consensus):
3407
3395
  )
3408
3396
 
3409
3397
  # Keep only relevant entries in consensus_mapping_df
3410
- if (
3411
- self.consensus_mapping_df is not None
3412
- and not self.consensus_mapping_df.is_empty()
3413
- ):
3398
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
3414
3399
  initial_mapping_count = len(self.consensus_mapping_df)
3415
3400
  self.consensus_mapping_df = self.consensus_mapping_df.filter(
3416
3401
  pl.col("consensus_uid").is_in(consensus_uids_to_keep),
@@ -3423,11 +3408,7 @@ def consensus_filter(self, consensus):
3423
3408
  )
3424
3409
 
3425
3410
  # Keep only corresponding features in features_df
3426
- if (
3427
- feature_uids_to_keep
3428
- and self.features_df is not None
3429
- and not self.features_df.is_empty()
3430
- ):
3411
+ if feature_uids_to_keep and self.features_df is not None and not self.features_df.is_empty():
3431
3412
  initial_features_count = len(self.features_df)
3432
3413
  self.features_df = self.features_df.filter(
3433
3414
  pl.col("feature_uid").is_in(feature_uids_to_keep),
@@ -3440,11 +3421,7 @@ def consensus_filter(self, consensus):
3440
3421
  )
3441
3422
 
3442
3423
  # Keep only relevant entries in consensus_ms2 if it exists
3443
- if (
3444
- hasattr(self, "consensus_ms2")
3445
- and self.consensus_ms2 is not None
3446
- and not self.consensus_ms2.is_empty()
3447
- ):
3424
+ if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
3448
3425
  initial_ms2_count = len(self.consensus_ms2)
3449
3426
  self.consensus_ms2 = self.consensus_ms2.filter(
3450
3427
  pl.col("consensus_uid").is_in(consensus_uids_to_keep),
@@ -3514,10 +3491,7 @@ def consensus_delete(self, consensus):
3514
3491
 
3515
3492
  # Get feature_uids that need to be removed from features_df
3516
3493
  feature_uids_to_remove = []
3517
- if (
3518
- self.consensus_mapping_df is not None
3519
- and not self.consensus_mapping_df.is_empty()
3520
- ):
3494
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
3521
3495
  feature_uids_to_remove = self.consensus_mapping_df.filter(
3522
3496
  pl.col("consensus_uid").is_in(consensus_uids_to_remove),
3523
3497
  )["feature_uid"].to_list()
@@ -3529,10 +3503,7 @@ def consensus_delete(self, consensus):
3529
3503
 
3530
3504
  # Remove from consensus_mapping_df
3531
3505
  mapping_removed_count = 0
3532
- if (
3533
- self.consensus_mapping_df is not None
3534
- and not self.consensus_mapping_df.is_empty()
3535
- ):
3506
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
3536
3507
  initial_mapping_count = len(self.consensus_mapping_df)
3537
3508
  self.consensus_mapping_df = self.consensus_mapping_df.filter(
3538
3509
  ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
@@ -3541,11 +3512,7 @@ def consensus_delete(self, consensus):
3541
3512
 
3542
3513
  # Remove corresponding features from features_df
3543
3514
  features_removed_count = 0
3544
- if (
3545
- feature_uids_to_remove
3546
- and self.features_df is not None
3547
- and not self.features_df.is_empty()
3548
- ):
3515
+ if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
3549
3516
  initial_features_count = len(self.features_df)
3550
3517
  self.features_df = self.features_df.filter(
3551
3518
  ~pl.col("feature_uid").is_in(feature_uids_to_remove),
@@ -3554,11 +3521,7 @@ def consensus_delete(self, consensus):
3554
3521
 
3555
3522
  # Remove from consensus_ms2 if it exists
3556
3523
  ms2_removed_count = 0
3557
- if (
3558
- hasattr(self, "consensus_ms2")
3559
- and self.consensus_ms2 is not None
3560
- and not self.consensus_ms2.is_empty()
3561
- ):
3524
+ if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
3562
3525
  initial_ms2_count = len(self.consensus_ms2)
3563
3526
  self.consensus_ms2 = self.consensus_ms2.filter(
3564
3527
  ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
@@ -3577,7 +3540,7 @@ def consensus_delete(self, consensus):
3577
3540
  log_parts.append(f"{features_removed_count} features")
3578
3541
  if ms2_removed_count > 0:
3579
3542
  log_parts.append(f"{ms2_removed_count} MS2 spectra")
3580
-
3543
+
3581
3544
  log_message = ". ".join(log_parts) + f". Remaining consensus: {final_consensus_count}"
3582
3545
  self.logger.info(log_message)
3583
3546
 
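
consensus_delete runs the mirror-image cascade with negated membership tests (~is_in) and joins the per-table removal counts into one log line. A minimal sketch of the negated filters over the same hypothetical tables:

    import polars as pl

    def delete_consensus(consensus_mapping_df, features_df, consensus_uids_to_remove):
        # Features mapped to the removed consensus entries must go as well.
        feature_uids_to_remove = consensus_mapping_df.filter(
            pl.col("consensus_uid").is_in(consensus_uids_to_remove)
        )["feature_uid"].to_list()

        # Negated membership keeps everything that is NOT being deleted.
        consensus_mapping_df = consensus_mapping_df.filter(
            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
        )
        features_df = features_df.filter(~pl.col("feature_uid").is_in(feature_uids_to_remove))
        return consensus_mapping_df, features_df
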
@@ -3651,8 +3614,7 @@ def samples_select(
3651
3614
  # Treat as range
3652
3615
  min_uid, max_uid = sample_uid
3653
3616
  filter_conditions.append(
3654
- (pl.col("sample_uid") >= min_uid)
3655
- & (pl.col("sample_uid") <= max_uid),
3617
+ (pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid),
3656
3618
  )
3657
3619
  else:
3658
3620
  # Treat as list
@@ -3695,8 +3657,7 @@ def samples_select(
3695
3657
  # Treat as range
3696
3658
  min_batch, max_batch = sample_batch
3697
3659
  filter_conditions.append(
3698
- (pl.col("sample_batch") >= min_batch)
3699
- & (pl.col("sample_batch") <= max_batch),
3660
+ (pl.col("sample_batch") >= min_batch) & (pl.col("sample_batch") <= max_batch),
3700
3661
  )
3701
3662
  else:
3702
3663
  # Treat as list
@@ -3714,8 +3675,7 @@ def samples_select(
3714
3675
  # Treat as range
3715
3676
  min_seq, max_seq = sample_sequence
3716
3677
  filter_conditions.append(
3717
- (pl.col("sample_sequence") >= min_seq)
3718
- & (pl.col("sample_sequence") <= max_seq),
3678
+ (pl.col("sample_sequence") >= min_seq) & (pl.col("sample_sequence") <= max_seq),
3719
3679
  )
3720
3680
  else:
3721
3681
  # Treat as list
@@ -3733,8 +3693,7 @@ def samples_select(
3733
3693
  if isinstance(num_features, tuple) and len(num_features) == 2:
3734
3694
  min_features, max_features = num_features
3735
3695
  filter_conditions.append(
3736
- (pl.col("num_features") >= min_features)
3737
- & (pl.col("num_features") <= max_features),
3696
+ (pl.col("num_features") >= min_features) & (pl.col("num_features") <= max_features),
3738
3697
  )
3739
3698
  else:
3740
3699
  filter_conditions.append(pl.col("num_features") >= num_features)
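
samples_select applies the same type dispatch to sample metadata: a 2-tuple on sample_uid, sample_batch or sample_sequence is read as an inclusive range, a list as a membership test, and a scalar num_features as a lower bound. A compact sketch; the exact-match branch for the other scalar cases is an assumption of the sketch:

    import polars as pl

    def sample_condition(column, value, scalar_is_minimum=False):
        # 2-tuple -> inclusive range, list -> membership, scalar -> minimum or exact match.
        if isinstance(value, tuple) and len(value) == 2:
            low, high = value
            return (pl.col(column) >= low) & (pl.col(column) <= high)
        if isinstance(value, list):
            return pl.col(column).is_in(value)
        return pl.col(column) >= value if scalar_is_minimum else pl.col(column) == value

    filter_conditions = [
        sample_condition("sample_batch", (1, 3)),                        # batches 1..3
        sample_condition("num_features", 500, scalar_is_minimum=True),   # at least 500 features
    ]
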
@@ -3883,11 +3842,7 @@ def samples_delete(self, samples):
3883
3842
 
3884
3843
  # 2. Remove corresponding features from features_df
3885
3844
  removed_features_count = 0
3886
- if (
3887
- feature_uids_to_remove
3888
- and self.features_df is not None
3889
- and not self.features_df.is_empty()
3890
- ):
3845
+ if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
3891
3846
  self.features_df = self.features_df.filter(
3892
3847
  ~pl.col("sample_uid").is_in(sample_uids_to_remove),
3893
3848
  )
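
Deleting samples cascades through the dependent structures: features of the removed sample_uids, mapping rows of the removed feature_uids, MS2 spectra of the removed samples, and finally the in-memory feature maps, which the diff removes in reverse index order so earlier indices stay valid. A reduced sketch of the first two steps with the column names from the diff:

    import polars as pl

    def delete_samples(features_df, consensus_mapping_df, sample_uids_to_remove):
        # Features of the removed samples define which mapping rows must go too.
        feature_uids_to_remove = features_df.filter(
            pl.col("sample_uid").is_in(sample_uids_to_remove)
        )["feature_uid"].to_list()

        features_df = features_df.filter(~pl.col("sample_uid").is_in(sample_uids_to_remove))
        consensus_mapping_df = consensus_mapping_df.filter(
            ~pl.col("feature_uid").is_in(feature_uids_to_remove)
        )
        return features_df, consensus_mapping_df
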
@@ -3895,11 +3850,7 @@ def samples_delete(self, samples):
3895
3850
 
3896
3851
  # 3. Remove from consensus_mapping_df
3897
3852
  removed_mapping_count = 0
3898
- if (
3899
- feature_uids_to_remove
3900
- and self.consensus_mapping_df is not None
3901
- and not self.consensus_mapping_df.is_empty()
3902
- ):
3853
+ if feature_uids_to_remove and self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
3903
3854
  initial_mapping_count = len(self.consensus_mapping_df)
3904
3855
  self.consensus_mapping_df = self.consensus_mapping_df.filter(
3905
3856
  ~pl.col("feature_uid").is_in(feature_uids_to_remove),
@@ -3908,11 +3859,7 @@ def samples_delete(self, samples):
3908
3859
 
3909
3860
  # 4. Remove from consensus_ms2 if it exists
3910
3861
  removed_ms2_count = 0
3911
- if (
3912
- hasattr(self, "consensus_ms2")
3913
- and self.consensus_ms2 is not None
3914
- and not self.consensus_ms2.is_empty()
3915
- ):
3862
+ if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
3916
3863
  initial_ms2_count = len(self.consensus_ms2)
3917
3864
  self.consensus_ms2 = self.consensus_ms2.filter(
3918
3865
  ~pl.col("sample_uid").is_in(sample_uids_to_remove),
@@ -3921,11 +3868,7 @@ def samples_delete(self, samples):
3921
3868
 
3922
3869
  # 5. Remove from feature_maps and update map_id
3923
3870
  removed_maps_count = 0
3924
- if (
3925
- hasattr(self, "feature_maps")
3926
- and self.feature_maps is not None
3927
- and map_ids_to_remove
3928
- ):
3871
+ if hasattr(self, "feature_maps") and self.feature_maps is not None and map_ids_to_remove:
3929
3872
  # Remove feature maps in reverse order to maintain indices
3930
3873
  for map_id in sorted(map_ids_to_remove, reverse=True):
3931
3874
  if 0 <= map_id < len(self.feature_maps):
@@ -4148,9 +4091,6 @@ def set_samples_color(self, by=None, palette="Turbo256"):
4148
4091
  self.logger.debug(f"Set sample colors based on {by} using {palette} palette")
4149
4092
 
4150
4093
 
4151
-
4152
-
4153
-
4154
4094
  def _get_color_palette(palette_name):
4155
4095
  """
4156
4096
  Get color palette as a list of hex color codes using the cmap library.
@@ -4304,9 +4244,7 @@ def _sample_colors_from_colormap(palette_name, n_colors):
4304
4244
  # Distribute samples evenly across the full colormap range (same approach as set_samples_color(by=None))
4305
4245
  for i in range(n_colors):
4306
4246
  # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
4307
- normalized_value = (
4308
- i + 0.5
4309
- ) / n_colors # +0.5 to center samples in their bins
4247
+ normalized_value = (i + 0.5) / n_colors # +0.5 to center samples in their bins
4310
4248
  # Map to a subset of colormap to avoid extreme colors (use 10% to 90% range)
4311
4249
  normalized_value = 0.1 + (normalized_value * 0.8)
4312
4250
 
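
The color assignment centers each sample in its bin with (i + 0.5) / n_colors and then compresses the value into the 10 to 90 percent band of the colormap so near-white and near-black endpoints are avoided; for n_colors = 4 this yields 0.125, 0.375, 0.625, 0.875, mapped to 0.2, 0.4, 0.6, 0.8. A small pure-Python sketch of that arithmetic:

    def colormap_positions(n_colors):
        # Evenly spaced positions, centered in their bins and compressed into
        # the 10%-90% band of the colormap to avoid white/black extremes.
        positions = []
        for i in range(n_colors):
            normalized_value = (i + 0.5) / n_colors
            positions.append(0.1 + normalized_value * 0.8)
        return positions

    print([round(p, 3) for p in colormap_positions(4)])  # [0.2, 0.4, 0.6, 0.8]
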
@@ -4441,18 +4379,14 @@ def restore_ms2(self, samples=None, **kwargs):
4441
4379
  self.logger.info(f"Restoring MS2 data from {len(sample_uids)} samples...")
4442
4380
 
4443
4381
  # Clear existing consensus_ms2 to rebuild from scratch
4444
- initial_ms2_count = (
4445
- len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
4446
- )
4382
+ initial_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
4447
4383
  self.consensus_ms2 = pl.DataFrame()
4448
4384
 
4449
4385
  # Re-run find_ms2 which will rebuild consensus_ms2
4450
4386
  try:
4451
4387
  self.find_ms2(**kwargs)
4452
4388
 
4453
- final_ms2_count = (
4454
- len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
4455
- )
4389
+ final_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
4456
4390
 
4457
4391
  self.logger.info(
4458
4392
  f"MS2 restoration completed: {initial_ms2_count} -> {final_ms2_count} MS2 spectra",
@@ -4551,12 +4485,8 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
4551
4485
  # Check if MS2 data might need restoration (compare expected vs actual)
4552
4486
  ms2_need_restoration = False
4553
4487
  if ms2:
4554
- current_ms2_count = (
4555
- len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
4556
- )
4557
- consensus_count = (
4558
- len(self.consensus_df) if not self.consensus_df.is_empty() else 0
4559
- )
4488
+ current_ms2_count = len(self.consensus_ms2) if not self.consensus_ms2.is_empty() else 0
4489
+ consensus_count = len(self.consensus_df) if not self.consensus_df.is_empty() else 0
4560
4490
 
4561
4491
  if consensus_count > 0:
4562
4492
  # Calculate expected MS2 count based on consensus features with MS2 potential
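
decompress guards every length check because the frames may be empty while the study is compressed, then compares the current MS2 count against what the consensus table implies. A tiny sketch of the guard; the final decision rule shown here is a simplification, not the exact check from the source:

    import polars as pl

    consensus_ms2 = pl.DataFrame()                              # nothing restored yet
    consensus_df = pl.DataFrame({"consensus_uid": [1, 2, 3]})

    # Guarded lengths, mirroring the diff: the frames may be empty after compression.
    current_ms2_count = len(consensus_ms2) if not consensus_ms2.is_empty() else 0
    consensus_count = len(consensus_df) if not consensus_df.is_empty() else 0

    # Simplified stand-in for the real check, which compares an expected MS2
    # count derived from the consensus features against the current count.
    ms2_need_restoration = consensus_count > 0 and current_ms2_count == 0
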
@@ -4607,15 +4537,11 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
4607
4537
  # Then do additional chrom gap-filling if needed
4608
4538
  self.restore_chrom(samples=samples, **restore_kwargs)
4609
4539
 
4610
- elif (
4611
- "features" in operations_needed and "chromatograms" not in operations_needed
4612
- ):
4540
+ elif "features" in operations_needed and "chromatograms" not in operations_needed:
4613
4541
  self.logger.info("Phase 1: Restoring features data...")
4614
4542
  self.restore_features(samples=samples)
4615
4543
 
4616
- elif (
4617
- "chromatograms" in operations_needed and "features" not in operations_needed
4618
- ):
4544
+ elif "chromatograms" in operations_needed and "features" not in operations_needed:
4619
4545
  self.logger.info("Phase 1: Restoring chromatograms...")
4620
4546
  restore_kwargs = {}
4621
4547
  if "mz_tol" in kwargs: