masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. See the registry's advisory page for more details.

masster/study/helpers.py CHANGED
@@ -362,10 +362,6 @@ def get_chrom(self, uids=None, samples=None):
362
362
  ids = self._get_consensus_uids(uids)
363
363
  sample_uids = self._get_samples_uids(samples)
364
364
 
365
- if self.consensus_map is None:
366
- self.logger.error("No consensus map found.")
367
- return None
368
-
369
365
  # Pre-filter all DataFrames to reduce join sizes
370
366
  filtered_consensus_mapping = self.consensus_mapping_df.filter(
371
367
  pl.col("consensus_uid").is_in(ids),
@@ -529,30 +525,132 @@ def get_consensus(self, quant="chrom_area"):
529
525
  return df
530
526
 
531
527
 
532
- # TODO I don't get this param
533
- def get_consensus_matrix(self, quant="chrom_area"):
528
+ def get_consensus_matrix(self, quant="chrom_area", samples=None):
534
529
  """
535
530
  Get a matrix of consensus features with samples as columns and consensus features as rows.
536
- Optimized implementation that avoids expensive join operations.
531
+ Highly optimized implementation using vectorized Polars operations.
532
+
533
+ Parameters:
534
+ quant (str): Quantification method column name (default: "chrom_area")
535
+ samples: Sample identifier(s) to include. Can be:
536
+ - None: include all samples (default)
537
+ - int: single sample_uid
538
+ - str: single sample_name
539
+ - list: multiple sample_uids or sample_names
537
540
  """
541
+ import polars as pl
542
+
538
543
  if quant not in self.features_df.columns:
539
- self.logger.error(
540
- f"Quantification method {quant} not found in features_df.",
541
- )
544
+ self.logger.error(f"Quantification method {quant} not found in features_df.")
542
545
  return None
543
546
 
544
- # Create a lookup dictionary from features_df for O(1) value access
545
- feature_values = {}
547
+ # Get sample_uids to include in the matrix
548
+ sample_uids = self._get_samples_uids(samples) if samples is not None else self.samples_df["sample_uid"].to_list()
549
+
550
+ if not sample_uids:
551
+ self.logger.warning("No valid samples found for consensus matrix")
552
+ return pl.DataFrame()
553
+
554
+ # Filter datasets upfront to reduce processing load
555
+ features_filtered = self.features_df.filter(pl.col("sample_uid").is_in(sample_uids))
556
+ samples_filtered = self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids))
557
+ consensus_mapping_filtered = self.consensus_mapping_df.filter(pl.col("sample_uid").is_in(sample_uids))
558
+
559
+ # Join operations to combine data efficiently
560
+ # 1. Join consensus mapping with features to get quantification values
561
+ consensus_with_values = (
562
+ consensus_mapping_filtered
563
+ .join(features_filtered.select(["feature_uid", "sample_uid", quant]),
564
+ on=["feature_uid", "sample_uid"], how="left")
565
+ .with_columns(pl.col(quant).fill_null(0))
566
+ )
567
+
568
+ # 2. Join with samples to get sample names
569
+ consensus_with_names = (
570
+ consensus_with_values
571
+ .join(samples_filtered.select(["sample_uid", "sample_name"]),
572
+ on="sample_uid", how="left")
573
+ )
574
+
575
+ # 3. Group by consensus_uid and sample_name, taking max value per group
576
+ aggregated = (
577
+ consensus_with_names
578
+ .group_by(["consensus_uid", "sample_name"])
579
+ .agg(pl.col(quant).max().alias("value"))
580
+ )
581
+
582
+ # 4. Pivot to create the matrix format
583
+ matrix_df = (
584
+ aggregated
585
+ .pivot(on="sample_name", index="consensus_uid", values="value")
586
+ .fill_null(0)
587
+ )
588
+
589
+ # 5. Round numeric columns and ensure proper types
590
+ numeric_cols = [col for col in matrix_df.columns if col != "consensus_uid"]
591
+ matrix_df = matrix_df.with_columns([
592
+ pl.col("consensus_uid").cast(pl.UInt64),
593
+ *[pl.col(col).round(0) for col in numeric_cols]
594
+ ])
595
+
596
+ return matrix_df
597
+
598
+
599
+ def get_gaps_matrix(self, uids=None, samples=None):
600
+ """
601
+ Get a matrix of gaps between consensus features with samples as columns and consensus features as rows.
602
+ Optimized implementation that builds the gaps matrix directly without calling get_consensus_matrix().
603
+
604
+ Parameters:
605
+ uids: Consensus UID(s) to include. If None, includes all consensus features.
606
+ samples: Sample identifier(s) to include. If None, includes all samples.
607
+ Can be int (sample_uid), str (sample_name), or list of either.
608
+
609
+ Returns:
610
+ pl.DataFrame: Gaps matrix with consensus_uid as first column and samples as other columns.
611
+ Values are 1 (detected) or 0 (missing/gap).
612
+ """
613
+ import polars as pl
614
+
615
+ if self.consensus_df is None or self.consensus_df.is_empty():
616
+ self.logger.error("No consensus map found.")
617
+ return None
618
+
619
+ if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
620
+ self.logger.error("No consensus mapping found.")
621
+ return None
622
+
623
+ if self.features_df is None or self.features_df.is_empty():
624
+ self.logger.error("No features found.")
625
+ return None
626
+
627
+ # Get consensus UIDs and sample UIDs to include
628
+ uids = self._get_consensus_uids(uids)
629
+ sample_uids = self._get_samples_uids(samples) if samples is not None else self.samples_df["sample_uid"].to_list()
630
+
631
+ if not uids or not sample_uids:
632
+ self.logger.warning("No valid consensus features or samples found for gaps matrix")
633
+ return pl.DataFrame()
634
+
635
+ # Create a lookup dictionary from features_df for gap detection (exclude filled features)
636
+ # Key: (feature_uid, sample_uid) -> Value: 1 (detected)
637
+ feature_detection = {}
546
638
  for row in self.features_df.iter_rows(named=True):
547
- feature_uid = row["feature_uid"]
548
639
  sample_uid = row["sample_uid"]
549
- value = row[quant] if row[quant] is not None else 0
550
- feature_values[(feature_uid, sample_uid)] = value
640
+ if sample_uid in sample_uids: # Only include specified samples
641
+ # Skip filled features (gaps should only show original detections)
642
+ if row.get("filled", False):
643
+ continue
644
+
645
+ feature_uid = row["feature_uid"]
646
+ # If feature exists and is not filled, it's detected (1)
647
+ feature_detection[(feature_uid, sample_uid)] = 1
551
648
 
552
- # Build consensus matrix directly using the consensus_mapping_df
649
+ # Build gaps matrix directly using the consensus_mapping_df
553
650
  matrix_dict = {}
554
651
  sample_mapping = dict(
555
- self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
652
+ self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids))
653
+ .select(["sample_uid", "sample_name"]).iter_rows(),
556
654
  )
557
655
 
558
656
  for row in self.consensus_mapping_df.iter_rows(named=True):
@@ -560,65 +658,53 @@ def get_consensus_matrix(self, quant="chrom_area"):
560
658
  sample_uid = row["sample_uid"]
561
659
  feature_uid = row["feature_uid"]
562
660
 
563
- # Look up the quantification value
661
+ # Only process samples and consensus features in our filtered lists
662
+ if sample_uid not in sample_uids or consensus_uid not in uids:
663
+ continue
664
+
665
+ # Check if feature was detected (not filled)
564
666
  key = (feature_uid, sample_uid)
565
- value = feature_values.get(key, 0)
667
+ detected = feature_detection.get(key, 0) # 0 if not found (gap), 1 if detected
566
668
 
567
669
  if consensus_uid not in matrix_dict:
568
670
  matrix_dict[consensus_uid] = {}
569
671
 
570
672
  sample_name = sample_mapping.get(sample_uid, f"sample_{sample_uid}")
571
673
 
572
- # Take max if multiple features map to same consensus/sample combination
674
+ # For gaps matrix, we want to know if ANY feature was detected for this consensus/sample
675
+ # So we take max (if any feature is detected, the consensus feature is detected)
573
676
  if sample_name in matrix_dict[consensus_uid]:
574
677
  matrix_dict[consensus_uid][sample_name] = max(
575
678
  matrix_dict[consensus_uid][sample_name],
576
- value,
679
+ detected,
577
680
  )
578
681
  else:
579
- matrix_dict[consensus_uid][sample_name] = value
580
-
581
- # Convert to Polars DataFrame with proper formatting
582
- import polars as pl
682
+ matrix_dict[consensus_uid][sample_name] = detected
583
683
 
584
- # Convert matrix_dict to list of records for Polars
684
+ # Convert to Polars DataFrame
585
685
  records = []
586
686
  for consensus_uid, sample_values in matrix_dict.items():
587
687
  record = {"consensus_uid": consensus_uid}
588
688
  record.update(sample_values)
589
689
  records.append(record)
590
690
 
691
+ if not records:
692
+ self.logger.warning("No gaps data found for specified consensus features and samples")
693
+ return pl.DataFrame()
694
+
591
695
  # Create Polars DataFrame and set proper data types
592
- df2 = pl.DataFrame(records)
696
+ df_gaps = pl.DataFrame(records)
593
697
 
594
- # Fill null values with 0 and round numeric columns
595
- numeric_cols = [col for col in df2.columns if col != "consensus_uid"]
596
- df2 = df2.with_columns(
698
+ # Fill null values with 0 (gaps) and ensure integer type for gap indicators
699
+ numeric_cols = [col for col in df_gaps.columns if col != "consensus_uid"]
700
+ df_gaps = df_gaps.with_columns(
597
701
  [
598
702
  pl.col("consensus_uid").cast(pl.UInt64),
599
- *[pl.col(col).fill_null(0).round(0) for col in numeric_cols],
703
+ *[pl.col(col).fill_null(0).cast(pl.Int8) for col in numeric_cols],
600
704
  ],
601
705
  )
602
706
 
603
- return df2
604
-
605
-
606
- def get_gaps_matrix(self, uids=None):
607
- """
608
- Get a matrix of gaps between consensus features with samples as columns and consensus features as rows.
609
- """
610
- if self.consensus_df is None:
611
- self.logger.error("No consensus map found.")
612
- return None
613
- uids = self._get_consensus_uids(uids)
614
-
615
- df1 = self.get_consensus_matrix(quant="filled")
616
- if df1 is None or df1.empty:
617
- self.logger.warning("No gap data found.")
618
- return None
619
- # keep only rows where consensus_id is in ids - use pandas indexing since df1 is already pandas
620
- df1 = df1[df1.index.isin(uids)]
621
- return df1
707
+ return df_gaps
622
708
 
623
709
 
624
710
  def get_gaps_stats(self, uids=None):
@@ -705,6 +791,53 @@ def get_consensus_matches(self, uids=None, filled=True):
705
791
  # =====================================================================================
706
792
 
707
793
 
794
+ def consensus_reset(self):
795
+ """
796
+ Reset consensus data by clearing consensus DataFrames and removing filled features.
797
+
798
+ This function:
799
+ 1. Sets consensus_df, consensus_ms2, consensus_mapping_df, id_df to empty pl.DataFrame()
800
+ 2. Removes all filled features from features_df
801
+ 3. Removes relevant operations from history (merge, integrate, find_ms2, fill, identify)
802
+ 4. Logs the number of features removed
803
+
804
+ This effectively undoes the merge() operation and any gap-filling.
805
+ """
806
+ self.logger.debug("Resetting consensus data.")
807
+
808
+ # Reset consensus DataFrames to empty
809
+ self.consensus_df = pl.DataFrame()
810
+ self.consensus_ms2 = pl.DataFrame()
811
+ self.consensus_mapping_df = pl.DataFrame()
812
+ self.id_df = pl.DataFrame()
813
+
814
+ # Remove filled features from features_df
815
+ if self.features_df is None:
816
+ self.logger.warning("No features found.")
817
+ return
818
+
819
+ l1 = len(self.features_df)
820
+
821
+ # Filter out filled features (keep only non-filled features)
822
+ if "filled" in self.features_df.columns:
823
+ self.features_df = self.features_df.filter(~pl.col("filled") | pl.col("filled").is_null())
824
+
825
+ # Remove consensus-related operations from history
826
+ keys_to_remove = ["merge", "integrate", "integrate_chrom", "find_ms2", "fill", "fill_single", "identify"]
827
+ history_removed_count = 0
828
+ if hasattr(self, "history") and self.history:
829
+ for key in keys_to_remove:
830
+ if key in self.history:
831
+ del self.history[key]
832
+ history_removed_count += 1
833
+ self.logger.debug(f"Removed '{key}' from history")
834
+
835
+ removed_count = l1 - len(self.features_df)
836
+ self.logger.info(
837
+ f"Reset consensus data. Consensus DataFrames cleared. Features removed: {removed_count}. History entries removed: {history_removed_count}",
838
+ )
839
+
840
+
708
841
  def fill_reset(self):
709
842
  # remove all features with filled=True
710
843
  if self.features_df is None:
@@ -719,7 +852,7 @@ def fill_reset(self):
719
852
  pl.col("feature_uid").is_in(feature_uids_to_keep),
720
853
  )
721
854
  self.logger.info(
722
- f"Reset filled chromatograms. Chroms removed: {l1 - len(self.features_df)}",
855
+ f"Removed {l1 - len(self.features_df)} gap-filled features",
723
856
  )
724
857
 
725
858
 
@@ -1044,6 +1177,250 @@ def get_sample_stats(self):
1044
1177
  )
1045
1178
 
1046
1179
 
1180
+ def get_consensus_stats(self):
1181
+ """
1182
+ Get key performance indicators for each consensus feature.
1183
+
1184
+ Returns:
1185
+ pl.DataFrame: DataFrame with the following columns:
1186
+ - consensus_uid: Consensus unique identifier
1187
+ - rt: Retention time
1188
+ - rt_delta_mean: Mean retention time delta
1189
+ - mz: Mass-to-charge ratio
1190
+ - mz_range: Mass range (mz_max - mz_min)
1191
+ - log10_inty_mean: Log10 of mean intensity
1192
+ - number_samples: Number of samples
1193
+ - number_ms2: Number of MS2 spectra
1194
+ - charge_mean: Mean charge
1195
+ - quality: Feature quality
1196
+ - chrom_coherence_mean: Mean chromatographic coherence
1197
+ - chrom_height_scaled_mean: Mean scaled chromatographic height
1198
+ - chrom_prominence_scaled_mean: Mean scaled chromatographic prominence
1199
+ - qc_ratio: Ratio of QC samples where feature was detected
1200
+ - qc_cv: RSD (relative standard deviation) of intensity for QC samples
1201
+ - qc_to_blank: Ratio of average QC intensity to average blank intensity
1202
+ """
1203
+ import polars as pl
1204
+ import numpy as np
1205
+
1206
+ # Check if consensus_df exists and has data
1207
+ if self.consensus_df is None or self.consensus_df.is_empty():
1208
+ self.logger.error("No consensus data available. Run merge/find_consensus first.")
1209
+ return pl.DataFrame()
1210
+
1211
+ # Get all columns and their data types - work with original dataframe
1212
+ data_df = self.consensus_df.clone()
1213
+
1214
+ # Define specific columns to include in the exact order requested
1215
+ desired_columns = [
1216
+ "consensus_uid", # Include consensus_uid for identification
1217
+ "rt",
1218
+ "rt_delta_mean",
1219
+ "mz",
1220
+ "mz_range", # mz_max-mz_min (will be calculated)
1221
+ "log10_inty_mean", # log10(inty_mean) (will be calculated)
1222
+ "number_samples",
1223
+ "number_ms2",
1224
+ "charge_mean",
1225
+ "quality",
1226
+ "chrom_coherence_mean",
1227
+ "chrom_height_scaled_mean",
1228
+ "chrom_prominence_scaled_mean"
1229
+ ]
1230
+
1231
+ # Calculate derived columns if they don't exist
1232
+ if "mz_range" not in data_df.columns and "mz_max" in data_df.columns and "mz_min" in data_df.columns:
1233
+ data_df = data_df.with_columns((pl.col("mz_max") - pl.col("mz_min")).alias("mz_range"))
1234
+
1235
+ if "log10_inty_mean" not in data_df.columns and "inty_mean" in data_df.columns:
1236
+ data_df = data_df.with_columns(pl.col("inty_mean").log10().alias("log10_inty_mean"))
1237
+
1238
+ # Filter to only include columns that exist in the dataframe, preserving order
1239
+ available_columns = [col for col in desired_columns if col in data_df.columns]
1240
+
1241
+ if len(available_columns) <= 1: # Only consensus_uid would be 1
1242
+ self.logger.error(f"None of the requested consensus statistics columns were found. Available columns: {list(data_df.columns)}")
1243
+ return pl.DataFrame()
1244
+
1245
+ self.logger.debug(f"Creating consensus stats DataFrame with {len(available_columns)} columns: {available_columns}")
1246
+
1247
+ # Get base result DataFrame with selected columns
1248
+ result_df = data_df.select(available_columns)
1249
+
1250
+ # Add QC-related columns
1251
+ try:
1252
+ # Identify QC and blank samples based on naming patterns
1253
+ all_sample_names = self.samples_df["sample_name"].to_list()
1254
+
1255
+ # Define patterns for QC and blank identification
1256
+ qc_patterns = ["qc", "QC", "quality", "Quality", "control", "Control"]
1257
+ blank_patterns = ["blank", "Blank", "BLANK", "blk", "BLK"]
1258
+
1259
+ # Get QC and blank sample names
1260
+ qc_sample_names = [name for name in all_sample_names if any(pattern in name for pattern in qc_patterns)]
1261
+ blank_sample_names = [name for name in all_sample_names if any(pattern in name for pattern in blank_patterns)]
1262
+
1263
+ self.logger.debug(f"Found {len(qc_sample_names)} QC samples and {len(blank_sample_names)} blank samples")
1264
+
1265
+ # Initialize QC columns with null values
1266
+ qc_ratio_values = [None] * len(result_df)
1267
+ qc_cv_values = [None] * len(result_df)
1268
+ qc_to_blank_values = [None] * len(result_df)
1269
+
1270
+ if len(qc_sample_names) > 0:
1271
+ # Calculate QC metrics using optimized approach - get only QC+blank data
1272
+ self.logger.debug("Fetching optimized consensus matrices for QC calculations...")
1273
+
1274
+ # Get QC consensus matrix (only QC samples)
1275
+ qc_consensus_matrix = self.get_consensus_matrix(samples=qc_sample_names)
1276
+
1277
+ # Get blank consensus matrix (only blank samples) if blanks exist
1278
+ blank_consensus_matrix = None
1279
+ if len(blank_sample_names) > 0:
1280
+ blank_consensus_matrix = self.get_consensus_matrix(samples=blank_sample_names)
1281
+
1282
+ if qc_consensus_matrix is not None and not qc_consensus_matrix.is_empty():
1283
+ available_qc_cols = [col for col in qc_consensus_matrix.columns if col != "consensus_uid"]
1284
+ self.logger.debug(f"Found {len(available_qc_cols)} QC columns in optimized QC matrix")
1285
+
1286
+ # 2. QC CV: Calculate CV for QC samples
1287
+ if len(available_qc_cols) > 0:
1288
+ self.logger.debug("Calculating QC CV...")
1289
+ try:
1290
+ # Calculate CV (coefficient of variation) for QC samples
1291
+ qc_data = qc_consensus_matrix.select(["consensus_uid"] + available_qc_cols)
1292
+
1293
+ # Calculate mean and std for each row across QC columns
1294
+ qc_stats = qc_data.with_columns([
1295
+ pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
1296
+ ]).with_columns([
1297
+ pl.col("qc_values").list.mean().alias("qc_mean"),
1298
+ pl.col("qc_values").list.std().alias("qc_std")
1299
+ ]).with_columns(
1300
+ # CV = std / mean (NOT multiplied by 100 to keep between 0-1)
1301
+ pl.when(pl.col("qc_mean") > 0)
1302
+ .then(pl.col("qc_std") / pl.col("qc_mean"))
1303
+ .otherwise(None)
1304
+ .alias("qc_cv")
1305
+ )
1306
+
1307
+ # Join with result DataFrame
1308
+ result_df = result_df.join(
1309
+ qc_stats.select(["consensus_uid", "qc_cv"]),
1310
+ on="consensus_uid",
1311
+ how="left"
1312
+ )
1313
+ qc_cv_values = None # Indicate we successfully added the column
1314
+
1315
+ except Exception as e:
1316
+ self.logger.debug(f"Could not calculate QC CV: {e}")
1317
+
1318
+ # 3. QC to blank ratio: Compare average QC to average blank intensity
1319
+ if len(available_qc_cols) > 0 and blank_consensus_matrix is not None and not blank_consensus_matrix.is_empty():
1320
+ available_blank_cols = [col for col in blank_consensus_matrix.columns if col != "consensus_uid"]
1321
+ self.logger.debug(f"Calculating QC to blank ratio with {len(available_blank_cols)} blank columns...")
1322
+
1323
+ if len(available_blank_cols) > 0:
1324
+ try:
1325
+ # Calculate average intensity for QC samples
1326
+ qc_averages = qc_data.with_columns([
1327
+ pl.concat_list([pl.col(col) for col in available_qc_cols]).alias("qc_values")
1328
+ ]).with_columns(
1329
+ pl.col("qc_values").list.mean().alias("qc_avg")
1330
+ ).select(["consensus_uid", "qc_avg"])
1331
+
1332
+ # Calculate average intensity for blank samples
1333
+ blank_data = blank_consensus_matrix.select(["consensus_uid"] + available_blank_cols)
1334
+ blank_averages = blank_data.with_columns([
1335
+ pl.concat_list([pl.col(col) for col in available_blank_cols]).alias("blank_values")
1336
+ ]).with_columns(
1337
+ pl.col("blank_values").list.mean().alias("blank_avg")
1338
+ ).select(["consensus_uid", "blank_avg"])
1339
+
1340
+ # Join QC and blank averages and calculate ratio
1341
+ qc_blank_ratios = qc_averages.join(
1342
+ blank_averages,
1343
+ on="consensus_uid",
1344
+ how="left"
1345
+ ).with_columns(
1346
+ # Ratio = qc_avg / blank_avg, but only where blank_avg > 0
1347
+ pl.when(pl.col("blank_avg") > 0)
1348
+ .then(pl.col("qc_avg") / pl.col("blank_avg"))
1349
+ .otherwise(None)
1350
+ .alias("qc_to_blank")
1351
+ )
1352
+
1353
+ # Join with result DataFrame
1354
+ result_df = result_df.join(
1355
+ qc_blank_ratios.select(["consensus_uid", "qc_to_blank"]),
1356
+ on="consensus_uid",
1357
+ how="left"
1358
+ )
1359
+ qc_to_blank_values = None # Indicate we successfully added the column
1360
+
1361
+ except Exception as e:
1362
+ self.logger.debug(f"Could not calculate QC to blank ratio: {e}")
1363
+
1364
+ # 1. QC ratio: Get optimized gaps matrix for QC samples only
1365
+ self.logger.debug("Calculating QC detection ratio with optimized gaps matrix...")
1366
+ try:
1367
+ # Use optimized get_gaps_matrix with QC samples filtering for faster performance
1368
+ qc_gaps_matrix = self.get_gaps_matrix(samples=qc_sample_names)
1369
+
1370
+ if qc_gaps_matrix is not None and not qc_gaps_matrix.is_empty():
1371
+ # Get QC columns (should be all columns except consensus_uid since we filtered)
1372
+ available_qc_cols_gaps = [col for col in qc_gaps_matrix.columns if col != "consensus_uid"]
1373
+ self.logger.debug(f"Found {len(available_qc_cols_gaps)} QC columns in optimized gaps matrix")
1374
+
1375
+ if len(available_qc_cols_gaps) > 0:
1376
+ # Calculate QC detection ratio for each consensus feature
1377
+ qc_detection = qc_gaps_matrix.select(["consensus_uid"] + available_qc_cols_gaps)
1378
+
1379
+ # Data should already be properly typed from get_gaps_matrix, but ensure consistency
1380
+ for col in available_qc_cols_gaps:
1381
+ qc_detection = qc_detection.with_columns(
1382
+ pl.col(col).fill_null(0).cast(pl.Int8).alias(col)
1383
+ )
1384
+
1385
+ # Calculate ratio (sum of detections / number of QC samples)
1386
+ qc_ratios = qc_detection.with_columns(
1387
+ pl.concat_list([pl.col(col) for col in available_qc_cols_gaps]).alias("qc_detections")
1388
+ ).with_columns(
1389
+ (pl.col("qc_detections").list.sum().cast(pl.Float64) / len(available_qc_cols_gaps)).alias("qc_ratio")
1390
+ )
1391
+
1392
+ # Join with result DataFrame
1393
+ result_df = result_df.join(
1394
+ qc_ratios.select(["consensus_uid", "qc_ratio"]),
1395
+ on="consensus_uid",
1396
+ how="left"
1397
+ )
1398
+ qc_ratio_values = None # Indicate we successfully added the column
1399
+
1400
+ except Exception as e:
1401
+ self.logger.debug(f"Could not calculate QC ratio: {e}")
1402
+
1403
+ # Add null columns for any QC metrics that couldn't be calculated
1404
+ # Add null columns for any QC metrics that couldn't be calculated
1405
+ if qc_ratio_values is not None:
1406
+ result_df = result_df.with_columns(pl.lit(None, dtype=pl.Float64).alias("qc_ratio"))
1407
+ if qc_cv_values is not None:
1408
+ result_df = result_df.with_columns(pl.lit(None, dtype=pl.Float64).alias("qc_cv"))
1409
+ if qc_to_blank_values is not None:
1410
+ result_df = result_df.with_columns(pl.lit(None, dtype=pl.Float64).alias("qc_to_blank"))
1411
+
1412
+ except Exception as e:
1413
+ self.logger.warning(f"Error calculating QC metrics: {e}")
1414
+ # Add null columns if QC calculation fails
1415
+ result_df = result_df.with_columns([
1416
+ pl.lit(None, dtype=pl.Float64).alias("qc_ratio"),
1417
+ pl.lit(None, dtype=pl.Float64).alias("qc_cv"),
1418
+ pl.lit(None, dtype=pl.Float64).alias("qc_to_blank")
1419
+ ])
1420
+
1421
+ return result_df
1422
+
1423
+
1047
1424
  # =====================================================================================
1048
1425
  # DATA COMPRESSION AND RESTORATION FUNCTIONS
1049
1426
  # =====================================================================================
@@ -2131,7 +2508,7 @@ def _apply_chunked_select(self, filter_expr, chunk_size: int):
2131
2508
  else:
2132
2509
  return pl.DataFrame()
2133
2510
 
2134
-
2511
+ '''
2135
2512
  def features_select_benchmarked(
2136
2513
  self,
2137
2514
  mz=None,
@@ -2224,7 +2601,7 @@ def monkey_patch_study():
2224
2601
  Study.features_select_benchmarked = features_select_benchmarked
2225
2602
 
2226
2603
  print("Patched Study.features_select with consolidated optimized implementation")
2227
-
2604
+ '''
2228
2605
 
2229
2606
  def features_filter(
2230
2607
  self,