masster 0.4.11-py3-none-any.whl → 0.4.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

masster/study/helpers.py CHANGED
@@ -1743,13 +1743,12 @@ def features_select(
     elapsed_time = time.perf_counter() - start_time
     final_count = len(result)
     removed_count = initial_count - final_count
-    throughput = final_count / elapsed_time if elapsed_time > 0 else 0

     if final_count == 0:
         self.logger.warning("No features remaining after applying selection criteria.")
     else:
         self.logger.debug(
-            f"Selected features: {final_count:,} (removed: {removed_count:,})"
+            f"Selected features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s"
         )

     return result
@@ -2094,8 +2093,6 @@ def features_filter(
         self.logger.warning("No features provided for filtering.")
         return

-    import time
-    start_time = time.perf_counter()
     initial_count = len(self.features_df)

     # Extract feature UIDs efficiently
@@ -2406,12 +2403,16 @@ def consensus_select(
     chrom_prominence_scaled_mean=None,
     chrom_height_scaled_mean=None,
     rt_delta_mean=None,
+    id_top_score=None,
+    identified=None,
     sortby=None,
     descending=True,
 ):
     """
     Select consensus features from consensus_df based on specified criteria and return the filtered DataFrame.

+    OPTIMIZED VERSION: Enhanced performance with lazy evaluation, vectorized operations, and efficient filtering.
+
     Parameters:
         mz: m/z filter with flexible formats:
             - float: m/z value ± default tolerance (uses study.parameters.eic_mz_tol)
@@ -2433,6 +2434,11 @@ def consensus_select(
         chrom_prominence_scaled_mean: mean scaled chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_height_scaled_mean: mean scaled chromatogram height filter (tuple for range, single value for minimum)
         rt_delta_mean: mean RT delta filter (tuple for range, single value for minimum)
+        id_top_score: identification top score filter (tuple for range, single value for minimum)
+        identified: filter by identification status:
+            - True: select only rows with id_top_name not null
+            - False: select only rows with id_top_name null
+            - None: no filtering (default)
         sortby: column name(s) to sort by (string, list of strings, or None for no sorting)
         descending: sort direction (True for descending, False for ascending, default is True)

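The two new filters compose with the existing ones. A minimal usage sketch under the signature shown above (the study object, m/z window, and score threshold below are hypothetical, not taken from masster's documentation):

    # Hypothetical call: identified consensus features in an m/z window,
    # with a minimum identification score, sorted by mean intensity.
    hits = study.consensus_select(
        mz=(300.0, 400.0),   # (min_mz, max_mz) range
        identified=True,     # only rows where id_top_name is not null
        id_top_score=0.7,    # minimum id_top_score
        sortby="inty_mean",
        descending=True,
    )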
@@ -2443,366 +2449,204 @@ def consensus_select(
         self.logger.warning("No consensus features found in study.")
         return pl.DataFrame()

-    consensus = self.consensus_df.clone()
-    initial_count = len(consensus)
+    # Early return optimization - check if any filters are provided
+    filter_params = [mz, rt, inty_mean, consensus_uid, consensus_id, number_samples,
+                     number_ms2, quality, bl, chrom_coherence_mean, chrom_prominence_mean,
+                     chrom_prominence_scaled_mean, chrom_height_scaled_mean,
+                     rt_delta_mean, id_top_score, identified]
+
+    if all(param is None for param in filter_params) and sortby is None:
+        return self.consensus_df.clone()
+
+    import time
+    start_time = time.perf_counter()
+    initial_count = len(self.consensus_df)

-    # Filter by m/z
-    if mz is not None:
-        consensus_len_before_filter = len(consensus)
+    # Pre-check available columns once for efficiency
+    available_columns = set(self.consensus_df.columns)
+    filter_conditions = []
+    warnings = []

+    # Build all filter conditions efficiently
+    if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
-            # Check if second value is smaller than first (indicating mz, mz_tol format)
             if mz[1] < mz[0]:
-                # First is mz, second is mz_tol
+                # mz_center ± mz_tol format
                 mz_center, mz_tol = mz
                 min_mz = mz_center - mz_tol
                 max_mz = mz_center + mz_tol
             else:
-                # Standard (min_mz, max_mz) format
+                # (min_mz, max_mz) format
                 min_mz, max_mz = mz
-            consensus = consensus.filter(
-                (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
-            )
+            filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
-            # Single float value - use default mz tolerance from study parameters
+            # Single value with default tolerance
             default_mz_tol = getattr(self, "parameters", None)
             if default_mz_tol and hasattr(default_mz_tol, "eic_mz_tol"):
                 default_mz_tol = default_mz_tol.eic_mz_tol
             else:
-                # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
-
                 default_mz_tol = align_defaults().mz_max_diff
-
+
             min_mz = mz - default_mz_tol
             max_mz = mz + default_mz_tol
-            consensus = consensus.filter(
-                (pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
-            )
-
-        self.logger.debug(
-            f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
+            filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))

-    # Filter by retention time
     if rt is not None:
-        consensus_len_before_filter = len(consensus)
-
         if isinstance(rt, tuple) and len(rt) == 2:
-            # Check if second value is smaller than first (indicating rt, rt_tol format)
             if rt[1] < rt[0]:
-                # First is rt, second is rt_tol
+                # rt_center ± rt_tol format
                 rt_center, rt_tol = rt
                 min_rt = rt_center - rt_tol
                 max_rt = rt_center + rt_tol
             else:
-                # Standard (min_rt, max_rt) format
+                # (min_rt, max_rt) format
                 min_rt, max_rt = rt
-            consensus = consensus.filter(
-                (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
-            )
+            filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
-            # Single float value - use default rt tolerance from study parameters
+            # Single value with default tolerance
             default_rt_tol = getattr(self, "parameters", None)
             if default_rt_tol and hasattr(default_rt_tol, "eic_rt_tol"):
                 default_rt_tol = default_rt_tol.eic_rt_tol
             else:
-                # Fallback to align_defaults if study parameters not available
                 from masster.study.defaults.align_def import align_defaults
-
                 default_rt_tol = align_defaults().rt_tol
-
+
             min_rt = rt - default_rt_tol
             max_rt = rt + default_rt_tol
-            consensus = consensus.filter(
-                (pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
-            )
-
-        self.logger.debug(
-            f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
-
-    # Filter by mean intensity
-    if inty_mean is not None:
-        consensus_len_before_filter = len(consensus)
-        if isinstance(inty_mean, tuple) and len(inty_mean) == 2:
-            min_inty, max_inty = inty_mean
-            consensus = consensus.filter(
-                (pl.col("inty_mean") >= min_inty) & (pl.col("inty_mean") <= max_inty),
-            )
+            filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
+
+    # Helper function to add range/minimum filters
+    def _add_range_filter(param, column, param_name):
+        if param is not None:
+            if column in available_columns:
+                if isinstance(param, tuple) and len(param) == 2:
+                    min_val, max_val = param
+                    filter_conditions.append((pl.col(column) >= min_val) & (pl.col(column) <= max_val))
+                else:
+                    filter_conditions.append(pl.col(column) >= param)
+            else:
+                warnings.append(f"'{column}' column not found in consensus_df")
+
+    # Apply range/minimum filters efficiently
+    _add_range_filter(inty_mean, "inty_mean", "inty_mean")
+    _add_range_filter(quality, "quality", "quality")
+    _add_range_filter(bl, "bl", "bl")
+    _add_range_filter(chrom_coherence_mean, "chrom_coherence_mean", "chrom_coherence_mean")
+    _add_range_filter(chrom_prominence_mean, "chrom_prominence_mean", "chrom_prominence_mean")
+    _add_range_filter(chrom_prominence_scaled_mean, "chrom_prominence_scaled_mean", "chrom_prominence_scaled_mean")
+    _add_range_filter(chrom_height_scaled_mean, "chrom_height_scaled_mean", "chrom_height_scaled_mean")
+    _add_range_filter(rt_delta_mean, "rt_delta_mean", "rt_delta_mean")
+    _add_range_filter(id_top_score, "id_top_score", "id_top_score")
+    _add_range_filter(number_samples, "number_samples", "number_samples")
+
+    # Handle number_ms2 with column check
+    if number_ms2 is not None:
+        if "number_ms2" in available_columns:
+            if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
+                min_ms2, max_ms2 = number_ms2
+                filter_conditions.append((pl.col("number_ms2") >= min_ms2) & (pl.col("number_ms2") <= max_ms2))
+            else:
+                filter_conditions.append(pl.col("number_ms2") >= number_ms2)
         else:
-            consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
-        self.logger.debug(
-            f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
+            warnings.append("'number_ms2' column not found in consensus_df")

-    # Filter by consensus_uid
+    # Handle consensus_uid (list, single value, or range)
     if consensus_uid is not None:
-        consensus_len_before_filter = len(consensus)
         if isinstance(consensus_uid, (list, tuple)):
             if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
-                # Treat as range
+                # Treat tuple as range
                 min_uid, max_uid = consensus_uid
-                consensus = consensus.filter(
-                    (pl.col("consensus_uid") >= min_uid)
-                    & (pl.col("consensus_uid") <= max_uid),
-                )
+                filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
             else:
-                # Treat as list
-                consensus = consensus.filter(
-                    pl.col("consensus_uid").is_in(consensus_uid),
-                )
+                # Treat as list of values
+                filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
         else:
-            consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
-        self.logger.debug(
-            f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
+            filter_conditions.append(pl.col("consensus_uid") == consensus_uid)

-    # Filter by consensus_id
+    # Handle consensus_id (list or single value)
     if consensus_id is not None:
-        consensus_len_before_filter = len(consensus)
         if isinstance(consensus_id, list):
-            consensus = consensus.filter(pl.col("consensus_id").is_in(consensus_id))
-        else:
-            consensus = consensus.filter(pl.col("consensus_id") == consensus_id)
-        self.logger.debug(
-            f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
-
-    # Filter by number of samples
-    if number_samples is not None:
-        consensus_len_before_filter = len(consensus)
-        if isinstance(number_samples, tuple) and len(number_samples) == 2:
-            min_samples, max_samples = number_samples
-            consensus = consensus.filter(
-                (pl.col("number_samples") >= min_samples)
-                & (pl.col("number_samples") <= max_samples),
-            )
-        else:
-            consensus = consensus.filter(pl.col("number_samples") >= number_samples)
-        self.logger.debug(
-            f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
-
-    # Filter by number of MS2 spectra
-    if number_ms2 is not None:
-        consensus_len_before_filter = len(consensus)
-        if "number_ms2" in consensus.columns:
-            if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
-                min_ms2, max_ms2 = number_ms2
-                consensus = consensus.filter(
-                    (pl.col("number_ms2") >= min_ms2)
-                    & (pl.col("number_ms2") <= max_ms2),
-                )
-            else:
-                consensus = consensus.filter(pl.col("number_ms2") >= number_ms2)
-        else:
-            self.logger.warning("'number_ms2' column not found in consensus_df")
-        self.logger.debug(
-            f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
-
-    # Filter by quality
-    if quality is not None:
-        consensus_len_before_filter = len(consensus)
-        if isinstance(quality, tuple) and len(quality) == 2:
-            min_quality, max_quality = quality
-            consensus = consensus.filter(
-                (pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality),
-            )
-        else:
-            consensus = consensus.filter(pl.col("quality") >= quality)
-        self.logger.debug(
-            f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
-
-    # Filter by baseline
-    if bl is not None:
-        consensus_len_before_filter = len(consensus)
-        if "bl" in consensus.columns:
-            if isinstance(bl, tuple) and len(bl) == 2:
-                min_bl, max_bl = bl
-                consensus = consensus.filter(
-                    (pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl),
-                )
-            else:
-                consensus = consensus.filter(pl.col("bl") >= bl)
+            filter_conditions.append(pl.col("consensus_id").is_in(consensus_id))
         else:
-            self.logger.warning("'bl' column not found in consensus_df")
-        self.logger.debug(
-            f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
+            filter_conditions.append(pl.col("consensus_id") == consensus_id)

-    # Filter by mean chromatogram coherence
-    if chrom_coherence_mean is not None:
-        consensus_len_before_filter = len(consensus)
-        if "chrom_coherence_mean" in consensus.columns:
-            if (
-                isinstance(chrom_coherence_mean, tuple)
-                and len(chrom_coherence_mean) == 2
-            ):
-                min_coherence, max_coherence = chrom_coherence_mean
-                consensus = consensus.filter(
-                    (pl.col("chrom_coherence_mean") >= min_coherence)
-                    & (pl.col("chrom_coherence_mean") <= max_coherence),
-                )
+    # Handle identified status filter
+    if identified is not None:
+        if "id_top_name" in available_columns:
+            if identified:
+                filter_conditions.append(pl.col("id_top_name").is_not_null())
             else:
-                consensus = consensus.filter(
-                    pl.col("chrom_coherence_mean") >= chrom_coherence_mean,
-                )
+                filter_conditions.append(pl.col("id_top_name").is_null())
         else:
-            self.logger.warning(
-                "'chrom_coherence_mean' column not found in consensus_df",
-            )
-        self.logger.debug(
-            f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
+            warnings.append("'id_top_name' column not found in consensus_df")

-    # Filter by mean chromatogram prominence
-    if chrom_prominence_mean is not None:
-        consensus_len_before_filter = len(consensus)
-        if "chrom_prominence_mean" in consensus.columns:
-            if (
-                isinstance(chrom_prominence_mean, tuple)
-                and len(chrom_prominence_mean) == 2
-            ):
-                min_prominence, max_prominence = chrom_prominence_mean
-                consensus = consensus.filter(
-                    (pl.col("chrom_prominence_mean") >= min_prominence)
-                    & (pl.col("chrom_prominence_mean") <= max_prominence),
-                )
-            else:
-                consensus = consensus.filter(
-                    pl.col("chrom_prominence_mean") >= chrom_prominence_mean,
-                )
-        else:
-            self.logger.warning(
-                "'chrom_prominence_mean' column not found in consensus_df",
-            )
-        self.logger.debug(
-            f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
+    # Log warnings once
+    for warning in warnings:
+        self.logger.warning(warning)

-    # Filter by mean scaled chromatogram prominence
-    if chrom_prominence_scaled_mean is not None:
-        consensus_len_before_filter = len(consensus)
-        if "chrom_prominence_scaled_mean" in consensus.columns:
-            if (
-                isinstance(chrom_prominence_scaled_mean, tuple)
-                and len(chrom_prominence_scaled_mean) == 2
-            ):
-                min_prominence_scaled, max_prominence_scaled = (
-                    chrom_prominence_scaled_mean
-                )
-                consensus = consensus.filter(
-                    (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
-                    & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
-                )
-            else:
-                consensus = consensus.filter(
-                    pl.col("chrom_prominence_scaled_mean")
-                    >= chrom_prominence_scaled_mean,
-                )
-        else:
-            self.logger.warning(
-                "'chrom_prominence_scaled_mean' column not found in consensus_df",
-            )
-        self.logger.debug(
-            f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
+    # Apply all filters at once using lazy evaluation for optimal performance
+    if filter_conditions:
+        # Combine all conditions efficiently using reduce
+        from functools import reduce
+        import operator
+        combined_filter = reduce(operator.and_, filter_conditions)
+
+        consensus = (
+            self.consensus_df
+            .lazy()
+            .filter(combined_filter)
+            .collect(streaming=True)
         )
+    else:
+        consensus = self.consensus_df.clone()

-    # Filter by mean scaled chromatogram height
-    if chrom_height_scaled_mean is not None:
-        consensus_len_before_filter = len(consensus)
-        if "chrom_height_scaled_mean" in consensus.columns:
-            if (
-                isinstance(chrom_height_scaled_mean, tuple)
-                and len(chrom_height_scaled_mean) == 2
-            ):
-                min_height_scaled, max_height_scaled = chrom_height_scaled_mean
-                consensus = consensus.filter(
-                    (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
-                    & (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
-                )
-            else:
-                consensus = consensus.filter(
-                    pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean,
-                )
-        else:
-            self.logger.warning(
-                "'chrom_height_scaled_mean' column not found in consensus_df",
-            )
-        self.logger.debug(
-            f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
+    final_count = len(consensus)

-    # Filter by mean RT delta
-    if rt_delta_mean is not None:
-        consensus_len_before_filter = len(consensus)
-        if "rt_delta_mean" in consensus.columns:
-            if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
-                min_rt_delta, max_rt_delta = rt_delta_mean
-                consensus = consensus.filter(
-                    (pl.col("rt_delta_mean") >= min_rt_delta)
-                    & (pl.col("rt_delta_mean") <= max_rt_delta),
-                )
-            else:
-                consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
-        else:
-            self.logger.warning("'rt_delta_mean' column not found in consensus_df")
-        self.logger.debug(
-            f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
-        )
-
-    if len(consensus) == 0:
-        self.logger.warning(
-            "No consensus features remaining after applying selection criteria.",
-        )
-    else:
-        self.logger.info(
-            f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})",
-        )
+    # Early return if no results
+    if final_count == 0:
+        self.logger.warning("No consensus features remaining after applying selection criteria.")
+        return pl.DataFrame()

     # Sort the results if sortby is specified
     if sortby is not None:
         if isinstance(sortby, str):
-            # Single column
             if sortby in consensus.columns:
                 consensus = consensus.sort(sortby, descending=descending)
             else:
-                self.logger.warning(
-                    f"Sort column '{sortby}' not found in consensus DataFrame",
-                )
+                self.logger.warning(f"Sort column '{sortby}' not found in consensus DataFrame")
         elif isinstance(sortby, (list, tuple)):
-            # Multiple columns
             valid_columns = [col for col in sortby if col in consensus.columns]
             invalid_columns = [col for col in sortby if col not in consensus.columns]
-
+
             if invalid_columns:
-                self.logger.warning(
-                    f"Sort columns not found in consensus DataFrame: {invalid_columns}",
-                )
-
+                self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
+
             if valid_columns:
                 consensus = consensus.sort(valid_columns, descending=descending)
         else:
-            self.logger.warning(
-                f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.",
-            )
+            self.logger.warning(f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.")
+
+    # Log performance metrics
+    elapsed_time = time.perf_counter() - start_time
+    removed_count = initial_count - final_count
+
+    self.logger.info(f"Selected consensus features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s")

     return consensus


 def consensus_filter(self, consensus):
     """
-    Filter consensus_df by removing all consensus features that match the given criteria.
-    This also removes related entries from consensus_mapping_df, features_df, and consensus_ms2.
+    Filter consensus_df by keeping only consensus features that match the given criteria.
+    This keeps only the specified consensus features and removes all others.
+    Also updates related entries in consensus_mapping_df, features_df, and consensus_ms2.

     Parameters:
-        consensus: Consensus features to remove. Can be:
+        consensus: Consensus features to keep. Can be:
             - polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
-            - list: List of consensus_uids to remove
-            - int: Single consensus_uid to remove
+            - list: List of consensus_uids to keep
+            - int: Single consensus_uid to keep

     Returns:
         None (modifies self.consensus_df and related DataFrames in place)
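The rewritten body above collects polars expressions into a list and applies them in one lazy pass instead of filtering the DataFrame once per criterion. A self-contained sketch of that pattern (toy data and column names, independent of masster):

    import operator
    from functools import reduce

    import polars as pl

    df = pl.DataFrame({"mz": [100.0, 250.0, 410.5], "quality": [0.2, 0.9, 0.7]})

    conditions = [
        (pl.col("mz") >= 200.0) & (pl.col("mz") <= 500.0),  # range filter
        pl.col("quality") >= 0.5,                           # minimum filter
    ]

    # Fold the expressions into one predicate so the frame is scanned once.
    combined = reduce(operator.and_, conditions)
    result = df.lazy().filter(combined).collect()
    print(result)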
@@ -2813,71 +2657,73 @@ def consensus_filter(self, consensus):

     initial_consensus_count = len(self.consensus_df)

-    # Determine consensus_uids to remove
+    # Determine consensus_uids to keep
     if isinstance(consensus, pl.DataFrame):
         if "consensus_uid" not in consensus.columns:
             self.logger.error("consensus DataFrame must contain 'consensus_uid' column")
             return
-        consensus_uids_to_remove = consensus["consensus_uid"].to_list()
+        consensus_uids_to_keep = consensus["consensus_uid"].to_list()
     elif isinstance(consensus, list):
-        consensus_uids_to_remove = consensus
+        consensus_uids_to_keep = consensus
     elif isinstance(consensus, int):
-        consensus_uids_to_remove = [consensus]
+        consensus_uids_to_keep = [consensus]
     else:
         self.logger.error("consensus parameter must be a DataFrame, list, or int")
         return

-    if not consensus_uids_to_remove:
+    if not consensus_uids_to_keep:
         self.logger.warning("No consensus UIDs provided for filtering.")
         return

-    # Get feature_uids that need to be removed from features_df
-    feature_uids_to_remove = []
+    # Get feature_uids that need to be kept in features_df
+    feature_uids_to_keep = []
     if (
         self.consensus_mapping_df is not None
         and not self.consensus_mapping_df.is_empty()
     ):
-        feature_uids_to_remove = self.consensus_mapping_df.filter(
-            pl.col("consensus_uid").is_in(consensus_uids_to_remove),
+        feature_uids_to_keep = self.consensus_mapping_df.filter(
+            pl.col("consensus_uid").is_in(consensus_uids_to_keep),
         )["feature_uid"].to_list()

-    # Remove consensus features from consensus_df
+    # Keep only specified consensus features in consensus_df
     self.consensus_df = self.consensus_df.filter(
-        ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
+        pl.col("consensus_uid").is_in(consensus_uids_to_keep),
     )

-    # Remove from consensus_mapping_df
+    # Keep only relevant entries in consensus_mapping_df
     if (
         self.consensus_mapping_df is not None
         and not self.consensus_mapping_df.is_empty()
     ):
         initial_mapping_count = len(self.consensus_mapping_df)
         self.consensus_mapping_df = self.consensus_mapping_df.filter(
-            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
+            pl.col("consensus_uid").is_in(consensus_uids_to_keep),
         )
-        removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
+        remaining_mapping_count = len(self.consensus_mapping_df)
+        removed_mapping_count = initial_mapping_count - remaining_mapping_count
         if removed_mapping_count > 0:
             self.logger.debug(
                 f"Removed {removed_mapping_count} entries from consensus_mapping_df",
             )

-    # Remove corresponding features from features_df
+    # Keep only corresponding features in features_df
     if (
-        feature_uids_to_remove
+        feature_uids_to_keep
         and self.features_df is not None
         and not self.features_df.is_empty()
     ):
         initial_features_count = len(self.features_df)
         self.features_df = self.features_df.filter(
-            ~pl.col("feature_uid").is_in(feature_uids_to_remove),
+            pl.col("feature_uid").is_in(feature_uids_to_keep),
         )
-        removed_features_count = initial_features_count - len(self.features_df)
+        remaining_features_count = len(self.features_df)
+        removed_features_count = initial_features_count - remaining_features_count
         if removed_features_count > 0:
             self.logger.debug(
                 f"Removed {removed_features_count} entries from features_df",
             )

-    # Remove from consensus_ms2 if it exists
+    # Keep only relevant entries in consensus_ms2 if it exists
     if (
         hasattr(self, "consensus_ms2")
         and self.consensus_ms2 is not None
@@ -2885,22 +2731,25 @@ def consensus_filter(self, consensus):
     ):
         initial_ms2_count = len(self.consensus_ms2)
         self.consensus_ms2 = self.consensus_ms2.filter(
-            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
+            pl.col("consensus_uid").is_in(consensus_uids_to_keep),
         )
-        removed_ms2_count = initial_ms2_count - len(self.consensus_ms2)
+        remaining_ms2_count = len(self.consensus_ms2)
+        removed_ms2_count = initial_ms2_count - remaining_ms2_count
         if removed_ms2_count > 0:
             self.logger.debug(f"Removed {removed_ms2_count} entries from consensus_ms2")

-    removed_consensus_count = initial_consensus_count - len(self.consensus_df)
+    remaining_consensus_count = len(self.consensus_df)
+    removed_consensus_count = initial_consensus_count - remaining_consensus_count
     self.logger.info(
-        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}",
+        f"Filtered consensus features: kept {remaining_consensus_count}, removed {removed_consensus_count}",
     )


 def consensus_delete(self, consensus):
     """
     Delete consensus features from consensus_df based on consensus identifiers.
-    This is an alias for consensus_filter for consistency with other delete methods.
+    This removes the specified consensus features and keeps all others (opposite of consensus_filter).
+    Also removes related entries from consensus_mapping_df, features_df, and consensus_ms2.

     Parameters:
         consensus: Consensus features to delete. Can be:
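consensus_filter now keeps the matching rows, while consensus_delete (next hunk) removes them; in polars the two cases differ only by negating the membership test. A toy illustration (made-up schema, not masster's):

    import polars as pl

    df = pl.DataFrame({"consensus_uid": [1, 2, 3, 4], "mz": [101.0, 202.0, 303.0, 404.0]})
    uids = [2, 4]

    kept = df.filter(pl.col("consensus_uid").is_in(uids))      # filter-style: keep listed uids
    dropped = df.filter(~pl.col("consensus_uid").is_in(uids))  # delete-style: drop listed uids

    assert kept.height + dropped.height == df.height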
@@ -2911,7 +2760,110 @@ def consensus_delete(self, consensus):
     Returns:
         None (modifies self.consensus_df and related DataFrames in place)
     """
-    self.consensus_filter(consensus)
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        self.logger.warning("No consensus features found in study.")
+        return
+
+    # Early return if no consensus provided
+    if consensus is None:
+        self.logger.warning("No consensus provided for deletion.")
+        return
+
+    initial_consensus_count = len(self.consensus_df)
+
+    # Determine consensus_uids to remove
+    if isinstance(consensus, pl.DataFrame):
+        if "consensus_uid" not in consensus.columns:
+            self.logger.error("consensus DataFrame must contain 'consensus_uid' column")
+            return
+        consensus_uids_to_remove = consensus["consensus_uid"].to_list()
+    elif isinstance(consensus, list):
+        consensus_uids_to_remove = consensus
+    elif isinstance(consensus, int):
+        consensus_uids_to_remove = [consensus]
+    else:
+        self.logger.error("consensus parameter must be a DataFrame, list, or int")
+        return
+
+    if not consensus_uids_to_remove:
+        self.logger.warning("No consensus UIDs provided for deletion.")
+        return
+
+    # Convert to set for faster lookup if list is large
+    if len(consensus_uids_to_remove) > 100:
+        consensus_uids_set = set(consensus_uids_to_remove)
+        # Use the set for filtering if it's significantly smaller
+        if len(consensus_uids_set) < len(consensus_uids_to_remove) * 0.8:
+            consensus_uids_to_remove = list(consensus_uids_set)
+
+    # Get feature_uids that need to be removed from features_df
+    feature_uids_to_remove = []
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
+        feature_uids_to_remove = self.consensus_mapping_df.filter(
+            pl.col("consensus_uid").is_in(consensus_uids_to_remove),
+        )["feature_uid"].to_list()
+
+    # Remove consensus features from consensus_df
+    self.consensus_df = self.consensus_df.filter(
+        ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
+    )
+
+    # Remove from consensus_mapping_df
+    mapping_removed_count = 0
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
+        initial_mapping_count = len(self.consensus_mapping_df)
+        self.consensus_mapping_df = self.consensus_mapping_df.filter(
+            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
+        )
+        mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
+
+    # Remove corresponding features from features_df
+    features_removed_count = 0
+    if (
+        feature_uids_to_remove
+        and self.features_df is not None
+        and not self.features_df.is_empty()
+    ):
+        initial_features_count = len(self.features_df)
+        self.features_df = self.features_df.filter(
+            ~pl.col("feature_uid").is_in(feature_uids_to_remove),
+        )
+        features_removed_count = initial_features_count - len(self.features_df)
+
+    # Remove from consensus_ms2 if it exists
+    ms2_removed_count = 0
+    if (
+        hasattr(self, "consensus_ms2")
+        and self.consensus_ms2 is not None
+        and not self.consensus_ms2.is_empty()
+    ):
+        initial_ms2_count = len(self.consensus_ms2)
+        self.consensus_ms2 = self.consensus_ms2.filter(
+            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
+        )
+        ms2_removed_count = initial_ms2_count - len(self.consensus_ms2)
+
+    # Calculate results and log efficiently
+    final_consensus_count = len(self.consensus_df)
+    consensus_removed_count = initial_consensus_count - final_consensus_count
+
+    # Single comprehensive log message
+    log_parts = [f"Deleted {consensus_removed_count} consensus features"]
+    if mapping_removed_count > 0:
+        log_parts.append(f"{mapping_removed_count} consensus mappings")
+    if features_removed_count > 0:
+        log_parts.append(f"{features_removed_count} features")
+    if ms2_removed_count > 0:
+        log_parts.append(f"{ms2_removed_count} MS2 spectra")
+
+    log_message = ". ".join(log_parts) + f". Remaining consensus: {final_consensus_count}"
+    self.logger.info(log_message)


 # =====================================================================================
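With this release the two methods are complements rather than aliases. A hedged end-to-end sketch (the study object and uid values are hypothetical):

    # Keep only a selected subset (filter semantics) ...
    keep_these = study.consensus_select(number_samples=3, identified=True)
    study.consensus_filter(keep_these)

    # ... or drop specific consensus features by uid and keep everything else (delete semantics).
    study.consensus_delete([101, 102, 103])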