masster 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl
This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/lib/lib.py +45 -3
- masster/study/helpers.py +262 -310
- masster/study/id.py +564 -324
- masster/study/plot.py +38 -23
- masster/study/processing.py +268 -178
- masster/study/study.py +95 -60
- masster/study/study5_schema.json +12 -0
- {masster-0.4.11.dist-info → masster-0.4.13.dist-info}/METADATA +1 -1
- {masster-0.4.11.dist-info → masster-0.4.13.dist-info}/RECORD +13 -13
- {masster-0.4.11.dist-info → masster-0.4.13.dist-info}/WHEEL +0 -0
- {masster-0.4.11.dist-info → masster-0.4.13.dist-info}/entry_points.txt +0 -0
- {masster-0.4.11.dist-info → masster-0.4.13.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
|
@@ -1743,13 +1743,12 @@ def features_select(
|
|
|
1743
1743
|
elapsed_time = time.perf_counter() - start_time
|
|
1744
1744
|
final_count = len(result)
|
|
1745
1745
|
removed_count = initial_count - final_count
|
|
1746
|
-
throughput = final_count / elapsed_time if elapsed_time > 0 else 0
|
|
1747
1746
|
|
|
1748
1747
|
if final_count == 0:
|
|
1749
1748
|
self.logger.warning("No features remaining after applying selection criteria.")
|
|
1750
1749
|
else:
|
|
1751
1750
|
self.logger.debug(
|
|
1752
|
-
f"Selected features: {final_count:,} (removed: {removed_count:,})"
|
|
1751
|
+
f"Selected features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s"
|
|
1753
1752
|
)
|
|
1754
1753
|
|
|
1755
1754
|
return result
|
|
@@ -2094,8 +2093,6 @@ def features_filter(
|
|
|
2094
2093
|
self.logger.warning("No features provided for filtering.")
|
|
2095
2094
|
return
|
|
2096
2095
|
|
|
2097
|
-
import time
|
|
2098
|
-
start_time = time.perf_counter()
|
|
2099
2096
|
initial_count = len(self.features_df)
|
|
2100
2097
|
|
|
2101
2098
|
# Extract feature UIDs efficiently
|
|
@@ -2406,12 +2403,16 @@ def consensus_select(
|
|
|
2406
2403
|
chrom_prominence_scaled_mean=None,
|
|
2407
2404
|
chrom_height_scaled_mean=None,
|
|
2408
2405
|
rt_delta_mean=None,
|
|
2406
|
+
id_top_score=None,
|
|
2407
|
+
identified=None,
|
|
2409
2408
|
sortby=None,
|
|
2410
2409
|
descending=True,
|
|
2411
2410
|
):
|
|
2412
2411
|
"""
|
|
2413
2412
|
Select consensus features from consensus_df based on specified criteria and return the filtered DataFrame.
|
|
2414
2413
|
|
|
2414
|
+
OPTIMIZED VERSION: Enhanced performance with lazy evaluation, vectorized operations, and efficient filtering.
|
|
2415
|
+
|
|
2415
2416
|
Parameters:
|
|
2416
2417
|
mz: m/z filter with flexible formats:
|
|
2417
2418
|
- float: m/z value ± default tolerance (uses study.parameters.eic_mz_tol)
|
|
@@ -2433,6 +2434,11 @@ def consensus_select(
|
|
|
2433
2434
|
chrom_prominence_scaled_mean: mean scaled chromatogram prominence filter (tuple for range, single value for minimum)
|
|
2434
2435
|
chrom_height_scaled_mean: mean scaled chromatogram height filter (tuple for range, single value for minimum)
|
|
2435
2436
|
rt_delta_mean: mean RT delta filter (tuple for range, single value for minimum)
|
|
2437
|
+
id_top_score: identification top score filter (tuple for range, single value for minimum)
|
|
2438
|
+
identified: filter by identification status:
|
|
2439
|
+
- True: select only rows with id_top_name not null
|
|
2440
|
+
- False: select only rows with id_top_name null
|
|
2441
|
+
- None: no filtering (default)
|
|
2436
2442
|
sortby: column name(s) to sort by (string, list of strings, or None for no sorting)
|
|
2437
2443
|
descending: sort direction (True for descending, False for ascending, default is True)
|
|
2438
2444
|
|
|
@@ -2443,366 +2449,204 @@ def consensus_select(
|
|
|
2443
2449
|
self.logger.warning("No consensus features found in study.")
|
|
2444
2450
|
return pl.DataFrame()
|
|
2445
2451
|
|
|
2446
|
-
|
|
2447
|
-
|
|
2452
|
+
# Early return optimization - check if any filters are provided
|
|
2453
|
+
filter_params = [mz, rt, inty_mean, consensus_uid, consensus_id, number_samples,
|
|
2454
|
+
number_ms2, quality, bl, chrom_coherence_mean, chrom_prominence_mean,
|
|
2455
|
+
chrom_prominence_scaled_mean, chrom_height_scaled_mean,
|
|
2456
|
+
rt_delta_mean, id_top_score, identified]
|
|
2457
|
+
|
|
2458
|
+
if all(param is None for param in filter_params) and sortby is None:
|
|
2459
|
+
return self.consensus_df.clone()
|
|
2460
|
+
|
|
2461
|
+
import time
|
|
2462
|
+
start_time = time.perf_counter()
|
|
2463
|
+
initial_count = len(self.consensus_df)
|
|
2448
2464
|
|
|
2449
|
-
#
|
|
2450
|
-
|
|
2451
|
-
|
|
2465
|
+
# Pre-check available columns once for efficiency
|
|
2466
|
+
available_columns = set(self.consensus_df.columns)
|
|
2467
|
+
filter_conditions = []
|
|
2468
|
+
warnings = []
|
|
2452
2469
|
|
|
2470
|
+
# Build all filter conditions efficiently
|
|
2471
|
+
if mz is not None:
|
|
2453
2472
|
if isinstance(mz, tuple) and len(mz) == 2:
|
|
2454
|
-
# Check if second value is smaller than first (indicating mz, mz_tol format)
|
|
2455
2473
|
if mz[1] < mz[0]:
|
|
2456
|
-
#
|
|
2474
|
+
# mz_center ± mz_tol format
|
|
2457
2475
|
mz_center, mz_tol = mz
|
|
2458
2476
|
min_mz = mz_center - mz_tol
|
|
2459
2477
|
max_mz = mz_center + mz_tol
|
|
2460
2478
|
else:
|
|
2461
|
-
#
|
|
2479
|
+
# (min_mz, max_mz) format
|
|
2462
2480
|
min_mz, max_mz = mz
|
|
2463
|
-
|
|
2464
|
-
(pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
|
|
2465
|
-
)
|
|
2481
|
+
filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
|
|
2466
2482
|
else:
|
|
2467
|
-
# Single
|
|
2483
|
+
# Single value with default tolerance
|
|
2468
2484
|
default_mz_tol = getattr(self, "parameters", None)
|
|
2469
2485
|
if default_mz_tol and hasattr(default_mz_tol, "eic_mz_tol"):
|
|
2470
2486
|
default_mz_tol = default_mz_tol.eic_mz_tol
|
|
2471
2487
|
else:
|
|
2472
|
-
# Fallback to align_defaults if study parameters not available
|
|
2473
2488
|
from masster.study.defaults.align_def import align_defaults
|
|
2474
|
-
|
|
2475
2489
|
default_mz_tol = align_defaults().mz_max_diff
|
|
2476
|
-
|
|
2490
|
+
|
|
2477
2491
|
min_mz = mz - default_mz_tol
|
|
2478
2492
|
max_mz = mz + default_mz_tol
|
|
2479
|
-
|
|
2480
|
-
(pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
|
|
2481
|
-
)
|
|
2482
|
-
|
|
2483
|
-
self.logger.debug(
|
|
2484
|
-
f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2485
|
-
)
|
|
2493
|
+
filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
|
|
2486
2494
|
|
|
2487
|
-
# Filter by retention time
|
|
2488
2495
|
if rt is not None:
|
|
2489
|
-
consensus_len_before_filter = len(consensus)
|
|
2490
|
-
|
|
2491
2496
|
if isinstance(rt, tuple) and len(rt) == 2:
|
|
2492
|
-
# Check if second value is smaller than first (indicating rt, rt_tol format)
|
|
2493
2497
|
if rt[1] < rt[0]:
|
|
2494
|
-
#
|
|
2498
|
+
# rt_center ± rt_tol format
|
|
2495
2499
|
rt_center, rt_tol = rt
|
|
2496
2500
|
min_rt = rt_center - rt_tol
|
|
2497
2501
|
max_rt = rt_center + rt_tol
|
|
2498
2502
|
else:
|
|
2499
|
-
#
|
|
2503
|
+
# (min_rt, max_rt) format
|
|
2500
2504
|
min_rt, max_rt = rt
|
|
2501
|
-
|
|
2502
|
-
(pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
|
|
2503
|
-
)
|
|
2505
|
+
filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
|
|
2504
2506
|
else:
|
|
2505
|
-
# Single
|
|
2507
|
+
# Single value with default tolerance
|
|
2506
2508
|
default_rt_tol = getattr(self, "parameters", None)
|
|
2507
2509
|
if default_rt_tol and hasattr(default_rt_tol, "eic_rt_tol"):
|
|
2508
2510
|
default_rt_tol = default_rt_tol.eic_rt_tol
|
|
2509
2511
|
else:
|
|
2510
|
-
# Fallback to align_defaults if study parameters not available
|
|
2511
2512
|
from masster.study.defaults.align_def import align_defaults
|
|
2512
|
-
|
|
2513
2513
|
default_rt_tol = align_defaults().rt_tol
|
|
2514
|
-
|
|
2514
|
+
|
|
2515
2515
|
min_rt = rt - default_rt_tol
|
|
2516
2516
|
max_rt = rt + default_rt_tol
|
|
2517
|
-
|
|
2518
|
-
|
|
2519
|
-
|
|
2520
|
-
|
|
2521
|
-
|
|
2522
|
-
|
|
2523
|
-
|
|
2524
|
-
|
|
2525
|
-
|
|
2526
|
-
|
|
2527
|
-
|
|
2528
|
-
|
|
2529
|
-
|
|
2530
|
-
|
|
2531
|
-
|
|
2532
|
-
|
|
2517
|
+
filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
|
|
2518
|
+
|
|
2519
|
+
# Helper function to add range/minimum filters
|
|
2520
|
+
def _add_range_filter(param, column, param_name):
|
|
2521
|
+
if param is not None:
|
|
2522
|
+
if column in available_columns:
|
|
2523
|
+
if isinstance(param, tuple) and len(param) == 2:
|
|
2524
|
+
min_val, max_val = param
|
|
2525
|
+
filter_conditions.append((pl.col(column) >= min_val) & (pl.col(column) <= max_val))
|
|
2526
|
+
else:
|
|
2527
|
+
filter_conditions.append(pl.col(column) >= param)
|
|
2528
|
+
else:
|
|
2529
|
+
warnings.append(f"'{column}' column not found in consensus_df")
|
|
2530
|
+
|
|
2531
|
+
# Apply range/minimum filters efficiently
|
|
2532
|
+
_add_range_filter(inty_mean, "inty_mean", "inty_mean")
|
|
2533
|
+
_add_range_filter(quality, "quality", "quality")
|
|
2534
|
+
_add_range_filter(bl, "bl", "bl")
|
|
2535
|
+
_add_range_filter(chrom_coherence_mean, "chrom_coherence_mean", "chrom_coherence_mean")
|
|
2536
|
+
_add_range_filter(chrom_prominence_mean, "chrom_prominence_mean", "chrom_prominence_mean")
|
|
2537
|
+
_add_range_filter(chrom_prominence_scaled_mean, "chrom_prominence_scaled_mean", "chrom_prominence_scaled_mean")
|
|
2538
|
+
_add_range_filter(chrom_height_scaled_mean, "chrom_height_scaled_mean", "chrom_height_scaled_mean")
|
|
2539
|
+
_add_range_filter(rt_delta_mean, "rt_delta_mean", "rt_delta_mean")
|
|
2540
|
+
_add_range_filter(id_top_score, "id_top_score", "id_top_score")
|
|
2541
|
+
_add_range_filter(number_samples, "number_samples", "number_samples")
|
|
2542
|
+
|
|
2543
|
+
# Handle number_ms2 with column check
|
|
2544
|
+
if number_ms2 is not None:
|
|
2545
|
+
if "number_ms2" in available_columns:
|
|
2546
|
+
if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
|
|
2547
|
+
min_ms2, max_ms2 = number_ms2
|
|
2548
|
+
filter_conditions.append((pl.col("number_ms2") >= min_ms2) & (pl.col("number_ms2") <= max_ms2))
|
|
2549
|
+
else:
|
|
2550
|
+
filter_conditions.append(pl.col("number_ms2") >= number_ms2)
|
|
2533
2551
|
else:
|
|
2534
|
-
|
|
2535
|
-
self.logger.debug(
|
|
2536
|
-
f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2537
|
-
)
|
|
2552
|
+
warnings.append("'number_ms2' column not found in consensus_df")
|
|
2538
2553
|
|
|
2539
|
-
#
|
|
2554
|
+
# Handle consensus_uid (list, single value, or range)
|
|
2540
2555
|
if consensus_uid is not None:
|
|
2541
|
-
consensus_len_before_filter = len(consensus)
|
|
2542
2556
|
if isinstance(consensus_uid, (list, tuple)):
|
|
2543
2557
|
if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
|
|
2544
|
-
# Treat as range
|
|
2558
|
+
# Treat tuple as range
|
|
2545
2559
|
min_uid, max_uid = consensus_uid
|
|
2546
|
-
|
|
2547
|
-
(pl.col("consensus_uid") >= min_uid)
|
|
2548
|
-
& (pl.col("consensus_uid") <= max_uid),
|
|
2549
|
-
)
|
|
2560
|
+
filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
|
|
2550
2561
|
else:
|
|
2551
|
-
# Treat as list
|
|
2552
|
-
|
|
2553
|
-
pl.col("consensus_uid").is_in(consensus_uid),
|
|
2554
|
-
)
|
|
2562
|
+
# Treat as list of values
|
|
2563
|
+
filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
|
|
2555
2564
|
else:
|
|
2556
|
-
|
|
2557
|
-
self.logger.debug(
|
|
2558
|
-
f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2559
|
-
)
|
|
2565
|
+
filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
|
|
2560
2566
|
|
|
2561
|
-
#
|
|
2567
|
+
# Handle consensus_id (list or single value)
|
|
2562
2568
|
if consensus_id is not None:
|
|
2563
|
-
consensus_len_before_filter = len(consensus)
|
|
2564
2569
|
if isinstance(consensus_id, list):
|
|
2565
|
-
|
|
2566
|
-
else:
|
|
2567
|
-
consensus = consensus.filter(pl.col("consensus_id") == consensus_id)
|
|
2568
|
-
self.logger.debug(
|
|
2569
|
-
f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2570
|
-
)
|
|
2571
|
-
|
|
2572
|
-
# Filter by number of samples
|
|
2573
|
-
if number_samples is not None:
|
|
2574
|
-
consensus_len_before_filter = len(consensus)
|
|
2575
|
-
if isinstance(number_samples, tuple) and len(number_samples) == 2:
|
|
2576
|
-
min_samples, max_samples = number_samples
|
|
2577
|
-
consensus = consensus.filter(
|
|
2578
|
-
(pl.col("number_samples") >= min_samples)
|
|
2579
|
-
& (pl.col("number_samples") <= max_samples),
|
|
2580
|
-
)
|
|
2581
|
-
else:
|
|
2582
|
-
consensus = consensus.filter(pl.col("number_samples") >= number_samples)
|
|
2583
|
-
self.logger.debug(
|
|
2584
|
-
f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2585
|
-
)
|
|
2586
|
-
|
|
2587
|
-
# Filter by number of MS2 spectra
|
|
2588
|
-
if number_ms2 is not None:
|
|
2589
|
-
consensus_len_before_filter = len(consensus)
|
|
2590
|
-
if "number_ms2" in consensus.columns:
|
|
2591
|
-
if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
|
|
2592
|
-
min_ms2, max_ms2 = number_ms2
|
|
2593
|
-
consensus = consensus.filter(
|
|
2594
|
-
(pl.col("number_ms2") >= min_ms2)
|
|
2595
|
-
& (pl.col("number_ms2") <= max_ms2),
|
|
2596
|
-
)
|
|
2597
|
-
else:
|
|
2598
|
-
consensus = consensus.filter(pl.col("number_ms2") >= number_ms2)
|
|
2599
|
-
else:
|
|
2600
|
-
self.logger.warning("'number_ms2' column not found in consensus_df")
|
|
2601
|
-
self.logger.debug(
|
|
2602
|
-
f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2603
|
-
)
|
|
2604
|
-
|
|
2605
|
-
# Filter by quality
|
|
2606
|
-
if quality is not None:
|
|
2607
|
-
consensus_len_before_filter = len(consensus)
|
|
2608
|
-
if isinstance(quality, tuple) and len(quality) == 2:
|
|
2609
|
-
min_quality, max_quality = quality
|
|
2610
|
-
consensus = consensus.filter(
|
|
2611
|
-
(pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality),
|
|
2612
|
-
)
|
|
2613
|
-
else:
|
|
2614
|
-
consensus = consensus.filter(pl.col("quality") >= quality)
|
|
2615
|
-
self.logger.debug(
|
|
2616
|
-
f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2617
|
-
)
|
|
2618
|
-
|
|
2619
|
-
# Filter by baseline
|
|
2620
|
-
if bl is not None:
|
|
2621
|
-
consensus_len_before_filter = len(consensus)
|
|
2622
|
-
if "bl" in consensus.columns:
|
|
2623
|
-
if isinstance(bl, tuple) and len(bl) == 2:
|
|
2624
|
-
min_bl, max_bl = bl
|
|
2625
|
-
consensus = consensus.filter(
|
|
2626
|
-
(pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl),
|
|
2627
|
-
)
|
|
2628
|
-
else:
|
|
2629
|
-
consensus = consensus.filter(pl.col("bl") >= bl)
|
|
2570
|
+
filter_conditions.append(pl.col("consensus_id").is_in(consensus_id))
|
|
2630
2571
|
else:
|
|
2631
|
-
|
|
2632
|
-
self.logger.debug(
|
|
2633
|
-
f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2634
|
-
)
|
|
2572
|
+
filter_conditions.append(pl.col("consensus_id") == consensus_id)
|
|
2635
2573
|
|
|
2636
|
-
#
|
|
2637
|
-
if
|
|
2638
|
-
|
|
2639
|
-
|
|
2640
|
-
|
|
2641
|
-
isinstance(chrom_coherence_mean, tuple)
|
|
2642
|
-
and len(chrom_coherence_mean) == 2
|
|
2643
|
-
):
|
|
2644
|
-
min_coherence, max_coherence = chrom_coherence_mean
|
|
2645
|
-
consensus = consensus.filter(
|
|
2646
|
-
(pl.col("chrom_coherence_mean") >= min_coherence)
|
|
2647
|
-
& (pl.col("chrom_coherence_mean") <= max_coherence),
|
|
2648
|
-
)
|
|
2574
|
+
# Handle identified status filter
|
|
2575
|
+
if identified is not None:
|
|
2576
|
+
if "id_top_name" in available_columns:
|
|
2577
|
+
if identified:
|
|
2578
|
+
filter_conditions.append(pl.col("id_top_name").is_not_null())
|
|
2649
2579
|
else:
|
|
2650
|
-
|
|
2651
|
-
pl.col("chrom_coherence_mean") >= chrom_coherence_mean,
|
|
2652
|
-
)
|
|
2580
|
+
filter_conditions.append(pl.col("id_top_name").is_null())
|
|
2653
2581
|
else:
|
|
2654
|
-
|
|
2655
|
-
"'chrom_coherence_mean' column not found in consensus_df",
|
|
2656
|
-
)
|
|
2657
|
-
self.logger.debug(
|
|
2658
|
-
f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2659
|
-
)
|
|
2582
|
+
warnings.append("'id_top_name' column not found in consensus_df")
|
|
2660
2583
|
|
|
2661
|
-
#
|
|
2662
|
-
|
|
2663
|
-
|
|
2664
|
-
if "chrom_prominence_mean" in consensus.columns:
|
|
2665
|
-
if (
|
|
2666
|
-
isinstance(chrom_prominence_mean, tuple)
|
|
2667
|
-
and len(chrom_prominence_mean) == 2
|
|
2668
|
-
):
|
|
2669
|
-
min_prominence, max_prominence = chrom_prominence_mean
|
|
2670
|
-
consensus = consensus.filter(
|
|
2671
|
-
(pl.col("chrom_prominence_mean") >= min_prominence)
|
|
2672
|
-
& (pl.col("chrom_prominence_mean") <= max_prominence),
|
|
2673
|
-
)
|
|
2674
|
-
else:
|
|
2675
|
-
consensus = consensus.filter(
|
|
2676
|
-
pl.col("chrom_prominence_mean") >= chrom_prominence_mean,
|
|
2677
|
-
)
|
|
2678
|
-
else:
|
|
2679
|
-
self.logger.warning(
|
|
2680
|
-
"'chrom_prominence_mean' column not found in consensus_df",
|
|
2681
|
-
)
|
|
2682
|
-
self.logger.debug(
|
|
2683
|
-
f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2684
|
-
)
|
|
2584
|
+
# Log warnings once
|
|
2585
|
+
for warning in warnings:
|
|
2586
|
+
self.logger.warning(warning)
|
|
2685
2587
|
|
|
2686
|
-
#
|
|
2687
|
-
if
|
|
2688
|
-
|
|
2689
|
-
|
|
2690
|
-
|
|
2691
|
-
|
|
2692
|
-
|
|
2693
|
-
|
|
2694
|
-
|
|
2695
|
-
|
|
2696
|
-
|
|
2697
|
-
|
|
2698
|
-
(pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
|
|
2699
|
-
& (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
|
|
2700
|
-
)
|
|
2701
|
-
else:
|
|
2702
|
-
consensus = consensus.filter(
|
|
2703
|
-
pl.col("chrom_prominence_scaled_mean")
|
|
2704
|
-
>= chrom_prominence_scaled_mean,
|
|
2705
|
-
)
|
|
2706
|
-
else:
|
|
2707
|
-
self.logger.warning(
|
|
2708
|
-
"'chrom_prominence_scaled_mean' column not found in consensus_df",
|
|
2709
|
-
)
|
|
2710
|
-
self.logger.debug(
|
|
2711
|
-
f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2588
|
+
# Apply all filters at once using lazy evaluation for optimal performance
|
|
2589
|
+
if filter_conditions:
|
|
2590
|
+
# Combine all conditions efficiently using reduce
|
|
2591
|
+
from functools import reduce
|
|
2592
|
+
import operator
|
|
2593
|
+
combined_filter = reduce(operator.and_, filter_conditions)
|
|
2594
|
+
|
|
2595
|
+
consensus = (
|
|
2596
|
+
self.consensus_df
|
|
2597
|
+
.lazy()
|
|
2598
|
+
.filter(combined_filter)
|
|
2599
|
+
.collect(streaming=True)
|
|
2712
2600
|
)
|
|
2601
|
+
else:
|
|
2602
|
+
consensus = self.consensus_df.clone()
|
|
2713
2603
|
|
|
2714
|
-
|
|
2715
|
-
if chrom_height_scaled_mean is not None:
|
|
2716
|
-
consensus_len_before_filter = len(consensus)
|
|
2717
|
-
if "chrom_height_scaled_mean" in consensus.columns:
|
|
2718
|
-
if (
|
|
2719
|
-
isinstance(chrom_height_scaled_mean, tuple)
|
|
2720
|
-
and len(chrom_height_scaled_mean) == 2
|
|
2721
|
-
):
|
|
2722
|
-
min_height_scaled, max_height_scaled = chrom_height_scaled_mean
|
|
2723
|
-
consensus = consensus.filter(
|
|
2724
|
-
(pl.col("chrom_height_scaled_mean") >= min_height_scaled)
|
|
2725
|
-
& (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
|
|
2726
|
-
)
|
|
2727
|
-
else:
|
|
2728
|
-
consensus = consensus.filter(
|
|
2729
|
-
pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean,
|
|
2730
|
-
)
|
|
2731
|
-
else:
|
|
2732
|
-
self.logger.warning(
|
|
2733
|
-
"'chrom_height_scaled_mean' column not found in consensus_df",
|
|
2734
|
-
)
|
|
2735
|
-
self.logger.debug(
|
|
2736
|
-
f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2737
|
-
)
|
|
2604
|
+
final_count = len(consensus)
|
|
2738
2605
|
|
|
2739
|
-
#
|
|
2740
|
-
if
|
|
2741
|
-
|
|
2742
|
-
|
|
2743
|
-
if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
|
|
2744
|
-
min_rt_delta, max_rt_delta = rt_delta_mean
|
|
2745
|
-
consensus = consensus.filter(
|
|
2746
|
-
(pl.col("rt_delta_mean") >= min_rt_delta)
|
|
2747
|
-
& (pl.col("rt_delta_mean") <= max_rt_delta),
|
|
2748
|
-
)
|
|
2749
|
-
else:
|
|
2750
|
-
consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
|
|
2751
|
-
else:
|
|
2752
|
-
self.logger.warning("'rt_delta_mean' column not found in consensus_df")
|
|
2753
|
-
self.logger.debug(
|
|
2754
|
-
f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2755
|
-
)
|
|
2756
|
-
|
|
2757
|
-
if len(consensus) == 0:
|
|
2758
|
-
self.logger.warning(
|
|
2759
|
-
"No consensus features remaining after applying selection criteria.",
|
|
2760
|
-
)
|
|
2761
|
-
else:
|
|
2762
|
-
self.logger.info(
|
|
2763
|
-
f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})",
|
|
2764
|
-
)
|
|
2606
|
+
# Early return if no results
|
|
2607
|
+
if final_count == 0:
|
|
2608
|
+
self.logger.warning("No consensus features remaining after applying selection criteria.")
|
|
2609
|
+
return pl.DataFrame()
|
|
2765
2610
|
|
|
2766
2611
|
# Sort the results if sortby is specified
|
|
2767
2612
|
if sortby is not None:
|
|
2768
2613
|
if isinstance(sortby, str):
|
|
2769
|
-
# Single column
|
|
2770
2614
|
if sortby in consensus.columns:
|
|
2771
2615
|
consensus = consensus.sort(sortby, descending=descending)
|
|
2772
2616
|
else:
|
|
2773
|
-
self.logger.warning(
|
|
2774
|
-
f"Sort column '{sortby}' not found in consensus DataFrame",
|
|
2775
|
-
)
|
|
2617
|
+
self.logger.warning(f"Sort column '{sortby}' not found in consensus DataFrame")
|
|
2776
2618
|
elif isinstance(sortby, (list, tuple)):
|
|
2777
|
-
# Multiple columns
|
|
2778
2619
|
valid_columns = [col for col in sortby if col in consensus.columns]
|
|
2779
2620
|
invalid_columns = [col for col in sortby if col not in consensus.columns]
|
|
2780
|
-
|
|
2621
|
+
|
|
2781
2622
|
if invalid_columns:
|
|
2782
|
-
self.logger.warning(
|
|
2783
|
-
|
|
2784
|
-
)
|
|
2785
|
-
|
|
2623
|
+
self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
|
|
2624
|
+
|
|
2786
2625
|
if valid_columns:
|
|
2787
2626
|
consensus = consensus.sort(valid_columns, descending=descending)
|
|
2788
2627
|
else:
|
|
2789
|
-
self.logger.warning(
|
|
2790
|
-
|
|
2791
|
-
|
|
2628
|
+
self.logger.warning(f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.")
|
|
2629
|
+
|
|
2630
|
+
# Log performance metrics
|
|
2631
|
+
elapsed_time = time.perf_counter() - start_time
|
|
2632
|
+
removed_count = initial_count - final_count
|
|
2633
|
+
|
|
2634
|
+
self.logger.info(f"Selected consensus features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s")
|
|
2792
2635
|
|
|
2793
2636
|
return consensus
|
|
2794
2637
|
|
|
2795
2638
|
|
|
2796
2639
|
def consensus_filter(self, consensus):
|
|
2797
2640
|
"""
|
|
2798
|
-
Filter consensus_df by
|
|
2799
|
-
This
|
|
2641
|
+
Filter consensus_df by keeping only consensus features that match the given criteria.
|
|
2642
|
+
This keeps only the specified consensus features and removes all others.
|
|
2643
|
+
Also updates related entries in consensus_mapping_df, features_df, and consensus_ms2.
|
|
2800
2644
|
|
|
2801
2645
|
Parameters:
|
|
2802
|
-
consensus: Consensus features to
|
|
2646
|
+
consensus: Consensus features to keep. Can be:
|
|
2803
2647
|
- polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
|
|
2804
|
-
- list: List of consensus_uids to
|
|
2805
|
-
- int: Single consensus_uid to
|
|
2648
|
+
- list: List of consensus_uids to keep
|
|
2649
|
+
- int: Single consensus_uid to keep
|
|
2806
2650
|
|
|
2807
2651
|
Returns:
|
|
2808
2652
|
None (modifies self.consensus_df and related DataFrames in place)
|
|
@@ -2813,71 +2657,73 @@ def consensus_filter(self, consensus):
|
|
|
2813
2657
|
|
|
2814
2658
|
initial_consensus_count = len(self.consensus_df)
|
|
2815
2659
|
|
|
2816
|
-
# Determine consensus_uids to
|
|
2660
|
+
# Determine consensus_uids to keep
|
|
2817
2661
|
if isinstance(consensus, pl.DataFrame):
|
|
2818
2662
|
if "consensus_uid" not in consensus.columns:
|
|
2819
2663
|
self.logger.error("consensus DataFrame must contain 'consensus_uid' column")
|
|
2820
2664
|
return
|
|
2821
|
-
|
|
2665
|
+
consensus_uids_to_keep = consensus["consensus_uid"].to_list()
|
|
2822
2666
|
elif isinstance(consensus, list):
|
|
2823
|
-
|
|
2667
|
+
consensus_uids_to_keep = consensus
|
|
2824
2668
|
elif isinstance(consensus, int):
|
|
2825
|
-
|
|
2669
|
+
consensus_uids_to_keep = [consensus]
|
|
2826
2670
|
else:
|
|
2827
2671
|
self.logger.error("consensus parameter must be a DataFrame, list, or int")
|
|
2828
2672
|
return
|
|
2829
2673
|
|
|
2830
|
-
if not
|
|
2674
|
+
if not consensus_uids_to_keep:
|
|
2831
2675
|
self.logger.warning("No consensus UIDs provided for filtering.")
|
|
2832
2676
|
return
|
|
2833
2677
|
|
|
2834
|
-
# Get feature_uids that need to be
|
|
2835
|
-
|
|
2678
|
+
# Get feature_uids that need to be kept in features_df
|
|
2679
|
+
feature_uids_to_keep = []
|
|
2836
2680
|
if (
|
|
2837
2681
|
self.consensus_mapping_df is not None
|
|
2838
2682
|
and not self.consensus_mapping_df.is_empty()
|
|
2839
2683
|
):
|
|
2840
|
-
|
|
2841
|
-
pl.col("consensus_uid").is_in(
|
|
2684
|
+
feature_uids_to_keep = self.consensus_mapping_df.filter(
|
|
2685
|
+
pl.col("consensus_uid").is_in(consensus_uids_to_keep),
|
|
2842
2686
|
)["feature_uid"].to_list()
|
|
2843
2687
|
|
|
2844
|
-
#
|
|
2688
|
+
# Keep only specified consensus features in consensus_df
|
|
2845
2689
|
self.consensus_df = self.consensus_df.filter(
|
|
2846
|
-
|
|
2690
|
+
pl.col("consensus_uid").is_in(consensus_uids_to_keep),
|
|
2847
2691
|
)
|
|
2848
2692
|
|
|
2849
|
-
#
|
|
2693
|
+
# Keep only relevant entries in consensus_mapping_df
|
|
2850
2694
|
if (
|
|
2851
2695
|
self.consensus_mapping_df is not None
|
|
2852
2696
|
and not self.consensus_mapping_df.is_empty()
|
|
2853
2697
|
):
|
|
2854
2698
|
initial_mapping_count = len(self.consensus_mapping_df)
|
|
2855
2699
|
self.consensus_mapping_df = self.consensus_mapping_df.filter(
|
|
2856
|
-
|
|
2700
|
+
pl.col("consensus_uid").is_in(consensus_uids_to_keep),
|
|
2857
2701
|
)
|
|
2858
|
-
|
|
2702
|
+
remaining_mapping_count = len(self.consensus_mapping_df)
|
|
2703
|
+
removed_mapping_count = initial_mapping_count - remaining_mapping_count
|
|
2859
2704
|
if removed_mapping_count > 0:
|
|
2860
2705
|
self.logger.debug(
|
|
2861
2706
|
f"Removed {removed_mapping_count} entries from consensus_mapping_df",
|
|
2862
2707
|
)
|
|
2863
2708
|
|
|
2864
|
-
#
|
|
2709
|
+
# Keep only corresponding features in features_df
|
|
2865
2710
|
if (
|
|
2866
|
-
|
|
2711
|
+
feature_uids_to_keep
|
|
2867
2712
|
and self.features_df is not None
|
|
2868
2713
|
and not self.features_df.is_empty()
|
|
2869
2714
|
):
|
|
2870
2715
|
initial_features_count = len(self.features_df)
|
|
2871
2716
|
self.features_df = self.features_df.filter(
|
|
2872
|
-
|
|
2717
|
+
pl.col("feature_uid").is_in(feature_uids_to_keep),
|
|
2873
2718
|
)
|
|
2874
|
-
|
|
2719
|
+
remaining_features_count = len(self.features_df)
|
|
2720
|
+
removed_features_count = initial_features_count - remaining_features_count
|
|
2875
2721
|
if removed_features_count > 0:
|
|
2876
2722
|
self.logger.debug(
|
|
2877
2723
|
f"Removed {removed_features_count} entries from features_df",
|
|
2878
2724
|
)
|
|
2879
2725
|
|
|
2880
|
-
#
|
|
2726
|
+
# Keep only relevant entries in consensus_ms2 if it exists
|
|
2881
2727
|
if (
|
|
2882
2728
|
hasattr(self, "consensus_ms2")
|
|
2883
2729
|
and self.consensus_ms2 is not None
|
|
@@ -2885,22 +2731,25 @@ def consensus_filter(self, consensus):
|
|
|
2885
2731
|
):
|
|
2886
2732
|
initial_ms2_count = len(self.consensus_ms2)
|
|
2887
2733
|
self.consensus_ms2 = self.consensus_ms2.filter(
|
|
2888
|
-
|
|
2734
|
+
pl.col("consensus_uid").is_in(consensus_uids_to_keep),
|
|
2889
2735
|
)
|
|
2890
|
-
|
|
2736
|
+
remaining_ms2_count = len(self.consensus_ms2)
|
|
2737
|
+
removed_ms2_count = initial_ms2_count - remaining_ms2_count
|
|
2891
2738
|
if removed_ms2_count > 0:
|
|
2892
2739
|
self.logger.debug(f"Removed {removed_ms2_count} entries from consensus_ms2")
|
|
2893
2740
|
|
|
2894
|
-
|
|
2741
|
+
remaining_consensus_count = len(self.consensus_df)
|
|
2742
|
+
removed_consensus_count = initial_consensus_count - remaining_consensus_count
|
|
2895
2743
|
self.logger.info(
|
|
2896
|
-
f"Filtered
|
|
2744
|
+
f"Filtered consensus features: kept {remaining_consensus_count}, removed {removed_consensus_count}",
|
|
2897
2745
|
)
|
|
2898
2746
|
|
|
2899
2747
|
|
|
2900
2748
|
def consensus_delete(self, consensus):
|
|
2901
2749
|
"""
|
|
2902
2750
|
Delete consensus features from consensus_df based on consensus identifiers.
|
|
2903
|
-
This
|
|
2751
|
+
This removes the specified consensus features and keeps all others (opposite of consensus_filter).
|
|
2752
|
+
Also removes related entries from consensus_mapping_df, features_df, and consensus_ms2.
|
|
2904
2753
|
|
|
2905
2754
|
Parameters:
|
|
2906
2755
|
consensus: Consensus features to delete. Can be:
|
|
@@ -2911,7 +2760,110 @@ def consensus_delete(self, consensus):
|
|
|
2911
2760
|
Returns:
|
|
2912
2761
|
None (modifies self.consensus_df and related DataFrames in place)
|
|
2913
2762
|
"""
|
|
2914
|
-
self.
|
|
2763
|
+
if self.consensus_df is None or self.consensus_df.is_empty():
|
|
2764
|
+
self.logger.warning("No consensus features found in study.")
|
|
2765
|
+
return
|
|
2766
|
+
|
|
2767
|
+
# Early return if no consensus provided
|
|
2768
|
+
if consensus is None:
|
|
2769
|
+
self.logger.warning("No consensus provided for deletion.")
|
|
2770
|
+
return
|
|
2771
|
+
|
|
2772
|
+
initial_consensus_count = len(self.consensus_df)
|
|
2773
|
+
|
|
2774
|
+
# Determine consensus_uids to remove
|
|
2775
|
+
if isinstance(consensus, pl.DataFrame):
|
|
2776
|
+
if "consensus_uid" not in consensus.columns:
|
|
2777
|
+
self.logger.error("consensus DataFrame must contain 'consensus_uid' column")
|
|
2778
|
+
return
|
|
2779
|
+
consensus_uids_to_remove = consensus["consensus_uid"].to_list()
|
|
2780
|
+
elif isinstance(consensus, list):
|
|
2781
|
+
consensus_uids_to_remove = consensus
|
|
2782
|
+
elif isinstance(consensus, int):
|
|
2783
|
+
consensus_uids_to_remove = [consensus]
|
|
2784
|
+
else:
|
|
2785
|
+
self.logger.error("consensus parameter must be a DataFrame, list, or int")
|
|
2786
|
+
return
|
|
2787
|
+
|
|
2788
|
+
if not consensus_uids_to_remove:
|
|
2789
|
+
self.logger.warning("No consensus UIDs provided for deletion.")
|
|
2790
|
+
return
|
|
2791
|
+
|
|
2792
|
+
# Convert to set for faster lookup if list is large
|
|
2793
|
+
if len(consensus_uids_to_remove) > 100:
|
|
2794
|
+
consensus_uids_set = set(consensus_uids_to_remove)
|
|
2795
|
+
# Use the set for filtering if it's significantly smaller
|
|
2796
|
+
if len(consensus_uids_set) < len(consensus_uids_to_remove) * 0.8:
|
|
2797
|
+
consensus_uids_to_remove = list(consensus_uids_set)
|
|
2798
|
+
|
|
2799
|
+
# Get feature_uids that need to be removed from features_df
|
|
2800
|
+
feature_uids_to_remove = []
|
|
2801
|
+
if (
|
|
2802
|
+
self.consensus_mapping_df is not None
|
|
2803
|
+
and not self.consensus_mapping_df.is_empty()
|
|
2804
|
+
):
|
|
2805
|
+
feature_uids_to_remove = self.consensus_mapping_df.filter(
|
|
2806
|
+
pl.col("consensus_uid").is_in(consensus_uids_to_remove),
|
|
2807
|
+
)["feature_uid"].to_list()
|
|
2808
|
+
|
|
2809
|
+
# Remove consensus features from consensus_df
|
|
2810
|
+
self.consensus_df = self.consensus_df.filter(
|
|
2811
|
+
~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
|
|
2812
|
+
)
|
|
2813
|
+
|
|
2814
|
+
# Remove from consensus_mapping_df
|
|
2815
|
+
mapping_removed_count = 0
|
|
2816
|
+
if (
|
|
2817
|
+
self.consensus_mapping_df is not None
|
|
2818
|
+
and not self.consensus_mapping_df.is_empty()
|
|
2819
|
+
):
|
|
2820
|
+
initial_mapping_count = len(self.consensus_mapping_df)
|
|
2821
|
+
self.consensus_mapping_df = self.consensus_mapping_df.filter(
|
|
2822
|
+
~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
|
|
2823
|
+
)
|
|
2824
|
+
mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
|
|
2825
|
+
|
|
2826
|
+
# Remove corresponding features from features_df
|
|
2827
|
+
features_removed_count = 0
|
|
2828
|
+
if (
|
|
2829
|
+
feature_uids_to_remove
|
|
2830
|
+
and self.features_df is not None
|
|
2831
|
+
and not self.features_df.is_empty()
|
|
2832
|
+
):
|
|
2833
|
+
initial_features_count = len(self.features_df)
|
|
2834
|
+
self.features_df = self.features_df.filter(
|
|
2835
|
+
~pl.col("feature_uid").is_in(feature_uids_to_remove),
|
|
2836
|
+
)
|
|
2837
|
+
features_removed_count = initial_features_count - len(self.features_df)
|
|
2838
|
+
|
|
2839
|
+
# Remove from consensus_ms2 if it exists
|
|
2840
|
+
ms2_removed_count = 0
|
|
2841
|
+
if (
|
|
2842
|
+
hasattr(self, "consensus_ms2")
|
|
2843
|
+
and self.consensus_ms2 is not None
|
|
2844
|
+
and not self.consensus_ms2.is_empty()
|
|
2845
|
+
):
|
|
2846
|
+
initial_ms2_count = len(self.consensus_ms2)
|
|
2847
|
+
self.consensus_ms2 = self.consensus_ms2.filter(
|
|
2848
|
+
~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
|
|
2849
|
+
)
|
|
2850
|
+
ms2_removed_count = initial_ms2_count - len(self.consensus_ms2)
|
|
2851
|
+
|
|
2852
|
+
# Calculate results and log efficiently
|
|
2853
|
+
final_consensus_count = len(self.consensus_df)
|
|
2854
|
+
consensus_removed_count = initial_consensus_count - final_consensus_count
|
|
2855
|
+
|
|
2856
|
+
# Single comprehensive log message
|
|
2857
|
+
log_parts = [f"Deleted {consensus_removed_count} consensus features"]
|
|
2858
|
+
if mapping_removed_count > 0:
|
|
2859
|
+
log_parts.append(f"{mapping_removed_count} consensus mappings")
|
|
2860
|
+
if features_removed_count > 0:
|
|
2861
|
+
log_parts.append(f"{features_removed_count} features")
|
|
2862
|
+
if ms2_removed_count > 0:
|
|
2863
|
+
log_parts.append(f"{ms2_removed_count} MS2 spectra")
|
|
2864
|
+
|
|
2865
|
+
log_message = ". ".join(log_parts) + f". Remaining consensus: {final_consensus_count}"
|
|
2866
|
+
self.logger.info(log_message)
|
|
2915
2867
|
|
|
2916
2868
|
|
|
2917
2869
|
# =====================================================================================
|