masster 0.4.10__py3-none-any.whl → 0.4.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/lib/lib.py +45 -3
- masster/sample/sample5_schema.json +44 -44
- masster/study/h5.py +0 -13
- masster/study/helpers.py +263 -310
- masster/study/id.py +564 -324
- masster/study/plot.py +174 -312
- masster/study/processing.py +5 -0
- masster/study/study.py +95 -60
- masster/study/study5_schema.json +157 -145
- {masster-0.4.10.dist-info → masster-0.4.12.dist-info}/METADATA +1 -1
- {masster-0.4.10.dist-info → masster-0.4.12.dist-info}/RECORD +15 -15
- {masster-0.4.10.dist-info → masster-0.4.12.dist-info}/WHEEL +0 -0
- {masster-0.4.10.dist-info → masster-0.4.12.dist-info}/entry_points.txt +0 -0
- {masster-0.4.10.dist-info → masster-0.4.12.dist-info}/licenses/LICENSE +0 -0
masster/study/helpers.py
CHANGED
|
@@ -780,6 +780,7 @@ def _get_sample_uids(self, samples=None, seed=42):
|
|
|
780
780
|
# choose a random sample of sample_uids
|
|
781
781
|
if len(self.samples_df) > samples:
|
|
782
782
|
np.random.seed(seed) # for reproducibility
|
|
783
|
+
self.logger.info(f"Randomly selected {samples} samples")
|
|
783
784
|
return np.random.choice(
|
|
784
785
|
self.samples_df["sample_uid"].to_list(),
|
|
785
786
|
samples,
|
|
@@ -1742,13 +1743,12 @@ def features_select(
|
|
|
1742
1743
|
elapsed_time = time.perf_counter() - start_time
|
|
1743
1744
|
final_count = len(result)
|
|
1744
1745
|
removed_count = initial_count - final_count
|
|
1745
|
-
throughput = final_count / elapsed_time if elapsed_time > 0 else 0
|
|
1746
1746
|
|
|
1747
1747
|
if final_count == 0:
|
|
1748
1748
|
self.logger.warning("No features remaining after applying selection criteria.")
|
|
1749
1749
|
else:
|
|
1750
1750
|
self.logger.debug(
|
|
1751
|
-
f"Selected features: {final_count:,} (removed: {removed_count:,})"
|
|
1751
|
+
f"Selected features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s"
|
|
1752
1752
|
)
|
|
1753
1753
|
|
|
1754
1754
|
return result
|
|
@@ -2093,8 +2093,6 @@ def features_filter(
|
|
|
2093
2093
|
self.logger.warning("No features provided for filtering.")
|
|
2094
2094
|
return
|
|
2095
2095
|
|
|
2096
|
-
import time
|
|
2097
|
-
start_time = time.perf_counter()
|
|
2098
2096
|
initial_count = len(self.features_df)
|
|
2099
2097
|
|
|
2100
2098
|
# Extract feature UIDs efficiently
|
|
@@ -2405,12 +2403,16 @@ def consensus_select(
|
|
|
2405
2403
|
chrom_prominence_scaled_mean=None,
|
|
2406
2404
|
chrom_height_scaled_mean=None,
|
|
2407
2405
|
rt_delta_mean=None,
|
|
2406
|
+
id_top_score=None,
|
|
2407
|
+
identified=None,
|
|
2408
2408
|
sortby=None,
|
|
2409
2409
|
descending=True,
|
|
2410
2410
|
):
|
|
2411
2411
|
"""
|
|
2412
2412
|
Select consensus features from consensus_df based on specified criteria and return the filtered DataFrame.
|
|
2413
2413
|
|
|
2414
|
+
OPTIMIZED VERSION: Enhanced performance with lazy evaluation, vectorized operations, and efficient filtering.
|
|
2415
|
+
|
|
2414
2416
|
Parameters:
|
|
2415
2417
|
mz: m/z filter with flexible formats:
|
|
2416
2418
|
- float: m/z value ± default tolerance (uses study.parameters.eic_mz_tol)
|
|
@@ -2432,6 +2434,11 @@ def consensus_select(
|
|
|
2432
2434
|
chrom_prominence_scaled_mean: mean scaled chromatogram prominence filter (tuple for range, single value for minimum)
|
|
2433
2435
|
chrom_height_scaled_mean: mean scaled chromatogram height filter (tuple for range, single value for minimum)
|
|
2434
2436
|
rt_delta_mean: mean RT delta filter (tuple for range, single value for minimum)
|
|
2437
|
+
id_top_score: identification top score filter (tuple for range, single value for minimum)
|
|
2438
|
+
identified: filter by identification status:
|
|
2439
|
+
- True: select only rows with id_top_name not null
|
|
2440
|
+
- False: select only rows with id_top_name null
|
|
2441
|
+
- None: no filtering (default)
|
|
2435
2442
|
sortby: column name(s) to sort by (string, list of strings, or None for no sorting)
|
|
2436
2443
|
descending: sort direction (True for descending, False for ascending, default is True)
|
|
2437
2444
|
|
|
@@ -2442,366 +2449,204 @@ def consensus_select(
|
|
|
2442
2449
|
self.logger.warning("No consensus features found in study.")
|
|
2443
2450
|
return pl.DataFrame()
|
|
2444
2451
|
|
|
2445
|
-
|
|
2446
|
-
|
|
2452
|
+
# Early return optimization - check if any filters are provided
|
|
2453
|
+
filter_params = [mz, rt, inty_mean, consensus_uid, consensus_id, number_samples,
|
|
2454
|
+
number_ms2, quality, bl, chrom_coherence_mean, chrom_prominence_mean,
|
|
2455
|
+
chrom_prominence_scaled_mean, chrom_height_scaled_mean,
|
|
2456
|
+
rt_delta_mean, id_top_score, identified]
|
|
2457
|
+
|
|
2458
|
+
if all(param is None for param in filter_params) and sortby is None:
|
|
2459
|
+
return self.consensus_df.clone()
|
|
2460
|
+
|
|
2461
|
+
import time
|
|
2462
|
+
start_time = time.perf_counter()
|
|
2463
|
+
initial_count = len(self.consensus_df)
|
|
2447
2464
|
|
|
2448
|
-
#
|
|
2449
|
-
|
|
2450
|
-
|
|
2465
|
+
# Pre-check available columns once for efficiency
|
|
2466
|
+
available_columns = set(self.consensus_df.columns)
|
|
2467
|
+
filter_conditions = []
|
|
2468
|
+
warnings = []
|
|
2451
2469
|
|
|
2470
|
+
# Build all filter conditions efficiently
|
|
2471
|
+
if mz is not None:
|
|
2452
2472
|
if isinstance(mz, tuple) and len(mz) == 2:
|
|
2453
|
-
# Check if second value is smaller than first (indicating mz, mz_tol format)
|
|
2454
2473
|
if mz[1] < mz[0]:
|
|
2455
|
-
#
|
|
2474
|
+
# mz_center ± mz_tol format
|
|
2456
2475
|
mz_center, mz_tol = mz
|
|
2457
2476
|
min_mz = mz_center - mz_tol
|
|
2458
2477
|
max_mz = mz_center + mz_tol
|
|
2459
2478
|
else:
|
|
2460
|
-
#
|
|
2479
|
+
# (min_mz, max_mz) format
|
|
2461
2480
|
min_mz, max_mz = mz
|
|
2462
|
-
|
|
2463
|
-
(pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
|
|
2464
|
-
)
|
|
2481
|
+
filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
|
|
2465
2482
|
else:
|
|
2466
|
-
# Single
|
|
2483
|
+
# Single value with default tolerance
|
|
2467
2484
|
default_mz_tol = getattr(self, "parameters", None)
|
|
2468
2485
|
if default_mz_tol and hasattr(default_mz_tol, "eic_mz_tol"):
|
|
2469
2486
|
default_mz_tol = default_mz_tol.eic_mz_tol
|
|
2470
2487
|
else:
|
|
2471
|
-
# Fallback to align_defaults if study parameters not available
|
|
2472
2488
|
from masster.study.defaults.align_def import align_defaults
|
|
2473
|
-
|
|
2474
2489
|
default_mz_tol = align_defaults().mz_max_diff
|
|
2475
|
-
|
|
2490
|
+
|
|
2476
2491
|
min_mz = mz - default_mz_tol
|
|
2477
2492
|
max_mz = mz + default_mz_tol
|
|
2478
|
-
|
|
2479
|
-
(pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz),
|
|
2480
|
-
)
|
|
2481
|
-
|
|
2482
|
-
self.logger.debug(
|
|
2483
|
-
f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2484
|
-
)
|
|
2493
|
+
filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
|
|
2485
2494
|
|
|
2486
|
-
# Filter by retention time
|
|
2487
2495
|
if rt is not None:
|
|
2488
|
-
consensus_len_before_filter = len(consensus)
|
|
2489
|
-
|
|
2490
2496
|
if isinstance(rt, tuple) and len(rt) == 2:
|
|
2491
|
-
# Check if second value is smaller than first (indicating rt, rt_tol format)
|
|
2492
2497
|
if rt[1] < rt[0]:
|
|
2493
|
-
#
|
|
2498
|
+
# rt_center ± rt_tol format
|
|
2494
2499
|
rt_center, rt_tol = rt
|
|
2495
2500
|
min_rt = rt_center - rt_tol
|
|
2496
2501
|
max_rt = rt_center + rt_tol
|
|
2497
2502
|
else:
|
|
2498
|
-
#
|
|
2503
|
+
# (min_rt, max_rt) format
|
|
2499
2504
|
min_rt, max_rt = rt
|
|
2500
|
-
|
|
2501
|
-
(pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt),
|
|
2502
|
-
)
|
|
2505
|
+
filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
|
|
2503
2506
|
else:
|
|
2504
|
-
# Single
|
|
2507
|
+
# Single value with default tolerance
|
|
2505
2508
|
default_rt_tol = getattr(self, "parameters", None)
|
|
2506
2509
|
if default_rt_tol and hasattr(default_rt_tol, "eic_rt_tol"):
|
|
2507
2510
|
default_rt_tol = default_rt_tol.eic_rt_tol
|
|
2508
2511
|
else:
|
|
2509
|
-
# Fallback to align_defaults if study parameters not available
|
|
2510
2512
|
from masster.study.defaults.align_def import align_defaults
|
|
2511
|
-
|
|
2512
2513
|
default_rt_tol = align_defaults().rt_tol
|
|
2513
|
-
|
|
2514
|
+
|
|
2514
2515
|
min_rt = rt - default_rt_tol
|
|
2515
2516
|
max_rt = rt + default_rt_tol
|
|
2516
|
-
|
|
2517
|
-
|
|
2518
|
-
|
|
2519
|
-
|
|
2520
|
-
|
|
2521
|
-
|
|
2522
|
-
|
|
2523
|
-
|
|
2524
|
-
|
|
2525
|
-
|
|
2526
|
-
|
|
2527
|
-
|
|
2528
|
-
|
|
2529
|
-
|
|
2530
|
-
|
|
2531
|
-
|
|
2517
|
+
filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
|
|
2518
|
+
|
|
2519
|
+
# Helper function to add range/minimum filters
|
|
2520
|
+
def _add_range_filter(param, column, param_name):
|
|
2521
|
+
if param is not None:
|
|
2522
|
+
if column in available_columns:
|
|
2523
|
+
if isinstance(param, tuple) and len(param) == 2:
|
|
2524
|
+
min_val, max_val = param
|
|
2525
|
+
filter_conditions.append((pl.col(column) >= min_val) & (pl.col(column) <= max_val))
|
|
2526
|
+
else:
|
|
2527
|
+
filter_conditions.append(pl.col(column) >= param)
|
|
2528
|
+
else:
|
|
2529
|
+
warnings.append(f"'{column}' column not found in consensus_df")
|
|
2530
|
+
|
|
2531
|
+
# Apply range/minimum filters efficiently
|
|
2532
|
+
_add_range_filter(inty_mean, "inty_mean", "inty_mean")
|
|
2533
|
+
_add_range_filter(quality, "quality", "quality")
|
|
2534
|
+
_add_range_filter(bl, "bl", "bl")
|
|
2535
|
+
_add_range_filter(chrom_coherence_mean, "chrom_coherence_mean", "chrom_coherence_mean")
|
|
2536
|
+
_add_range_filter(chrom_prominence_mean, "chrom_prominence_mean", "chrom_prominence_mean")
|
|
2537
|
+
_add_range_filter(chrom_prominence_scaled_mean, "chrom_prominence_scaled_mean", "chrom_prominence_scaled_mean")
|
|
2538
|
+
_add_range_filter(chrom_height_scaled_mean, "chrom_height_scaled_mean", "chrom_height_scaled_mean")
|
|
2539
|
+
_add_range_filter(rt_delta_mean, "rt_delta_mean", "rt_delta_mean")
|
|
2540
|
+
_add_range_filter(id_top_score, "id_top_score", "id_top_score")
|
|
2541
|
+
_add_range_filter(number_samples, "number_samples", "number_samples")
|
|
2542
|
+
|
|
2543
|
+
# Handle number_ms2 with column check
|
|
2544
|
+
if number_ms2 is not None:
|
|
2545
|
+
if "number_ms2" in available_columns:
|
|
2546
|
+
if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
|
|
2547
|
+
min_ms2, max_ms2 = number_ms2
|
|
2548
|
+
filter_conditions.append((pl.col("number_ms2") >= min_ms2) & (pl.col("number_ms2") <= max_ms2))
|
|
2549
|
+
else:
|
|
2550
|
+
filter_conditions.append(pl.col("number_ms2") >= number_ms2)
|
|
2532
2551
|
else:
|
|
2533
|
-
|
|
2534
|
-
self.logger.debug(
|
|
2535
|
-
f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2536
|
-
)
|
|
2552
|
+
warnings.append("'number_ms2' column not found in consensus_df")
|
|
2537
2553
|
|
|
2538
|
-
#
|
|
2554
|
+
# Handle consensus_uid (list, single value, or range)
|
|
2539
2555
|
if consensus_uid is not None:
|
|
2540
|
-
consensus_len_before_filter = len(consensus)
|
|
2541
2556
|
if isinstance(consensus_uid, (list, tuple)):
|
|
2542
2557
|
if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
|
|
2543
|
-
# Treat as range
|
|
2558
|
+
# Treat tuple as range
|
|
2544
2559
|
min_uid, max_uid = consensus_uid
|
|
2545
|
-
|
|
2546
|
-
(pl.col("consensus_uid") >= min_uid)
|
|
2547
|
-
& (pl.col("consensus_uid") <= max_uid),
|
|
2548
|
-
)
|
|
2560
|
+
filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
|
|
2549
2561
|
else:
|
|
2550
|
-
# Treat as list
|
|
2551
|
-
|
|
2552
|
-
pl.col("consensus_uid").is_in(consensus_uid),
|
|
2553
|
-
)
|
|
2562
|
+
# Treat as list of values
|
|
2563
|
+
filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
|
|
2554
2564
|
else:
|
|
2555
|
-
|
|
2556
|
-
self.logger.debug(
|
|
2557
|
-
f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2558
|
-
)
|
|
2565
|
+
filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
|
|
2559
2566
|
|
|
2560
|
-
#
|
|
2567
|
+
# Handle consensus_id (list or single value)
|
|
2561
2568
|
if consensus_id is not None:
|
|
2562
|
-
consensus_len_before_filter = len(consensus)
|
|
2563
2569
|
if isinstance(consensus_id, list):
|
|
2564
|
-
|
|
2565
|
-
else:
|
|
2566
|
-
consensus = consensus.filter(pl.col("consensus_id") == consensus_id)
|
|
2567
|
-
self.logger.debug(
|
|
2568
|
-
f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2569
|
-
)
|
|
2570
|
-
|
|
2571
|
-
# Filter by number of samples
|
|
2572
|
-
if number_samples is not None:
|
|
2573
|
-
consensus_len_before_filter = len(consensus)
|
|
2574
|
-
if isinstance(number_samples, tuple) and len(number_samples) == 2:
|
|
2575
|
-
min_samples, max_samples = number_samples
|
|
2576
|
-
consensus = consensus.filter(
|
|
2577
|
-
(pl.col("number_samples") >= min_samples)
|
|
2578
|
-
& (pl.col("number_samples") <= max_samples),
|
|
2579
|
-
)
|
|
2580
|
-
else:
|
|
2581
|
-
consensus = consensus.filter(pl.col("number_samples") >= number_samples)
|
|
2582
|
-
self.logger.debug(
|
|
2583
|
-
f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2584
|
-
)
|
|
2585
|
-
|
|
2586
|
-
# Filter by number of MS2 spectra
|
|
2587
|
-
if number_ms2 is not None:
|
|
2588
|
-
consensus_len_before_filter = len(consensus)
|
|
2589
|
-
if "number_ms2" in consensus.columns:
|
|
2590
|
-
if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
|
|
2591
|
-
min_ms2, max_ms2 = number_ms2
|
|
2592
|
-
consensus = consensus.filter(
|
|
2593
|
-
(pl.col("number_ms2") >= min_ms2)
|
|
2594
|
-
& (pl.col("number_ms2") <= max_ms2),
|
|
2595
|
-
)
|
|
2596
|
-
else:
|
|
2597
|
-
consensus = consensus.filter(pl.col("number_ms2") >= number_ms2)
|
|
2598
|
-
else:
|
|
2599
|
-
self.logger.warning("'number_ms2' column not found in consensus_df")
|
|
2600
|
-
self.logger.debug(
|
|
2601
|
-
f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2602
|
-
)
|
|
2603
|
-
|
|
2604
|
-
# Filter by quality
|
|
2605
|
-
if quality is not None:
|
|
2606
|
-
consensus_len_before_filter = len(consensus)
|
|
2607
|
-
if isinstance(quality, tuple) and len(quality) == 2:
|
|
2608
|
-
min_quality, max_quality = quality
|
|
2609
|
-
consensus = consensus.filter(
|
|
2610
|
-
(pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality),
|
|
2611
|
-
)
|
|
2612
|
-
else:
|
|
2613
|
-
consensus = consensus.filter(pl.col("quality") >= quality)
|
|
2614
|
-
self.logger.debug(
|
|
2615
|
-
f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2616
|
-
)
|
|
2617
|
-
|
|
2618
|
-
# Filter by baseline
|
|
2619
|
-
if bl is not None:
|
|
2620
|
-
consensus_len_before_filter = len(consensus)
|
|
2621
|
-
if "bl" in consensus.columns:
|
|
2622
|
-
if isinstance(bl, tuple) and len(bl) == 2:
|
|
2623
|
-
min_bl, max_bl = bl
|
|
2624
|
-
consensus = consensus.filter(
|
|
2625
|
-
(pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl),
|
|
2626
|
-
)
|
|
2627
|
-
else:
|
|
2628
|
-
consensus = consensus.filter(pl.col("bl") >= bl)
|
|
2570
|
+
filter_conditions.append(pl.col("consensus_id").is_in(consensus_id))
|
|
2629
2571
|
else:
|
|
2630
|
-
|
|
2631
|
-
self.logger.debug(
|
|
2632
|
-
f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2633
|
-
)
|
|
2572
|
+
filter_conditions.append(pl.col("consensus_id") == consensus_id)
|
|
2634
2573
|
|
|
2635
|
-
#
|
|
2636
|
-
if
|
|
2637
|
-
|
|
2638
|
-
|
|
2639
|
-
|
|
2640
|
-
isinstance(chrom_coherence_mean, tuple)
|
|
2641
|
-
and len(chrom_coherence_mean) == 2
|
|
2642
|
-
):
|
|
2643
|
-
min_coherence, max_coherence = chrom_coherence_mean
|
|
2644
|
-
consensus = consensus.filter(
|
|
2645
|
-
(pl.col("chrom_coherence_mean") >= min_coherence)
|
|
2646
|
-
& (pl.col("chrom_coherence_mean") <= max_coherence),
|
|
2647
|
-
)
|
|
2574
|
+
# Handle identified status filter
|
|
2575
|
+
if identified is not None:
|
|
2576
|
+
if "id_top_name" in available_columns:
|
|
2577
|
+
if identified:
|
|
2578
|
+
filter_conditions.append(pl.col("id_top_name").is_not_null())
|
|
2648
2579
|
else:
|
|
2649
|
-
|
|
2650
|
-
pl.col("chrom_coherence_mean") >= chrom_coherence_mean,
|
|
2651
|
-
)
|
|
2580
|
+
filter_conditions.append(pl.col("id_top_name").is_null())
|
|
2652
2581
|
else:
|
|
2653
|
-
|
|
2654
|
-
"'chrom_coherence_mean' column not found in consensus_df",
|
|
2655
|
-
)
|
|
2656
|
-
self.logger.debug(
|
|
2657
|
-
f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2658
|
-
)
|
|
2582
|
+
warnings.append("'id_top_name' column not found in consensus_df")
|
|
2659
2583
|
|
|
2660
|
-
#
|
|
2661
|
-
|
|
2662
|
-
|
|
2663
|
-
if "chrom_prominence_mean" in consensus.columns:
|
|
2664
|
-
if (
|
|
2665
|
-
isinstance(chrom_prominence_mean, tuple)
|
|
2666
|
-
and len(chrom_prominence_mean) == 2
|
|
2667
|
-
):
|
|
2668
|
-
min_prominence, max_prominence = chrom_prominence_mean
|
|
2669
|
-
consensus = consensus.filter(
|
|
2670
|
-
(pl.col("chrom_prominence_mean") >= min_prominence)
|
|
2671
|
-
& (pl.col("chrom_prominence_mean") <= max_prominence),
|
|
2672
|
-
)
|
|
2673
|
-
else:
|
|
2674
|
-
consensus = consensus.filter(
|
|
2675
|
-
pl.col("chrom_prominence_mean") >= chrom_prominence_mean,
|
|
2676
|
-
)
|
|
2677
|
-
else:
|
|
2678
|
-
self.logger.warning(
|
|
2679
|
-
"'chrom_prominence_mean' column not found in consensus_df",
|
|
2680
|
-
)
|
|
2681
|
-
self.logger.debug(
|
|
2682
|
-
f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2683
|
-
)
|
|
2584
|
+
# Log warnings once
|
|
2585
|
+
for warning in warnings:
|
|
2586
|
+
self.logger.warning(warning)
|
|
2684
2587
|
|
|
2685
|
-
#
|
|
2686
|
-
if
|
|
2687
|
-
|
|
2688
|
-
|
|
2689
|
-
|
|
2690
|
-
|
|
2691
|
-
|
|
2692
|
-
|
|
2693
|
-
|
|
2694
|
-
|
|
2695
|
-
|
|
2696
|
-
|
|
2697
|
-
(pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
|
|
2698
|
-
& (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled),
|
|
2699
|
-
)
|
|
2700
|
-
else:
|
|
2701
|
-
consensus = consensus.filter(
|
|
2702
|
-
pl.col("chrom_prominence_scaled_mean")
|
|
2703
|
-
>= chrom_prominence_scaled_mean,
|
|
2704
|
-
)
|
|
2705
|
-
else:
|
|
2706
|
-
self.logger.warning(
|
|
2707
|
-
"'chrom_prominence_scaled_mean' column not found in consensus_df",
|
|
2708
|
-
)
|
|
2709
|
-
self.logger.debug(
|
|
2710
|
-
f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2588
|
+
# Apply all filters at once using lazy evaluation for optimal performance
|
|
2589
|
+
if filter_conditions:
|
|
2590
|
+
# Combine all conditions efficiently using reduce
|
|
2591
|
+
from functools import reduce
|
|
2592
|
+
import operator
|
|
2593
|
+
combined_filter = reduce(operator.and_, filter_conditions)
|
|
2594
|
+
|
|
2595
|
+
consensus = (
|
|
2596
|
+
self.consensus_df
|
|
2597
|
+
.lazy()
|
|
2598
|
+
.filter(combined_filter)
|
|
2599
|
+
.collect(streaming=True)
|
|
2711
2600
|
)
|
|
2601
|
+
else:
|
|
2602
|
+
consensus = self.consensus_df.clone()
|
|
2712
2603
|
|
|
2713
|
-
|
|
2714
|
-
if chrom_height_scaled_mean is not None:
|
|
2715
|
-
consensus_len_before_filter = len(consensus)
|
|
2716
|
-
if "chrom_height_scaled_mean" in consensus.columns:
|
|
2717
|
-
if (
|
|
2718
|
-
isinstance(chrom_height_scaled_mean, tuple)
|
|
2719
|
-
and len(chrom_height_scaled_mean) == 2
|
|
2720
|
-
):
|
|
2721
|
-
min_height_scaled, max_height_scaled = chrom_height_scaled_mean
|
|
2722
|
-
consensus = consensus.filter(
|
|
2723
|
-
(pl.col("chrom_height_scaled_mean") >= min_height_scaled)
|
|
2724
|
-
& (pl.col("chrom_height_scaled_mean") <= max_height_scaled),
|
|
2725
|
-
)
|
|
2726
|
-
else:
|
|
2727
|
-
consensus = consensus.filter(
|
|
2728
|
-
pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean,
|
|
2729
|
-
)
|
|
2730
|
-
else:
|
|
2731
|
-
self.logger.warning(
|
|
2732
|
-
"'chrom_height_scaled_mean' column not found in consensus_df",
|
|
2733
|
-
)
|
|
2734
|
-
self.logger.debug(
|
|
2735
|
-
f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2736
|
-
)
|
|
2604
|
+
final_count = len(consensus)
|
|
2737
2605
|
|
|
2738
|
-
#
|
|
2739
|
-
if
|
|
2740
|
-
|
|
2741
|
-
|
|
2742
|
-
if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
|
|
2743
|
-
min_rt_delta, max_rt_delta = rt_delta_mean
|
|
2744
|
-
consensus = consensus.filter(
|
|
2745
|
-
(pl.col("rt_delta_mean") >= min_rt_delta)
|
|
2746
|
-
& (pl.col("rt_delta_mean") <= max_rt_delta),
|
|
2747
|
-
)
|
|
2748
|
-
else:
|
|
2749
|
-
consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
|
|
2750
|
-
else:
|
|
2751
|
-
self.logger.warning("'rt_delta_mean' column not found in consensus_df")
|
|
2752
|
-
self.logger.debug(
|
|
2753
|
-
f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
|
|
2754
|
-
)
|
|
2755
|
-
|
|
2756
|
-
if len(consensus) == 0:
|
|
2757
|
-
self.logger.warning(
|
|
2758
|
-
"No consensus features remaining after applying selection criteria.",
|
|
2759
|
-
)
|
|
2760
|
-
else:
|
|
2761
|
-
self.logger.info(
|
|
2762
|
-
f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})",
|
|
2763
|
-
)
|
|
2606
|
+
# Early return if no results
|
|
2607
|
+
if final_count == 0:
|
|
2608
|
+
self.logger.warning("No consensus features remaining after applying selection criteria.")
|
|
2609
|
+
return pl.DataFrame()
|
|
2764
2610
|
|
|
2765
2611
|
# Sort the results if sortby is specified
|
|
2766
2612
|
if sortby is not None:
|
|
2767
2613
|
if isinstance(sortby, str):
|
|
2768
|
-
# Single column
|
|
2769
2614
|
if sortby in consensus.columns:
|
|
2770
2615
|
consensus = consensus.sort(sortby, descending=descending)
|
|
2771
2616
|
else:
|
|
2772
|
-
self.logger.warning(
|
|
2773
|
-
f"Sort column '{sortby}' not found in consensus DataFrame",
|
|
2774
|
-
)
|
|
2617
|
+
self.logger.warning(f"Sort column '{sortby}' not found in consensus DataFrame")
|
|
2775
2618
|
elif isinstance(sortby, (list, tuple)):
|
|
2776
|
-
# Multiple columns
|
|
2777
2619
|
valid_columns = [col for col in sortby if col in consensus.columns]
|
|
2778
2620
|
invalid_columns = [col for col in sortby if col not in consensus.columns]
|
|
2779
|
-
|
|
2621
|
+
|
|
2780
2622
|
if invalid_columns:
|
|
2781
|
-
self.logger.warning(
|
|
2782
|
-
|
|
2783
|
-
)
|
|
2784
|
-
|
|
2623
|
+
self.logger.warning(f"Sort columns not found in consensus DataFrame: {invalid_columns}")
|
|
2624
|
+
|
|
2785
2625
|
if valid_columns:
|
|
2786
2626
|
consensus = consensus.sort(valid_columns, descending=descending)
|
|
2787
2627
|
else:
|
|
2788
|
-
self.logger.warning(
|
|
2789
|
-
|
|
2790
|
-
|
|
2628
|
+
self.logger.warning(f"Invalid sortby parameter type: {type(sortby)}. Expected str, list, or tuple.")
|
|
2629
|
+
|
|
2630
|
+
# Log performance metrics
|
|
2631
|
+
elapsed_time = time.perf_counter() - start_time
|
|
2632
|
+
removed_count = initial_count - final_count
|
|
2633
|
+
|
|
2634
|
+
self.logger.info(f"Selected consensus features: {final_count:,} (removed: {removed_count:,}) in {elapsed_time:.4f}s")
|
|
2791
2635
|
|
|
2792
2636
|
return consensus
|
|
2793
2637
|
|
|
2794
2638
|
|
|
2795
2639
|
def consensus_filter(self, consensus):
|
|
2796
2640
|
"""
|
|
2797
|
-
Filter consensus_df by
|
|
2798
|
-
This
|
|
2641
|
+
Filter consensus_df by keeping only consensus features that match the given criteria.
|
|
2642
|
+
This keeps only the specified consensus features and removes all others.
|
|
2643
|
+
Also updates related entries in consensus_mapping_df, features_df, and consensus_ms2.
|
|
2799
2644
|
|
|
2800
2645
|
Parameters:
|
|
2801
|
-
consensus: Consensus features to
|
|
2646
|
+
consensus: Consensus features to keep. Can be:
|
|
2802
2647
|
- polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
|
|
2803
|
-
- list: List of consensus_uids to
|
|
2804
|
-
- int: Single consensus_uid to
|
|
2648
|
+
- list: List of consensus_uids to keep
|
|
2649
|
+
- int: Single consensus_uid to keep
|
|
2805
2650
|
|
|
2806
2651
|
Returns:
|
|
2807
2652
|
None (modifies self.consensus_df and related DataFrames in place)
|
|
@@ -2812,71 +2657,73 @@ def consensus_filter(self, consensus):
|
|
|
2812
2657
|
|
|
2813
2658
|
initial_consensus_count = len(self.consensus_df)
|
|
2814
2659
|
|
|
2815
|
-
# Determine consensus_uids to
|
|
2660
|
+
# Determine consensus_uids to keep
|
|
2816
2661
|
if isinstance(consensus, pl.DataFrame):
|
|
2817
2662
|
if "consensus_uid" not in consensus.columns:
|
|
2818
2663
|
self.logger.error("consensus DataFrame must contain 'consensus_uid' column")
|
|
2819
2664
|
return
|
|
2820
|
-
|
|
2665
|
+
consensus_uids_to_keep = consensus["consensus_uid"].to_list()
|
|
2821
2666
|
elif isinstance(consensus, list):
|
|
2822
|
-
|
|
2667
|
+
consensus_uids_to_keep = consensus
|
|
2823
2668
|
elif isinstance(consensus, int):
|
|
2824
|
-
|
|
2669
|
+
consensus_uids_to_keep = [consensus]
|
|
2825
2670
|
else:
|
|
2826
2671
|
self.logger.error("consensus parameter must be a DataFrame, list, or int")
|
|
2827
2672
|
return
|
|
2828
2673
|
|
|
2829
|
-
if not
|
|
2674
|
+
if not consensus_uids_to_keep:
|
|
2830
2675
|
self.logger.warning("No consensus UIDs provided for filtering.")
|
|
2831
2676
|
return
|
|
2832
2677
|
|
|
2833
|
-
# Get feature_uids that need to be
|
|
2834
|
-
|
|
2678
|
+
# Get feature_uids that need to be kept in features_df
|
|
2679
|
+
feature_uids_to_keep = []
|
|
2835
2680
|
if (
|
|
2836
2681
|
self.consensus_mapping_df is not None
|
|
2837
2682
|
and not self.consensus_mapping_df.is_empty()
|
|
2838
2683
|
):
|
|
2839
|
-
|
|
2840
|
-
pl.col("consensus_uid").is_in(
|
|
2684
|
+
feature_uids_to_keep = self.consensus_mapping_df.filter(
|
|
2685
|
+
pl.col("consensus_uid").is_in(consensus_uids_to_keep),
|
|
2841
2686
|
)["feature_uid"].to_list()
|
|
2842
2687
|
|
|
2843
|
-
#
|
|
2688
|
+
# Keep only specified consensus features in consensus_df
|
|
2844
2689
|
self.consensus_df = self.consensus_df.filter(
|
|
2845
|
-
|
|
2690
|
+
pl.col("consensus_uid").is_in(consensus_uids_to_keep),
|
|
2846
2691
|
)
|
|
2847
2692
|
|
|
2848
|
-
#
|
|
2693
|
+
# Keep only relevant entries in consensus_mapping_df
|
|
2849
2694
|
if (
|
|
2850
2695
|
self.consensus_mapping_df is not None
|
|
2851
2696
|
and not self.consensus_mapping_df.is_empty()
|
|
2852
2697
|
):
|
|
2853
2698
|
initial_mapping_count = len(self.consensus_mapping_df)
|
|
2854
2699
|
self.consensus_mapping_df = self.consensus_mapping_df.filter(
|
|
2855
|
-
|
|
2700
|
+
pl.col("consensus_uid").is_in(consensus_uids_to_keep),
|
|
2856
2701
|
)
|
|
2857
|
-
|
|
2702
|
+
remaining_mapping_count = len(self.consensus_mapping_df)
|
|
2703
|
+
removed_mapping_count = initial_mapping_count - remaining_mapping_count
|
|
2858
2704
|
if removed_mapping_count > 0:
|
|
2859
2705
|
self.logger.debug(
|
|
2860
2706
|
f"Removed {removed_mapping_count} entries from consensus_mapping_df",
|
|
2861
2707
|
)
|
|
2862
2708
|
|
|
2863
|
-
#
|
|
2709
|
+
# Keep only corresponding features in features_df
|
|
2864
2710
|
if (
|
|
2865
|
-
|
|
2711
|
+
feature_uids_to_keep
|
|
2866
2712
|
and self.features_df is not None
|
|
2867
2713
|
and not self.features_df.is_empty()
|
|
2868
2714
|
):
|
|
2869
2715
|
initial_features_count = len(self.features_df)
|
|
2870
2716
|
self.features_df = self.features_df.filter(
|
|
2871
|
-
|
|
2717
|
+
pl.col("feature_uid").is_in(feature_uids_to_keep),
|
|
2872
2718
|
)
|
|
2873
|
-
|
|
2719
|
+
remaining_features_count = len(self.features_df)
|
|
2720
|
+
removed_features_count = initial_features_count - remaining_features_count
|
|
2874
2721
|
if removed_features_count > 0:
|
|
2875
2722
|
self.logger.debug(
|
|
2876
2723
|
f"Removed {removed_features_count} entries from features_df",
|
|
2877
2724
|
)
|
|
2878
2725
|
|
|
2879
|
-
#
|
|
2726
|
+
# Keep only relevant entries in consensus_ms2 if it exists
|
|
2880
2727
|
if (
|
|
2881
2728
|
hasattr(self, "consensus_ms2")
|
|
2882
2729
|
and self.consensus_ms2 is not None
|
|
@@ -2884,22 +2731,25 @@ def consensus_filter(self, consensus):
|
|
|
2884
2731
|
):
|
|
2885
2732
|
initial_ms2_count = len(self.consensus_ms2)
|
|
2886
2733
|
self.consensus_ms2 = self.consensus_ms2.filter(
|
|
2887
|
-
|
|
2734
|
+
pl.col("consensus_uid").is_in(consensus_uids_to_keep),
|
|
2888
2735
|
)
|
|
2889
|
-
|
|
2736
|
+
remaining_ms2_count = len(self.consensus_ms2)
|
|
2737
|
+
removed_ms2_count = initial_ms2_count - remaining_ms2_count
|
|
2890
2738
|
if removed_ms2_count > 0:
|
|
2891
2739
|
self.logger.debug(f"Removed {removed_ms2_count} entries from consensus_ms2")
|
|
2892
2740
|
|
|
2893
|
-
|
|
2741
|
+
remaining_consensus_count = len(self.consensus_df)
|
|
2742
|
+
removed_consensus_count = initial_consensus_count - remaining_consensus_count
|
|
2894
2743
|
self.logger.info(
|
|
2895
|
-
f"Filtered
|
|
2744
|
+
f"Filtered consensus features: kept {remaining_consensus_count}, removed {removed_consensus_count}",
|
|
2896
2745
|
)
|
|
2897
2746
|
|
|
2898
2747
|
|
|
2899
2748
|
def consensus_delete(self, consensus):
|
|
2900
2749
|
"""
|
|
2901
2750
|
Delete consensus features from consensus_df based on consensus identifiers.
|
|
2902
|
-
This
|
|
2751
|
+
This removes the specified consensus features and keeps all others (opposite of consensus_filter).
|
|
2752
|
+
Also removes related entries from consensus_mapping_df, features_df, and consensus_ms2.
|
|
2903
2753
|
|
|
2904
2754
|
Parameters:
|
|
2905
2755
|
consensus: Consensus features to delete. Can be:
|
|
@@ -2910,7 +2760,110 @@ def consensus_delete(self, consensus):
|
|
|
2910
2760
|
Returns:
|
|
2911
2761
|
None (modifies self.consensus_df and related DataFrames in place)
|
|
2912
2762
|
"""
|
|
2913
|
-
self.
|
|
2763
|
+
if self.consensus_df is None or self.consensus_df.is_empty():
|
|
2764
|
+
self.logger.warning("No consensus features found in study.")
|
|
2765
|
+
return
|
|
2766
|
+
|
|
2767
|
+
# Early return if no consensus provided
|
|
2768
|
+
if consensus is None:
|
|
2769
|
+
self.logger.warning("No consensus provided for deletion.")
|
|
2770
|
+
return
|
|
2771
|
+
|
|
2772
|
+
initial_consensus_count = len(self.consensus_df)
|
|
2773
|
+
|
|
2774
|
+
# Determine consensus_uids to remove
|
|
2775
|
+
if isinstance(consensus, pl.DataFrame):
|
|
2776
|
+
if "consensus_uid" not in consensus.columns:
|
|
2777
|
+
self.logger.error("consensus DataFrame must contain 'consensus_uid' column")
|
|
2778
|
+
return
|
|
2779
|
+
consensus_uids_to_remove = consensus["consensus_uid"].to_list()
|
|
2780
|
+
elif isinstance(consensus, list):
|
|
2781
|
+
consensus_uids_to_remove = consensus
|
|
2782
|
+
elif isinstance(consensus, int):
|
|
2783
|
+
consensus_uids_to_remove = [consensus]
|
|
2784
|
+
else:
|
|
2785
|
+
self.logger.error("consensus parameter must be a DataFrame, list, or int")
|
|
2786
|
+
return
|
|
2787
|
+
|
|
2788
|
+
if not consensus_uids_to_remove:
|
|
2789
|
+
self.logger.warning("No consensus UIDs provided for deletion.")
|
|
2790
|
+
return
|
|
2791
|
+
|
|
2792
|
+
# Convert to set for faster lookup if list is large
|
|
2793
|
+
if len(consensus_uids_to_remove) > 100:
|
|
2794
|
+
consensus_uids_set = set(consensus_uids_to_remove)
|
|
2795
|
+
# Use the set for filtering if it's significantly smaller
|
|
2796
|
+
if len(consensus_uids_set) < len(consensus_uids_to_remove) * 0.8:
|
|
2797
|
+
consensus_uids_to_remove = list(consensus_uids_set)
|
|
2798
|
+
|
|
2799
|
+
# Get feature_uids that need to be removed from features_df
|
|
2800
|
+
feature_uids_to_remove = []
|
|
2801
|
+
if (
|
|
2802
|
+
self.consensus_mapping_df is not None
|
|
2803
|
+
and not self.consensus_mapping_df.is_empty()
|
|
2804
|
+
):
|
|
2805
|
+
feature_uids_to_remove = self.consensus_mapping_df.filter(
|
|
2806
|
+
pl.col("consensus_uid").is_in(consensus_uids_to_remove),
|
|
2807
|
+
)["feature_uid"].to_list()
|
|
2808
|
+
|
|
2809
|
+
# Remove consensus features from consensus_df
|
|
2810
|
+
self.consensus_df = self.consensus_df.filter(
|
|
2811
|
+
~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
|
|
2812
|
+
)
|
|
2813
|
+
|
|
2814
|
+
# Remove from consensus_mapping_df
|
|
2815
|
+
mapping_removed_count = 0
|
|
2816
|
+
if (
|
|
2817
|
+
self.consensus_mapping_df is not None
|
|
2818
|
+
and not self.consensus_mapping_df.is_empty()
|
|
2819
|
+
):
|
|
2820
|
+
initial_mapping_count = len(self.consensus_mapping_df)
|
|
2821
|
+
self.consensus_mapping_df = self.consensus_mapping_df.filter(
|
|
2822
|
+
~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
|
|
2823
|
+
)
|
|
2824
|
+
mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
|
|
2825
|
+
|
|
2826
|
+
# Remove corresponding features from features_df
|
|
2827
|
+
features_removed_count = 0
|
|
2828
|
+
if (
|
|
2829
|
+
feature_uids_to_remove
|
|
2830
|
+
and self.features_df is not None
|
|
2831
|
+
and not self.features_df.is_empty()
|
|
2832
|
+
):
|
|
2833
|
+
initial_features_count = len(self.features_df)
|
|
2834
|
+
self.features_df = self.features_df.filter(
|
|
2835
|
+
~pl.col("feature_uid").is_in(feature_uids_to_remove),
|
|
2836
|
+
)
|
|
2837
|
+
features_removed_count = initial_features_count - len(self.features_df)
|
|
2838
|
+
|
|
2839
|
+
# Remove from consensus_ms2 if it exists
|
|
2840
|
+
ms2_removed_count = 0
|
|
2841
|
+
if (
|
|
2842
|
+
hasattr(self, "consensus_ms2")
|
|
2843
|
+
and self.consensus_ms2 is not None
|
|
2844
|
+
and not self.consensus_ms2.is_empty()
|
|
2845
|
+
):
|
|
2846
|
+
initial_ms2_count = len(self.consensus_ms2)
|
|
2847
|
+
self.consensus_ms2 = self.consensus_ms2.filter(
|
|
2848
|
+
~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
|
|
2849
|
+
)
|
|
2850
|
+
ms2_removed_count = initial_ms2_count - len(self.consensus_ms2)
|
|
2851
|
+
|
|
2852
|
+
# Calculate results and log efficiently
|
|
2853
|
+
final_consensus_count = len(self.consensus_df)
|
|
2854
|
+
consensus_removed_count = initial_consensus_count - final_consensus_count
|
|
2855
|
+
|
|
2856
|
+
# Single comprehensive log message
|
|
2857
|
+
log_parts = [f"Deleted {consensus_removed_count} consensus features"]
|
|
2858
|
+
if mapping_removed_count > 0:
|
|
2859
|
+
log_parts.append(f"{mapping_removed_count} consensus mappings")
|
|
2860
|
+
if features_removed_count > 0:
|
|
2861
|
+
log_parts.append(f"{features_removed_count} features")
|
|
2862
|
+
if ms2_removed_count > 0:
|
|
2863
|
+
log_parts.append(f"{ms2_removed_count} MS2 spectra")
|
|
2864
|
+
|
|
2865
|
+
log_message = ". ".join(log_parts) + f". Remaining consensus: {final_consensus_count}"
|
|
2866
|
+
self.logger.info(log_message)
|
|
2914
2867
|
|
|
2915
2868
|
|
|
2916
2869
|
# =====================================================================================
|