masster-0.4.0-py3-none-any.whl → masster-0.4.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +3 -9
- masster/data/libs/README.md +1 -1
- masster/data/libs/ccm.csv +120 -120
- masster/data/libs/ccm.py +116 -62
- masster/data/libs/central_carbon_README.md +1 -1
- masster/data/libs/urine.py +161 -65
- masster/data/libs/urine_metabolites.csv +4693 -4693
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
- masster/logger.py +43 -78
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +264 -338
- masster/sample/defaults/find_adducts_def.py +8 -21
- masster/sample/defaults/find_features_def.py +1 -6
- masster/sample/defaults/get_spectrum_def.py +1 -5
- masster/sample/defaults/sample_def.py +1 -5
- masster/sample/h5.py +282 -561
- masster/sample/helpers.py +75 -131
- masster/sample/lib.py +17 -42
- masster/sample/load.py +17 -31
- masster/sample/parameters.py +2 -6
- masster/sample/plot.py +27 -88
- masster/sample/processing.py +87 -117
- masster/sample/quant.py +51 -57
- masster/sample/sample.py +90 -103
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +12 -35
- masster/sample/sciex.py +19 -66
- masster/spectrum.py +20 -58
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +1 -5
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/fill_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/integrate_def.py +1 -5
- masster/study/defaults/study_def.py +25 -58
- masster/study/export.py +207 -233
- masster/study/h5.py +136 -470
- masster/study/helpers.py +202 -495
- masster/study/helpers_optimized.py +13 -40
- masster/study/id.py +110 -213
- masster/study/load.py +143 -230
- masster/study/plot.py +257 -518
- masster/study/processing.py +257 -469
- masster/study/save.py +5 -15
- masster/study/study.py +276 -379
- masster/study/study5_schema.json +96 -96
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
- masster-0.4.1.dist-info/RECORD +67 -0
- masster-0.4.0.dist-info/RECORD +0 -67
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/study/processing.py
CHANGED
```diff
@@ -8,7 +8,7 @@ import pyopenms as oms
 
 from tqdm import tqdm
 
-from
+from masster.study.defaults import (
     align_defaults,
     find_ms2_defaults,
     integrate_defaults,
@@ -115,8 +115,7 @@ def align(self, **kwargs):
     # Pre-build sample_uid lookup for faster access
     self.logger.debug("Build sample_uid lookup for fast access...")
     sample_uid_lookup = {
-        idx: row_dict["sample_uid"]
-        for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
+        idx: row_dict["sample_uid"] for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
     }
 
     # Build the main lookup using feature_uid (not feature_id)
```
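Most of the changes in this file are mechanical reflows (multi-line expressions collapsed onto single lines), but a few recurring patterns are worth a note. The hunk above keeps the idea of pre-building a plain dict from the Polars samples table so later accesses are O(1) dict hits instead of per-row DataFrame filters. A minimal, self-contained sketch of that pattern (the `sample_uid` column name comes from the diff; the data values are invented):

```python
import polars as pl

# Invented sample data; "sample_uid" is the real column name used in the diff.
samples_df = pl.DataFrame({"sample_uid": [101, 102, 103]})

# Build the lookup once (O(n)); every later access is an O(1) dict hit
# instead of a DataFrame filter inside a hot loop.
sample_uid_lookup = {
    idx: row["sample_uid"]
    for idx, row in enumerate(samples_df.iter_rows(named=True))
}
assert sample_uid_lookup[0] == 101
```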
```diff
@@ -216,7 +215,7 @@ def align(self, **kwargs):
     self.features_df = self.features_df.with_columns(*new_cols)
 
     self.logger.debug("Alignment completed successfully.")
-
+
     # Reset consensus data structures after alignment since RT changes invalidate consensus
     consensus_reset_count = 0
     if not self.consensus_df.is_empty():
@@ -228,7 +227,7 @@ def align(self, **kwargs):
     if not self.consensus_ms2.is_empty():
         self.consensus_ms2 = pl.DataFrame()
         consensus_reset_count += 1
-
+
     # Remove merge and find_ms2 parameters from history since they need to be re-run
     keys_to_remove = ["merge", "find_ms2"]
     history_removed_count = 0
@@ -238,11 +237,9 @@ def align(self, **kwargs):
             del self.history[key]
             history_removed_count += 1
             self.logger.debug(f"Removed {key} from history")
-
+
     if consensus_reset_count > 0 or history_removed_count > 0:
-        self.logger.info(
-            f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed",
-        )
+        self.logger.info(f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed")
 
     if params.get("save_features"):
         self.save_samples()
@@ -293,10 +290,7 @@ def merge(self, **kwargs):
     algorithm = params.get("algorithm")
     min_samples = params.get("min_samples")
     link_ms2 = params.get("link_ms2")
-    mz_tol = kwargs.get(
-        "mz_tol",
-        0.01,
-    )  # Default values for parameters not in defaults class
+    mz_tol = kwargs.get("mz_tol", 0.01)  # Default values for parameters not in defaults class
     rt_tol = kwargs.get("rt_tol", 1.0)
 
     if len(self.samples_df) > 200 and algorithm == "qt":
@@ -405,10 +399,7 @@ def merge(self, **kwargs):
     consensus_map.setUniqueIds()
 
     # create a dict to map uid to feature_uid using self.features_df
-    feature_uid_map = {
-        row["feature_id"]: row["feature_uid"]
-        for row in self.features_df.iter_rows(named=True)
-    }
+    feature_uid_map = {row["feature_id"]: row["feature_uid"] for row in self.features_df.iter_rows(named=True)}
     imax = consensus_map.size()
 
     # Pre-build fast lookup tables for features_df data
@@ -435,9 +426,7 @@ def merge(self, **kwargs):
 
     for row in self.features_df.iter_rows(named=True):
         feature_uid = row["feature_uid"]
-        features_lookup[feature_uid] = {
-            col: row[col] for col in feature_columns if col in self.features_df.columns
-        }
+        features_lookup[feature_uid] = {col: row[col] for col in feature_columns if col in self.features_df.columns}
 
     # create a list to store the consensus mapping
     consensus_mapping = []
@@ -464,13 +453,11 @@ def merge(self, **kwargs):
                 # this is a feature that was removed but is still in the feature maps
                 continue
             fuid = feature_uid_map[fuid]
-            consensus_mapping.append(
-                {
-                    "consensus_uid": i,
-                    "sample_uid": f.getMapIndex() + 1,
-                    "feature_uid": fuid,
-                },
-            )
+            consensus_mapping.append({
+                "consensus_uid": i,
+                "sample_uid": f.getMapIndex() + 1,
+                "feature_uid": fuid,
+            })
             uids.append(fuid)
 
         # Get feature data from lookup instead of DataFrame filtering
@@ -484,99 +471,43 @@ def merge(self, **kwargs):
 
         # Compute statistics using vectorized operations on collected data
         # Convert to numpy arrays for faster computation
-        rt_values = np.array(
-            [fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None],
-        )
-        mz_values = np.array(
-            [fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
-        )
-        rt_start_values = np.array(
-            [
-                fd.get("rt_start", 0)
-                for fd in feature_data_list
-                if fd.get("rt_start") is not None
-            ],
-        )
-        rt_end_values = np.array(
-            [
-                fd.get("rt_end", 0)
-                for fd in feature_data_list
-                if fd.get("rt_end") is not None
-            ],
-        )
-        rt_delta_values = np.array(
-            [
-                fd.get("rt_delta", 0)
-                for fd in feature_data_list
-                if fd.get("rt_delta") is not None
-            ],
-        )
-        mz_start_values = np.array(
-            [
-                fd.get("mz_start", 0)
-                for fd in feature_data_list
-                if fd.get("mz_start") is not None
-            ],
-        )
-        mz_end_values = np.array(
-            [
-                fd.get("mz_end", 0)
-                for fd in feature_data_list
-                if fd.get("mz_end") is not None
-            ],
-        )
-        inty_values = np.array(
-            [
-                fd.get("inty", 0)
-                for fd in feature_data_list
-                if fd.get("inty") is not None
-            ],
-        )
-        coherence_values = np.array(
-            [
-                fd.get("chrom_coherence", 0)
-                for fd in feature_data_list
-                if fd.get("chrom_coherence") is not None
-            ],
-        )
-        prominence_values = np.array(
-            [
-                fd.get("chrom_prominence", 0)
-                for fd in feature_data_list
-                if fd.get("chrom_prominence") is not None
-            ],
-        )
-        prominence_scaled_values = np.array(
-            [
-                fd.get("chrom_prominence_scaled", 0)
-                for fd in feature_data_list
-                if fd.get("chrom_prominence_scaled") is not None
-            ],
-        )
-        height_scaled_values = np.array(
-            [
-                fd.get("chrom_height_scaled", 0)
-                for fd in feature_data_list
-                if fd.get("chrom_height_scaled") is not None
-            ],
-        )
-        iso_values = np.array(
-            [fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
-        )
-        charge_values = np.array(
-            [
-                fd.get("charge", 0)
-                for fd in feature_data_list
-                if fd.get("charge") is not None
-            ],
-        )
+        rt_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
+        mz_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
+        rt_start_values = np.array([
+            fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None
+        ])
+        rt_end_values = np.array([fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None])
+        rt_delta_values = np.array([
+            fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None
+        ])
+        mz_start_values = np.array([
+            fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None
+        ])
+        mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
+        inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
+        coherence_values = np.array([
+            fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None
+        ])
+        prominence_values = np.array([
+            fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None
+        ])
+        prominence_scaled_values = np.array([
+            fd.get("chrom_prominence_scaled", 0)
+            for fd in feature_data_list
+            if fd.get("chrom_prominence_scaled") is not None
+        ])
+        height_scaled_values = np.array([
+            fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None
+        ])
+        iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
+        charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])
 
         # adduct_values
         # Collect all adducts from feature_data_list to create consensus adduct information
         # Only consider adducts that are in study._get_adducts() plus items with '?'
         all_adducts = []
         adduct_masses = {}
-
+
         # Get valid adducts from study._get_adducts()
         valid_adducts = set()
        try:
```
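The statistics block above follows one recipe per column: drop `None` entries, build a NumPy array, then take a rounded mean (or min/max) with a `0.0` fallback for empty input. A runnable sketch of a single column with made-up rows:

```python
import numpy as np

# Made-up per-sample feature rows; None marks a missing value.
feature_data_list = [{"rt": 60.1}, {"rt": 60.3}, {"rt": None}]

rt_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
rt_mean = round(float(np.mean(rt_values)), 3) if len(rt_values) > 0 else 0.0
print(rt_mean)  # 60.2 -- the None row is excluded before the mean
```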
```diff
@@ -585,7 +516,7 @@ def merge(self, **kwargs):
                 valid_adducts.update(study_adducts_df["name"].to_list())
         except Exception as e:
             self.logger.warning(f"Could not retrieve study adducts: {e}")
-
+
         # Always allow '?' adducts
         valid_adducts.add("?")
 
@@ -596,7 +527,7 @@ def merge(self, **kwargs):
 
             if adduct is not None:
                 # Only include adducts that are valid (from study._get_adducts() or contain '?')
-                if adduct in valid_adducts or
+                if adduct in valid_adducts or '?' in adduct:
                     all_adducts.append(adduct)
                     if adduct_mass is not None:
                         adduct_masses[adduct] = adduct_mass
@@ -604,37 +535,33 @@ def merge(self, **kwargs):
         # Calculate adduct_values for the consensus feature
         adduct_values = []
         if all_adducts:
-            adduct_counts = {
-                adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
-            }
+            adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
             total_count = sum(adduct_counts.values())
             for adduct, count in adduct_counts.items():
                 percentage = (count / total_count) * 100 if total_count > 0 else 0
                 mass = adduct_masses.get(adduct, None)
                 # Store as list with [name, num, %] format for the adducts column
-                adduct_values.append(
-                    [
-                        str(adduct),
-                        int(count),
-                        float(round(percentage, 2)),
-                    ],
-                )
+                adduct_values.append([
+                    str(adduct),
+                    int(count),
+                    float(round(percentage, 2))
+                ])
 
         # Sort adduct_values by count in descending order
         adduct_values.sort(key=lambda x: x[1], reverse=True)  # Sort by count (index 1)
         # Store adduct_values for use in metadata
         consensus_adduct_values = adduct_values
-
+
         # Extract top adduct information for new columns
         adduct_top = None
         adduct_charge_top = None
         adduct_mass_neutral_top = None
         adduct_mass_shift_top = None
-
+
         if consensus_adduct_values:
             top_adduct_name = consensus_adduct_values[0][0]  # Get top adduct name
             adduct_top = top_adduct_name
-
+
             # Parse adduct information to extract charge and mass shift
             # Handle "?" as "H" and parse common adduct formats
             if top_adduct_name == "?" or top_adduct_name == "[M+?]+":
```
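The `[name, num, %]` summary above is easy to make concrete. A standalone sketch with invented observations for one consensus feature:

```python
# Invented adduct observations for one consensus feature:
all_adducts = ["[M+H]+", "[M+H]+", "[M+Na]+"]

adduct_counts = {a: all_adducts.count(a) for a in set(all_adducts)}
total = sum(adduct_counts.values())
adduct_values = [[str(a), int(n), float(round(n / total * 100, 2))] for a, n in adduct_counts.items()]
adduct_values.sort(key=lambda x: x[1], reverse=True)  # sort by count, descending
print(adduct_values)  # [['[M+H]+', 2, 66.67], ['[M+Na]+', 1, 33.33]]
```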
```diff
@@ -650,37 +577,33 @@ def merge(self, **kwargs):
                 study_adducts_df = self._get_adducts()
                 if not study_adducts_df.is_empty():
                     # Look for exact match in study adducts
-                    matching_adduct = study_adducts_df.filter(
-                        pl.col("name") == top_adduct_name,
-                    )
+                    matching_adduct = study_adducts_df.filter(pl.col("name") == top_adduct_name)
                     if not matching_adduct.is_empty():
                         adduct_row = matching_adduct.row(0, named=True)
                         adduct_charge_top = adduct_row["charge"]
                         adduct_mass_shift_top = adduct_row["mass_shift"]
                         adduct_found = True
             except Exception as e:
-                self.logger.warning(
-                    f"Could not lookup adduct in study adducts: {e}",
-                )
-
+                self.logger.warning(f"Could not lookup adduct in study adducts: {e}")
+
             if not adduct_found:
                 # Fallback to regex parsing
                 import re
-
+
                 # Pattern for adducts like [M+H]+, [M-H]-, [M+Na]+, etc.
-                pattern = r
+                pattern = r'\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])'
                 match = re.match(pattern, top_adduct_name)
-
+
                 if match:
                     sign = match.group(1)
                     element = match.group(2)
                     multiplier_str = match.group(3)
                     charge_sign = match.group(4)
-
+
                     multiplier = int(multiplier_str) if multiplier_str else 1
                     charge = multiplier if charge_sign == "+" else -multiplier
                     adduct_charge_top = charge
-
+
                     # Calculate mass shift based on element
                     element_masses = {
                         "H": 1.007825,
```
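How the fallback regex decomposes an adduct name is worth one worked example. Here it is applied to `"[M+Na]+"` (an invented input, not taken from the diff itself); the next hunk then turns `sign`/`element`/`multiplier` into a signed mass shift via the `element_masses` table:

```python
import re

# The fallback pattern from the hunk above:
pattern = r'\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])'
m = re.match(pattern, "[M+Na]+")
sign, element, multiplier_str, charge_sign = m.groups()  # ('+', 'Na', '', '+')

multiplier = int(multiplier_str) if multiplier_str else 1  # empty charge digits -> 1
charge = multiplier if charge_sign == "+" else -multiplier
assert (sign, element, charge) == ("+", "Na", 1)
```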
```diff
@@ -694,16 +617,9 @@ def merge(self, **kwargs):
                         "CH3COO": 59.013851,
                         "H2O": 18.010565,
                     }
-
-                    base_mass = element_masses.get(
-                        element,
-                        1.007825,
-                    )  # Default to H if unknown
-                    mass_shift = (
-                        base_mass * multiplier
-                        if sign == "+"
-                        else -base_mass * multiplier
-                    )
+
+                    base_mass = element_masses.get(element, 1.007825)  # Default to H if unknown
+                    mass_shift = base_mass * multiplier if sign == "+" else -base_mass * multiplier
                     adduct_mass_shift_top = mass_shift
                 else:
                     # Default fallback
@@ -711,8 +627,8 @@ def merge(self, **kwargs):
                     adduct_mass_shift_top = 1.007825
         else:
             # No valid adducts found - assign default based on study polarity
-            study_polarity = getattr(self,
-            if study_polarity in [
+            study_polarity = getattr(self, 'polarity', 'positive')
+            if study_polarity in ['negative', 'neg']:
                 # Negative mode default
                 adduct_top = "[M-?]1-"
                 adduct_charge_top = -1
@@ -722,18 +638,14 @@ def merge(self, **kwargs):
                 adduct_top = "[M+?]1+"
                 adduct_charge_top = 1
                 adduct_mass_shift_top = 1.007825  # H mass (gain of proton)
-
+
                 # Create a single default adduct entry in the adducts list for consistency
                 consensus_adduct_values = [[adduct_top, 1, 100.0]]
-
+
         # Calculate neutral mass from consensus mz (for both cases)
-        consensus_mz = (
-            round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
-        )
+        consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
         if adduct_charge_top and adduct_mass_shift_top is not None:
-            adduct_mass_neutral_top = (
-                consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
-            )
+            adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
 
         # Calculate number of MS2 spectra
         ms2_count = 0
```
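The neutral-mass line above is just the consensus m/z scaled by |charge| minus the adduct mass shift. A worked instance with invented but realistic numbers (a singly protonated small molecule, using the H shift of 1.007825 Da from the table above):

```python
# Invented numbers for an assumed [M+H]+ feature:
consensus_mz = 181.0707            # mean consensus m/z
adduct_charge_top = 1
adduct_mass_shift_top = 1.007825   # H, the same value used in element_masses above

neutral = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
print(round(neutral, 6))           # 180.062875
```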
```diff
@@ -742,95 +654,48 @@ def merge(self, **kwargs):
             if ms2_scans is not None:
                 ms2_count += len(ms2_scans)
 
-        metadata_list.append(
-            {
-                "consensus_uid": int(i),  # "consensus_id": i,
-                "consensus_id": str(feature.getUniqueId()),
-                "quality": round(float(feature.getQuality()), 3),
-                "number_samples": len(feature_data_list),
-                # "number_ext": int(len(features_list)),
-                "rt": round(float(np.mean(rt_values)), 4)
-                if len(rt_values) > 0
-                else 0.0,
-                "mz": round(float(np.mean(mz_values)), 4)
-                if len(mz_values) > 0
-                else 0.0,
-                "rt_min": round(float(np.min(rt_values)), 3)
-                if len(rt_values) > 0
-                else 0.0,
-                "rt_max": round(float(np.max(rt_values)), 3)
-                if len(rt_values) > 0
-                else 0.0,
-                "rt_mean": round(float(np.mean(rt_values)), 3)
-                if len(rt_values) > 0
-                else 0.0,
-                "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
-                if len(rt_start_values) > 0
-                else 0.0,
-                "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
-                if len(rt_end_values) > 0
-                else 0.0,
-                "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
-                if len(rt_delta_values) > 0
-                else 0.0,
-                "mz_min": round(float(np.min(mz_values)), 4)
-                if len(mz_values) > 0
-                else 0.0,
-                "mz_max": round(float(np.max(mz_values)), 4)
-                if len(mz_values) > 0
-                else 0.0,
-                "mz_mean": round(float(np.mean(mz_values)), 4)
-                if len(mz_values) > 0
-                else 0.0,
-                "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
-                if len(mz_start_values) > 0
-                else 0.0,
-                "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
-                if len(mz_end_values) > 0
-                else 0.0,
-                "inty_mean": round(float(np.mean(inty_values)), 0)
-                if len(inty_values) > 0
-                else 0.0,
-                "bl": -1.0,
-                "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
-                if len(coherence_values) > 0
-                else 0.0,
-                "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0)
-                if len(prominence_values) > 0
-                else 0.0,
-                "chrom_prominence_scaled_mean": round(
-                    float(np.mean(prominence_scaled_values)),
-                    3,
-                )
-                if len(prominence_scaled_values) > 0
-                else 0.0,
-                "chrom_height_scaled_mean": round(
-                    float(np.mean(height_scaled_values)),
-                    3,
-                )
-                if len(height_scaled_values) > 0
-                else 0.0,
-                "iso_mean": round(float(np.mean(iso_values)), 2)
-                if len(iso_values) > 0
-                else 0.0,
-                "charge_mean": round(float(np.mean(charge_values)), 2)
-                if len(charge_values) > 0
-                else 0.0,
-                "number_ms2": int(ms2_count),
-                "adducts": consensus_adduct_values
-                if consensus_adduct_values
-                else [],  # Ensure it's always a list
-                # New columns for top-ranked adduct information
-                "adduct_top": adduct_top,
-                "adduct_charge_top": adduct_charge_top,
-                "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
-                if adduct_mass_neutral_top is not None
-                else None,
-                "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
-                if adduct_mass_shift_top is not None
-                else None,
-            },
-        )
+        metadata_list.append({
+            "consensus_uid": int(i),  # "consensus_id": i,
+            "consensus_id": str(feature.getUniqueId()),
+            "quality": round(float(feature.getQuality()), 3),
+            "number_samples": len(feature_data_list),
+            # "number_ext": int(len(features_list)),
+            "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
+            "mz": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
+            "rt_min": round(float(np.min(rt_values)), 3) if len(rt_values) > 0 else 0.0,
+            "rt_max": round(float(np.max(rt_values)), 3) if len(rt_values) > 0 else 0.0,
+            "rt_mean": round(float(np.mean(rt_values)), 3) if len(rt_values) > 0 else 0.0,
+            "rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
+            "rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
+            "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
+            "mz_min": round(float(np.min(mz_values)), 4) if len(mz_values) > 0 else 0.0,
+            "mz_max": round(float(np.max(mz_values)), 4) if len(mz_values) > 0 else 0.0,
+            "mz_mean": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
+            "mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
+            "mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
+            "inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
+            "bl": -1.0,
+            "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
+            "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
+            "chrom_prominence_scaled_mean": round(
+                float(np.mean(prominence_scaled_values)),
+                3,
+            )
+            if len(prominence_scaled_values) > 0
+            else 0.0,
+            "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3)
+            if len(height_scaled_values) > 0
+            else 0.0,
+            "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
+            "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
+            "number_ms2": int(ms2_count),
+            "adducts": consensus_adduct_values if consensus_adduct_values else [],  # Ensure it's always a list
+            # New columns for top-ranked adduct information
+            "adduct_top": adduct_top,
+            "adduct_charge_top": adduct_charge_top,
+            "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
+            "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
+        })
 
     consensus_mapping_df = pl.DataFrame(consensus_mapping)
     # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
@@ -871,74 +736,72 @@ def merge(self, **kwargs):
     )
 
     self.consensus_map = consensus_map
-
+
     # Add adduct grouping and adduct_of assignment
     if len(self.consensus_df) > 0:
         # Get rt_tol and mz_tol from kwargs or use defaults from merge_defaults
         adduct_rt_tol = rt_tol  # Use the same rt_tol from merge parameters
         adduct_mz_tol = mz_tol  # Use the same mz_tol from merge parameters
-
+
         # Initialize new columns
         adduct_group_list = []
         adduct_of_list = []
-
+
         # Get relevant columns for grouping
         consensus_data = []
         for row in self.consensus_df.iter_rows(named=True):
-            consensus_data.append(
-                {
-                    "consensus_uid": row["consensus_uid"],
-                    "rt": row["rt"],
-                    "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
-                    "adduct_top": row.get("adduct_top"),
-                    "inty_mean": row.get("inty_mean", 0),
-                },
-            )
-
+            consensus_data.append({
+                "consensus_uid": row["consensus_uid"],
+                "rt": row["rt"],
+                "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
+                "adduct_top": row.get("adduct_top"),
+                "inty_mean": row.get("inty_mean", 0),
+            })
+
         # Group features with similar neutral mass and RT
         group_id = 1
         assigned_groups = {}  # consensus_uid -> group_id
         groups = {}  # group_id -> [consensus_uids]
-
+
         for i, feature in enumerate(consensus_data):
             consensus_uid = feature["consensus_uid"]
-
+
             if consensus_uid in assigned_groups:
                 continue
-
+
             neutral_mass = feature["adduct_mass_neutral_top"]
             rt = feature["rt"]
-
+
             # Skip if neutral mass is None
             if neutral_mass is None:
                 assigned_groups[consensus_uid] = 0  # No group assignment
                 continue
-
+
             # Find all features that could belong to the same group
             group_members = [consensus_uid]
-
+
             for j, other_feature in enumerate(consensus_data):
                 if i == j:
                     continue
-
+
                 other_uid = other_feature["consensus_uid"]
                 if other_uid in assigned_groups:
                     continue
-
+
                 other_neutral_mass = other_feature["adduct_mass_neutral_top"]
                 other_rt = other_feature["rt"]
-
+
                 if other_neutral_mass is None:
                     continue
-
+
                 # Check if features have similar neutral mass and RT
                 mass_diff = abs(neutral_mass - other_neutral_mass)
                 rt_diff = abs(rt - other_rt) / 60.0  # Convert to minutes for rt_tol
-
+
                 if mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol:
                     group_members.append(other_uid)
                     assigned_groups[other_uid] = group_id
-
+
             if len(group_members) > 1:
                 # Multiple members - create a group
                 for member_uid in group_members:
@@ -950,29 +813,26 @@ def merge(self, **kwargs):
                 assigned_groups[consensus_uid] = group_id
                 groups[group_id] = [consensus_uid]
                 group_id += 1
-
+
         # Determine adduct_of for each group
         group_adduct_of = {}  # group_id -> consensus_uid of most important adduct
-
+
         for grp_id, member_uids in groups.items():
             # Find the most important adduct in this group
             # Priority: [M+H]+ > [M-H]- > highest intensity
             best_uid = None
             best_priority = -1
             best_intensity = 0
-
+
             for uid in member_uids:
                 # Find the feature data
-                feature_data = next(
-                    (f for f in consensus_data if f["consensus_uid"] == uid),
-                    None,
-                )
+                feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
                 if not feature_data:
                     continue
-
+
                 adduct = feature_data.get("adduct_top", "")
                 intensity = feature_data.get("inty_mean", 0)
-
+
                 priority = 0
                 if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
                     priority = 3  # Highest priority for [M+H]+ or H
@@ -980,41 +840,34 @@ def merge(self, **kwargs):
                     priority = 2  # Second priority for [M-H]-
                 elif adduct and "M" in adduct:
                     priority = 1  # Third priority for other molecular adducts
-
+
                 # Choose based on priority first, then intensity
-                if priority > best_priority or
-                    priority == best_priority and intensity > best_intensity
-                ):
+                if (priority > best_priority or
+                        (priority == best_priority and intensity > best_intensity)):
                     best_uid = uid
                     best_priority = priority
                     best_intensity = intensity
-
+
             group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
+
         # Build the final lists in the same order as consensus_df
         for row in self.consensus_df.iter_rows(named=True):
             consensus_uid = row["consensus_uid"]
             group = assigned_groups.get(consensus_uid, 0)
             adduct_of = group_adduct_of.get(group, consensus_uid)
-
+
             adduct_group_list.append(group)
             adduct_of_list.append(adduct_of)
-
+
         # Add the new columns to consensus_df
-        self.consensus_df = self.consensus_df.with_columns(
-            [
-                pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
-                pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
-            ],
-        )
-
+        self.consensus_df = self.consensus_df.with_columns([
+            pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
+            pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
+        ])
+
     # calculate the completeness of the consensus map
     if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
-        c = (
-            len(self.consensus_mapping_df)
-            / len(self.consensus_df)
-            / len(self.samples_df)
-        )
+        c = len(self.consensus_mapping_df) / len(self.consensus_df) / len(self.samples_df)
         self.logger.info(
             f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
        )
```
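The grouping pass above is a simple O(n²) sweep: two consensus features join the same adduct group when their neutral masses agree within `mz_tol` (Da) and their RTs agree within `rt_tol` (minutes, hence the `/60.0` on second-based RTs). A standalone sketch with invented values:

```python
# Invented (neutral_mass_Da, rt_seconds) pairs keyed by consensus_uid;
# tolerances mirror the merge defaults quoted earlier (mz_tol=0.01 Da, rt_tol=1.0 min).
features = {1: (180.0634, 65.0), 2: (180.0641, 80.0), 3: (212.0080, 66.0)}
mz_tol, rt_tol = 0.01, 1.0

def same_group(a: int, b: int) -> bool:
    (m1, t1), (m2, t2) = features[a], features[b]
    return abs(m1 - m2) <= mz_tol and abs(t1 - t2) / 60.0 <= rt_tol

print(same_group(1, 2))  # True: 0.0007 Da and 0.25 min apart
print(same_group(1, 3))  # False: neutral masses differ by ~32 Da
```

The completeness figure logged after merging is just `len(consensus_mapping_df) / len(consensus_df) / len(samples_df)`: for example, 240 mapping rows over 100 consensus features and 3 samples gives 240 / 100 / 3 = 0.80, i.e. each consensus feature was found in 80% of samples on average.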
```diff
@@ -1085,9 +938,7 @@ def find_ms2(self, **kwargs):
     ]
     for row in feats.iter_rows(named=True):
         feature_uid = row["feature_uid"]
-        feature_lookup[feature_uid] = {
-            col: row[col] for col in relevant_cols if col in feats.columns
-        }
+        feature_lookup[feature_uid] = {col: row[col] for col in relevant_cols if col in feats.columns}
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
 
     # Process consensus mapping in batch
@@ -1109,26 +960,20 @@ def find_ms2(self, **kwargs):
             for j in range(len(ms2_specs)):
                 spec = ms2_specs[j]
                 scanid = ms2_scans[j]
-                data.append(
-                    {
-                        "consensus_uid": int(mapping_row["consensus_uid"]),
-                        "feature_uid": int(mapping_row["feature_uid"]),
-                        "sample_uid": int(mapping_row["sample_uid"]),
-                        "scan_id": int(scanid),
-                        "energy": round(spec.energy, 1)
-                        if hasattr(spec, "energy") and spec.energy is not None
-                        else None,
-                        "prec_inty": round(inty, 0) if inty is not None else None,
-                        "prec_coherence": round(chrom_coherence, 3)
-                        if chrom_coherence is not None
-                        else None,
-                        "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
-                        if chrom_prominence_scaled is not None
-                        else None,
-                        "number_frags": len(spec.mz),
-                        "spec": spec,
-                    },
-                )
+                data.append({
+                    "consensus_uid": int(mapping_row["consensus_uid"]),
+                    "feature_uid": int(mapping_row["feature_uid"]),
+                    "sample_uid": int(mapping_row["sample_uid"]),
+                    "scan_id": int(scanid),
+                    "energy": round(spec.energy, 1) if hasattr(spec, "energy") and spec.energy is not None else None,
+                    "prec_inty": round(inty, 0) if inty is not None else None,
+                    "prec_coherence": round(chrom_coherence, 3) if chrom_coherence is not None else None,
+                    "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
+                    if chrom_prominence_scaled is not None
+                    else None,
+                    "number_frags": len(spec.mz),
+                    "spec": spec,
+                })
             self.consensus_ms2 = pl.DataFrame(data)
             if not self.consensus_ms2.is_empty():
                 unique_consensus_features = self.consensus_ms2["consensus_uid"].n_unique()
@@ -1161,10 +1006,7 @@ def filter_consensus(
     else:
         if isinstance(coherence, tuple) and len(coherence) == 2:
             min_coherence, max_coherence = coherence
-            cons = cons[
-                (cons["chrom_coherence"] >= min_coherence)
-                & (cons["chrom_coherence"] <= max_coherence)
-            ]
+            cons = cons[(cons["chrom_coherence"] >= min_coherence) & (cons["chrom_coherence"] <= max_coherence)]
         else:
             cons = cons[cons["chrom_coherence"] >= coherence]
     after_coherence = len(cons)
@@ -1175,9 +1017,7 @@ def filter_consensus(
     if quality is not None:
         if isinstance(quality, tuple) and len(quality) == 2:
             min_quality, max_quality = quality
-            cons = cons[
-                (cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)
-            ]
+            cons = cons[(cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)]
         else:
             cons = cons[cons["quality"] >= quality]
     after_quality = len(cons)
@@ -1188,10 +1028,7 @@ def filter_consensus(
     if number_samples is not None:
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_number, max_number = number_samples
-            cons = cons[
-                (cons["number_samples"] >= min_number)
-                & (cons["number_samples"] <= max_number)
-            ]
+            cons = cons[(cons["number_samples"] >= min_number) & (cons["number_samples"] <= max_number)]
         else:
             cons = cons[cons["number_samples"] >= number_samples]
     after_number_samples = len(cons)
```
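`filter_consensus` repeats one idiom three times: a filter argument may be a scalar (treated as a minimum) or a 2-tuple (treated as an inclusive range). A hypothetical helper distilling that dispatch, separate from the DataFrame machinery:

```python
def passes(value, threshold):
    """threshold is either a scalar minimum or an inclusive (lo, hi) range."""
    if isinstance(threshold, tuple) and len(threshold) == 2:
        lo, hi = threshold
        return lo <= value <= hi
    return value >= threshold

assert passes(0.8, 0.5)             # scalar: minimum
assert passes(0.8, (0.5, 0.9))      # 2-tuple: inclusive range
assert not passes(0.95, (0.5, 0.9))
```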
```diff
@@ -1268,13 +1105,11 @@ def _integrate_chrom_impl(self, **kwargs):
 
     # Merge consensus_mapping with consensus_df to get rt_start_mean and rt_end_mean
     # Use Polars join operation instead of pandas merge
-    consensus_subset = self.consensus_df.select(
-        [
-            "consensus_uid",
-            "rt_start_mean",
-            "rt_end_mean",
-        ],
-    )
+    consensus_subset = self.consensus_df.select([
+        "consensus_uid",
+        "rt_start_mean",
+        "rt_end_mean",
+    ])
     df1 = self.consensus_mapping_df.join(
         consensus_subset,
         on="consensus_uid",
@@ -1319,9 +1154,9 @@ def _integrate_chrom_impl(self, **kwargs):
         if chrom is None or len(chrom) == 0:
             update_rows.append(row_idx)
             chroms.append(None)
-            rt_starts.append(float(
-            rt_ends.append(float(
-            rt_deltas.append(float(
+            rt_starts.append(float('nan'))
+            rt_ends.append(float('nan'))
+            rt_deltas.append(float('nan'))
             chrom_areas.append(-1.0)
             continue
         ## TODO expose parameters
@@ -1351,13 +1186,9 @@ def _integrate_chrom_impl(self, **kwargs):
     if update_rows:
         # Create mapping from row index to new values
         row_to_chrom = {update_rows[i]: chroms[i] for i in range(len(update_rows))}
-        row_to_rt_start = {
-            update_rows[i]: rt_starts[i] for i in range(len(update_rows))
-        }
+        row_to_rt_start = {update_rows[i]: rt_starts[i] for i in range(len(update_rows))}
         row_to_rt_end = {update_rows[i]: rt_ends[i] for i in range(len(update_rows))}
-        row_to_rt_delta = {
-            update_rows[i]: rt_deltas[i] for i in range(len(update_rows))
-        }
+        row_to_rt_delta = {update_rows[i]: rt_deltas[i] for i in range(len(update_rows))}
         row_to_chrom_area = {
             update_rows[i]: float(chrom_areas[i]) if chrom_areas[i] is not None else 0.0
             for i in range(len(update_rows))
@@ -1371,60 +1202,58 @@ def _integrate_chrom_impl(self, **kwargs):
 
     # Update columns conditionally
     try:
-        self.features_df = df_with_index.with_columns(
-            [
-                # Update chrom column - use when() to update only specific rows
-                pl.when(update_mask)
-                .then(
-                    pl.col("__row_idx").map_elements(
-                        lambda x: row_to_chrom.get(x, None),
-                        return_dtype=pl.Object,
-                    ),
-                )
-                .otherwise(pl.col("chrom"))
-                .alias("chrom"),
-                # Update rt_start column
-                pl.when(update_mask)
-                .then(
-                    pl.col("__row_idx").map_elements(
-                        lambda x: row_to_rt_start.get(x, None),
-                        return_dtype=pl.Float64,
-                    ),
-                )
-                .otherwise(pl.col("rt_start"))
-                .alias("rt_start"),
-                # Update rt_end column
-                pl.when(update_mask)
-                .then(
-                    pl.col("__row_idx").map_elements(
-                        lambda x: row_to_rt_end.get(x, None),
-                        return_dtype=pl.Float64,
-                    ),
-                )
-                .otherwise(pl.col("rt_end"))
-                .alias("rt_end"),
-                # Update rt_delta column
-                pl.when(update_mask)
-                .then(
-                    pl.col("__row_idx").map_elements(
-                        lambda x: row_to_rt_delta.get(x, None),
-                        return_dtype=pl.Float64,
-                    ),
-                )
-                .otherwise(pl.col("rt_delta"))
-                .alias("rt_delta"),
-                # Update chrom_area column
-                pl.when(update_mask)
-                .then(
-                    pl.col("__row_idx").map_elements(
-                        lambda x: row_to_chrom_area.get(x, 0),
-                        return_dtype=pl.Float64,
-                    ),
-                )
-                .otherwise(pl.col("chrom_area"))
-                .alias("chrom_area"),
-            ],
-        ).drop("__row_idx")  # Remove the temporary row index column
+        self.features_df = df_with_index.with_columns([
+            # Update chrom column - use when() to update only specific rows
+            pl.when(update_mask)
+            .then(
+                pl.col("__row_idx").map_elements(
+                    lambda x: row_to_chrom.get(x, None),
+                    return_dtype=pl.Object,
+                ),
+            )
+            .otherwise(pl.col("chrom"))
+            .alias("chrom"),
+            # Update rt_start column
+            pl.when(update_mask)
+            .then(
+                pl.col("__row_idx").map_elements(
+                    lambda x: row_to_rt_start.get(x, None),
+                    return_dtype=pl.Float64,
+                ),
+            )
+            .otherwise(pl.col("rt_start"))
+            .alias("rt_start"),
+            # Update rt_end column
+            pl.when(update_mask)
+            .then(
+                pl.col("__row_idx").map_elements(
+                    lambda x: row_to_rt_end.get(x, None),
+                    return_dtype=pl.Float64,
+                ),
+            )
+            .otherwise(pl.col("rt_end"))
+            .alias("rt_end"),
+            # Update rt_delta column
+            pl.when(update_mask)
+            .then(
+                pl.col("__row_idx").map_elements(
+                    lambda x: row_to_rt_delta.get(x, None),
+                    return_dtype=pl.Float64,
+                ),
+            )
+            .otherwise(pl.col("rt_delta"))
+            .alias("rt_delta"),
+            # Update chrom_area column
+            pl.when(update_mask)
+            .then(
+                pl.col("__row_idx").map_elements(
+                    lambda x: row_to_chrom_area.get(x, 0),
+                    return_dtype=pl.Float64,
+                ),
+            )
+            .otherwise(pl.col("chrom_area"))
+            .alias("chrom_area"),
+        ]).drop("__row_idx")  # Remove the temporary row index column
 
         self.logger.debug(
             f"Integration completed. Updated {len(update_rows)} features with chromatogram data.",
```
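The `with_columns([...])` chain above is the standard Polars recipe for updating a subset of rows: tag rows with a temporary index, then `pl.when(mask).then(new_value).otherwise(old_column)` per column. A compact, self-contained sketch of the same pattern (the frame and patch map are invented; assumes a recent Polars with `with_row_index` and `map_elements`):

```python
import polars as pl

df = pl.DataFrame({"rt_start": [1.0, 2.0, 3.0]}).with_row_index("__row_idx")
row_to_rt_start = {1: 99.0}  # row index -> replacement value
update_mask = pl.col("__row_idx").is_in(list(row_to_rt_start.keys()))

df = df.with_columns(
    pl.when(update_mask)
    .then(pl.col("__row_idx").map_elements(lambda x: row_to_rt_start.get(x), return_dtype=pl.Float64))
    .otherwise(pl.col("rt_start"))
    .alias("rt_start")
).drop("__row_idx")
print(df["rt_start"].to_list())  # [1.0, 99.0, 3.0]
```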
```diff
@@ -1514,22 +1343,10 @@ def _align_pose_clustering(study_obj, fmaps, params):
     params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
     params_oms.setValue("pairfinder:ignore_charge", "true")
     params_oms.setValue("max_num_peaks_considered", 1000)
-    params_oms.setValue(
-        "pairfinder:distance_RT:max_difference",
-        params.get("rt_max_diff"),
-    )
-    params_oms.setValue(
-        "pairfinder:distance_MZ:max_difference",
-        params.get("mz_max_diff"),
-    )
-    params_oms.setValue(
-        "superimposer:rt_pair_distance_fraction",
-        params.get("rt_pair_distance_frac"),
-    )
-    params_oms.setValue(
-        "superimposer:mz_pair_max_distance",
-        params.get("mz_pair_max_distance"),
-    )
+    params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
+    params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
+    params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
+    params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
     params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
     params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
     params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
@@ -1538,9 +1355,7 @@ def _align_pose_clustering(study_obj, fmaps, params):
     study_obj.logger.info("Starting alignment with PoseClustering")
 
     # Set ref_index to feature map index with largest number of features
-    ref_index = [
-        i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
-    ][-1]
+    ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
     study_obj.logger.debug(
         f"Reference map is {study_obj.samples_df.row(ref_index, named=True)['sample_name']}",
     )
```
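The sorted-enumerate one-liner kept here picks the index of the largest feature map; `_align_kd_algorithm` further down does the same thing more directly with `max(..., key=...)`. Both idioms, side by side, with stand-in sizes:

```python
fmap_sizes = [120, 340, 95]  # stand-ins for fm.size() of each feature map

# Idiom kept in this hunk: sort (index, size) pairs by size, take the last index
ref_index = [i[0] for i in sorted(enumerate(fmap_sizes), key=lambda x: x[1])][-1]

# Equivalent, more direct argmax used later in _align_kd_algorithm
assert ref_index == max(range(len(fmap_sizes)), key=lambda i: fmap_sizes[i]) == 1
```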
```diff
@@ -1559,10 +1374,7 @@ def _align_pose_clustering(study_obj, fmaps, params):
     ):
         if index == ref_index:
             continue
-        if (
-            params.get("skip_blanks")
-            and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank"
-        ):
+        if params.get("skip_blanks") and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank":
             continue
         trafo = oms.TransformationDescription()
         aligner.align(fm, trafo)
@@ -1581,30 +1393,19 @@ def _align_kd_algorithm(study_obj, fmaps, params):
 
     # Pull parameter values - map standard align params to our algorithm
     # Use rt_max_diff (standard align param) instead of warp_rt_tol for RT tolerance
-    rt_pair_tol = (
-        float(params.get("rt_max_diff"))
-        if params.get("rt_max_diff") is not None
-        else 2.0
-    )
+    rt_pair_tol = float(params.get("rt_max_diff")) if params.get("rt_max_diff") is not None else 2.0
     # Use mz_max_diff (standard align param) converted to ppm
-    mz_max_diff_da = (
-        float(params.get("mz_max_diff"))
-        if params.get("mz_max_diff") is not None
-        else 0.02
-    )
+    mz_max_diff_da = float(params.get("mz_max_diff")) if params.get("mz_max_diff") is not None else 0.02
     # Convert Da to ppm (assuming ~400 m/z average for metabolomics): 0.01 Da / 400 * 1e6 = 25 ppm
     ppm_tol = mz_max_diff_da / 400.0 * 1e6
     # Allow override with warp_mz_tol if specifically set (but not from defaults)
     try:
         warp_mz_from_params = params.get("warp_mz_tol")
-        if (
-            warp_mz_from_params is not None
-            and warp_mz_from_params != params.__class__().warp_mz_tol
-        ):
+        if warp_mz_from_params is not None and warp_mz_from_params != params.__class__().warp_mz_tol:
             ppm_tol = float(warp_mz_from_params)
     except (KeyError, AttributeError):
         pass
-
+
     # Safely retrieve optional parameter max_anchor_points (not yet part of defaults)
     try:
         _raw_mp = params.get("max_anchor_points")
```
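The Da→ppm conversion is worth making concrete. At the assumed 400 m/z midpoint, the 0.02 Da fallback for `mz_max_diff` becomes 50 ppm, and the code comment's own example of 0.01 Da gives 25 ppm:

```python
def da_to_ppm(delta_da: float, mz: float = 400.0) -> float:
    """Absolute m/z tolerance in Da -> ppm at a reference m/z (400 assumed above)."""
    return delta_da / mz * 1e6

assert round(da_to_ppm(0.01), 6) == 25.0  # the example in the code comment
assert round(da_to_ppm(0.02), 6) == 50.0  # the 0.02 Da fallback default above
```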
```diff
@@ -1612,18 +1413,16 @@ def _align_kd_algorithm(study_obj, fmaps, params):
         _raw_mp = None
     max_points = int(_raw_mp) if _raw_mp is not None else 1000
     study_obj.logger.info(
-            f"Starting custom KD-style alignment (ref-based) with ppm_tol={ppm_tol:.1f}, rt_tol={rt_pair_tol:.1f}s, max_points={max_points}"
-    )
-    study_obj.logger.info(
-        f"Using rt_max_diff={params.get('rt_max_diff')}, mz_max_diff={params.get('mz_max_diff')}",
+        f"Starting custom KD-style alignment (ref-based) with ppm_tol={ppm_tol:.1f}, rt_tol={rt_pair_tol:.1f}s, max_points={max_points}"
     )
+    study_obj.logger.info(f"Using rt_max_diff={params.get('rt_max_diff')}, mz_max_diff={params.get('mz_max_diff')}")
 
     # Choose reference map (largest number of features)
     ref_index = max(range(len(fmaps)), key=lambda i: fmaps[i].size())
     ref_map = fmaps[ref_index]
     study_obj.alignment_ref_index = ref_index
     study_obj.logger.debug(
-            f"Reference map index {ref_index} (sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) size={ref_map.size()}"
+        f"Reference map index {ref_index} (sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) size={ref_map.size()}"
     )
 
     # Extract and sort reference features by m/z for binary search
@@ -1647,10 +1446,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
                 best_drt = drt
         return best
 
-    def _set_pairs(
-        td_obj: oms.TransformationDescription,
-        pairs,
-    ):  # Helper for pyopenms API variability
+    def _set_pairs(td_obj: 'oms.TransformationDescription', pairs):  # Helper for pyopenms API variability
         # Always provide list of lists to satisfy strict type expectations
         conv = [[float(a), float(b)] for a, b in pairs]
         try:
@@ -1732,7 +1528,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
             td.fitModel(model, oms.Param())
         except Exception as e:
             study_obj.logger.debug(
-                    f"Map {i}: {model} fitting failed ({e}); fallback to linear two-point shift"
+                f"Map {i}: {model} fitting failed ({e}); fallback to linear two-point shift"
             )
             rts = [f.getRT() for f in fmap]
             lo, hi = (min(rts), max(rts)) if rts else (0.0, 1.0)
@@ -1744,7 +1540,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
             pass
 
         study_obj.logger.debug(
-                f"Map {i}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s"
+            f"Map {i}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s"
         )
         transformations.append(td)
 
@@ -1762,7 +1558,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
             study_obj.logger.warning(f"Map {i}: failed applying transformation ({e})")
 
     study_obj.logger.info(
-            f"Custom KD alignment completed. Reference index {ref_index}. Applied {len(transformations)} transformations."
+        f"Custom KD alignment completed. Reference index {ref_index}. Applied {len(transformations)} transformations."
     )
 
 
@@ -1771,21 +1567,13 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
     import pyopenms as oms
 
     aligner = oms.MapAlignmentAlgorithmPoseClustering()
-    ref_index = [
-        i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
-    ][-1]
+    ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
 
     # Set up basic parameters for pose clustering
     pc_params = oms.Param()
     pc_params.setValue("max_num_peaks_considered", 1000)
-    pc_params.setValue(
-        "pairfinder:distance_RT:max_difference",
-        params.get("rt_max_diff"),
-    )
-    pc_params.setValue(
-        "pairfinder:distance_MZ:max_difference",
-        params.get("mz_max_diff"),
-    )
+    pc_params.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
+    pc_params.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
 
     aligner.setParameters(pc_params)
     aligner.setReference(fmaps[ref_index])
```