masster 0.4.4-py3-none-any.whl → 0.4.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/chromatogram.py +2 -2
- masster/data/libs/urine.csv +3 -3
- masster/logger.py +8 -8
- masster/sample/adducts.py +337 -263
- masster/sample/defaults/find_adducts_def.py +21 -8
- masster/sample/h5.py +557 -278
- masster/sample/helpers.py +131 -75
- masster/sample/lib.py +2 -2
- masster/sample/load.py +25 -11
- masster/sample/plot.py +5 -5
- masster/sample/processing.py +115 -85
- masster/sample/sample.py +28 -15
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +34 -11
- masster/spectrum.py +2 -2
- masster/study/defaults/align_def.py +5 -1
- masster/study/defaults/identify_def.py +3 -1
- masster/study/defaults/study_def.py +58 -25
- masster/study/export.py +354 -204
- masster/study/h5.py +557 -155
- masster/study/helpers.py +487 -194
- masster/study/id.py +536 -347
- masster/study/load.py +228 -138
- masster/study/plot.py +68 -68
- masster/study/processing.py +455 -253
- masster/study/save.py +14 -4
- masster/study/study.py +122 -40
- masster/study/study5_schema.json +149 -149
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/METADATA +5 -3
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/RECORD +34 -34
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/WHEEL +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/entry_points.txt +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/licenses/LICENSE +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/top_level.txt +0 -0
masster/study/processing.py
CHANGED
@@ -115,7 +115,8 @@ def align(self, **kwargs):
     # Pre-build sample_uid lookup for faster access
     self.logger.debug("Build sample_uid lookup for fast access...")
     sample_uid_lookup = {
-        idx: row_dict["sample_uid"]
+        idx: row_dict["sample_uid"]
+        for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
     }

     # Build the main lookup using feature_uid (not feature_id)
@@ -215,7 +216,7 @@ def align(self, **kwargs):
     self.features_df = self.features_df.with_columns(*new_cols)

     self.logger.debug("Alignment completed successfully.")
-
+
     # Reset consensus data structures after alignment since RT changes invalidate consensus
     consensus_reset_count = 0
     if not self.consensus_df.is_empty():
@@ -227,7 +228,7 @@ def align(self, **kwargs):
         if not self.consensus_ms2.is_empty():
             self.consensus_ms2 = pl.DataFrame()
             consensus_reset_count += 1
-
+
     # Remove merge and find_ms2 parameters from history since they need to be re-run
     keys_to_remove = ["merge", "find_ms2"]
     history_removed_count = 0
@@ -237,9 +238,11 @@ def align(self, **kwargs):
             del self.history[key]
             history_removed_count += 1
             self.logger.debug(f"Removed {key} from history")
-
+
     if consensus_reset_count > 0 or history_removed_count > 0:
-        self.logger.info(
+        self.logger.info(
+            f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed",
+        )

     if params.get("save_features"):
         self.save_samples()
@@ -290,7 +293,10 @@ def merge(self, **kwargs):
     algorithm = params.get("algorithm")
     min_samples = params.get("min_samples")
     link_ms2 = params.get("link_ms2")
-    mz_tol = kwargs.get(
+    mz_tol = kwargs.get(
+        "mz_tol",
+        0.01,
+    )  # Default values for parameters not in defaults class
     rt_tol = kwargs.get("rt_tol", 1.0)

     if len(self.samples_df) > 200 and algorithm == "qt":
@@ -399,7 +405,10 @@ def merge(self, **kwargs):
     consensus_map.setUniqueIds()

     # create a dict to map uid to feature_uid using self.features_df
-    feature_uid_map = {
+    feature_uid_map = {
+        row["feature_id"]: row["feature_uid"]
+        for row in self.features_df.iter_rows(named=True)
+    }
     imax = consensus_map.size()

     # Pre-build fast lookup tables for features_df data
@@ -426,7 +435,9 @@ def merge(self, **kwargs):

     for row in self.features_df.iter_rows(named=True):
         feature_uid = row["feature_uid"]
-        features_lookup[feature_uid] = {
+        features_lookup[feature_uid] = {
+            col: row[col] for col in feature_columns if col in self.features_df.columns
+        }

     # create a list to store the consensus mapping
     consensus_mapping = []
@@ -453,11 +464,13 @@ def merge(self, **kwargs):
             # this is a feature that was removed but is still in the feature maps
             continue
         fuid = feature_uid_map[fuid]
-        consensus_mapping.append(
+        consensus_mapping.append(
+            {
+                "consensus_uid": i,
+                "sample_uid": f.getMapIndex() + 1,
+                "feature_uid": fuid,
+            },
+        )
         uids.append(fuid)

     # Get feature data from lookup instead of DataFrame filtering
@@ -471,43 +484,99 @@ def merge(self, **kwargs):

     # Compute statistics using vectorized operations on collected data
     # Convert to numpy arrays for faster computation
-    rt_values = np.array(
+    rt_values = np.array(
+        [fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None],
+    )
+    mz_values = np.array(
+        [fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
+    )
+    rt_start_values = np.array(
+        [
+            fd.get("rt_start", 0)
+            for fd in feature_data_list
+            if fd.get("rt_start") is not None
+        ],
+    )
+    rt_end_values = np.array(
+        [
+            fd.get("rt_end", 0)
+            for fd in feature_data_list
+            if fd.get("rt_end") is not None
+        ],
+    )
+    rt_delta_values = np.array(
+        [
+            fd.get("rt_delta", 0)
+            for fd in feature_data_list
+            if fd.get("rt_delta") is not None
+        ],
+    )
+    mz_start_values = np.array(
+        [
+            fd.get("mz_start", 0)
+            for fd in feature_data_list
+            if fd.get("mz_start") is not None
+        ],
+    )
+    mz_end_values = np.array(
+        [
+            fd.get("mz_end", 0)
+            for fd in feature_data_list
+            if fd.get("mz_end") is not None
+        ],
+    )
+    inty_values = np.array(
+        [
+            fd.get("inty", 0)
+            for fd in feature_data_list
+            if fd.get("inty") is not None
+        ],
+    )
+    coherence_values = np.array(
+        [
+            fd.get("chrom_coherence", 0)
+            for fd in feature_data_list
+            if fd.get("chrom_coherence") is not None
+        ],
+    )
+    prominence_values = np.array(
+        [
+            fd.get("chrom_prominence", 0)
+            for fd in feature_data_list
+            if fd.get("chrom_prominence") is not None
+        ],
+    )
+    prominence_scaled_values = np.array(
+        [
+            fd.get("chrom_prominence_scaled", 0)
+            for fd in feature_data_list
+            if fd.get("chrom_prominence_scaled") is not None
+        ],
+    )
+    height_scaled_values = np.array(
+        [
+            fd.get("chrom_height_scaled", 0)
+            for fd in feature_data_list
+            if fd.get("chrom_height_scaled") is not None
+        ],
+    )
+    iso_values = np.array(
+        [fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
+    )
+    charge_values = np.array(
+        [
+            fd.get("charge", 0)
+            for fd in feature_data_list
+            if fd.get("charge") is not None
+        ],
+    )

     # adduct_values
     # Collect all adducts from feature_data_list to create consensus adduct information
     # Only consider adducts that are in study._get_adducts() plus items with '?'
     all_adducts = []
     adduct_masses = {}
-
+
     # Get valid adducts from study._get_adducts()
     valid_adducts = set()
     try:
@@ -516,7 +585,7 @@ def merge(self, **kwargs):
             valid_adducts.update(study_adducts_df["name"].to_list())
     except Exception as e:
         self.logger.warning(f"Could not retrieve study adducts: {e}")
-
+
     # Always allow '?' adducts
     valid_adducts.add("?")

@@ -527,7 +596,7 @@ def merge(self, **kwargs):

     if adduct is not None:
         # Only include adducts that are valid (from study._get_adducts() or contain '?')
-        if adduct in valid_adducts or
+        if adduct in valid_adducts or "?" in adduct:
             all_adducts.append(adduct)
             if adduct_mass is not None:
                 adduct_masses[adduct] = adduct_mass
@@ -535,33 +604,37 @@ def merge(self, **kwargs):
     # Calculate adduct_values for the consensus feature
     adduct_values = []
     if all_adducts:
-        adduct_counts = {
+        adduct_counts = {
+            adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
+        }
         total_count = sum(adduct_counts.values())
         for adduct, count in adduct_counts.items():
             percentage = (count / total_count) * 100 if total_count > 0 else 0
             mass = adduct_masses.get(adduct, None)
             # Store as list with [name, num, %] format for the adducts column
-            adduct_values.append(
+            adduct_values.append(
+                [
+                    str(adduct),
+                    int(count),
+                    float(round(percentage, 2)),
+                ],
+            )

         # Sort adduct_values by count in descending order
         adduct_values.sort(key=lambda x: x[1], reverse=True)  # Sort by count (index 1)
         # Store adduct_values for use in metadata
         consensus_adduct_values = adduct_values
-
+
         # Extract top adduct information for new columns
         adduct_top = None
         adduct_charge_top = None
         adduct_mass_neutral_top = None
         adduct_mass_shift_top = None
-
+
         if consensus_adduct_values:
             top_adduct_name = consensus_adduct_values[0][0]  # Get top adduct name
             adduct_top = top_adduct_name
-
+
             # Parse adduct information to extract charge and mass shift
             # Handle "?" as "H" and parse common adduct formats
             if top_adduct_name == "?" or top_adduct_name == "[M+?]+":
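Note on the block above: each entry appended to adduct_values has the shape [name, count, percent], sorted by count. A minimal sketch of the same bookkeeping with made-up adducts (the input list is hypothetical, not taken from the diff):

    all_adducts = ["[M+H]+", "[M+H]+", "[M+Na]+"]   # hypothetical input
    adduct_counts = {a: all_adducts.count(a) for a in set(all_adducts)}
    total_count = sum(adduct_counts.values())
    adduct_values = [
        [str(a), int(n), float(round(n / total_count * 100, 2))]
        for a, n in adduct_counts.items()
    ]
    adduct_values.sort(key=lambda x: x[1], reverse=True)
    # adduct_values == [["[M+H]+", 2, 66.67], ["[M+Na]+", 1, 33.33]]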
@@ -577,33 +650,37 @@ def merge(self, **kwargs):
         study_adducts_df = self._get_adducts()
         if not study_adducts_df.is_empty():
             # Look for exact match in study adducts
-            matching_adduct = study_adducts_df.filter(
+            matching_adduct = study_adducts_df.filter(
+                pl.col("name") == top_adduct_name,
+            )
             if not matching_adduct.is_empty():
                 adduct_row = matching_adduct.row(0, named=True)
                 adduct_charge_top = adduct_row["charge"]
                 adduct_mass_shift_top = adduct_row["mass_shift"]
                 adduct_found = True
     except Exception as e:
-        self.logger.warning(
+        self.logger.warning(
+            f"Could not lookup adduct in study adducts: {e}",
+        )
+
     if not adduct_found:
         # Fallback to regex parsing
         import re
-
+
         # Pattern for adducts like [M+H]+, [M-H]-, [M+Na]+, etc.
-        pattern = r
+        pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
         match = re.match(pattern, top_adduct_name)
-
+
         if match:
             sign = match.group(1)
             element = match.group(2)
             multiplier_str = match.group(3)
             charge_sign = match.group(4)
-
+
             multiplier = int(multiplier_str) if multiplier_str else 1
             charge = multiplier if charge_sign == "+" else -multiplier
             adduct_charge_top = charge
-
+
             # Calculate mass shift based on element
             element_masses = {
                 "H": 1.007825,
@@ -617,9 +694,16 @@ def merge(self, **kwargs):
                 "CH3COO": 59.013851,
                 "H2O": 18.010565,
             }
-
-            base_mass = element_masses.get(
-
+
+            base_mass = element_masses.get(
+                element,
+                1.007825,
+            )  # Default to H if unknown
+            mass_shift = (
+                base_mass * multiplier
+                if sign == "+"
+                else -base_mass * multiplier
+            )
             adduct_mass_shift_top = mass_shift
         else:
             # Default fallback
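A quick illustration of the regex fallback introduced above (the adduct string below is a hypothetical input; the element masses are standard monoisotopic values):

    import re

    pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
    match = re.match(pattern, "[M+Na]+")
    sign, element, multiplier_str, charge_sign = match.groups()   # "+", "Na", "", "+"
    multiplier = int(multiplier_str) if multiplier_str else 1
    charge = multiplier if charge_sign == "+" else -multiplier    # +1
    base_mass = {"H": 1.007825, "Na": 22.989770}.get(element, 1.007825)
    mass_shift = base_mass * multiplier if sign == "+" else -base_mass * multiplier
    # mass_shift is roughly +22.99 Da for a sodium adduct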
@@ -627,8 +711,8 @@ def merge(self, **kwargs):
             adduct_mass_shift_top = 1.007825
     else:
         # No valid adducts found - assign default based on study polarity
-        study_polarity = getattr(self,
-        if study_polarity in [
+        study_polarity = getattr(self, "polarity", "positive")
+        if study_polarity in ["negative", "neg"]:
             # Negative mode default
             adduct_top = "[M-?]1-"
             adduct_charge_top = -1
@@ -638,14 +722,18 @@ def merge(self, **kwargs):
             adduct_top = "[M+?]1+"
             adduct_charge_top = 1
             adduct_mass_shift_top = 1.007825  # H mass (gain of proton)
-
+
         # Create a single default adduct entry in the adducts list for consistency
         consensus_adduct_values = [[adduct_top, 1, 100.0]]
-
+
     # Calculate neutral mass from consensus mz (for both cases)
-    consensus_mz =
+    consensus_mz = (
+        round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
+    )
     if adduct_charge_top and adduct_mass_shift_top is not None:
-        adduct_mass_neutral_top =
+        adduct_mass_neutral_top = (
+            consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
+        )

     # Calculate number of MS2 spectra
     ms2_count = 0
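The neutral-mass relation used above is adduct_mass_neutral_top = consensus_mz * |charge| - mass_shift. A worked example with assumed numbers (not taken from the diff):

    consensus_mz = 301.1410            # mean consensus m/z (assumed)
    adduct_charge_top = 1              # e.g. an [M+H]+ style adduct
    adduct_mass_shift_top = 1.007825   # proton mass shift
    adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
    # 301.1410 * 1 - 1.007825 = 300.133175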
@@ -654,48 +742,95 @@ def merge(self, **kwargs):
         if ms2_scans is not None:
             ms2_count += len(ms2_scans)

-    metadata_list.append(
+    metadata_list.append(
+        {
+            "consensus_uid": int(i),  # "consensus_id": i,
+            "consensus_id": str(feature.getUniqueId()),
+            "quality": round(float(feature.getQuality()), 3),
+            "number_samples": len(feature_data_list),
+            # "number_ext": int(len(features_list)),
+            "rt": round(float(np.mean(rt_values)), 4)
+            if len(rt_values) > 0
+            else 0.0,
+            "mz": round(float(np.mean(mz_values)), 4)
+            if len(mz_values) > 0
+            else 0.0,
+            "rt_min": round(float(np.min(rt_values)), 3)
+            if len(rt_values) > 0
+            else 0.0,
+            "rt_max": round(float(np.max(rt_values)), 3)
+            if len(rt_values) > 0
+            else 0.0,
+            "rt_mean": round(float(np.mean(rt_values)), 3)
+            if len(rt_values) > 0
+            else 0.0,
+            "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
+            if len(rt_start_values) > 0
+            else 0.0,
+            "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
+            if len(rt_end_values) > 0
+            else 0.0,
+            "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
+            if len(rt_delta_values) > 0
+            else 0.0,
+            "mz_min": round(float(np.min(mz_values)), 4)
+            if len(mz_values) > 0
+            else 0.0,
+            "mz_max": round(float(np.max(mz_values)), 4)
+            if len(mz_values) > 0
+            else 0.0,
+            "mz_mean": round(float(np.mean(mz_values)), 4)
+            if len(mz_values) > 0
+            else 0.0,
+            "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
+            if len(mz_start_values) > 0
+            else 0.0,
+            "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
+            if len(mz_end_values) > 0
+            else 0.0,
+            "inty_mean": round(float(np.mean(inty_values)), 0)
+            if len(inty_values) > 0
+            else 0.0,
+            "bl": -1.0,
+            "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
+            if len(coherence_values) > 0
+            else 0.0,
+            "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0)
+            if len(prominence_values) > 0
+            else 0.0,
+            "chrom_prominence_scaled_mean": round(
+                float(np.mean(prominence_scaled_values)),
+                3,
+            )
+            if len(prominence_scaled_values) > 0
+            else 0.0,
+            "chrom_height_scaled_mean": round(
+                float(np.mean(height_scaled_values)),
+                3,
+            )
+            if len(height_scaled_values) > 0
+            else 0.0,
+            "iso_mean": round(float(np.mean(iso_values)), 2)
+            if len(iso_values) > 0
+            else 0.0,
+            "charge_mean": round(float(np.mean(charge_values)), 2)
+            if len(charge_values) > 0
+            else 0.0,
+            "number_ms2": int(ms2_count),
+            "adducts": consensus_adduct_values
+            if consensus_adduct_values
+            else [],  # Ensure it's always a list
+            # New columns for top-ranked adduct information
+            "adduct_top": adduct_top,
+            "adduct_charge_top": adduct_charge_top,
+            "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
+            if adduct_mass_neutral_top is not None
+            else None,
+            "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
+            if adduct_mass_shift_top is not None
+            else None,
+        },
+    )

     consensus_mapping_df = pl.DataFrame(consensus_mapping)
     # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
@@ -736,72 +871,74 @@ def merge(self, **kwargs):
     )

     self.consensus_map = consensus_map
-
+
     # Add adduct grouping and adduct_of assignment
     if len(self.consensus_df) > 0:
         # Get rt_tol and mz_tol from kwargs or use defaults from merge_defaults
         adduct_rt_tol = rt_tol  # Use the same rt_tol from merge parameters
         adduct_mz_tol = mz_tol  # Use the same mz_tol from merge parameters
-
+
         # Initialize new columns
         adduct_group_list = []
         adduct_of_list = []
-
+
         # Get relevant columns for grouping
         consensus_data = []
         for row in self.consensus_df.iter_rows(named=True):
-            consensus_data.append(
+            consensus_data.append(
+                {
+                    "consensus_uid": row["consensus_uid"],
+                    "rt": row["rt"],
+                    "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
+                    "adduct_top": row.get("adduct_top"),
+                    "inty_mean": row.get("inty_mean", 0),
+                },
+            )
+
         # Group features with similar neutral mass and RT
         group_id = 1
         assigned_groups = {}  # consensus_uid -> group_id
         groups = {}  # group_id -> [consensus_uids]
-
+
         for i, feature in enumerate(consensus_data):
             consensus_uid = feature["consensus_uid"]
-
+
             if consensus_uid in assigned_groups:
                 continue
-
+
             neutral_mass = feature["adduct_mass_neutral_top"]
             rt = feature["rt"]
-
+
             # Skip if neutral mass is None
             if neutral_mass is None:
                 assigned_groups[consensus_uid] = 0  # No group assignment
                 continue
-
+
             # Find all features that could belong to the same group
             group_members = [consensus_uid]
-
+
             for j, other_feature in enumerate(consensus_data):
                 if i == j:
                     continue
-
+
                 other_uid = other_feature["consensus_uid"]
                 if other_uid in assigned_groups:
                     continue
-
+
                 other_neutral_mass = other_feature["adduct_mass_neutral_top"]
                 other_rt = other_feature["rt"]
-
+
                 if other_neutral_mass is None:
                     continue
-
+
                 # Check if features have similar neutral mass and RT
                 mass_diff = abs(neutral_mass - other_neutral_mass)
                 rt_diff = abs(rt - other_rt) / 60.0  # Convert to minutes for rt_tol
-
+
                 if mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol:
                     group_members.append(other_uid)
                     assigned_groups[other_uid] = group_id
-
+
             if len(group_members) > 1:
                 # Multiple members - create a group
                 for member_uid in group_members:
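In the grouping loop above, two consensus features are pooled into one adduct group when their neutral masses agree within adduct_mz_tol (Da) and their RT difference, converted from seconds to minutes, stays within adduct_rt_tol. A compact sketch with assumed tolerances and values:

    adduct_mz_tol = 0.01   # Da (assumed)
    adduct_rt_tol = 0.5    # minutes (assumed)
    neutral_mass, rt = 300.1332, 125.0                  # feature A, RT in seconds
    other_neutral_mass, other_rt = 300.1339, 131.0      # feature B
    mass_diff = abs(neutral_mass - other_neutral_mass)  # ~0.0007 Da
    rt_diff = abs(rt - other_rt) / 60.0                 # 0.1 min
    same_group = mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol
    # True: the two features would share an adduct_group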
@@ -813,26 +950,29 @@ def merge(self, **kwargs):
                     assigned_groups[consensus_uid] = group_id
                     groups[group_id] = [consensus_uid]
                     group_id += 1
-
+
         # Determine adduct_of for each group
         group_adduct_of = {}  # group_id -> consensus_uid of most important adduct
-
+
         for grp_id, member_uids in groups.items():
             # Find the most important adduct in this group
             # Priority: [M+H]+ > [M-H]- > highest intensity
             best_uid = None
             best_priority = -1
             best_intensity = 0
-
+
             for uid in member_uids:
                 # Find the feature data
-                feature_data = next(
+                feature_data = next(
+                    (f for f in consensus_data if f["consensus_uid"] == uid),
+                    None,
+                )
                 if not feature_data:
                     continue
-
+
                 adduct = feature_data.get("adduct_top", "")
                 intensity = feature_data.get("inty_mean", 0)
-
+
                 priority = 0
                 if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
                     priority = 3  # Highest priority for [M+H]+ or H
@@ -840,34 +980,41 @@ def merge(self, **kwargs):
                     priority = 2  # Second priority for [M-H]-
                 elif adduct and "M" in adduct:
                     priority = 1  # Third priority for other molecular adducts
-
+
                 # Choose based on priority first, then intensity
-                if
+                if priority > best_priority or (
+                    priority == best_priority and intensity > best_intensity
+                ):
                     best_uid = uid
                     best_priority = priority
                     best_intensity = intensity
-
+
             group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
+
         # Build the final lists in the same order as consensus_df
         for row in self.consensus_df.iter_rows(named=True):
             consensus_uid = row["consensus_uid"]
             group = assigned_groups.get(consensus_uid, 0)
             adduct_of = group_adduct_of.get(group, consensus_uid)
-
+
             adduct_group_list.append(group)
             adduct_of_list.append(adduct_of)
-
+
         # Add the new columns to consensus_df
-        self.consensus_df = self.consensus_df.with_columns(
+        self.consensus_df = self.consensus_df.with_columns(
+            [
+                pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
+                pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
+            ],
+        )
+
     # calculate the completeness of the consensus map
     if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
-        c =
+        c = (
+            len(self.consensus_mapping_df)
+            / len(self.consensus_df)
+            / len(self.samples_df)
+        )
         self.logger.info(
             f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
         )
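Completeness as computed above is the filled fraction of the consensus-by-sample grid: mapped feature rows divided by (number of consensus features × number of samples). With assumed counts:

    n_mapping_rows = 1800   # len(consensus_mapping_df), assumed
    n_consensus = 500       # len(consensus_df), assumed
    n_samples = 4           # len(samples_df), assumed
    c = n_mapping_rows / n_consensus / n_samples
    # 1800 / 500 / 4 = 0.90, i.e. 90% of (consensus feature, sample) cells have a mapped feature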
@@ -938,7 +1085,9 @@ def find_ms2(self, **kwargs):
     ]
     for row in feats.iter_rows(named=True):
         feature_uid = row["feature_uid"]
-        feature_lookup[feature_uid] = {
+        feature_lookup[feature_uid] = {
+            col: row[col] for col in relevant_cols if col in feats.columns
+        }
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

     # Process consensus mapping in batch
@@ -960,20 +1109,26 @@ def find_ms2(self, **kwargs):
         for j in range(len(ms2_specs)):
             spec = ms2_specs[j]
             scanid = ms2_scans[j]
-            data.append(
+            data.append(
+                {
+                    "consensus_uid": int(mapping_row["consensus_uid"]),
+                    "feature_uid": int(mapping_row["feature_uid"]),
+                    "sample_uid": int(mapping_row["sample_uid"]),
+                    "scan_id": int(scanid),
+                    "energy": round(spec.energy, 1)
+                    if hasattr(spec, "energy") and spec.energy is not None
+                    else None,
+                    "prec_inty": round(inty, 0) if inty is not None else None,
+                    "prec_coherence": round(chrom_coherence, 3)
+                    if chrom_coherence is not None
+                    else None,
+                    "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
+                    if chrom_prominence_scaled is not None
+                    else None,
+                    "number_frags": len(spec.mz),
+                    "spec": spec,
+                },
+            )
     self.consensus_ms2 = pl.DataFrame(data)
     if not self.consensus_ms2.is_empty():
         unique_consensus_features = self.consensus_ms2["consensus_uid"].n_unique()
@@ -1006,7 +1161,10 @@ def filter_consensus(
     else:
         if isinstance(coherence, tuple) and len(coherence) == 2:
             min_coherence, max_coherence = coherence
-            cons = cons[
+            cons = cons[
+                (cons["chrom_coherence"] >= min_coherence)
+                & (cons["chrom_coherence"] <= max_coherence)
+            ]
         else:
             cons = cons[cons["chrom_coherence"] >= coherence]
     after_coherence = len(cons)
@@ -1017,7 +1175,9 @@ def filter_consensus(
     if quality is not None:
         if isinstance(quality, tuple) and len(quality) == 2:
             min_quality, max_quality = quality
-            cons = cons[
+            cons = cons[
+                (cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)
+            ]
         else:
             cons = cons[cons["quality"] >= quality]
     after_quality = len(cons)
@@ -1028,7 +1188,10 @@ def filter_consensus(
     if number_samples is not None:
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_number, max_number = number_samples
-            cons = cons[
+            cons = cons[
+                (cons["number_samples"] >= min_number)
+                & (cons["number_samples"] <= max_number)
+            ]
         else:
             cons = cons[cons["number_samples"] >= number_samples]
     after_number_samples = len(cons)
@@ -1105,11 +1268,13 @@ def _integrate_chrom_impl(self, **kwargs):

     # Merge consensus_mapping with consensus_df to get rt_start_mean and rt_end_mean
     # Use Polars join operation instead of pandas merge
-    consensus_subset = self.consensus_df.select(
+    consensus_subset = self.consensus_df.select(
+        [
+            "consensus_uid",
+            "rt_start_mean",
+            "rt_end_mean",
+        ],
+    )
     df1 = self.consensus_mapping_df.join(
         consensus_subset,
         on="consensus_uid",
@@ -1154,9 +1319,9 @@ def _integrate_chrom_impl(self, **kwargs):
         if chrom is None or len(chrom) == 0:
             update_rows.append(row_idx)
             chroms.append(None)
-            rt_starts.append(float(
-            rt_ends.append(float(
-            rt_deltas.append(float(
+            rt_starts.append(float("nan"))
+            rt_ends.append(float("nan"))
+            rt_deltas.append(float("nan"))
             chrom_areas.append(-1.0)
             continue
         ## TODO expose parameters
@@ -1186,9 +1351,13 @@ def _integrate_chrom_impl(self, **kwargs):
     if update_rows:
         # Create mapping from row index to new values
         row_to_chrom = {update_rows[i]: chroms[i] for i in range(len(update_rows))}
-        row_to_rt_start = {
+        row_to_rt_start = {
+            update_rows[i]: rt_starts[i] for i in range(len(update_rows))
+        }
         row_to_rt_end = {update_rows[i]: rt_ends[i] for i in range(len(update_rows))}
-        row_to_rt_delta = {
+        row_to_rt_delta = {
+            update_rows[i]: rt_deltas[i] for i in range(len(update_rows))
+        }
         row_to_chrom_area = {
             update_rows[i]: float(chrom_areas[i]) if chrom_areas[i] is not None else 0.0
             for i in range(len(update_rows))
@@ -1202,58 +1371,60 @@ def _integrate_chrom_impl(self, **kwargs):

     # Update columns conditionally
     try:
-        self.features_df = df_with_index.with_columns(
+        self.features_df = df_with_index.with_columns(
+            [
+                # Update chrom column - use when() to update only specific rows
+                pl.when(update_mask)
+                .then(
+                    pl.col("__row_idx").map_elements(
+                        lambda x: row_to_chrom.get(x, None),
+                        return_dtype=pl.Object,
+                    ),
+                )
+                .otherwise(pl.col("chrom"))
+                .alias("chrom"),
+                # Update rt_start column
+                pl.when(update_mask)
+                .then(
+                    pl.col("__row_idx").map_elements(
+                        lambda x: row_to_rt_start.get(x, None),
+                        return_dtype=pl.Float64,
+                    ),
+                )
+                .otherwise(pl.col("rt_start"))
+                .alias("rt_start"),
+                # Update rt_end column
+                pl.when(update_mask)
+                .then(
+                    pl.col("__row_idx").map_elements(
+                        lambda x: row_to_rt_end.get(x, None),
+                        return_dtype=pl.Float64,
+                    ),
+                )
+                .otherwise(pl.col("rt_end"))
+                .alias("rt_end"),
+                # Update rt_delta column
+                pl.when(update_mask)
+                .then(
+                    pl.col("__row_idx").map_elements(
+                        lambda x: row_to_rt_delta.get(x, None),
+                        return_dtype=pl.Float64,
+                    ),
+                )
+                .otherwise(pl.col("rt_delta"))
+                .alias("rt_delta"),
+                # Update chrom_area column
+                pl.when(update_mask)
+                .then(
+                    pl.col("__row_idx").map_elements(
+                        lambda x: row_to_chrom_area.get(x, 0),
+                        return_dtype=pl.Float64,
+                    ),
+                )
+                .otherwise(pl.col("chrom_area"))
+                .alias("chrom_area"),
+            ],
+        ).drop("__row_idx")  # Remove the temporary row index column

         self.logger.debug(
             f"Integration completed. Updated {len(update_rows)} features with chromatogram data.",
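The block above repeats one Polars pattern per column: mask the rows to patch, map the temporary __row_idx through a per-row lookup, and keep the old value elsewhere. A minimal, self-contained sketch of that pattern on toy data (column names and values here are illustrative only):

    import polars as pl

    df = pl.DataFrame({"__row_idx": [0, 1, 2], "rt_start": [10.0, 20.0, 30.0]})
    row_to_rt_start = {1: 21.5}                      # rows to patch (assumed)
    update_mask = pl.col("__row_idx").is_in(list(row_to_rt_start))
    df = df.with_columns(
        pl.when(update_mask)
        .then(
            pl.col("__row_idx").map_elements(
                lambda x: row_to_rt_start.get(x, None),
                return_dtype=pl.Float64,
            ),
        )
        .otherwise(pl.col("rt_start"))
        .alias("rt_start"),
    )
    # rt_start becomes [10.0, 21.5, 30.0]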
@@ -1344,9 +1515,18 @@ def _align_pose_clustering(study_obj, fmaps, params):
     params_oms.setValue("pairfinder:ignore_charge", "true")
     params_oms.setValue("max_num_peaks_considered", 1000)
     params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_tol"))
-    params_oms.setValue(
+    params_oms.setValue(
+        "pairfinder:distance_MZ:max_difference",
+        params.get("mz_max_diff"),
+    )
+    params_oms.setValue(
+        "superimposer:rt_pair_distance_fraction",
+        params.get("rt_pair_distance_frac"),
+    )
+    params_oms.setValue(
+        "superimposer:mz_pair_max_distance",
+        params.get("mz_pair_max_distance"),
+    )
     params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
     params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
     params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
@@ -1355,7 +1535,9 @@ def _align_pose_clustering(study_obj, fmaps, params):
     study_obj.logger.info("Starting alignment with PoseClustering")

     # Set ref_index to feature map index with largest number of features
-    ref_index = [
+    ref_index = [
+        i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
+    ][-1]
     study_obj.logger.debug(
         f"Reference map is {study_obj.samples_df.row(ref_index, named=True)['sample_name']}",
     )
@@ -1374,7 +1556,10 @@ def _align_pose_clustering(study_obj, fmaps, params):
     ):
         if index == ref_index:
             continue
-        if
+        if (
+            params.get("skip_blanks")
+            and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank"
+        ):
             continue
         trafo = oms.TransformationDescription()
         aligner.align(fm, trafo)
@@ -1393,19 +1578,28 @@ def _align_kd_algorithm(study_obj, fmaps, params):

     # Pull parameter values - map standard align params to our algorithm
     # Use rt_tol (standard align param) instead of warp_rt_tol for RT tolerance
-    rt_pair_tol =
+    rt_pair_tol = (
+        float(params.get("rt_tol")) if params.get("rt_tol") is not None else 2.0
+    )
     # Use mz_max_diff (standard align param) converted to ppm
-    mz_max_diff_da =
+    mz_max_diff_da = (
+        float(params.get("mz_max_diff"))
+        if params.get("mz_max_diff") is not None
+        else 0.02
+    )
     # Convert Da to ppm (assuming ~400 m/z average for metabolomics): 0.01 Da / 400 * 1e6 = 25 ppm
     ppm_tol = mz_max_diff_da / 400.0 * 1e6
     # Allow override with warp_mz_tol if specifically set (but not from defaults)
     try:
         warp_mz_from_params = params.get("warp_mz_tol")
-        if
+        if (
+            warp_mz_from_params is not None
+            and warp_mz_from_params != params.__class__().warp_mz_tol
+        ):
             ppm_tol = float(warp_mz_from_params)
     except (KeyError, AttributeError):
         pass
-
+
     # Safely retrieve optional parameter max_anchor_points (not yet part of defaults)
     try:
         _raw_mp = params.get("max_anchor_points")
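The Da-to-ppm conversion above assumes a representative m/z of about 400, since ppm_tol = mz_max_diff_da / 400 * 1e6. A quick check of the arithmetic with the default of 0.02 Da (the 0.01 Da figure in the inline comment corresponds to 25 ppm):

    mz_max_diff_da = 0.02                     # default from the new code
    ppm_tol = mz_max_diff_da / 400.0 * 1e6    # 50.0 ppm
    # and 0.01 Da / 400 * 1e6 = 25 ppm, matching the inline comment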
@@ -1413,7 +1607,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
         _raw_mp = None
     max_points = int(_raw_mp) if _raw_mp is not None else 1000
     study_obj.logger.info(
-        f"Align time axes with rt_tol={params.get('rt_tol')}, min_samples={params.get('min_samples')}, max_points={max_points}"
+        f"Align time axes with rt_tol={params.get('rt_tol')}, min_samples={params.get('min_samples')}, max_points={max_points}",
     )

     # Choose reference map (largest number of features)
@@ -1421,7 +1615,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
     ref_map = fmaps[ref_index]
     study_obj.alignment_ref_index = ref_index
     study_obj.logger.debug(
-        f"Reference map index {ref_index} (sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) size={ref_map.size()}"
+        f"Reference map index {ref_index} (sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) size={ref_map.size()}",
     )

     # Extract and sort reference features by m/z for binary search
@@ -1445,7 +1639,10 @@ def _align_kd_algorithm(study_obj, fmaps, params):
             best_drt = drt
         return best

-    def _set_pairs(
+    def _set_pairs(
+        td_obj: oms.TransformationDescription,
+        pairs,
+    ):  # Helper for pyopenms API variability
         # Always provide list of lists to satisfy strict type expectations
         conv = [[float(a), float(b)] for a, b in pairs]
         try:
@@ -1527,7 +1724,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
                 td.fitModel(model, oms.Param())
             except Exception as e:
                 study_obj.logger.debug(
-                    f"Map {i}: {model} fitting failed ({e}); fallback to linear two-point shift"
+                    f"Map {i}: {model} fitting failed ({e}); fallback to linear two-point shift",
                 )
                 rts = [f.getRT() for f in fmap]
                 lo, hi = (min(rts), max(rts)) if rts else (0.0, 1.0)
@@ -1539,7 +1736,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
                 pass

         study_obj.logger.debug(
-            f"Map {i}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s"
+            f"Map {i}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s",
         )
         transformations.append(td)

@@ -1557,7 +1754,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
             study_obj.logger.warning(f"Map {i}: failed applying transformation ({e})")

     study_obj.logger.info(
-        f"Custom KD alignment completed. Reference index {ref_index}. Applied {len(transformations)} transformations."
+        f"Custom KD alignment completed. Reference index {ref_index}. Applied {len(transformations)} transformations.",
     )

@@ -1566,13 +1763,18 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
     import pyopenms as oms

     aligner = oms.MapAlignmentAlgorithmPoseClustering()
-    ref_index = [
+    ref_index = [
+        i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
+    ][-1]

     # Set up basic parameters for pose clustering
     pc_params = oms.Param()
     pc_params.setValue("max_num_peaks_considered", 1000)
     pc_params.setValue("pairfinder:distance_RT:max_difference", params.get("rt_tol"))
-    pc_params.setValue(
+    pc_params.setValue(
+        "pairfinder:distance_MZ:max_difference",
+        params.get("mz_max_diff"),
+    )

     aligner.setParameters(pc_params)
     aligner.setReference(fmaps[ref_index])