masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -8,7 +8,7 @@ import pyopenms as oms
 
  from tqdm import tqdm
 
- from master.study.defaults import (
+ from masster.study.defaults import (
  align_defaults,
  find_ms2_defaults,
  integrate_defaults,
@@ -115,8 +115,7 @@ def align(self, **kwargs):
  # Pre-build sample_uid lookup for faster access
  self.logger.debug("Build sample_uid lookup for fast access...")
  sample_uid_lookup = {
- idx: row_dict["sample_uid"]
- for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
+ idx: row_dict["sample_uid"] for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
  }
 
  # Build the main lookup using feature_uid (not feature_id)
@@ -216,7 +215,7 @@ def align(self, **kwargs):
  self.features_df = self.features_df.with_columns(*new_cols)
 
  self.logger.debug("Alignment completed successfully.")
-
+
  # Reset consensus data structures after alignment since RT changes invalidate consensus
  consensus_reset_count = 0
  if not self.consensus_df.is_empty():
@@ -228,7 +227,7 @@ def align(self, **kwargs):
  if not self.consensus_ms2.is_empty():
  self.consensus_ms2 = pl.DataFrame()
  consensus_reset_count += 1
-
+
  # Remove merge and find_ms2 parameters from history since they need to be re-run
  keys_to_remove = ["merge", "find_ms2"]
  history_removed_count = 0
@@ -238,11 +237,9 @@ def align(self, **kwargs):
  del self.history[key]
  history_removed_count += 1
  self.logger.debug(f"Removed {key} from history")
-
+
  if consensus_reset_count > 0 or history_removed_count > 0:
- self.logger.info(
- f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed",
- )
+ self.logger.info(f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed")
 
  if params.get("save_features"):
  self.save_samples()
@@ -293,10 +290,7 @@ def merge(self, **kwargs):
  algorithm = params.get("algorithm")
  min_samples = params.get("min_samples")
  link_ms2 = params.get("link_ms2")
- mz_tol = kwargs.get(
- "mz_tol",
- 0.01,
- ) # Default values for parameters not in defaults class
+ mz_tol = kwargs.get("mz_tol", 0.01) # Default values for parameters not in defaults class
  rt_tol = kwargs.get("rt_tol", 1.0)
 
  if len(self.samples_df) > 200 and algorithm == "qt":
@@ -405,10 +399,7 @@ def merge(self, **kwargs):
  consensus_map.setUniqueIds()
 
  # create a dict to map uid to feature_uid using self.features_df
- feature_uid_map = {
- row["feature_id"]: row["feature_uid"]
- for row in self.features_df.iter_rows(named=True)
- }
+ feature_uid_map = {row["feature_id"]: row["feature_uid"] for row in self.features_df.iter_rows(named=True)}
  imax = consensus_map.size()
 
  # Pre-build fast lookup tables for features_df data
@@ -435,9 +426,7 @@ def merge(self, **kwargs):
 
  for row in self.features_df.iter_rows(named=True):
  feature_uid = row["feature_uid"]
- features_lookup[feature_uid] = {
- col: row[col] for col in feature_columns if col in self.features_df.columns
- }
+ features_lookup[feature_uid] = {col: row[col] for col in feature_columns if col in self.features_df.columns}
 
  # create a list to store the consensus mapping
  consensus_mapping = []
@@ -464,13 +453,11 @@ def merge(self, **kwargs):
  # this is a feature that was removed but is still in the feature maps
  continue
  fuid = feature_uid_map[fuid]
- consensus_mapping.append(
- {
- "consensus_uid": i,
- "sample_uid": f.getMapIndex() + 1,
- "feature_uid": fuid,
- },
- )
+ consensus_mapping.append({
+ "consensus_uid": i,
+ "sample_uid": f.getMapIndex() + 1,
+ "feature_uid": fuid,
+ })
  uids.append(fuid)
 
  # Get feature data from lookup instead of DataFrame filtering
@@ -484,99 +471,43 @@ def merge(self, **kwargs):
 
  # Compute statistics using vectorized operations on collected data
  # Convert to numpy arrays for faster computation
- rt_values = np.array(
- [fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None],
- )
- mz_values = np.array(
- [fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
- )
- rt_start_values = np.array(
- [
- fd.get("rt_start", 0)
- for fd in feature_data_list
- if fd.get("rt_start") is not None
- ],
- )
- rt_end_values = np.array(
- [
- fd.get("rt_end", 0)
- for fd in feature_data_list
- if fd.get("rt_end") is not None
- ],
- )
- rt_delta_values = np.array(
- [
- fd.get("rt_delta", 0)
- for fd in feature_data_list
- if fd.get("rt_delta") is not None
- ],
- )
- mz_start_values = np.array(
- [
- fd.get("mz_start", 0)
- for fd in feature_data_list
- if fd.get("mz_start") is not None
- ],
- )
- mz_end_values = np.array(
- [
- fd.get("mz_end", 0)
- for fd in feature_data_list
- if fd.get("mz_end") is not None
- ],
- )
- inty_values = np.array(
- [
- fd.get("inty", 0)
- for fd in feature_data_list
- if fd.get("inty") is not None
- ],
- )
- coherence_values = np.array(
- [
- fd.get("chrom_coherence", 0)
- for fd in feature_data_list
- if fd.get("chrom_coherence") is not None
- ],
- )
- prominence_values = np.array(
- [
- fd.get("chrom_prominence", 0)
- for fd in feature_data_list
- if fd.get("chrom_prominence") is not None
- ],
- )
- prominence_scaled_values = np.array(
- [
- fd.get("chrom_prominence_scaled", 0)
- for fd in feature_data_list
- if fd.get("chrom_prominence_scaled") is not None
- ],
- )
- height_scaled_values = np.array(
- [
- fd.get("chrom_height_scaled", 0)
- for fd in feature_data_list
- if fd.get("chrom_height_scaled") is not None
- ],
- )
- iso_values = np.array(
- [fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
- )
- charge_values = np.array(
- [
- fd.get("charge", 0)
- for fd in feature_data_list
- if fd.get("charge") is not None
- ],
- )
+ rt_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
+ mz_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
+ rt_start_values = np.array([
+ fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None
+ ])
+ rt_end_values = np.array([fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None])
+ rt_delta_values = np.array([
+ fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None
+ ])
+ mz_start_values = np.array([
+ fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None
+ ])
+ mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
+ inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
+ coherence_values = np.array([
+ fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None
+ ])
+ prominence_values = np.array([
+ fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None
+ ])
+ prominence_scaled_values = np.array([
+ fd.get("chrom_prominence_scaled", 0)
+ for fd in feature_data_list
+ if fd.get("chrom_prominence_scaled") is not None
+ ])
+ height_scaled_values = np.array([
+ fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None
+ ])
+ iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
+ charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])
 
  # adduct_values
  # Collect all adducts from feature_data_list to create consensus adduct information
  # Only consider adducts that are in study._get_adducts() plus items with '?'
  all_adducts = []
  adduct_masses = {}
-
+
  # Get valid adducts from study._get_adducts()
  valid_adducts = set()
  try:
@@ -585,7 +516,7 @@ def merge(self, **kwargs):
  valid_adducts.update(study_adducts_df["name"].to_list())
  except Exception as e:
  self.logger.warning(f"Could not retrieve study adducts: {e}")
-
+
  # Always allow '?' adducts
  valid_adducts.add("?")
 
@@ -596,7 +527,7 @@ def merge(self, **kwargs):
 
  if adduct is not None:
  # Only include adducts that are valid (from study._get_adducts() or contain '?')
- if adduct in valid_adducts or "?" in adduct:
+ if adduct in valid_adducts or '?' in adduct:
  all_adducts.append(adduct)
  if adduct_mass is not None:
  adduct_masses[adduct] = adduct_mass
@@ -604,37 +535,33 @@ def merge(self, **kwargs):
  # Calculate adduct_values for the consensus feature
  adduct_values = []
  if all_adducts:
- adduct_counts = {
- adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
- }
+ adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
  total_count = sum(adduct_counts.values())
  for adduct, count in adduct_counts.items():
  percentage = (count / total_count) * 100 if total_count > 0 else 0
  mass = adduct_masses.get(adduct, None)
  # Store as list with [name, num, %] format for the adducts column
- adduct_values.append(
- [
- str(adduct),
- int(count),
- float(round(percentage, 2)),
- ],
- )
+ adduct_values.append([
+ str(adduct),
+ int(count),
+ float(round(percentage, 2))
+ ])
 
  # Sort adduct_values by count in descending order
  adduct_values.sort(key=lambda x: x[1], reverse=True) # Sort by count (index 1)
  # Store adduct_values for use in metadata
  consensus_adduct_values = adduct_values
-
+
  # Extract top adduct information for new columns
  adduct_top = None
  adduct_charge_top = None
  adduct_mass_neutral_top = None
  adduct_mass_shift_top = None
-
+
  if consensus_adduct_values:
  top_adduct_name = consensus_adduct_values[0][0] # Get top adduct name
  adduct_top = top_adduct_name
-
+
  # Parse adduct information to extract charge and mass shift
  # Handle "?" as "H" and parse common adduct formats
  if top_adduct_name == "?" or top_adduct_name == "[M+?]+":
@@ -650,37 +577,33 @@ def merge(self, **kwargs):
  study_adducts_df = self._get_adducts()
  if not study_adducts_df.is_empty():
  # Look for exact match in study adducts
- matching_adduct = study_adducts_df.filter(
- pl.col("name") == top_adduct_name,
- )
+ matching_adduct = study_adducts_df.filter(pl.col("name") == top_adduct_name)
  if not matching_adduct.is_empty():
  adduct_row = matching_adduct.row(0, named=True)
  adduct_charge_top = adduct_row["charge"]
  adduct_mass_shift_top = adduct_row["mass_shift"]
  adduct_found = True
  except Exception as e:
- self.logger.warning(
- f"Could not lookup adduct in study adducts: {e}",
- )
-
+ self.logger.warning(f"Could not lookup adduct in study adducts: {e}")
+
  if not adduct_found:
  # Fallback to regex parsing
  import re
-
+
  # Pattern for adducts like [M+H]+, [M-H]-, [M+Na]+, etc.
- pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
+ pattern = r'\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])'
  match = re.match(pattern, top_adduct_name)
-
+
  if match:
  sign = match.group(1)
  element = match.group(2)
  multiplier_str = match.group(3)
  charge_sign = match.group(4)
-
+
  multiplier = int(multiplier_str) if multiplier_str else 1
  charge = multiplier if charge_sign == "+" else -multiplier
  adduct_charge_top = charge
-
+
  # Calculate mass shift based on element
  element_masses = {
  "H": 1.007825,
@@ -694,16 +617,9 @@ def merge(self, **kwargs):
  "CH3COO": 59.013851,
  "H2O": 18.010565,
  }
-
- base_mass = element_masses.get(
- element,
- 1.007825,
- ) # Default to H if unknown
- mass_shift = (
- base_mass * multiplier
- if sign == "+"
- else -base_mass * multiplier
- )
+
+ base_mass = element_masses.get(element, 1.007825) # Default to H if unknown
+ mass_shift = base_mass * multiplier if sign == "+" else -base_mass * multiplier
  adduct_mass_shift_top = mass_shift
  else:
  # Default fallback
@@ -711,8 +627,8 @@ def merge(self, **kwargs):
  adduct_mass_shift_top = 1.007825
  else:
  # No valid adducts found - assign default based on study polarity
- study_polarity = getattr(self, "polarity", "positive")
- if study_polarity in ["negative", "neg"]:
+ study_polarity = getattr(self, 'polarity', 'positive')
+ if study_polarity in ['negative', 'neg']:
  # Negative mode default
  adduct_top = "[M-?]1-"
  adduct_charge_top = -1
@@ -722,18 +638,14 @@ def merge(self, **kwargs):
  adduct_top = "[M+?]1+"
  adduct_charge_top = 1
  adduct_mass_shift_top = 1.007825 # H mass (gain of proton)
-
+
  # Create a single default adduct entry in the adducts list for consistency
  consensus_adduct_values = [[adduct_top, 1, 100.0]]
-
+
  # Calculate neutral mass from consensus mz (for both cases)
- consensus_mz = (
- round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
- )
+ consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
  if adduct_charge_top and adduct_mass_shift_top is not None:
- adduct_mass_neutral_top = (
- consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
- )
+ adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
 
  # Calculate number of MS2 spectra
  ms2_count = 0
@@ -742,95 +654,48 @@ def merge(self, **kwargs):
  if ms2_scans is not None:
  ms2_count += len(ms2_scans)
 
- metadata_list.append(
- {
- "consensus_uid": int(i), # "consensus_id": i,
- "consensus_id": str(feature.getUniqueId()),
- "quality": round(float(feature.getQuality()), 3),
- "number_samples": len(feature_data_list),
- # "number_ext": int(len(features_list)),
- "rt": round(float(np.mean(rt_values)), 4)
- if len(rt_values) > 0
- else 0.0,
- "mz": round(float(np.mean(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "rt_min": round(float(np.min(rt_values)), 3)
- if len(rt_values) > 0
- else 0.0,
- "rt_max": round(float(np.max(rt_values)), 3)
- if len(rt_values) > 0
- else 0.0,
- "rt_mean": round(float(np.mean(rt_values)), 3)
- if len(rt_values) > 0
- else 0.0,
- "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
- if len(rt_start_values) > 0
- else 0.0,
- "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
- if len(rt_end_values) > 0
- else 0.0,
- "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
- if len(rt_delta_values) > 0
- else 0.0,
- "mz_min": round(float(np.min(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "mz_max": round(float(np.max(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "mz_mean": round(float(np.mean(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
- if len(mz_start_values) > 0
- else 0.0,
- "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
- if len(mz_end_values) > 0
- else 0.0,
- "inty_mean": round(float(np.mean(inty_values)), 0)
- if len(inty_values) > 0
- else 0.0,
- "bl": -1.0,
- "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
- if len(coherence_values) > 0
- else 0.0,
- "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0)
- if len(prominence_values) > 0
- else 0.0,
- "chrom_prominence_scaled_mean": round(
- float(np.mean(prominence_scaled_values)),
- 3,
- )
- if len(prominence_scaled_values) > 0
- else 0.0,
- "chrom_height_scaled_mean": round(
- float(np.mean(height_scaled_values)),
- 3,
- )
- if len(height_scaled_values) > 0
- else 0.0,
- "iso_mean": round(float(np.mean(iso_values)), 2)
- if len(iso_values) > 0
- else 0.0,
- "charge_mean": round(float(np.mean(charge_values)), 2)
- if len(charge_values) > 0
- else 0.0,
- "number_ms2": int(ms2_count),
- "adducts": consensus_adduct_values
- if consensus_adduct_values
- else [], # Ensure it's always a list
- # New columns for top-ranked adduct information
- "adduct_top": adduct_top,
- "adduct_charge_top": adduct_charge_top,
- "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
- if adduct_mass_neutral_top is not None
- else None,
- "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
- if adduct_mass_shift_top is not None
- else None,
- },
- )
+ metadata_list.append({
+ "consensus_uid": int(i), # "consensus_id": i,
+ "consensus_id": str(feature.getUniqueId()),
+ "quality": round(float(feature.getQuality()), 3),
+ "number_samples": len(feature_data_list),
+ # "number_ext": int(len(features_list)),
+ "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
+ "mz": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
+ "rt_min": round(float(np.min(rt_values)), 3) if len(rt_values) > 0 else 0.0,
+ "rt_max": round(float(np.max(rt_values)), 3) if len(rt_values) > 0 else 0.0,
+ "rt_mean": round(float(np.mean(rt_values)), 3) if len(rt_values) > 0 else 0.0,
+ "rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
+ "rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
+ "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
+ "mz_min": round(float(np.min(mz_values)), 4) if len(mz_values) > 0 else 0.0,
+ "mz_max": round(float(np.max(mz_values)), 4) if len(mz_values) > 0 else 0.0,
+ "mz_mean": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
+ "mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
+ "mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
+ "inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
+ "bl": -1.0,
+ "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
+ "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
+ "chrom_prominence_scaled_mean": round(
+ float(np.mean(prominence_scaled_values)),
+ 3,
+ )
+ if len(prominence_scaled_values) > 0
+ else 0.0,
+ "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3)
+ if len(height_scaled_values) > 0
+ else 0.0,
+ "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
+ "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
+ "number_ms2": int(ms2_count),
+ "adducts": consensus_adduct_values if consensus_adduct_values else [], # Ensure it's always a list
+ # New columns for top-ranked adduct information
+ "adduct_top": adduct_top,
+ "adduct_charge_top": adduct_charge_top,
+ "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
+ "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
+ })
 
  consensus_mapping_df = pl.DataFrame(consensus_mapping)
  # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
@@ -871,74 +736,72 @@ def merge(self, **kwargs):
  )
 
  self.consensus_map = consensus_map
-
+
  # Add adduct grouping and adduct_of assignment
  if len(self.consensus_df) > 0:
  # Get rt_tol and mz_tol from kwargs or use defaults from merge_defaults
  adduct_rt_tol = rt_tol # Use the same rt_tol from merge parameters
  adduct_mz_tol = mz_tol # Use the same mz_tol from merge parameters
-
+
  # Initialize new columns
  adduct_group_list = []
  adduct_of_list = []
-
+
  # Get relevant columns for grouping
  consensus_data = []
  for row in self.consensus_df.iter_rows(named=True):
- consensus_data.append(
- {
- "consensus_uid": row["consensus_uid"],
- "rt": row["rt"],
- "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
- "adduct_top": row.get("adduct_top"),
- "inty_mean": row.get("inty_mean", 0),
- },
- )
-
+ consensus_data.append({
+ "consensus_uid": row["consensus_uid"],
+ "rt": row["rt"],
+ "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
+ "adduct_top": row.get("adduct_top"),
+ "inty_mean": row.get("inty_mean", 0),
+ })
+
  # Group features with similar neutral mass and RT
  group_id = 1
  assigned_groups = {} # consensus_uid -> group_id
  groups = {} # group_id -> [consensus_uids]
-
+
  for i, feature in enumerate(consensus_data):
  consensus_uid = feature["consensus_uid"]
-
+
  if consensus_uid in assigned_groups:
  continue
-
+
  neutral_mass = feature["adduct_mass_neutral_top"]
  rt = feature["rt"]
-
+
  # Skip if neutral mass is None
  if neutral_mass is None:
  assigned_groups[consensus_uid] = 0 # No group assignment
  continue
-
+
  # Find all features that could belong to the same group
  group_members = [consensus_uid]
-
+
  for j, other_feature in enumerate(consensus_data):
  if i == j:
  continue
-
+
  other_uid = other_feature["consensus_uid"]
  if other_uid in assigned_groups:
  continue
-
+
  other_neutral_mass = other_feature["adduct_mass_neutral_top"]
  other_rt = other_feature["rt"]
-
+
  if other_neutral_mass is None:
  continue
-
+
  # Check if features have similar neutral mass and RT
  mass_diff = abs(neutral_mass - other_neutral_mass)
  rt_diff = abs(rt - other_rt) / 60.0 # Convert to minutes for rt_tol
-
+
  if mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol:
  group_members.append(other_uid)
  assigned_groups[other_uid] = group_id
-
+
  if len(group_members) > 1:
  # Multiple members - create a group
  for member_uid in group_members:
@@ -950,29 +813,26 @@ def merge(self, **kwargs):
  assigned_groups[consensus_uid] = group_id
  groups[group_id] = [consensus_uid]
  group_id += 1
-
+
  # Determine adduct_of for each group
  group_adduct_of = {} # group_id -> consensus_uid of most important adduct
-
+
  for grp_id, member_uids in groups.items():
  # Find the most important adduct in this group
  # Priority: [M+H]+ > [M-H]- > highest intensity
  best_uid = None
  best_priority = -1
  best_intensity = 0
-
+
  for uid in member_uids:
  # Find the feature data
- feature_data = next(
- (f for f in consensus_data if f["consensus_uid"] == uid),
- None,
- )
+ feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
  if not feature_data:
  continue
-
+
  adduct = feature_data.get("adduct_top", "")
  intensity = feature_data.get("inty_mean", 0)
-
+
  priority = 0
  if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
  priority = 3 # Highest priority for [M+H]+ or H
@@ -980,41 +840,34 @@ def merge(self, **kwargs):
  priority = 2 # Second priority for [M-H]-
  elif adduct and "M" in adduct:
  priority = 1 # Third priority for other molecular adducts
-
+
  # Choose based on priority first, then intensity
- if priority > best_priority or (
- priority == best_priority and intensity > best_intensity
- ):
+ if (priority > best_priority or
+ (priority == best_priority and intensity > best_intensity)):
  best_uid = uid
  best_priority = priority
  best_intensity = intensity
-
+
  group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
+
  # Build the final lists in the same order as consensus_df
  for row in self.consensus_df.iter_rows(named=True):
  consensus_uid = row["consensus_uid"]
  group = assigned_groups.get(consensus_uid, 0)
  adduct_of = group_adduct_of.get(group, consensus_uid)
-
+
  adduct_group_list.append(group)
  adduct_of_list.append(adduct_of)
-
+
  # Add the new columns to consensus_df
- self.consensus_df = self.consensus_df.with_columns(
- [
- pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
- pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
- ],
- )
-
+ self.consensus_df = self.consensus_df.with_columns([
+ pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
+ pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
+ ])
+
  # calculate the completeness of the consensus map
  if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
- c = (
- len(self.consensus_mapping_df)
- / len(self.consensus_df)
- / len(self.samples_df)
- )
+ c = len(self.consensus_mapping_df) / len(self.consensus_df) / len(self.samples_df)
  self.logger.info(
  f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
  )
@@ -1085,9 +938,7 @@ def find_ms2(self, **kwargs):
  ]
  for row in feats.iter_rows(named=True):
  feature_uid = row["feature_uid"]
- feature_lookup[feature_uid] = {
- col: row[col] for col in relevant_cols if col in feats.columns
- }
+ feature_lookup[feature_uid] = {col: row[col] for col in relevant_cols if col in feats.columns}
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
 
  # Process consensus mapping in batch
@@ -1109,26 +960,20 @@ def find_ms2(self, **kwargs):
  for j in range(len(ms2_specs)):
  spec = ms2_specs[j]
  scanid = ms2_scans[j]
- data.append(
- {
- "consensus_uid": int(mapping_row["consensus_uid"]),
- "feature_uid": int(mapping_row["feature_uid"]),
- "sample_uid": int(mapping_row["sample_uid"]),
- "scan_id": int(scanid),
- "energy": round(spec.energy, 1)
- if hasattr(spec, "energy") and spec.energy is not None
- else None,
- "prec_inty": round(inty, 0) if inty is not None else None,
- "prec_coherence": round(chrom_coherence, 3)
- if chrom_coherence is not None
- else None,
- "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
- if chrom_prominence_scaled is not None
- else None,
- "number_frags": len(spec.mz),
- "spec": spec,
- },
- )
+ data.append({
+ "consensus_uid": int(mapping_row["consensus_uid"]),
+ "feature_uid": int(mapping_row["feature_uid"]),
+ "sample_uid": int(mapping_row["sample_uid"]),
+ "scan_id": int(scanid),
+ "energy": round(spec.energy, 1) if hasattr(spec, "energy") and spec.energy is not None else None,
+ "prec_inty": round(inty, 0) if inty is not None else None,
+ "prec_coherence": round(chrom_coherence, 3) if chrom_coherence is not None else None,
+ "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
+ if chrom_prominence_scaled is not None
+ else None,
+ "number_frags": len(spec.mz),
+ "spec": spec,
+ })
  self.consensus_ms2 = pl.DataFrame(data)
  if not self.consensus_ms2.is_empty():
  unique_consensus_features = self.consensus_ms2["consensus_uid"].n_unique()
@@ -1161,10 +1006,7 @@ def filter_consensus(
  else:
  if isinstance(coherence, tuple) and len(coherence) == 2:
  min_coherence, max_coherence = coherence
- cons = cons[
- (cons["chrom_coherence"] >= min_coherence)
- & (cons["chrom_coherence"] <= max_coherence)
- ]
+ cons = cons[(cons["chrom_coherence"] >= min_coherence) & (cons["chrom_coherence"] <= max_coherence)]
  else:
  cons = cons[cons["chrom_coherence"] >= coherence]
  after_coherence = len(cons)
@@ -1175,9 +1017,7 @@ def filter_consensus(
  if quality is not None:
  if isinstance(quality, tuple) and len(quality) == 2:
  min_quality, max_quality = quality
- cons = cons[
- (cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)
- ]
+ cons = cons[(cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)]
  else:
  cons = cons[cons["quality"] >= quality]
  after_quality = len(cons)
@@ -1188,10 +1028,7 @@ def filter_consensus(
  if number_samples is not None:
  if isinstance(number_samples, tuple) and len(number_samples) == 2:
  min_number, max_number = number_samples
- cons = cons[
- (cons["number_samples"] >= min_number)
- & (cons["number_samples"] <= max_number)
- ]
+ cons = cons[(cons["number_samples"] >= min_number) & (cons["number_samples"] <= max_number)]
  else:
  cons = cons[cons["number_samples"] >= number_samples]
  after_number_samples = len(cons)
@@ -1268,13 +1105,11 @@ def _integrate_chrom_impl(self, **kwargs):
 
  # Merge consensus_mapping with consensus_df to get rt_start_mean and rt_end_mean
  # Use Polars join operation instead of pandas merge
- consensus_subset = self.consensus_df.select(
- [
- "consensus_uid",
- "rt_start_mean",
- "rt_end_mean",
- ],
- )
+ consensus_subset = self.consensus_df.select([
+ "consensus_uid",
+ "rt_start_mean",
+ "rt_end_mean",
+ ])
  df1 = self.consensus_mapping_df.join(
  consensus_subset,
  on="consensus_uid",
@@ -1319,9 +1154,9 @@ def _integrate_chrom_impl(self, **kwargs):
  if chrom is None or len(chrom) == 0:
  update_rows.append(row_idx)
  chroms.append(None)
- rt_starts.append(float("nan"))
- rt_ends.append(float("nan"))
- rt_deltas.append(float("nan"))
+ rt_starts.append(float('nan'))
+ rt_ends.append(float('nan'))
+ rt_deltas.append(float('nan'))
  chrom_areas.append(-1.0)
  continue
  ## TODO expose parameters
@@ -1351,13 +1186,9 @@ def _integrate_chrom_impl(self, **kwargs):
  if update_rows:
  # Create mapping from row index to new values
  row_to_chrom = {update_rows[i]: chroms[i] for i in range(len(update_rows))}
- row_to_rt_start = {
- update_rows[i]: rt_starts[i] for i in range(len(update_rows))
- }
+ row_to_rt_start = {update_rows[i]: rt_starts[i] for i in range(len(update_rows))}
  row_to_rt_end = {update_rows[i]: rt_ends[i] for i in range(len(update_rows))}
- row_to_rt_delta = {
- update_rows[i]: rt_deltas[i] for i in range(len(update_rows))
- }
+ row_to_rt_delta = {update_rows[i]: rt_deltas[i] for i in range(len(update_rows))}
  row_to_chrom_area = {
  update_rows[i]: float(chrom_areas[i]) if chrom_areas[i] is not None else 0.0
  for i in range(len(update_rows))
@@ -1371,60 +1202,58 @@ def _integrate_chrom_impl(self, **kwargs):
 
  # Update columns conditionally
  try:
- self.features_df = df_with_index.with_columns(
- [
- # Update chrom column - use when() to update only specific rows
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_chrom.get(x, None),
- return_dtype=pl.Object,
- ),
- )
- .otherwise(pl.col("chrom"))
- .alias("chrom"),
- # Update rt_start column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_rt_start.get(x, None),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("rt_start"))
- .alias("rt_start"),
- # Update rt_end column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_rt_end.get(x, None),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("rt_end"))
- .alias("rt_end"),
- # Update rt_delta column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_rt_delta.get(x, None),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("rt_delta"))
- .alias("rt_delta"),
- # Update chrom_area column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_chrom_area.get(x, 0),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("chrom_area"))
- .alias("chrom_area"),
- ],
- ).drop("__row_idx") # Remove the temporary row index column
+ self.features_df = df_with_index.with_columns([
+ # Update chrom column - use when() to update only specific rows
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_chrom.get(x, None),
+ return_dtype=pl.Object,
+ ),
+ )
+ .otherwise(pl.col("chrom"))
+ .alias("chrom"),
+ # Update rt_start column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_rt_start.get(x, None),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("rt_start"))
+ .alias("rt_start"),
+ # Update rt_end column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_rt_end.get(x, None),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("rt_end"))
+ .alias("rt_end"),
+ # Update rt_delta column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_rt_delta.get(x, None),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("rt_delta"))
+ .alias("rt_delta"),
+ # Update chrom_area column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_chrom_area.get(x, 0),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("chrom_area"))
+ .alias("chrom_area"),
+ ]).drop("__row_idx") # Remove the temporary row index column
 
  self.logger.debug(
  f"Integration completed. Updated {len(update_rows)} features with chromatogram data.",
@@ -1514,22 +1343,10 @@ def _align_pose_clustering(study_obj, fmaps, params):
  params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
  params_oms.setValue("pairfinder:ignore_charge", "true")
  params_oms.setValue("max_num_peaks_considered", 1000)
- params_oms.setValue(
- "pairfinder:distance_RT:max_difference",
- params.get("rt_max_diff"),
- )
- params_oms.setValue(
- "pairfinder:distance_MZ:max_difference",
- params.get("mz_max_diff"),
- )
- params_oms.setValue(
- "superimposer:rt_pair_distance_fraction",
- params.get("rt_pair_distance_frac"),
- )
- params_oms.setValue(
- "superimposer:mz_pair_max_distance",
- params.get("mz_pair_max_distance"),
- )
+ params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
+ params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
+ params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
+ params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
  params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
  params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
  params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
@@ -1538,9 +1355,7 @@ def _align_pose_clustering(study_obj, fmaps, params):
  study_obj.logger.info("Starting alignment with PoseClustering")
 
  # Set ref_index to feature map index with largest number of features
- ref_index = [
- i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
- ][-1]
+ ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
  study_obj.logger.debug(
  f"Reference map is {study_obj.samples_df.row(ref_index, named=True)['sample_name']}",
  )
@@ -1559,10 +1374,7 @@ def _align_pose_clustering(study_obj, fmaps, params):
  ):
  if index == ref_index:
  continue
- if (
- params.get("skip_blanks")
- and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank"
- ):
+ if params.get("skip_blanks") and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank":
  continue
  trafo = oms.TransformationDescription()
  aligner.align(fm, trafo)
@@ -1581,30 +1393,19 @@ def _align_kd_algorithm(study_obj, fmaps, params):
 
  # Pull parameter values - map standard align params to our algorithm
  # Use rt_max_diff (standard align param) instead of warp_rt_tol for RT tolerance
- rt_pair_tol = (
- float(params.get("rt_max_diff"))
- if params.get("rt_max_diff") is not None
- else 2.0
- )
+ rt_pair_tol = float(params.get("rt_max_diff")) if params.get("rt_max_diff") is not None else 2.0
  # Use mz_max_diff (standard align param) converted to ppm
- mz_max_diff_da = (
- float(params.get("mz_max_diff"))
- if params.get("mz_max_diff") is not None
- else 0.02
- )
+ mz_max_diff_da = float(params.get("mz_max_diff")) if params.get("mz_max_diff") is not None else 0.02
  # Convert Da to ppm (assuming ~400 m/z average for metabolomics): 0.01 Da / 400 * 1e6 = 25 ppm
  ppm_tol = mz_max_diff_da / 400.0 * 1e6
  # Allow override with warp_mz_tol if specifically set (but not from defaults)
  try:
  warp_mz_from_params = params.get("warp_mz_tol")
- if (
- warp_mz_from_params is not None
- and warp_mz_from_params != params.__class__().warp_mz_tol
- ):
+ if warp_mz_from_params is not None and warp_mz_from_params != params.__class__().warp_mz_tol:
  ppm_tol = float(warp_mz_from_params)
  except (KeyError, AttributeError):
  pass
-
+
  # Safely retrieve optional parameter max_anchor_points (not yet part of defaults)
  try:
  _raw_mp = params.get("max_anchor_points")
@@ -1612,18 +1413,16 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  _raw_mp = None
  max_points = int(_raw_mp) if _raw_mp is not None else 1000
  study_obj.logger.info(
- f"Starting custom KD-style alignment (ref-based) with ppm_tol={ppm_tol:.1f}, rt_tol={rt_pair_tol:.1f}s, max_points={max_points}",
- )
- study_obj.logger.info(
- f"Using rt_max_diff={params.get('rt_max_diff')}, mz_max_diff={params.get('mz_max_diff')}",
+ f"Starting custom KD-style alignment (ref-based) with ppm_tol={ppm_tol:.1f}, rt_tol={rt_pair_tol:.1f}s, max_points={max_points}"
  )
+ study_obj.logger.info(f"Using rt_max_diff={params.get('rt_max_diff')}, mz_max_diff={params.get('mz_max_diff')}")
 
  # Choose reference map (largest number of features)
  ref_index = max(range(len(fmaps)), key=lambda i: fmaps[i].size())
  ref_map = fmaps[ref_index]
  study_obj.alignment_ref_index = ref_index
  study_obj.logger.debug(
- f"Reference map index {ref_index} (sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) size={ref_map.size()}",
+ f"Reference map index {ref_index} (sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) size={ref_map.size()}"
  )
 
  # Extract and sort reference features by m/z for binary search
@@ -1647,10 +1446,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  best_drt = drt
  return best
 
- def _set_pairs(
- td_obj: oms.TransformationDescription,
- pairs,
- ): # Helper for pyopenms API variability
+ def _set_pairs(td_obj: 'oms.TransformationDescription', pairs): # Helper for pyopenms API variability
  # Always provide list of lists to satisfy strict type expectations
  conv = [[float(a), float(b)] for a, b in pairs]
  try:
@@ -1732,7 +1528,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  td.fitModel(model, oms.Param())
  except Exception as e:
  study_obj.logger.debug(
- f"Map {i}: {model} fitting failed ({e}); fallback to linear two-point shift",
+ f"Map {i}: {model} fitting failed ({e}); fallback to linear two-point shift"
  )
  rts = [f.getRT() for f in fmap]
  lo, hi = (min(rts), max(rts)) if rts else (0.0, 1.0)
@@ -1744,7 +1540,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  pass
 
  study_obj.logger.debug(
- f"Map {i}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s",
+ f"Map {i}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s"
  )
  transformations.append(td)
 
@@ -1762,7 +1558,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  study_obj.logger.warning(f"Map {i}: failed applying transformation ({e})")
 
  study_obj.logger.info(
- f"Custom KD alignment completed. Reference index {ref_index}. Applied {len(transformations)} transformations.",
+ f"Custom KD alignment completed. Reference index {ref_index}. Applied {len(transformations)} transformations."
  )
 
 
@@ -1771,21 +1567,13 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
  import pyopenms as oms
 
  aligner = oms.MapAlignmentAlgorithmPoseClustering()
- ref_index = [
- i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
- ][-1]
+ ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
 
  # Set up basic parameters for pose clustering
  pc_params = oms.Param()
  pc_params.setValue("max_num_peaks_considered", 1000)
- pc_params.setValue(
- "pairfinder:distance_RT:max_difference",
- params.get("rt_max_diff"),
- )
- pc_params.setValue(
- "pairfinder:distance_MZ:max_difference",
- params.get("mz_max_diff"),
- )
+ pc_params.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
+ pc_params.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
 
  aligner.setParameters(pc_params)
  aligner.setReference(fmaps[ref_index])