masster 0.4.4-py3-none-any.whl → 0.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.
Files changed (39)
  1. masster/__init__.py +8 -8
  2. masster/chromatogram.py +1 -1
  3. masster/data/libs/urine.csv +3 -3
  4. masster/logger.py +11 -11
  5. masster/sample/__init__.py +1 -1
  6. masster/sample/adducts.py +338 -264
  7. masster/sample/defaults/find_adducts_def.py +21 -8
  8. masster/sample/h5.py +561 -282
  9. masster/sample/helpers.py +131 -75
  10. masster/sample/lib.py +4 -4
  11. masster/sample/load.py +31 -17
  12. masster/sample/parameters.py +1 -1
  13. masster/sample/plot.py +7 -7
  14. masster/sample/processing.py +117 -87
  15. masster/sample/sample.py +103 -90
  16. masster/sample/sample5_schema.json +44 -44
  17. masster/sample/save.py +35 -12
  18. masster/spectrum.py +1 -1
  19. masster/study/__init__.py +1 -1
  20. masster/study/defaults/align_def.py +5 -1
  21. masster/study/defaults/identify_def.py +3 -1
  22. masster/study/defaults/study_def.py +58 -25
  23. masster/study/export.py +360 -210
  24. masster/study/h5.py +560 -158
  25. masster/study/helpers.py +496 -203
  26. masster/study/helpers_optimized.py +1 -1
  27. masster/study/id.py +538 -349
  28. masster/study/load.py +233 -143
  29. masster/study/plot.py +71 -71
  30. masster/study/processing.py +456 -254
  31. masster/study/save.py +15 -5
  32. masster/study/study.py +213 -131
  33. masster/study/study5_schema.json +149 -149
  34. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/METADATA +3 -1
  35. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/RECORD +39 -39
  36. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
  37. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
  38. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
  39. {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,7 @@ import pyopenms as oms

  from tqdm import tqdm

- from masster.study.defaults import (
+ from master.study.defaults import (
  align_defaults,
  find_ms2_defaults,
  integrate_defaults,
@@ -115,7 +115,8 @@ def align(self, **kwargs):
  # Pre-build sample_uid lookup for faster access
  self.logger.debug("Build sample_uid lookup for fast access...")
  sample_uid_lookup = {
- idx: row_dict["sample_uid"] for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
+ idx: row_dict["sample_uid"]
+ for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
  }

  # Build the main lookup using feature_uid (not feature_id)
@@ -215,7 +216,7 @@ def align(self, **kwargs):
  self.features_df = self.features_df.with_columns(*new_cols)

  self.logger.debug("Alignment completed successfully.")
-
+
  # Reset consensus data structures after alignment since RT changes invalidate consensus
  consensus_reset_count = 0
  if not self.consensus_df.is_empty():
@@ -227,7 +228,7 @@ def align(self, **kwargs):
  if not self.consensus_ms2.is_empty():
  self.consensus_ms2 = pl.DataFrame()
  consensus_reset_count += 1
-
+
  # Remove merge and find_ms2 parameters from history since they need to be re-run
  keys_to_remove = ["merge", "find_ms2"]
  history_removed_count = 0
@@ -237,9 +238,11 @@ def align(self, **kwargs):
  del self.history[key]
  history_removed_count += 1
  self.logger.debug(f"Removed {key} from history")
-
+
  if consensus_reset_count > 0 or history_removed_count > 0:
- self.logger.info(f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed")
+ self.logger.info(
+ f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed",
+ )

  if params.get("save_features"):
  self.save_samples()
@@ -290,7 +293,10 @@ def merge(self, **kwargs):
  algorithm = params.get("algorithm")
  min_samples = params.get("min_samples")
  link_ms2 = params.get("link_ms2")
- mz_tol = kwargs.get("mz_tol", 0.01) # Default values for parameters not in defaults class
+ mz_tol = kwargs.get(
+ "mz_tol",
+ 0.01,
+ ) # Default values for parameters not in defaults class
  rt_tol = kwargs.get("rt_tol", 1.0)

  if len(self.samples_df) > 200 and algorithm == "qt":
@@ -399,7 +405,10 @@ def merge(self, **kwargs):
  consensus_map.setUniqueIds()

  # create a dict to map uid to feature_uid using self.features_df
- feature_uid_map = {row["feature_id"]: row["feature_uid"] for row in self.features_df.iter_rows(named=True)}
+ feature_uid_map = {
+ row["feature_id"]: row["feature_uid"]
+ for row in self.features_df.iter_rows(named=True)
+ }
  imax = consensus_map.size()

  # Pre-build fast lookup tables for features_df data
@@ -426,7 +435,9 @@ def merge(self, **kwargs):

  for row in self.features_df.iter_rows(named=True):
  feature_uid = row["feature_uid"]
- features_lookup[feature_uid] = {col: row[col] for col in feature_columns if col in self.features_df.columns}
+ features_lookup[feature_uid] = {
+ col: row[col] for col in feature_columns if col in self.features_df.columns
+ }

  # create a list to store the consensus mapping
  consensus_mapping = []
@@ -453,11 +464,13 @@ def merge(self, **kwargs):
  # this is a feature that was removed but is still in the feature maps
  continue
  fuid = feature_uid_map[fuid]
- consensus_mapping.append({
- "consensus_uid": i,
- "sample_uid": f.getMapIndex() + 1,
- "feature_uid": fuid,
- })
+ consensus_mapping.append(
+ {
+ "consensus_uid": i,
+ "sample_uid": f.getMapIndex() + 1,
+ "feature_uid": fuid,
+ },
+ )
  uids.append(fuid)

  # Get feature data from lookup instead of DataFrame filtering
@@ -471,43 +484,99 @@ def merge(self, **kwargs):

  # Compute statistics using vectorized operations on collected data
  # Convert to numpy arrays for faster computation
- rt_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
- mz_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
- rt_start_values = np.array([
- fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None
- ])
- rt_end_values = np.array([fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None])
- rt_delta_values = np.array([
- fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None
- ])
- mz_start_values = np.array([
- fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None
- ])
- mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
- inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
- coherence_values = np.array([
- fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None
- ])
- prominence_values = np.array([
- fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None
- ])
- prominence_scaled_values = np.array([
- fd.get("chrom_prominence_scaled", 0)
- for fd in feature_data_list
- if fd.get("chrom_prominence_scaled") is not None
- ])
- height_scaled_values = np.array([
- fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None
- ])
- iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
- charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])
+ rt_values = np.array(
+ [fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None],
+ )
+ mz_values = np.array(
+ [fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
+ )
+ rt_start_values = np.array(
+ [
+ fd.get("rt_start", 0)
+ for fd in feature_data_list
+ if fd.get("rt_start") is not None
+ ],
+ )
+ rt_end_values = np.array(
+ [
+ fd.get("rt_end", 0)
+ for fd in feature_data_list
+ if fd.get("rt_end") is not None
+ ],
+ )
+ rt_delta_values = np.array(
+ [
+ fd.get("rt_delta", 0)
+ for fd in feature_data_list
+ if fd.get("rt_delta") is not None
+ ],
+ )
+ mz_start_values = np.array(
+ [
+ fd.get("mz_start", 0)
+ for fd in feature_data_list
+ if fd.get("mz_start") is not None
+ ],
+ )
+ mz_end_values = np.array(
+ [
+ fd.get("mz_end", 0)
+ for fd in feature_data_list
+ if fd.get("mz_end") is not None
+ ],
+ )
+ inty_values = np.array(
+ [
+ fd.get("inty", 0)
+ for fd in feature_data_list
+ if fd.get("inty") is not None
+ ],
+ )
+ coherence_values = np.array(
+ [
+ fd.get("chrom_coherence", 0)
+ for fd in feature_data_list
+ if fd.get("chrom_coherence") is not None
+ ],
+ )
+ prominence_values = np.array(
+ [
+ fd.get("chrom_prominence", 0)
+ for fd in feature_data_list
+ if fd.get("chrom_prominence") is not None
+ ],
+ )
+ prominence_scaled_values = np.array(
+ [
+ fd.get("chrom_prominence_scaled", 0)
+ for fd in feature_data_list
+ if fd.get("chrom_prominence_scaled") is not None
+ ],
+ )
+ height_scaled_values = np.array(
+ [
+ fd.get("chrom_height_scaled", 0)
+ for fd in feature_data_list
+ if fd.get("chrom_height_scaled") is not None
+ ],
+ )
+ iso_values = np.array(
+ [fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
+ )
+ charge_values = np.array(
+ [
+ fd.get("charge", 0)
+ for fd in feature_data_list
+ if fd.get("charge") is not None
+ ],
+ )

  # adduct_values
  # Collect all adducts from feature_data_list to create consensus adduct information
  # Only consider adducts that are in study._get_adducts() plus items with '?'
  all_adducts = []
  adduct_masses = {}
-
+
  # Get valid adducts from study._get_adducts()
  valid_adducts = set()
  try:
@@ -516,7 +585,7 @@ def merge(self, **kwargs):
  valid_adducts.update(study_adducts_df["name"].to_list())
  except Exception as e:
  self.logger.warning(f"Could not retrieve study adducts: {e}")
-
+
  # Always allow '?' adducts
  valid_adducts.add("?")

@@ -527,7 +596,7 @@ def merge(self, **kwargs):

  if adduct is not None:
  # Only include adducts that are valid (from study._get_adducts() or contain '?')
- if adduct in valid_adducts or '?' in adduct:
+ if adduct in valid_adducts or "?" in adduct:
  all_adducts.append(adduct)
  if adduct_mass is not None:
  adduct_masses[adduct] = adduct_mass
@@ -535,33 +604,37 @@ def merge(self, **kwargs):
  # Calculate adduct_values for the consensus feature
  adduct_values = []
  if all_adducts:
- adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
+ adduct_counts = {
+ adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
+ }
  total_count = sum(adduct_counts.values())
  for adduct, count in adduct_counts.items():
  percentage = (count / total_count) * 100 if total_count > 0 else 0
  mass = adduct_masses.get(adduct, None)
  # Store as list with [name, num, %] format for the adducts column
- adduct_values.append([
- str(adduct),
- int(count),
- float(round(percentage, 2))
- ])
+ adduct_values.append(
+ [
+ str(adduct),
+ int(count),
+ float(round(percentage, 2)),
+ ],
+ )

  # Sort adduct_values by count in descending order
  adduct_values.sort(key=lambda x: x[1], reverse=True) # Sort by count (index 1)
  # Store adduct_values for use in metadata
  consensus_adduct_values = adduct_values
-
+
  # Extract top adduct information for new columns
  adduct_top = None
  adduct_charge_top = None
  adduct_mass_neutral_top = None
  adduct_mass_shift_top = None
-
+
  if consensus_adduct_values:
  top_adduct_name = consensus_adduct_values[0][0] # Get top adduct name
  adduct_top = top_adduct_name
-
+
  # Parse adduct information to extract charge and mass shift
  # Handle "?" as "H" and parse common adduct formats
  if top_adduct_name == "?" or top_adduct_name == "[M+?]+":
@@ -577,33 +650,37 @@ def merge(self, **kwargs):
  study_adducts_df = self._get_adducts()
  if not study_adducts_df.is_empty():
  # Look for exact match in study adducts
- matching_adduct = study_adducts_df.filter(pl.col("name") == top_adduct_name)
+ matching_adduct = study_adducts_df.filter(
+ pl.col("name") == top_adduct_name,
+ )
  if not matching_adduct.is_empty():
  adduct_row = matching_adduct.row(0, named=True)
  adduct_charge_top = adduct_row["charge"]
  adduct_mass_shift_top = adduct_row["mass_shift"]
  adduct_found = True
  except Exception as e:
- self.logger.warning(f"Could not lookup adduct in study adducts: {e}")
-
+ self.logger.warning(
+ f"Could not lookup adduct in study adducts: {e}",
+ )
+
  if not adduct_found:
  # Fallback to regex parsing
  import re
-
+
  # Pattern for adducts like [M+H]+, [M-H]-, [M+Na]+, etc.
- pattern = r'\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])'
+ pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
  match = re.match(pattern, top_adduct_name)
-
+
  if match:
  sign = match.group(1)
  element = match.group(2)
  multiplier_str = match.group(3)
  charge_sign = match.group(4)
-
+
  multiplier = int(multiplier_str) if multiplier_str else 1
  charge = multiplier if charge_sign == "+" else -multiplier
  adduct_charge_top = charge
-
+
  # Calculate mass shift based on element
  element_masses = {
  "H": 1.007825,
@@ -617,9 +694,16 @@ def merge(self, **kwargs):
  "CH3COO": 59.013851,
  "H2O": 18.010565,
  }
-
- base_mass = element_masses.get(element, 1.007825) # Default to H if unknown
- mass_shift = base_mass * multiplier if sign == "+" else -base_mass * multiplier
+
+ base_mass = element_masses.get(
+ element,
+ 1.007825,
+ ) # Default to H if unknown
+ mass_shift = (
+ base_mass * multiplier
+ if sign == "+"
+ else -base_mass * multiplier
+ )
  adduct_mass_shift_top = mass_shift
  else:
  # Default fallback
@@ -627,8 +711,8 @@ def merge(self, **kwargs):
  adduct_mass_shift_top = 1.007825
  else:
  # No valid adducts found - assign default based on study polarity
- study_polarity = getattr(self, 'polarity', 'positive')
- if study_polarity in ['negative', 'neg']:
+ study_polarity = getattr(self, "polarity", "positive")
+ if study_polarity in ["negative", "neg"]:
  # Negative mode default
  adduct_top = "[M-?]1-"
  adduct_charge_top = -1
@@ -638,14 +722,18 @@ def merge(self, **kwargs):
  adduct_top = "[M+?]1+"
  adduct_charge_top = 1
  adduct_mass_shift_top = 1.007825 # H mass (gain of proton)
-
+
  # Create a single default adduct entry in the adducts list for consistency
  consensus_adduct_values = [[adduct_top, 1, 100.0]]
-
+
  # Calculate neutral mass from consensus mz (for both cases)
- consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
+ consensus_mz = (
+ round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
+ )
  if adduct_charge_top and adduct_mass_shift_top is not None:
- adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
+ adduct_mass_neutral_top = (
+ consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
+ )

  # Calculate number of MS2 spectra
  ms2_count = 0
@@ -654,48 +742,95 @@ def merge(self, **kwargs):
  if ms2_scans is not None:
  ms2_count += len(ms2_scans)

- metadata_list.append({
- "consensus_uid": int(i), # "consensus_id": i,
- "consensus_id": str(feature.getUniqueId()),
- "quality": round(float(feature.getQuality()), 3),
- "number_samples": len(feature_data_list),
- # "number_ext": int(len(features_list)),
- "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
- "mz": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
- "rt_min": round(float(np.min(rt_values)), 3) if len(rt_values) > 0 else 0.0,
- "rt_max": round(float(np.max(rt_values)), 3) if len(rt_values) > 0 else 0.0,
- "rt_mean": round(float(np.mean(rt_values)), 3) if len(rt_values) > 0 else 0.0,
- "rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
- "rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
- "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
- "mz_min": round(float(np.min(mz_values)), 4) if len(mz_values) > 0 else 0.0,
- "mz_max": round(float(np.max(mz_values)), 4) if len(mz_values) > 0 else 0.0,
- "mz_mean": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
- "mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
- "mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
- "inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
- "bl": -1.0,
- "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
- "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
- "chrom_prominence_scaled_mean": round(
- float(np.mean(prominence_scaled_values)),
- 3,
- )
- if len(prominence_scaled_values) > 0
- else 0.0,
- "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3)
- if len(height_scaled_values) > 0
- else 0.0,
- "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
- "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
- "number_ms2": int(ms2_count),
- "adducts": consensus_adduct_values if consensus_adduct_values else [], # Ensure it's always a list
- # New columns for top-ranked adduct information
- "adduct_top": adduct_top,
- "adduct_charge_top": adduct_charge_top,
- "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
- "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
- })
+ metadata_list.append(
+ {
+ "consensus_uid": int(i), # "consensus_id": i,
+ "consensus_id": str(feature.getUniqueId()),
+ "quality": round(float(feature.getQuality()), 3),
+ "number_samples": len(feature_data_list),
+ # "number_ext": int(len(features_list)),
+ "rt": round(float(np.mean(rt_values)), 4)
+ if len(rt_values) > 0
+ else 0.0,
+ "mz": round(float(np.mean(mz_values)), 4)
+ if len(mz_values) > 0
+ else 0.0,
+ "rt_min": round(float(np.min(rt_values)), 3)
+ if len(rt_values) > 0
+ else 0.0,
+ "rt_max": round(float(np.max(rt_values)), 3)
+ if len(rt_values) > 0
+ else 0.0,
+ "rt_mean": round(float(np.mean(rt_values)), 3)
+ if len(rt_values) > 0
+ else 0.0,
+ "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
+ if len(rt_start_values) > 0
+ else 0.0,
+ "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
+ if len(rt_end_values) > 0
+ else 0.0,
+ "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
+ if len(rt_delta_values) > 0
+ else 0.0,
+ "mz_min": round(float(np.min(mz_values)), 4)
+ if len(mz_values) > 0
+ else 0.0,
+ "mz_max": round(float(np.max(mz_values)), 4)
+ if len(mz_values) > 0
+ else 0.0,
+ "mz_mean": round(float(np.mean(mz_values)), 4)
+ if len(mz_values) > 0
+ else 0.0,
+ "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
+ if len(mz_start_values) > 0
+ else 0.0,
+ "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
+ if len(mz_end_values) > 0
+ else 0.0,
+ "inty_mean": round(float(np.mean(inty_values)), 0)
+ if len(inty_values) > 0
+ else 0.0,
+ "bl": -1.0,
+ "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
+ if len(coherence_values) > 0
+ else 0.0,
+ "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0)
+ if len(prominence_values) > 0
+ else 0.0,
+ "chrom_prominence_scaled_mean": round(
+ float(np.mean(prominence_scaled_values)),
+ 3,
+ )
+ if len(prominence_scaled_values) > 0
+ else 0.0,
+ "chrom_height_scaled_mean": round(
+ float(np.mean(height_scaled_values)),
+ 3,
+ )
+ if len(height_scaled_values) > 0
+ else 0.0,
+ "iso_mean": round(float(np.mean(iso_values)), 2)
+ if len(iso_values) > 0
+ else 0.0,
+ "charge_mean": round(float(np.mean(charge_values)), 2)
+ if len(charge_values) > 0
+ else 0.0,
+ "number_ms2": int(ms2_count),
+ "adducts": consensus_adduct_values
+ if consensus_adduct_values
+ else [], # Ensure it's always a list
+ # New columns for top-ranked adduct information
+ "adduct_top": adduct_top,
+ "adduct_charge_top": adduct_charge_top,
+ "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
+ if adduct_mass_neutral_top is not None
+ else None,
+ "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
+ if adduct_mass_shift_top is not None
+ else None,
+ },
+ )

  consensus_mapping_df = pl.DataFrame(consensus_mapping)
  # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
@@ -736,72 +871,74 @@ def merge(self, **kwargs):
  )

  self.consensus_map = consensus_map
-
+
  # Add adduct grouping and adduct_of assignment
  if len(self.consensus_df) > 0:
  # Get rt_tol and mz_tol from kwargs or use defaults from merge_defaults
  adduct_rt_tol = rt_tol # Use the same rt_tol from merge parameters
  adduct_mz_tol = mz_tol # Use the same mz_tol from merge parameters
-
+
  # Initialize new columns
  adduct_group_list = []
  adduct_of_list = []
-
+
  # Get relevant columns for grouping
  consensus_data = []
  for row in self.consensus_df.iter_rows(named=True):
- consensus_data.append({
- "consensus_uid": row["consensus_uid"],
- "rt": row["rt"],
- "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
- "adduct_top": row.get("adduct_top"),
- "inty_mean": row.get("inty_mean", 0),
- })
-
+ consensus_data.append(
+ {
+ "consensus_uid": row["consensus_uid"],
+ "rt": row["rt"],
+ "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
+ "adduct_top": row.get("adduct_top"),
+ "inty_mean": row.get("inty_mean", 0),
+ },
+ )
+
  # Group features with similar neutral mass and RT
  group_id = 1
  assigned_groups = {} # consensus_uid -> group_id
  groups = {} # group_id -> [consensus_uids]
-
+
  for i, feature in enumerate(consensus_data):
  consensus_uid = feature["consensus_uid"]
-
+
  if consensus_uid in assigned_groups:
  continue
-
+
  neutral_mass = feature["adduct_mass_neutral_top"]
  rt = feature["rt"]
-
+
  # Skip if neutral mass is None
  if neutral_mass is None:
  assigned_groups[consensus_uid] = 0 # No group assignment
  continue
-
+
  # Find all features that could belong to the same group
  group_members = [consensus_uid]
-
+
  for j, other_feature in enumerate(consensus_data):
  if i == j:
  continue
-
+
  other_uid = other_feature["consensus_uid"]
  if other_uid in assigned_groups:
  continue
-
+
  other_neutral_mass = other_feature["adduct_mass_neutral_top"]
  other_rt = other_feature["rt"]
-
+
  if other_neutral_mass is None:
  continue
-
+
  # Check if features have similar neutral mass and RT
  mass_diff = abs(neutral_mass - other_neutral_mass)
  rt_diff = abs(rt - other_rt) / 60.0 # Convert to minutes for rt_tol
-
+
  if mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol:
  group_members.append(other_uid)
  assigned_groups[other_uid] = group_id
-
+
  if len(group_members) > 1:
  # Multiple members - create a group
  for member_uid in group_members:
@@ -813,26 +950,29 @@ def merge(self, **kwargs):
  assigned_groups[consensus_uid] = group_id
  groups[group_id] = [consensus_uid]
  group_id += 1
-
+
  # Determine adduct_of for each group
  group_adduct_of = {} # group_id -> consensus_uid of most important adduct
-
+
  for grp_id, member_uids in groups.items():
  # Find the most important adduct in this group
  # Priority: [M+H]+ > [M-H]- > highest intensity
  best_uid = None
  best_priority = -1
  best_intensity = 0
-
+
  for uid in member_uids:
  # Find the feature data
- feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
+ feature_data = next(
+ (f for f in consensus_data if f["consensus_uid"] == uid),
+ None,
+ )
  if not feature_data:
  continue
-
+
  adduct = feature_data.get("adduct_top", "")
  intensity = feature_data.get("inty_mean", 0)
-
+
  priority = 0
  if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
  priority = 3 # Highest priority for [M+H]+ or H
@@ -840,34 +980,41 @@ def merge(self, **kwargs):
  priority = 2 # Second priority for [M-H]-
  elif adduct and "M" in adduct:
  priority = 1 # Third priority for other molecular adducts
-
+
  # Choose based on priority first, then intensity
- if (priority > best_priority or
- (priority == best_priority and intensity > best_intensity)):
+ if priority > best_priority or (
+ priority == best_priority and intensity > best_intensity
+ ):
  best_uid = uid
  best_priority = priority
  best_intensity = intensity
-
+
  group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
+
  # Build the final lists in the same order as consensus_df
  for row in self.consensus_df.iter_rows(named=True):
  consensus_uid = row["consensus_uid"]
  group = assigned_groups.get(consensus_uid, 0)
  adduct_of = group_adduct_of.get(group, consensus_uid)
-
+
  adduct_group_list.append(group)
  adduct_of_list.append(adduct_of)
-
+
  # Add the new columns to consensus_df
- self.consensus_df = self.consensus_df.with_columns([
- pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
- pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
- ])
-
+ self.consensus_df = self.consensus_df.with_columns(
+ [
+ pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
+ pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
+ ],
+ )
+
  # calculate the completeness of the consensus map
  if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
- c = len(self.consensus_mapping_df) / len(self.consensus_df) / len(self.samples_df)
+ c = (
+ len(self.consensus_mapping_df)
+ / len(self.consensus_df)
+ / len(self.samples_df)
+ )
  self.logger.info(
  f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
  )
@@ -938,7 +1085,9 @@ def find_ms2(self, **kwargs):
  ]
  for row in feats.iter_rows(named=True):
  feature_uid = row["feature_uid"]
- feature_lookup[feature_uid] = {col: row[col] for col in relevant_cols if col in feats.columns}
+ feature_lookup[feature_uid] = {
+ col: row[col] for col in relevant_cols if col in feats.columns
+ }
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

  # Process consensus mapping in batch
@@ -960,20 +1109,26 @@ def find_ms2(self, **kwargs):
  for j in range(len(ms2_specs)):
  spec = ms2_specs[j]
  scanid = ms2_scans[j]
- data.append({
- "consensus_uid": int(mapping_row["consensus_uid"]),
- "feature_uid": int(mapping_row["feature_uid"]),
- "sample_uid": int(mapping_row["sample_uid"]),
- "scan_id": int(scanid),
- "energy": round(spec.energy, 1) if hasattr(spec, "energy") and spec.energy is not None else None,
- "prec_inty": round(inty, 0) if inty is not None else None,
- "prec_coherence": round(chrom_coherence, 3) if chrom_coherence is not None else None,
- "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
- if chrom_prominence_scaled is not None
- else None,
- "number_frags": len(spec.mz),
- "spec": spec,
- })
+ data.append(
+ {
+ "consensus_uid": int(mapping_row["consensus_uid"]),
+ "feature_uid": int(mapping_row["feature_uid"]),
+ "sample_uid": int(mapping_row["sample_uid"]),
+ "scan_id": int(scanid),
+ "energy": round(spec.energy, 1)
+ if hasattr(spec, "energy") and spec.energy is not None
+ else None,
+ "prec_inty": round(inty, 0) if inty is not None else None,
+ "prec_coherence": round(chrom_coherence, 3)
+ if chrom_coherence is not None
+ else None,
+ "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
+ if chrom_prominence_scaled is not None
+ else None,
+ "number_frags": len(spec.mz),
+ "spec": spec,
+ },
+ )
  self.consensus_ms2 = pl.DataFrame(data)
  if not self.consensus_ms2.is_empty():
  unique_consensus_features = self.consensus_ms2["consensus_uid"].n_unique()
@@ -1006,7 +1161,10 @@ def filter_consensus(
  else:
  if isinstance(coherence, tuple) and len(coherence) == 2:
  min_coherence, max_coherence = coherence
- cons = cons[(cons["chrom_coherence"] >= min_coherence) & (cons["chrom_coherence"] <= max_coherence)]
+ cons = cons[
+ (cons["chrom_coherence"] >= min_coherence)
+ & (cons["chrom_coherence"] <= max_coherence)
+ ]
  else:
  cons = cons[cons["chrom_coherence"] >= coherence]
  after_coherence = len(cons)
@@ -1017,7 +1175,9 @@ def filter_consensus(
  if quality is not None:
  if isinstance(quality, tuple) and len(quality) == 2:
  min_quality, max_quality = quality
- cons = cons[(cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)]
+ cons = cons[
+ (cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)
+ ]
  else:
  cons = cons[cons["quality"] >= quality]
  after_quality = len(cons)
@@ -1028,7 +1188,10 @@ def filter_consensus(
  if number_samples is not None:
  if isinstance(number_samples, tuple) and len(number_samples) == 2:
  min_number, max_number = number_samples
- cons = cons[(cons["number_samples"] >= min_number) & (cons["number_samples"] <= max_number)]
+ cons = cons[
+ (cons["number_samples"] >= min_number)
+ & (cons["number_samples"] <= max_number)
+ ]
  else:
  cons = cons[cons["number_samples"] >= number_samples]
  after_number_samples = len(cons)
@@ -1105,11 +1268,13 @@ def _integrate_chrom_impl(self, **kwargs):

  # Merge consensus_mapping with consensus_df to get rt_start_mean and rt_end_mean
  # Use Polars join operation instead of pandas merge
- consensus_subset = self.consensus_df.select([
- "consensus_uid",
- "rt_start_mean",
- "rt_end_mean",
- ])
+ consensus_subset = self.consensus_df.select(
+ [
+ "consensus_uid",
+ "rt_start_mean",
+ "rt_end_mean",
+ ],
+ )
  df1 = self.consensus_mapping_df.join(
  consensus_subset,
  on="consensus_uid",
@@ -1154,9 +1319,9 @@ def _integrate_chrom_impl(self, **kwargs):
  if chrom is None or len(chrom) == 0:
  update_rows.append(row_idx)
  chroms.append(None)
- rt_starts.append(float('nan'))
- rt_ends.append(float('nan'))
- rt_deltas.append(float('nan'))
+ rt_starts.append(float("nan"))
+ rt_ends.append(float("nan"))
+ rt_deltas.append(float("nan"))
  chrom_areas.append(-1.0)
  continue
  ## TODO expose parameters
@@ -1186,9 +1351,13 @@ def _integrate_chrom_impl(self, **kwargs):
  if update_rows:
  # Create mapping from row index to new values
  row_to_chrom = {update_rows[i]: chroms[i] for i in range(len(update_rows))}
- row_to_rt_start = {update_rows[i]: rt_starts[i] for i in range(len(update_rows))}
+ row_to_rt_start = {
+ update_rows[i]: rt_starts[i] for i in range(len(update_rows))
+ }
  row_to_rt_end = {update_rows[i]: rt_ends[i] for i in range(len(update_rows))}
- row_to_rt_delta = {update_rows[i]: rt_deltas[i] for i in range(len(update_rows))}
+ row_to_rt_delta = {
+ update_rows[i]: rt_deltas[i] for i in range(len(update_rows))
+ }
  row_to_chrom_area = {
  update_rows[i]: float(chrom_areas[i]) if chrom_areas[i] is not None else 0.0
  for i in range(len(update_rows))
@@ -1202,58 +1371,60 @@ def _integrate_chrom_impl(self, **kwargs):

  # Update columns conditionally
  try:
- self.features_df = df_with_index.with_columns([
- # Update chrom column - use when() to update only specific rows
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_chrom.get(x, None),
- return_dtype=pl.Object,
- ),
- )
- .otherwise(pl.col("chrom"))
- .alias("chrom"),
- # Update rt_start column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_rt_start.get(x, None),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("rt_start"))
- .alias("rt_start"),
- # Update rt_end column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_rt_end.get(x, None),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("rt_end"))
- .alias("rt_end"),
- # Update rt_delta column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_rt_delta.get(x, None),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("rt_delta"))
- .alias("rt_delta"),
- # Update chrom_area column
- pl.when(update_mask)
- .then(
- pl.col("__row_idx").map_elements(
- lambda x: row_to_chrom_area.get(x, 0),
- return_dtype=pl.Float64,
- ),
- )
- .otherwise(pl.col("chrom_area"))
- .alias("chrom_area"),
- ]).drop("__row_idx") # Remove the temporary row index column
+ self.features_df = df_with_index.with_columns(
+ [
+ # Update chrom column - use when() to update only specific rows
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_chrom.get(x, None),
+ return_dtype=pl.Object,
+ ),
+ )
+ .otherwise(pl.col("chrom"))
+ .alias("chrom"),
+ # Update rt_start column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_rt_start.get(x, None),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("rt_start"))
+ .alias("rt_start"),
+ # Update rt_end column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_rt_end.get(x, None),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("rt_end"))
+ .alias("rt_end"),
+ # Update rt_delta column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_rt_delta.get(x, None),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("rt_delta"))
+ .alias("rt_delta"),
+ # Update chrom_area column
+ pl.when(update_mask)
+ .then(
+ pl.col("__row_idx").map_elements(
+ lambda x: row_to_chrom_area.get(x, 0),
+ return_dtype=pl.Float64,
+ ),
+ )
+ .otherwise(pl.col("chrom_area"))
+ .alias("chrom_area"),
+ ],
+ ).drop("__row_idx") # Remove the temporary row index column

  self.logger.debug(
  f"Integration completed. Updated {len(update_rows)} features with chromatogram data.",
@@ -1344,9 +1515,18 @@ def _align_pose_clustering(study_obj, fmaps, params):
  params_oms.setValue("pairfinder:ignore_charge", "true")
  params_oms.setValue("max_num_peaks_considered", 1000)
  params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_tol"))
- params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
- params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
- params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
+ params_oms.setValue(
+ "pairfinder:distance_MZ:max_difference",
+ params.get("mz_max_diff"),
+ )
+ params_oms.setValue(
+ "superimposer:rt_pair_distance_fraction",
+ params.get("rt_pair_distance_frac"),
+ )
+ params_oms.setValue(
+ "superimposer:mz_pair_max_distance",
+ params.get("mz_pair_max_distance"),
+ )
  params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
  params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
  params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
@@ -1355,7 +1535,9 @@ def _align_pose_clustering(study_obj, fmaps, params):
  study_obj.logger.info("Starting alignment with PoseClustering")

  # Set ref_index to feature map index with largest number of features
- ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+ ref_index = [
+ i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
+ ][-1]
  study_obj.logger.debug(
  f"Reference map is {study_obj.samples_df.row(ref_index, named=True)['sample_name']}",
  )
@@ -1374,7 +1556,10 @@ def _align_pose_clustering(study_obj, fmaps, params):
  ):
  if index == ref_index:
  continue
- if params.get("skip_blanks") and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank":
+ if (
+ params.get("skip_blanks")
+ and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank"
+ ):
  continue
  trafo = oms.TransformationDescription()
  aligner.align(fm, trafo)
@@ -1393,19 +1578,28 @@ def _align_kd_algorithm(study_obj, fmaps, params):

  # Pull parameter values - map standard align params to our algorithm
  # Use rt_tol (standard align param) instead of warp_rt_tol for RT tolerance
- rt_pair_tol = float(params.get("rt_tol")) if params.get("rt_tol") is not None else 2.0
+ rt_pair_tol = (
+ float(params.get("rt_tol")) if params.get("rt_tol") is not None else 2.0
+ )
  # Use mz_max_diff (standard align param) converted to ppm
- mz_max_diff_da = float(params.get("mz_max_diff")) if params.get("mz_max_diff") is not None else 0.02
+ mz_max_diff_da = (
+ float(params.get("mz_max_diff"))
+ if params.get("mz_max_diff") is not None
+ else 0.02
+ )
  # Convert Da to ppm (assuming ~400 m/z average for metabolomics): 0.01 Da / 400 * 1e6 = 25 ppm
  ppm_tol = mz_max_diff_da / 400.0 * 1e6
  # Allow override with warp_mz_tol if specifically set (but not from defaults)
  try:
  warp_mz_from_params = params.get("warp_mz_tol")
- if warp_mz_from_params is not None and warp_mz_from_params != params.__class__().warp_mz_tol:
+ if (
+ warp_mz_from_params is not None
+ and warp_mz_from_params != params.__class__().warp_mz_tol
+ ):
  ppm_tol = float(warp_mz_from_params)
  except (KeyError, AttributeError):
  pass
-
+
  # Safely retrieve optional parameter max_anchor_points (not yet part of defaults)
  try:
  _raw_mp = params.get("max_anchor_points")
@@ -1413,7 +1607,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  _raw_mp = None
  max_points = int(_raw_mp) if _raw_mp is not None else 1000
  study_obj.logger.info(
- f"Align time axes with rt_tol={params.get('rt_tol')}, min_samples={params.get('min_samples')}, max_points={max_points}"
+ f"Align time axes with rt_tol={params.get('rt_tol')}, min_samples={params.get('min_samples')}, max_points={max_points}",
  )

  # Choose reference map (largest number of features)
@@ -1421,7 +1615,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  ref_map = fmaps[ref_index]
  study_obj.alignment_ref_index = ref_index
  study_obj.logger.debug(
- f"Reference map index {ref_index} (sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) size={ref_map.size()}"
+ f"Reference map index {ref_index} (sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) size={ref_map.size()}",
  )

  # Extract and sort reference features by m/z for binary search
@@ -1445,7 +1639,10 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  best_drt = drt
  return best

- def _set_pairs(td_obj: 'oms.TransformationDescription', pairs): # Helper for pyopenms API variability
+ def _set_pairs(
+ td_obj: oms.TransformationDescription,
+ pairs,
+ ): # Helper for pyopenms API variability
  # Always provide list of lists to satisfy strict type expectations
  conv = [[float(a), float(b)] for a, b in pairs]
  try:
@@ -1527,7 +1724,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  td.fitModel(model, oms.Param())
  except Exception as e:
  study_obj.logger.debug(
- f"Map {i}: {model} fitting failed ({e}); fallback to linear two-point shift"
+ f"Map {i}: {model} fitting failed ({e}); fallback to linear two-point shift",
  )
  rts = [f.getRT() for f in fmap]
  lo, hi = (min(rts), max(rts)) if rts else (0.0, 1.0)
@@ -1539,7 +1736,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  pass

  study_obj.logger.debug(
- f"Map {i}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s"
+ f"Map {i}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s",
  )
  transformations.append(td)

@@ -1557,7 +1754,7 @@ def _align_kd_algorithm(study_obj, fmaps, params):
  study_obj.logger.warning(f"Map {i}: failed applying transformation ({e})")

  study_obj.logger.info(
- f"Custom KD alignment completed. Reference index {ref_index}. Applied {len(transformations)} transformations."
+ f"Custom KD alignment completed. Reference index {ref_index}. Applied {len(transformations)} transformations.",
  )


@@ -1566,13 +1763,18 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
  import pyopenms as oms

  aligner = oms.MapAlignmentAlgorithmPoseClustering()
- ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+ ref_index = [
+ i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
+ ][-1]

  # Set up basic parameters for pose clustering
  pc_params = oms.Param()
  pc_params.setValue("max_num_peaks_considered", 1000)
  pc_params.setValue("pairfinder:distance_RT:max_difference", params.get("rt_tol"))
- pc_params.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
+ pc_params.setValue(
+ "pairfinder:distance_MZ:max_difference",
+ params.get("mz_max_diff"),
+ )

  aligner.setParameters(pc_params)
  aligner.setReference(fmaps[ref_index])