masster 0.4.13__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.
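
The bulk of this diff is the removal of the consensus-merging code from the Study module: the merge() method, its private helpers (_reset_consensus_data, _process_merge_parameters, _validate_merge_inputs, _perform_feature_grouping, _extract_consensus_features), the module-level _optimized_feature_lookup and _optimized_adduct_grouping functions, and the find_consensus alias, along with the now-unused merge_defaults, defaultdict, and time imports. For orientation only, below is a minimal sketch of how the removed 0.4.13 API was typically invoked, based on the removed docstring; the import path and the construction of the Study object are assumptions, not something this diff shows.

    from masster import Study   # assumed import path, not shown in this diff

    study = Study(...)           # hypothetical: samples loaded and features detected beforehand
    study.align()                # RT alignment (its context lines remain unchanged in 0.4.16)
    study.merge(                 # removed in 0.4.16
        algorithm="kd",          # 'qt', 'kd', 'unlabeled', or 'sequential'
        min_samples=2,           # minimum samples per consensus feature
        mz_tol=0.01,             # m/z tolerance in Da
        rt_tol=1.0,              # RT tolerance in seconds
        link_ms2=True,           # attach MS2 spectra via find_ms2()
    )
    print(study.consensus_df)    # consensus feature table built by merge()

Code that calls merge() or find_consensus on this module should be checked against 0.4.16 before upgrading, since this diff removes both names from the file.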

@@ -1,8 +1,6 @@
  from __future__ import annotations
 
  from datetime import datetime
- from collections import defaultdict
- import time
 
  import numpy as np
  import polars as pl
@@ -14,7 +12,6 @@ from masster.study.defaults import (
  align_defaults,
  find_ms2_defaults,
  integrate_defaults,
- merge_defaults,
  )
 
 
@@ -250,877 +247,6 @@ def align(self, **kwargs):
  self.save_samples()
 
 
- def merge(self, **kwargs):
- """Group features across samples into consensus features.
-
- Parameters can be provided as a ``merge_defaults`` instance or as
- individual keyword arguments; they are validated against the defaults class.
-
- Key parameters (from ``merge_defaults``):
- - algorithm (str): Grouping algorithm to use ('qt', 'kd', 'unlabeled', 'sequential').
- - min_samples (int): Minimum number of samples required for a consensus feature.
- - link_ms2 (bool): Whether to attach/link MS2 spectra to consensus features.
- - mz_tol (float): m/z tolerance for grouping (Da).
- - rt_tol (float): RT tolerance for grouping (seconds).
- """
- # Initialize
- self._reset_consensus_data()
- self.logger.info("Merging...")
-
- # Process parameters
- params = self._process_merge_parameters(**kwargs)
- algorithm = params.get("algorithm")
- min_samples = params.get("min_samples")
- link_ms2 = params.get("link_ms2")
- mz_tol = kwargs.get("mz_tol", 0.01)
- rt_tol = kwargs.get("rt_tol", 1.0)
-
- # Validate and prepare
- self._validate_merge_inputs(algorithm)
-
- # Perform feature grouping using OpenMS
- consensus_map = self._perform_feature_grouping(algorithm, params, mz_tol, rt_tol)
-
- # Extract consensus features and build metadata
- self._extract_consensus_features(consensus_map, min_samples)
-
- # Perform adduct grouping optimization
- self._perform_adduct_grouping(rt_tol, mz_tol)
-
- # Complete merge process
- self._finalize_merge(link_ms2, min_samples)
-
- def _reset_consensus_data(self):
- """Reset consensus-related DataFrames at the start of merge."""
- self.consensus_df = pl.DataFrame()
- self.consensus_ms2 = pl.DataFrame()
- self.consensus_mapping_df = pl.DataFrame()
-
- def _process_merge_parameters(self, **kwargs):
- """Process and validate merge parameters."""
- params = merge_defaults()
- for key, value in kwargs.items():
- if isinstance(value, merge_defaults):
- params = value
- self.logger.debug("Using provided merge_defaults parameters")
- else:
- if hasattr(params, key):
- if params.set(key, value, validate=True):
- self.logger.debug(f"Updated parameter {key} = {value}")
- else:
- self.logger.warning(
- f"Failed to set parameter {key} = {value} (validation failed)",
- )
- else:
- self.logger.debug(f"Unknown parameter {key} ignored")
-
- # Store parameters in the Study object
- self.store_history(["merge"], params.to_dict())
- self.logger.debug("Parameters stored to merge")
- return params
-
- def _validate_merge_inputs(self, algorithm):
- """Validate merge inputs and provide warnings for performance."""
- if len(self.samples_df) > 200 and algorithm == "qt":
- self.logger.warning(
- "Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].",
- )
-
- # Check that features_maps is not empty
- if not self.features_maps or len(self.features_maps) == 0:
- self.load_features()
-
- def _perform_feature_grouping(self, algorithm, params, mz_tol, rt_tol):
- """Perform feature grouping using OpenMS algorithms."""
- params_oms = oms.Param()
- ## TODO expose these
-
- feature_grouper: object # Use generic type for different OpenMS algorithms
- match algorithm.lower():
- case "kd":
- feature_grouper = oms.FeatureGroupingAlgorithmKD()
- self.logger.debug("Merging features with KDTree...")
- params_oms.setValue("mz_unit", "Da")
- params_oms.setValue("nr_partitions", len(self.samples_df))
-
- params_oms.setValue("warp:enabled", "true")
- params_oms.setValue("warp:rt_tol", rt_tol)
- params_oms.setValue("warp:mz_tol", mz_tol)
-
- params_oms.setValue("link:rt_tol", rt_tol)
- params_oms.setValue("link:mz_tol", mz_tol)
- case "unlabeled":
- feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
- self.logger.debug("Merging features with Unlabelled algorithm...")
- params_oms.setValue("second_nearest_gap", 2.0)
- params_oms.setValue("ignore_charge", "true")
- params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
- params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
- params_oms.setValue("distance_MZ:unit", "Da")
- case "sequential":
- self.logger.debug(
- "Merging features sequentially with Unlabelled algorithm...",
- )
- params_oms.setValue("second_nearest_gap", 2.0)
- params_oms.setValue("ignore_charge", "true")
- params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
- params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
- params_oms.setValue("distance_MZ:unit", "Da")
- case "qt":
- feature_grouper = oms.FeatureGroupingAlgorithmQT()
- self.logger.debug("Grouping features with QT...")
- params_oms.setValue("nr_partitions", len(self.samples_df))
- params_oms.setValue("ignore_charge", "true")
- params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
- params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
- params_oms.setValue("distance_MZ:unit", "Da")
-
- self.logger.debug(f"Parameters for feature grouping: {params_oms}")
-
- # Create consensus map and set up file descriptions
- consensus_map = oms.ConsensusMap()
- file_descriptions = consensus_map.getColumnHeaders() # type: ignore
- feature_maps = self.features_maps
- for i, feature_map in enumerate(feature_maps):
- file_description = file_descriptions.get(i, oms.ColumnHeader())
- file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
- file_description.size = feature_map.size()
- file_description.unique_id = feature_map.getUniqueId()
- file_descriptions[i] = file_description
-
- consensus_map.setColumnHeaders(file_descriptions) # type: ignore
-
- # Execute the grouping algorithm
- match algorithm.lower():
- case "sequential":
- # set the reference map to self.alignment_ref_index
- if self.alignment_ref_index is None:
- # pick the feature map with the most features as reference
- self.alignment_ref_index = max(
- range(len(self.features_maps)),
- key=lambda i: self.features_maps[i].size(),
- )
- feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
- feature_grouper.setParameters(params_oms)
- feature_grouper.setReference(self.alignment_ref_index)
- self.logger.debug(f"Sequential mode: reference map = {self.alignment_ref_index}")
-
- # Group features sequentially
- for i in range(len(feature_maps)):
- if i == self.alignment_ref_index:
- continue
- temp_feature_maps = [feature_maps[self.alignment_ref_index], feature_maps[i]]
- temp_consensus_map = oms.ConsensusMap()
- feature_grouper.group(temp_feature_maps, temp_consensus_map)
- # Merge temp_consensus_map into consensus_map
- # This is a simplified approach - proper sequential grouping would be more complex
- case _:
- feature_grouper.setParameters(params_oms)
- feature_grouper.group(feature_maps, consensus_map)
-
- return consensus_map
-
- def _extract_consensus_features(self, consensus_map, min_samples):
- """Extract consensus features and build metadata."""
- # create a dict to map uid to feature_uid using self.features_df
- feature_uid_map = {
- row["feature_id"]: row["feature_uid"]
- for row in self.features_df.iter_rows(named=True)
- }
- imax = consensus_map.size()
-
- self.logger.info(f"Merging completed with {imax} consensus features.")
-
- # Pre-build fast lookup tables for features_df data using optimized approach
- features_lookup = _optimized_feature_lookup(self, self.features_df)
-
- # create a list to store the consensus mapping
- consensus_mapping = []
- metadata_list = []
-
- tqdm_disable = self.log_level not in ["TRACE", "DEBUG"]
-
- for i, feature in enumerate(
- tqdm(
- consensus_map,
- total=imax,
- disable=tqdm_disable,
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
- ),
- ):
- # get all features in the feature map with the same unique id as the consensus feature
- features_list = feature.getFeatureList()
- uids = []
- feature_data_list = []
-
- for _j, f in enumerate(features_list):
- fuid = str(f.getUniqueId())
- if fuid not in feature_uid_map:
- # this is a feature that was removed but is still in the feature maps
- continue
- fuid = feature_uid_map[fuid]
- consensus_mapping.append(
- {
- "consensus_uid": i,
- "sample_uid": f.getMapIndex() + 1,
- "feature_uid": fuid,
- },
- )
- uids.append(fuid)
-
- # Get feature data from lookup instead of DataFrame filtering
- feature_data = features_lookup.get(fuid)
- if feature_data:
- feature_data_list.append(feature_data)
-
- if not feature_data_list:
- # Skip this consensus feature if no valid features found
- continue
-
- # Compute statistics using vectorized operations on collected data
- # Convert to numpy arrays for faster computation
- rt_values = np.array(
- [fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None],
- )
- mz_values = np.array(
- [fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
- )
- rt_start_values = np.array(
- [
- fd.get("rt_start", 0)
- for fd in feature_data_list
- if fd.get("rt_start") is not None
- ],
- )
- rt_end_values = np.array(
- [
- fd.get("rt_end", 0)
- for fd in feature_data_list
- if fd.get("rt_end") is not None
- ],
- )
- rt_delta_values = np.array(
- [
- fd.get("rt_delta", 0)
- for fd in feature_data_list
- if fd.get("rt_delta") is not None
- ],
- )
- mz_start_values = np.array(
- [
- fd.get("mz_start", 0)
- for fd in feature_data_list
- if fd.get("mz_start") is not None
- ],
- )
- mz_end_values = np.array(
- [
- fd.get("mz_end", 0)
- for fd in feature_data_list
- if fd.get("mz_end") is not None
- ],
- )
- inty_values = np.array(
- [
- fd.get("inty", 0)
- for fd in feature_data_list
- if fd.get("inty") is not None
- ],
- )
- coherence_values = np.array(
- [
- fd.get("chrom_coherence", 0)
- for fd in feature_data_list
- if fd.get("chrom_coherence") is not None
- ],
- )
- prominence_values = np.array(
- [
- fd.get("chrom_prominence", 0)
- for fd in feature_data_list
- if fd.get("chrom_prominence") is not None
- ],
- )
- prominence_scaled_values = np.array(
- [
- fd.get("chrom_prominence_scaled", 0)
- for fd in feature_data_list
- if fd.get("chrom_prominence_scaled") is not None
- ],
- )
- height_scaled_values = np.array(
- [
- fd.get("chrom_height_scaled", 0)
- for fd in feature_data_list
- if fd.get("chrom_height_scaled") is not None
- ],
- )
- iso_values = np.array(
- [fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
- )
- charge_values = np.array(
- [
- fd.get("charge", 0)
- for fd in feature_data_list
- if fd.get("charge") is not None
- ],
- )
-
- # adduct_values
- # Collect all adducts from feature_data_list to create consensus adduct information
- # Only consider adducts that are in study._get_adducts() plus items with '?'
- all_adducts = []
- adduct_masses = {}
-
- # Get valid adducts from study._get_adducts()
- valid_adducts = set()
- try:
- study_adducts_df = self._get_adducts()
- if not study_adducts_df.is_empty():
- valid_adducts.update(study_adducts_df["name"].to_list())
- except Exception as e:
- self.logger.warning(f"Could not retrieve study adducts: {e}")
-
- # Always allow '?' adducts
- valid_adducts.add("?")
-
- for fd in feature_data_list:
- # Get individual adduct and mass from each feature data (fd)
- adduct = fd.get("adduct")
- adduct_mass = fd.get("adduct_mass")
-
- if adduct is not None:
- # Only include adducts that are valid (from study._get_adducts() or contain '?')
- if adduct in valid_adducts or "?" in adduct:
- all_adducts.append(adduct)
- if adduct_mass is not None:
- adduct_masses[adduct] = adduct_mass
-
- # Calculate adduct_values for the consensus feature
- adduct_values = []
- if all_adducts:
- adduct_counts = {
- adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
- }
- total_count = sum(adduct_counts.values())
- for adduct, count in adduct_counts.items():
- percentage = (count / total_count) * 100 if total_count > 0 else 0
- mass = adduct_masses.get(adduct, None)
- # Store as list with [name, num, %] format for the adducts column
- adduct_values.append(
- [
- str(adduct),
- int(count),
- float(round(percentage, 2)),
- ],
- )
-
- # Sort adduct_values by count in descending order
- adduct_values.sort(key=lambda x: x[1], reverse=True) # Sort by count (index 1)
- # Store adduct_values for use in metadata
- consensus_adduct_values = adduct_values
-
- # Extract top adduct information for new columns
- adduct_top = None
- adduct_charge_top = None
- adduct_mass_neutral_top = None
- adduct_mass_shift_top = None
-
- if consensus_adduct_values:
- top_adduct_name = consensus_adduct_values[0][0] # Get top adduct name
- adduct_top = top_adduct_name
-
- # Parse adduct information to extract charge and mass shift
- # Handle "?" as "H" and parse common adduct formats
- if top_adduct_name == "?" or top_adduct_name == "[M+?]+":
- adduct_charge_top = 1
- adduct_mass_shift_top = 1.007825 # H mass
- elif top_adduct_name == "[M+?]-":
- adduct_charge_top = -1
- adduct_mass_shift_top = -1.007825 # -H mass
- else:
- # Try to get charge and mass shift from study._get_adducts()
- adduct_found = False
- try:
- study_adducts_df = self._get_adducts()
- if not study_adducts_df.is_empty():
- # Look for exact match in study adducts
- matching_adduct = study_adducts_df.filter(
- pl.col("name") == top_adduct_name,
- )
- if not matching_adduct.is_empty():
- adduct_row = matching_adduct.row(0, named=True)
- adduct_charge_top = adduct_row["charge"]
- adduct_mass_shift_top = adduct_row["mass_shift"]
- adduct_found = True
- except Exception as e:
- self.logger.warning(
- f"Could not lookup adduct in study adducts: {e}",
- )
-
- if not adduct_found:
- # Fallback to regex parsing
- import re
-
- # Pattern for adducts like [M+H]+, [M-H]-, [M+Na]+, etc.
- pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
- match = re.match(pattern, top_adduct_name)
-
- if match:
- sign = match.group(1)
- element = match.group(2)
- multiplier_str = match.group(3)
- charge_sign = match.group(4)
-
- multiplier = int(multiplier_str) if multiplier_str else 1
- charge = multiplier if charge_sign == "+" else -multiplier
- adduct_charge_top = charge
-
- # Calculate mass shift based on element
- element_masses = {
- "H": 1.007825,
- "Na": 22.989769,
- "K": 38.963708,
- "NH4": 18.033823,
- "Li": 7.016930,
- "Cl": 34.969401,
- "Br": 78.918885,
- "HCOO": 44.998201,
- "CH3COO": 59.013851,
- "H2O": 18.010565,
- }
-
- base_mass = element_masses.get(
- element,
- 1.007825,
- ) # Default to H if unknown
- mass_shift = (
- base_mass * multiplier
- if sign == "+"
- else -base_mass * multiplier
- )
- adduct_mass_shift_top = mass_shift
- else:
- # Default fallback
- adduct_charge_top = 1
- adduct_mass_shift_top = 1.007825
- else:
- # No valid adducts found - assign default based on study polarity
- study_polarity = getattr(self, "polarity", "positive")
- if study_polarity in ["negative", "neg"]:
- # Negative mode default
- adduct_top = "[M-?]1-"
- adduct_charge_top = -1
- adduct_mass_shift_top = -1.007825 # -H mass (loss of proton)
- else:
- # Positive mode default (includes 'positive', 'pos', or any other value)
- adduct_top = "[M+?]1+"
- adduct_charge_top = 1
- adduct_mass_shift_top = 1.007825 # H mass (gain of proton)
-
- # Create a single default adduct entry in the adducts list for consistency
- consensus_adduct_values = [[adduct_top, 1, 100.0]]
-
- # Calculate neutral mass from consensus mz (for both cases)
- consensus_mz = (
- round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
- )
- if adduct_charge_top and adduct_mass_shift_top is not None:
- adduct_mass_neutral_top = (
- consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
- )
-
- # Calculate number of MS2 spectra
- ms2_count = 0
- for fd in feature_data_list:
- ms2_scans = fd.get("ms2_scans")
- if ms2_scans is not None:
- ms2_count += len(ms2_scans)
-
- metadata_list.append(
- {
- "consensus_uid": int(i), # "consensus_id": i,
- "consensus_id": str(feature.getUniqueId()),
- "quality": round(float(feature.getQuality()), 3),
- "number_samples": len(feature_data_list),
- # "number_ext": int(len(features_list)),
- "rt": round(float(np.mean(rt_values)), 4)
- if len(rt_values) > 0
- else 0.0,
- "mz": round(float(np.mean(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "rt_min": round(float(np.min(rt_values)), 3)
- if len(rt_values) > 0
- else 0.0,
- "rt_max": round(float(np.max(rt_values)), 3)
- if len(rt_values) > 0
- else 0.0,
- "rt_mean": round(float(np.mean(rt_values)), 3)
- if len(rt_values) > 0
- else 0.0,
- "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
- if len(rt_start_values) > 0
- else 0.0,
- "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
- if len(rt_end_values) > 0
- else 0.0,
- "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
- if len(rt_delta_values) > 0
- else 0.0,
- "mz_min": round(float(np.min(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "mz_max": round(float(np.max(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "mz_mean": round(float(np.mean(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
- if len(mz_start_values) > 0
- else 0.0,
- "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
- if len(mz_end_values) > 0
- else 0.0,
- "inty_mean": round(float(np.mean(inty_values)), 0)
- if len(inty_values) > 0
- else 0.0,
- "bl": -1.0,
- "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
- if len(coherence_values) > 0
- else 0.0,
- "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0)
- if len(prominence_values) > 0
- else 0.0,
- "chrom_prominence_scaled_mean": round(
- float(np.mean(prominence_scaled_values)),
- 3,
- )
- if len(prominence_scaled_values) > 0
- else 0.0,
- "chrom_height_scaled_mean": round(
- float(np.mean(height_scaled_values)),
- 3,
- )
- if len(height_scaled_values) > 0
- else 0.0,
- "iso_mean": round(float(np.mean(iso_values)), 2)
- if len(iso_values) > 0
- else 0.0,
- "charge_mean": round(float(np.mean(charge_values)), 2)
- if len(charge_values) > 0
- else 0.0,
- "number_ms2": int(ms2_count),
- "adducts": consensus_adduct_values
- if consensus_adduct_values
- else [], # Ensure it's always a list
- # New columns for top-ranked adduct information
- "adduct_top": adduct_top,
- "adduct_charge_top": adduct_charge_top,
- "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
- if adduct_mass_neutral_top is not None
- else None,
- "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
- if adduct_mass_shift_top is not None
- else None,
- # New columns for top-scoring identification results
- "id_top_name": None,
- "id_top_class": None,
- "id_top_adduct": None,
- "id_top_score": None,
- },
- )
-
- consensus_mapping_df = pl.DataFrame(consensus_mapping)
- # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
- l1 = len(consensus_mapping_df)
- consensus_mapping_df = consensus_mapping_df.filter(
- pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
- )
- self.logger.debug(
- f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
- )
- self.consensus_mapping_df = consensus_mapping_df
- self.consensus_df = pl.DataFrame(metadata_list, strict=False)
-
- if min_samples is None:
- min_samples = 1
- if min_samples < 1:
- min_samples = int(min_samples * len(self.samples_df))
-
- # Validate that min_samples doesn't exceed the number of samples
- if min_samples > len(self.samples_df):
- self.logger.warning(
- f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
- f"Setting min_samples to {len(self.samples_df)}.",
- )
- min_samples = len(self.samples_df)
-
- # filter out consensus features with less than min_samples features
- l1 = len(self.consensus_df)
- self.consensus_df = self.consensus_df.filter(
- pl.col("number_samples") >= min_samples,
- )
- self.logger.debug(
- f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
- )
- # filter out consensus mapping with less than min_samples features
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
- pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
- )
-
- self.consensus_map = consensus_map
-
- # Add adduct grouping and adduct_of assignment
- if len(self.consensus_df) > 0:
- # Get rt_tol and mz_tol from kwargs or use defaults from merge_defaults
- adduct_rt_tol = rt_tol # Use the same rt_tol from merge parameters
- adduct_mz_tol = mz_tol # Use the same mz_tol from merge parameters
-
- # Get relevant columns for grouping
- consensus_data = []
- for row in self.consensus_df.iter_rows(named=True):
- consensus_data.append(
- {
- "consensus_uid": row["consensus_uid"],
- "rt": row["rt"],
- "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
- "adduct_top": row.get("adduct_top"),
- "inty_mean": row.get("inty_mean", 0),
- },
- )
-
- # Use optimized adduct grouping
- adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
- self, consensus_data, adduct_rt_tol, adduct_mz_tol
- )
-
- # Add the new columns to consensus_df
- self.consensus_df = self.consensus_df.with_columns(
- [
- pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
- pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
- ],
- )
-
- # calculate the completeness of the consensus map
- if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
- c = (
- len(self.consensus_mapping_df)
- / len(self.consensus_df)
- / len(self.samples_df)
- )
- self.logger.info(
- f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
- )
- else:
- self.logger.warning(
- f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
- f"This may be due to min_samples ({min_samples}) being too high for the available data.",
- )
- if link_ms2:
- self.find_ms2()
-
-
- def _optimized_feature_lookup(study_obj, features_df):
- """
- Optimized feature lookup creation using Polars operations.
- """
- study_obj.logger.debug("Creating optimized feature lookup...")
- start_time = time.time()
-
- # Use Polars select for faster conversion
- feature_columns = [
- "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
- "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
- "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
- "ms2_scans", "adduct", "adduct_mass"
- ]
-
- # Filter to only existing columns
- existing_columns = [col for col in feature_columns if col in features_df.columns]
-
- # Convert to dictionary more efficiently
- selected_df = features_df.select(existing_columns)
-
- features_lookup = {}
- for row in selected_df.iter_rows(named=True):
- feature_uid = row["feature_uid"]
- features_lookup[feature_uid] = {k: v for k, v in row.items() if k != "feature_uid"}
-
- lookup_time = time.time() - start_time
- if len(features_lookup) > 50000:
- study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
- return features_lookup
-
-
- def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
- """
- Optimized O(n log n) adduct grouping using spatial indexing.
-
- Args:
- study_obj: Study object with logger
- consensus_data: List of consensus feature dictionaries
- rt_tol: RT tolerance in minutes
- mz_tol: m/z tolerance in Da
-
- Returns:
- Tuple of (adduct_group_list, adduct_of_list)
- """
- if not consensus_data:
- return [], []
-
- n_features = len(consensus_data)
- if n_features > 1000:
- study_obj.logger.info(f"Optimizing adduct grouping for {n_features} consensus features...")
-
- start_time = time.time()
-
- # Build spatial index using RT and neutral mass as coordinates
- features_by_mass = defaultdict(list)
- mass_bin_size = mz_tol * 2 # 2x tolerance for conservative binning
-
- valid_features = []
- for feature in consensus_data:
- consensus_uid = feature["consensus_uid"]
- rt = feature["rt"]
- neutral_mass = feature.get("adduct_mass_neutral_top")
- intensity = feature.get("inty_mean", 0)
- adduct = feature.get("adduct_top", "")
-
- if neutral_mass is not None:
- mass_bin = int(neutral_mass / mass_bin_size)
- features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
- valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
-
- # Union-Find for efficient grouping
- class UnionFind:
- def __init__(self, n):
- self.parent = list(range(n))
- self.rank = [0] * n
-
- def find(self, x):
- if self.parent[x] != x:
- self.parent[x] = self.find(self.parent[x])
- return self.parent[x]
-
- def union(self, x, y):
- px, py = self.find(x), self.find(y)
- if px == py:
- return
- if self.rank[px] < self.rank[py]:
- px, py = py, px
- self.parent[py] = px
- if self.rank[px] == self.rank[py]:
- self.rank[px] += 1
-
- uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
- uf = UnionFind(len(valid_features))
-
- # Find groups using spatial index
- checked_pairs = set()
- for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
- for bin_offset in [-1, 0, 1]:
- check_bin = bin1 + bin_offset
- if check_bin not in features_by_mass:
- continue
-
- for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
- if uid1 >= uid2:
- continue
-
- pair = (min(uid1, uid2), max(uid1, uid2))
- if pair in checked_pairs:
- continue
- checked_pairs.add(pair)
-
- mass_diff = abs(mass1 - mass2)
- rt_diff = abs(rt1 - rt2) / 60.0 # Convert to minutes
-
- if mass_diff <= mz_tol and rt_diff <= rt_tol:
- j = uid_to_idx[uid2]
- uf.union(i, j)
-
- # Extract groups
- groups_by_root = defaultdict(list)
- for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
- root = uf.find(i)
- groups_by_root[root].append((uid, rt, mass, inty, adduct))
-
- groups = {}
- group_id = 1
- assigned_groups = {}
-
- for group_members in groups_by_root.values():
- member_uids = [uid for uid, _, _, _, _ in group_members]
-
- for uid in member_uids:
- assigned_groups[uid] = group_id
- groups[group_id] = member_uids
- group_id += 1
-
- # Handle features without neutral mass
- for feature in consensus_data:
- uid = feature["consensus_uid"]
- if uid not in assigned_groups:
- assigned_groups[uid] = group_id
- groups[group_id] = [uid]
- group_id += 1
-
- # Determine adduct_of for each group
- group_adduct_of = {}
- for grp_id, member_uids in groups.items():
- best_uid = None
- best_priority = -1
- best_intensity = 0
-
- for uid in member_uids:
- feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
- if not feature_data:
- continue
-
- adduct = feature_data.get("adduct_top", "")
- intensity = feature_data.get("inty_mean", 0)
-
- priority = 0
- if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
- priority = 3
- elif adduct and "[M-H]" in adduct:
- priority = 2
- elif adduct and "M" in adduct:
- priority = 1
-
- if priority > best_priority or (priority == best_priority and intensity > best_intensity):
- best_uid = uid
- best_priority = priority
- best_intensity = intensity
-
- group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
- # Build final lists in same order as consensus_data
- adduct_group_list = []
- adduct_of_list = []
-
- for feature in consensus_data:
- uid = feature["consensus_uid"]
- group = assigned_groups.get(uid, 0)
- adduct_of = group_adduct_of.get(group, uid)
-
- adduct_group_list.append(group)
- adduct_of_list.append(adduct_of)
-
- grouping_time = time.time() - start_time
- if n_features > 1000:
- study_obj.logger.info(f"Adduct grouping completed in {grouping_time:.2f}s ({len(groups)} groups)")
-
- return adduct_group_list, adduct_of_list
-
-
- # Backward compatibility alias
- find_consensus = merge
-
-
  def find_ms2(self, **kwargs):
  """
  Links MS2 spectra to consensus features and stores the result in self.consensus_ms2.