masster 0.4.14__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.
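The substantive change in this diff is the removal of the `merge()` consensus-grouping step and its helper functions from this module (the removed lines are shown below). For orientation, here is a minimal usage sketch of the removed API, based only on the docstring visible in the diff; the import path, constructor call, and surrounding method calls are assumptions for illustration, not something this diff documents:

```python
# Hypothetical sketch -- how the removed merge() step was documented to be called.
# Everything outside the keyword arguments is an assumption, not part of the diff.
from masster import Study  # import path assumed

study = Study()            # constructor arguments omitted
study.load_features()      # feature maps must be loaded before grouping
study.align()              # RT alignment, as in the surrounding module
study.merge(
    algorithm="kd",        # 'qt', 'kd', 'unlabeled', or 'sequential'
    min_samples=2,         # minimum samples per consensus feature
    link_ms2=True,         # attach/link MS2 spectra to consensus features
    mz_tol=0.01,           # m/z tolerance for grouping (Da)
    rt_tol=1.0,            # RT tolerance for grouping (seconds)
)
```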

@@ -1,8 +1,6 @@
 from __future__ import annotations
 
 from datetime import datetime
-from collections import defaultdict
-import time
 
 import numpy as np
 import polars as pl
@@ -14,7 +12,6 @@ from masster.study.defaults import (
     align_defaults,
     find_ms2_defaults,
     integrate_defaults,
-    merge_defaults,
 )
 
 
@@ -250,905 +247,6 @@ def align(self, **kwargs):
     self.save_samples()
 
 
-def merge(self, **kwargs):
-    """Group features across samples into consensus features.
-
-    Parameters can be provided as a ``merge_defaults`` instance or as
-    individual keyword arguments; they are validated against the defaults class.
-
-    Key parameters (from ``merge_defaults``):
-    - algorithm (str): Grouping algorithm to use ('qt', 'kd', 'unlabeled', 'sequential').
-    - min_samples (int): Minimum number of samples required for a consensus feature.
-    - link_ms2 (bool): Whether to attach/link MS2 spectra to consensus features.
-    - mz_tol (float): m/z tolerance for grouping (Da).
-    - rt_tol (float): RT tolerance for grouping (seconds).
-    """
-    # Initialize
-    self._reset_consensus_data()
-    self.logger.info("Merging...")
-
-    # Process parameters
-    params = self._process_merge_parameters(**kwargs)
-    algorithm = params.get("algorithm")
-    min_samples = params.get("min_samples")
-    link_ms2 = params.get("link_ms2")
-    mz_tol = kwargs.get("mz_tol", 0.01)
-    rt_tol = kwargs.get("rt_tol", 1.0)
-
-    # Validate and prepare
-    self._validate_merge_inputs(algorithm)
-
-    # Perform feature grouping using OpenMS
-    consensus_map = self._perform_feature_grouping(algorithm, params, mz_tol, rt_tol)
-
-    # Extract consensus features and build metadata
-    self._extract_consensus_features(consensus_map, min_samples)
-
-    # Perform adduct grouping optimization
-    self._perform_adduct_grouping(rt_tol, mz_tol)
-
-    # Complete merge process
-    self._finalize_merge(link_ms2, min_samples)
-
-def _perform_adduct_grouping(self, rt_tol, mz_tol):
-    """Perform adduct grouping on consensus features."""
-    # Add adduct grouping and adduct_of assignment
-    if len(self.consensus_df) > 0:
-        # Get relevant columns for grouping
-        consensus_data = []
-        for row in self.consensus_df.iter_rows(named=True):
-            consensus_data.append(
-                {
-                    "consensus_uid": row["consensus_uid"],
-                    "rt": row["rt"],
-                    "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
-                    "adduct_top": row.get("adduct_top"),
-                    "inty_mean": row.get("inty_mean", 0),
-                },
-            )
-
-        # Use optimized adduct grouping
-        adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
-            self, consensus_data, rt_tol, mz_tol
-        )
-
-        # Add the new columns to consensus_df
-        self.consensus_df = self.consensus_df.with_columns(
-            [
-                pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
-                pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
-            ],
-        )
-
-def _finalize_merge(self, link_ms2, min_samples):
-    """Complete the merge process with final calculations and cleanup."""
-    # Validate min_samples parameter
-    if min_samples is None:
-        min_samples = 1
-    if min_samples < 1:
-        min_samples = int(min_samples * len(self.samples_df))
-
-    # Validate that min_samples doesn't exceed the number of samples
-    if min_samples > len(self.samples_df):
-        self.logger.warning(
-            f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
-            f"Setting min_samples to {len(self.samples_df)}.",
-        )
-        min_samples = len(self.samples_df)
-
-    # Filter out consensus features with less than min_samples features
-    l1 = len(self.consensus_df)
-    self.consensus_df = self.consensus_df.filter(
-        pl.col("number_samples") >= min_samples,
-    )
-    self.logger.debug(
-        f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
-    )
-
-    # Filter out consensus mapping with less than min_samples features
-    self.consensus_mapping_df = self.consensus_mapping_df.filter(
-        pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
-    )
-
-    # Calculate the completeness of the consensus map
-    if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
-        c = (
-            len(self.consensus_mapping_df)
-            / len(self.consensus_df)
-            / len(self.samples_df)
-        )
-        self.logger.info(
-            f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
-        )
-    else:
-        self.logger.warning(
-            f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
-            f"This may be due to min_samples ({min_samples}) being too high for the available data.",
-        )
-
-    if link_ms2:
-        self.find_ms2()
-
-def _reset_consensus_data(self):
-    """Reset consensus-related DataFrames at the start of merge."""
-    self.consensus_df = pl.DataFrame()
-    self.consensus_ms2 = pl.DataFrame()
-    self.consensus_mapping_df = pl.DataFrame()
-
-def _process_merge_parameters(self, **kwargs):
-    """Process and validate merge parameters."""
-    params = merge_defaults()
-    for key, value in kwargs.items():
-        if isinstance(value, merge_defaults):
-            params = value
-            self.logger.debug("Using provided merge_defaults parameters")
-        else:
-            if hasattr(params, key):
-                if params.set(key, value, validate=True):
-                    self.logger.debug(f"Updated parameter {key} = {value}")
-                else:
-                    self.logger.warning(
-                        f"Failed to set parameter {key} = {value} (validation failed)",
-                    )
-            else:
-                self.logger.debug(f"Unknown parameter {key} ignored")
-
-    # Store parameters in the Study object
-    self.store_history(["merge"], params.to_dict())
-    self.logger.debug("Parameters stored to merge")
-    return params
-
-def _validate_merge_inputs(self, algorithm):
-    """Validate merge inputs and provide warnings for performance."""
-    if len(self.samples_df) > 200 and algorithm == "qt":
-        self.logger.warning(
-            "Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].",
-        )
-
-    # Check that features_maps is not empty
-    if not self.features_maps or len(self.features_maps) == 0:
-        self.load_features()
-
-def _perform_feature_grouping(self, algorithm, params, mz_tol, rt_tol):
-    """Perform feature grouping using OpenMS algorithms."""
-    params_oms = oms.Param()
-    ## TODO expose these
-
-    feature_grouper: object  # Use generic type for different OpenMS algorithms
-    match algorithm.lower():
-        case "kd":
-            feature_grouper = oms.FeatureGroupingAlgorithmKD()
-            self.logger.debug("Merging features with KDTree...")
-            params_oms.setValue("mz_unit", "Da")
-            params_oms.setValue("nr_partitions", len(self.samples_df))
-
-            params_oms.setValue("warp:enabled", "true")
-            params_oms.setValue("warp:rt_tol", rt_tol)
-            params_oms.setValue("warp:mz_tol", mz_tol)
-
-            params_oms.setValue("link:rt_tol", rt_tol)
-            params_oms.setValue("link:mz_tol", mz_tol)
-        case "unlabeled":
-            feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
-            self.logger.debug("Merging features with Unlabelled algorithm...")
-            params_oms.setValue("second_nearest_gap", 2.0)
-            params_oms.setValue("ignore_charge", "true")
-            params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
-            params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
-            params_oms.setValue("distance_MZ:unit", "Da")
-        case "sequential":
-            self.logger.debug(
-                "Merging features sequentially with Unlabelled algorithm...",
-            )
-            params_oms.setValue("second_nearest_gap", 2.0)
-            params_oms.setValue("ignore_charge", "true")
-            params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
-            params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
-            params_oms.setValue("distance_MZ:unit", "Da")
-        case "qt":
-            feature_grouper = oms.FeatureGroupingAlgorithmQT()
-            self.logger.debug("Grouping features with QT...")
-            params_oms.setValue("nr_partitions", len(self.samples_df))
-            params_oms.setValue("ignore_charge", "true")
-            params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
-            params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
-            params_oms.setValue("distance_MZ:unit", "Da")
-
-    self.logger.debug(f"Parameters for feature grouping: {params_oms}")
-
-    # Create consensus map and set up file descriptions
-    consensus_map = oms.ConsensusMap()
-    file_descriptions = consensus_map.getColumnHeaders()  # type: ignore
-    feature_maps = self.features_maps
-    for i, feature_map in enumerate(feature_maps):
-        file_description = file_descriptions.get(i, oms.ColumnHeader())
-        file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
-        file_description.size = feature_map.size()
-        file_description.unique_id = feature_map.getUniqueId()
-        file_descriptions[i] = file_description
-
-    consensus_map.setColumnHeaders(file_descriptions)  # type: ignore
-
-    # Execute the grouping algorithm
-    match algorithm.lower():
-        case "sequential":
-            # set the reference map to self.alignment_ref_index
-            if self.alignment_ref_index is None:
-                # pick the feature map with the most features as reference
-                self.alignment_ref_index = max(
-                    range(len(self.features_maps)),
-                    key=lambda i: self.features_maps[i].size(),
-                )
-            feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
-            feature_grouper.setParameters(params_oms)
-            feature_grouper.setReference(self.alignment_ref_index)
-            self.logger.debug(f"Sequential mode: reference map = {self.alignment_ref_index}")
-
-            # Group features sequentially
-            for i in range(len(feature_maps)):
-                if i == self.alignment_ref_index:
-                    continue
-                temp_feature_maps = [feature_maps[self.alignment_ref_index], feature_maps[i]]
-                temp_consensus_map = oms.ConsensusMap()
-                feature_grouper.group(temp_feature_maps, temp_consensus_map)
-                # Merge temp_consensus_map into consensus_map
-                # This is a simplified approach - proper sequential grouping would be more complex
-        case _:
-            feature_grouper.setParameters(params_oms)
-            feature_grouper.group(feature_maps, consensus_map)
-
-    return consensus_map
-
- def _extract_consensus_features(self, consensus_map, min_samples):
503
- """Extract consensus features and build metadata."""
504
- # create a dict to map uid to feature_uid using self.features_df
505
- feature_uid_map = {
506
- row["feature_id"]: row["feature_uid"]
507
- for row in self.features_df.iter_rows(named=True)
508
- }
509
- imax = consensus_map.size()
510
-
511
- self.logger.info(f"Merging completed with {imax} consensus features.")
512
-
513
- # Pre-build fast lookup tables for features_df data using optimized approach
514
- features_lookup = _optimized_feature_lookup(self, self.features_df)
515
-
516
- # create a list to store the consensus mapping
517
- consensus_mapping = []
518
- metadata_list = []
519
-
520
- tqdm_disable = self.log_level not in ["TRACE", "DEBUG"]
521
-
522
- for i, feature in enumerate(
523
- tqdm(
524
- consensus_map,
525
- total=imax,
526
- disable=tqdm_disable,
527
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
528
- ),
529
- ):
530
- # get all features in the feature map with the same unique id as the consensus feature
531
- features_list = feature.getFeatureList()
532
- uids = []
533
- feature_data_list = []
534
-
535
- for _j, f in enumerate(features_list):
536
- fuid = str(f.getUniqueId())
537
- if fuid not in feature_uid_map:
538
- # this is a feature that was removed but is still in the feature maps
539
- continue
540
- fuid = feature_uid_map[fuid]
541
- consensus_mapping.append(
542
- {
543
- "consensus_uid": i,
544
- "sample_uid": f.getMapIndex() + 1,
545
- "feature_uid": fuid,
546
- },
547
- )
548
- uids.append(fuid)
549
-
550
- # Get feature data from lookup instead of DataFrame filtering
551
- feature_data = features_lookup.get(fuid)
552
- if feature_data:
553
- feature_data_list.append(feature_data)
554
-
555
- if not feature_data_list:
556
- # Skip this consensus feature if no valid features found
557
- continue
558
-
559
-        # Compute statistics using vectorized operations on collected data
-        # Convert to numpy arrays for faster computation
-        rt_values = np.array(
-            [fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None],
-        )
-        mz_values = np.array(
-            [fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
-        )
-        rt_start_values = np.array(
-            [
-                fd.get("rt_start", 0)
-                for fd in feature_data_list
-                if fd.get("rt_start") is not None
-            ],
-        )
-        rt_end_values = np.array(
-            [
-                fd.get("rt_end", 0)
-                for fd in feature_data_list
-                if fd.get("rt_end") is not None
-            ],
-        )
-        rt_delta_values = np.array(
-            [
-                fd.get("rt_delta", 0)
-                for fd in feature_data_list
-                if fd.get("rt_delta") is not None
-            ],
-        )
-        mz_start_values = np.array(
-            [
-                fd.get("mz_start", 0)
-                for fd in feature_data_list
-                if fd.get("mz_start") is not None
-            ],
-        )
-        mz_end_values = np.array(
-            [
-                fd.get("mz_end", 0)
-                for fd in feature_data_list
-                if fd.get("mz_end") is not None
-            ],
-        )
-        inty_values = np.array(
-            [
-                fd.get("inty", 0)
-                for fd in feature_data_list
-                if fd.get("inty") is not None
-            ],
-        )
-        coherence_values = np.array(
-            [
-                fd.get("chrom_coherence", 0)
-                for fd in feature_data_list
-                if fd.get("chrom_coherence") is not None
-            ],
-        )
-        prominence_values = np.array(
-            [
-                fd.get("chrom_prominence", 0)
-                for fd in feature_data_list
-                if fd.get("chrom_prominence") is not None
-            ],
-        )
-        prominence_scaled_values = np.array(
-            [
-                fd.get("chrom_prominence_scaled", 0)
-                for fd in feature_data_list
-                if fd.get("chrom_prominence_scaled") is not None
-            ],
-        )
-        height_scaled_values = np.array(
-            [
-                fd.get("chrom_height_scaled", 0)
-                for fd in feature_data_list
-                if fd.get("chrom_height_scaled") is not None
-            ],
-        )
-        iso_values = np.array(
-            [fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
-        )
-        charge_values = np.array(
-            [
-                fd.get("charge", 0)
-                for fd in feature_data_list
-                if fd.get("charge") is not None
-            ],
-        )
-
-        # adduct_values
-        # Collect all adducts from feature_data_list to create consensus adduct information
-        # Only consider adducts that are in study._get_adducts() plus items with '?'
-        all_adducts = []
-        adduct_masses = {}
-
-        # Get valid adducts from study._get_adducts()
-        valid_adducts = set()
-        try:
-            study_adducts_df = self._get_adducts()
-            if not study_adducts_df.is_empty():
-                valid_adducts.update(study_adducts_df["name"].to_list())
-        except Exception as e:
-            self.logger.warning(f"Could not retrieve study adducts: {e}")
-
-        # Always allow '?' adducts
-        valid_adducts.add("?")
-
-        for fd in feature_data_list:
-            # Get individual adduct and mass from each feature data (fd)
-            adduct = fd.get("adduct")
-            adduct_mass = fd.get("adduct_mass")
-
-            if adduct is not None:
-                # Only include adducts that are valid (from study._get_adducts() or contain '?')
-                if adduct in valid_adducts or "?" in adduct:
-                    all_adducts.append(adduct)
-                    if adduct_mass is not None:
-                        adduct_masses[adduct] = adduct_mass
-
-        # Calculate adduct_values for the consensus feature
-        adduct_values = []
-        if all_adducts:
-            adduct_counts = {
-                adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
-            }
-            total_count = sum(adduct_counts.values())
-            for adduct, count in adduct_counts.items():
-                percentage = (count / total_count) * 100 if total_count > 0 else 0
-                # Store as list with [name, num, %] format for the adducts column
-                adduct_values.append(
-                    [
-                        str(adduct),
-                        int(count),
-                        float(round(percentage, 2)),
-                    ],
-                )
-
-        # Sort adduct_values by count in descending order
-        adduct_values.sort(key=lambda x: x[1], reverse=True)  # Sort by count (index 1)
-        # Store adduct_values for use in metadata
-        consensus_adduct_values = adduct_values
-
-        # Extract top adduct information for new columns
-        adduct_top = None
-        adduct_charge_top = None
-        adduct_mass_neutral_top = None
-        adduct_mass_shift_top = None
-
-        if consensus_adduct_values:
-            top_adduct_name = consensus_adduct_values[0][0]  # Get top adduct name
-            adduct_top = top_adduct_name
-
-            # Parse adduct information to extract charge and mass shift
-            # Handle "?" as "H" and parse common adduct formats
-            if top_adduct_name == "?" or top_adduct_name == "[M+?]+":
-                adduct_charge_top = 1
-                adduct_mass_shift_top = 1.007825  # H mass
-            elif top_adduct_name == "[M+?]-":
-                adduct_charge_top = -1
-                adduct_mass_shift_top = -1.007825  # -H mass
-            else:
-                # Try to get charge and mass shift from study._get_adducts()
-                adduct_found = False
-                try:
-                    study_adducts_df = self._get_adducts()
-                    if not study_adducts_df.is_empty():
-                        # Look for exact match in study adducts
-                        matching_adduct = study_adducts_df.filter(
-                            pl.col("name") == top_adduct_name,
-                        )
-                        if not matching_adduct.is_empty():
-                            adduct_row = matching_adduct.row(0, named=True)
-                            adduct_charge_top = adduct_row["charge"]
-                            adduct_mass_shift_top = adduct_row["mass_shift"]
-                            adduct_found = True
-                except Exception as e:
-                    self.logger.warning(
-                        f"Could not lookup adduct in study adducts: {e}",
-                    )
-
-                if not adduct_found:
-                    # Fallback to regex parsing
-                    import re
-
-                    # Pattern for adducts like [M+H]+, [M-H]-, [M+Na]+, etc.
-                    pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
-                    match = re.match(pattern, top_adduct_name)
-
-                    if match:
-                        sign = match.group(1)
-                        element = match.group(2)
-                        multiplier_str = match.group(3)
-                        charge_sign = match.group(4)
-
-                        multiplier = int(multiplier_str) if multiplier_str else 1
-                        charge = multiplier if charge_sign == "+" else -multiplier
-                        adduct_charge_top = charge
-
-                        # Calculate mass shift based on element
-                        element_masses = {
-                            "H": 1.007825,
-                            "Na": 22.989769,
-                            "K": 38.963708,
-                            "NH4": 18.033823,
-                            "Li": 7.016930,
-                            "Cl": 34.969401,
-                            "Br": 78.918885,
-                            "HCOO": 44.998201,
-                            "CH3COO": 59.013851,
-                            "H2O": 18.010565,
-                        }
-
-                        base_mass = element_masses.get(
-                            element,
-                            1.007825,
-                        )  # Default to H if unknown
-                        mass_shift = (
-                            base_mass * multiplier
-                            if sign == "+"
-                            else -base_mass * multiplier
-                        )
-                        adduct_mass_shift_top = mass_shift
-                    else:
-                        # Default fallback
-                        adduct_charge_top = 1
-                        adduct_mass_shift_top = 1.007825
-        else:
-            # No valid adducts found - assign default based on study polarity
-            study_polarity = getattr(self, "polarity", "positive")
-            if study_polarity in ["negative", "neg"]:
-                # Negative mode default
-                adduct_top = "[M-?]1-"
-                adduct_charge_top = -1
-                adduct_mass_shift_top = -1.007825  # -H mass (loss of proton)
-            else:
-                # Positive mode default (includes 'positive', 'pos', or any other value)
-                adduct_top = "[M+?]1+"
-                adduct_charge_top = 1
-                adduct_mass_shift_top = 1.007825  # H mass (gain of proton)
-
-            # Create a single default adduct entry in the adducts list for consistency
-            consensus_adduct_values = [[adduct_top, 1, 100.0]]
-
-        # Calculate neutral mass from consensus mz (for both cases)
-        consensus_mz = (
-            round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
-        )
-        if adduct_charge_top and adduct_mass_shift_top is not None:
-            adduct_mass_neutral_top = (
-                consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
-            )
-
-        # Calculate number of MS2 spectra
-        ms2_count = 0
-        for fd in feature_data_list:
-            ms2_scans = fd.get("ms2_scans")
-            if ms2_scans is not None:
-                ms2_count += len(ms2_scans)
-
-        metadata_list.append(
-            {
-                "consensus_uid": int(i),  # "consensus_id": i,
-                "consensus_id": str(feature.getUniqueId()),
-                "quality": round(float(feature.getQuality()), 3),
-                "number_samples": len(feature_data_list),
-                # "number_ext": int(len(features_list)),
-                "rt": round(float(np.mean(rt_values)), 4)
-                if len(rt_values) > 0
-                else 0.0,
-                "mz": round(float(np.mean(mz_values)), 4)
-                if len(mz_values) > 0
-                else 0.0,
-                "rt_min": round(float(np.min(rt_values)), 3)
-                if len(rt_values) > 0
-                else 0.0,
-                "rt_max": round(float(np.max(rt_values)), 3)
-                if len(rt_values) > 0
-                else 0.0,
-                "rt_mean": round(float(np.mean(rt_values)), 3)
-                if len(rt_values) > 0
-                else 0.0,
-                "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
-                if len(rt_start_values) > 0
-                else 0.0,
-                "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
-                if len(rt_end_values) > 0
-                else 0.0,
-                "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
-                if len(rt_delta_values) > 0
-                else 0.0,
-                "mz_min": round(float(np.min(mz_values)), 4)
-                if len(mz_values) > 0
-                else 0.0,
-                "mz_max": round(float(np.max(mz_values)), 4)
-                if len(mz_values) > 0
-                else 0.0,
-                "mz_mean": round(float(np.mean(mz_values)), 4)
-                if len(mz_values) > 0
-                else 0.0,
-                "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
-                if len(mz_start_values) > 0
-                else 0.0,
-                "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
-                if len(mz_end_values) > 0
-                else 0.0,
-                "inty_mean": round(float(np.mean(inty_values)), 0)
-                if len(inty_values) > 0
-                else 0.0,
-                "bl": -1.0,
-                "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
-                if len(coherence_values) > 0
-                else 0.0,
-                "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0)
-                if len(prominence_values) > 0
-                else 0.0,
-                "chrom_prominence_scaled_mean": round(
-                    float(np.mean(prominence_scaled_values)),
-                    3,
-                )
-                if len(prominence_scaled_values) > 0
-                else 0.0,
-                "chrom_height_scaled_mean": round(
-                    float(np.mean(height_scaled_values)),
-                    3,
-                )
-                if len(height_scaled_values) > 0
-                else 0.0,
-                "iso_mean": round(float(np.mean(iso_values)), 2)
-                if len(iso_values) > 0
-                else 0.0,
-                "charge_mean": round(float(np.mean(charge_values)), 2)
-                if len(charge_values) > 0
-                else 0.0,
-                "number_ms2": int(ms2_count),
-                "adducts": consensus_adduct_values
-                if consensus_adduct_values
-                else [],  # Ensure it's always a list
-                # New columns for top-ranked adduct information
-                "adduct_top": adduct_top,
-                "adduct_charge_top": adduct_charge_top,
-                "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
-                if adduct_mass_neutral_top is not None
-                else None,
-                "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
-                if adduct_mass_shift_top is not None
-                else None,
-                # New columns for top-scoring identification results
-                "id_top_name": None,
-                "id_top_class": None,
-                "id_top_adduct": None,
-                "id_top_score": None,
-            },
-        )
-
-    consensus_mapping_df = pl.DataFrame(consensus_mapping)
-    # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
-    l1 = len(consensus_mapping_df)
-    consensus_mapping_df = consensus_mapping_df.filter(
-        pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
-    )
-    self.logger.debug(
-        f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
-    )
-    self.consensus_mapping_df = consensus_mapping_df
-    self.consensus_df = pl.DataFrame(metadata_list, strict=False)
-
-    if min_samples is None:
-        min_samples = 1
-    if min_samples < 1:
-        min_samples = int(min_samples * len(self.samples_df))
-
-    # Validate that min_samples doesn't exceed the number of samples
-    if min_samples > len(self.samples_df):
-        self.logger.warning(
-            f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
-            f"Setting min_samples to {len(self.samples_df)}.",
-        )
-        min_samples = len(self.samples_df)
-
-    # filter out consensus features with less than min_samples features
-    l1 = len(self.consensus_df)
-    self.consensus_df = self.consensus_df.filter(
-        pl.col("number_samples") >= min_samples,
-    )
-    self.logger.debug(
-        f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
-    )
-    # filter out consensus mapping with less than min_samples features
-    self.consensus_mapping_df = self.consensus_mapping_df.filter(
-        pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
-    )
-
-    self.consensus_map = consensus_map
-
-
-def _optimized_feature_lookup(study_obj, features_df):
-    """
-    Optimized feature lookup creation using Polars operations.
-    """
-    study_obj.logger.debug("Creating optimized feature lookup...")
-    start_time = time.time()
-
-    # Use Polars select for faster conversion
-    feature_columns = [
-        "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
-        "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
-        "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
-        "ms2_scans", "adduct", "adduct_mass"
-    ]
-
-    # Filter to only existing columns
-    existing_columns = [col for col in feature_columns if col in features_df.columns]
-
-    # Convert to dictionary more efficiently
-    selected_df = features_df.select(existing_columns)
-
-    features_lookup = {}
-    for row in selected_df.iter_rows(named=True):
-        feature_uid = row["feature_uid"]
-        features_lookup[feature_uid] = {k: v for k, v in row.items() if k != "feature_uid"}
-
-    lookup_time = time.time() - start_time
-    if len(features_lookup) > 50000:
-        study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
-    return features_lookup
-
-
-def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
-    """
-    Optimized O(n log n) adduct grouping using spatial indexing.
-
-    Args:
-        study_obj: Study object with logger
-        consensus_data: List of consensus feature dictionaries
-        rt_tol: RT tolerance in minutes
-        mz_tol: m/z tolerance in Da
-
-    Returns:
-        Tuple of (adduct_group_list, adduct_of_list)
-    """
-    if not consensus_data:
-        return [], []
-
-    n_features = len(consensus_data)
-    if n_features > 1000:
-        study_obj.logger.info(f"Optimizing adduct grouping for {n_features} consensus features...")
-
-    start_time = time.time()
-
-    # Build spatial index using RT and neutral mass as coordinates
-    features_by_mass = defaultdict(list)
-    mass_bin_size = mz_tol * 2  # 2x tolerance for conservative binning
-
-    valid_features = []
-    for feature in consensus_data:
-        consensus_uid = feature["consensus_uid"]
-        rt = feature["rt"]
-        neutral_mass = feature.get("adduct_mass_neutral_top")
-        intensity = feature.get("inty_mean", 0)
-        adduct = feature.get("adduct_top", "")
-
-        if neutral_mass is not None:
-            mass_bin = int(neutral_mass / mass_bin_size)
-            features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
-            valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
-
-    # Union-Find for efficient grouping
-    class UnionFind:
-        def __init__(self, n):
-            self.parent = list(range(n))
-            self.rank = [0] * n
-
-        def find(self, x):
-            if self.parent[x] != x:
-                self.parent[x] = self.find(self.parent[x])
-            return self.parent[x]
-
-        def union(self, x, y):
-            px, py = self.find(x), self.find(y)
-            if px == py:
-                return
-            if self.rank[px] < self.rank[py]:
-                px, py = py, px
-            self.parent[py] = px
-            if self.rank[px] == self.rank[py]:
-                self.rank[px] += 1
-
-    uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
-    uf = UnionFind(len(valid_features))
-
-    # Find groups using spatial index
-    checked_pairs = set()
-    for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
-        for bin_offset in [-1, 0, 1]:
-            check_bin = bin1 + bin_offset
-            if check_bin not in features_by_mass:
-                continue
-
-            for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
-                if uid1 >= uid2:
-                    continue
-
-                pair = (min(uid1, uid2), max(uid1, uid2))
-                if pair in checked_pairs:
-                    continue
-                checked_pairs.add(pair)
-
-                mass_diff = abs(mass1 - mass2)
-                rt_diff = abs(rt1 - rt2) / 60.0  # Convert to minutes
-
-                if mass_diff <= mz_tol and rt_diff <= rt_tol:
-                    j = uid_to_idx[uid2]
-                    uf.union(i, j)
-
-    # Extract groups
-    groups_by_root = defaultdict(list)
-    for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
-        root = uf.find(i)
-        groups_by_root[root].append((uid, rt, mass, inty, adduct))
-
-    groups = {}
-    group_id = 1
-    assigned_groups = {}
-
-    for group_members in groups_by_root.values():
-        member_uids = [uid for uid, _, _, _, _ in group_members]
-
-        for uid in member_uids:
-            assigned_groups[uid] = group_id
-        groups[group_id] = member_uids
-        group_id += 1
-
-    # Handle features without neutral mass
-    for feature in consensus_data:
-        uid = feature["consensus_uid"]
-        if uid not in assigned_groups:
-            assigned_groups[uid] = group_id
-            groups[group_id] = [uid]
-            group_id += 1
-
-    # Determine adduct_of for each group
-    group_adduct_of = {}
-    for grp_id, member_uids in groups.items():
-        best_uid = None
-        best_priority = -1
-        best_intensity = 0
-
-        for uid in member_uids:
-            feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
-            if not feature_data:
-                continue
-
-            adduct = feature_data.get("adduct_top", "")
-            intensity = feature_data.get("inty_mean", 0)
-
-            priority = 0
-            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
-                priority = 3
-            elif adduct and "[M-H]" in adduct:
-                priority = 2
-            elif adduct and "M" in adduct:
-                priority = 1
-
-            if priority > best_priority or (priority == best_priority and intensity > best_intensity):
-                best_uid = uid
-                best_priority = priority
-                best_intensity = intensity
-
-        group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
-    # Build final lists in same order as consensus_data
-    adduct_group_list = []
-    adduct_of_list = []
-
-    for feature in consensus_data:
-        uid = feature["consensus_uid"]
-        group = assigned_groups.get(uid, 0)
-        adduct_of = group_adduct_of.get(group, uid)
-
-        adduct_group_list.append(group)
-        adduct_of_list.append(adduct_of)
-
-    grouping_time = time.time() - start_time
-    if n_features > 1000:
-        study_obj.logger.info(f"Adduct grouping completed in {grouping_time:.2f}s ({len(groups)} groups)")
-
-    return adduct_group_list, adduct_of_list
-
-
-# Backward compatibility alias
-find_consensus = merge
-
-
 def find_ms2(self, **kwargs):
     """
     Links MS2 spectra to consensus features and stores the result in self.consensus_ms2.