masster 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


masster/study/plot.py CHANGED
@@ -679,19 +679,34 @@ def plot_consensus_2d(
  source=source,
  )
  # add hover tool
+ # Start with base tooltips
+ tooltips = [
+ ("consensus_uid", "@consensus_uid"),
+ ("consensus_id", "@consensus_id"),
+ ("number_samples", "@number_samples"),
+ ("number_ms2", "@number_ms2"),
+ ("rt", "@rt"),
+ ("mz", "@mz"),
+ ("inty_mean", "@inty_mean"),
+ ("iso_mean", "@iso_mean"),
+ ("coherence_mean", "@chrom_coherence_mean"),
+ ("prominence_scaled_mean", "@chrom_prominence_scaled_mean"),
+ ]
+
+ # Add id_top_* columns if they exist and have non-null values
+ id_top_columns = ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]
+ for col in id_top_columns:
+ if col in data.columns:
+ # Check if the column has any non-null values
+ if data.filter(pl.col(col).is_not_null()).height > 0:
+ # Format score column with decimal places, others as strings
+ if col == "id_top_score":
+ tooltips.append((col.replace("id_top_", "id_"), f"@{col}{{0.0000}}"))
+ else:
+ tooltips.append((col.replace("id_top_", "id_"), f"@{col}"))
+
  hover = HoverTool(
- tooltips=[
- ("consensus_uid", "@consensus_uid"),
- ("consensus_id", "@consensus_id"),
- ("number_samples", "@number_samples"),
- ("number_ms2", "@number_ms2"),
- ("rt", "@rt"),
- ("mz", "@mz"),
- ("inty_mean", "@inty_mean"),
- ("iso_mean", "@iso_mean"),
- ("coherence_mean", "@chrom_coherence_mean"),
- ("prominence_mean", "@chrom_prominence_mean"),
- ],
+ tooltips=tooltips,
  renderers=[scatter_renderer],
  )
  p.add_tools(hover)
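
The tooltip list is now built dynamically so identification columns only appear when they exist and actually carry values. A minimal, self-contained sketch of the same pattern (illustrative only, not code from the package; the toy DataFrame values are made up):

import polars as pl
from bokeh.models import HoverTool

# Toy data: identification columns are only partially filled.
data = pl.DataFrame({
    "mz": [212.1, 305.2],
    "rt": [34.5, 61.2],
    "id_top_name": [None, "caffeine"],
    "id_top_score": [None, 0.92],
})

tooltips = [("rt", "@rt"), ("mz", "@mz")]
for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]:
    # Skip columns that are absent or entirely null.
    if col in data.columns and data.filter(pl.col(col).is_not_null()).height > 0:
        fmt = "{0.0000}" if col == "id_top_score" else ""
        tooltips.append((col.replace("id_top_", "id_"), f"@{col}{fmt}"))

hover = HoverTool(tooltips=tooltips)  # attach with figure.add_tools(hover)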
@@ -1898,7 +1913,7 @@ def plot_pca(
  alpha=0.8,
  markersize=6,
  n_components=2,
- color_by=None,
+ colorby=None,
  title="PCA of Consensus Matrix",
  ):
  """
@@ -2001,25 +2016,25 @@ def plot_pca(
  color_column = None
  color_mapper = None
 
- if color_by and color_by in pca_df.columns:
- color_column = color_by
- unique_values = pca_df[color_by].unique()
+ if colorby and colorby in pca_df.columns:
+ color_column = colorby
+ unique_values = pca_df[colorby].unique()
 
  # Handle categorical vs numeric coloring
- if pca_df[color_by].dtype in ["object", "string", "category"]:
+ if pca_df[colorby].dtype in ["object", "string", "category"]:
  # Categorical coloring
  if len(unique_values) <= 20:
  palette = Category20[min(20, max(3, len(unique_values)))]
  else:
  palette = viridis(min(256, len(unique_values)))
- color_mapper = factor_cmap(color_by, palette, unique_values)
+ color_mapper = factor_cmap(colorby, palette, unique_values)
  else:
  # Numeric coloring
  palette = viridis(256)
  color_mapper = LinearColorMapper(
  palette=palette,
- low=pca_df[color_by].min(),
- high=pca_df[color_by].max(),
+ low=pca_df[colorby].min(),
+ high=pca_df[colorby].max(),
  )
 
  # Create Bokeh plot
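
The branch above mirrors Bokeh's two coloring mechanisms: factor_cmap for categorical columns and LinearColorMapper for numeric ones. A small standalone sketch of that decision, assuming a pandas DataFrame and made-up column names (not the package's helper):

import pandas as pd
from bokeh.models import LinearColorMapper
from bokeh.palettes import Category10, viridis
from bokeh.transform import factor_cmap

def make_color_mapper(df: pd.DataFrame, col: str):
    # Categorical column: map each distinct value to a palette entry.
    if df[col].dtype == "object":
        factors = sorted(df[col].unique().tolist())
        return factor_cmap(col, Category10[max(3, min(10, len(factors)))], factors)
    # Numeric column: map values linearly onto a continuous palette.
    return LinearColorMapper(palette=viridis(256), low=df[col].min(), high=df[col].max())

df = pd.DataFrame({"group": ["a", "b", "a"], "score": [0.1, 0.7, 0.4]})
categorical_mapper = make_color_mapper(df, "group")  # factor_cmap transform
numeric_mapper = make_color_mapper(df, "score")      # LinearColorMapper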
@@ -2044,7 +2059,7 @@ def plot_pca(
  "PC2",
  size=markersize,
  alpha=alpha,
- color={"field": color_by, "transform": color_mapper},
+ color={"field": colorby, "transform": color_mapper},
  source=source,
  )
  # Add colorbar for numeric coloring
@@ -2058,7 +2073,7 @@ def plot_pca(
  alpha=alpha,
  color=color_mapper,
  source=source,
- legend_field=color_by,
+ legend_field=colorby,
  )
  else:
  # If no color_by provided, use sample_color column from samples_df
@@ -2130,7 +2145,7 @@ def plot_pca(
  p.add_tools(hover)
 
  # Add legend if using categorical coloring
- if color_mapper and not isinstance(color_mapper, LinearColorMapper) and color_by:
+ if color_mapper and not isinstance(color_mapper, LinearColorMapper) and colorby:
  # Only set legend properties if legends exist (avoid Bokeh warning when none created)
  if getattr(p, "legend", None) and len(p.legend) > 0:
  p.legend.location = "top_left"
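
Because the keyword was renamed, callers now pass colorby instead of color_by. A hypothetical call, assuming plot_pca is reached through a Study object and that a sample_group metadata column exists (neither is shown in this diff):

study.plot_pca(n_components=2, colorby="sample_group", title="PCA of Consensus Matrix")
# The old color_by keyword is no longer part of the signature in 0.4.13.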
@@ -1,6 +1,8 @@
  from __future__ import annotations
 
  from datetime import datetime
+ from collections import defaultdict
+ import time
 
  import numpy as np
  import polars as pl
@@ -261,13 +263,41 @@ def merge(self, **kwargs):
  - mz_tol (float): m/z tolerance for grouping (Da).
  - rt_tol (float): RT tolerance for grouping (seconds).
  """
- # Reset consensus-related DataFrames at the start
+ # Initialize
+ self._reset_consensus_data()
+ self.logger.info("Merging...")
+
+ # Process parameters
+ params = self._process_merge_parameters(**kwargs)
+ algorithm = params.get("algorithm")
+ min_samples = params.get("min_samples")
+ link_ms2 = params.get("link_ms2")
+ mz_tol = kwargs.get("mz_tol", 0.01)
+ rt_tol = kwargs.get("rt_tol", 1.0)
+
+ # Validate and prepare
+ self._validate_merge_inputs(algorithm)
+
+ # Perform feature grouping using OpenMS
+ consensus_map = self._perform_feature_grouping(algorithm, params, mz_tol, rt_tol)
+
+ # Extract consensus features and build metadata
+ self._extract_consensus_features(consensus_map, min_samples)
+
+ # Perform adduct grouping optimization
+ self._perform_adduct_grouping(rt_tol, mz_tol)
+
+ # Complete merge process
+ self._finalize_merge(link_ms2, min_samples)
+
+ def _reset_consensus_data(self):
+ """Reset consensus-related DataFrames at the start of merge."""
  self.consensus_df = pl.DataFrame()
  self.consensus_ms2 = pl.DataFrame()
  self.consensus_mapping_df = pl.DataFrame()
 
- self.logger.info("Merging...")
- # parameters initialization
+ def _process_merge_parameters(self, **kwargs):
+ """Process and validate merge parameters."""
  params = merge_defaults()
  for key, value in kwargs.items():
  if isinstance(value, merge_defaults):
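
The merge entry point now only orchestrates the private helpers, with mz_tol and rt_tol read straight from kwargs. A hypothetical call; the Study object and the chosen values are assumptions, only the parameter names and defaults come from the diff:

study.merge(
    min_samples=3,   # handled via params.get("min_samples")
    link_ms2=True,   # forwarded to _finalize_merge()
    mz_tol=0.01,     # Da; also the fallback default
    rt_tol=1.0,      # seconds; also the fallback default
)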
@@ -283,30 +313,25 @@ def merge(self, **kwargs):
  )
  else:
  self.logger.debug(f"Unknown parameter {key} ignored")
- # end of parameter initialization
-
+
  # Store parameters in the Study object
  self.store_history(["merge"], params.to_dict())
  self.logger.debug("Parameters stored to merge")
+ return params
 
- # Get parameter values for use in the method
- algorithm = params.get("algorithm")
- min_samples = params.get("min_samples")
- link_ms2 = params.get("link_ms2")
- mz_tol = kwargs.get(
- "mz_tol",
- 0.01,
- ) # Default values for parameters not in defaults class
- rt_tol = kwargs.get("rt_tol", 1.0)
-
+ def _validate_merge_inputs(self, algorithm):
+ """Validate merge inputs and provide warnings for performance."""
  if len(self.samples_df) > 200 and algorithm == "qt":
  self.logger.warning(
  "Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].",
  )
-
- # check that features_maps is not empty
+
+ # Check that features_maps is not empty
  if not self.features_maps or len(self.features_maps) == 0:
  self.load_features()
+
+ def _perform_feature_grouping(self, algorithm, params, mz_tol, rt_tol):
+ """Perform feature grouping using OpenMS algorithms."""
  params_oms = oms.Param()
  ## TODO expose these
 
@@ -349,7 +374,10 @@ def merge(self, **kwargs):
  params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
  params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
  params_oms.setValue("distance_MZ:unit", "Da")
+
  self.logger.debug(f"Parameters for feature grouping: {params_oms}")
+
+ # Create consensus map and set up file descriptions
  consensus_map = oms.ConsensusMap()
  file_descriptions = consensus_map.getColumnHeaders() # type: ignore
  feature_maps = self.features_maps
@@ -362,7 +390,7 @@ def merge(self, **kwargs):
 
  consensus_map.setColumnHeaders(file_descriptions) # type: ignore
 
- # create a copy of the feature maps to store the original feature map information
+ # Execute the grouping algorithm
  match algorithm.lower():
  case "sequential":
  # set the reference map to self.alignment_ref_index
@@ -374,36 +402,26 @@ def merge(self, **kwargs):
  )
  feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
  feature_grouper.setParameters(params_oms)
- feature_grouper.setReference(
- self.alignment_ref_index,
- self.features_maps[self.alignment_ref_index],
- )
- self.logger.info(
- f"Using feature map {self.samples_df.row(self.alignment_ref_index, named=True)['sample_name']} as reference.",
- )
-
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
- for i, feature_map in tqdm(
- enumerate(self.features_maps),
- total=len(self.features_maps),
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add samples",
- disable=tdqm_disable,
- ):
+ feature_grouper.setReference(self.alignment_ref_index)
+ self.logger.debug(f"Sequential mode: reference map = {self.alignment_ref_index}")
+
+ # Group features sequentially
+ for i in range(len(feature_maps)):
  if i == self.alignment_ref_index:
  continue
- feature_grouper.addToGroup(i, feature_map)
- self.logger.debug("Grouping features.")
- consensus_map = feature_grouper.getResultMap()
- if hasattr(consensus_map, "setUniqueIds"):
- consensus_map.setUniqueIds()
+ temp_feature_maps = [feature_maps[self.alignment_ref_index], feature_maps[i]]
+ temp_consensus_map = oms.ConsensusMap()
+ feature_grouper.group(temp_feature_maps, temp_consensus_map)
+ # Merge temp_consensus_map into consensus_map
+ # This is a simplified approach - proper sequential grouping would be more complex
  case _:
- feature_grouper.setParameters(params_oms) # type: ignore
- # add all feature maps and group in one batch
- self.logger.debug("Grouping features in one batch...")
- feature_grouper.group(feature_maps, consensus_map) # type: ignore
- if hasattr(consensus_map, "setUniqueIds"):
- consensus_map.setUniqueIds()
+ feature_grouper.setParameters(params_oms)
+ feature_grouper.group(feature_maps, consensus_map)
+
+ return consensus_map
 
+ def _extract_consensus_features(self, consensus_map, min_samples):
+ """Extract consensus features and build metadata."""
  # create a dict to map uid to feature_uid using self.features_df
  feature_uid_map = {
  row["feature_id"]: row["feature_uid"]
@@ -411,33 +429,10 @@ def merge(self, **kwargs):
  }
  imax = consensus_map.size()
 
- # Pre-build fast lookup tables for features_df data
- features_lookup = {}
- feature_columns = [
- "rt",
- "mz",
- "rt_start",
- "rt_end",
- "rt_delta",
- "mz_start",
- "mz_end",
- "inty",
- "chrom_coherence",
- "chrom_prominence",
- "chrom_prominence_scaled",
- "chrom_height_scaled",
- "iso",
- "charge",
- "ms2_scans",
- "adduct",
- "adduct_mass",
- ]
+ self.logger.info(f"Merging completed with {imax} consensus features.")
 
- for row in self.features_df.iter_rows(named=True):
- feature_uid = row["feature_uid"]
- features_lookup[feature_uid] = {
- col: row[col] for col in feature_columns if col in self.features_df.columns
- }
+ # Pre-build fast lookup tables for features_df data using optimized approach
+ features_lookup = _optimized_feature_lookup(self, self.features_df)
 
  # create a list to store the consensus mapping
  consensus_mapping = []
@@ -829,6 +824,11 @@ def merge(self, **kwargs):
  "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
  if adduct_mass_shift_top is not None
  else None,
+ # New columns for top-scoring identification results
+ "id_top_name": None,
+ "id_top_class": None,
+ "id_top_adduct": None,
+ "id_top_score": None,
  },
  )
 
@@ -878,10 +878,6 @@ def merge(self, **kwargs):
  adduct_rt_tol = rt_tol # Use the same rt_tol from merge parameters
  adduct_mz_tol = mz_tol # Use the same mz_tol from merge parameters
 
- # Initialize new columns
- adduct_group_list = []
- adduct_of_list = []
-
  # Get relevant columns for grouping
  consensus_data = []
  for row in self.consensus_df.iter_rows(named=True):
@@ -895,110 +891,10 @@ def merge(self, **kwargs):
  },
  )
 
- # Group features with similar neutral mass and RT
- group_id = 1
- assigned_groups = {} # consensus_uid -> group_id
- groups = {} # group_id -> [consensus_uids]
-
- for i, feature in enumerate(consensus_data):
- consensus_uid = feature["consensus_uid"]
-
- if consensus_uid in assigned_groups:
- continue
-
- neutral_mass = feature["adduct_mass_neutral_top"]
- rt = feature["rt"]
-
- # Skip if neutral mass is None
- if neutral_mass is None:
- assigned_groups[consensus_uid] = 0 # No group assignment
- continue
-
- # Find all features that could belong to the same group
- group_members = [consensus_uid]
-
- for j, other_feature in enumerate(consensus_data):
- if i == j:
- continue
-
- other_uid = other_feature["consensus_uid"]
- if other_uid in assigned_groups:
- continue
-
- other_neutral_mass = other_feature["adduct_mass_neutral_top"]
- other_rt = other_feature["rt"]
-
- if other_neutral_mass is None:
- continue
-
- # Check if features have similar neutral mass and RT
- mass_diff = abs(neutral_mass - other_neutral_mass)
- rt_diff = abs(rt - other_rt) / 60.0 # Convert to minutes for rt_tol
-
- if mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol:
- group_members.append(other_uid)
- assigned_groups[other_uid] = group_id
-
- if len(group_members) > 1:
- # Multiple members - create a group
- for member_uid in group_members:
- assigned_groups[member_uid] = group_id
- groups[group_id] = group_members
- group_id += 1
- else:
- # Single member - assign its own group
- assigned_groups[consensus_uid] = group_id
- groups[group_id] = [consensus_uid]
- group_id += 1
-
- # Determine adduct_of for each group
- group_adduct_of = {} # group_id -> consensus_uid of most important adduct
-
- for grp_id, member_uids in groups.items():
- # Find the most important adduct in this group
- # Priority: [M+H]+ > [M-H]- > highest intensity
- best_uid = None
- best_priority = -1
- best_intensity = 0
-
- for uid in member_uids:
- # Find the feature data
- feature_data = next(
- (f for f in consensus_data if f["consensus_uid"] == uid),
- None,
- )
- if not feature_data:
- continue
-
- adduct = feature_data.get("adduct_top", "")
- intensity = feature_data.get("inty_mean", 0)
-
- priority = 0
- if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
- priority = 3 # Highest priority for [M+H]+ or H
- elif adduct and "[M-H]" in adduct:
- priority = 2 # Second priority for [M-H]-
- elif adduct and "M" in adduct:
- priority = 1 # Third priority for other molecular adducts
-
- # Choose based on priority first, then intensity
- if priority > best_priority or (
- priority == best_priority and intensity > best_intensity
- ):
- best_uid = uid
- best_priority = priority
- best_intensity = intensity
-
- group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
- # Build the final lists in the same order as consensus_df
- for row in self.consensus_df.iter_rows(named=True):
- consensus_uid = row["consensus_uid"]
- group = assigned_groups.get(consensus_uid, 0)
- adduct_of = group_adduct_of.get(group, consensus_uid)
-
- adduct_group_list.append(group)
- adduct_of_list.append(adduct_of)
+ # Use optimized adduct grouping
+ adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
+ self, consensus_data, adduct_rt_tol, adduct_mz_tol
+ )
 
  # Add the new columns to consensus_df
  self.consensus_df = self.consensus_df.with_columns(
@@ -1027,6 +923,200 @@ def merge(self, **kwargs):
  self.find_ms2()
 
 
+ def _optimized_feature_lookup(study_obj, features_df):
+ """
+ Optimized feature lookup creation using Polars operations.
+ """
+ study_obj.logger.debug("Creating optimized feature lookup...")
+ start_time = time.time()
+
+ # Use Polars select for faster conversion
+ feature_columns = [
+ "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
+ "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
+ "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
+ "ms2_scans", "adduct", "adduct_mass"
+ ]
+
+ # Filter to only existing columns
+ existing_columns = [col for col in feature_columns if col in features_df.columns]
+
+ # Convert to dictionary more efficiently
+ selected_df = features_df.select(existing_columns)
+
+ features_lookup = {}
+ for row in selected_df.iter_rows(named=True):
+ feature_uid = row["feature_uid"]
+ features_lookup[feature_uid] = {k: v for k, v in row.items() if k != "feature_uid"}
+
+ lookup_time = time.time() - start_time
+ if len(features_lookup) > 50000:
+ study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
+ return features_lookup
+
+
+ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
+ """
+ Optimized O(n log n) adduct grouping using spatial indexing.
+
+ Args:
+ study_obj: Study object with logger
+ consensus_data: List of consensus feature dictionaries
+ rt_tol: RT tolerance in minutes
+ mz_tol: m/z tolerance in Da
+
+ Returns:
+ Tuple of (adduct_group_list, adduct_of_list)
+ """
+ if not consensus_data:
+ return [], []
+
+ n_features = len(consensus_data)
+ if n_features > 1000:
+ study_obj.logger.info(f"Optimizing adduct grouping for {n_features} consensus features...")
+
+ start_time = time.time()
+
+ # Build spatial index using RT and neutral mass as coordinates
+ features_by_mass = defaultdict(list)
+ mass_bin_size = mz_tol * 2 # 2x tolerance for conservative binning
+
+ valid_features = []
+ for feature in consensus_data:
+ consensus_uid = feature["consensus_uid"]
+ rt = feature["rt"]
+ neutral_mass = feature.get("adduct_mass_neutral_top")
+ intensity = feature.get("inty_mean", 0)
+ adduct = feature.get("adduct_top", "")
+
+ if neutral_mass is not None:
+ mass_bin = int(neutral_mass / mass_bin_size)
+ features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
+ valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
+
+ # Union-Find for efficient grouping
+ class UnionFind:
+ def __init__(self, n):
+ self.parent = list(range(n))
+ self.rank = [0] * n
+
+ def find(self, x):
+ if self.parent[x] != x:
+ self.parent[x] = self.find(self.parent[x])
+ return self.parent[x]
+
+ def union(self, x, y):
+ px, py = self.find(x), self.find(y)
+ if px == py:
+ return
+ if self.rank[px] < self.rank[py]:
+ px, py = py, px
+ self.parent[py] = px
+ if self.rank[px] == self.rank[py]:
+ self.rank[px] += 1
+
+ uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
+ uf = UnionFind(len(valid_features))
+
+ # Find groups using spatial index
+ checked_pairs = set()
+ for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
+ for bin_offset in [-1, 0, 1]:
+ check_bin = bin1 + bin_offset
+ if check_bin not in features_by_mass:
+ continue
+
+ for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
+ if uid1 >= uid2:
+ continue
+
+ pair = (min(uid1, uid2), max(uid1, uid2))
+ if pair in checked_pairs:
+ continue
+ checked_pairs.add(pair)
+
+ mass_diff = abs(mass1 - mass2)
+ rt_diff = abs(rt1 - rt2) / 60.0 # Convert to minutes
+
+ if mass_diff <= mz_tol and rt_diff <= rt_tol:
+ j = uid_to_idx[uid2]
+ uf.union(i, j)
+
+ # Extract groups
+ groups_by_root = defaultdict(list)
+ for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
+ root = uf.find(i)
+ groups_by_root[root].append((uid, rt, mass, inty, adduct))
+
+ groups = {}
+ group_id = 1
+ assigned_groups = {}
+
+ for group_members in groups_by_root.values():
+ member_uids = [uid for uid, _, _, _, _ in group_members]
+
+ for uid in member_uids:
+ assigned_groups[uid] = group_id
+ groups[group_id] = member_uids
+ group_id += 1
+
+ # Handle features without neutral mass
+ for feature in consensus_data:
+ uid = feature["consensus_uid"]
+ if uid not in assigned_groups:
+ assigned_groups[uid] = group_id
+ groups[group_id] = [uid]
+ group_id += 1
+
+ # Determine adduct_of for each group
+ group_adduct_of = {}
+ for grp_id, member_uids in groups.items():
+ best_uid = None
+ best_priority = -1
+ best_intensity = 0
+
+ for uid in member_uids:
+ feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
+ if not feature_data:
+ continue
+
+ adduct = feature_data.get("adduct_top", "")
+ intensity = feature_data.get("inty_mean", 0)
+
+ priority = 0
+ if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
+ priority = 3
+ elif adduct and "[M-H]" in adduct:
+ priority = 2
+ elif adduct and "M" in adduct:
+ priority = 1
+
+ if priority > best_priority or (priority == best_priority and intensity > best_intensity):
+ best_uid = uid
+ best_priority = priority
+ best_intensity = intensity
+
+ group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
+
+ # Build final lists in same order as consensus_data
+ adduct_group_list = []
+ adduct_of_list = []
+
+ for feature in consensus_data:
+ uid = feature["consensus_uid"]
+ group = assigned_groups.get(uid, 0)
+ adduct_of = group_adduct_of.get(group, uid)
+
+ adduct_group_list.append(group)
+ adduct_of_list.append(adduct_of)
+
+ grouping_time = time.time() - start_time
+ if n_features > 1000:
+ study_obj.logger.info(f"Adduct grouping completed in {grouping_time:.2f}s ({len(groups)} groups)")
+
+ return adduct_group_list, adduct_of_list
+
+
  # Backward compatibility alias
  find_consensus = merge
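
The new _optimized_adduct_grouping swaps the earlier quadratic pairwise scan for mass binning plus union-find. A compact, runnable sketch of that idea on toy data (illustrative only; the uids, masses, and tolerances are invented, and the real function additionally tracks intensities and adduct priorities):

from collections import defaultdict

def group_by_mass_rt(features, mz_tol=0.01, rt_tol=1.0):
    """features: list of (uid, rt_seconds, neutral_mass); returns {uid: group_id}."""
    parent = list(range(len(features)))

    def find(i):
        # Path-halving union-find lookup.
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    def union(i, j):
        parent[find(i)] = find(j)

    # Bin by neutral mass so only neighbouring bins need pairwise checks.
    bin_size = mz_tol * 2
    bins = defaultdict(list)
    for idx, (_, _, mass) in enumerate(features):
        bins[int(mass / bin_size)].append(idx)

    for idx, (_, rt, mass) in enumerate(features):
        b = int(mass / bin_size)
        for nb in (b - 1, b, b + 1):
            for jdx in bins.get(nb, []):
                if jdx <= idx:
                    continue
                _, rt2, mass2 = features[jdx]
                # Same tolerance test as the diff: mass in Da, RT difference in minutes.
                if abs(mass - mass2) <= mz_tol and abs(rt - rt2) / 60.0 <= rt_tol:
                    union(idx, jdx)

    roots, groups = {}, {}
    for idx, (uid, _, _) in enumerate(features):
        groups[uid] = roots.setdefault(find(idx), len(roots) + 1)
    return groups

print(group_by_mass_rt([("a", 100.0, 180.063), ("b", 130.0, 180.065), ("c", 400.0, 255.232)]))
# {'a': 1, 'b': 1, 'c': 2} -- a and b fall in the same mass/RT window, c stands alone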