masster 0.4.11-py3-none-any.whl → 0.4.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of masster might be problematic.
- masster/_version.py +1 -1
- masster/lib/lib.py +45 -3
- masster/study/helpers.py +262 -310
- masster/study/id.py +564 -324
- masster/study/plot.py +38 -23
- masster/study/processing.py +268 -178
- masster/study/study.py +95 -60
- masster/study/study5_schema.json +12 -0
- {masster-0.4.11.dist-info → masster-0.4.13.dist-info}/METADATA +1 -1
- {masster-0.4.11.dist-info → masster-0.4.13.dist-info}/RECORD +13 -13
- {masster-0.4.11.dist-info → masster-0.4.13.dist-info}/WHEEL +0 -0
- {masster-0.4.11.dist-info → masster-0.4.13.dist-info}/entry_points.txt +0 -0
- {masster-0.4.11.dist-info → masster-0.4.13.dist-info}/licenses/LICENSE +0 -0
masster/study/plot.py
CHANGED
@@ -679,19 +679,34 @@ def plot_consensus_2d(
         source=source,
     )
     # add hover tool
+    # Start with base tooltips
+    tooltips = [
+        ("consensus_uid", "@consensus_uid"),
+        ("consensus_id", "@consensus_id"),
+        ("number_samples", "@number_samples"),
+        ("number_ms2", "@number_ms2"),
+        ("rt", "@rt"),
+        ("mz", "@mz"),
+        ("inty_mean", "@inty_mean"),
+        ("iso_mean", "@iso_mean"),
+        ("coherence_mean", "@chrom_coherence_mean"),
+        ("prominence_scaled_mean", "@chrom_prominence_scaled_mean"),
+    ]
+
+    # Add id_top_* columns if they exist and have non-null values
+    id_top_columns = ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]
+    for col in id_top_columns:
+        if col in data.columns:
+            # Check if the column has any non-null values
+            if data.filter(pl.col(col).is_not_null()).height > 0:
+                # Format score column with decimal places, others as strings
+                if col == "id_top_score":
+                    tooltips.append((col.replace("id_top_", "id_"), f"@{col}{{0.0000}}"))
+                else:
+                    tooltips.append((col.replace("id_top_", "id_"), f"@{col}"))
+
     hover = HoverTool(
-        tooltips=
-            ("consensus_uid", "@consensus_uid"),
-            ("consensus_id", "@consensus_id"),
-            ("number_samples", "@number_samples"),
-            ("number_ms2", "@number_ms2"),
-            ("rt", "@rt"),
-            ("mz", "@mz"),
-            ("inty_mean", "@inty_mean"),
-            ("iso_mean", "@iso_mean"),
-            ("coherence_mean", "@chrom_coherence_mean"),
-            ("prominence_mean", "@chrom_prominence_mean"),
-        ],
+        tooltips=tooltips,
         renderers=[scatter_renderer],
     )
     p.add_tools(hover)
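The tooltip hunk above assembles the hover box dynamically, so the id_top_* columns only appear once they actually contain values. A minimal, self-contained sketch of the same pattern on toy data (values and the ColumnDataSource wiring are invented for illustration; only the column-check idiom mirrors the diff):

import polars as pl
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure

# Toy stand-in for the consensus table; only "id_top_name" has any values,
# so "id_top_score" should stay out of the hover box.
data = pl.DataFrame({
    "mz": [101.1, 202.2],
    "rt": [12.0, 34.0],
    "id_top_name": ["glucose", None],
    "id_top_score": [None, None],
})

tooltips = [("mz", "@mz"), ("rt", "@rt")]
for col in ["id_top_name", "id_top_score"]:
    # Advertise a column only if it exists and has at least one non-null value.
    if col in data.columns and data.filter(pl.col(col).is_not_null()).height > 0:
        tooltips.append((col.replace("id_top_", "id_"), f"@{col}"))

p = figure()
renderer = p.scatter("mz", "rt", source=ColumnDataSource(data.to_dict(as_series=False)))
p.add_tools(HoverTool(tooltips=tooltips, renderers=[renderer]))
print(tooltips)  # [('mz', '@mz'), ('rt', '@rt'), ('id_name', '@id_top_name')]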
@@ -1898,7 +1913,7 @@ def plot_pca(
     alpha=0.8,
     markersize=6,
     n_components=2,
-
+    colorby=None,
     title="PCA of Consensus Matrix",
 ):
     """
@@ -2001,25 +2016,25 @@ def plot_pca(
     color_column = None
     color_mapper = None

-    if
-        color_column =
-        unique_values = pca_df[
+    if colorby and colorby in pca_df.columns:
+        color_column = colorby
+        unique_values = pca_df[colorby].unique()

         # Handle categorical vs numeric coloring
-        if pca_df[
+        if pca_df[colorby].dtype in ["object", "string", "category"]:
             # Categorical coloring
             if len(unique_values) <= 20:
                 palette = Category20[min(20, max(3, len(unique_values)))]
             else:
                 palette = viridis(min(256, len(unique_values)))
-            color_mapper = factor_cmap(
+            color_mapper = factor_cmap(colorby, palette, unique_values)
         else:
             # Numeric coloring
             palette = viridis(256)
             color_mapper = LinearColorMapper(
                 palette=palette,
-                low=pca_df[
-                high=pca_df[
+                low=pca_df[colorby].min(),
+                high=pca_df[colorby].max(),
             )

     # Create Bokeh plot
@@ -2044,7 +2059,7 @@ def plot_pca(
             "PC2",
             size=markersize,
             alpha=alpha,
-            color={"field":
+            color={"field": colorby, "transform": color_mapper},
             source=source,
         )
         # Add colorbar for numeric coloring
@@ -2058,7 +2073,7 @@ def plot_pca(
             alpha=alpha,
             color=color_mapper,
             source=source,
-            legend_field=
+            legend_field=colorby,
         )
     else:
         # If no color_by provided, use sample_color column from samples_df
@@ -2130,7 +2145,7 @@ def plot_pca(
     p.add_tools(hover)

     # Add legend if using categorical coloring
-    if color_mapper and not isinstance(color_mapper, LinearColorMapper) and
+    if color_mapper and not isinstance(color_mapper, LinearColorMapper) and colorby:
         # Only set legend properties if legends exist (avoid Bokeh warning when none created)
         if getattr(p, "legend", None) and len(p.legend) > 0:
             p.legend.location = "top_left"
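The plot_pca hunks route the new colorby argument through Bokeh's two colour-mapping paths: factor_cmap for categorical columns and LinearColorMapper for numeric ones. A standalone sketch of that split, using a made-up metadata table (column names and the Polars dtype check are illustrative, not masster's own code, which tests the dataframe's dtype strings):

import polars as pl
from bokeh.models import LinearColorMapper
from bokeh.palettes import Category20, viridis
from bokeh.transform import factor_cmap

def pick_color_mapper(df: pl.DataFrame, colorby: str):
    """Mirror the categorical-vs-numeric decision made in plot_pca (simplified)."""
    if df[colorby].dtype in (pl.Utf8, pl.Categorical):
        factors = df[colorby].unique().to_list()
        palette = (Category20[min(20, max(3, len(factors)))]
                   if len(factors) <= 20 else viridis(min(256, len(factors))))
        return factor_cmap(colorby, palette, factors)        # discrete: one colour per factor
    return LinearColorMapper(palette=viridis(256),            # continuous ramp for numbers
                             low=df[colorby].min(), high=df[colorby].max())

samples = pl.DataFrame({"group": ["QC", "blank", "case"], "batch": [1, 2, 2]})
pick_color_mapper(samples, "group")   # -> factor_cmap over three factors
pick_color_mapper(samples, "batch")   # -> LinearColorMapper spanning 1..2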
masster/study/processing.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations

 from datetime import datetime
+from collections import defaultdict
+import time

 import numpy as np
 import polars as pl
@@ -261,13 +263,41 @@ def merge(self, **kwargs):
         - mz_tol (float): m/z tolerance for grouping (Da).
         - rt_tol (float): RT tolerance for grouping (seconds).
     """
-    #
+    # Initialize
+    self._reset_consensus_data()
+    self.logger.info("Merging...")
+
+    # Process parameters
+    params = self._process_merge_parameters(**kwargs)
+    algorithm = params.get("algorithm")
+    min_samples = params.get("min_samples")
+    link_ms2 = params.get("link_ms2")
+    mz_tol = kwargs.get("mz_tol", 0.01)
+    rt_tol = kwargs.get("rt_tol", 1.0)
+
+    # Validate and prepare
+    self._validate_merge_inputs(algorithm)
+
+    # Perform feature grouping using OpenMS
+    consensus_map = self._perform_feature_grouping(algorithm, params, mz_tol, rt_tol)
+
+    # Extract consensus features and build metadata
+    self._extract_consensus_features(consensus_map, min_samples)
+
+    # Perform adduct grouping optimization
+    self._perform_adduct_grouping(rt_tol, mz_tol)
+
+    # Complete merge process
+    self._finalize_merge(link_ms2, min_samples)
+
+def _reset_consensus_data(self):
+    """Reset consensus-related DataFrames at the start of merge."""
     self.consensus_df = pl.DataFrame()
     self.consensus_ms2 = pl.DataFrame()
     self.consensus_mapping_df = pl.DataFrame()

-
-
+def _process_merge_parameters(self, **kwargs):
+    """Process and validate merge parameters."""
     params = merge_defaults()
     for key, value in kwargs.items():
         if isinstance(value, merge_defaults):
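In the refactor above, merge() hands parameter handling to _process_merge_parameters, which overlays user kwargs onto a merge_defaults instance, logs unknown keys, and returns the params object, while mz_tol and rt_tol are still read straight from kwargs with fallbacks of 0.01 Da and 1.0 s. A toy sketch of that overlay pattern (merge_defaults itself is not part of this diff, so a hypothetical dataclass stands in for it, and the default values shown are assumptions):

from dataclasses import dataclass, asdict

@dataclass
class ToyMergeDefaults:           # hypothetical stand-in for masster's merge_defaults
    algorithm: str = "kdtree"     # assumed default; the diff only warns explicitly about "qt"
    min_samples: int = 1
    link_ms2: bool = True

def process_parameters(**kwargs) -> dict:
    """Overlay user kwargs onto the defaults, ignoring keys the defaults don't know."""
    params = asdict(ToyMergeDefaults())
    for key, value in kwargs.items():
        if key in params:
            params[key] = value
        else:
            print(f"Unknown parameter {key} ignored")  # merge() reads mz_tol/rt_tol separately
    return params

print(process_parameters(min_samples=3, mz_tol=0.005))
# prints "Unknown parameter mz_tol ignored", then
# {'algorithm': 'kdtree', 'min_samples': 3, 'link_ms2': True}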
@@ -283,30 +313,25 @@ def merge(self, **kwargs):
             )
         else:
             self.logger.debug(f"Unknown parameter {key} ignored")
-
-
+
     # Store parameters in the Study object
     self.store_history(["merge"], params.to_dict())
     self.logger.debug("Parameters stored to merge")
+    return params

-
-
-    min_samples = params.get("min_samples")
-    link_ms2 = params.get("link_ms2")
-    mz_tol = kwargs.get(
-        "mz_tol",
-        0.01,
-    )  # Default values for parameters not in defaults class
-    rt_tol = kwargs.get("rt_tol", 1.0)
-
+def _validate_merge_inputs(self, algorithm):
+    """Validate merge inputs and provide warnings for performance."""
     if len(self.samples_df) > 200 and algorithm == "qt":
         self.logger.warning(
             "Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].",
         )
-
-    #
+
+    # Check that features_maps is not empty
     if not self.features_maps or len(self.features_maps) == 0:
         self.load_features()
+
+def _perform_feature_grouping(self, algorithm, params, mz_tol, rt_tol):
+    """Perform feature grouping using OpenMS algorithms."""
     params_oms = oms.Param()
     ## TODO expose these

@@ -349,7 +374,10 @@ def merge(self, **kwargs):
     params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
     params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
     params_oms.setValue("distance_MZ:unit", "Da")
+
     self.logger.debug(f"Parameters for feature grouping: {params_oms}")
+
+    # Create consensus map and set up file descriptions
     consensus_map = oms.ConsensusMap()
     file_descriptions = consensus_map.getColumnHeaders()  # type: ignore
     feature_maps = self.features_maps
@@ -362,7 +390,7 @@ def merge(self, **kwargs):

     consensus_map.setColumnHeaders(file_descriptions)  # type: ignore

-    #
+    # Execute the grouping algorithm
     match algorithm.lower():
         case "sequential":
             # set the reference map to self.alignment_ref_index
@@ -374,36 +402,26 @@ def merge(self, **kwargs):
             )
             feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
             feature_grouper.setParameters(params_oms)
-            feature_grouper.setReference(
-
-
-
-
-                f"Using feature map {self.samples_df.row(self.alignment_ref_index, named=True)['sample_name']} as reference.",
-            )
-
-            tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-            for i, feature_map in tqdm(
-                enumerate(self.features_maps),
-                total=len(self.features_maps),
-                desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add samples",
-                disable=tdqm_disable,
-            ):
+            feature_grouper.setReference(self.alignment_ref_index)
+            self.logger.debug(f"Sequential mode: reference map = {self.alignment_ref_index}")
+
+            # Group features sequentially
+            for i in range(len(feature_maps)):
                 if i == self.alignment_ref_index:
                     continue
-
-
-
-
-
+                temp_feature_maps = [feature_maps[self.alignment_ref_index], feature_maps[i]]
+                temp_consensus_map = oms.ConsensusMap()
+                feature_grouper.group(temp_feature_maps, temp_consensus_map)
+                # Merge temp_consensus_map into consensus_map
+                # This is a simplified approach - proper sequential grouping would be more complex
         case _:
-            feature_grouper.setParameters(params_oms)
-
-
-
-            if hasattr(consensus_map, "setUniqueIds"):
-                consensus_map.setUniqueIds()
+            feature_grouper.setParameters(params_oms)
+            feature_grouper.group(feature_maps, consensus_map)
+
+    return consensus_map

+def _extract_consensus_features(self, consensus_map, min_samples):
+    """Extract consensus features and build metadata."""
     # create a dict to map uid to feature_uid using self.features_df
     feature_uid_map = {
         row["feature_id"]: row["feature_uid"]
@@ -411,33 +429,10 @@ def merge(self, **kwargs):
     }
     imax = consensus_map.size()

-
-    features_lookup = {}
-    feature_columns = [
-        "rt",
-        "mz",
-        "rt_start",
-        "rt_end",
-        "rt_delta",
-        "mz_start",
-        "mz_end",
-        "inty",
-        "chrom_coherence",
-        "chrom_prominence",
-        "chrom_prominence_scaled",
-        "chrom_height_scaled",
-        "iso",
-        "charge",
-        "ms2_scans",
-        "adduct",
-        "adduct_mass",
-    ]
+    self.logger.info(f"Merging completed with {imax} consensus features.")

-    for
-
-        features_lookup[feature_uid] = {
-            col: row[col] for col in feature_columns if col in self.features_df.columns
-        }
+    # Pre-build fast lookup tables for features_df data using optimized approach
+    features_lookup = _optimized_feature_lookup(self, self.features_df)

     # create a list to store the consensus mapping
     consensus_mapping = []
@@ -829,6 +824,11 @@ def merge(self, **kwargs):
                 "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
                 if adduct_mass_shift_top is not None
                 else None,
+                # New columns for top-scoring identification results
+                "id_top_name": None,
+                "id_top_class": None,
+                "id_top_adduct": None,
+                "id_top_score": None,
             },
         )

@@ -878,10 +878,6 @@ def merge(self, **kwargs):
     adduct_rt_tol = rt_tol  # Use the same rt_tol from merge parameters
     adduct_mz_tol = mz_tol  # Use the same mz_tol from merge parameters

-    # Initialize new columns
-    adduct_group_list = []
-    adduct_of_list = []
-
     # Get relevant columns for grouping
     consensus_data = []
     for row in self.consensus_df.iter_rows(named=True):
@@ -895,110 +891,10 @@ def merge(self, **kwargs):
             },
         )

-    #
-
-
-
-
-    for i, feature in enumerate(consensus_data):
-        consensus_uid = feature["consensus_uid"]
-
-        if consensus_uid in assigned_groups:
-            continue
-
-        neutral_mass = feature["adduct_mass_neutral_top"]
-        rt = feature["rt"]
-
-        # Skip if neutral mass is None
-        if neutral_mass is None:
-            assigned_groups[consensus_uid] = 0  # No group assignment
-            continue
-
-        # Find all features that could belong to the same group
-        group_members = [consensus_uid]
-
-        for j, other_feature in enumerate(consensus_data):
-            if i == j:
-                continue
-
-            other_uid = other_feature["consensus_uid"]
-            if other_uid in assigned_groups:
-                continue
-
-            other_neutral_mass = other_feature["adduct_mass_neutral_top"]
-            other_rt = other_feature["rt"]
-
-            if other_neutral_mass is None:
-                continue
-
-            # Check if features have similar neutral mass and RT
-            mass_diff = abs(neutral_mass - other_neutral_mass)
-            rt_diff = abs(rt - other_rt) / 60.0  # Convert to minutes for rt_tol
-
-            if mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol:
-                group_members.append(other_uid)
-                assigned_groups[other_uid] = group_id
-
-        if len(group_members) > 1:
-            # Multiple members - create a group
-            for member_uid in group_members:
-                assigned_groups[member_uid] = group_id
-            groups[group_id] = group_members
-            group_id += 1
-        else:
-            # Single member - assign its own group
-            assigned_groups[consensus_uid] = group_id
-            groups[group_id] = [consensus_uid]
-            group_id += 1
-
-    # Determine adduct_of for each group
-    group_adduct_of = {}  # group_id -> consensus_uid of most important adduct
-
-    for grp_id, member_uids in groups.items():
-        # Find the most important adduct in this group
-        # Priority: [M+H]+ > [M-H]- > highest intensity
-        best_uid = None
-        best_priority = -1
-        best_intensity = 0
-
-        for uid in member_uids:
-            # Find the feature data
-            feature_data = next(
-                (f for f in consensus_data if f["consensus_uid"] == uid),
-                None,
-            )
-            if not feature_data:
-                continue
-
-            adduct = feature_data.get("adduct_top", "")
-            intensity = feature_data.get("inty_mean", 0)
-
-            priority = 0
-            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
-                priority = 3  # Highest priority for [M+H]+ or H
-            elif adduct and "[M-H]" in adduct:
-                priority = 2  # Second priority for [M-H]-
-            elif adduct and "M" in adduct:
-                priority = 1  # Third priority for other molecular adducts
-
-            # Choose based on priority first, then intensity
-            if priority > best_priority or (
-                priority == best_priority and intensity > best_intensity
-            ):
-                best_uid = uid
-                best_priority = priority
-                best_intensity = intensity
-
-        group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
-    # Build the final lists in the same order as consensus_df
-    for row in self.consensus_df.iter_rows(named=True):
-        consensus_uid = row["consensus_uid"]
-        group = assigned_groups.get(consensus_uid, 0)
-        adduct_of = group_adduct_of.get(group, consensus_uid)
-
-        adduct_group_list.append(group)
-        adduct_of_list.append(adduct_of)
+    # Use optimized adduct grouping
+    adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
+        self, consensus_data, adduct_rt_tol, adduct_mz_tol
+    )

     # Add the new columns to consensus_df
     self.consensus_df = self.consensus_df.with_columns(
@@ -1027,6 +923,200 @@ def merge(self, **kwargs):
     self.find_ms2()


+def _optimized_feature_lookup(study_obj, features_df):
+    """
+    Optimized feature lookup creation using Polars operations.
+    """
+    study_obj.logger.debug("Creating optimized feature lookup...")
+    start_time = time.time()
+
+    # Use Polars select for faster conversion
+    feature_columns = [
+        "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
+        "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
+        "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
+        "ms2_scans", "adduct", "adduct_mass"
+    ]
+
+    # Filter to only existing columns
+    existing_columns = [col for col in feature_columns if col in features_df.columns]
+
+    # Convert to dictionary more efficiently
+    selected_df = features_df.select(existing_columns)
+
+    features_lookup = {}
+    for row in selected_df.iter_rows(named=True):
+        feature_uid = row["feature_uid"]
+        features_lookup[feature_uid] = {k: v for k, v in row.items() if k != "feature_uid"}
+
+    lookup_time = time.time() - start_time
+    if len(features_lookup) > 50000:
+        study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
+    return features_lookup
+
+
+def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
+    """
+    Optimized O(n log n) adduct grouping using spatial indexing.
+
+    Args:
+        study_obj: Study object with logger
+        consensus_data: List of consensus feature dictionaries
+        rt_tol: RT tolerance in minutes
+        mz_tol: m/z tolerance in Da
+
+    Returns:
+        Tuple of (adduct_group_list, adduct_of_list)
+    """
+    if not consensus_data:
+        return [], []
+
+    n_features = len(consensus_data)
+    if n_features > 1000:
+        study_obj.logger.info(f"Optimizing adduct grouping for {n_features} consensus features...")
+
+    start_time = time.time()
+
+    # Build spatial index using RT and neutral mass as coordinates
+    features_by_mass = defaultdict(list)
+    mass_bin_size = mz_tol * 2  # 2x tolerance for conservative binning
+
+    valid_features = []
+    for feature in consensus_data:
+        consensus_uid = feature["consensus_uid"]
+        rt = feature["rt"]
+        neutral_mass = feature.get("adduct_mass_neutral_top")
+        intensity = feature.get("inty_mean", 0)
+        adduct = feature.get("adduct_top", "")
+
+        if neutral_mass is not None:
+            mass_bin = int(neutral_mass / mass_bin_size)
+            features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
+            valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
+
+    # Union-Find for efficient grouping
+    class UnionFind:
+        def __init__(self, n):
+            self.parent = list(range(n))
+            self.rank = [0] * n
+
+        def find(self, x):
+            if self.parent[x] != x:
+                self.parent[x] = self.find(self.parent[x])
+            return self.parent[x]
+
+        def union(self, x, y):
+            px, py = self.find(x), self.find(y)
+            if px == py:
+                return
+            if self.rank[px] < self.rank[py]:
+                px, py = py, px
+            self.parent[py] = px
+            if self.rank[px] == self.rank[py]:
+                self.rank[px] += 1
+
+    uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
+    uf = UnionFind(len(valid_features))
+
+    # Find groups using spatial index
+    checked_pairs = set()
+    for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
+        for bin_offset in [-1, 0, 1]:
+            check_bin = bin1 + bin_offset
+            if check_bin not in features_by_mass:
+                continue
+
+            for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
+                if uid1 >= uid2:
+                    continue
+
+                pair = (min(uid1, uid2), max(uid1, uid2))
+                if pair in checked_pairs:
+                    continue
+                checked_pairs.add(pair)
+
+                mass_diff = abs(mass1 - mass2)
+                rt_diff = abs(rt1 - rt2) / 60.0  # Convert to minutes
+
+                if mass_diff <= mz_tol and rt_diff <= rt_tol:
+                    j = uid_to_idx[uid2]
+                    uf.union(i, j)
+
+    # Extract groups
+    groups_by_root = defaultdict(list)
+    for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
+        root = uf.find(i)
+        groups_by_root[root].append((uid, rt, mass, inty, adduct))
+
+    groups = {}
+    group_id = 1
+    assigned_groups = {}
+
+    for group_members in groups_by_root.values():
+        member_uids = [uid for uid, _, _, _, _ in group_members]
+
+        for uid in member_uids:
+            assigned_groups[uid] = group_id
+        groups[group_id] = member_uids
+        group_id += 1
+
+    # Handle features without neutral mass
+    for feature in consensus_data:
+        uid = feature["consensus_uid"]
+        if uid not in assigned_groups:
+            assigned_groups[uid] = group_id
+            groups[group_id] = [uid]
+            group_id += 1
+
+    # Determine adduct_of for each group
+    group_adduct_of = {}
+    for grp_id, member_uids in groups.items():
+        best_uid = None
+        best_priority = -1
+        best_intensity = 0
+
+        for uid in member_uids:
+            feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
+            if not feature_data:
+                continue
+
+            adduct = feature_data.get("adduct_top", "")
+            intensity = feature_data.get("inty_mean", 0)
+
+            priority = 0
+            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
+                priority = 3
+            elif adduct and "[M-H]" in adduct:
+                priority = 2
+            elif adduct and "M" in adduct:
+                priority = 1
+
+            if priority > best_priority or (priority == best_priority and intensity > best_intensity):
+                best_uid = uid
+                best_priority = priority
+                best_intensity = intensity
+
+        group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
+
+    # Build final lists in same order as consensus_data
+    adduct_group_list = []
+    adduct_of_list = []
+
+    for feature in consensus_data:
+        uid = feature["consensus_uid"]
+        group = assigned_groups.get(uid, 0)
+        adduct_of = group_adduct_of.get(group, uid)
+
+        adduct_group_list.append(group)
+        adduct_of_list.append(adduct_of)
+
+    grouping_time = time.time() - start_time
+    if n_features > 1000:
+        study_obj.logger.info(f"Adduct grouping completed in {grouping_time:.2f}s ({len(groups)} groups)")
+
+    return adduct_group_list, adduct_of_list
+
+
 # Backward compatibility alias
 find_consensus = merge
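The _optimized_adduct_grouping helper added above replaces the earlier O(n²) pairwise scan: features are binned by neutral mass (bin width = 2 × mz_tol), only neighbouring bins are compared, and union-find merges matching pairs into groups. A tiny self-contained demo of that idea on fabricated masses and retention times (values purely illustrative, not taken from any study):

from collections import defaultdict

def group_by_mass_and_rt(features, mz_tol=0.01, rt_tol=1.0):
    """features: list of (uid, neutral_mass, rt_seconds). Returns {uid: group_id}."""
    parent = list(range(len(features)))

    def find(i):                       # path-compressing union-find
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    def union(i, j):
        parent[find(i)] = find(j)

    bin_size = mz_tol * 2
    bins = defaultdict(list)
    for idx, (_, mass, _) in enumerate(features):
        bins[int(mass / bin_size)].append(idx)

    for idx, (_, mass, rt) in enumerate(features):
        for offset in (-1, 0, 1):      # only neighbouring mass bins can hold matches
            for jdx in bins.get(int(mass / bin_size) + offset, []):
                if jdx <= idx:
                    continue
                _, mass2, rt2 = features[jdx]
                if abs(mass - mass2) <= mz_tol and abs(rt - rt2) / 60.0 <= rt_tol:
                    union(idx, jdx)

    roots = {}
    return {features[i][0]: roots.setdefault(find(i), len(roots) + 1) for i in range(len(features))}

toy = [("A", 180.063, 300.0), ("B", 180.064, 302.0), ("C", 255.232, 600.0)]
print(group_by_mass_and_rt(toy))   # {'A': 1, 'B': 1, 'C': 2}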