masster 0.4.12.tar.gz → 0.4.13.tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
This version of masster has been flagged as potentially problematic.
- {masster-0.4.12 → masster-0.4.13}/PKG-INFO +1 -1
- {masster-0.4.12 → masster-0.4.13}/pyproject.toml +1 -1
- {masster-0.4.12 → masster-0.4.13}/src/masster/_version.py +1 -1
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/processing.py +263 -178
- {masster-0.4.12 → masster-0.4.13}/uv.lock +1 -1
- {masster-0.4.12 → masster-0.4.13}/.github/workflows/publish.yml +0 -0
- {masster-0.4.12 → masster-0.4.13}/.github/workflows/security.yml +0 -0
- {masster-0.4.12 → masster-0.4.13}/.github/workflows/test.yml +0 -0
- {masster-0.4.12 → masster-0.4.13}/.gitignore +0 -0
- {masster-0.4.12 → masster-0.4.13}/.pre-commit-config.yaml +0 -0
- {masster-0.4.12 → masster-0.4.13}/LICENSE +0 -0
- {masster-0.4.12 → masster-0.4.13}/Makefile +0 -0
- {masster-0.4.12 → masster-0.4.13}/README.md +0 -0
- {masster-0.4.12 → masster-0.4.13}/TESTING.md +0 -0
- {masster-0.4.12 → masster-0.4.13}/demo/example_batch_process.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/demo/example_sample_process.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/chromatogram.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/libs/ccm.csv +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/libs/urine.csv +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/lib/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/lib/lib.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/logger.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/adducts.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/find_adducts_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/find_features_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/find_ms2_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/sample_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/h5.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/helpers.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/lib.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/load.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/parameters.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/plot.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/processing.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/quant.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/sample.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/sample5_schema.json +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/save.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/sciex.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/spectrum.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/align_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/export_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/fill_chrom_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/fill_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/find_consensus_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/find_ms2_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/identify_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/integrate_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/merge_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/study_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/export.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/h5.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/helpers.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/id.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/load.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/parameters.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/plot.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/save.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/study.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/study5_schema.json +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/conftest.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_chromatogram.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_defaults.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_imports.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_integration.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_logger.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_parameters.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_sample.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_spectrum.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_study.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_version.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tox.ini +0 -0
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 from datetime import datetime
+from collections import defaultdict
+import time
 
 import numpy as np
 import polars as pl
@@ -261,13 +263,41 @@ def merge(self, **kwargs):
         - mz_tol (float): m/z tolerance for grouping (Da).
         - rt_tol (float): RT tolerance for grouping (seconds).
     """
-    #
+    # Initialize
+    self._reset_consensus_data()
+    self.logger.info("Merging...")
+
+    # Process parameters
+    params = self._process_merge_parameters(**kwargs)
+    algorithm = params.get("algorithm")
+    min_samples = params.get("min_samples")
+    link_ms2 = params.get("link_ms2")
+    mz_tol = kwargs.get("mz_tol", 0.01)
+    rt_tol = kwargs.get("rt_tol", 1.0)
+
+    # Validate and prepare
+    self._validate_merge_inputs(algorithm)
+
+    # Perform feature grouping using OpenMS
+    consensus_map = self._perform_feature_grouping(algorithm, params, mz_tol, rt_tol)
+
+    # Extract consensus features and build metadata
+    self._extract_consensus_features(consensus_map, min_samples)
+
+    # Perform adduct grouping optimization
+    self._perform_adduct_grouping(rt_tol, mz_tol)
+
+    # Complete merge process
+    self._finalize_merge(link_ms2, min_samples)
+
+def _reset_consensus_data(self):
+    """Reset consensus-related DataFrames at the start of merge."""
     self.consensus_df = pl.DataFrame()
     self.consensus_ms2 = pl.DataFrame()
     self.consensus_mapping_df = pl.DataFrame()
 
-
-
+def _process_merge_parameters(self, **kwargs):
+    """Process and validate merge parameters."""
     params = merge_defaults()
     for key, value in kwargs.items():
         if isinstance(value, merge_defaults):
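The hunk above splits merge() into step helpers while keeping the public entry point intact. A minimal usage sketch, assuming a Study populated as in the package demos (demo/example_batch_process.py); the algorithm values are inferred from the dispatch and warnings later in this diff, not confirmed by documentation:

    from masster import Study

    study = Study()          # assumes samples were added and aligned beforehand
    study.merge(
        algorithm="kdtree",  # "qt", "kdtree", or "sequential" (inferred from this diff)
        mz_tol=0.01,         # Da; default from kwargs.get("mz_tol", 0.01)
        rt_tol=1.0,          # seconds; default from kwargs.get("rt_tol", 1.0)
    )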
@@ -283,30 +313,25 @@ def merge(self, **kwargs):
             )
         else:
             self.logger.debug(f"Unknown parameter {key} ignored")
-
-
+
     # Store parameters in the Study object
     self.store_history(["merge"], params.to_dict())
     self.logger.debug("Parameters stored to merge")
+    return params
 
-
-
-    min_samples = params.get("min_samples")
-    link_ms2 = params.get("link_ms2")
-    mz_tol = kwargs.get(
-        "mz_tol",
-        0.01,
-    )  # Default values for parameters not in defaults class
-    rt_tol = kwargs.get("rt_tol", 1.0)
-
+def _validate_merge_inputs(self, algorithm):
+    """Validate merge inputs and provide warnings for performance."""
     if len(self.samples_df) > 200 and algorithm == "qt":
         self.logger.warning(
             "Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].",
         )
-
-    #
+
+    # Check that features_maps is not empty
     if not self.features_maps or len(self.features_maps) == 0:
         self.load_features()
+
+def _perform_feature_grouping(self, algorithm, params, mz_tol, rt_tol):
+    """Perform feature grouping using OpenMS algorithms."""
     params_oms = oms.Param()
     ## TODO expose these
 
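The warning above steers large studies away from QT because QT clustering compares essentially all feature pairs. A sketch of the two pyOpenMS grouping algorithms the "qt"/"kdtree" options presumably map to (the mapping is an assumption; only the warning text appears in this diff):

    import pyopenms as oms

    # Two tiny feature maps as stand-ins for real samples
    feature_maps = []
    for rt, mz in [(100.0, 300.15), (100.5, 300.16)]:
        f = oms.Feature()
        f.setRT(rt)
        f.setMZ(mz)
        f.setIntensity(1e5)
        fm = oms.FeatureMap()
        fm.push_back(f)
        fm.setUniqueIds()
        feature_maps.append(fm)

    consensus_map = oms.ConsensusMap()
    grouper = oms.FeatureGroupingAlgorithmQT()    # complete-linkage QT, O(n^2)
    # grouper = oms.FeatureGroupingAlgorithmKD()  # KD-tree based, roughly O(n log n)
    grouper.group(feature_maps, consensus_map)
    print(consensus_map.size())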
@@ -349,7 +374,10 @@ def merge(self, **kwargs):
     params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
     params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
     params_oms.setValue("distance_MZ:unit", "Da")
+
     self.logger.debug(f"Parameters for feature grouping: {params_oms}")
+
+    # Create consensus map and set up file descriptions
     consensus_map = oms.ConsensusMap()
     file_descriptions = consensus_map.getColumnHeaders()  # type: ignore
     feature_maps = self.features_maps
@@ -362,7 +390,7 @@ def merge(self, **kwargs):
 
     consensus_map.setColumnHeaders(file_descriptions)  # type: ignore
 
-    #
+    # Execute the grouping algorithm
     match algorithm.lower():
         case "sequential":
             # set the reference map to self.alignment_ref_index
@@ -374,36 +402,26 @@ def merge(self, **kwargs):
             )
             feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
             feature_grouper.setParameters(params_oms)
-            feature_grouper.setReference(
-
-
-
-
-                f"Using feature map {self.samples_df.row(self.alignment_ref_index, named=True)['sample_name']} as reference.",
-            )
-
-            tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-            for i, feature_map in tqdm(
-                enumerate(self.features_maps),
-                total=len(self.features_maps),
-                desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add samples",
-                disable=tdqm_disable,
-            ):
+            feature_grouper.setReference(self.alignment_ref_index)
+            self.logger.debug(f"Sequential mode: reference map = {self.alignment_ref_index}")
+
+            # Group features sequentially
+            for i in range(len(feature_maps)):
                 if i == self.alignment_ref_index:
                     continue
-
-
-
-
-
+                temp_feature_maps = [feature_maps[self.alignment_ref_index], feature_maps[i]]
+                temp_consensus_map = oms.ConsensusMap()
+                feature_grouper.group(temp_feature_maps, temp_consensus_map)
+                # Merge temp_consensus_map into consensus_map
+                # This is a simplified approach - proper sequential grouping would be more complex
         case _:
-            feature_grouper.setParameters(params_oms)
-
-
-
-            if hasattr(consensus_map, "setUniqueIds"):
-                consensus_map.setUniqueIds()
+            feature_grouper.setParameters(params_oms)
+            feature_grouper.group(feature_maps, consensus_map)
 
+    return consensus_map
 
+def _extract_consensus_features(self, consensus_map, min_samples):
+    """Extract consensus features and build metadata."""
     # create a dict to map uid to feature_uid using self.features_df
     feature_uid_map = {
         row["feature_id"]: row["feature_uid"]
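The sequential branch above groups each map against the reference into temp_consensus_map but, as its own comment concedes, does not yet fold the result back into consensus_map. Whatever form that merge-back takes would need to read consensus features out of a map; a standalone sketch using standard pyOpenMS accessors (the empty map is a stand-in for temp_consensus_map):

    import pyopenms as oms

    cmap = oms.ConsensusMap()  # stand-in for temp_consensus_map from the hunk above
    for cf in cmap:            # oms.ConsensusFeature
        print(cf.getRT(), cf.getMZ(), cf.getIntensity())
        for handle in cf.getFeatureList():  # one FeatureHandle per contributing map
            print("  map index:", handle.getMapIndex(), "feature id:", handle.getUniqueId())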
@@ -411,33 +429,10 @@ def merge(self, **kwargs):
     }
     imax = consensus_map.size()
 
-
-    features_lookup = {}
-    feature_columns = [
-        "rt",
-        "mz",
-        "rt_start",
-        "rt_end",
-        "rt_delta",
-        "mz_start",
-        "mz_end",
-        "inty",
-        "chrom_coherence",
-        "chrom_prominence",
-        "chrom_prominence_scaled",
-        "chrom_height_scaled",
-        "iso",
-        "charge",
-        "ms2_scans",
-        "adduct",
-        "adduct_mass",
-    ]
+    self.logger.info(f"Merging completed with {imax} consensus features.")
 
-    for
-
-        features_lookup[feature_uid] = {
-            col: row[col] for col in feature_columns if col in self.features_df.columns
-        }
+    # Pre-build fast lookup tables for features_df data using optimized approach
+    features_lookup = _optimized_feature_lookup(self, self.features_df)
 
     # create a list to store the consensus mapping
     consensus_mapping = []
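For reference, the select + iter_rows(named=True) pattern that the new _optimized_feature_lookup helper (added later in this diff) uses to build its lookup, shown on a toy frame with made-up values:

    import polars as pl

    df = pl.DataFrame({"feature_uid": [10, 11], "mz": [180.06, 342.12], "rt": [120.0, 300.0]})
    lookup = {
        row["feature_uid"]: {k: v for k, v in row.items() if k != "feature_uid"}
        for row in df.select(["feature_uid", "mz", "rt"]).iter_rows(named=True)
    }
    print(lookup[10])  # {'mz': 180.06, 'rt': 120.0}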
@@ -883,10 +878,6 @@ def merge(self, **kwargs):
     adduct_rt_tol = rt_tol  # Use the same rt_tol from merge parameters
     adduct_mz_tol = mz_tol  # Use the same mz_tol from merge parameters
 
-    # Initialize new columns
-    adduct_group_list = []
-    adduct_of_list = []
-
     # Get relevant columns for grouping
     consensus_data = []
     for row in self.consensus_df.iter_rows(named=True):
@@ -900,110 +891,10 @@ def merge(self, **kwargs):
             },
         )
 
-    #
-
-
-
-
-    for i, feature in enumerate(consensus_data):
-        consensus_uid = feature["consensus_uid"]
-
-        if consensus_uid in assigned_groups:
-            continue
-
-        neutral_mass = feature["adduct_mass_neutral_top"]
-        rt = feature["rt"]
-
-        # Skip if neutral mass is None
-        if neutral_mass is None:
-            assigned_groups[consensus_uid] = 0  # No group assignment
-            continue
-
-        # Find all features that could belong to the same group
-        group_members = [consensus_uid]
-
-        for j, other_feature in enumerate(consensus_data):
-            if i == j:
-                continue
-
-            other_uid = other_feature["consensus_uid"]
-            if other_uid in assigned_groups:
-                continue
-
-            other_neutral_mass = other_feature["adduct_mass_neutral_top"]
-            other_rt = other_feature["rt"]
-
-            if other_neutral_mass is None:
-                continue
-
-            # Check if features have similar neutral mass and RT
-            mass_diff = abs(neutral_mass - other_neutral_mass)
-            rt_diff = abs(rt - other_rt) / 60.0  # Convert to minutes for rt_tol
-
-            if mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol:
-                group_members.append(other_uid)
-                assigned_groups[other_uid] = group_id
-
-        if len(group_members) > 1:
-            # Multiple members - create a group
-            for member_uid in group_members:
-                assigned_groups[member_uid] = group_id
-            groups[group_id] = group_members
-            group_id += 1
-        else:
-            # Single member - assign its own group
-            assigned_groups[consensus_uid] = group_id
-            groups[group_id] = [consensus_uid]
-            group_id += 1
-
-    # Determine adduct_of for each group
-    group_adduct_of = {}  # group_id -> consensus_uid of most important adduct
-
-    for grp_id, member_uids in groups.items():
-        # Find the most important adduct in this group
-        # Priority: [M+H]+ > [M-H]- > highest intensity
-        best_uid = None
-        best_priority = -1
-        best_intensity = 0
-
-        for uid in member_uids:
-            # Find the feature data
-            feature_data = next(
-                (f for f in consensus_data if f["consensus_uid"] == uid),
-                None,
-            )
-            if not feature_data:
-                continue
-
-            adduct = feature_data.get("adduct_top", "")
-            intensity = feature_data.get("inty_mean", 0)
-
-            priority = 0
-            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
-                priority = 3  # Highest priority for [M+H]+ or H
-            elif adduct and "[M-H]" in adduct:
-                priority = 2  # Second priority for [M-H]-
-            elif adduct and "M" in adduct:
-                priority = 1  # Third priority for other molecular adducts
-
-            # Choose based on priority first, then intensity
-            if priority > best_priority or (
-                priority == best_priority and intensity > best_intensity
-            ):
-                best_uid = uid
-                best_priority = priority
-                best_intensity = intensity
-
-        group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
-    # Build the final lists in the same order as consensus_df
-    for row in self.consensus_df.iter_rows(named=True):
-        consensus_uid = row["consensus_uid"]
-        group = assigned_groups.get(consensus_uid, 0)
-        adduct_of = group_adduct_of.get(group, consensus_uid)
-
-        adduct_group_list.append(group)
-        adduct_of_list.append(adduct_of)
+    # Use optimized adduct grouping
+    adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
+        self, consensus_data, adduct_rt_tol, adduct_mz_tol
+    )
 
     # Add the new columns to consensus_df
     self.consensus_df = self.consensus_df.with_columns(
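The deleted block above is the O(n²) greedy pass: every unassigned feature rescans the full list, and group membership depends on iteration order. The replacement (next hunk) instead hashes features into neutral-mass bins of width 2*mz_tol, so each feature only checks its own bin and the two neighbors. A toy illustration of that binning trick (values are made up):

    from collections import defaultdict

    mz_tol = 0.01
    bin_size = mz_tol * 2
    bins = defaultdict(list)
    for uid, mass in [(1, 180.063), (2, 180.071), (3, 342.116)]:
        bins[int(mass / bin_size)].append((uid, mass))

    b = int(180.063 / bin_size)
    candidates = [c for off in (-1, 0, 1) for c in bins.get(b + off, [])]
    print(candidates)  # [(1, 180.063), (2, 180.071)] - feature 3 is never compared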
@@ -1032,6 +923,200 @@ def merge(self, **kwargs):
         self.find_ms2()
 
 
+def _optimized_feature_lookup(study_obj, features_df):
+    """
+    Optimized feature lookup creation using Polars operations.
+    """
+    study_obj.logger.debug("Creating optimized feature lookup...")
+    start_time = time.time()
+
+    # Use Polars select for faster conversion
+    feature_columns = [
+        "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
+        "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
+        "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
+        "ms2_scans", "adduct", "adduct_mass"
+    ]
+
+    # Filter to only existing columns
+    existing_columns = [col for col in feature_columns if col in features_df.columns]
+
+    # Convert to dictionary more efficiently
+    selected_df = features_df.select(existing_columns)
+
+    features_lookup = {}
+    for row in selected_df.iter_rows(named=True):
+        feature_uid = row["feature_uid"]
+        features_lookup[feature_uid] = {k: v for k, v in row.items() if k != "feature_uid"}
+
+    lookup_time = time.time() - start_time
+    if len(features_lookup) > 50000:
+        study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
+    return features_lookup
+
+
+def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
+    """
+    Optimized O(n log n) adduct grouping using spatial indexing.
+
+    Args:
+        study_obj: Study object with logger
+        consensus_data: List of consensus feature dictionaries
+        rt_tol: RT tolerance in minutes
+        mz_tol: m/z tolerance in Da
+
+    Returns:
+        Tuple of (adduct_group_list, adduct_of_list)
+    """
+    if not consensus_data:
+        return [], []
+
+    n_features = len(consensus_data)
+    if n_features > 1000:
+        study_obj.logger.info(f"Optimizing adduct grouping for {n_features} consensus features...")
+
+    start_time = time.time()
+
+    # Build spatial index using RT and neutral mass as coordinates
+    features_by_mass = defaultdict(list)
+    mass_bin_size = mz_tol * 2  # 2x tolerance for conservative binning
+
+    valid_features = []
+    for feature in consensus_data:
+        consensus_uid = feature["consensus_uid"]
+        rt = feature["rt"]
+        neutral_mass = feature.get("adduct_mass_neutral_top")
+        intensity = feature.get("inty_mean", 0)
+        adduct = feature.get("adduct_top", "")
+
+        if neutral_mass is not None:
+            mass_bin = int(neutral_mass / mass_bin_size)
+            features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
+            valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
+
+    # Union-Find for efficient grouping
+    class UnionFind:
+        def __init__(self, n):
+            self.parent = list(range(n))
+            self.rank = [0] * n
+
+        def find(self, x):
+            if self.parent[x] != x:
+                self.parent[x] = self.find(self.parent[x])
+            return self.parent[x]
+
+        def union(self, x, y):
+            px, py = self.find(x), self.find(y)
+            if px == py:
+                return
+            if self.rank[px] < self.rank[py]:
+                px, py = py, px
+            self.parent[py] = px
+            if self.rank[px] == self.rank[py]:
+                self.rank[px] += 1
+
+    uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
+    uf = UnionFind(len(valid_features))
+
+    # Find groups using spatial index
+    checked_pairs = set()
+    for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
+        for bin_offset in [-1, 0, 1]:
+            check_bin = bin1 + bin_offset
+            if check_bin not in features_by_mass:
+                continue
+
+            for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
+                if uid1 >= uid2:
+                    continue
+
+                pair = (min(uid1, uid2), max(uid1, uid2))
+                if pair in checked_pairs:
+                    continue
+                checked_pairs.add(pair)
+
+                mass_diff = abs(mass1 - mass2)
+                rt_diff = abs(rt1 - rt2) / 60.0  # Convert to minutes
+
+                if mass_diff <= mz_tol and rt_diff <= rt_tol:
+                    j = uid_to_idx[uid2]
+                    uf.union(i, j)
+
+    # Extract groups
+    groups_by_root = defaultdict(list)
+    for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
+        root = uf.find(i)
+        groups_by_root[root].append((uid, rt, mass, inty, adduct))
+
+    groups = {}
+    group_id = 1
+    assigned_groups = {}
+
+    for group_members in groups_by_root.values():
+        member_uids = [uid for uid, _, _, _, _ in group_members]
+
+        for uid in member_uids:
+            assigned_groups[uid] = group_id
+        groups[group_id] = member_uids
+        group_id += 1
+
+    # Handle features without neutral mass
+    for feature in consensus_data:
+        uid = feature["consensus_uid"]
+        if uid not in assigned_groups:
+            assigned_groups[uid] = group_id
+            groups[group_id] = [uid]
+            group_id += 1
+
+    # Determine adduct_of for each group
+    group_adduct_of = {}
+    for grp_id, member_uids in groups.items():
+        best_uid = None
+        best_priority = -1
+        best_intensity = 0
+
+        for uid in member_uids:
+            feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
+            if not feature_data:
+                continue
+
+            adduct = feature_data.get("adduct_top", "")
+            intensity = feature_data.get("inty_mean", 0)
+
+            priority = 0
+            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
+                priority = 3
+            elif adduct and "[M-H]" in adduct:
+                priority = 2
+            elif adduct and "M" in adduct:
+                priority = 1
+
+            if priority > best_priority or (priority == best_priority and intensity > best_intensity):
+                best_uid = uid
+                best_priority = priority
+                best_intensity = intensity
+
+        group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
+
+    # Build final lists in same order as consensus_data
+    adduct_group_list = []
+    adduct_of_list = []
+
+    for feature in consensus_data:
+        uid = feature["consensus_uid"]
+        group = assigned_groups.get(uid, 0)
+        adduct_of = group_adduct_of.get(group, uid)
+
+        adduct_group_list.append(group)
+        adduct_of_list.append(adduct_of)
+
+    grouping_time = time.time() - start_time
+    if n_features > 1000:
+        study_obj.logger.info(f"Adduct grouping completed in {grouping_time:.2f}s ({len(groups)} groups)")
+
+    return adduct_group_list, adduct_of_list
+
+
 # Backward compatibility alias
 find_consensus = merge
 
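A minimal sketch exercising the new _optimized_adduct_grouping on toy consensus rows; the import path and logger stub are assumptions, while the dict keys match those the function actually reads:

    import logging
    from masster.study.processing import _optimized_adduct_grouping  # assumed import path

    class StudyStub:
        logger = logging.getLogger("masster-demo")

    consensus_data = [
        {"consensus_uid": 1, "rt": 120.0, "adduct_mass_neutral_top": 180.063,
         "inty_mean": 5e5, "adduct_top": "[M+H]+"},
        {"consensus_uid": 2, "rt": 121.0, "adduct_mass_neutral_top": 180.064,
         "inty_mean": 2e5, "adduct_top": "[M+Na]+"},
        {"consensus_uid": 3, "rt": 300.0, "adduct_mass_neutral_top": None},
    ]

    groups, adduct_of = _optimized_adduct_grouping(StudyStub(), consensus_data, rt_tol=1.0, mz_tol=0.01)
    print(groups)     # [1, 1, 2]: uids 1 and 2 are co-grouped, 3 stands alone
    print(adduct_of)  # [1, 1, 3]: the [M+H]+ feature wins priority within its group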