masster 0.4.18__tar.gz → 0.4.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- {masster-0.4.18 → masster-0.4.19}/PKG-INFO +1 -1
- {masster-0.4.18 → masster-0.4.19}/pyproject.toml +1 -1
- {masster-0.4.18 → masster-0.4.19}/src/masster/_version.py +1 -1
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/merge_def.py +43 -2
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/merge.py +575 -90
- {masster-0.4.18 → masster-0.4.19}/uv.lock +1 -1
- {masster-0.4.18 → masster-0.4.19}/.github/workflows/publish.yml +0 -0
- {masster-0.4.18 → masster-0.4.19}/.github/workflows/security.yml +0 -0
- {masster-0.4.18 → masster-0.4.19}/.github/workflows/test.yml +0 -0
- {masster-0.4.18 → masster-0.4.19}/.gitignore +0 -0
- {masster-0.4.18 → masster-0.4.19}/.pre-commit-config.yaml +0 -0
- {masster-0.4.18 → masster-0.4.19}/LICENSE +0 -0
- {masster-0.4.18 → masster-0.4.19}/Makefile +0 -0
- {masster-0.4.18 → masster-0.4.19}/README.md +0 -0
- {masster-0.4.18 → masster-0.4.19}/TESTING.md +0 -0
- {masster-0.4.18 → masster-0.4.19}/demo/example_batch_process.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/demo/example_sample_process.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/__init__.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/chromatogram.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/data/libs/ccm.csv +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/data/libs/urine.csv +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/lib/__init__.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/lib/lib.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/logger.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/__init__.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/adducts.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/defaults/__init__.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/defaults/find_adducts_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/defaults/find_features_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/defaults/find_ms2_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/defaults/sample_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/h5.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/helpers.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/lib.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/load.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/parameters.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/plot.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/processing.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/quant.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/sample.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/sample5_schema.json +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/save.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/sample/sciex.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/spectrum.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/__init__.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/__init__.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/align_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/export_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/fill_chrom_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/fill_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/find_consensus_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/find_ms2_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/identify_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/integrate_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/defaults/study_def.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/export.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/h5.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/helpers.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/id.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/load.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/parameters.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/plot.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/processing.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/save.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/study.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/study/study5_schema.json +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/wizard/README.md +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/wizard/__init__.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/wizard/example.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/wizard/test_structure.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/wizard/test_wizard.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/wizard/wizard.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/src/masster/wizard.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tests/conftest.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tests/test_chromatogram.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tests/test_defaults.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tests/test_imports.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tests/test_integration.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tests/test_logger.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tests/test_parameters.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tests/test_sample.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tests/test_spectrum.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tests/test_study.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tests/test_version.py +0 -0
- {masster-0.4.18 → masster-0.4.19}/tox.ini +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Parameter class for Study merge method."""
|
|
2
2
|
|
|
3
3
|
from dataclasses import dataclass, field
|
|
4
|
-
from typing import Any
|
|
4
|
+
from typing import Any, Optional
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
@dataclass
|
|
@@ -36,6 +36,9 @@ class merge_defaults:
|
|
|
36
36
|
max_nr_conflicts: int = 0
|
|
37
37
|
link_ms2: bool = True
|
|
38
38
|
|
|
39
|
+
# Parallel processing parameters
|
|
40
|
+
threads: Optional[int] = None
|
|
41
|
+
|
|
39
42
|
# KD-Strict specific parameters
|
|
40
43
|
optimize_rt_tol: bool = False
|
|
41
44
|
rt_tol_range: tuple = (0.5, 4.0)
|
|
@@ -115,6 +118,14 @@ class merge_defaults:
|
|
|
115
118
|
"description": "Whether to link MS2 spectra to consensus features",
|
|
116
119
|
"default": True,
|
|
117
120
|
},
|
|
121
|
+
# Parallel processing parameters
|
|
122
|
+
"threads": {
|
|
123
|
+
"dtype": [int, type(None)],
|
|
124
|
+
"description": "Number of parallel threads/processes for chunked methods (None=original sequential)",
|
|
125
|
+
"default": None,
|
|
126
|
+
"min_value": 1,
|
|
127
|
+
"max_value": 32,
|
|
128
|
+
},
|
|
118
129
|
# KD-Strict specific parameters
|
|
119
130
|
"optimize_rt_tol": {
|
|
120
131
|
"dtype": bool,
|
|
@@ -217,7 +228,37 @@ class merge_defaults:
|
|
|
217
228
|
metadata = self._param_metadata[param_name]
|
|
218
229
|
expected_dtype = metadata["dtype"]
|
|
219
230
|
|
|
220
|
-
#
|
|
231
|
+
# Handle Optional types (list of types including None)
|
|
232
|
+
if isinstance(expected_dtype, list):
|
|
233
|
+
# Check if value matches any of the allowed types
|
|
234
|
+
valid_type = False
|
|
235
|
+
for dtype in expected_dtype:
|
|
236
|
+
if dtype is type(None) and value is None:
|
|
237
|
+
return True # None is explicitly allowed
|
|
238
|
+
elif dtype is int and isinstance(value, int):
|
|
239
|
+
valid_type = True
|
|
240
|
+
break
|
|
241
|
+
elif dtype is float and isinstance(value, (int, float)):
|
|
242
|
+
valid_type = True
|
|
243
|
+
break
|
|
244
|
+
elif dtype is bool and isinstance(value, bool):
|
|
245
|
+
valid_type = True
|
|
246
|
+
break
|
|
247
|
+
elif dtype is str and isinstance(value, str):
|
|
248
|
+
valid_type = True
|
|
249
|
+
break
|
|
250
|
+
|
|
251
|
+
if not valid_type:
|
|
252
|
+
return False
|
|
253
|
+
|
|
254
|
+
# For None values, skip further validation
|
|
255
|
+
if value is None:
|
|
256
|
+
return True
|
|
257
|
+
|
|
258
|
+
# Use the first non-None type for range validation
|
|
259
|
+
expected_dtype = next((dt for dt in expected_dtype if dt is not type(None)), expected_dtype[0])
|
|
260
|
+
|
|
261
|
+
# Type checking for non-Optional types
|
|
221
262
|
if expected_dtype is int:
|
|
222
263
|
if not isinstance(value, int):
|
|
223
264
|
try:
|
|
@@ -10,9 +10,269 @@ from datetime import datetime
|
|
|
10
10
|
from tqdm import tqdm
|
|
11
11
|
import pyopenms as oms
|
|
12
12
|
import polars as pl
|
|
13
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
13
14
|
from masster.study.defaults import merge_defaults
|
|
14
15
|
|
|
15
16
|
|
|
17
|
+
def _process_kd_chunk_parallel(chunk_data):
    """
    Group the features of one chunk of samples with the OpenMS KD algorithm.

    Intended to run in a worker process: one FeatureMap per sample is rebuilt
    from plain feature dictionaries, the maps are grouped with
    FeatureGroupingAlgorithmKD, and the resulting consensus features are
    returned as plain dictionaries.

    Args:
        chunk_data: Dictionary with the keys
            - 'chunk_start_idx': value returned unchanged as first tuple item
            - 'chunk_features_data': list of feature dicts providing
              'sample_uid', 'rt', 'mz', 'inty', 'feature_id' and, optionally,
              'charge' (a missing or None charge is stored as 0)
            - 'chunk_samples_data': list of sample dicts providing
              'sample_uid' and 'sample_name'
            - 'params': dict providing 'nr_partitions', 'rt_tol', 'mz_tol',
              'min_rel_cc_size', 'max_pairwise_log_fc' and 'max_nr_conflicts'

    Returns:
        Tuple of (chunk_start_idx, serialized_consensus_features). Each
        serialized consensus feature is a dict with 'rt', 'mz', 'intensity',
        'quality', 'unique_id' (as string) and 'features', a list of
        {'unique_id': str, 'map_index': ...} entries for its constituents.
    """
    import pyopenms as oms

    chunk_start_idx = chunk_data['chunk_start_idx']
    chunk_features_data = chunk_data['chunk_features_data']  # List of feature dicts
    chunk_samples_data = chunk_data['chunk_samples_data']  # List of sample dicts
    params_dict = chunk_data['params']

    # Bucket the features by sample in a single pass instead of re-scanning
    # the whole feature list once per sample. Order within a sample is kept.
    features_by_sample = {}
    for feature_dict in chunk_features_data:
        features_by_sample.setdefault(feature_dict['sample_uid'], []).append(feature_dict)

    # Reconstruct one FeatureMap per sample, in the order of chunk_samples_data
    chunk_maps = []
    for sample_data in chunk_samples_data:
        feature_map = oms.FeatureMap()

        for feature_dict in features_by_sample.get(sample_data['sample_uid'], ()):
            feature = oms.Feature()
            feature.setRT(float(feature_dict['rt']))
            feature.setMZ(float(feature_dict['mz']))
            feature.setIntensity(float(feature_dict['inty']))

            # The key may be present with a None value (null in the source
            # table); int(None) would raise, so fall back to charge 0.
            charge = feature_dict.get('charge')
            feature.setCharge(int(charge) if charge is not None else 0)

            # Set unique ID using feature_id for mapping back
            feature.setUniqueId(int(feature_dict['feature_id']))

            feature_map.push_back(feature)

        chunk_maps.append(feature_map)

    # Create the chunk consensus map
    chunk_consensus_map = oms.ConsensusMap()

    # Set up file descriptions for chunk (one column header per sample map)
    file_descriptions = chunk_consensus_map.getColumnHeaders()
    for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
        file_description = file_descriptions.get(j, oms.ColumnHeader())
        file_description.filename = sample_data['sample_name']
        file_description.size = feature_map.size()
        file_description.unique_id = feature_map.getUniqueId()
        file_descriptions[j] = file_description

    chunk_consensus_map.setColumnHeaders(file_descriptions)

    # Use KD algorithm for chunk
    grouper = oms.FeatureGroupingAlgorithmKD()
    chunk_params = grouper.getParameters()
    chunk_params.setValue("mz_unit", "Da")
    chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
    chunk_params.setValue("warp:enabled", "true")
    chunk_params.setValue("warp:rt_tol", params_dict['rt_tol'])
    chunk_params.setValue("warp:mz_tol", params_dict['mz_tol'])
    chunk_params.setValue("link:rt_tol", params_dict['rt_tol'])
    chunk_params.setValue("link:mz_tol", params_dict['mz_tol'])
    chunk_params.setValue("link:min_rel_cc_size", params_dict['min_rel_cc_size'])
    chunk_params.setValue("link:max_pairwise_log_fc", params_dict['max_pairwise_log_fc'])
    chunk_params.setValue("link:max_nr_conflicts", params_dict['max_nr_conflicts'])

    grouper.setParameters(chunk_params)
    grouper.group(chunk_maps, chunk_consensus_map)

    # Serialize the consensus map result for cross-process communication;
    # unique ids are sent as strings and converted back by the consumer.
    consensus_features = []
    for consensus_feature in chunk_consensus_map:
        feature_data = {
            'rt': consensus_feature.getRT(),
            'mz': consensus_feature.getMZ(),
            'intensity': consensus_feature.getIntensity(),
            'quality': consensus_feature.getQuality(),
            'unique_id': str(consensus_feature.getUniqueId()),
            'features': []
        }

        # Get constituent features
        for feature_handle in consensus_feature.getFeatureList():
            feature_handle_data = {
                'unique_id': str(feature_handle.getUniqueId()),
                'map_index': feature_handle.getMapIndex()
            }
            feature_data['features'].append(feature_handle_data)

        consensus_features.append(feature_data)

    return chunk_start_idx, consensus_features
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _deserialize_consensus_features(consensus_features):
    """
    Deserialize consensus features back into an OpenMS ConsensusMap.

    Args:
        consensus_features: List of serialized consensus feature dictionaries.
            Each dict provides 'rt', 'mz', 'intensity', 'quality', 'unique_id'
            and 'features', a list of dicts with 'unique_id' and 'map_index'
            describing the constituent feature handles.

    Returns:
        OpenMS ConsensusMap object holding one ConsensusFeature per input
        dict, each carrying its constituent handles (unique id and map index
        only; handle RT/m/z/intensity are left at their defaults).
    """
    import pyopenms as oms

    consensus_map = oms.ConsensusMap()

    for feature_data in consensus_features:
        consensus_feature = oms.ConsensusFeature()
        consensus_feature.setRT(float(feature_data['rt']))
        consensus_feature.setMZ(float(feature_data['mz']))
        consensus_feature.setIntensity(float(feature_data['intensity']))
        consensus_feature.setQuality(float(feature_data['quality']))
        consensus_feature.setUniqueId(int(feature_data['unique_id']))

        # getFeatureList() returns the handles by value, so appending to that
        # list never reaches the ConsensusFeature. Register each handle through
        # insert(map_index, peak, element_index) instead; the default Peak2D
        # mirrors a FeatureHandle on which only id and map index were set.
        for handle_data in feature_data['features']:
            consensus_feature.insert(
                int(handle_data['map_index']),
                oms.Peak2D(),
                int(handle_data['unique_id']),
            )

        consensus_map.push_back(consensus_feature)

    return consensus_map
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _process_qt_chunk_parallel(chunk_data):
    """
    Group the features of one chunk of samples with the OpenMS QT algorithm.

    Intended to run in a worker process: one FeatureMap per sample is rebuilt
    from plain feature dictionaries, the maps are grouped with
    FeatureGroupingAlgorithmQT, and the resulting consensus features are
    returned as plain dictionaries.

    Args:
        chunk_data: Dictionary with the keys
            - 'chunk_start_idx': value returned unchanged as first tuple item
            - 'chunk_features_data': list of feature dicts providing
              'sample_uid', 'rt', 'mz', 'inty', 'feature_id' and, optionally,
              'charge' (a missing or None charge is stored as 0)
            - 'chunk_samples_data': list of sample dicts providing
              'sample_uid' and 'sample_name'
            - 'params': dict providing 'rt_tol', 'mz_tol' and 'nr_partitions'

    Returns:
        Tuple of (chunk_start_idx, serialized_consensus_features). Each
        serialized consensus feature is a dict with 'rt', 'mz', 'intensity',
        'quality', 'unique_id' (as string) and 'features', a list of
        {'unique_id': str, 'map_index': ...} entries for its constituents.
    """
    import pyopenms as oms

    chunk_start_idx = chunk_data['chunk_start_idx']
    chunk_features_data = chunk_data['chunk_features_data']  # List of feature dicts
    chunk_samples_data = chunk_data['chunk_samples_data']  # List of sample dicts
    params_dict = chunk_data['params']

    # Bucket the features by sample in a single pass instead of re-scanning
    # the whole feature list once per sample. Order within a sample is kept.
    features_by_sample = {}
    for feature_dict in chunk_features_data:
        features_by_sample.setdefault(feature_dict['sample_uid'], []).append(feature_dict)

    # Reconstruct one FeatureMap per sample, in the order of chunk_samples_data
    chunk_maps = []
    for sample_data in chunk_samples_data:
        feature_map = oms.FeatureMap()

        for feature_dict in features_by_sample.get(sample_data['sample_uid'], ()):
            feature = oms.Feature()
            feature.setRT(float(feature_dict['rt']))
            feature.setMZ(float(feature_dict['mz']))
            feature.setIntensity(float(feature_dict['inty']))

            # The key may be present with a None value (null in the source
            # table); int(None) would raise, so fall back to charge 0.
            charge = feature_dict.get('charge')
            feature.setCharge(int(charge) if charge is not None else 0)

            # Set unique ID using feature_id for mapping back
            feature.setUniqueId(int(feature_dict['feature_id']))

            feature_map.push_back(feature)

        chunk_maps.append(feature_map)

    # Create the chunk consensus map
    chunk_consensus_map = oms.ConsensusMap()

    # Set up file descriptions for chunk (one column header per sample map)
    file_descriptions = chunk_consensus_map.getColumnHeaders()
    for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
        file_description = file_descriptions.get(j, oms.ColumnHeader())
        file_description.filename = sample_data['sample_name']
        file_description.size = feature_map.size()
        file_description.unique_id = feature_map.getUniqueId()
        file_descriptions[j] = file_description

    chunk_consensus_map.setColumnHeaders(file_descriptions)

    # Use QT algorithm for chunk
    grouper = oms.FeatureGroupingAlgorithmQT()
    chunk_params = grouper.getParameters()
    chunk_params.setValue("distance_RT:max_difference", params_dict['rt_tol'])
    chunk_params.setValue("distance_MZ:max_difference", params_dict['mz_tol'])
    chunk_params.setValue("distance_MZ:unit", "Da")
    chunk_params.setValue("ignore_charge", "true")
    chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])

    grouper.setParameters(chunk_params)
    grouper.group(chunk_maps, chunk_consensus_map)

    # Serialize the consensus map result for cross-process communication;
    # unique ids are sent as strings and converted back by the consumer.
    consensus_features = []
    for consensus_feature in chunk_consensus_map:
        feature_data = {
            'rt': consensus_feature.getRT(),
            'mz': consensus_feature.getMZ(),
            'intensity': consensus_feature.getIntensity(),
            'quality': consensus_feature.getQuality(),
            'unique_id': str(consensus_feature.getUniqueId()),
            'features': []
        }

        # Get constituent features
        for feature_handle in consensus_feature.getFeatureList():
            feature_handle_data = {
                'unique_id': str(feature_handle.getUniqueId()),
                'map_index': feature_handle.getMapIndex()
            }
            feature_data['features'].append(feature_handle_data)

        consensus_features.append(feature_data)

    return chunk_start_idx, consensus_features
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _serialize_feature_map(feature_map):
    """
    Flatten a FeatureMap into plain dictionaries suitable for multiprocessing.

    Args:
        feature_map: Iterable of features (an OpenMS FeatureMap) whose items
            expose getRT(), getMZ(), getIntensity(), getCharge() and
            getUniqueId().

    Returns:
        List with one dict per feature, in iteration order, holding the keys
        'rt', 'mz', 'intensity', 'charge' and 'unique_id'. Note these key
        names differ from the 'inty' / 'feature_id' keys read by the
        chunk worker functions in this module.
    """
    return [
        {
            'rt': feat.getRT(),
            'mz': feat.getMZ(),
            'intensity': feat.getIntensity(),
            'charge': feat.getCharge(),
            'unique_id': feat.getUniqueId(),
        }
        for feat in feature_map
    ]
|
|
274
|
+
|
|
275
|
+
|
|
16
276
|
def merge(self, **kwargs) -> None:
|
|
17
277
|
"""
|
|
18
278
|
Group features across samples into consensus features using various algorithms.
|
|
@@ -34,6 +294,8 @@ def merge(self, **kwargs) -> None:
|
|
|
34
294
|
m/z tolerance in Da (Daltons) for all methods
|
|
35
295
|
- chunk_size : int, default 500
|
|
36
296
|
Chunk size for 'chunked' method
|
|
297
|
+
- threads : int or None, default None
|
|
298
|
+
Number of parallel processes for chunked methods (kd_chunked, qt_chunked)
|
|
37
299
|
- nr_partitions : int, default 500
|
|
38
300
|
Number of partitions in m/z dimension for KD algorithms
|
|
39
301
|
- min_rel_cc_size : float, default 0.3
|
|
@@ -54,9 +316,19 @@ def merge(self, **kwargs) -> None:
|
|
|
54
316
|
- NoWarp: Memory efficient KD without RT warping for large datasets
|
|
55
317
|
- KD-Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
|
|
56
318
|
Uses optimized partitioning for better memory management while maintaining
|
|
57
|
-
full cross-sample consensus feature detection.
|
|
319
|
+
full cross-sample consensus feature detection. Supports parallel processing.
|
|
58
320
|
- QT-Chunked: Memory-optimized QT algorithm for very large datasets (>5000 samples)
|
|
59
321
|
Uses QT clustering in first stage with optimized cross-chunk consensus building.
|
|
322
|
+
Supports parallel processing.
|
|
323
|
+
|
|
324
|
+
Parallel Processing
|
|
325
|
+
------------------
|
|
326
|
+
For kd_chunked and qt_chunked methods, use threads > 1 to enable parallel processing
|
|
327
|
+
of chunk alignments. This can significantly reduce processing time for large datasets
|
|
328
|
+
by processing multiple chunks simultaneously in separate processes.
|
|
329
|
+
|
|
330
|
+
Example:
|
|
331
|
+
study.merge(method='kd_chunked', threads=4, chunk_size=200)
|
|
60
332
|
"""
|
|
61
333
|
start_time = time.time()
|
|
62
334
|
|
|
@@ -774,7 +1046,7 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
|
|
|
774
1046
|
|
|
775
1047
|
|
|
776
1048
|
def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
|
|
777
|
-
"""KD-based chunked merge with proper cross-chunk consensus building"""
|
|
1049
|
+
"""KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
|
|
778
1050
|
|
|
779
1051
|
n_samples = len(self.features_maps)
|
|
780
1052
|
if n_samples <= params.chunk_size:
|
|
@@ -790,54 +1062,130 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
790
1062
|
chunk_end = min(i + params.chunk_size, n_samples)
|
|
791
1063
|
chunks.append((i, self.features_maps[i:chunk_end]))
|
|
792
1064
|
|
|
793
|
-
self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
|
|
1065
|
+
self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
|
|
794
1066
|
|
|
795
1067
|
# Process each chunk to create chunk consensus maps
|
|
796
1068
|
chunk_consensus_maps = []
|
|
797
1069
|
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
1070
|
+
if params.threads is None:
|
|
1071
|
+
# Sequential processing (original behavior)
|
|
1072
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}KD Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
|
|
1073
|
+
chunk_consensus_map = oms.ConsensusMap()
|
|
1074
|
+
|
|
1075
|
+
# Set up file descriptions for chunk
|
|
1076
|
+
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
1077
|
+
for j, feature_map in enumerate(chunk_maps):
|
|
1078
|
+
file_description = file_descriptions.get(j, oms.ColumnHeader())
|
|
1079
|
+
file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
|
|
1080
|
+
file_description.size = feature_map.size()
|
|
1081
|
+
file_description.unique_id = feature_map.getUniqueId()
|
|
1082
|
+
file_descriptions[j] = file_description
|
|
1083
|
+
|
|
1084
|
+
chunk_consensus_map.setColumnHeaders(file_descriptions)
|
|
1085
|
+
|
|
1086
|
+
# Use KD algorithm for chunk
|
|
1087
|
+
grouper = oms.FeatureGroupingAlgorithmKD()
|
|
1088
|
+
chunk_params = grouper.getParameters()
|
|
1089
|
+
chunk_params.setValue("mz_unit", "Da")
|
|
1090
|
+
chunk_params.setValue("nr_partitions", params.nr_partitions)
|
|
1091
|
+
chunk_params.setValue("warp:enabled", "true")
|
|
1092
|
+
chunk_params.setValue("warp:rt_tol", params.rt_tol)
|
|
1093
|
+
chunk_params.setValue("warp:mz_tol", params.mz_tol)
|
|
1094
|
+
chunk_params.setValue("link:rt_tol", params.rt_tol)
|
|
1095
|
+
chunk_params.setValue("link:mz_tol", params.mz_tol)
|
|
1096
|
+
chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
|
|
1097
|
+
chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
|
|
1098
|
+
chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
|
|
1099
|
+
|
|
1100
|
+
grouper.setParameters(chunk_params)
|
|
1101
|
+
grouper.group(chunk_maps, chunk_consensus_map)
|
|
1102
|
+
|
|
1103
|
+
chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
|
|
1104
|
+
|
|
1105
|
+
else:
|
|
1106
|
+
# Parallel processing
|
|
1107
|
+
self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
|
|
811
1108
|
|
|
812
|
-
#
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
1109
|
+
# Prepare chunk data for parallel processing using features_df slices
|
|
1110
|
+
chunk_data_list = []
|
|
1111
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
|
|
1112
|
+
# Get the sample UIDs for this chunk
|
|
1113
|
+
chunk_sample_uids = []
|
|
1114
|
+
chunk_samples_df_rows = []
|
|
1115
|
+
for j in range(len(chunk_maps)):
|
|
1116
|
+
sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
|
|
1117
|
+
chunk_sample_uids.append(sample_row['sample_uid'])
|
|
1118
|
+
chunk_samples_df_rows.append(sample_row)
|
|
1119
|
+
|
|
1120
|
+
# Create a DataFrame for this chunk's samples
|
|
1121
|
+
chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
|
|
1122
|
+
|
|
1123
|
+
# Filter features_df for this chunk's samples and select only necessary columns
|
|
1124
|
+
chunk_features_df = self.features_df.filter(
|
|
1125
|
+
pl.col('sample_uid').is_in(chunk_sample_uids)
|
|
1126
|
+
).select([
|
|
1127
|
+
'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
|
|
1128
|
+
])
|
|
1129
|
+
|
|
1130
|
+
# Convert DataFrames to serializable format (lists of dicts)
|
|
1131
|
+
chunk_features_data = chunk_features_df.to_dicts()
|
|
1132
|
+
chunk_samples_data = chunk_samples_df.to_dicts()
|
|
1133
|
+
|
|
1134
|
+
chunk_data = {
|
|
1135
|
+
'chunk_start_idx': chunk_start_idx,
|
|
1136
|
+
'chunk_features_data': chunk_features_data, # List of dicts instead of DataFrame
|
|
1137
|
+
'chunk_samples_data': chunk_samples_data, # List of dicts instead of DataFrame
|
|
1138
|
+
'params': {
|
|
1139
|
+
'nr_partitions': params.nr_partitions,
|
|
1140
|
+
'rt_tol': params.rt_tol,
|
|
1141
|
+
'mz_tol': params.mz_tol,
|
|
1142
|
+
'min_rel_cc_size': params.min_rel_cc_size,
|
|
1143
|
+
'max_pairwise_log_fc': params.max_pairwise_log_fc,
|
|
1144
|
+
'max_nr_conflicts': params.max_nr_conflicts
|
|
1145
|
+
}
|
|
1146
|
+
}
|
|
1147
|
+
chunk_data_list.append(chunk_data)
|
|
825
1148
|
|
|
826
|
-
|
|
827
|
-
|
|
1149
|
+
# Process chunks in parallel
|
|
1150
|
+
with ProcessPoolExecutor(max_workers=params.threads) as executor:
|
|
1151
|
+
# Submit all chunk processing tasks
|
|
1152
|
+
future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
|
|
1153
|
+
for i, chunk_data in enumerate(chunk_data_list)}
|
|
1154
|
+
|
|
1155
|
+
# Collect results with progress tracking
|
|
1156
|
+
completed_chunks = 0
|
|
1157
|
+
total_chunks = len(chunk_data_list)
|
|
1158
|
+
serialized_chunk_results = []
|
|
1159
|
+
|
|
1160
|
+
for future in as_completed(future_to_chunk):
|
|
1161
|
+
chunk_idx = future_to_chunk[future]
|
|
1162
|
+
try:
|
|
1163
|
+
chunk_start_idx, consensus_features = future.result()
|
|
1164
|
+
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
1165
|
+
completed_chunks += 1
|
|
1166
|
+
n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
|
|
1167
|
+
self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
|
|
1168
|
+
except Exception as exc:
|
|
1169
|
+
self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
1170
|
+
raise exc
|
|
828
1171
|
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
1172
|
+
# Store serialized results for _merge_chunk_results to handle directly
|
|
1173
|
+
chunk_consensus_maps = []
|
|
1174
|
+
for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
|
|
1175
|
+
# Store serialized data directly for _merge_chunk_results to handle
|
|
1176
|
+
chunk_consensus_maps.append((chunk_start_idx, consensus_features))
|
|
1177
|
+
|
|
1178
|
+
# Merge chunk results with proper cross-chunk consensus building
|
|
1179
|
+
# _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
|
|
832
1180
|
_merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
|
|
833
1181
|
|
|
834
|
-
#
|
|
1182
|
+
# Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
|
|
835
1183
|
consensus_map = oms.ConsensusMap()
|
|
836
1184
|
return consensus_map
|
|
837
1185
|
|
|
838
1186
|
|
|
839
1187
|
def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
|
|
840
|
-
"""QT-based chunked merge with proper cross-chunk consensus building"""
|
|
1188
|
+
"""QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
|
|
841
1189
|
|
|
842
1190
|
n_samples = len(self.features_maps)
|
|
843
1191
|
if n_samples <= params.chunk_size:
|
|
@@ -853,43 +1201,116 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
853
1201
|
chunk_end = min(i + params.chunk_size, n_samples)
|
|
854
1202
|
chunks.append((i, self.features_maps[i:chunk_end]))
|
|
855
1203
|
|
|
856
|
-
self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
|
|
1204
|
+
self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
|
|
857
1205
|
|
|
858
1206
|
# Process each chunk to create chunk consensus maps
|
|
859
1207
|
chunk_consensus_maps = []
|
|
860
1208
|
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
1209
|
+
if params.threads is None:
|
|
1210
|
+
# Sequential processing (original behavior)
|
|
1211
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}QT Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
|
|
1212
|
+
chunk_consensus_map = oms.ConsensusMap()
|
|
1213
|
+
|
|
1214
|
+
# Set up file descriptions for chunk
|
|
1215
|
+
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
1216
|
+
for j, feature_map in enumerate(chunk_maps):
|
|
1217
|
+
file_description = file_descriptions.get(j, oms.ColumnHeader())
|
|
1218
|
+
file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
|
|
1219
|
+
file_description.size = feature_map.size()
|
|
1220
|
+
file_description.unique_id = feature_map.getUniqueId()
|
|
1221
|
+
file_descriptions[j] = file_description
|
|
1222
|
+
|
|
1223
|
+
chunk_consensus_map.setColumnHeaders(file_descriptions)
|
|
1224
|
+
|
|
1225
|
+
# Use QT algorithm for chunk (main difference from KD chunked)
|
|
1226
|
+
grouper = oms.FeatureGroupingAlgorithmQT()
|
|
1227
|
+
chunk_params = grouper.getParameters()
|
|
1228
|
+
chunk_params.setValue("distance_RT:max_difference", params.rt_tol)
|
|
1229
|
+
chunk_params.setValue("distance_MZ:max_difference", params.mz_tol)
|
|
1230
|
+
chunk_params.setValue("distance_MZ:unit", "Da")
|
|
1231
|
+
chunk_params.setValue("ignore_charge", "true")
|
|
1232
|
+
chunk_params.setValue("nr_partitions", params.nr_partitions)
|
|
1233
|
+
|
|
1234
|
+
grouper.setParameters(chunk_params)
|
|
1235
|
+
grouper.group(chunk_maps, chunk_consensus_map)
|
|
1236
|
+
|
|
1237
|
+
chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
|
|
1238
|
+
|
|
1239
|
+
else:
|
|
1240
|
+
# Parallel processing
|
|
1241
|
+
self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
|
|
874
1242
|
|
|
875
|
-
#
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
1243
|
+
# Prepare chunk data for parallel processing using features_df slices
|
|
1244
|
+
chunk_data_list = []
|
|
1245
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
|
|
1246
|
+
# Get the sample UIDs for this chunk
|
|
1247
|
+
chunk_sample_uids = []
|
|
1248
|
+
chunk_samples_df_rows = []
|
|
1249
|
+
for j in range(len(chunk_maps)):
|
|
1250
|
+
sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
|
|
1251
|
+
chunk_sample_uids.append(sample_row['sample_uid'])
|
|
1252
|
+
chunk_samples_df_rows.append(sample_row)
|
|
1253
|
+
|
|
1254
|
+
# Create a DataFrame for this chunk's samples
|
|
1255
|
+
chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
|
|
1256
|
+
|
|
1257
|
+
# Filter features_df for this chunk's samples and select only necessary columns
|
|
1258
|
+
chunk_features_df = self.features_df.filter(
|
|
1259
|
+
pl.col('sample_uid').is_in(chunk_sample_uids)
|
|
1260
|
+
).select([
|
|
1261
|
+
'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
|
|
1262
|
+
])
|
|
1263
|
+
|
|
1264
|
+
# Convert DataFrames to serializable format (lists of dicts)
|
|
1265
|
+
chunk_features_data = chunk_features_df.to_dicts()
|
|
1266
|
+
chunk_samples_data = chunk_samples_df.to_dicts()
|
|
1267
|
+
|
|
1268
|
+
chunk_data = {
|
|
1269
|
+
'chunk_start_idx': chunk_start_idx,
|
|
1270
|
+
'chunk_features_data': chunk_features_data, # List of dicts instead of DataFrame
|
|
1271
|
+
'chunk_samples_data': chunk_samples_data, # List of dicts instead of DataFrame
|
|
1272
|
+
'params': {
|
|
1273
|
+
'nr_partitions': params.nr_partitions,
|
|
1274
|
+
'rt_tol': params.rt_tol,
|
|
1275
|
+
'mz_tol': params.mz_tol,
|
|
1276
|
+
}
|
|
1277
|
+
}
|
|
1278
|
+
chunk_data_list.append(chunk_data)
|
|
883
1279
|
|
|
884
|
-
|
|
885
|
-
|
|
1280
|
+
# Process chunks in parallel
|
|
1281
|
+
with ProcessPoolExecutor(max_workers=params.threads) as executor:
|
|
1282
|
+
# Submit all chunk processing tasks
|
|
1283
|
+
future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
|
|
1284
|
+
for i, chunk_data in enumerate(chunk_data_list)}
|
|
1285
|
+
|
|
1286
|
+
# Collect results with progress tracking
|
|
1287
|
+
completed_chunks = 0
|
|
1288
|
+
total_chunks = len(chunk_data_list)
|
|
1289
|
+
serialized_chunk_results = []
|
|
1290
|
+
|
|
1291
|
+
for future in as_completed(future_to_chunk):
|
|
1292
|
+
chunk_idx = future_to_chunk[future]
|
|
1293
|
+
try:
|
|
1294
|
+
chunk_start_idx, consensus_features = future.result()
|
|
1295
|
+
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
1296
|
+
completed_chunks += 1
|
|
1297
|
+
n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
|
|
1298
|
+
self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
|
|
1299
|
+
except Exception as exc:
|
|
1300
|
+
self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
1301
|
+
raise exc
|
|
886
1302
|
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
1303
|
+
# Store serialized results for _merge_chunk_results to handle directly
|
|
1304
|
+
chunk_consensus_maps = []
|
|
1305
|
+
for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
|
|
1306
|
+
# Store serialized data directly for _merge_chunk_results to handle
|
|
1307
|
+
chunk_consensus_maps.append((chunk_start_idx, consensus_features))
|
|
1308
|
+
|
|
1309
|
+
# Merge chunk results with proper cross-chunk consensus building
|
|
1310
|
+
# _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
|
|
890
1311
|
_merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
|
|
891
1312
|
|
|
892
|
-
#
|
|
1313
|
+
# Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
|
|
893
1314
|
consensus_map = oms.ConsensusMap()
|
|
894
1315
|
return consensus_map
|
|
895
1316
|
|
|
@@ -927,61 +1348,128 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
|
|
|
927
1348
|
all_chunk_consensus = []
|
|
928
1349
|
consensus_id_counter = 0
|
|
929
1350
|
|
|
930
|
-
for chunk_idx, (chunk_start_idx,
|
|
931
|
-
|
|
1351
|
+
for chunk_idx, (chunk_start_idx, chunk_data) in enumerate(chunk_consensus_maps):
|
|
1352
|
+
# Handle both ConsensusMap objects (sequential) and serialized data (parallel)
|
|
1353
|
+
if isinstance(chunk_data, list):
|
|
1354
|
+
# Parallel processing: chunk_data is a list of serialized consensus feature dictionaries
|
|
1355
|
+
consensus_features_data = chunk_data
|
|
1356
|
+
else:
|
|
1357
|
+
# Sequential processing: chunk_data is a ConsensusMap object
|
|
1358
|
+
chunk_consensus_map = chunk_data
|
|
1359
|
+
consensus_features_data = []
|
|
1360
|
+
|
|
1361
|
+
# Extract data from ConsensusMap and convert to serialized format
|
|
1362
|
+
for consensus_feature in chunk_consensus_map:
|
|
1363
|
+
# Extract feature_uids from this consensus feature
|
|
1364
|
+
feature_uids = []
|
|
1365
|
+
feature_data_list = []
|
|
1366
|
+
sample_uids = []
|
|
1367
|
+
|
|
1368
|
+
for feature_handle in consensus_feature.getFeatureList():
|
|
1369
|
+
fuid = str(feature_handle.getUniqueId())
|
|
1370
|
+
if fuid not in feature_uid_map:
|
|
1371
|
+
continue
|
|
1372
|
+
|
|
1373
|
+
feature_uid = feature_uid_map[fuid]
|
|
1374
|
+
feature_data = features_lookup.get(feature_uid)
|
|
1375
|
+
if feature_data:
|
|
1376
|
+
feature_uids.append(feature_uid)
|
|
1377
|
+
feature_data_list.append(feature_data)
|
|
1378
|
+
sample_uids.append(chunk_start_idx + feature_handle.getMapIndex() + 1)
|
|
1379
|
+
|
|
1380
|
+
if not feature_data_list:
|
|
1381
|
+
# No retrievable feature metadata (possible stale map reference) -> skip
|
|
1382
|
+
continue
|
|
1383
|
+
|
|
1384
|
+
# Convert ConsensusFeature to serialized format
|
|
1385
|
+
consensus_feature_data = {
|
|
1386
|
+
'rt': consensus_feature.getRT(),
|
|
1387
|
+
'mz': consensus_feature.getMZ(),
|
|
1388
|
+
'intensity': consensus_feature.getIntensity(),
|
|
1389
|
+
'quality': consensus_feature.getQuality(),
|
|
1390
|
+
'feature_uids': feature_uids,
|
|
1391
|
+
'feature_data_list': feature_data_list,
|
|
1392
|
+
'sample_uids': sample_uids
|
|
1393
|
+
}
|
|
1394
|
+
consensus_features_data.append(consensus_feature_data)
|
|
1395
|
+
|
|
1396
|
+
# Process the consensus features (now all in serialized format)
|
|
1397
|
+
for consensus_feature_data in consensus_features_data:
|
|
932
1398
|
# ACCEPT ALL consensus features (size >=1) here.
|
|
933
1399
|
# Reason: A feature that is globally present in many samples can still
|
|
934
1400
|
# appear only once inside a given sample chunk. Early filtering at
|
|
935
1401
|
# size>=2 causes irreversible loss and underestimates the final
|
|
936
1402
|
# consensus count (observed ~296 vs 950 for KD). We defer filtering
|
|
937
1403
|
# strictly to the final global min_samples.
|
|
938
|
-
|
|
939
|
-
# Extract feature_uids from this consensus feature
|
|
940
|
-
feature_uids = []
|
|
941
|
-
feature_data_list = []
|
|
942
|
-
sample_uids = []
|
|
943
1404
|
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
1405
|
+
# For parallel processing, feature data is already extracted
|
|
1406
|
+
if isinstance(chunk_data, list):
|
|
1407
|
+
# Extract feature_uids and data from serialized format for parallel processing
|
|
1408
|
+
feature_uids = []
|
|
1409
|
+
feature_data_list = []
|
|
1410
|
+
sample_uids = []
|
|
1411
|
+
|
|
1412
|
+
for handle_data in consensus_feature_data['features']:
|
|
1413
|
+
fuid = str(handle_data['unique_id'])
|
|
1414
|
+
if fuid not in feature_uid_map:
|
|
1415
|
+
continue
|
|
1416
|
+
|
|
1417
|
+
feature_uid = feature_uid_map[fuid]
|
|
1418
|
+
feature_data = features_lookup.get(feature_uid)
|
|
1419
|
+
if feature_data:
|
|
1420
|
+
feature_uids.append(feature_uid)
|
|
1421
|
+
feature_data_list.append(feature_data)
|
|
1422
|
+
sample_uids.append(chunk_start_idx + handle_data['map_index'] + 1)
|
|
1423
|
+
|
|
1424
|
+
if not feature_data_list:
|
|
947
1425
|
continue
|
|
948
1426
|
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
1427
|
+
# Get RT/MZ from consensus feature data
|
|
1428
|
+
consensus_rt = consensus_feature_data['rt']
|
|
1429
|
+
consensus_mz = consensus_feature_data['mz']
|
|
1430
|
+
consensus_intensity = consensus_feature_data['intensity']
|
|
1431
|
+
consensus_quality = consensus_feature_data['quality']
|
|
1432
|
+
else:
|
|
1433
|
+
# Sequential processing: data is already extracted above
|
|
1434
|
+
feature_uids = consensus_feature_data['feature_uids']
|
|
1435
|
+
feature_data_list = consensus_feature_data['feature_data_list']
|
|
1436
|
+
sample_uids = consensus_feature_data['sample_uids']
|
|
1437
|
+
consensus_rt = consensus_feature_data['rt']
|
|
1438
|
+
consensus_mz = consensus_feature_data['mz']
|
|
1439
|
+
consensus_intensity = consensus_feature_data['intensity']
|
|
1440
|
+
consensus_quality = consensus_feature_data['quality']
|
|
955
1441
|
|
|
956
1442
|
if not feature_data_list:
|
|
957
1443
|
# No retrievable feature metadata (possible stale map reference) -> skip
|
|
958
|
-
continue
|
|
1444
|
+
continue
|
|
1445
|
+
|
|
1446
|
+
# Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
|
|
959
1447
|
rt_vals_local = [fd.get("rt") for fd in feature_data_list if fd.get("rt") is not None]
|
|
960
1448
|
mz_vals_local = [fd.get("mz") for fd in feature_data_list if fd.get("mz") is not None]
|
|
961
1449
|
if rt_vals_local:
|
|
962
1450
|
rt_min_local = min(rt_vals_local)
|
|
963
1451
|
rt_max_local = max(rt_vals_local)
|
|
964
1452
|
else:
|
|
965
|
-
rt_min_local = rt_max_local =
|
|
1453
|
+
rt_min_local = rt_max_local = consensus_rt
|
|
966
1454
|
if mz_vals_local:
|
|
967
1455
|
mz_min_local = min(mz_vals_local)
|
|
968
1456
|
mz_max_local = max(mz_vals_local)
|
|
969
1457
|
else:
|
|
970
|
-
mz_min_local = mz_max_local =
|
|
1458
|
+
mz_min_local = mz_max_local = consensus_mz
|
|
971
1459
|
|
|
972
1460
|
# Store chunk consensus with feature tracking
|
|
973
1461
|
chunk_consensus_data = {
|
|
974
1462
|
'consensus_id': consensus_id_counter,
|
|
975
1463
|
'chunk_idx': chunk_idx,
|
|
976
1464
|
'chunk_start_idx': chunk_start_idx,
|
|
977
|
-
'mz':
|
|
978
|
-
'rt':
|
|
1465
|
+
'mz': consensus_mz,
|
|
1466
|
+
'rt': consensus_rt,
|
|
979
1467
|
'mz_min': mz_min_local,
|
|
980
1468
|
'mz_max': mz_max_local,
|
|
981
1469
|
'rt_min': rt_min_local,
|
|
982
1470
|
'rt_max': rt_max_local,
|
|
983
|
-
'intensity':
|
|
984
|
-
'quality':
|
|
1471
|
+
'intensity': consensus_intensity,
|
|
1472
|
+
'quality': consensus_quality,
|
|
985
1473
|
'feature_uids': feature_uids,
|
|
986
1474
|
'feature_data_list': feature_data_list,
|
|
987
1475
|
'sample_uids': sample_uids,
|
|
@@ -1479,9 +1967,6 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
|
|
|
1479
1967
|
return list(groups_by_root.values())
|
|
1480
1968
|
|
|
1481
1969
|
|
|
1482
|
-
# Note: Restored proper chunked implementation with cross-chunk consensus clustering
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
1970
|
def _reset_consensus_data(self):
|
|
1486
1971
|
"""Reset consensus-related DataFrames at the start of merge."""
|
|
1487
1972
|
self.consensus_df = pl.DataFrame()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|