masster 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged as possibly problematic by the registry.
- masster/_version.py +1 -1
- masster/study/defaults/merge_def.py +53 -11
- masster/study/merge.py +628 -75
- masster/study/processing.py +0 -1
- {masster-0.4.17.dist-info → masster-0.4.19.dist-info}/METADATA +1 -1
- {masster-0.4.17.dist-info → masster-0.4.19.dist-info}/RECORD +9 -9
- {masster-0.4.17.dist-info → masster-0.4.19.dist-info}/WHEEL +0 -0
- {masster-0.4.17.dist-info → masster-0.4.19.dist-info}/entry_points.txt +0 -0
- {masster-0.4.17.dist-info → masster-0.4.19.dist-info}/licenses/LICENSE +0 -0
masster/_version.py CHANGED

masster/study/defaults/merge_def.py CHANGED

@@ -1,7 +1,7 @@
 """Parameter class for Study merge method."""
 
 from dataclasses import dataclass, field
-from typing import Any
+from typing import Any, Optional
 
 
 @dataclass
@@ -25,25 +25,28 @@ class merge_defaults:
         link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
     """
 
-    method: str = "
-    min_samples: int =
+    method: str = "qt"
+    min_samples: int = 2
     rt_tol: float = 5.0
     mz_tol: float = 0.01
-    chunk_size: int =
+    chunk_size: int = 500
     nr_partitions: int = 1000
-    min_rel_cc_size: float = 0.
+    min_rel_cc_size: float = 0.1
     max_pairwise_log_fc: float = -1.0
    max_nr_conflicts: int = 0
     link_ms2: bool = True
 
+    # Parallel processing parameters
+    threads: Optional[int] = None
+
     # KD-Strict specific parameters
     optimize_rt_tol: bool = False
-    rt_tol_range: tuple = (0.
-    rt_tol_steps: int =
-    secondary_merge_rt_tol: float = 0
+    rt_tol_range: tuple = (0.5, 4.0)
+    rt_tol_steps: int = 7
+    secondary_merge_rt_tol: float = 1.0
     secondary_merge_mz_tol: float = 0.005
     min_sample_overlap: float = 0.8
-    max_rt_spread: float =
+    max_rt_spread: float = 2.0  # Will default to 2x rt_tol
     min_coherence: float = 0.0
 
     _param_metadata: dict[str, dict[str, Any]] = field(
@@ -53,7 +56,8 @@ class merge_defaults:
             "description": "Merge method (algorithm) to use",
             "default": "quality",
             "allowed_values": ["sensitivity", "qt", "nowarp", "chunked", "quality",
-                               "kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict"
+                               "kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict",
+                               "kd_chunked", "kd-chunked", "qt_chunked", "qt-chunked"],
         },
         "min_samples": {
             "dtype": int,
@@ -114,6 +118,14 @@ class merge_defaults:
             "description": "Whether to link MS2 spectra to consensus features",
             "default": True,
         },
+        # Parallel processing parameters
+        "threads": {
+            "dtype": [int, type(None)],
+            "description": "Number of parallel threads/processes for chunked methods (None=original sequential)",
+            "default": None,
+            "min_value": 1,
+            "max_value": 32,
+        },
         # KD-Strict specific parameters
         "optimize_rt_tol": {
             "dtype": bool,
@@ -216,7 +228,37 @@ class merge_defaults:
         metadata = self._param_metadata[param_name]
         expected_dtype = metadata["dtype"]
 
-        #
+        # Handle Optional types (list of types including None)
+        if isinstance(expected_dtype, list):
+            # Check if value matches any of the allowed types
+            valid_type = False
+            for dtype in expected_dtype:
+                if dtype is type(None) and value is None:
+                    return True  # None is explicitly allowed
+                elif dtype is int and isinstance(value, int):
+                    valid_type = True
+                    break
+                elif dtype is float and isinstance(value, (int, float)):
+                    valid_type = True
+                    break
+                elif dtype is bool and isinstance(value, bool):
+                    valid_type = True
+                    break
+                elif dtype is str and isinstance(value, str):
+                    valid_type = True
+                    break
+
+            if not valid_type:
+                return False
+
+            # For None values, skip further validation
+            if value is None:
+                return True
+
+            # Use the first non-None type for range validation
+            expected_dtype = next((dt for dt in expected_dtype if dt is not type(None)), expected_dtype[0])
+
+        # Type checking for non-Optional types
         if expected_dtype is int:
             if not isinstance(value, int):
                 try:
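The hunk above is what lets a metadata "dtype" entry be a list such as [int, type(None)], which is how the new Optional threads parameter passes validation. A minimal standalone sketch of the same check follows; the helper name and the simplified range handling are illustrative assumptions, not masster's actual API:

# Minimal sketch of list-based dtype validation, mirroring the hunk above.
# validate_optional_int is a hypothetical helper, not part of masster.
def validate_optional_int(value, min_value=1, max_value=32):
    allowed = [int, type(None)]
    if value is None and type(None) in allowed:
        return True                                  # None is explicitly allowed
    if isinstance(value, bool) or not isinstance(value, int):
        return False                                 # reject bool and non-int values
    return min_value <= value <= max_value           # range check for real ints

print(validate_optional_int(None))  # True  -> sequential processing
print(validate_optional_int(4))     # True  -> 4 worker processes
print(validate_optional_int(64))    # False -> above max_value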
masster/study/merge.py CHANGED

@@ -1,6 +1,6 @@
 """
 Unified merge module for the Study class.
-Supports multiple merge methods: 'kd', 'qt', 'kd-nowarp', '
+Supports multiple merge methods: 'kd', 'qt', 'kd-nowarp', 'kd_chunked', 'qt_chunked'
 """
 
 import time
@@ -10,9 +10,269 @@ from datetime import datetime
 from tqdm import tqdm
 import pyopenms as oms
 import polars as pl
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from masster.study.defaults import merge_defaults
 
 
+def _process_kd_chunk_parallel(chunk_data):
+    """
+    Process a single KD chunk in parallel by reconstructing FeatureMaps from features_df slice.
+
+    Args:
+        chunk_data: Dictionary containing chunk processing parameters
+
+    Returns:
+        Tuple of (chunk_start_idx, serialized_consensus_features)
+    """
+    import pyopenms as oms
+
+    chunk_start_idx = chunk_data['chunk_start_idx']
+    chunk_features_data = chunk_data['chunk_features_data']  # List of feature dicts
+    chunk_samples_data = chunk_data['chunk_samples_data']  # List of sample dicts
+    params_dict = chunk_data['params']
+
+    # Reconstruct FeatureMaps from features data for each sample in the chunk
+    chunk_maps = []
+
+    for sample_data in chunk_samples_data:
+        sample_uid = sample_data['sample_uid']
+
+        # Filter features for this specific sample
+        sample_features = [f for f in chunk_features_data if f['sample_uid'] == sample_uid]
+
+        # Create FeatureMap for this sample
+        feature_map = oms.FeatureMap()
+
+        # Add each feature to the map
+        for feature_dict in sample_features:
+            feature = oms.Feature()
+            feature.setRT(float(feature_dict['rt']))
+            feature.setMZ(float(feature_dict['mz']))
+            feature.setIntensity(float(feature_dict['inty']))
+            feature.setCharge(int(feature_dict.get('charge', 0)))
+
+            # Set unique ID using feature_id for mapping back
+            feature.setUniqueId(int(feature_dict['feature_id']))
+
+            feature_map.push_back(feature)
+
+        chunk_maps.append(feature_map)
+
+    # Create the chunk consensus map
+    chunk_consensus_map = oms.ConsensusMap()
+
+    # Set up file descriptions for chunk
+    file_descriptions = chunk_consensus_map.getColumnHeaders()
+    for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
+        file_description = file_descriptions.get(j, oms.ColumnHeader())
+        file_description.filename = sample_data['sample_name']
+        file_description.size = feature_map.size()
+        file_description.unique_id = feature_map.getUniqueId()
+        file_descriptions[j] = file_description
+
+    chunk_consensus_map.setColumnHeaders(file_descriptions)
+
+    # Use KD algorithm for chunk
+    grouper = oms.FeatureGroupingAlgorithmKD()
+    chunk_params = grouper.getParameters()
+    chunk_params.setValue("mz_unit", "Da")
+    chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
+    chunk_params.setValue("warp:enabled", "true")
+    chunk_params.setValue("warp:rt_tol", params_dict['rt_tol'])
+    chunk_params.setValue("warp:mz_tol", params_dict['mz_tol'])
+    chunk_params.setValue("link:rt_tol", params_dict['rt_tol'])
+    chunk_params.setValue("link:mz_tol", params_dict['mz_tol'])
+    chunk_params.setValue("link:min_rel_cc_size", params_dict['min_rel_cc_size'])
+    chunk_params.setValue("link:max_pairwise_log_fc", params_dict['max_pairwise_log_fc'])
+    chunk_params.setValue("link:max_nr_conflicts", params_dict['max_nr_conflicts'])
+
+    grouper.setParameters(chunk_params)
+    grouper.group(chunk_maps, chunk_consensus_map)
+
+    # Serialize the consensus map result for cross-process communication
+    consensus_features = []
+    for consensus_feature in chunk_consensus_map:
+        feature_data = {
+            'rt': consensus_feature.getRT(),
+            'mz': consensus_feature.getMZ(),
+            'intensity': consensus_feature.getIntensity(),
+            'quality': consensus_feature.getQuality(),
+            'unique_id': str(consensus_feature.getUniqueId()),
+            'features': []
+        }
+
+        # Get constituent features
+        for feature_handle in consensus_feature.getFeatureList():
+            feature_handle_data = {
+                'unique_id': str(feature_handle.getUniqueId()),
+                'map_index': feature_handle.getMapIndex()
+            }
+            feature_data['features'].append(feature_handle_data)
+
+        consensus_features.append(feature_data)
+
+    return chunk_start_idx, consensus_features
+
+
+def _deserialize_consensus_features(consensus_features):
+    """
+    Deserialize consensus features back into an OpenMS ConsensusMap.
+
+    Args:
+        consensus_features: List of serialized consensus feature dictionaries
+
+    Returns:
+        OpenMS ConsensusMap object
+    """
+    import pyopenms as oms
+
+    consensus_map = oms.ConsensusMap()
+
+    for feature_data in consensus_features:
+        consensus_feature = oms.ConsensusFeature()
+        consensus_feature.setRT(float(feature_data['rt']))
+        consensus_feature.setMZ(float(feature_data['mz']))
+        consensus_feature.setIntensity(float(feature_data['intensity']))
+        consensus_feature.setQuality(float(feature_data['quality']))
+        consensus_feature.setUniqueId(int(feature_data['unique_id']))
+
+        # Reconstruct feature handles (simplified approach)
+        feature_handles = []
+        for handle_data in feature_data['features']:
+            feature_handle = oms.FeatureHandle()
+            feature_handle.setUniqueId(int(handle_data['unique_id']))
+            feature_handle.setMapIndex(int(handle_data['map_index']))
+            feature_handles.append(feature_handle)
+
+        # Set the feature list - properly add feature handles back to consensus feature
+        if feature_handles:
+            # Add each feature handle to the consensus feature using the correct OpenMS API
+            for feature_handle in feature_handles:
+                consensus_feature.getFeatureList().append(feature_handle)
+
+        consensus_map.push_back(consensus_feature)
+
+    return consensus_map
+
+
+def _process_qt_chunk_parallel(chunk_data):
+    """
+    Process a single QT chunk in parallel by reconstructing FeatureMaps from features_df slice.
+
+    Args:
+        chunk_data: Dictionary containing chunk processing parameters
+
+    Returns:
+        Tuple of (chunk_start_idx, serialized_consensus_features)
+    """
+    import pyopenms as oms
+
+    chunk_start_idx = chunk_data['chunk_start_idx']
+    chunk_features_data = chunk_data['chunk_features_data']  # List of feature dicts
+    chunk_samples_data = chunk_data['chunk_samples_data']  # List of sample dicts
+    params_dict = chunk_data['params']
+
+    # Reconstruct FeatureMaps from features data for each sample in the chunk
+    chunk_maps = []
+
+    for sample_data in chunk_samples_data:
+        sample_uid = sample_data['sample_uid']
+
+        # Filter features for this specific sample
+        sample_features = [f for f in chunk_features_data if f['sample_uid'] == sample_uid]
+
+        # Create FeatureMap for this sample
+        feature_map = oms.FeatureMap()
+
+        # Add each feature to the map
+        for feature_dict in sample_features:
+            feature = oms.Feature()
+            feature.setRT(float(feature_dict['rt']))
+            feature.setMZ(float(feature_dict['mz']))
+            feature.setIntensity(float(feature_dict['inty']))
+            feature.setCharge(int(feature_dict.get('charge', 0)))
+
+            # Set unique ID using feature_id for mapping back
+            feature.setUniqueId(int(feature_dict['feature_id']))
+
+            feature_map.push_back(feature)
+
+        chunk_maps.append(feature_map)
+
+    # Create the chunk consensus map
+    chunk_consensus_map = oms.ConsensusMap()
+
+    # Set up file descriptions for chunk
+    file_descriptions = chunk_consensus_map.getColumnHeaders()
+    for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
+        file_description = file_descriptions.get(j, oms.ColumnHeader())
+        file_description.filename = sample_data['sample_name']
+        file_description.size = feature_map.size()
+        file_description.unique_id = feature_map.getUniqueId()
+        file_descriptions[j] = file_description
+
+    chunk_consensus_map.setColumnHeaders(file_descriptions)
+
+    # Use QT algorithm for chunk
+    grouper = oms.FeatureGroupingAlgorithmQT()
+    chunk_params = grouper.getParameters()
+    chunk_params.setValue("distance_RT:max_difference", params_dict['rt_tol'])
+    chunk_params.setValue("distance_MZ:max_difference", params_dict['mz_tol'])
+    chunk_params.setValue("distance_MZ:unit", "Da")
+    chunk_params.setValue("ignore_charge", "true")
+    chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
+
+    grouper.setParameters(chunk_params)
+    grouper.group(chunk_maps, chunk_consensus_map)
+
+    # Serialize the consensus map result for cross-process communication
+    consensus_features = []
+    for consensus_feature in chunk_consensus_map:
+        feature_data = {
+            'rt': consensus_feature.getRT(),
+            'mz': consensus_feature.getMZ(),
+            'intensity': consensus_feature.getIntensity(),
+            'quality': consensus_feature.getQuality(),
+            'unique_id': str(consensus_feature.getUniqueId()),
+            'features': []
+        }
+
+        # Get constituent features
+        for feature_handle in consensus_feature.getFeatureList():
+            feature_handle_data = {
+                'unique_id': str(feature_handle.getUniqueId()),
+                'map_index': feature_handle.getMapIndex()
+            }
+            feature_data['features'].append(feature_handle_data)
+
+        consensus_features.append(feature_data)
+
+    return chunk_start_idx, consensus_features
+
+
+def _serialize_feature_map(feature_map):
+    """
+    Serialize a FeatureMap to a list of dictionaries for multiprocessing.
+
+    Args:
+        feature_map: OpenMS FeatureMap object
+
+    Returns:
+        List of feature dictionaries
+    """
+    features_data = []
+    for feature in feature_map:
+        feature_data = {
+            'rt': feature.getRT(),
+            'mz': feature.getMZ(),
+            'intensity': feature.getIntensity(),
+            'charge': feature.getCharge(),
+            'unique_id': feature.getUniqueId()
+        }
+        features_data.append(feature_data)
+    return features_data
+
+
 def merge(self, **kwargs) -> None:
     """
     Group features across samples into consensus features using various algorithms.
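These module-level helpers exist because the parallel path sends work to separate processes, and pyopenms objects such as FeatureMap do not survive pickling; each chunk therefore travels as plain dictionaries and the worker rebuilds the maps before grouping. Below is a stripped-down sketch of that serialize/rebuild/collect pattern; build_and_group is a hypothetical stand-in that only echoes picklable data, not masster's actual worker:

# Sketch of the serialize -> rebuild-in-worker -> collect pattern used by the chunked merges.
from concurrent.futures import ProcessPoolExecutor, as_completed

def build_and_group(chunk):
    # In masster this would rebuild oms.FeatureMap objects and run the grouper;
    # here we only echo back picklable dictionaries.
    return chunk['chunk_start_idx'], [{'rt': f['rt'], 'mz': f['mz']} for f in chunk['features']]

def run_chunks(chunks, threads):
    results = []
    with ProcessPoolExecutor(max_workers=threads) as executor:
        futures = {executor.submit(build_and_group, c): c['chunk_start_idx'] for c in chunks}
        for future in as_completed(futures):
            results.append(future.result())   # (chunk_start_idx, serialized features)
    return sorted(results)                    # deterministic order for later stitching

if __name__ == "__main__":
    demo = [{'chunk_start_idx': 0, 'features': [{'rt': 10.0, 'mz': 200.1}]},
            {'chunk_start_idx': 2, 'features': [{'rt': 11.0, 'mz': 300.2}]}]
    print(run_chunks(demo, threads=2))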
@@ -25,7 +285,7 @@ def merge(self, **kwargs) -> None:
     **kwargs : dict
         Parameters from merge_defaults class:
         - method : str, default 'quality'
-            Merge algorithm: 'sensitivity', 'qt', 'nowarp', '
+            Merge algorithm: 'sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality'
         - min_samples : int, default 10
             Minimum number of samples for consensus feature
         - rt_tol : float, default 2.0
@@ -34,6 +294,8 @@ def merge(self, **kwargs) -> None:
             m/z tolerance in Da (Daltons) for all methods
         - chunk_size : int, default 500
             Chunk size for 'chunked' method
+        - threads : int, default 1
+            Number of parallel processes for chunked methods (kd_chunked, qt_chunked)
         - nr_partitions : int, default 500
             Number of partitions in m/z dimension for KD algorithms
         - min_rel_cc_size : float, default 0.3
@@ -52,9 +314,21 @@ def merge(self, **kwargs) -> None:
     - Sensitivity: Best raw sensitivity, O(n log n), maximum feature detection
     - QT: Thorough but slow O(n²), good for <1000 samples
     - NoWarp: Memory efficient KD without RT warping for large datasets
-    - Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
+    - KD-Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
       Uses optimized partitioning for better memory management while maintaining
-      full cross-sample consensus feature detection.
+      full cross-sample consensus feature detection. Supports parallel processing.
+    - QT-Chunked: Memory-optimized QT algorithm for very large datasets (>5000 samples)
+      Uses QT clustering in first stage with optimized cross-chunk consensus building.
+      Supports parallel processing.
+
+    Parallel Processing
+    ------------------
+    For kd_chunked and qt_chunked methods, use threads > 1 to enable parallel processing
+    of chunk alignments. This can significantly reduce processing time for large datasets
+    by processing multiple chunks simultaneously in separate processes.
+
+    Example:
+        study.merge(method='kd_chunked', threads=4, chunk_size=200)
     """
     start_time = time.time()
 
@@ -76,7 +350,12 @@ def merge(self, **kwargs) -> None:
         'kd_nowarp': 'nowarp',
         'kd-strict': 'quality',
         'kd_strict': 'quality',
-        'kdstrict': 'quality'
+        'kdstrict': 'quality',
+        'chunked': 'kd_chunked',  # Map old 'chunked' to 'kd_chunked'
+        'qtchunked': 'qt_chunked',  # QT chunked variants
+        'qt-chunked': 'qt_chunked',
+        'kdchunked': 'kd_chunked',  # KD chunked variants
+        'kd-chunked': 'kd_chunked'
     }
 
     if params.method in method_mapping:
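With these aliases, a call that still uses the retired 'chunked' name keeps working and is routed to the KD-chunked path after a deprecation notice. A hedged illustration of the lookup itself, independent of any Study object (values taken from the hunk above):

# Sketch of the alias normalisation performed before dispatch.
method_mapping = {
    'chunked': 'kd_chunked', 'kdchunked': 'kd_chunked', 'kd-chunked': 'kd_chunked',
    'qtchunked': 'qt_chunked', 'qt-chunked': 'qt_chunked',
}
requested = 'chunked'                        # legacy spelling from an older script
method = method_mapping.get(requested, requested)
print(method)                                # -> 'kd_chunked'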
@@ -85,8 +364,8 @@ def merge(self, **kwargs) -> None:
         self.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
 
     # Validate method
-    if params.method not in ['sensitivity', 'qt', 'nowarp', '
-        raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', '
+    if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
+        raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")
 
     # Persist last used params for diagnostics
     try:
@@ -147,9 +426,12 @@ def merge(self, **kwargs) -> None:
     elif params.method == 'quality':
         consensus_map = _merge_kd_strict(self, params)
         # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
-    elif params.method == '
-        consensus_map =
-        # Note:
+    elif params.method == 'kd_chunked':
+        consensus_map = _merge_kd_chunked(self, params, cached_adducts_df, cached_valid_adducts)
+        # Note: _merge_kd_chunked populates consensus_df directly, no need to extract
+    elif params.method == 'qt_chunked':
+        consensus_map = _merge_qt_chunked(self, params, cached_adducts_df, cached_valid_adducts)
+        # Note: _merge_qt_chunked populates consensus_df directly, no need to extract
 
     # Perform adduct grouping
     self._perform_adduct_grouping(params.rt_tol, params.mz_tol)
@@ -189,9 +471,9 @@ def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
     params_oms.setValue("warp:mz_tol", params.mz_tol)
     params_oms.setValue("link:rt_tol", params.rt_tol)
     params_oms.setValue("link:mz_tol", params.mz_tol)
-    params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
-    params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
-    params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
+    #params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
+    #params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
+    #params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
     #params_oms.setValue("link:charge_merging", "With_charge_zero") THIS LEADS TO A CRASH
 
     grouper.setParameters(params_oms)
@@ -227,9 +509,9 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
     params_oms.setValue("distance_MZ:max_difference", params.mz_tol)
     params_oms.setValue("distance_MZ:unit", "Da")  # QT now uses Da like all other methods
     params_oms.setValue("ignore_charge", "true")
-    params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
-    params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
-    params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
+    #params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
+    #params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
+    #params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
     params_oms.setValue("nr_partitions", params.nr_partitions)
 
     grouper.setParameters(params_oms)
@@ -763,8 +1045,8 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
     return consensus_map
 
 
-def
-    """
+def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
+    """KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
 
     n_samples = len(self.features_maps)
     if n_samples <= params.chunk_size:
@@ -780,48 +1062,255 @@ def _merge_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_
         chunk_end = min(i + params.chunk_size, n_samples)
         chunks.append((i, self.features_maps[i:chunk_end]))
 
-    self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
+    self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
 
     # Process each chunk to create chunk consensus maps
     chunk_consensus_maps = []
 
+    if params.threads is None:
+        # Sequential processing (original behavior)
+        for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}KD Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
+            chunk_consensus_map = oms.ConsensusMap()
+
+            # Set up file descriptions for chunk
+            file_descriptions = chunk_consensus_map.getColumnHeaders()
+            for j, feature_map in enumerate(chunk_maps):
+                file_description = file_descriptions.get(j, oms.ColumnHeader())
+                file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
+                file_description.size = feature_map.size()
+                file_description.unique_id = feature_map.getUniqueId()
+                file_descriptions[j] = file_description
+
+            chunk_consensus_map.setColumnHeaders(file_descriptions)
+
+            # Use KD algorithm for chunk
+            grouper = oms.FeatureGroupingAlgorithmKD()
+            chunk_params = grouper.getParameters()
+            chunk_params.setValue("mz_unit", "Da")
+            chunk_params.setValue("nr_partitions", params.nr_partitions)
+            chunk_params.setValue("warp:enabled", "true")
+            chunk_params.setValue("warp:rt_tol", params.rt_tol)
+            chunk_params.setValue("warp:mz_tol", params.mz_tol)
+            chunk_params.setValue("link:rt_tol", params.rt_tol)
+            chunk_params.setValue("link:mz_tol", params.mz_tol)
+            chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
+            chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
+            chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
+
+            grouper.setParameters(chunk_params)
+            grouper.group(chunk_maps, chunk_consensus_map)
+
+            chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
+
+    else:
+        # Parallel processing
+        self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
 
-        #
-        for
+        # Prepare chunk data for parallel processing using features_df slices
+        chunk_data_list = []
+        for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
+            # Get the sample UIDs for this chunk
+            chunk_sample_uids = []
+            chunk_samples_df_rows = []
+            for j in range(len(chunk_maps)):
+                sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
+                chunk_sample_uids.append(sample_row['sample_uid'])
+                chunk_samples_df_rows.append(sample_row)
+
+            # Create a DataFrame for this chunk's samples
+            chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
+
+            # Filter features_df for this chunk's samples and select only necessary columns
+            chunk_features_df = self.features_df.filter(
+                pl.col('sample_uid').is_in(chunk_sample_uids)
+            ).select([
+                'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
+            ])
+
+            # Convert DataFrames to serializable format (lists of dicts)
+            chunk_features_data = chunk_features_df.to_dicts()
+            chunk_samples_data = chunk_samples_df.to_dicts()
+
+            chunk_data = {
+                'chunk_start_idx': chunk_start_idx,
+                'chunk_features_data': chunk_features_data,  # List of dicts instead of DataFrame
+                'chunk_samples_data': chunk_samples_data,  # List of dicts instead of DataFrame
+                'params': {
+                    'nr_partitions': params.nr_partitions,
+                    'rt_tol': params.rt_tol,
+                    'mz_tol': params.mz_tol,
+                    'min_rel_cc_size': params.min_rel_cc_size,
+                    'max_pairwise_log_fc': params.max_pairwise_log_fc,
+                    'max_nr_conflicts': params.max_nr_conflicts
+                }
+            }
+            chunk_data_list.append(chunk_data)
 
+        # Process chunks in parallel
+        with ProcessPoolExecutor(max_workers=params.threads) as executor:
+            # Submit all chunk processing tasks
+            future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
+                               for i, chunk_data in enumerate(chunk_data_list)}
+
+            # Collect results with progress tracking
+            completed_chunks = 0
+            total_chunks = len(chunk_data_list)
+            serialized_chunk_results = []
+
+            for future in as_completed(future_to_chunk):
+                chunk_idx = future_to_chunk[future]
+                try:
+                    chunk_start_idx, consensus_features = future.result()
+                    serialized_chunk_results.append((chunk_start_idx, consensus_features))
+                    completed_chunks += 1
+                    n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
+                    self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
+                except Exception as exc:
+                    self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
+                    raise exc
 
-        #
+        # Store serialized results for _merge_chunk_results to handle directly
+        chunk_consensus_maps = []
+        for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
+            # Store serialized data directly for _merge_chunk_results to handle
+            chunk_consensus_maps.append((chunk_start_idx, consensus_features))
+
+    # Merge chunk results with proper cross-chunk consensus building
+    # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
+    _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
+
+    # Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
+    consensus_map = oms.ConsensusMap()
+    return consensus_map
+
+
+def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
+    """QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
+
+    n_samples = len(self.features_maps)
+    if n_samples <= params.chunk_size:
+        self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
+        consensus_map = _merge_qt(self, params)
+        # Extract consensus features to populate consensus_df for chunked method consistency
+        self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
+        return consensus_map
+
+    # Process in chunks
+    chunks = []
+    for i in range(0, n_samples, params.chunk_size):
+        chunk_end = min(i + params.chunk_size, n_samples)
+        chunks.append((i, self.features_maps[i:chunk_end]))
+
+    self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
+
+    # Process each chunk to create chunk consensus maps
+    chunk_consensus_maps = []
+
+    if params.threads is None:
+        # Sequential processing (original behavior)
+        for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}QT Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
+            chunk_consensus_map = oms.ConsensusMap()
+
+            # Set up file descriptions for chunk
+            file_descriptions = chunk_consensus_map.getColumnHeaders()
+            for j, feature_map in enumerate(chunk_maps):
+                file_description = file_descriptions.get(j, oms.ColumnHeader())
+                file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
+                file_description.size = feature_map.size()
+                file_description.unique_id = feature_map.getUniqueId()
+                file_descriptions[j] = file_description
+
+            chunk_consensus_map.setColumnHeaders(file_descriptions)
+
+            # Use QT algorithm for chunk (main difference from KD chunked)
+            grouper = oms.FeatureGroupingAlgorithmQT()
+            chunk_params = grouper.getParameters()
+            chunk_params.setValue("distance_RT:max_difference", params.rt_tol)
+            chunk_params.setValue("distance_MZ:max_difference", params.mz_tol)
+            chunk_params.setValue("distance_MZ:unit", "Da")
+            chunk_params.setValue("ignore_charge", "true")
+            chunk_params.setValue("nr_partitions", params.nr_partitions)
+
+            grouper.setParameters(chunk_params)
+            grouper.group(chunk_maps, chunk_consensus_map)
+
+            chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
+
+    else:
+        # Parallel processing
+        self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
 
+        # Prepare chunk data for parallel processing using features_df slices
+        chunk_data_list = []
+        for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
+            # Get the sample UIDs for this chunk
+            chunk_sample_uids = []
+            chunk_samples_df_rows = []
+            for j in range(len(chunk_maps)):
+                sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
+                chunk_sample_uids.append(sample_row['sample_uid'])
+                chunk_samples_df_rows.append(sample_row)
+
+            # Create a DataFrame for this chunk's samples
+            chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
+
+            # Filter features_df for this chunk's samples and select only necessary columns
+            chunk_features_df = self.features_df.filter(
+                pl.col('sample_uid').is_in(chunk_sample_uids)
+            ).select([
+                'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
+            ])
+
+            # Convert DataFrames to serializable format (lists of dicts)
+            chunk_features_data = chunk_features_df.to_dicts()
+            chunk_samples_data = chunk_samples_df.to_dicts()
+
+            chunk_data = {
+                'chunk_start_idx': chunk_start_idx,
+                'chunk_features_data': chunk_features_data,  # List of dicts instead of DataFrame
+                'chunk_samples_data': chunk_samples_data,  # List of dicts instead of DataFrame
+                'params': {
+                    'nr_partitions': params.nr_partitions,
+                    'rt_tol': params.rt_tol,
+                    'mz_tol': params.mz_tol,
+                }
+            }
+            chunk_data_list.append(chunk_data)
 
+        # Process chunks in parallel
+        with ProcessPoolExecutor(max_workers=params.threads) as executor:
+            # Submit all chunk processing tasks
+            future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
+                               for i, chunk_data in enumerate(chunk_data_list)}
+
+            # Collect results with progress tracking
+            completed_chunks = 0
+            total_chunks = len(chunk_data_list)
+            serialized_chunk_results = []
+
+            for future in as_completed(future_to_chunk):
+                chunk_idx = future_to_chunk[future]
+                try:
+                    chunk_start_idx, consensus_features = future.result()
+                    serialized_chunk_results.append((chunk_start_idx, consensus_features))
+                    completed_chunks += 1
+                    n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
+                    self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
+                except Exception as exc:
+                    self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
+                    raise exc
+
+        # Store serialized results for _merge_chunk_results to handle directly
+        chunk_consensus_maps = []
+        for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
+            # Store serialized data directly for _merge_chunk_results to handle
+            chunk_consensus_maps.append((chunk_start_idx, consensus_features))
+
+    # Merge chunk results with proper cross-chunk consensus building
+    # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
     _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
 
-    #
+    # Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
     consensus_map = oms.ConsensusMap()
     return consensus_map
 
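Both chunked variants above derive the per-chunk sample ranges purely from chunk_size before choosing the sequential or parallel path. A small standalone sketch of that bookkeeping (illustrative helper name, not masster's API):

# Sketch of how per-chunk sample ranges follow from n_samples and chunk_size.
def make_chunks(n_samples, chunk_size):
    chunks = []
    for start in range(0, n_samples, chunk_size):
        end = min(start + chunk_size, n_samples)
        chunks.append((start, end))          # samples [start, end) form one chunk
    return chunks

print(make_chunks(1200, 500))                # -> [(0, 500), (500, 1000), (1000, 1200)]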
@@ -859,61 +1348,128 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
     all_chunk_consensus = []
     consensus_id_counter = 0
 
-    for chunk_idx, (chunk_start_idx,
+    for chunk_idx, (chunk_start_idx, chunk_data) in enumerate(chunk_consensus_maps):
+        # Handle both ConsensusMap objects (sequential) and serialized data (parallel)
+        if isinstance(chunk_data, list):
+            # Parallel processing: chunk_data is a list of serialized consensus feature dictionaries
+            consensus_features_data = chunk_data
+        else:
+            # Sequential processing: chunk_data is a ConsensusMap object
+            chunk_consensus_map = chunk_data
+            consensus_features_data = []
+
+            # Extract data from ConsensusMap and convert to serialized format
+            for consensus_feature in chunk_consensus_map:
+                # Extract feature_uids from this consensus feature
+                feature_uids = []
+                feature_data_list = []
+                sample_uids = []
+
+                for feature_handle in consensus_feature.getFeatureList():
+                    fuid = str(feature_handle.getUniqueId())
+                    if fuid not in feature_uid_map:
+                        continue
+
+                    feature_uid = feature_uid_map[fuid]
+                    feature_data = features_lookup.get(feature_uid)
+                    if feature_data:
+                        feature_uids.append(feature_uid)
+                        feature_data_list.append(feature_data)
+                        sample_uids.append(chunk_start_idx + feature_handle.getMapIndex() + 1)
+
+                if not feature_data_list:
+                    # No retrievable feature metadata (possible stale map reference) -> skip
+                    continue
+
+                # Convert ConsensusFeature to serialized format
+                consensus_feature_data = {
+                    'rt': consensus_feature.getRT(),
+                    'mz': consensus_feature.getMZ(),
+                    'intensity': consensus_feature.getIntensity(),
+                    'quality': consensus_feature.getQuality(),
+                    'feature_uids': feature_uids,
+                    'feature_data_list': feature_data_list,
+                    'sample_uids': sample_uids
+                }
+                consensus_features_data.append(consensus_feature_data)
+
+        # Process the consensus features (now all in serialized format)
+        for consensus_feature_data in consensus_features_data:
             # ACCEPT ALL consensus features (size >=1) here.
             # Reason: A feature that is globally present in many samples can still
             # appear only once inside a given sample chunk. Early filtering at
             # size>=2 causes irreversible loss and underestimates the final
             # consensus count (observed ~296 vs 950 for KD). We defer filtering
             # strictly to the final global min_samples.
-
-            # Extract feature_uids from this consensus feature
-            feature_uids = []
-            feature_data_list = []
-            sample_uids = []
 
+            # For parallel processing, feature data is already extracted
+            if isinstance(chunk_data, list):
+                # Extract feature_uids and data from serialized format for parallel processing
+                feature_uids = []
+                feature_data_list = []
+                sample_uids = []
+
+                for handle_data in consensus_feature_data['features']:
+                    fuid = str(handle_data['unique_id'])
+                    if fuid not in feature_uid_map:
+                        continue
+
+                    feature_uid = feature_uid_map[fuid]
+                    feature_data = features_lookup.get(feature_uid)
+                    if feature_data:
+                        feature_uids.append(feature_uid)
+                        feature_data_list.append(feature_data)
+                        sample_uids.append(chunk_start_idx + handle_data['map_index'] + 1)
+
+                if not feature_data_list:
                     continue
 
+                # Get RT/MZ from consensus feature data
+                consensus_rt = consensus_feature_data['rt']
+                consensus_mz = consensus_feature_data['mz']
+                consensus_intensity = consensus_feature_data['intensity']
+                consensus_quality = consensus_feature_data['quality']
+            else:
+                # Sequential processing: data is already extracted above
+                feature_uids = consensus_feature_data['feature_uids']
+                feature_data_list = consensus_feature_data['feature_data_list']
+                sample_uids = consensus_feature_data['sample_uids']
+                consensus_rt = consensus_feature_data['rt']
+                consensus_mz = consensus_feature_data['mz']
+                consensus_intensity = consensus_feature_data['intensity']
+                consensus_quality = consensus_feature_data['quality']
 
             if not feature_data_list:
                 # No retrievable feature metadata (possible stale map reference) -> skip
-            continue
+                continue
+
+            # Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
             rt_vals_local = [fd.get("rt") for fd in feature_data_list if fd.get("rt") is not None]
             mz_vals_local = [fd.get("mz") for fd in feature_data_list if fd.get("mz") is not None]
             if rt_vals_local:
                 rt_min_local = min(rt_vals_local)
                 rt_max_local = max(rt_vals_local)
             else:
-                rt_min_local = rt_max_local =
+                rt_min_local = rt_max_local = consensus_rt
             if mz_vals_local:
                 mz_min_local = min(mz_vals_local)
                 mz_max_local = max(mz_vals_local)
             else:
-                mz_min_local = mz_max_local =
+                mz_min_local = mz_max_local = consensus_mz
 
             # Store chunk consensus with feature tracking
             chunk_consensus_data = {
                 'consensus_id': consensus_id_counter,
                 'chunk_idx': chunk_idx,
                 'chunk_start_idx': chunk_start_idx,
-                'mz':
-                'rt':
+                'mz': consensus_mz,
+                'rt': consensus_rt,
                 'mz_min': mz_min_local,
                 'mz_max': mz_max_local,
                 'rt_min': rt_min_local,
                 'rt_max': rt_max_local,
-                'intensity':
-                'quality':
+                'intensity': consensus_intensity,
+                'quality': consensus_quality,
                 'feature_uids': feature_uids,
                 'feature_data_list': feature_data_list,
                 'sample_uids': sample_uids,
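During cross-chunk stitching each chunk-level consensus records the RT and m/z span of its member features, falling back to the consensus centroid when no per-feature values are available; those spans are what the later clustering compares against the tolerances. A minimal sketch of that range derivation, assuming plain feature dictionaries (the helper name is hypothetical):

# Sketch: derive the RT/mz span of a chunk consensus from its member features,
# falling back to the consensus centroid when no per-feature values are present.
def feature_ranges(feature_data_list, consensus_rt, consensus_mz):
    rt_vals = [fd.get("rt") for fd in feature_data_list if fd.get("rt") is not None]
    mz_vals = [fd.get("mz") for fd in feature_data_list if fd.get("mz") is not None]
    rt_min, rt_max = (min(rt_vals), max(rt_vals)) if rt_vals else (consensus_rt, consensus_rt)
    mz_min, mz_max = (min(mz_vals), max(mz_vals)) if mz_vals else (consensus_mz, consensus_mz)
    return rt_min, rt_max, mz_min, mz_max

print(feature_ranges([{"rt": 100.2, "mz": 250.08}, {"rt": 101.0, "mz": 250.10}], 100.6, 250.09))
# -> (100.2, 101.0, 250.08, 250.1)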
@@ -1411,9 +1967,6 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
     return list(groups_by_root.values())
 
 
-# Note: Restored proper chunked implementation with cross-chunk consensus clustering
-
-
 def _reset_consensus_data(self):
     """Reset consensus-related DataFrames at the start of merge."""
     self.consensus_df = pl.DataFrame()
masster/study/processing.py CHANGED

@@ -97,7 +97,6 @@ def align(self, **kwargs):
             _align_kd_algorithm(self, fmaps, params)
         else:
             self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
-            self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
 
     # check if rt_original exists in features_df, if not, add it after rt
     if "rt_original" not in self.features_df.columns:
{masster-0.4.17.dist-info → masster-0.4.19.dist-info}/RECORD CHANGED

@@ -1,5 +1,5 @@
 masster/__init__.py,sha256=HHjKhCjkAc98LhoQfu4C6L-W2vfTEc1iXaPTxxcl_4A,800
-masster/_version.py,sha256=
+masster/_version.py,sha256=Kro6JvBTMqNf6tOgI2r5d4TbaZIIR85ax7tdT3uQKL8,257
 masster/chromatogram.py,sha256=iYpdv8C17zVnlWvOFgAn9ns2uFGiF-GgoYf5QVVAbHs,19319
 masster/logger.py,sha256=W50V_uh8RSYwGxDrDFhOuj5jpu2tKJyt_16lMw9kQwA,14755
 masster/spectrum.py,sha256=_upC_g2N9gwTaflXAugs9pSXpKUmzbIehofDordk7WI,47718
@@ -43,10 +43,10 @@ masster/study/h5.py,sha256=LiVGUAtULyPpZIUmKVJSaV38huJb8FsKOUWBOqiv0QU,82363
 masster/study/helpers.py,sha256=M5_q8O5tuFchKPW04PTuj3X335lDA2VZqcs4D8ZQJEk,158604
 masster/study/id.py,sha256=6NUBBKZCFOU1wlDKM0eXQeOIStSZCRNJ_3x7ZaIHzmM,55263
 masster/study/load.py,sha256=CQQY_7BzagE3oQTdDlqNyfuMdVWIAft-M4a2WCFnxp0,70695
-masster/study/merge.py,sha256
+masster/study/merge.py,sha256=Xk7Zt6x0p_myjWQXuzXbXSlwXPSujWjMPowaqnEEmWQ,118778
 masster/study/parameters.py,sha256=0elaF7YspTsB7qyajWAbRNL2VfKlGz5GJLifmO8IGkk,3276
 masster/study/plot.py,sha256=SimX-IlqISEItAnTBsx4xsdYHRAevfN41cCENVns1lw,88236
-masster/study/processing.py,sha256=
+masster/study/processing.py,sha256=u1MSRKTzcqHNz_dClSUSfgTxkNRdBLXtVyO5LXuW_uk,41031
 masster/study/save.py,sha256=YCvp4xhnG16sNXaT2mFDBoCrIMub0Es61B97qLo0maw,6705
 masster/study/study.py,sha256=LO_hbJOOCZzeA3uterPKImFgPG6fCNQKMSVMtEwW3DU,38815
 masster/study/study5_schema.json,sha256=c0w24QdHak01m04I1VPu97KvF2468FcaqROhf6pmLk4,7507
@@ -60,7 +60,7 @@ masster/study/defaults/find_ms2_def.py,sha256=RL0DFG41wQ05U8UQKUGr3vzSl3mU0m0knQ
 masster/study/defaults/identify_def.py,sha256=96rxoCAPQj_yX-3mRoD2LTkTLJgG27eJQqwarLv5jL0,10580
 masster/study/defaults/integrate_chrom_def.py,sha256=0MNIWGTjty-Zu-NTQsIweuj3UVqEY3x1x8pK0mPwYak,7264
 masster/study/defaults/integrate_def.py,sha256=Vf4SAzdBfnsSZ3IRaF0qZvWu3gMDPHdgPfMYoPKeWv8,7246
-masster/study/defaults/merge_def.py,sha256=
+masster/study/defaults/merge_def.py,sha256=K7sfwEGfgcWU85zorbWNFaxDhqRH52pxQoKv9Jn2qhY,15030
 masster/study/defaults/study_def.py,sha256=h8dYbi9xv0sesCSQik49Z53IkskMmNtW6ixl7it5pL0,16033
 masster/wizard/README.md,sha256=mL1A3YWJZOefpJ6D0-HqGLkVRmUlOpwyVFdvJBeeoZM,14149
 masster/wizard/__init__.py,sha256=A9GHQvkq4lSRIA8V6AKB-TJy8s_npH8i1baUGdkw_is,364
@@ -68,8 +68,8 @@ masster/wizard/example.py,sha256=xEZFTH9UZ8HKOm6s3JL8Js0Uw5ChnISWBHSZCL32vsM,798
 masster/wizard/test_structure.py,sha256=h88gsYYCG6iDRjqPZC_r1H1T8y79j0E-K6OrwuHaSCU,1586
 masster/wizard/test_wizard.py,sha256=CMp1cpjH3iYYC5Fy6puF_K0kfwwk3bgOsSbUGW-t7Xk,8986
 masster/wizard/wizard.py,sha256=jMLHy4cXgNEE_-vshFmA7BNEByhfA6tV7O91jhiMYuw,48054
-masster-0.4.
-masster-0.4.
-masster-0.4.
-masster-0.4.
-masster-0.4.
+masster-0.4.19.dist-info/METADATA,sha256=fcnG14G4Fbp7mOCQ3aKL0qvkuexeUUjm79P1dDpT_Kg,44207
+masster-0.4.19.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+masster-0.4.19.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
+masster-0.4.19.dist-info/licenses/LICENSE,sha256=bx5iLIKjgAdYQ7sISn7DsfHRKkoCUm1154sJJKhgqnU,35184
+masster-0.4.19.dist-info/RECORD,,

{masster-0.4.17.dist-info → masster-0.4.19.dist-info}/WHEEL: File without changes
{masster-0.4.17.dist-info → masster-0.4.19.dist-info}/entry_points.txt: File without changes
{masster-0.4.17.dist-info → masster-0.4.19.dist-info}/licenses/LICENSE: File without changes