masster 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/_version.py +1 -1
- masster/study/defaults/merge_def.py +70 -6
- masster/study/merge.py +637 -31
- masster/study/plot.py +15 -1
- masster/study/processing.py +0 -1
- {masster-0.4.16.dist-info → masster-0.4.18.dist-info}/METADATA +3 -2
- {masster-0.4.16.dist-info → masster-0.4.18.dist-info}/RECORD +10 -10
- {masster-0.4.16.dist-info → masster-0.4.18.dist-info}/WHEEL +0 -0
- {masster-0.4.16.dist-info → masster-0.4.18.dist-info}/entry_points.txt +0 -0
- {masster-0.4.16.dist-info → masster-0.4.18.dist-info}/licenses/LICENSE +0 -0
masster/_version.py
CHANGED

masster/study/defaults/merge_def.py
CHANGED

@@ -25,24 +25,36 @@ class merge_defaults:
     link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
     """

-    method: str = "
-    min_samples: int =
+    method: str = "qt"
+    min_samples: int = 2
     rt_tol: float = 5.0
     mz_tol: float = 0.01
-    chunk_size: int =
+    chunk_size: int = 500
     nr_partitions: int = 1000
-    min_rel_cc_size: float = 0.
+    min_rel_cc_size: float = 0.1
     max_pairwise_log_fc: float = -1.0
     max_nr_conflicts: int = 0
     link_ms2: bool = True
+
+    # KD-Strict specific parameters
+    optimize_rt_tol: bool = False
+    rt_tol_range: tuple = (0.5, 4.0)
+    rt_tol_steps: int = 7
+    secondary_merge_rt_tol: float = 1.0
+    secondary_merge_mz_tol: float = 0.005
+    min_sample_overlap: float = 0.8
+    max_rt_spread: float = 2.0  # Will default to 2x rt_tol
+    min_coherence: float = 0.0

     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             "method": {
                 "dtype": str,
                 "description": "Merge method (algorithm) to use",
-                "default": "
-                "allowed_values": ["
+                "default": "quality",
+                "allowed_values": ["sensitivity", "qt", "nowarp", "chunked", "quality",
+                                   "kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict",
+                                   "kd_chunked", "kd-chunked", "qt_chunked", "qt-chunked"],
             },
             "min_samples": {
                 "dtype": int,
@@ -103,6 +115,58 @@ class merge_defaults:
                 "description": "Whether to link MS2 spectra to consensus features",
                 "default": True,
             },
+            # KD-Strict specific parameters
+            "optimize_rt_tol": {
+                "dtype": bool,
+                "description": "Enable RT tolerance optimization for kd-strict method",
+                "default": False,
+            },
+            "rt_tol_range": {
+                "dtype": tuple,
+                "description": "RT tolerance range for optimization (min, max) in seconds",
+                "default": (0.8, 2.0),
+            },
+            "rt_tol_steps": {
+                "dtype": int,
+                "description": "Number of steps for RT tolerance optimization",
+                "default": 5,
+                "min_value": 3,
+                "max_value": 20,
+            },
+            "secondary_merge_rt_tol": {
+                "dtype": float,
+                "description": "RT tolerance for secondary clustering in kd-strict (seconds)",
+                "default": 0.5,
+                "min_value": 0.1,
+                "max_value": 5.0,
+            },
+            "secondary_merge_mz_tol": {
+                "dtype": float,
+                "description": "m/z tolerance for secondary clustering in kd-strict (Da)",
+                "default": 0.005,
+                "min_value": 0.001,
+                "max_value": 0.1,
+            },
+            "min_sample_overlap": {
+                "dtype": float,
+                "description": "Minimum sample overlap ratio for merging features (0.0-1.0)",
+                "default": 0.8,
+                "min_value": 0.0,
+                "max_value": 1.0,
+            },
+            "max_rt_spread": {
+                "dtype": float,
+                "description": "Maximum allowed RT spread in seconds (None = 3x rt_tol)",
+                "default": None,
+                "min_value": 0.1,
+            },
+            "min_coherence": {
+                "dtype": float,
+                "description": "Minimum chromatographic coherence score (0.0 = disabled)",
+                "default": 0.0,
+                "min_value": 0.0,
+                "max_value": 1.0,
+            },
         },
         repr=False,
     )
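The new dataclass fields above are ordinary keyword arguments to the Study merge step. A minimal sketch of how the quality method and its post-processing knobs might be invoked, assuming an existing Study object named `study` whose samples are already loaded and aligned (parameter names taken from the diff; the call sequence is illustrative, not prescriptive):

# 'quality' is the new recommended default; the kd-strict knobs are optional
study.merge(
    method="quality",
    rt_tol=5.0,
    mz_tol=0.01,
    optimize_rt_tol=True,            # scan rt_tol_range for the least oversegmented result
    rt_tol_range=(0.5, 4.0),
    secondary_merge_rt_tol=1.0,      # second clustering pass for near-duplicate features
    min_sample_overlap=0.8,
    max_rt_spread=2.0,               # features spreading wider than this are filtered
)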
masster/study/merge.py
CHANGED
@@ -1,6 +1,6 @@
 """
 Unified merge module for the Study class.
-Supports multiple merge methods: 'kd', 'qt', 'kd-nowarp', '
+Supports multiple merge methods: 'kd', 'qt', 'kd-nowarp', 'kd_chunked', 'qt_chunked'
 """

 import time
@@ -24,8 +24,8 @@ def merge(self, **kwargs) -> None:
     ----------
     **kwargs : dict
         Parameters from merge_defaults class:
-        - method : str, default '
-            Merge algorithm: '
+        - method : str, default 'quality'
+            Merge algorithm: 'sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality'
         - min_samples : int, default 10
             Minimum number of samples for consensus feature
         - rt_tol : float, default 2.0
@@ -47,12 +47,16 @@ def merge(self, **kwargs) -> None:

     Algorithm Guidelines
     -------------------
-    -
+    - Quality: KD with post-processing quality control to reduce oversegmentation (RECOMMENDED DEFAULT)
+      Includes RT tolerance optimization, secondary clustering, and quality filtering
+    - Sensitivity: Best raw sensitivity, O(n log n), maximum feature detection
     - QT: Thorough but slow O(n²), good for <1000 samples
-    -
-    - Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
+    - NoWarp: Memory efficient KD without RT warping for large datasets
+    - KD-Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
       Uses optimized partitioning for better memory management while maintaining
       full cross-sample consensus feature detection.
+    - QT-Chunked: Memory-optimized QT algorithm for very large datasets (>5000 samples)
+      Uses QT clustering in first stage with optimized cross-chunk consensus building.
     """
     start_time = time.time()

@@ -67,9 +71,29 @@ def merge(self, **kwargs) -> None:
     else:
         self.logger.warning(f"Unknown parameter '{key}' ignored")

+    # Backward compatibility: Map old method names to new names
+    method_mapping = {
+        'kd': 'sensitivity',
+        'kd-nowarp': 'nowarp',
+        'kd_nowarp': 'nowarp',
+        'kd-strict': 'quality',
+        'kd_strict': 'quality',
+        'kdstrict': 'quality',
+        'chunked': 'kd_chunked',  # Map old 'chunked' to 'kd_chunked'
+        'qtchunked': 'qt_chunked',  # QT chunked variants
+        'qt-chunked': 'qt_chunked',
+        'kdchunked': 'kd_chunked',  # KD chunked variants
+        'kd-chunked': 'kd_chunked'
+    }
+
+    if params.method in method_mapping:
+        old_method = params.method
+        params.method = method_mapping[old_method]
+        self.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
+
     # Validate method
-    if params.method not in ['
-        raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['
+    if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
+        raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")

     # Persist last used params for diagnostics
     try:
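The backward-compatibility table above is a plain dictionary lookup followed by validation against the new canonical names. A small standalone sketch (not the package's own code) of the same normalization, handy for checking which canonical method an old name resolves to:

method_mapping = {
    'kd': 'sensitivity', 'kd-nowarp': 'nowarp', 'kd_nowarp': 'nowarp',
    'kd-strict': 'quality', 'kd_strict': 'quality', 'kdstrict': 'quality',
    'chunked': 'kd_chunked', 'qtchunked': 'qt_chunked', 'qt-chunked': 'qt_chunked',
    'kdchunked': 'kd_chunked', 'kd-chunked': 'kd_chunked',
}

def normalize_method(name: str) -> str:
    # Old names map to their replacement; anything else must be a canonical name.
    canonical = ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']
    name = method_mapping.get(name, name)
    if name not in canonical:
        raise ValueError(f"Invalid method '{name}'. Must be one of: {canonical}")
    return name

print(normalize_method('kd-strict'))  # -> 'quality'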
@@ -77,6 +101,15 @@ def merge(self, **kwargs) -> None:
     except Exception:
         self._merge_params_last = {}

+    # Store merge parameters in history
+    try:
+        if hasattr(self, 'store_history'):
+            self.store_history(['merge'], params.to_dict())
+        else:
+            self.logger.warning("History storage not available - parameters not saved to history")
+    except Exception as e:
+        self.logger.warning(f"Failed to store merge parameters in history: {e}")
+
     # Ensure feature maps are available for merging (regenerate if needed)
     if len(self.features_maps) < len(self.samples_df):
         self.features_maps = []
@@ -106,7 +139,7 @@ def merge(self, **kwargs) -> None:
     cached_valid_adducts.add("?")

     # Route to algorithm implementation
-    if params.method == '
+    if params.method == 'sensitivity':
         consensus_map = _merge_kd(self, params)
         # Extract consensus features
         self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
@@ -114,13 +147,19 @@ def merge(self, **kwargs) -> None:
         consensus_map = _merge_qt(self, params)
         # Extract consensus features
         self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
-    elif params.method == '
+    elif params.method == 'nowarp':
         consensus_map = _merge_kd_nowarp(self, params)
         # Extract consensus features
         self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
-    elif params.method == '
-        consensus_map =
-        # Note:
+    elif params.method == 'quality':
+        consensus_map = _merge_kd_strict(self, params)
+        # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
+    elif params.method == 'kd_chunked':
+        consensus_map = _merge_kd_chunked(self, params, cached_adducts_df, cached_valid_adducts)
+        # Note: _merge_kd_chunked populates consensus_df directly, no need to extract
+    elif params.method == 'qt_chunked':
+        consensus_map = _merge_qt_chunked(self, params, cached_adducts_df, cached_valid_adducts)
+        # Note: _merge_qt_chunked populates consensus_df directly, no need to extract

     # Perform adduct grouping
     self._perform_adduct_grouping(params.rt_tol, params.mz_tol)
@@ -160,9 +199,9 @@ def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
     params_oms.setValue("warp:mz_tol", params.mz_tol)
     params_oms.setValue("link:rt_tol", params.rt_tol)
     params_oms.setValue("link:mz_tol", params.mz_tol)
-    params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
-    params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
-    params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
+    #params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
+    #params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
+    #params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
     #params_oms.setValue("link:charge_merging", "With_charge_zero") THIS LEADS TO A CRASH

     grouper.setParameters(params_oms)
@@ -198,9 +237,9 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
     params_oms.setValue("distance_MZ:max_difference", params.mz_tol)
     params_oms.setValue("distance_MZ:unit", "Da")  # QT now uses Da like all other methods
     params_oms.setValue("ignore_charge", "true")
-    params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
-    params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
-    params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
+    #params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
+    #params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
+    #params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
     params_oms.setValue("nr_partitions", params.nr_partitions)

     grouper.setParameters(params_oms)
@@ -209,6 +248,496 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
     return consensus_map


+def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
+    """
+    Quality merge: Standard KD algorithm with post-processing quality control.
+
+    This method combines the sensitivity of KD clustering with post-processing steps
+    to reduce oversegmentation while maintaining high-quality consensus features.
+    This is the recommended default method.
+
+    Post-processing features:
+    1. RT tolerance optimization (optional)
+    2. Secondary clustering for close features
+    3. Sample overlap validation
+    4. RT spread quality filtering
+    5. Chromatographic coherence validation
+
+    Additional parameters supported in params:
+    - optimize_rt_tol: bool - Enable RT tolerance optimization
+    - rt_tol_range: tuple - RT tolerance range for optimization (min, max)
+    - secondary_merge_rt_tol: float - Secondary merge RT tolerance (default: 0.5s)
+    - secondary_merge_mz_tol: float - Secondary merge m/z tolerance (default: 0.005)
+    - min_sample_overlap: float - Minimum sample overlap for merging (0.0-1.0, default: 0.8)
+    - max_rt_spread: float - Maximum RT spread allowed (default: 2x rt_tol)
+    - min_coherence: float - Minimum chromatographic coherence (default: 0.0, disabled)
+    """
+
+    # Check for RT tolerance optimization
+    optimize_rt_tol = getattr(params, 'optimize_rt_tol', False)
+
+    if optimize_rt_tol:
+        # Optimize RT tolerance first
+        optimal_rt_tol = _optimize_rt_tolerance(self, params)
+        self.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
+        # Create modified params with optimal RT tolerance
+        import copy
+        optimized_params = copy.deepcopy(params)
+        optimized_params.rt_tol = optimal_rt_tol
+    else:
+        optimized_params = params
+
+    # Phase 1: Standard KD clustering
+    self.logger.info("Initial KD clustering")
+    consensus_map = _merge_kd(self, optimized_params)
+
+    # Phase 2: Post-processing quality control
+    self.logger.info("Post-processing quality control")
+    consensus_map = _apply_kd_strict_postprocessing(self, consensus_map, optimized_params)
+
+    return consensus_map
+
+
+def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
+    """
+    Optimize RT tolerance by testing different values and measuring oversegmentation.
+
+    Args:
+        self: Study object
+        params: Merge parameters
+
+    Returns:
+        Optimal RT tolerance value
+    """
+    rt_tol_range = getattr(params, 'rt_tol_range', (0.8, 2.0))
+    rt_tol_steps = getattr(params, 'rt_tol_steps', 5)
+
+    self.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")
+
+    # Generate test values
+    test_rt_tols = [rt_tol_range[0] + i * (rt_tol_range[1] - rt_tol_range[0]) / (rt_tol_steps - 1)
+                    for i in range(rt_tol_steps)]
+
+    best_rt_tol = params.rt_tol
+    best_score = float('inf')
+
+    # Store original features for restoration
+    original_consensus_df = getattr(self, 'consensus_df', pl.DataFrame())
+    original_consensus_mapping_df = getattr(self, 'consensus_mapping_df', pl.DataFrame())
+
+    for test_rt_tol in test_rt_tols:
+        try:
+            # Create test parameters
+            import copy
+            test_params = copy.deepcopy(params)
+            test_params.rt_tol = test_rt_tol
+
+            # Run KD merge with test parameters
+            test_consensus_map = _merge_kd(self, test_params)
+
+            # Extract consensus features temporarily for analysis
+            self._extract_consensus_features(test_consensus_map, test_params.min_samples)
+
+            if len(self.consensus_df) == 0:
+                continue
+
+            # Calculate oversegmentation metrics
+            oversegmentation_score = _calculate_oversegmentation_score(self, test_rt_tol)
+
+            self.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(self.consensus_df)} features, score: {oversegmentation_score:.3f}")
+
+            # Lower score is better (less oversegmentation)
+            if oversegmentation_score < best_score:
+                best_score = oversegmentation_score
+                best_rt_tol = test_rt_tol
+
+        except Exception as e:
+            self.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
+            continue
+
+    # Restore original consensus data
+    self.consensus_df = original_consensus_df
+    self.consensus_mapping_df = original_consensus_mapping_df
+
+    self.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
+    return best_rt_tol
+
+
+def _calculate_oversegmentation_score(self, rt_tol: float) -> float:
+    """
+    Calculate oversegmentation score based on feature density and RT spread metrics.
+    Lower scores indicate less oversegmentation.
+
+    Args:
+        self: Study object
+        rt_tol: RT tolerance used
+
+    Returns:
+        Oversegmentation score (lower = better)
+    """
+    if len(self.consensus_df) == 0:
+        return float('inf')
+
+    # Metric 1: Feature density (features per RT second)
+    rt_range = self.consensus_df['rt'].max() - self.consensus_df['rt'].min()
+    if rt_range <= 0:
+        return float('inf')
+
+    feature_density = len(self.consensus_df) / rt_range
+
+    # Metric 2: Average RT spread relative to tolerance
+    rt_spreads = (self.consensus_df['rt_max'] - self.consensus_df['rt_min'])
+    avg_rt_spread_ratio = rt_spreads.mean() / rt_tol if rt_tol > 0 else float('inf')
+
+    # Metric 3: Proportion of features with low sample counts (indicates fragmentation)
+    low_sample_features = len(self.consensus_df.filter(pl.col('number_samples') <= 5))
+    low_sample_ratio = low_sample_features / len(self.consensus_df)
+
+    # Metric 4: Number of features with excessive RT spread
+    excessive_spread_features = len(rt_spreads.filter(rt_spreads > rt_tol * 2))
+    excessive_spread_ratio = excessive_spread_features / len(self.consensus_df)
+
+    # Combined score (weighted combination)
+    oversegmentation_score = (
+        0.4 * (feature_density / 10.0) +  # Normalize to reasonable scale
+        0.3 * avg_rt_spread_ratio +
+        0.2 * low_sample_ratio +
+        0.1 * excessive_spread_ratio
+    )
+
+    return oversegmentation_score
+
+
+def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
+    """
+    Apply post-processing quality control to KD consensus map.
+
+    Args:
+        consensus_map: Initial consensus map from KD
+        params: Merge parameters with kd-strict options
+
+    Returns:
+        Processed consensus map with reduced oversegmentation
+    """
+    if consensus_map.size() == 0:
+        self.logger.warning("Empty consensus map provided to post-processing")
+        return consensus_map
+
+    self.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")
+
+    # Step 1: Extract initial consensus features
+    original_min_samples = params.min_samples
+    params.min_samples = 1  # Extract all features initially
+
+    self._extract_consensus_features(consensus_map, params.min_samples)
+    initial_feature_count = len(self.consensus_df)
+
+    if initial_feature_count == 0:
+        self.logger.warning("No consensus features extracted for post-processing")
+        params.min_samples = original_min_samples
+        return consensus_map
+
+    # Step 2: Secondary clustering for close features
+    secondary_merge_rt_tol = getattr(params, 'secondary_merge_rt_tol', 0.5)
+    secondary_merge_mz_tol = getattr(params, 'secondary_merge_mz_tol', 0.005)
+
+    self.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
+    merged_features = _perform_secondary_clustering(self, secondary_merge_rt_tol, secondary_merge_mz_tol)
+
+    # Step 3: Sample overlap validation
+    min_sample_overlap = getattr(params, 'min_sample_overlap', 0.8)
+    if min_sample_overlap > 0:
+        self.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
+        merged_features = _validate_sample_overlap(self, merged_features, min_sample_overlap)
+
+    # Step 4: RT spread quality filtering
+    if params.rt_tol is not None:
+        max_rt_spread = getattr(params, 'max_rt_spread', params.rt_tol * 2)
+        if max_rt_spread is not None:
+            self.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
+            merged_features = _filter_rt_spread(self, merged_features, max_rt_spread)
+        else:
+            self.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
+    else:
+        self.logger.debug("Skipping RT spread filtering - rt_tol is None")
+
+    # Step 5: Chromatographic coherence filtering (optional)
+    min_coherence = getattr(params, 'min_coherence', 0.0)
+    if min_coherence > 0:
+        self.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
+        merged_features = _filter_coherence(self, merged_features, min_coherence)
+
+    # Step 6: Rebuild consensus_df with filtered features and preserve mapping
+    original_mapping_df = self.consensus_mapping_df.clone()  # Save original mapping
+    self.consensus_df = pl.DataFrame(merged_features, strict=False)
+
+    # Step 7: Apply original min_samples filter
+    params.min_samples = original_min_samples
+    if params.min_samples > 1:
+        l1 = len(self.consensus_df)
+        self.consensus_df = self.consensus_df.filter(
+            pl.col("number_samples") >= params.min_samples
+        )
+        filtered_count = l1 - len(self.consensus_df)
+        if filtered_count > 0:
+            self.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")
+
+    # Step 8: Update consensus_mapping_df to match final consensus_df
+    if len(self.consensus_df) > 0 and len(original_mapping_df) > 0:
+        valid_consensus_ids = set(self.consensus_df['consensus_uid'].to_list())
+        self.consensus_mapping_df = original_mapping_df.filter(
+            pl.col('consensus_uid').is_in(list(valid_consensus_ids))
+        )
+    else:
+        self.consensus_mapping_df = pl.DataFrame()
+
+    final_feature_count = len(self.consensus_df)
+    reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0
+
+    self.logger.info(f"Post-processing complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
+
+    # Create a new consensus map for compatibility (the processed data is in consensus_df)
+    processed_consensus_map = oms.ConsensusMap()
+    return processed_consensus_map
+
+
+def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
+    """
+    Perform secondary clustering to merge very close features.
+
+    Args:
+        rt_tol: RT tolerance for secondary clustering
+        mz_tol: m/z tolerance for secondary clustering
+
+    Returns:
+        List of merged consensus feature dictionaries
+    """
+    if len(self.consensus_df) == 0:
+        return []
+
+    # Convert consensus_df to list of dictionaries for clustering
+    consensus_features = []
+    for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
+        consensus_features.append(dict(row))
+
+    # Use Union-Find for efficient clustering
+    class UnionFind:
+        def __init__(self, n):
+            self.parent = list(range(n))
+            self.rank = [0] * n
+
+        def find(self, x):
+            if self.parent[x] != x:
+                self.parent[x] = self.find(self.parent[x])
+            return self.parent[x]
+
+        def union(self, x, y):
+            px, py = self.find(x), self.find(y)
+            if px == py:
+                return
+            if self.rank[px] < self.rank[py]:
+                px, py = py, px
+            self.parent[py] = px
+            if self.rank[px] == self.rank[py]:
+                self.rank[px] += 1
+
+    n_features = len(consensus_features)
+    uf = UnionFind(n_features)
+
+    # Find features to merge based on proximity
+    merge_count = 0
+    for i in range(n_features):
+        for j in range(i + 1, n_features):
+            feat_i = consensus_features[i]
+            feat_j = consensus_features[j]
+
+            rt_diff = abs(feat_i['rt'] - feat_j['rt'])
+            mz_diff = abs(feat_i['mz'] - feat_j['mz'])
+
+            if rt_diff <= rt_tol and mz_diff <= mz_tol:
+                uf.union(i, j)
+                merge_count += 1
+
+    # Group features by their root
+    groups_by_root = defaultdict(list)
+    for i in range(n_features):
+        root = uf.find(i)
+        groups_by_root[root].append(consensus_features[i])
+
+    # Merge features within each group
+    merged_features = []
+    for group in groups_by_root.values():
+        if len(group) == 1:
+            # Single feature - keep as is
+            merged_features.append(group[0])
+        else:
+            # Multiple features - merge them
+            merged_feature = _merge_feature_group(group)
+            merged_features.append(merged_feature)
+
+    self.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
+    return merged_features
+
+
+def _merge_feature_group(feature_group: list) -> dict:
+    """
+    Merge a group of similar consensus features into one.
+
+    Args:
+        feature_group: List of consensus feature dictionaries to merge
+
+    Returns:
+        Merged consensus feature dictionary
+    """
+    if not feature_group:
+        return {}
+
+    if len(feature_group) == 1:
+        return feature_group[0]
+
+    # Use the feature with highest sample count as base
+    base_feature = max(feature_group, key=lambda f: f.get('number_samples', 0))
+    merged = base_feature.copy()
+
+    # Aggregate numeric statistics
+    rt_values = [f['rt'] for f in feature_group if f.get('rt') is not None]
+    mz_values = [f['mz'] for f in feature_group if f.get('mz') is not None]
+    sample_counts = [f.get('number_samples', 0) for f in feature_group]
+    intensities = [f.get('inty_mean', 0) for f in feature_group if f.get('inty_mean') is not None]
+
+    # Update merged feature statistics
+    if rt_values:
+        merged['rt'] = float(np.mean(rt_values))
+        merged['rt_min'] = min([f.get('rt_min', f['rt']) for f in feature_group])
+        merged['rt_max'] = max([f.get('rt_max', f['rt']) for f in feature_group])
+        merged['rt_mean'] = float(np.mean(rt_values))
+
+    if mz_values:
+        merged['mz'] = float(np.mean(mz_values))
+        merged['mz_min'] = min([f.get('mz_min', f['mz']) for f in feature_group])
+        merged['mz_max'] = max([f.get('mz_max', f['mz']) for f in feature_group])
+        merged['mz_mean'] = float(np.mean(mz_values))
+
+    # Use maximum sample count (features might be detected in overlapping but different samples)
+    merged['number_samples'] = max(sample_counts)
+
+    # Use weighted average intensity (by sample count)
+    if intensities and sample_counts:
+        total_weight = sum(sample_counts)
+        if total_weight > 0:
+            weighted_intensity = sum(inty * count for inty, count in zip(intensities, sample_counts)) / total_weight
+            merged['inty_mean'] = float(weighted_intensity)
+
+    # Aggregate chromatographic quality metrics if available
+    coherence_values = [f.get('chrom_coherence_mean', 0) for f in feature_group if f.get('chrom_coherence_mean') is not None]
+    prominence_values = [f.get('chrom_prominence_mean', 0) for f in feature_group if f.get('chrom_prominence_mean') is not None]
+
+    if coherence_values:
+        merged['chrom_coherence_mean'] = float(np.mean(coherence_values))
+    if prominence_values:
+        merged['chrom_prominence_mean'] = float(np.mean(prominence_values))
+
+    # Merge MS2 counts
+    ms2_counts = [f.get('number_ms2', 0) for f in feature_group]
+    merged['number_ms2'] = sum(ms2_counts)
+
+    # Keep the best quality score
+    quality_scores = [f.get('quality', 1.0) for f in feature_group if f.get('quality') is not None]
+    if quality_scores:
+        merged['quality'] = max(quality_scores)
+
+    return merged
+
+
+def _validate_sample_overlap(self, features: list, min_overlap: float) -> list:
+    """
+    Validate that merged features have sufficient sample overlap.
+
+    Args:
+        features: List of consensus feature dictionaries
+        min_overlap: Minimum sample overlap ratio (0.0-1.0)
+
+    Returns:
+        List of validated features
+    """
+    # This is a placeholder for sample overlap validation
+    # Implementation would require access to which samples each feature appears in
+    # For now, we'll use a simple heuristic based on feature statistics
+
+    validated_features = []
+    for feature in features:
+        # Simple validation based on RT spread and sample count ratio
+        rt_spread = feature.get('rt_max', feature['rt']) - feature.get('rt_min', feature['rt'])
+        sample_count = feature.get('number_samples', 1)
+
+        # Features with very tight RT spread and high sample counts are more reliable
+        if rt_spread <= 2.0 or sample_count >= 10:  # More permissive validation
+            validated_features.append(feature)
+        else:
+            # Could implement more sophisticated sample overlap checking here
+            validated_features.append(feature)  # Keep for now
+
+    return validated_features
+
+
+def _filter_rt_spread(self, features: list, max_rt_spread: float) -> list:
+    """
+    Filter out features with excessive RT spread.
+
+    Args:
+        features: List of consensus feature dictionaries
+        max_rt_spread: Maximum allowed RT spread in seconds
+
+    Returns:
+        List of filtered features
+    """
+    filtered_features = []
+    filtered_count = 0
+
+    for feature in features:
+        rt_min = feature.get('rt_min', feature['rt'])
+        rt_max = feature.get('rt_max', feature['rt'])
+        rt_spread = rt_max - rt_min
+
+        if rt_spread <= max_rt_spread:
+            filtered_features.append(feature)
+        else:
+            filtered_count += 1
+
+    if filtered_count > 0:
+        self.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")
+
+    return filtered_features
+
+
+def _filter_coherence(self, features: list, min_coherence: float) -> list:
+    """
+    Filter out features with low chromatographic coherence.
+
+    Args:
+        features: List of consensus feature dictionaries
+        min_coherence: Minimum chromatographic coherence score
+
+    Returns:
+        List of filtered features
+    """
+    filtered_features = []
+    filtered_count = 0
+
+    for feature in features:
+        coherence = feature.get('chrom_coherence_mean', 1.0)  # Default to high coherence if missing
+
+        if coherence >= min_coherence:
+            filtered_features.append(feature)
+        else:
+            filtered_count += 1
+
+    if filtered_count > 0:
+        self.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")
+
+    return filtered_features
+
+
 def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
     """KD-tree based merge without RT warping"""

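The oversegmentation score used by _optimize_rt_tolerance is a weighted sum of four normalized metrics. A toy, self-contained recalculation with made-up numbers (not package data) to show how the weights combine:

# Toy values for illustration only
feature_density = 12.0          # consensus features per second of RT range
avg_rt_spread_ratio = 0.6       # mean (rt_max - rt_min) / rt_tol
low_sample_ratio = 0.25         # fraction of features seen in <= 5 samples
excessive_spread_ratio = 0.05   # fraction of features with spread > 2 * rt_tol

score = (
    0.4 * (feature_density / 10.0)   # density term, scaled toward ~1
    + 0.3 * avg_rt_spread_ratio
    + 0.2 * low_sample_ratio
    + 0.1 * excessive_spread_ratio
)
print(round(score, 3))  # 0.715; lower values mean less oversegmentation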
@@ -244,8 +773,8 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
     return consensus_map


-def
-    """
+def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
+    """KD-based chunked merge with proper cross-chunk consensus building"""

     n_samples = len(self.features_maps)
     if n_samples <= params.chunk_size:
@@ -307,6 +836,64 @@ def _merge_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_
         return consensus_map


+def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
+    """QT-based chunked merge with proper cross-chunk consensus building"""
+
+    n_samples = len(self.features_maps)
+    if n_samples <= params.chunk_size:
+        self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
+        consensus_map = _merge_qt(self, params)
+        # Extract consensus features to populate consensus_df for chunked method consistency
+        self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
+        return consensus_map
+
+    # Process in chunks
+    chunks = []
+    for i in range(0, n_samples, params.chunk_size):
+        chunk_end = min(i + params.chunk_size, n_samples)
+        chunks.append((i, self.features_maps[i:chunk_end]))
+
+    self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
+
+    # Process each chunk to create chunk consensus maps
+    chunk_consensus_maps = []
+
+    for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc="Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
+        chunk_consensus_map = oms.ConsensusMap()
+
+        # Set up file descriptions for chunk
+        file_descriptions = chunk_consensus_map.getColumnHeaders()
+        for j, feature_map in enumerate(chunk_maps):
+            file_description = file_descriptions.get(j, oms.ColumnHeader())
+            file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
+            file_description.size = feature_map.size()
+            file_description.unique_id = feature_map.getUniqueId()
+            file_descriptions[j] = file_description
+
+        chunk_consensus_map.setColumnHeaders(file_descriptions)
+
+        # Use QT algorithm for chunk (main difference from KD chunked)
+        grouper = oms.FeatureGroupingAlgorithmQT()
+        chunk_params = grouper.getParameters()
+        chunk_params.setValue("distance_RT:max_difference", params.rt_tol)
+        chunk_params.setValue("distance_MZ:max_difference", params.mz_tol)
+        chunk_params.setValue("distance_MZ:unit", "Da")
+        chunk_params.setValue("ignore_charge", "true")
+        chunk_params.setValue("nr_partitions", params.nr_partitions)
+
+        grouper.setParameters(chunk_params)
+        grouper.group(chunk_maps, chunk_consensus_map)
+
+        chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
+
+    # Merge chunk results with proper cross-chunk consensus building
+    _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
+
+    # Create a dummy consensus map for compatibility (since other functions expect it)
+    consensus_map = oms.ConsensusMap()
+    return consensus_map
+
+
 def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
     """
     Scalable aggregation of chunk consensus maps into final consensus_df.
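The chunking loop above simply slices the list of feature maps into windows of chunk_size samples. A minimal standalone sketch of that partitioning, independent of pyOpenMS, to make the (start_index, slice) pairing explicit:

def make_chunks(feature_maps, chunk_size):
    # Returns (start_index, slice) pairs, mirroring the loop in _merge_qt_chunked
    chunks = []
    for i in range(0, len(feature_maps), chunk_size):
        chunks.append((i, feature_maps[i:min(i + chunk_size, len(feature_maps))]))
    return chunks

print([(start, len(maps)) for start, maps in make_chunks(list(range(1200)), 500)])
# [(0, 500), (500, 500), (1000, 200)]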
@@ -470,11 +1057,19 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
                 b = chunk_consensus_list[j]
                 if a['chunk_idx'] == b['chunk_idx']:
                     continue
-
+
+                # Primary check: centroid distance (strict)
                 centroid_close = (abs(a['rt']-b['rt']) <= rt_tol and abs(a['mz']-b['mz']) <= mz_tol)
-
-
-
+
+                # Secondary check: interval overlap (more conservative)
+                # Only allow interval overlap if centroids are reasonably close (within 2x tolerance)
+                centroids_reasonable = (abs(a['rt']-b['rt']) <= 2 * rt_tol and abs(a['mz']-b['mz']) <= 2 * mz_tol)
+                if centroids_reasonable:
+                    rt_overlap = (a['rt_min'] - rt_tol/2) <= (b['rt_max'] + rt_tol/2) and (b['rt_min'] - rt_tol/2) <= (a['rt_max'] + rt_tol/2)
+                    mz_overlap = (a['mz_min'] - mz_tol/2) <= (b['mz_max'] + mz_tol/2) and (b['mz_min'] - mz_tol/2) <= (a['mz_max'] + mz_tol/2)
+                else:
+                    rt_overlap = mz_overlap = False
+
                 if centroid_close or (rt_overlap and mz_overlap):
                     uf.union(i,j)

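The stricter cross-chunk linking above combines a centroid test with a guarded, padded interval-overlap test. A small sketch of the same predicate on two plain dictionaries (field names as in the diff), useful for reasoning about which candidate pairs get unioned:

def should_link(a, b, rt_tol, mz_tol):
    # Primary check: centroids within tolerance
    centroid_close = abs(a['rt'] - b['rt']) <= rt_tol and abs(a['mz'] - b['mz']) <= mz_tol
    # Secondary check: padded interval overlap, only if centroids are within 2x tolerance
    if abs(a['rt'] - b['rt']) <= 2 * rt_tol and abs(a['mz'] - b['mz']) <= 2 * mz_tol:
        rt_overlap = (a['rt_min'] - rt_tol / 2) <= (b['rt_max'] + rt_tol / 2) and \
                     (b['rt_min'] - rt_tol / 2) <= (a['rt_max'] + rt_tol / 2)
        mz_overlap = (a['mz_min'] - mz_tol / 2) <= (b['mz_max'] + mz_tol / 2) and \
                     (b['mz_min'] - mz_tol / 2) <= (a['mz_max'] + mz_tol / 2)
    else:
        rt_overlap = mz_overlap = False
    return centroid_close or (rt_overlap and mz_overlap)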
@@ -611,6 +1206,17 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
                 cached_adducts_df=cached_adducts_df,
                 cached_valid_adducts=cached_valid_adducts,
             )
+
+            # Validate RT spread doesn't exceed tolerance (with some flexibility for chunked merge)
+            rt_spread = metadata.get('rt_max', 0) - metadata.get('rt_min', 0)
+            max_allowed_spread = params.rt_tol * 2  # Allow 2x tolerance for chunked method
+
+            if rt_spread > max_allowed_spread:
+                # Skip consensus features with excessive RT spread
+                self.logger.debug(f"Skipping consensus feature {consensus_uid_counter} with RT spread {rt_spread:.3f}s > {max_allowed_spread:.3f}s")
+                consensus_uid_counter += 1
+                continue
+
             consensus_metadata.append(metadata)

             # Build mapping rows (deduplicated)
@@ -689,8 +1295,8 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
     inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
     coherence_values = np.array([fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None])
     prominence_values = np.array([fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None])
-    prominence_scaled_values = np.array([fd.get("
-    height_scaled_values = np.array([fd.get("
+    prominence_scaled_values = np.array([fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None])
+    height_scaled_values = np.array([fd.get("chrom_prominence_scaled", 0) for fd in feature_data_list if fd.get("chrom_prominence_scaled") is not None])
     iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
     charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])

@@ -1006,16 +1612,16 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
     )
     prominence_scaled_values = np.array(
         [
-            fd.get("
+            fd.get("chrom_height_scaled", 0)
             for fd in feature_data_list
-            if fd.get("
+            if fd.get("chrom_height_scaled") is not None
         ],
     )
     height_scaled_values = np.array(
         [
-            fd.get("
+            fd.get("chrom_prominence_scaled", 0)
             for fd in feature_data_list
-            if fd.get("
+            if fd.get("chrom_prominence_scaled") is not None
         ],
     )
     iso_values = np.array(
masster/study/plot.py
CHANGED
@@ -310,8 +310,22 @@ def plot_alignment(
         max_inty = sample_data.select(pl.col("inty").max()).item() or 1

         # Get sample information
-        sample_name = str(sample)
         sample_uid = sample if sample_col == "sample_uid" else sample_data.select(pl.col("sample_uid")).item() if "sample_uid" in sample_data.columns else sample
+
+        # Try to get actual sample name from samples_df if available
+        sample_name = str(sample)  # fallback
+        if hasattr(self, "samples_df") and self.samples_df is not None and sample_uid is not None:
+            try:
+                sample_name_result = (
+                    self.samples_df.filter(pl.col("sample_uid") == sample_uid)
+                    .select("sample_name")
+                    .to_series()
+                )
+                if len(sample_name_result) > 0 and sample_name_result[0] is not None:
+                    sample_name = str(sample_name_result[0])
+            except Exception:
+                # Keep the fallback value
+                pass

         # Select columns to process
         cols_to_select = ["rt", "mz", "inty"]
masster/study/processing.py
CHANGED
@@ -97,7 +97,6 @@ def align(self, **kwargs):
         _align_kd_algorithm(self, fmaps, params)
     else:
         self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
-        self.logger.error(f"Unknown alignment algorithm '{algorithm}'")

     # check if rt_original exists in features_df, if not, add it after rt
     if "rt_original" not in self.features_df.columns:
{masster-0.4.16.dist-info → masster-0.4.18.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.4.
+Version: 0.4.18
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster
@@ -767,7 +767,8 @@ study.integrate()
 # export results
 study.export_mgf()
 study.export_mztab()
-study.
+study.export_xlsx()
+study.export_parquet()

 # Save the study to .study5
 study.save()
{masster-0.4.16.dist-info → masster-0.4.18.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
 masster/__init__.py,sha256=HHjKhCjkAc98LhoQfu4C6L-W2vfTEc1iXaPTxxcl_4A,800
-masster/_version.py,sha256=
+masster/_version.py,sha256=OUcHIwT4wa5AqV46S88edNYE4u4sKsoESNk3lFdxs_c,257
 masster/chromatogram.py,sha256=iYpdv8C17zVnlWvOFgAn9ns2uFGiF-GgoYf5QVVAbHs,19319
 masster/logger.py,sha256=W50V_uh8RSYwGxDrDFhOuj5jpu2tKJyt_16lMw9kQwA,14755
 masster/spectrum.py,sha256=_upC_g2N9gwTaflXAugs9pSXpKUmzbIehofDordk7WI,47718
@@ -43,10 +43,10 @@ masster/study/h5.py,sha256=LiVGUAtULyPpZIUmKVJSaV38huJb8FsKOUWBOqiv0QU,82363
 masster/study/helpers.py,sha256=M5_q8O5tuFchKPW04PTuj3X335lDA2VZqcs4D8ZQJEk,158604
 masster/study/id.py,sha256=6NUBBKZCFOU1wlDKM0eXQeOIStSZCRNJ_3x7ZaIHzmM,55263
 masster/study/load.py,sha256=CQQY_7BzagE3oQTdDlqNyfuMdVWIAft-M4a2WCFnxp0,70695
-masster/study/merge.py,sha256=
+masster/study/merge.py,sha256=2Vqj0OaTZxwtjYu1l5PmRpMmT8_cHh-R761FUvBE_Sk,95741
 masster/study/parameters.py,sha256=0elaF7YspTsB7qyajWAbRNL2VfKlGz5GJLifmO8IGkk,3276
-masster/study/plot.py,sha256=
-masster/study/processing.py,sha256=
+masster/study/plot.py,sha256=SimX-IlqISEItAnTBsx4xsdYHRAevfN41cCENVns1lw,88236
+masster/study/processing.py,sha256=u1MSRKTzcqHNz_dClSUSfgTxkNRdBLXtVyO5LXuW_uk,41031
 masster/study/save.py,sha256=YCvp4xhnG16sNXaT2mFDBoCrIMub0Es61B97qLo0maw,6705
 masster/study/study.py,sha256=LO_hbJOOCZzeA3uterPKImFgPG6fCNQKMSVMtEwW3DU,38815
 masster/study/study5_schema.json,sha256=c0w24QdHak01m04I1VPu97KvF2468FcaqROhf6pmLk4,7507
@@ -60,7 +60,7 @@ masster/study/defaults/find_ms2_def.py,sha256=RL0DFG41wQ05U8UQKUGr3vzSl3mU0m0knQ
 masster/study/defaults/identify_def.py,sha256=96rxoCAPQj_yX-3mRoD2LTkTLJgG27eJQqwarLv5jL0,10580
 masster/study/defaults/integrate_chrom_def.py,sha256=0MNIWGTjty-Zu-NTQsIweuj3UVqEY3x1x8pK0mPwYak,7264
 masster/study/defaults/integrate_def.py,sha256=Vf4SAzdBfnsSZ3IRaF0qZvWu3gMDPHdgPfMYoPKeWv8,7246
-masster/study/defaults/merge_def.py,sha256=
+masster/study/defaults/merge_def.py,sha256=X7mTCgtQhglOTjwg06oSMFSbLBJSKsHmJeVVfYE2qHE,13272
 masster/study/defaults/study_def.py,sha256=h8dYbi9xv0sesCSQik49Z53IkskMmNtW6ixl7it5pL0,16033
 masster/wizard/README.md,sha256=mL1A3YWJZOefpJ6D0-HqGLkVRmUlOpwyVFdvJBeeoZM,14149
 masster/wizard/__init__.py,sha256=A9GHQvkq4lSRIA8V6AKB-TJy8s_npH8i1baUGdkw_is,364
@@ -68,8 +68,8 @@ masster/wizard/example.py,sha256=xEZFTH9UZ8HKOm6s3JL8Js0Uw5ChnISWBHSZCL32vsM,798
 masster/wizard/test_structure.py,sha256=h88gsYYCG6iDRjqPZC_r1H1T8y79j0E-K6OrwuHaSCU,1586
 masster/wizard/test_wizard.py,sha256=CMp1cpjH3iYYC5Fy6puF_K0kfwwk3bgOsSbUGW-t7Xk,8986
 masster/wizard/wizard.py,sha256=jMLHy4cXgNEE_-vshFmA7BNEByhfA6tV7O91jhiMYuw,48054
-masster-0.4.
-masster-0.4.
-masster-0.4.
-masster-0.4.
-masster-0.4.
+masster-0.4.18.dist-info/METADATA,sha256=pn-XNHgHqlY1KgiYkQ2Dyke9E1nnCP3mn-ja5W5QPyM,44207
+masster-0.4.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+masster-0.4.18.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
+masster-0.4.18.dist-info/licenses/LICENSE,sha256=bx5iLIKjgAdYQ7sISn7DsfHRKkoCUm1154sJJKhgqnU,35184
+masster-0.4.18.dist-info/RECORD,,
File without changes
|
|
File without changes
|
|
File without changes
|