masster 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of masster might be problematic.

masster/_version.py CHANGED
@@ -1,7 +1,7 @@
  from __future__ import annotations


- __version__ = "0.4.17"
+ __version__ = "0.4.19"


  def get_version():
masster/study/defaults/merge_def.py CHANGED
@@ -1,7 +1,7 @@
  """Parameter class for Study merge method."""

  from dataclasses import dataclass, field
- from typing import Any
+ from typing import Any, Optional


  @dataclass
@@ -25,25 +25,28 @@ class merge_defaults:
  link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
  """

- method: str = "quality"
- min_samples: int = 10
+ method: str = "qt"
+ min_samples: int = 2
  rt_tol: float = 5.0
  mz_tol: float = 0.01
- chunk_size: int = 300
+ chunk_size: int = 500
  nr_partitions: int = 1000
- min_rel_cc_size: float = 0.2
+ min_rel_cc_size: float = 0.1
  max_pairwise_log_fc: float = -1.0
  max_nr_conflicts: int = 0
  link_ms2: bool = True

+ # Parallel processing parameters
+ threads: Optional[int] = None
+
  # KD-Strict specific parameters
  optimize_rt_tol: bool = False
- rt_tol_range: tuple = (0.8, 2.0)
- rt_tol_steps: int = 5
- secondary_merge_rt_tol: float = 0.5
+ rt_tol_range: tuple = (0.5, 4.0)
+ rt_tol_steps: int = 7
+ secondary_merge_rt_tol: float = 1.0
  secondary_merge_mz_tol: float = 0.005
  min_sample_overlap: float = 0.8
- max_rt_spread: float = None # Will default to 2x rt_tol
+ max_rt_spread: float = 2.0 # Will default to 2x rt_tol
  min_coherence: float = 0.0

  _param_metadata: dict[str, dict[str, Any]] = field(
@@ -53,7 +56,8 @@ class merge_defaults:
  "description": "Merge method (algorithm) to use",
  "default": "quality",
  "allowed_values": ["sensitivity", "qt", "nowarp", "chunked", "quality",
- "kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict"],
+ "kd", "kd-nowarp", "kd_nowarp", "kd-strict", "kd_strict",
+ "kd_chunked", "kd-chunked", "qt_chunked", "qt-chunked"],
  },
  "min_samples": {
  "dtype": int,
@@ -114,6 +118,14 @@ class merge_defaults:
  "description": "Whether to link MS2 spectra to consensus features",
  "default": True,
  },
+ # Parallel processing parameters
+ "threads": {
+ "dtype": [int, type(None)],
+ "description": "Number of parallel threads/processes for chunked methods (None=original sequential)",
+ "default": None,
+ "min_value": 1,
+ "max_value": 32,
+ },
  # KD-Strict specific parameters
  "optimize_rt_tol": {
  "dtype": bool,
@@ -216,7 +228,37 @@ class merge_defaults:
  metadata = self._param_metadata[param_name]
  expected_dtype = metadata["dtype"]

- # Type checking
+ # Handle Optional types (list of types including None)
+ if isinstance(expected_dtype, list):
+ # Check if value matches any of the allowed types
+ valid_type = False
+ for dtype in expected_dtype:
+ if dtype is type(None) and value is None:
+ return True # None is explicitly allowed
+ elif dtype is int and isinstance(value, int):
+ valid_type = True
+ break
+ elif dtype is float and isinstance(value, (int, float)):
+ valid_type = True
+ break
+ elif dtype is bool and isinstance(value, bool):
+ valid_type = True
+ break
+ elif dtype is str and isinstance(value, str):
+ valid_type = True
+ break
+
+ if not valid_type:
+ return False
+
+ # For None values, skip further validation
+ if value is None:
+ return True
+
+ # Use the first non-None type for range validation
+ expected_dtype = next((dt for dt in expected_dtype if dt is not type(None)), expected_dtype[0])
+
+ # Type checking for non-Optional types
  if expected_dtype is int:
  if not isinstance(value, int):
  try:
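The new `threads` entry is the first parameter whose `dtype` is a list (`[int, type(None)]`), and the validation changes above walk that list before falling back to the single-type checks. A minimal standalone sketch of the same pattern, detached from the package (the `validate_optional` helper and its arguments are illustrative, not masster's API):

    # Sketch: accept None only when type(None) is listed, otherwise require one
    # of the concrete types and apply the optional range bounds.
    def validate_optional(value, allowed_types, min_value=None, max_value=None):
        if value is None:
            return type(None) in allowed_types
        concrete = [t for t in allowed_types if t is not type(None)]
        if not any(isinstance(value, t) for t in concrete):
            return False
        if min_value is not None and value < min_value:
            return False
        if max_value is not None and value > max_value:
            return False
        return True

    print(validate_optional(None, [int, type(None)]))       # True
    print(validate_optional(8, [int, type(None)], 1, 32))   # True
    print(validate_optional(64, [int, type(None)], 1, 32))  # False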
masster/study/merge.py CHANGED
@@ -1,6 +1,6 @@
  """
  Unified merge module for the Study class.
- Supports multiple merge methods: 'kd', 'qt', 'kd-nowarp', 'chunked'
+ Supports multiple merge methods: 'kd', 'qt', 'kd-nowarp', 'kd_chunked', 'qt_chunked'
  """

  import time
@@ -10,9 +10,269 @@ from datetime import datetime
  from tqdm import tqdm
  import pyopenms as oms
  import polars as pl
+ from concurrent.futures import ProcessPoolExecutor, as_completed
  from masster.study.defaults import merge_defaults


+ def _process_kd_chunk_parallel(chunk_data):
+ """
+ Process a single KD chunk in parallel by reconstructing FeatureMaps from features_df slice.
+
+ Args:
+ chunk_data: Dictionary containing chunk processing parameters
+
+ Returns:
+ Tuple of (chunk_start_idx, serialized_consensus_features)
+ """
+ import pyopenms as oms
+
+ chunk_start_idx = chunk_data['chunk_start_idx']
+ chunk_features_data = chunk_data['chunk_features_data'] # List of feature dicts
+ chunk_samples_data = chunk_data['chunk_samples_data'] # List of sample dicts
+ params_dict = chunk_data['params']
+
+ # Reconstruct FeatureMaps from features data for each sample in the chunk
+ chunk_maps = []
+
+ for sample_data in chunk_samples_data:
+ sample_uid = sample_data['sample_uid']
+
+ # Filter features for this specific sample
+ sample_features = [f for f in chunk_features_data if f['sample_uid'] == sample_uid]
+
+ # Create FeatureMap for this sample
+ feature_map = oms.FeatureMap()
+
+ # Add each feature to the map
+ for feature_dict in sample_features:
+ feature = oms.Feature()
+ feature.setRT(float(feature_dict['rt']))
+ feature.setMZ(float(feature_dict['mz']))
+ feature.setIntensity(float(feature_dict['inty']))
+ feature.setCharge(int(feature_dict.get('charge', 0)))
+
+ # Set unique ID using feature_id for mapping back
+ feature.setUniqueId(int(feature_dict['feature_id']))
+
+ feature_map.push_back(feature)
+
+ chunk_maps.append(feature_map)
+
+ # Create the chunk consensus map
+ chunk_consensus_map = oms.ConsensusMap()
+
+ # Set up file descriptions for chunk
+ file_descriptions = chunk_consensus_map.getColumnHeaders()
+ for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
+ file_description = file_descriptions.get(j, oms.ColumnHeader())
+ file_description.filename = sample_data['sample_name']
+ file_description.size = feature_map.size()
+ file_description.unique_id = feature_map.getUniqueId()
+ file_descriptions[j] = file_description
+
+ chunk_consensus_map.setColumnHeaders(file_descriptions)
+
+ # Use KD algorithm for chunk
+ grouper = oms.FeatureGroupingAlgorithmKD()
+ chunk_params = grouper.getParameters()
+ chunk_params.setValue("mz_unit", "Da")
+ chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
+ chunk_params.setValue("warp:enabled", "true")
+ chunk_params.setValue("warp:rt_tol", params_dict['rt_tol'])
+ chunk_params.setValue("warp:mz_tol", params_dict['mz_tol'])
+ chunk_params.setValue("link:rt_tol", params_dict['rt_tol'])
+ chunk_params.setValue("link:mz_tol", params_dict['mz_tol'])
+ chunk_params.setValue("link:min_rel_cc_size", params_dict['min_rel_cc_size'])
+ chunk_params.setValue("link:max_pairwise_log_fc", params_dict['max_pairwise_log_fc'])
+ chunk_params.setValue("link:max_nr_conflicts", params_dict['max_nr_conflicts'])
+
+ grouper.setParameters(chunk_params)
+ grouper.group(chunk_maps, chunk_consensus_map)
+
+ # Serialize the consensus map result for cross-process communication
+ consensus_features = []
+ for consensus_feature in chunk_consensus_map:
+ feature_data = {
+ 'rt': consensus_feature.getRT(),
+ 'mz': consensus_feature.getMZ(),
+ 'intensity': consensus_feature.getIntensity(),
+ 'quality': consensus_feature.getQuality(),
+ 'unique_id': str(consensus_feature.getUniqueId()),
+ 'features': []
+ }
+
+ # Get constituent features
+ for feature_handle in consensus_feature.getFeatureList():
+ feature_handle_data = {
+ 'unique_id': str(feature_handle.getUniqueId()),
+ 'map_index': feature_handle.getMapIndex()
+ }
+ feature_data['features'].append(feature_handle_data)
+
+ consensus_features.append(feature_data)
+
+ return chunk_start_idx, consensus_features
+
+
+ def _deserialize_consensus_features(consensus_features):
+ """
+ Deserialize consensus features back into an OpenMS ConsensusMap.
+
+ Args:
+ consensus_features: List of serialized consensus feature dictionaries
+
+ Returns:
+ OpenMS ConsensusMap object
+ """
+ import pyopenms as oms
+
+ consensus_map = oms.ConsensusMap()
+
+ for feature_data in consensus_features:
+ consensus_feature = oms.ConsensusFeature()
+ consensus_feature.setRT(float(feature_data['rt']))
+ consensus_feature.setMZ(float(feature_data['mz']))
+ consensus_feature.setIntensity(float(feature_data['intensity']))
+ consensus_feature.setQuality(float(feature_data['quality']))
+ consensus_feature.setUniqueId(int(feature_data['unique_id']))
+
+ # Reconstruct feature handles (simplified approach)
+ feature_handles = []
+ for handle_data in feature_data['features']:
+ feature_handle = oms.FeatureHandle()
+ feature_handle.setUniqueId(int(handle_data['unique_id']))
+ feature_handle.setMapIndex(int(handle_data['map_index']))
+ feature_handles.append(feature_handle)
+
+ # Set the feature list - properly add feature handles back to consensus feature
+ if feature_handles:
+ # Add each feature handle to the consensus feature using the correct OpenMS API
+ for feature_handle in feature_handles:
+ consensus_feature.getFeatureList().append(feature_handle)
+
+ consensus_map.push_back(consensus_feature)
+
+ return consensus_map
+
+
+ def _process_qt_chunk_parallel(chunk_data):
+ """
+ Process a single QT chunk in parallel by reconstructing FeatureMaps from features_df slice.
+
+ Args:
+ chunk_data: Dictionary containing chunk processing parameters
+
+ Returns:
+ Tuple of (chunk_start_idx, serialized_consensus_features)
+ """
+ import pyopenms as oms
+
+ chunk_start_idx = chunk_data['chunk_start_idx']
+ chunk_features_data = chunk_data['chunk_features_data'] # List of feature dicts
+ chunk_samples_data = chunk_data['chunk_samples_data'] # List of sample dicts
+ params_dict = chunk_data['params']
+
+ # Reconstruct FeatureMaps from features data for each sample in the chunk
+ chunk_maps = []
+
+ for sample_data in chunk_samples_data:
+ sample_uid = sample_data['sample_uid']
+
+ # Filter features for this specific sample
+ sample_features = [f for f in chunk_features_data if f['sample_uid'] == sample_uid]
+
+ # Create FeatureMap for this sample
+ feature_map = oms.FeatureMap()
+
+ # Add each feature to the map
+ for feature_dict in sample_features:
+ feature = oms.Feature()
+ feature.setRT(float(feature_dict['rt']))
+ feature.setMZ(float(feature_dict['mz']))
+ feature.setIntensity(float(feature_dict['inty']))
+ feature.setCharge(int(feature_dict.get('charge', 0)))
+
+ # Set unique ID using feature_id for mapping back
+ feature.setUniqueId(int(feature_dict['feature_id']))
+
+ feature_map.push_back(feature)
+
+ chunk_maps.append(feature_map)
+
+ # Create the chunk consensus map
+ chunk_consensus_map = oms.ConsensusMap()
+
+ # Set up file descriptions for chunk
+ file_descriptions = chunk_consensus_map.getColumnHeaders()
+ for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
+ file_description = file_descriptions.get(j, oms.ColumnHeader())
+ file_description.filename = sample_data['sample_name']
+ file_description.size = feature_map.size()
+ file_description.unique_id = feature_map.getUniqueId()
+ file_descriptions[j] = file_description
+
+ chunk_consensus_map.setColumnHeaders(file_descriptions)
+
+ # Use QT algorithm for chunk
+ grouper = oms.FeatureGroupingAlgorithmQT()
+ chunk_params = grouper.getParameters()
+ chunk_params.setValue("distance_RT:max_difference", params_dict['rt_tol'])
+ chunk_params.setValue("distance_MZ:max_difference", params_dict['mz_tol'])
+ chunk_params.setValue("distance_MZ:unit", "Da")
+ chunk_params.setValue("ignore_charge", "true")
+ chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
+
+ grouper.setParameters(chunk_params)
+ grouper.group(chunk_maps, chunk_consensus_map)
+
+ # Serialize the consensus map result for cross-process communication
+ consensus_features = []
+ for consensus_feature in chunk_consensus_map:
+ feature_data = {
+ 'rt': consensus_feature.getRT(),
+ 'mz': consensus_feature.getMZ(),
+ 'intensity': consensus_feature.getIntensity(),
+ 'quality': consensus_feature.getQuality(),
+ 'unique_id': str(consensus_feature.getUniqueId()),
+ 'features': []
+ }
+
+ # Get constituent features
+ for feature_handle in consensus_feature.getFeatureList():
+ feature_handle_data = {
+ 'unique_id': str(feature_handle.getUniqueId()),
+ 'map_index': feature_handle.getMapIndex()
+ }
+ feature_data['features'].append(feature_handle_data)
+
+ consensus_features.append(feature_data)
+
+ return chunk_start_idx, consensus_features
+
+
+ def _serialize_feature_map(feature_map):
+ """
+ Serialize a FeatureMap to a list of dictionaries for multiprocessing.
+
+ Args:
+ feature_map: OpenMS FeatureMap object
+
+ Returns:
+ List of feature dictionaries
+ """
+ features_data = []
+ for feature in feature_map:
+ feature_data = {
+ 'rt': feature.getRT(),
+ 'mz': feature.getMZ(),
+ 'intensity': feature.getIntensity(),
+ 'charge': feature.getCharge(),
+ 'unique_id': feature.getUniqueId()
+ }
+ features_data.append(feature_data)
+ return features_data
+
+
  def merge(self, **kwargs) -> None:
  """
  Group features across samples into consensus features using various algorithms.
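The helper functions above exist because pyopenms objects do not pickle across process boundaries: each worker receives plain dicts, rebuilds FeatureMaps, and returns plain dicts again. The submit-and-collect pattern they plug into is standard concurrent.futures usage; a self-contained sketch with a placeholder worker (`double_chunk` is not masster code):

    from concurrent.futures import ProcessPoolExecutor, as_completed

    def double_chunk(chunk):
        # Placeholder worker: accepts and returns only picklable data,
        # tagged with the chunk's start index so order can be restored later.
        start_idx, items = chunk
        return start_idx, [x * 2 for x in items]

    if __name__ == "__main__":
        chunks = [(0, [1, 2]), (2, [3, 4]), (4, [5, 6])]
        results = []
        with ProcessPoolExecutor(max_workers=2) as executor:
            futures = {executor.submit(double_chunk, c): i for i, c in enumerate(chunks)}
            for future in as_completed(futures):
                results.append(future.result())  # completion order, not submit order
        results.sort(key=lambda r: r[0])          # restore chunk order before merging
        print(results)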
@@ -25,7 +285,7 @@ def merge(self, **kwargs) -> None:
  **kwargs : dict
  Parameters from merge_defaults class:
  - method : str, default 'quality'
- Merge algorithm: 'sensitivity', 'qt', 'nowarp', 'chunked', 'quality'
+ Merge algorithm: 'sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality'
  - min_samples : int, default 10
  Minimum number of samples for consensus feature
  - rt_tol : float, default 2.0
@@ -34,6 +294,8 @@ def merge(self, **kwargs) -> None:
  m/z tolerance in Da (Daltons) for all methods
  - chunk_size : int, default 500
  Chunk size for 'chunked' method
+ - threads : int, default 1
+ Number of parallel processes for chunked methods (kd_chunked, qt_chunked)
  - nr_partitions : int, default 500
  Number of partitions in m/z dimension for KD algorithms
  - min_rel_cc_size : float, default 0.3
@@ -52,9 +314,21 @@ def merge(self, **kwargs) -> None:
  - Sensitivity: Best raw sensitivity, O(n log n), maximum feature detection
  - QT: Thorough but slow O(n²), good for <1000 samples
  - NoWarp: Memory efficient KD without RT warping for large datasets
- - Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
+ - KD-Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
  Uses optimized partitioning for better memory management while maintaining
- full cross-sample consensus feature detection.
+ full cross-sample consensus feature detection. Supports parallel processing.
+ - QT-Chunked: Memory-optimized QT algorithm for very large datasets (>5000 samples)
+ Uses QT clustering in first stage with optimized cross-chunk consensus building.
+ Supports parallel processing.
+
+ Parallel Processing
+ ------------------
+ For kd_chunked and qt_chunked methods, use threads > 1 to enable parallel processing
+ of chunk alignments. This can significantly reduce processing time for large datasets
+ by processing multiple chunks simultaneously in separate processes.
+
+ Example:
+ study.merge(method='kd_chunked', threads=4, chunk_size=200)
  """
  start_time = time.time()

@@ -76,7 +350,12 @@ def merge(self, **kwargs) -> None:
  'kd_nowarp': 'nowarp',
  'kd-strict': 'quality',
  'kd_strict': 'quality',
- 'kdstrict': 'quality'
+ 'kdstrict': 'quality',
+ 'chunked': 'kd_chunked', # Map old 'chunked' to 'kd_chunked'
+ 'qtchunked': 'qt_chunked', # QT chunked variants
+ 'qt-chunked': 'qt_chunked',
+ 'kdchunked': 'kd_chunked', # KD chunked variants
+ 'kd-chunked': 'kd_chunked'
  }

  if params.method in method_mapping:
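With the alias table above, legacy spellings such as 'chunked', 'kd-chunked', or 'qtchunked' resolve to the two canonical names before validation. Assuming `study` is an already-populated Study instance, typical calls would look like this (the qt_chunked call is an extrapolation from the documented kd_chunked example):

    # Sequential chunked merge (threads defaults to None, i.e. the original behavior)
    study.merge(method="qt_chunked", chunk_size=500)

    # Parallel chunked merge, as in the docstring example above
    study.merge(method="kd_chunked", threads=4, chunk_size=200)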
@@ -85,8 +364,8 @@ def merge(self, **kwargs) -> None:
  self.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")

  # Validate method
- if params.method not in ['sensitivity', 'qt', 'nowarp', 'chunked', 'quality']:
- raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'chunked', 'quality']")
+ if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
+ raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")

  # Persist last used params for diagnostics
  try:
@@ -147,9 +426,12 @@ def merge(self, **kwargs) -> None:
  elif params.method == 'quality':
  consensus_map = _merge_kd_strict(self, params)
  # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
- elif params.method == 'chunked':
- consensus_map = _merge_chunked(self, params, cached_adducts_df, cached_valid_adducts)
- # Note: _merge_chunked populates consensus_df directly, no need to extract
+ elif params.method == 'kd_chunked':
+ consensus_map = _merge_kd_chunked(self, params, cached_adducts_df, cached_valid_adducts)
+ # Note: _merge_kd_chunked populates consensus_df directly, no need to extract
+ elif params.method == 'qt_chunked':
+ consensus_map = _merge_qt_chunked(self, params, cached_adducts_df, cached_valid_adducts)
+ # Note: _merge_qt_chunked populates consensus_df directly, no need to extract

  # Perform adduct grouping
  self._perform_adduct_grouping(params.rt_tol, params.mz_tol)
@@ -189,9 +471,9 @@ def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
  params_oms.setValue("warp:mz_tol", params.mz_tol)
  params_oms.setValue("link:rt_tol", params.rt_tol)
  params_oms.setValue("link:mz_tol", params.mz_tol)
- params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
- params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
- params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
+ #params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
+ #params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
+ #params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
  #params_oms.setValue("link:charge_merging", "With_charge_zero") THIS LEADS TO A CRASH

  grouper.setParameters(params_oms)
@@ -227,9 +509,9 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
  params_oms.setValue("distance_MZ:max_difference", params.mz_tol)
  params_oms.setValue("distance_MZ:unit", "Da") # QT now uses Da like all other methods
  params_oms.setValue("ignore_charge", "true")
- params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
- params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
- params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
+ #params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
+ #params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
+ #params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
  params_oms.setValue("nr_partitions", params.nr_partitions)

  grouper.setParameters(params_oms)
@@ -763,8 +1045,8 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
  return consensus_map


- def _merge_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
- """Chunked merge with proper cross-chunk consensus building"""
+ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
+ """KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""

  n_samples = len(self.features_maps)
  if n_samples <= params.chunk_size:
@@ -780,48 +1062,255 @@ def _merge_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_
  chunk_end = min(i + params.chunk_size, n_samples)
  chunks.append((i, self.features_maps[i:chunk_end]))

- self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
+ self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")

  # Process each chunk to create chunk consensus maps
  chunk_consensus_maps = []

- for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc="Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
- chunk_consensus_map = oms.ConsensusMap()
+ if params.threads is None:
+ # Sequential processing (original behavior)
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}KD Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
+ chunk_consensus_map = oms.ConsensusMap()
+
+ # Set up file descriptions for chunk
+ file_descriptions = chunk_consensus_map.getColumnHeaders()
+ for j, feature_map in enumerate(chunk_maps):
+ file_description = file_descriptions.get(j, oms.ColumnHeader())
+ file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
+ file_description.size = feature_map.size()
+ file_description.unique_id = feature_map.getUniqueId()
+ file_descriptions[j] = file_description
+
+ chunk_consensus_map.setColumnHeaders(file_descriptions)
+
+ # Use KD algorithm for chunk
+ grouper = oms.FeatureGroupingAlgorithmKD()
+ chunk_params = grouper.getParameters()
+ chunk_params.setValue("mz_unit", "Da")
+ chunk_params.setValue("nr_partitions", params.nr_partitions)
+ chunk_params.setValue("warp:enabled", "true")
+ chunk_params.setValue("warp:rt_tol", params.rt_tol)
+ chunk_params.setValue("warp:mz_tol", params.mz_tol)
+ chunk_params.setValue("link:rt_tol", params.rt_tol)
+ chunk_params.setValue("link:mz_tol", params.mz_tol)
+ chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
+ chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
+ chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
+
+ grouper.setParameters(chunk_params)
+ grouper.group(chunk_maps, chunk_consensus_map)
+
+ chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
+
+ else:
+ # Parallel processing
+ self.logger.info(f"Processing chunks in parallel using {params.threads} processes")

- # Set up file descriptions for chunk
- file_descriptions = chunk_consensus_map.getColumnHeaders()
- for j, feature_map in enumerate(chunk_maps):
- file_description = file_descriptions.get(j, oms.ColumnHeader())
- file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
- file_description.size = feature_map.size()
- file_description.unique_id = feature_map.getUniqueId()
- file_descriptions[j] = file_description
+ # Prepare chunk data for parallel processing using features_df slices
+ chunk_data_list = []
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
+ # Get the sample UIDs for this chunk
+ chunk_sample_uids = []
+ chunk_samples_df_rows = []
+ for j in range(len(chunk_maps)):
+ sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
+ chunk_sample_uids.append(sample_row['sample_uid'])
+ chunk_samples_df_rows.append(sample_row)
+
+ # Create a DataFrame for this chunk's samples
+ chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
+
+ # Filter features_df for this chunk's samples and select only necessary columns
+ chunk_features_df = self.features_df.filter(
+ pl.col('sample_uid').is_in(chunk_sample_uids)
+ ).select([
+ 'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
+ ])
+
+ # Convert DataFrames to serializable format (lists of dicts)
+ chunk_features_data = chunk_features_df.to_dicts()
+ chunk_samples_data = chunk_samples_df.to_dicts()
+
+ chunk_data = {
+ 'chunk_start_idx': chunk_start_idx,
+ 'chunk_features_data': chunk_features_data, # List of dicts instead of DataFrame
+ 'chunk_samples_data': chunk_samples_data, # List of dicts instead of DataFrame
+ 'params': {
+ 'nr_partitions': params.nr_partitions,
+ 'rt_tol': params.rt_tol,
+ 'mz_tol': params.mz_tol,
+ 'min_rel_cc_size': params.min_rel_cc_size,
+ 'max_pairwise_log_fc': params.max_pairwise_log_fc,
+ 'max_nr_conflicts': params.max_nr_conflicts
+ }
+ }
+ chunk_data_list.append(chunk_data)

- chunk_consensus_map.setColumnHeaders(file_descriptions)
+ # Process chunks in parallel
+ with ProcessPoolExecutor(max_workers=params.threads) as executor:
+ # Submit all chunk processing tasks
+ future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
+ for i, chunk_data in enumerate(chunk_data_list)}
+
+ # Collect results with progress tracking
+ completed_chunks = 0
+ total_chunks = len(chunk_data_list)
+ serialized_chunk_results = []
+
+ for future in as_completed(future_to_chunk):
+ chunk_idx = future_to_chunk[future]
+ try:
+ chunk_start_idx, consensus_features = future.result()
+ serialized_chunk_results.append((chunk_start_idx, consensus_features))
+ completed_chunks += 1
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
+ self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
+ except Exception as exc:
+ self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
+ raise exc

- # Use KD algorithm for chunk
- grouper = oms.FeatureGroupingAlgorithmKD()
- chunk_params = grouper.getParameters()
- chunk_params.setValue("mz_unit", "Da")
- chunk_params.setValue("nr_partitions", params.nr_partitions)
- chunk_params.setValue("warp:enabled", "true")
- chunk_params.setValue("warp:rt_tol", params.rt_tol)
- chunk_params.setValue("warp:mz_tol", params.mz_tol)
- chunk_params.setValue("link:rt_tol", params.rt_tol)
- chunk_params.setValue("link:mz_tol", params.mz_tol)
- chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
- chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
- chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
+ # Store serialized results for _merge_chunk_results to handle directly
+ chunk_consensus_maps = []
+ for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
+ # Store serialized data directly for _merge_chunk_results to handle
+ chunk_consensus_maps.append((chunk_start_idx, consensus_features))
+
+ # Merge chunk results with proper cross-chunk consensus building
+ # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
+ _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
+
+ # Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
+ consensus_map = oms.ConsensusMap()
+ return consensus_map
+
+
+ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
+ """QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
+
+ n_samples = len(self.features_maps)
+ if n_samples <= params.chunk_size:
+ self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
+ consensus_map = _merge_qt(self, params)
+ # Extract consensus features to populate consensus_df for chunked method consistency
+ self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
+ return consensus_map
+
+ # Process in chunks
+ chunks = []
+ for i in range(0, n_samples, params.chunk_size):
+ chunk_end = min(i + params.chunk_size, n_samples)
+ chunks.append((i, self.features_maps[i:chunk_end]))
+
+ self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
+
+ # Process each chunk to create chunk consensus maps
+ chunk_consensus_maps = []
+
+ if params.threads is None:
+ # Sequential processing (original behavior)
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}QT Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
+ chunk_consensus_map = oms.ConsensusMap()
+
+ # Set up file descriptions for chunk
+ file_descriptions = chunk_consensus_map.getColumnHeaders()
+ for j, feature_map in enumerate(chunk_maps):
+ file_description = file_descriptions.get(j, oms.ColumnHeader())
+ file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
+ file_description.size = feature_map.size()
+ file_description.unique_id = feature_map.getUniqueId()
+ file_descriptions[j] = file_description
+
+ chunk_consensus_map.setColumnHeaders(file_descriptions)
+
+ # Use QT algorithm for chunk (main difference from KD chunked)
+ grouper = oms.FeatureGroupingAlgorithmQT()
+ chunk_params = grouper.getParameters()
+ chunk_params.setValue("distance_RT:max_difference", params.rt_tol)
+ chunk_params.setValue("distance_MZ:max_difference", params.mz_tol)
+ chunk_params.setValue("distance_MZ:unit", "Da")
+ chunk_params.setValue("ignore_charge", "true")
+ chunk_params.setValue("nr_partitions", params.nr_partitions)
+
+ grouper.setParameters(chunk_params)
+ grouper.group(chunk_maps, chunk_consensus_map)
+
+ chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
+
+ else:
+ # Parallel processing
+ self.logger.info(f"Processing chunks in parallel using {params.threads} processes")

- grouper.setParameters(chunk_params)
- grouper.group(chunk_maps, chunk_consensus_map)
+ # Prepare chunk data for parallel processing using features_df slices
+ chunk_data_list = []
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
+ # Get the sample UIDs for this chunk
+ chunk_sample_uids = []
+ chunk_samples_df_rows = []
+ for j in range(len(chunk_maps)):
+ sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
+ chunk_sample_uids.append(sample_row['sample_uid'])
+ chunk_samples_df_rows.append(sample_row)
+
+ # Create a DataFrame for this chunk's samples
+ chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
+
+ # Filter features_df for this chunk's samples and select only necessary columns
+ chunk_features_df = self.features_df.filter(
+ pl.col('sample_uid').is_in(chunk_sample_uids)
+ ).select([
+ 'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
+ ])
+
+ # Convert DataFrames to serializable format (lists of dicts)
+ chunk_features_data = chunk_features_df.to_dicts()
+ chunk_samples_data = chunk_samples_df.to_dicts()
+
+ chunk_data = {
+ 'chunk_start_idx': chunk_start_idx,
+ 'chunk_features_data': chunk_features_data, # List of dicts instead of DataFrame
+ 'chunk_samples_data': chunk_samples_data, # List of dicts instead of DataFrame
+ 'params': {
+ 'nr_partitions': params.nr_partitions,
+ 'rt_tol': params.rt_tol,
+ 'mz_tol': params.mz_tol,
+ }
+ }
+ chunk_data_list.append(chunk_data)

- chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
-
- # Merge chunk results with proper cross-chunk consensus building
+ # Process chunks in parallel
+ with ProcessPoolExecutor(max_workers=params.threads) as executor:
+ # Submit all chunk processing tasks
+ future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
+ for i, chunk_data in enumerate(chunk_data_list)}
+
+ # Collect results with progress tracking
+ completed_chunks = 0
+ total_chunks = len(chunk_data_list)
+ serialized_chunk_results = []
+
+ for future in as_completed(future_to_chunk):
+ chunk_idx = future_to_chunk[future]
+ try:
+ chunk_start_idx, consensus_features = future.result()
+ serialized_chunk_results.append((chunk_start_idx, consensus_features))
+ completed_chunks += 1
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
+ self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
+ except Exception as exc:
+ self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
+ raise exc
+
+ # Store serialized results for _merge_chunk_results to handle directly
+ chunk_consensus_maps = []
+ for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
+ # Store serialized data directly for _merge_chunk_results to handle
+ chunk_consensus_maps.append((chunk_start_idx, consensus_features))
+
+ # Merge chunk results with proper cross-chunk consensus building
+ # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
  _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)

- # Create a dummy consensus map for compatibility (since other functions expect it)
+ # Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
  consensus_map = oms.ConsensusMap()
  return consensus_map
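Both chunked variants prepare their parallel payloads identically: slice the sample table by position, filter `features_df` down to those sample UIDs and a handful of columns, and call `to_dicts()` so everything crossing the process boundary is a plain list of dicts. A condensed sketch of that step using toy polars frames (the column values are invented; only the payload shape mirrors the code above):

    import polars as pl

    samples_df = pl.DataFrame({"sample_uid": [1, 2, 3, 4],
                               "sample_name": ["s1", "s2", "s3", "s4"]})
    features_df = pl.DataFrame({
        "sample_uid": [1, 1, 2, 3, 4],
        "rt":         [10.0, 12.0, 11.5, 30.0, 31.0],
        "mz":         [200.1, 250.2, 200.1, 400.4, 400.4],
        "inty":       [1e5, 2e5, 1.5e5, 3e5, 2.5e5],
        "charge":     [1, 1, 1, 1, 1],
        "feature_id": [101, 102, 103, 104, 105],
    })

    chunk_size = 2
    chunk_data_list = []
    for start in range(0, samples_df.height, chunk_size):
        chunk_samples = samples_df.slice(start, chunk_size)
        uids = chunk_samples["sample_uid"].to_list()
        chunk_features = features_df.filter(pl.col("sample_uid").is_in(uids))
        chunk_data_list.append({
            "chunk_start_idx": start,
            "chunk_samples_data": chunk_samples.to_dicts(),    # picklable
            "chunk_features_data": chunk_features.to_dicts(),  # picklable
        })

    print(len(chunk_data_list), chunk_data_list[0]["chunk_samples_data"])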
@@ -859,61 +1348,128 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
  all_chunk_consensus = []
  consensus_id_counter = 0

- for chunk_idx, (chunk_start_idx, chunk_consensus_map) in enumerate(chunk_consensus_maps):
- for consensus_feature in chunk_consensus_map:
+ for chunk_idx, (chunk_start_idx, chunk_data) in enumerate(chunk_consensus_maps):
+ # Handle both ConsensusMap objects (sequential) and serialized data (parallel)
+ if isinstance(chunk_data, list):
+ # Parallel processing: chunk_data is a list of serialized consensus feature dictionaries
+ consensus_features_data = chunk_data
+ else:
+ # Sequential processing: chunk_data is a ConsensusMap object
+ chunk_consensus_map = chunk_data
+ consensus_features_data = []
+
+ # Extract data from ConsensusMap and convert to serialized format
+ for consensus_feature in chunk_consensus_map:
+ # Extract feature_uids from this consensus feature
+ feature_uids = []
+ feature_data_list = []
+ sample_uids = []
+
+ for feature_handle in consensus_feature.getFeatureList():
+ fuid = str(feature_handle.getUniqueId())
+ if fuid not in feature_uid_map:
+ continue
+
+ feature_uid = feature_uid_map[fuid]
+ feature_data = features_lookup.get(feature_uid)
+ if feature_data:
+ feature_uids.append(feature_uid)
+ feature_data_list.append(feature_data)
+ sample_uids.append(chunk_start_idx + feature_handle.getMapIndex() + 1)
+
+ if not feature_data_list:
+ # No retrievable feature metadata (possible stale map reference) -> skip
+ continue
+
+ # Convert ConsensusFeature to serialized format
+ consensus_feature_data = {
+ 'rt': consensus_feature.getRT(),
+ 'mz': consensus_feature.getMZ(),
+ 'intensity': consensus_feature.getIntensity(),
+ 'quality': consensus_feature.getQuality(),
+ 'feature_uids': feature_uids,
+ 'feature_data_list': feature_data_list,
+ 'sample_uids': sample_uids
+ }
+ consensus_features_data.append(consensus_feature_data)
+
+ # Process the consensus features (now all in serialized format)
+ for consensus_feature_data in consensus_features_data:
  # ACCEPT ALL consensus features (size >=1) here.
  # Reason: A feature that is globally present in many samples can still
  # appear only once inside a given sample chunk. Early filtering at
  # size>=2 causes irreversible loss and underestimates the final
  # consensus count (observed ~296 vs 950 for KD). We defer filtering
  # strictly to the final global min_samples.
-
- # Extract feature_uids from this consensus feature
- feature_uids = []
- feature_data_list = []
- sample_uids = []

- for feature_handle in consensus_feature.getFeatureList():
- fuid = str(feature_handle.getUniqueId())
- if fuid not in feature_uid_map:
+ # For parallel processing, feature data is already extracted
+ if isinstance(chunk_data, list):
+ # Extract feature_uids and data from serialized format for parallel processing
+ feature_uids = []
+ feature_data_list = []
+ sample_uids = []
+
+ for handle_data in consensus_feature_data['features']:
+ fuid = str(handle_data['unique_id'])
+ if fuid not in feature_uid_map:
+ continue
+
+ feature_uid = feature_uid_map[fuid]
+ feature_data = features_lookup.get(feature_uid)
+ if feature_data:
+ feature_uids.append(feature_uid)
+ feature_data_list.append(feature_data)
+ sample_uids.append(chunk_start_idx + handle_data['map_index'] + 1)
+
+ if not feature_data_list:
  continue

- feature_uid = feature_uid_map[fuid]
- feature_data = features_lookup.get(feature_uid)
- if feature_data:
- feature_uids.append(feature_uid)
- feature_data_list.append(feature_data)
- sample_uids.append(chunk_start_idx + feature_handle.getMapIndex() + 1)
+ # Get RT/MZ from consensus feature data
+ consensus_rt = consensus_feature_data['rt']
+ consensus_mz = consensus_feature_data['mz']
+ consensus_intensity = consensus_feature_data['intensity']
+ consensus_quality = consensus_feature_data['quality']
+ else:
+ # Sequential processing: data is already extracted above
+ feature_uids = consensus_feature_data['feature_uids']
+ feature_data_list = consensus_feature_data['feature_data_list']
+ sample_uids = consensus_feature_data['sample_uids']
+ consensus_rt = consensus_feature_data['rt']
+ consensus_mz = consensus_feature_data['mz']
+ consensus_intensity = consensus_feature_data['intensity']
+ consensus_quality = consensus_feature_data['quality']

  if not feature_data_list:
  # No retrievable feature metadata (possible stale map reference) -> skip
- continue # Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
+ continue
+
+ # Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
  rt_vals_local = [fd.get("rt") for fd in feature_data_list if fd.get("rt") is not None]
  mz_vals_local = [fd.get("mz") for fd in feature_data_list if fd.get("mz") is not None]
  if rt_vals_local:
  rt_min_local = min(rt_vals_local)
  rt_max_local = max(rt_vals_local)
  else:
- rt_min_local = rt_max_local = consensus_feature.getRT()
+ rt_min_local = rt_max_local = consensus_rt
  if mz_vals_local:
  mz_min_local = min(mz_vals_local)
  mz_max_local = max(mz_vals_local)
  else:
- mz_min_local = mz_max_local = consensus_feature.getMZ()
+ mz_min_local = mz_max_local = consensus_mz

  # Store chunk consensus with feature tracking
  chunk_consensus_data = {
  'consensus_id': consensus_id_counter,
  'chunk_idx': chunk_idx,
  'chunk_start_idx': chunk_start_idx,
- 'mz': consensus_feature.getMZ(),
- 'rt': consensus_feature.getRT(),
+ 'mz': consensus_mz,
+ 'rt': consensus_rt,
  'mz_min': mz_min_local,
  'mz_max': mz_max_local,
  'rt_min': rt_min_local,
  'rt_max': rt_max_local,
- 'intensity': consensus_feature.getIntensity(),
- 'quality': consensus_feature.getQuality(),
+ 'intensity': consensus_intensity,
+ 'quality': consensus_quality,
  'feature_uids': feature_uids,
  'feature_data_list': feature_data_list,
  'sample_uids': sample_uids,
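The `rt_min`/`rt_max` and `mz_min`/`mz_max` ranges stored above are what the cross-chunk stitching works from: chunk-level consensus records whose windows overlap within the tolerances get merged into one global consensus feature. A deliberately simplified greedy sketch of that idea (this is not masster's `_cluster_consensus_features`, which groups records by root):

    # Group chunk-level consensus records whose RT / m/z ranges overlap
    # within the given tolerances. Greedy first-match grouping for illustration.
    def ranges_overlap(a, b, rt_tol, mz_tol):
        rt_ok = a["rt_min"] - rt_tol <= b["rt_max"] and b["rt_min"] - rt_tol <= a["rt_max"]
        mz_ok = a["mz_min"] - mz_tol <= b["mz_max"] and b["mz_min"] - mz_tol <= a["mz_max"]
        return rt_ok and mz_ok

    def stitch(records, rt_tol=5.0, mz_tol=0.01):
        groups = []
        for rec in sorted(records, key=lambda r: r["mz_min"]):
            for group in groups:
                if any(ranges_overlap(rec, member, rt_tol, mz_tol) for member in group):
                    group.append(rec)
                    break
            else:
                groups.append([rec])
        return groups

    records = [
        {"rt_min": 100.0, "rt_max": 102.0, "mz_min": 200.10, "mz_max": 200.11},
        {"rt_min": 101.0, "rt_max": 103.0, "mz_min": 200.10, "mz_max": 200.12},
        {"rt_min": 400.0, "rt_max": 401.0, "mz_min": 350.20, "mz_max": 350.21},
    ]
    print([len(g) for g in stitch(records)])  # [2, 1]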
@@ -1411,9 +1967,6 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
  return list(groups_by_root.values())


- # Note: Restored proper chunked implementation with cross-chunk consensus clustering
-
-
  def _reset_consensus_data(self):
  """Reset consensus-related DataFrames at the start of merge."""
  self.consensus_df = pl.DataFrame()
masster/study/processing.py CHANGED
@@ -97,7 +97,6 @@ def align(self, **kwargs):
  _align_kd_algorithm(self, fmaps, params)
  else:
  self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
- self.logger.error(f"Unknown alignment algorithm '{algorithm}'")

  # check if rt_original exists in features_df, if not, add it after rt
  if "rt_original" not in self.features_df.columns:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: masster
- Version: 0.4.17
+ Version: 0.4.19
  Summary: Mass spectrometry data analysis package
  Project-URL: homepage, https://github.com/zamboni-lab/masster
  Project-URL: repository, https://github.com/zamboni-lab/masster
@@ -1,5 +1,5 @@
  masster/__init__.py,sha256=HHjKhCjkAc98LhoQfu4C6L-W2vfTEc1iXaPTxxcl_4A,800
- masster/_version.py,sha256=A-Vx5wjFdgUfquBN1kWTW90q7wTOwZx-uonA2Xl-IWc,257
+ masster/_version.py,sha256=Kro6JvBTMqNf6tOgI2r5d4TbaZIIR85ax7tdT3uQKL8,257
  masster/chromatogram.py,sha256=iYpdv8C17zVnlWvOFgAn9ns2uFGiF-GgoYf5QVVAbHs,19319
  masster/logger.py,sha256=W50V_uh8RSYwGxDrDFhOuj5jpu2tKJyt_16lMw9kQwA,14755
  masster/spectrum.py,sha256=_upC_g2N9gwTaflXAugs9pSXpKUmzbIehofDordk7WI,47718
@@ -43,10 +43,10 @@ masster/study/h5.py,sha256=LiVGUAtULyPpZIUmKVJSaV38huJb8FsKOUWBOqiv0QU,82363
  masster/study/helpers.py,sha256=M5_q8O5tuFchKPW04PTuj3X335lDA2VZqcs4D8ZQJEk,158604
  masster/study/id.py,sha256=6NUBBKZCFOU1wlDKM0eXQeOIStSZCRNJ_3x7ZaIHzmM,55263
  masster/study/load.py,sha256=CQQY_7BzagE3oQTdDlqNyfuMdVWIAft-M4a2WCFnxp0,70695
- masster/study/merge.py,sha256=-gc-255NTKxkJZcIRl1wqQsMMi0m8zoZ10BkGsINFDc,92012
+ masster/study/merge.py,sha256=Xk7Zt6x0p_myjWQXuzXbXSlwXPSujWjMPowaqnEEmWQ,118778
  masster/study/parameters.py,sha256=0elaF7YspTsB7qyajWAbRNL2VfKlGz5GJLifmO8IGkk,3276
  masster/study/plot.py,sha256=SimX-IlqISEItAnTBsx4xsdYHRAevfN41cCENVns1lw,88236
- masster/study/processing.py,sha256=pm98FrQHoM3ov6qmjKuVN9h2KBhGgCLEZCRS7zpmJFM,41104
+ masster/study/processing.py,sha256=u1MSRKTzcqHNz_dClSUSfgTxkNRdBLXtVyO5LXuW_uk,41031
  masster/study/save.py,sha256=YCvp4xhnG16sNXaT2mFDBoCrIMub0Es61B97qLo0maw,6705
  masster/study/study.py,sha256=LO_hbJOOCZzeA3uterPKImFgPG6fCNQKMSVMtEwW3DU,38815
  masster/study/study5_schema.json,sha256=c0w24QdHak01m04I1VPu97KvF2468FcaqROhf6pmLk4,7507
@@ -60,7 +60,7 @@ masster/study/defaults/find_ms2_def.py,sha256=RL0DFG41wQ05U8UQKUGr3vzSl3mU0m0knQ
  masster/study/defaults/identify_def.py,sha256=96rxoCAPQj_yX-3mRoD2LTkTLJgG27eJQqwarLv5jL0,10580
  masster/study/defaults/integrate_chrom_def.py,sha256=0MNIWGTjty-Zu-NTQsIweuj3UVqEY3x1x8pK0mPwYak,7264
  masster/study/defaults/integrate_def.py,sha256=Vf4SAzdBfnsSZ3IRaF0qZvWu3gMDPHdgPfMYoPKeWv8,7246
- masster/study/defaults/merge_def.py,sha256=Q31JwAaVGgVPEVIsiyeiOsF97c48IKe48HXuqh-sA_k,13189
+ masster/study/defaults/merge_def.py,sha256=K7sfwEGfgcWU85zorbWNFaxDhqRH52pxQoKv9Jn2qhY,15030
  masster/study/defaults/study_def.py,sha256=h8dYbi9xv0sesCSQik49Z53IkskMmNtW6ixl7it5pL0,16033
  masster/wizard/README.md,sha256=mL1A3YWJZOefpJ6D0-HqGLkVRmUlOpwyVFdvJBeeoZM,14149
  masster/wizard/__init__.py,sha256=A9GHQvkq4lSRIA8V6AKB-TJy8s_npH8i1baUGdkw_is,364
@@ -68,8 +68,8 @@ masster/wizard/example.py,sha256=xEZFTH9UZ8HKOm6s3JL8Js0Uw5ChnISWBHSZCL32vsM,798
  masster/wizard/test_structure.py,sha256=h88gsYYCG6iDRjqPZC_r1H1T8y79j0E-K6OrwuHaSCU,1586
  masster/wizard/test_wizard.py,sha256=CMp1cpjH3iYYC5Fy6puF_K0kfwwk3bgOsSbUGW-t7Xk,8986
  masster/wizard/wizard.py,sha256=jMLHy4cXgNEE_-vshFmA7BNEByhfA6tV7O91jhiMYuw,48054
- masster-0.4.17.dist-info/METADATA,sha256=uIdQNkAXQQzMkcVM53y_pUBZPzwqOx0lxGW8nmB1lz8,44207
- masster-0.4.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- masster-0.4.17.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
- masster-0.4.17.dist-info/licenses/LICENSE,sha256=bx5iLIKjgAdYQ7sISn7DsfHRKkoCUm1154sJJKhgqnU,35184
- masster-0.4.17.dist-info/RECORD,,
+ masster-0.4.19.dist-info/METADATA,sha256=fcnG14G4Fbp7mOCQ3aKL0qvkuexeUUjm79P1dDpT_Kg,44207
+ masster-0.4.19.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ masster-0.4.19.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
+ masster-0.4.19.dist-info/licenses/LICENSE,sha256=bx5iLIKjgAdYQ7sISn7DsfHRKkoCUm1154sJJKhgqnU,35184
+ masster-0.4.19.dist-info/RECORD,,