masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +3 -9
  4. masster/data/libs/README.md +1 -1
  5. masster/data/libs/ccm.csv +120 -120
  6. masster/data/libs/ccm.py +116 -62
  7. masster/data/libs/central_carbon_README.md +1 -1
  8. masster/data/libs/urine.py +161 -65
  9. masster/data/libs/urine_metabolites.csv +4693 -4693
  10. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
  11. masster/logger.py +43 -78
  12. masster/sample/__init__.py +1 -1
  13. masster/sample/adducts.py +264 -338
  14. masster/sample/defaults/find_adducts_def.py +8 -21
  15. masster/sample/defaults/find_features_def.py +1 -6
  16. masster/sample/defaults/get_spectrum_def.py +1 -5
  17. masster/sample/defaults/sample_def.py +1 -5
  18. masster/sample/h5.py +282 -561
  19. masster/sample/helpers.py +75 -131
  20. masster/sample/lib.py +17 -42
  21. masster/sample/load.py +17 -31
  22. masster/sample/parameters.py +2 -6
  23. masster/sample/plot.py +27 -88
  24. masster/sample/processing.py +87 -117
  25. masster/sample/quant.py +51 -57
  26. masster/sample/sample.py +90 -103
  27. masster/sample/sample5_schema.json +44 -44
  28. masster/sample/save.py +12 -35
  29. masster/sample/sciex.py +19 -66
  30. masster/spectrum.py +20 -58
  31. masster/study/__init__.py +1 -1
  32. masster/study/defaults/align_def.py +1 -5
  33. masster/study/defaults/fill_chrom_def.py +1 -5
  34. masster/study/defaults/fill_def.py +1 -5
  35. masster/study/defaults/integrate_chrom_def.py +1 -5
  36. masster/study/defaults/integrate_def.py +1 -5
  37. masster/study/defaults/study_def.py +25 -58
  38. masster/study/export.py +207 -233
  39. masster/study/h5.py +136 -470
  40. masster/study/helpers.py +202 -495
  41. masster/study/helpers_optimized.py +13 -40
  42. masster/study/id.py +110 -213
  43. masster/study/load.py +143 -230
  44. masster/study/plot.py +257 -518
  45. masster/study/processing.py +257 -469
  46. masster/study/save.py +5 -15
  47. masster/study/study.py +276 -379
  48. masster/study/study5_schema.json +96 -96
  49. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
  50. masster-0.4.1.dist-info/RECORD +67 -0
  51. masster-0.4.0.dist-info/RECORD +0 -67
  52. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
  53. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
  54. {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/sample/adducts.py CHANGED
@@ -14,24 +14,24 @@ Functions:
14
14
 
15
15
  import numpy as np
16
16
  import polars as pl
17
- from typing import List, Dict
17
+ from typing import List, Dict, Any
18
18
  from itertools import combinations
19
19
 
20
20
  # Import defaults class for external use
21
- from master.sample.defaults.find_adducts_def import find_adducts_defaults
21
+ from masster.sample.defaults.find_adducts_def import find_adducts_defaults
22
22
 
23
23
 
24
24
  def _get_adducts(self, adducts_list: list = None, **kwargs):
25
25
  """
26
26
  Generate comprehensive adduct specifications including multiply charged species and combinations.
27
-
27
+
28
28
  This method consolidates all adduct generation logic into a single optimized helper
29
29
  that produces a polars DataFrame with all possible adduct combinations, properly
30
30
  formatted names like [M+H]1+ or [M-H2O+2H]2+, and respecting charge constraints.
31
-
31
+
32
32
  Uses parameters from find_adducts_defaults() by default, which can be overridden
33
33
  by providing keyword arguments.
34
-
34
+
35
35
  Parameters
36
36
  ----------
37
37
  adducts_list : List[str], optional
@@ -42,7 +42,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
42
42
  - charge_min: Minimum charge to consider (default from find_adducts_defaults)
43
43
  - charge_max: Maximum charge to consider (default from find_adducts_defaults)
44
44
  - max_combinations: Maximum number of adduct components to combine (default 4)
45
-
45
+
46
46
  Returns
47
47
  -------
48
48
  pl.DataFrame
@@ -56,201 +56,177 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
56
56
  """
57
57
  # Get default parameters from find_adducts_defaults
58
58
  defaults = self.find_adducts_defaults()
59
-
59
+
60
60
  # Use provided parameters or defaults
61
61
  if adducts_list is None:
62
62
  adducts_list = defaults.get_openms_adducts()
63
-
64
- charge_min = kwargs.get("charge_min", defaults.charge_min)
65
- charge_max = kwargs.get("charge_max", defaults.charge_max)
66
- max_combinations = kwargs.get("max_combinations", 4)
67
-
63
+
64
+ charge_min = kwargs.get('charge_min', defaults.charge_min)
65
+ charge_max = kwargs.get('charge_max', defaults.charge_max)
66
+ max_combinations = kwargs.get('max_combinations', 4)
67
+
68
68
  # Parse base adduct specifications
69
69
  base_specs = []
70
-
70
+
71
71
  for adduct_str in adducts_list:
72
- if not isinstance(adduct_str, str) or ":" not in adduct_str:
72
+ if not isinstance(adduct_str, str) or ':' not in adduct_str:
73
73
  continue
74
-
74
+
75
75
  try:
76
- parts = adduct_str.split(":")
76
+ parts = adduct_str.split(':')
77
77
  if len(parts) != 3:
78
78
  continue
79
-
79
+
80
80
  formula_part = parts[0]
81
- charge = int(parts[1])
81
+ charge = int(parts[1])
82
82
  probability = float(parts[2])
83
-
83
+
84
84
  # Calculate mass shift from formula
85
85
  mass_shift = _calculate_formula_mass_shift(formula_part)
86
-
87
- base_specs.append(
88
- {
89
- "formula": formula_part,
90
- "charge": charge,
91
- "mass_shift": mass_shift,
92
- "probability": probability,
93
- "raw_string": adduct_str,
94
- },
95
- )
96
-
86
+
87
+ base_specs.append({
88
+ 'formula': formula_part,
89
+ 'charge': charge,
90
+ 'mass_shift': mass_shift,
91
+ 'probability': probability,
92
+ 'raw_string': adduct_str
93
+ })
94
+
97
95
  except (ValueError, IndexError):
98
96
  continue
99
-
97
+
100
98
  # Generate all valid combinations
101
99
  combinations_list = []
102
-
100
+
103
101
  # Separate specs by charge type
104
- positive_specs = [spec for spec in base_specs if spec["charge"] > 0]
105
- negative_specs = [spec for spec in base_specs if spec["charge"] < 0]
106
- neutral_specs = [spec for spec in base_specs if spec["charge"] == 0]
107
-
102
+ positive_specs = [spec for spec in base_specs if spec['charge'] > 0]
103
+ negative_specs = [spec for spec in base_specs if spec['charge'] < 0]
104
+ neutral_specs = [spec for spec in base_specs if spec['charge'] == 0]
105
+
108
106
  # 1. Single adducts
109
107
  for spec in base_specs:
110
- if charge_min <= spec["charge"] <= charge_max:
108
+ if charge_min <= spec['charge'] <= charge_max:
111
109
  formatted_name = _format_adduct_name([spec])
112
- combinations_list.append(
113
- {
114
- "components": [spec],
115
- "formatted_name": formatted_name,
116
- "total_mass_shift": spec["mass_shift"],
117
- "total_charge": spec["charge"],
118
- "combined_probability": spec["probability"],
119
- "complexity": 1,
120
- },
121
- )
122
-
110
+ combinations_list.append({
111
+ 'components': [spec],
112
+ 'formatted_name': formatted_name,
113
+ 'total_mass_shift': spec['mass_shift'],
114
+ 'total_charge': spec['charge'],
115
+ 'combined_probability': spec['probability'],
116
+ 'complexity': 1
117
+ })
118
+
123
119
  # 2. Generate multiply charged versions (2H+, 3H+, etc.)
124
120
  for spec in positive_specs + negative_specs:
125
- base_charge = spec["charge"]
121
+ base_charge = spec['charge']
126
122
  for multiplier in range(2, min(max_combinations + 1, 5)):
127
123
  total_charge = base_charge * multiplier
128
124
  if charge_min <= total_charge <= charge_max:
129
125
  components = [spec] * multiplier
130
126
  formatted_name = _format_adduct_name(components)
131
-
132
- combinations_list.append(
133
- {
134
- "components": components,
135
- "formatted_name": formatted_name,
136
- "total_mass_shift": spec["mass_shift"] * multiplier,
137
- "total_charge": total_charge,
138
- "combined_probability": spec["probability"] ** multiplier,
139
- "complexity": multiplier,
140
- },
141
- )
142
-
127
+
128
+ combinations_list.append({
129
+ 'components': components,
130
+ 'formatted_name': formatted_name,
131
+ 'total_mass_shift': spec['mass_shift'] * multiplier,
132
+ 'total_charge': total_charge,
133
+ 'combined_probability': spec['probability'] ** multiplier,
134
+ 'complexity': multiplier
135
+ })
136
+
143
137
  # 3. Mixed combinations (2-component)
144
138
  if max_combinations >= 2:
145
139
  # Positive + Neutral
146
140
  for pos_spec in positive_specs:
147
141
  for neut_spec in neutral_specs:
148
- total_charge = pos_spec["charge"] + neut_spec["charge"]
142
+ total_charge = pos_spec['charge'] + neut_spec['charge']
149
143
  if charge_min <= total_charge <= charge_max:
150
144
  components = [pos_spec, neut_spec]
151
145
  formatted_name = _format_adduct_name(components)
152
- combinations_list.append(
153
- {
154
- "components": components,
155
- "formatted_name": formatted_name,
156
- "total_mass_shift": pos_spec["mass_shift"]
157
- + neut_spec["mass_shift"],
158
- "total_charge": total_charge,
159
- "combined_probability": pos_spec["probability"]
160
- * neut_spec["probability"],
161
- "complexity": 2,
162
- },
163
- )
164
-
146
+ combinations_list.append({
147
+ 'components': components,
148
+ 'formatted_name': formatted_name,
149
+ 'total_mass_shift': pos_spec['mass_shift'] + neut_spec['mass_shift'],
150
+ 'total_charge': total_charge,
151
+ 'combined_probability': pos_spec['probability'] * neut_spec['probability'],
152
+ 'complexity': 2
153
+ })
154
+
165
155
  # Different charged species
166
156
  for combo in combinations(positive_specs, 2):
167
- if combo[0]["formula"] != combo[1]["formula"]:
168
- total_charge = combo[0]["charge"] + combo[1]["charge"]
157
+ if combo[0]['formula'] != combo[1]['formula']:
158
+ total_charge = combo[0]['charge'] + combo[1]['charge']
169
159
  if charge_min <= total_charge <= charge_max:
170
160
  components = list(combo)
171
161
  formatted_name = _format_adduct_name(components)
172
- combinations_list.append(
173
- {
174
- "components": components,
175
- "formatted_name": formatted_name,
176
- "total_mass_shift": combo[0]["mass_shift"]
177
- + combo[1]["mass_shift"],
178
- "total_charge": total_charge,
179
- "combined_probability": combo[0]["probability"]
180
- * combo[1]["probability"],
181
- "complexity": 2,
182
- },
183
- )
184
-
162
+ combinations_list.append({
163
+ 'components': components,
164
+ 'formatted_name': formatted_name,
165
+ 'total_mass_shift': combo[0]['mass_shift'] + combo[1]['mass_shift'],
166
+ 'total_charge': total_charge,
167
+ 'combined_probability': combo[0]['probability'] * combo[1]['probability'],
168
+ 'complexity': 2
169
+ })
170
+
185
171
  # 4. 3-component combinations (limited for performance)
186
172
  if max_combinations >= 3:
187
173
  for pos_spec in positive_specs[:2]:
188
174
  for neut_combo in combinations(neutral_specs[:2], 2):
189
175
  components = [pos_spec] + list(neut_combo)
190
- total_charge = sum(spec["charge"] for spec in components)
191
-
176
+ total_charge = sum(spec['charge'] for spec in components)
177
+
192
178
  if charge_min <= total_charge <= charge_max:
193
179
  formatted_name = _format_adduct_name(components)
194
- total_mass_shift = sum(spec["mass_shift"] for spec in components)
195
- combined_prob = np.prod(
196
- [spec["probability"] for spec in components],
197
- )
198
-
199
- combinations_list.append(
200
- {
201
- "components": components,
202
- "formatted_name": formatted_name,
203
- "total_mass_shift": total_mass_shift,
204
- "total_charge": total_charge,
205
- "combined_probability": combined_prob,
206
- "complexity": 3,
207
- },
208
- )
209
-
180
+ total_mass_shift = sum(spec['mass_shift'] for spec in components)
181
+ combined_prob = np.prod([spec['probability'] for spec in components])
182
+
183
+ combinations_list.append({
184
+ 'components': components,
185
+ 'formatted_name': formatted_name,
186
+ 'total_mass_shift': total_mass_shift,
187
+ 'total_charge': total_charge,
188
+ 'combined_probability': combined_prob,
189
+ 'complexity': 3
190
+ })
191
+
210
192
  # Convert to polars DataFrame
211
193
  if combinations_list:
212
- combinations_list.sort(
213
- key=lambda x: (-x["combined_probability"], x["complexity"]),
214
- )
215
-
216
- adducts_df = pl.DataFrame(
217
- [
218
- {
219
- "name": combo["formatted_name"],
220
- "charge": combo["total_charge"],
221
- "mass_shift": combo["total_mass_shift"],
222
- "probability": combo["combined_probability"],
223
- "complexity": combo["complexity"],
224
- "components": combo["components"],
225
- }
226
- for combo in combinations_list
227
- ],
228
- )
194
+ combinations_list.sort(key=lambda x: (-x['combined_probability'], x['complexity']))
195
+
196
+ adducts_df = pl.DataFrame([
197
+ {
198
+ 'name': combo['formatted_name'],
199
+ 'charge': combo['total_charge'],
200
+ 'mass_shift': combo['total_mass_shift'],
201
+ 'probability': combo['combined_probability'],
202
+ 'complexity': combo['complexity'],
203
+ 'components': combo['components']
204
+ }
205
+ for combo in combinations_list
206
+ ])
229
207
  else:
230
208
  # Return empty DataFrame with correct schema
231
- adducts_df = pl.DataFrame(
232
- {
233
- "name": [],
234
- "charge": [],
235
- "mass_shift": [],
236
- "probability": [],
237
- "complexity": [],
238
- "components": [],
239
- },
240
- )
241
-
209
+ adducts_df = pl.DataFrame({
210
+ 'name': [],
211
+ 'charge': [],
212
+ 'mass_shift': [],
213
+ 'probability': [],
214
+ 'complexity': [],
215
+ 'components': []
216
+ })
217
+
242
218
  return adducts_df
243
219
 
244
220
 
245
221
  def _calculate_formula_mass_shift(formula: str) -> float:
246
222
  """
247
223
  Calculate mass shift from formula string like "+H", "-H2O", "+Na-H", etc.
248
-
224
+
249
225
  Parameters
250
226
  ----------
251
227
  formula : str
252
228
  Formula string (e.g., "+H", "-H2O", "+Na-H")
253
-
229
+
254
230
  Returns
255
231
  -------
256
232
  float
@@ -258,59 +234,59 @@ def _calculate_formula_mass_shift(formula: str) -> float:
258
234
  """
259
235
  # Standard atomic masses
260
236
  atomic_masses = {
261
- "H": 1.007825,
262
- "C": 12.0,
263
- "N": 14.003074,
264
- "O": 15.994915,
265
- "Na": 22.989769,
266
- "K": 38.963707,
267
- "Li": 7.016003,
268
- "Ca": 39.962591,
269
- "Mg": 23.985042,
270
- "Fe": 55.934938,
271
- "Cl": 34.968853,
272
- "Br": 78.918336,
273
- "I": 126.904473,
274
- "P": 30.973762,
275
- "S": 31.972071,
237
+ 'H': 1.007825,
238
+ 'C': 12.0,
239
+ 'N': 14.003074,
240
+ 'O': 15.994915,
241
+ 'Na': 22.989769,
242
+ 'K': 38.963707,
243
+ 'Li': 7.016003,
244
+ 'Ca': 39.962591,
245
+ 'Mg': 23.985042,
246
+ 'Fe': 55.934938,
247
+ 'Cl': 34.968853,
248
+ 'Br': 78.918336,
249
+ 'I': 126.904473,
250
+ 'P': 30.973762,
251
+ 'S': 31.972071
276
252
  }
277
-
253
+
278
254
  total_mass = 0.0
279
-
255
+
280
256
  # Parse formula by splitting on + and - while preserving the operators
281
257
  parts = []
282
258
  current_part = ""
283
259
  current_sign = 1
284
-
260
+
285
261
  for char in formula:
286
- if char == "+":
262
+ if char == '+':
287
263
  if current_part:
288
264
  parts.append((current_sign, current_part))
289
265
  current_part = ""
290
266
  current_sign = 1
291
- elif char == "-":
267
+ elif char == '-':
292
268
  if current_part:
293
269
  parts.append((current_sign, current_part))
294
270
  current_part = ""
295
271
  current_sign = -1
296
272
  else:
297
273
  current_part += char
298
-
274
+
299
275
  if current_part:
300
276
  parts.append((current_sign, current_part))
301
-
277
+
302
278
  # Process each part
303
279
  for sign, part in parts:
304
280
  if not part:
305
281
  continue
306
-
282
+
307
283
  # Parse element and count (e.g., "H2O" -> H:2, O:1)
308
284
  elements = _parse_element_counts(part)
309
-
285
+
310
286
  for element, count in elements.items():
311
287
  if element in atomic_masses:
312
288
  total_mass += sign * atomic_masses[element] * count
313
-
289
+
314
290
  return total_mass
315
291
 
316
292
 
@@ -318,25 +294,25 @@ def _parse_element_counts(formula_part: str) -> Dict[str, int]:
318
294
  """Parse element counts from a formula part like 'H2O' -> {'H': 2, 'O': 1}"""
319
295
  elements = {}
320
296
  i = 0
321
-
297
+
322
298
  while i < len(formula_part):
323
299
  # Get element (uppercase letter, possibly followed by lowercase)
324
300
  element = formula_part[i]
325
301
  i += 1
326
-
302
+
327
303
  while i < len(formula_part) and formula_part[i].islower():
328
304
  element += formula_part[i]
329
305
  i += 1
330
-
306
+
331
307
  # Get count (digits following element)
332
308
  count_str = ""
333
309
  while i < len(formula_part) and formula_part[i].isdigit():
334
310
  count_str += formula_part[i]
335
311
  i += 1
336
-
312
+
337
313
  count = int(count_str) if count_str else 1
338
314
  elements[element] = elements.get(element, 0) + count
339
-
315
+
340
316
  return elements
341
317
 
342
318
 
@@ -344,56 +320,51 @@ def _format_adduct_name(components: List[Dict]) -> str:
344
320
  """Format adduct name from components like [M+H]1+ or [M+2H]2+ or [M+2(H+Na)]3+"""
345
321
  if not components:
346
322
  return "[M]"
347
-
323
+
348
324
  # Count occurrences of each formula
349
325
  from collections import Counter
350
-
351
- formula_counts = Counter(comp["formula"] for comp in components)
352
- total_charge = sum(comp["charge"] for comp in components)
353
-
326
+ formula_counts = Counter(comp['formula'] for comp in components)
327
+ total_charge = sum(comp['charge'] for comp in components)
328
+
354
329
  # Build formula part with proper multipliers
355
330
  formula_parts = []
356
- for formula, count in sorted(
357
- formula_counts.items(),
358
- ): # Sort for consistent ordering
331
+ for formula, count in sorted(formula_counts.items()): # Sort for consistent ordering
359
332
  if count == 1:
360
333
  formula_parts.append(formula)
361
334
  else:
362
335
  # For multiple occurrences, use count prefix (e.g., 2H, 3Na)
363
336
  # Handle special case where formula might already start with + or -
364
- if formula.startswith(("+", "-")):
337
+ if formula.startswith(('+', '-')):
365
338
  sign = formula[0]
366
339
  base_formula = formula[1:]
367
340
  formula_parts.append(f"{sign}{count}{base_formula}")
368
341
  else:
369
342
  formula_parts.append(f"{count}{formula}")
370
-
343
+
371
344
  # Combine formula parts
372
345
  formula = "".join(formula_parts)
373
-
346
+
374
347
  # Format charge
375
348
  if total_charge == 0:
376
349
  charge_str = ""
377
350
  elif abs(total_charge) == 1:
378
351
  charge_str = "1+" if total_charge > 0 else "1-"
379
352
  else:
380
- charge_str = (
381
- f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
382
- )
383
-
353
+ charge_str = f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
354
+
384
355
  return f"[M{formula}]{charge_str}"
385
356
 
386
357
 
387
358
  def find_adducts(self, **kwargs):
388
359
  """Detect adduct relationships among detected features using improved OpenMS-like algorithm.
389
360
 
390
- This method implements a corrected version of the OpenMS MetaboliteFeatureDeconvolution
361
+ This method implements a corrected version of the OpenMS MetaboliteFeatureDeconvolution
391
362
  algorithm that properly enforces RT constraints and avoids the mass tolerance dominance
392
363
  issues present in the original C++ implementation.
393
364
 
394
365
  Key improvements over OpenMS:
395
366
  - Early RT filtering prevents expensive mass calculations for temporally incompatible features
396
- - Strict mass tolerance (0.01 Da default) prevents inappropriate mass relationships
367
+ - Strict mass tolerance (0.01 Da default) prevents inappropriate mass relationships
397
368
  - RT constraints are properly enforced throughout the algorithm
398
369
  - Connected components analysis respects both mass AND RT constraints
399
370
  - Probability-based scoring for adduct assignment
@@ -410,9 +381,9 @@ def find_adducts(self, **kwargs):
410
381
  Side effects:
411
382
  Updates ``self.features_df`` with adduct information columns.
412
383
  """
413
- # Initialize parameters
384
+ # Initialize parameters
414
385
  params = find_adducts_defaults()
415
-
386
+
416
387
  for key, value in kwargs.items():
417
388
  if isinstance(value, find_adducts_defaults):
418
389
  params = value
@@ -422,28 +393,24 @@ def find_adducts(self, **kwargs):
422
393
  if params.set(key, value, validate=True):
423
394
  self.logger.debug(f"Updated parameter {key} = {value}")
424
395
  else:
425
- self.logger.warning(
426
- f"Failed to set parameter {key} = {value} (validation failed)",
427
- )
396
+ self.logger.warning(f"Failed to set parameter {key} = {value} (validation failed)")
428
397
  else:
429
398
  self.logger.warning(f"Unknown parameter {key} ignored")
430
399
 
431
400
  # Check if features_df exists and has data
432
- if not hasattr(self, "features_df") or len(self.features_df) == 0:
433
- self.logger.warning(
434
- "No features available for adduct detection. Run find_features() first.",
435
- )
401
+ if not hasattr(self, 'features_df') or len(self.features_df) == 0:
402
+ self.logger.warning("No features available for adduct detection. Run find_features() first.")
436
403
  return
437
404
 
438
405
  self.logger.info("Adduct detection...")
439
406
 
440
407
  # Validate required columns
441
- required_cols = ["mz", "rt"]
408
+ required_cols = ['mz', 'rt']
442
409
  missing_cols = [col for col in required_cols if col not in self.features_df.columns]
443
410
  if missing_cols:
444
411
  self.logger.error(f"Required columns missing from features_df: {missing_cols}")
445
412
  return
446
-
413
+
447
414
  # Check if we have any features to process
448
415
  if len(self.features_df) == 0:
449
416
  self.logger.warning("No features available for adduct detection")
@@ -457,151 +424,130 @@ def find_adducts(self, **kwargs):
457
424
 
458
425
  # Get parameters
459
426
  adducts_list = params.get_openms_adducts()
460
- charge_min = params.get("charge_min")
427
+ charge_min = params.get("charge_min")
461
428
  charge_max = params.get("charge_max")
462
429
  retention_max_diff = params.get("retention_max_diff")
463
430
  mass_max_diff = params.get("mass_max_diff")
464
431
  unit = params.get("unit")
465
432
  min_probability = params.get("min_probability")
466
433
 
467
- self.logger.debug(
468
- f"Processing {len(self.features_df)} features with {len(adducts_list)} base adducts",
469
- )
470
- self.logger.debug(
471
- f"RT tolerance: {retention_max_diff}s, Mass tolerance: {mass_max_diff} {unit}",
472
- )
434
+ self.logger.debug(f"Processing {len(self.features_df)} features with {len(adducts_list)} base adducts")
435
+ self.logger.debug(f"RT tolerance: {retention_max_diff}s, Mass tolerance: {mass_max_diff} {unit}")
473
436
  self.logger.debug(f"Min probability threshold: {min_probability}")
474
437
 
475
438
  # Generate comprehensive adduct specifications using the Sample method
476
439
  adducts_df = self._get_adducts(
477
440
  adducts_list=adducts_list,
478
- charge_min=charge_min,
441
+ charge_min=charge_min,
479
442
  charge_max=charge_max,
480
- max_combinations=4,
443
+ max_combinations=4
481
444
  )
482
-
445
+
483
446
  self.logger.debug(f"Generated {len(adducts_df)} total adduct combinations")
484
-
447
+
485
448
  # Filter adducts by minimum probability threshold
486
449
  if min_probability > 0.0:
487
450
  adducts_before_filter = len(adducts_df)
488
451
  adducts_df = adducts_df.filter(pl.col("probability") >= min_probability)
489
452
  adducts_after_filter = len(adducts_df)
490
453
  filtered_count = adducts_before_filter - adducts_after_filter
491
-
492
- self.logger.debug(
493
- f"Filtered {filtered_count} low-probability adducts (< {min_probability})",
494
- )
454
+
455
+ self.logger.debug(f"Filtered {filtered_count} low-probability adducts (< {min_probability})")
495
456
  self.logger.debug(f"Remaining adducts for analysis: {adducts_after_filter}")
496
-
457
+
497
458
  if len(adducts_df) == 0:
498
- self.logger.warning(
499
- f"No adducts remaining after probability filtering (min_probability={min_probability})",
500
- )
459
+ self.logger.warning(f"No adducts remaining after probability filtering (min_probability={min_probability})")
501
460
  return
502
-
461
+
503
462
  # Implement the adduct detection algorithm directly here
504
463
  import numpy as np
505
-
464
+
506
465
  # Get parameters
507
466
  charge_max = params.get("charge_max")
508
467
  retention_max_diff = params.get("retention_max_diff")
509
468
  mass_max_diff = params.get("mass_max_diff")
510
469
  unit = params.get("unit")
511
-
470
+
512
471
  # Sort features by RT for efficient RT-sweep processing (OpenMS approach)
513
472
  # Store original row positions before sorting for correct index mapping
514
473
  features_with_positions = self.features_df.with_row_index("original_position")
515
474
  features_sorted = features_with_positions.sort("rt")
516
475
  n_features = len(features_sorted)
517
-
476
+
518
477
  # Extract arrays for fast processing
519
478
  feature_mzs = features_sorted.select("mz").to_numpy().flatten()
520
479
  feature_rts = features_sorted.select("rt").to_numpy().flatten()
521
-
480
+
522
481
  # Convert adducts to arrays for vectorized operations
523
482
  adduct_mass_shifts = adducts_df.select("mass_shift").to_numpy().flatten()
524
483
  adduct_charges = adducts_df.select("charge").to_numpy().flatten()
525
484
  adduct_names = adducts_df.select("name").to_series().to_list()
526
485
  adduct_probs = adducts_df.select("probability").to_numpy().flatten()
527
486
 
528
- self.logger.debug(
529
- f"RT-sweep processing: {n_features} features × {len(adducts_df)} adduct combinations",
530
- )
531
-
487
+ self.logger.debug(f"RT-sweep processing: {n_features} features × {len(adducts_df)} adduct combinations")
488
+
532
489
  # Phase 1: RT-sweep line algorithm with early RT filtering (fixes OpenMS flaw #1)
533
490
  candidate_edges = []
534
-
491
+
535
492
  for i_rt in range(n_features):
536
493
  mz1 = feature_mzs[i_rt]
537
494
  rt1 = feature_rts[i_rt]
538
-
495
+
539
496
  # RT-window sweep: only check features within RT tolerance (early filtering)
540
497
  for j_rt in range(i_rt + 1, n_features):
541
498
  rt2 = feature_rts[j_rt]
542
499
  rt_diff = rt2 - rt1
543
-
500
+
544
501
  # Early RT constraint check (fixes OpenMS issue where RT was checked too late)
545
502
  if rt_diff > retention_max_diff:
546
503
  break # Features are RT-sorted, so no more valid pairs
547
-
504
+
548
505
  mz2 = feature_mzs[j_rt]
549
-
506
+
550
507
  # Phase 2: Check for valid mass relationships with strict tolerance (fixes OpenMS flaw #2)
551
508
  for adduct_idx, mass_shift in enumerate(adduct_mass_shifts):
552
509
  charge = adduct_charges[adduct_idx]
553
-
510
+
554
511
  # Calculate mass tolerance (per feature, as in OpenMS)
555
- if unit == "ppm":
512
+ if unit == 'ppm':
556
513
  tol1 = mass_max_diff * mz1 * 1e-6
557
514
  tol2 = mass_max_diff * mz2 * 1e-6
558
515
  combined_tolerance = tol1 + tol2
559
516
  else: # Da
560
- combined_tolerance = (
561
- 2 * mass_max_diff
562
- ) # Combined tolerance for both features
563
-
517
+ combined_tolerance = 2 * mass_max_diff # Combined tolerance for both features
518
+
564
519
  # Check both directions of mass relationship
565
520
  if charge != 0:
566
521
  # For charged adducts: m/z relationship
567
- mass_diff_12 = (mz2 * abs(charge)) - (mz1 * abs(charge))
522
+ mass_diff_12 = (mz2 * abs(charge)) - (mz1 * abs(charge))
568
523
  expected_mass_diff = mass_shift
569
-
524
+
570
525
  if abs(mass_diff_12 - expected_mass_diff) <= combined_tolerance:
571
526
  # Valid mass relationship found
572
- candidate_edges.append(
573
- {
574
- "i": i_rt,
575
- "j": j_rt,
576
- "rt_diff": rt_diff,
577
- "mass_error": abs(mass_diff_12 - expected_mass_diff),
578
- "adduct_idx": adduct_idx,
579
- "charge1": charge if mass_diff_12 > 0 else -charge,
580
- "charge2": -charge if mass_diff_12 > 0 else charge,
581
- "probability": adduct_probs[adduct_idx],
582
- },
583
- )
527
+ candidate_edges.append({
528
+ 'i': i_rt, 'j': j_rt,
529
+ 'rt_diff': rt_diff,
530
+ 'mass_error': abs(mass_diff_12 - expected_mass_diff),
531
+ 'adduct_idx': adduct_idx,
532
+ 'charge1': charge if mass_diff_12 > 0 else -charge,
533
+ 'charge2': -charge if mass_diff_12 > 0 else charge,
534
+ 'probability': adduct_probs[adduct_idx]
535
+ })
584
536
  else:
585
537
  # For neutral adducts: direct mass shift
586
538
  mass_diff_12 = mz2 - mz1
587
539
  if abs(mass_diff_12 - mass_shift) <= combined_tolerance:
588
- candidate_edges.append(
589
- {
590
- "i": i_rt,
591
- "j": j_rt,
592
- "rt_diff": rt_diff,
593
- "mass_error": abs(mass_diff_12 - mass_shift),
594
- "adduct_idx": adduct_idx,
595
- "charge1": 0,
596
- "charge2": 0,
597
- "probability": adduct_probs[adduct_idx],
598
- },
599
- )
600
-
601
- self.logger.debug(
602
- f"Found {len(candidate_edges)} candidate edges after RT+mass filtering",
603
- )
604
-
540
+ candidate_edges.append({
541
+ 'i': i_rt, 'j': j_rt,
542
+ 'rt_diff': rt_diff,
543
+ 'mass_error': abs(mass_diff_12 - mass_shift),
544
+ 'adduct_idx': adduct_idx,
545
+ 'charge1': 0, 'charge2': 0,
546
+ 'probability': adduct_probs[adduct_idx]
547
+ })
548
+
549
+ self.logger.debug(f"Found {len(candidate_edges)} candidate edges after RT+mass filtering")
550
+
605
551
  if len(candidate_edges) == 0:
606
552
  self.logger.info("No adduct relationships found")
607
553
  return
@@ -611,23 +557,23 @@ def find_adducts(self, **kwargs):
611
557
  adjacency = {}
612
558
  for i in range(n_features):
613
559
  adjacency[i] = []
614
-
560
+
615
561
  for edge in candidate_edges:
616
- i, j = edge["i"], edge["j"]
562
+ i, j = edge['i'], edge['j']
617
563
  adjacency[i].append(j)
618
564
  adjacency[j].append(i)
619
-
565
+
620
566
  # Find connected components using DFS
621
567
  visited = [False] * n_features
622
568
  components = []
623
-
569
+
624
570
  def dfs(node, component):
625
571
  visited[node] = True
626
572
  component.append(node)
627
573
  for neighbor in adjacency[node]:
628
574
  if not visited[neighbor]:
629
575
  dfs(neighbor, component)
630
-
576
+
631
577
  for i in range(n_features):
632
578
  if not visited[i] and len(adjacency[i]) > 0:
633
579
  component = []
@@ -643,60 +589,53 @@ def find_adducts(self, **kwargs):
643
589
  group_assignments = [0] * n_features
644
590
  mass_shift_assignments = [0.0] * n_features
645
591
  neutral_mass_assignments = [0.0] * n_features
646
-
592
+
647
593
  for group_id, component in enumerate(components, 1):
648
594
  # Find the most likely base ion (highest intensity or lowest m/z as proxy)
649
595
  component_mzs = [feature_mzs[idx] for idx in component]
650
596
  base_idx_in_component = np.argmin(component_mzs) # Lowest m/z as base
651
597
  base_feature_idx = component[base_idx_in_component]
652
598
  base_mz = feature_mzs[base_feature_idx]
653
-
599
+
654
600
  # Assign base ion
655
601
  base_adduct = "[M+H]1+" if charge_max > 0 else "[M-H]1-"
656
602
  base_charge = 1 if charge_max > 0 else -1
657
603
  base_mass_shift = 1.007825 if charge_max > 0 else -1.007825 # H mass
658
-
604
+
659
605
  adduct_assignments[base_feature_idx] = base_adduct
660
606
  adduct_charges_assigned[base_feature_idx] = base_charge
661
607
  group_assignments[base_feature_idx] = group_id
662
608
  mass_shift_assignments[base_feature_idx] = base_mass_shift
663
-
609
+
664
610
  # Calculate neutral mass for base ion
665
611
  base_mz_measured = feature_mzs[base_feature_idx]
666
- neutral_mass_assignments[base_feature_idx] = (
667
- base_mz_measured * abs(base_charge) - base_mass_shift
668
- )
669
-
612
+ neutral_mass_assignments[base_feature_idx] = base_mz_measured * abs(base_charge) - base_mass_shift
613
+
670
614
  # Assign other features based on their relationships to base
671
615
  for feature_idx in component:
672
616
  if feature_idx == base_feature_idx:
673
617
  continue
674
-
618
+
675
619
  group_assignments[feature_idx] = group_id
676
-
620
+
677
621
  # Find best adduct assignment based on mass difference and probability
678
622
  feature_mz = feature_mzs[feature_idx]
679
623
  best_score = -np.inf
680
624
  best_assignment = "[M+?]1+"
681
625
  best_charge = 1
682
626
  best_mass_shift = 1.007825 # Default to H mass shift for [M+?]1+
683
-
627
+
684
628
  # Check all possible adducts
685
- for adduct_idx, (mass_shift, charge, name, prob) in enumerate(
686
- zip(
687
- adduct_mass_shifts,
688
- adduct_charges,
689
- adduct_names,
690
- adduct_probs,
691
- ),
692
- ):
629
+ for adduct_idx, (mass_shift, charge, name, prob) in enumerate(zip(
630
+ adduct_mass_shifts, adduct_charges, adduct_names, adduct_probs)):
631
+
693
632
  if charge != 0:
694
633
  expected_mz = base_mz + mass_shift / abs(charge)
695
634
  else:
696
635
  expected_mz = base_mz + mass_shift
697
-
636
+
698
637
  mass_error = abs(expected_mz - feature_mz)
699
-
638
+
700
639
  # Combined score: probability + mass accuracy
701
640
  if mass_error < mass_max_diff * 2: # Within tolerance
702
641
  score = prob - mass_error * 0.1 # Weight mass accuracy
@@ -709,52 +648,48 @@ def find_adducts(self, **kwargs):
709
648
  adduct_assignments[feature_idx] = best_assignment
710
649
  adduct_charges_assigned[feature_idx] = best_charge
711
650
  mass_shift_assignments[feature_idx] = best_mass_shift
712
-
651
+
713
652
  # Calculate neutral mass
714
- neutral_mass_assignments[feature_idx] = (
715
- feature_mz * abs(best_charge) - best_mass_shift
716
- )
653
+ neutral_mass_assignments[feature_idx] = feature_mz * abs(best_charge) - best_mass_shift
717
654
 
718
655
  # Assign fallback adduct for features not processed in connected components (isolated features)
719
656
  for i in range(n_features):
720
657
  if adduct_assignments[i] is None:
721
658
  fallback_charge = 1 if charge_max > 0 else -1
722
659
  fallback_mass_shift = 1.007825 if charge_max > 0 else -1.007825 # Assume H
723
-
660
+
724
661
  adduct_assignments[i] = "[M+?]1+"
725
662
  adduct_charges_assigned[i] = fallback_charge
726
663
  group_assignments[i] = 0 # No group assignment for isolated features
727
664
  mass_shift_assignments[i] = fallback_mass_shift
728
-
665
+
729
666
  # Calculate neutral mass for isolated features
730
667
  feature_mz = feature_mzs[i]
731
- neutral_mass_assignments[i] = (
732
- feature_mz * abs(fallback_charge) - fallback_mass_shift
733
- )
668
+ neutral_mass_assignments[i] = feature_mz * abs(fallback_charge) - fallback_mass_shift
734
669
 
735
670
  # Map back to original feature order using stored positions
736
671
  original_indices = features_sorted.select("original_position").to_numpy().flatten()
737
-
672
+
738
673
  # Create final assignments in original order (same size as original DataFrame)
739
674
  final_adducts = [None] * len(self.features_df)
740
675
  final_charges = [0] * len(self.features_df)
741
676
  final_groups = [0] * len(self.features_df)
742
677
  final_mass_shifts = [0.0] * len(self.features_df)
743
678
  final_neutral_masses = [0.0] * len(self.features_df)
744
-
679
+
745
680
  for sorted_idx, orig_idx in enumerate(original_indices):
746
681
  final_adducts[orig_idx] = adduct_assignments[sorted_idx]
747
682
  final_charges[orig_idx] = adduct_charges_assigned[sorted_idx]
748
683
  final_groups[orig_idx] = group_assignments[sorted_idx]
749
684
  final_mass_shifts[orig_idx] = mass_shift_assignments[sorted_idx]
750
685
  final_neutral_masses[orig_idx] = neutral_mass_assignments[sorted_idx]
751
-
686
+
752
687
  # Update features DataFrame with correct column ordering
753
688
  # Insert adduct columns in the specified order after iso_of column
754
-
689
+
755
690
  # Get current columns
756
691
  current_columns = self.features_df.columns
757
-
692
+
758
693
  # Find the position of iso_of column
759
694
  try:
760
695
  iso_of_index = current_columns.index("iso_of")
@@ -763,51 +698,42 @@ def find_adducts(self, **kwargs):
763
698
  # If iso_of doesn't exist, append at the end
764
699
  insert_position = len(current_columns)
765
700
  self.logger.warning("iso_of column not found, adding adduct columns at the end")
766
-
701
+
767
702
  # Remove any existing adduct columns first
768
- adduct_column_names = [
769
- "adduct",
770
- "adduct_charge",
771
- "adduct_mass_shift",
772
- "adduct_mass_neutral",
773
- "adduct_group",
774
- ]
775
- df_without_adducts = self.features_df.select(
776
- [col for col in current_columns if col not in adduct_column_names],
777
- )
778
-
703
+ adduct_column_names = ["adduct", "adduct_charge", "adduct_mass_shift", "adduct_mass_neutral", "adduct_group"]
704
+ df_without_adducts = self.features_df.select([col for col in current_columns if col not in adduct_column_names])
705
+
779
706
  # Split columns at insertion point
780
707
  columns_before = df_without_adducts.columns[:insert_position]
781
708
  columns_after = df_without_adducts.columns[insert_position:]
782
-
709
+
783
710
  # Create the new column order with adduct columns in the correct position
784
- new_column_order = list(columns_before) + adduct_column_names + list(columns_after)
785
-
711
+ new_column_order = (
712
+ list(columns_before) +
713
+ adduct_column_names +
714
+ list(columns_after)
715
+ )
716
+
786
717
  # Add adduct columns to the dataframe
787
- self.features_df = df_without_adducts.with_columns(
788
- [
789
- pl.Series("adduct", final_adducts),
790
- pl.Series("adduct_charge", final_charges),
791
- pl.Series("adduct_mass_shift", final_mass_shifts),
792
- pl.Series("adduct_mass_neutral", final_neutral_masses),
793
- pl.Series("adduct_group", final_groups),
794
- ],
795
- ).select(new_column_order)
718
+ self.features_df = df_without_adducts.with_columns([
719
+ pl.Series("adduct", final_adducts),
720
+ pl.Series("adduct_charge", final_charges),
721
+ pl.Series("adduct_mass_shift", final_mass_shifts),
722
+ pl.Series("adduct_mass_neutral", final_neutral_masses),
723
+ pl.Series("adduct_group", final_groups)
724
+ ]).select(new_column_order)
796
725
 
797
726
  # Summary statistics
798
727
  total_with_adducts = sum(1 for x in final_adducts if x is not None)
799
728
  total_groups = max(final_groups) if final_groups else 0
800
-
801
- self.logger.info(
802
- f"Adduct detection completed: {total_with_adducts} features with adducts in {total_groups} groups",
803
- )
729
+
730
+ self.logger.info(f"Adduct detection completed: {total_with_adducts} features with adducts in {total_groups} groups")
804
731
 
805
732
  # Store parameters including the actual processed adducts list
806
733
  history_params = params.to_dict()
807
734
  # Convert the filtered adducts dataframe to a list of adduct specifications for history
808
- history_params["adducts"] = adducts_df.select(
809
- ["name", "charge", "mass_shift", "probability"],
810
- ).to_dicts()
811
-
735
+ history_params['adducts'] = adducts_df.select(['name', 'charge', 'mass_shift', 'probability']).to_dicts()
736
+
812
737
  self.store_history(["find_adducts"], history_params)
813
738
  self.logger.debug("Parameters stored successfully")
739
+