masster 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff shows the content changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.


This version of masster might be problematic.

Files changed (54)
  1. masster/__init__.py +8 -8
  2. masster/_version.py +1 -1
  3. masster/chromatogram.py +1 -1
  4. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  5. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  6. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  7. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  8. masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  9. masster/data/libs/__pycache__/ccm.cpython-312.pyc +0 -0
  10. masster/data/libs/__pycache__/urine.cpython-312.pyc +0 -0
  11. masster/data/libs/ccm.csv +120 -0
  12. masster/data/libs/urine.csv +4693 -0
  13. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  14. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  15. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  16. masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  17. masster/logger.py +11 -11
  18. masster/sample/__init__.py +1 -1
  19. masster/sample/adducts.py +338 -264
  20. masster/sample/defaults/find_adducts_def.py +21 -8
  21. masster/sample/h5.py +561 -282
  22. masster/sample/helpers.py +131 -75
  23. masster/sample/lib.py +4 -4
  24. masster/sample/load.py +31 -17
  25. masster/sample/parameters.py +1 -1
  26. masster/sample/plot.py +7 -7
  27. masster/sample/processing.py +117 -87
  28. masster/sample/sample.py +103 -90
  29. masster/sample/sample5_schema.json +196 -0
  30. masster/sample/save.py +35 -12
  31. masster/spectrum.py +1 -1
  32. masster/study/__init__.py +1 -1
  33. masster/study/defaults/align_def.py +5 -1
  34. masster/study/defaults/identify_def.py +3 -1
  35. masster/study/defaults/study_def.py +58 -25
  36. masster/study/export.py +360 -210
  37. masster/study/h5.py +560 -158
  38. masster/study/helpers.py +496 -203
  39. masster/study/helpers_optimized.py +1 -1
  40. masster/study/id.py +538 -349
  41. masster/study/load.py +233 -143
  42. masster/study/plot.py +71 -71
  43. masster/study/processing.py +456 -254
  44. masster/study/save.py +15 -5
  45. masster/study/study.py +213 -131
  46. masster/study/study5_schema.json +360 -0
  47. masster-0.4.5.dist-info/METADATA +131 -0
  48. masster-0.4.5.dist-info/RECORD +71 -0
  49. masster-0.4.3.dist-info/METADATA +0 -791
  50. masster-0.4.3.dist-info/RECORD +0 -56
  51. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
  52. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
  53. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
  54. {masster-0.4.3.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
masster/sample/adducts.py CHANGED
@@ -14,24 +14,24 @@ Functions:
 
  import numpy as np
  import polars as pl
- from typing import List, Dict, Any
+ from typing import List, Dict
  from itertools import combinations
 
  # Import defaults class for external use
- from masster.sample.defaults.find_adducts_def import find_adducts_defaults
+ from master.sample.defaults.find_adducts_def import find_adducts_defaults
 
 
  def _get_adducts(self, adducts_list: list = None, **kwargs):
  """
  Generate comprehensive adduct specifications including multiply charged species and combinations.
-
+
  This method consolidates all adduct generation logic into a single optimized helper
  that produces a polars DataFrame with all possible adduct combinations, properly
  formatted names like [M+H]1+ or [M-H2O+2H]2+, and respecting charge constraints.
-
+
  Uses parameters from find_adducts_defaults() by default, which can be overridden
  by providing keyword arguments.
-
+
  Parameters
  ----------
  adducts_list : List[str], optional
@@ -42,7 +42,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
  - charge_min: Minimum charge to consider (default from find_adducts_defaults)
  - charge_max: Maximum charge to consider (default from find_adducts_defaults)
  - max_combinations: Maximum number of adduct components to combine (default 4)
-
+
  Returns
  -------
  pl.DataFrame
@@ -56,177 +56,201 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
  """
  # Get default parameters from find_adducts_defaults
  defaults = self.find_adducts_defaults()
-
+
  # Use provided parameters or defaults
  if adducts_list is None:
  adducts_list = defaults.get_openms_adducts()
-
- charge_min = kwargs.get('charge_min', defaults.charge_min)
- charge_max = kwargs.get('charge_max', defaults.charge_max)
- max_combinations = kwargs.get('max_combinations', 4)
-
+
+ charge_min = kwargs.get("charge_min", defaults.charge_min)
+ charge_max = kwargs.get("charge_max", defaults.charge_max)
+ max_combinations = kwargs.get("max_combinations", 4)
+
  # Parse base adduct specifications
  base_specs = []
-
+
  for adduct_str in adducts_list:
- if not isinstance(adduct_str, str) or ':' not in adduct_str:
+ if not isinstance(adduct_str, str) or ":" not in adduct_str:
  continue
-
+
  try:
- parts = adduct_str.split(':')
+ parts = adduct_str.split(":")
  if len(parts) != 3:
  continue
-
+
  formula_part = parts[0]
- charge = int(parts[1])
+ charge = int(parts[1])
  probability = float(parts[2])
-
+
  # Calculate mass shift from formula
  mass_shift = _calculate_formula_mass_shift(formula_part)
-
- base_specs.append({
- 'formula': formula_part,
- 'charge': charge,
- 'mass_shift': mass_shift,
- 'probability': probability,
- 'raw_string': adduct_str
- })
-
+
+ base_specs.append(
+ {
+ "formula": formula_part,
+ "charge": charge,
+ "mass_shift": mass_shift,
+ "probability": probability,
+ "raw_string": adduct_str,
+ },
+ )
+
  except (ValueError, IndexError):
  continue
-
+
  # Generate all valid combinations
  combinations_list = []
-
+
  # Separate specs by charge type
- positive_specs = [spec for spec in base_specs if spec['charge'] > 0]
- negative_specs = [spec for spec in base_specs if spec['charge'] < 0]
- neutral_specs = [spec for spec in base_specs if spec['charge'] == 0]
-
+ positive_specs = [spec for spec in base_specs if spec["charge"] > 0]
+ negative_specs = [spec for spec in base_specs if spec["charge"] < 0]
+ neutral_specs = [spec for spec in base_specs if spec["charge"] == 0]
+
  # 1. Single adducts
  for spec in base_specs:
- if charge_min <= spec['charge'] <= charge_max:
+ if charge_min <= spec["charge"] <= charge_max:
  formatted_name = _format_adduct_name([spec])
- combinations_list.append({
- 'components': [spec],
- 'formatted_name': formatted_name,
- 'total_mass_shift': spec['mass_shift'],
- 'total_charge': spec['charge'],
- 'combined_probability': spec['probability'],
- 'complexity': 1
- })
-
+ combinations_list.append(
+ {
+ "components": [spec],
+ "formatted_name": formatted_name,
+ "total_mass_shift": spec["mass_shift"],
+ "total_charge": spec["charge"],
+ "combined_probability": spec["probability"],
+ "complexity": 1,
+ },
+ )
+
  # 2. Generate multiply charged versions (2H+, 3H+, etc.)
  for spec in positive_specs + negative_specs:
- base_charge = spec['charge']
+ base_charge = spec["charge"]
  for multiplier in range(2, min(max_combinations + 1, 5)):
  total_charge = base_charge * multiplier
  if charge_min <= total_charge <= charge_max:
  components = [spec] * multiplier
  formatted_name = _format_adduct_name(components)
-
- combinations_list.append({
- 'components': components,
- 'formatted_name': formatted_name,
- 'total_mass_shift': spec['mass_shift'] * multiplier,
- 'total_charge': total_charge,
- 'combined_probability': spec['probability'] ** multiplier,
- 'complexity': multiplier
- })
-
+
+ combinations_list.append(
+ {
+ "components": components,
+ "formatted_name": formatted_name,
+ "total_mass_shift": spec["mass_shift"] * multiplier,
+ "total_charge": total_charge,
+ "combined_probability": spec["probability"] ** multiplier,
+ "complexity": multiplier,
+ },
+ )
+
  # 3. Mixed combinations (2-component)
  if max_combinations >= 2:
  # Positive + Neutral
  for pos_spec in positive_specs:
  for neut_spec in neutral_specs:
- total_charge = pos_spec['charge'] + neut_spec['charge']
+ total_charge = pos_spec["charge"] + neut_spec["charge"]
  if charge_min <= total_charge <= charge_max:
  components = [pos_spec, neut_spec]
  formatted_name = _format_adduct_name(components)
- combinations_list.append({
- 'components': components,
- 'formatted_name': formatted_name,
- 'total_mass_shift': pos_spec['mass_shift'] + neut_spec['mass_shift'],
- 'total_charge': total_charge,
- 'combined_probability': pos_spec['probability'] * neut_spec['probability'],
- 'complexity': 2
- })
-
+ combinations_list.append(
+ {
+ "components": components,
+ "formatted_name": formatted_name,
+ "total_mass_shift": pos_spec["mass_shift"]
+ + neut_spec["mass_shift"],
+ "total_charge": total_charge,
+ "combined_probability": pos_spec["probability"]
+ * neut_spec["probability"],
+ "complexity": 2,
+ },
+ )
+
  # Different charged species
  for combo in combinations(positive_specs, 2):
- if combo[0]['formula'] != combo[1]['formula']:
- total_charge = combo[0]['charge'] + combo[1]['charge']
+ if combo[0]["formula"] != combo[1]["formula"]:
+ total_charge = combo[0]["charge"] + combo[1]["charge"]
  if charge_min <= total_charge <= charge_max:
  components = list(combo)
  formatted_name = _format_adduct_name(components)
- combinations_list.append({
- 'components': components,
- 'formatted_name': formatted_name,
- 'total_mass_shift': combo[0]['mass_shift'] + combo[1]['mass_shift'],
- 'total_charge': total_charge,
- 'combined_probability': combo[0]['probability'] * combo[1]['probability'],
- 'complexity': 2
- })
-
+ combinations_list.append(
+ {
+ "components": components,
+ "formatted_name": formatted_name,
+ "total_mass_shift": combo[0]["mass_shift"]
+ + combo[1]["mass_shift"],
+ "total_charge": total_charge,
+ "combined_probability": combo[0]["probability"]
+ * combo[1]["probability"],
+ "complexity": 2,
+ },
+ )
+
  # 4. 3-component combinations (limited for performance)
  if max_combinations >= 3:
  for pos_spec in positive_specs[:2]:
  for neut_combo in combinations(neutral_specs[:2], 2):
  components = [pos_spec] + list(neut_combo)
- total_charge = sum(spec['charge'] for spec in components)
-
+ total_charge = sum(spec["charge"] for spec in components)
+
  if charge_min <= total_charge <= charge_max:
  formatted_name = _format_adduct_name(components)
- total_mass_shift = sum(spec['mass_shift'] for spec in components)
- combined_prob = np.prod([spec['probability'] for spec in components])
-
- combinations_list.append({
- 'components': components,
- 'formatted_name': formatted_name,
- 'total_mass_shift': total_mass_shift,
- 'total_charge': total_charge,
- 'combined_probability': combined_prob,
- 'complexity': 3
- })
-
+ total_mass_shift = sum(spec["mass_shift"] for spec in components)
+ combined_prob = np.prod(
+ [spec["probability"] for spec in components],
+ )
+
+ combinations_list.append(
+ {
+ "components": components,
+ "formatted_name": formatted_name,
+ "total_mass_shift": total_mass_shift,
+ "total_charge": total_charge,
+ "combined_probability": combined_prob,
+ "complexity": 3,
+ },
+ )
+
  # Convert to polars DataFrame
  if combinations_list:
- combinations_list.sort(key=lambda x: (-x['combined_probability'], x['complexity']))
-
- adducts_df = pl.DataFrame([
- {
- 'name': combo['formatted_name'],
- 'charge': combo['total_charge'],
- 'mass_shift': combo['total_mass_shift'],
- 'probability': combo['combined_probability'],
- 'complexity': combo['complexity'],
- 'components': combo['components']
- }
- for combo in combinations_list
- ])
+ combinations_list.sort(
+ key=lambda x: (-x["combined_probability"], x["complexity"]),
+ )
+
+ adducts_df = pl.DataFrame(
+ [
+ {
+ "name": combo["formatted_name"],
+ "charge": combo["total_charge"],
+ "mass_shift": combo["total_mass_shift"],
+ "probability": combo["combined_probability"],
+ "complexity": combo["complexity"],
+ "components": combo["components"],
+ }
+ for combo in combinations_list
+ ],
+ )
  else:
  # Return empty DataFrame with correct schema
- adducts_df = pl.DataFrame({
- 'name': [],
- 'charge': [],
- 'mass_shift': [],
- 'probability': [],
- 'complexity': [],
- 'components': []
- })
-
+ adducts_df = pl.DataFrame(
+ {
+ "name": [],
+ "charge": [],
+ "mass_shift": [],
+ "probability": [],
+ "complexity": [],
+ "components": [],
+ },
+ )
+
  return adducts_df
 
 
  def _calculate_formula_mass_shift(formula: str) -> float:
  """
  Calculate mass shift from formula string like "+H", "-H2O", "+Na-H", etc.
-
+
  Parameters
  ----------
  formula : str
  Formula string (e.g., "+H", "-H2O", "+Na-H")
-
+
  Returns
  -------
  float
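The _get_adducts helper above consumes OpenMS-style adduct strings of the form "formula:charge:probability", splitting on ":" and silently skipping anything malformed. A minimal standalone sketch of that parsing convention (the example strings below are made up for illustration and are not masster defaults):

from typing import Dict, Optional

def parse_adduct_spec(adduct_str: str) -> Optional[Dict]:
    # Mirrors the parsing shown above: "formula:charge:probability"
    if not isinstance(adduct_str, str) or ":" not in adduct_str:
        return None
    parts = adduct_str.split(":")
    if len(parts) != 3:
        return None
    try:
        return {
            "formula": parts[0],
            "charge": int(parts[1]),
            "probability": float(parts[2]),
        }
    except ValueError:
        return None

for spec in ["H:1:0.4", "Na:1:0.1", "-H2O:0:0.2", "bad-entry"]:
    print(spec, "->", parse_adduct_spec(spec))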
@@ -234,59 +258,59 @@ def _calculate_formula_mass_shift(formula: str) -> float:
  """
  # Standard atomic masses
  atomic_masses = {
- 'H': 1.007825,
- 'C': 12.0,
- 'N': 14.003074,
- 'O': 15.994915,
- 'Na': 22.989769,
- 'K': 38.963707,
- 'Li': 7.016003,
- 'Ca': 39.962591,
- 'Mg': 23.985042,
- 'Fe': 55.934938,
- 'Cl': 34.968853,
- 'Br': 78.918336,
- 'I': 126.904473,
- 'P': 30.973762,
- 'S': 31.972071
+ "H": 1.007825,
+ "C": 12.0,
+ "N": 14.003074,
+ "O": 15.994915,
+ "Na": 22.989769,
+ "K": 38.963707,
+ "Li": 7.016003,
+ "Ca": 39.962591,
+ "Mg": 23.985042,
+ "Fe": 55.934938,
+ "Cl": 34.968853,
+ "Br": 78.918336,
+ "I": 126.904473,
+ "P": 30.973762,
+ "S": 31.972071,
  }
-
+
  total_mass = 0.0
-
+
  # Parse formula by splitting on + and - while preserving the operators
  parts = []
  current_part = ""
  current_sign = 1
-
+
  for char in formula:
- if char == '+':
+ if char == "+":
  if current_part:
  parts.append((current_sign, current_part))
  current_part = ""
  current_sign = 1
- elif char == '-':
+ elif char == "-":
  if current_part:
  parts.append((current_sign, current_part))
  current_part = ""
  current_sign = -1
  else:
  current_part += char
-
+
  if current_part:
  parts.append((current_sign, current_part))
-
+
  # Process each part
  for sign, part in parts:
  if not part:
  continue
-
+
  # Parse element and count (e.g., "H2O" -> H:2, O:1)
  elements = _parse_element_counts(part)
-
+
  for element, count in elements.items():
  if element in atomic_masses:
  total_mass += sign * atomic_masses[element] * count
-
+
  return total_mass
 
 
@@ -294,25 +318,25 @@ def _parse_element_counts(formula_part: str) -> Dict[str, int]:
  """Parse element counts from a formula part like 'H2O' -> {'H': 2, 'O': 1}"""
  elements = {}
  i = 0
-
+
  while i < len(formula_part):
  # Get element (uppercase letter, possibly followed by lowercase)
  element = formula_part[i]
  i += 1
-
+
  while i < len(formula_part) and formula_part[i].islower():
  element += formula_part[i]
  i += 1
-
+
  # Get count (digits following element)
  count_str = ""
  while i < len(formula_part) and formula_part[i].isdigit():
  count_str += formula_part[i]
  i += 1
-
+
  count = int(count_str) if count_str else 1
  elements[element] = elements.get(element, 0) + count
-
+
  return elements
 
 
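Taken together, _calculate_formula_mass_shift and _parse_element_counts above turn a signed formula such as "-H2O" or "+Na-H" into a mass shift by splitting on the +/- operators, counting elements, and summing monoisotopic masses. A compact, regex-based sketch of the same idea (only a subset of the atomic-mass table above is included, and an unsigned leading component is treated as positive, as in the code above):

import re

MASSES = {"H": 1.007825, "O": 15.994915, "Na": 22.989769}  # subset of the table above

def mass_shift(formula: str) -> float:
    if formula and formula[0] not in "+-":
        formula = "+" + formula  # default sign is positive, as above
    total = 0.0
    for sign, group in re.findall(r"([+-])([A-Za-z0-9]+)", formula):
        factor = 1 if sign == "+" else -1
        for element, count in re.findall(r"([A-Z][a-z]?)(\d*)", group):
            # unknown elements contribute nothing, mirroring the `in atomic_masses` check
            total += factor * MASSES.get(element, 0.0) * (int(count) if count else 1)
    return total

print(round(mass_shift("-H2O"), 6))   # -18.010565
print(round(mass_shift("+Na-H"), 6))  # 21.981944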
@@ -320,51 +344,56 @@ def _format_adduct_name(components: List[Dict]) -> str:
  """Format adduct name from components like [M+H]1+ or [M+2H]2+ or [M+2(H+Na)]3+"""
  if not components:
  return "[M]"
-
+
  # Count occurrences of each formula
  from collections import Counter
- formula_counts = Counter(comp['formula'] for comp in components)
- total_charge = sum(comp['charge'] for comp in components)
-
+
+ formula_counts = Counter(comp["formula"] for comp in components)
+ total_charge = sum(comp["charge"] for comp in components)
+
  # Build formula part with proper multipliers
  formula_parts = []
- for formula, count in sorted(formula_counts.items()): # Sort for consistent ordering
+ for formula, count in sorted(
+ formula_counts.items(),
+ ): # Sort for consistent ordering
  if count == 1:
  formula_parts.append(formula)
  else:
  # For multiple occurrences, use count prefix (e.g., 2H, 3Na)
  # Handle special case where formula might already start with + or -
- if formula.startswith(('+', '-')):
+ if formula.startswith(("+", "-")):
  sign = formula[0]
  base_formula = formula[1:]
  formula_parts.append(f"{sign}{count}{base_formula}")
  else:
  formula_parts.append(f"{count}{formula}")
-
+
  # Combine formula parts
  formula = "".join(formula_parts)
-
+
  # Format charge
  if total_charge == 0:
  charge_str = ""
  elif abs(total_charge) == 1:
  charge_str = "1+" if total_charge > 0 else "1-"
  else:
- charge_str = f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
-
+ charge_str = (
+ f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
+ )
+
  return f"[M{formula}]{charge_str}"
 
 
  def find_adducts(self, **kwargs):
  """Detect adduct relationships among detected features using improved OpenMS-like algorithm.
 
- This method implements a corrected version of the OpenMS MetaboliteFeatureDeconvolution
+ This method implements a corrected version of the OpenMS MetaboliteFeatureDeconvolution
  algorithm that properly enforces RT constraints and avoids the mass tolerance dominance
  issues present in the original C++ implementation.
 
  Key improvements over OpenMS:
  - Early RT filtering prevents expensive mass calculations for temporally incompatible features
- - Strict mass tolerance (0.01 Da default) prevents inappropriate mass relationships
+ - Strict mass tolerance (0.01 Da default) prevents inappropriate mass relationships
  - RT constraints are properly enforced throughout the algorithm
  - Connected components analysis respects both mass AND RT constraints
  - Probability-based scoring for adduct assignment
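The _format_adduct_name helper shown earlier in this hunk turns a list of component dicts into names such as [M+H]1+ or [M+2H]2+. A small sketch of that naming convention, using hand-made component dicts (the real masster component records carry more fields than shown here):

from collections import Counter

def format_name(components):
    counts = Counter(c["formula"] for c in components)
    charge = sum(c["charge"] for c in components)
    body = ""
    for formula, n in sorted(counts.items()):  # sorted for a stable ordering
        sign, base = (formula[0], formula[1:]) if formula[0] in "+-" else ("+", formula)
        body += f"{sign}{base}" if n == 1 else f"{sign}{n}{base}"
    if charge == 0:
        tail = ""
    else:
        tail = f"{abs(charge)}+" if charge > 0 else f"{abs(charge)}-"
    return f"[M{body}]{tail}"

proton = {"formula": "+H", "charge": 1}
water_loss = {"formula": "-H2O", "charge": 0}
print(format_name([proton]))              # [M+H]1+
print(format_name([proton, proton]))      # [M+2H]2+
print(format_name([proton, water_loss]))  # [M+H-H2O]1+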
@@ -381,9 +410,9 @@ def find_adducts(self, **kwargs):
  Side effects:
  Updates ``self.features_df`` with adduct information columns.
  """
- # Initialize parameters
+ # Initialize parameters
  params = find_adducts_defaults()
-
+
  for key, value in kwargs.items():
  if isinstance(value, find_adducts_defaults):
  params = value
@@ -393,24 +422,28 @@ def find_adducts(self, **kwargs):
  if params.set(key, value, validate=True):
  self.logger.debug(f"Updated parameter {key} = {value}")
  else:
- self.logger.warning(f"Failed to set parameter {key} = {value} (validation failed)")
+ self.logger.warning(
+ f"Failed to set parameter {key} = {value} (validation failed)",
+ )
  else:
  self.logger.warning(f"Unknown parameter {key} ignored")
 
  # Check if features_df exists and has data
- if not hasattr(self, 'features_df') or len(self.features_df) == 0:
- self.logger.warning("No features available for adduct detection. Run find_features() first.")
+ if not hasattr(self, "features_df") or len(self.features_df) == 0:
+ self.logger.warning(
+ "No features available for adduct detection. Run find_features() first.",
+ )
  return
 
  self.logger.info("Adduct detection...")
 
  # Validate required columns
- required_cols = ['mz', 'rt']
+ required_cols = ["mz", "rt"]
  missing_cols = [col for col in required_cols if col not in self.features_df.columns]
  if missing_cols:
  self.logger.error(f"Required columns missing from features_df: {missing_cols}")
  return
-
+
  # Check if we have any features to process
  if len(self.features_df) == 0:
  self.logger.warning("No features available for adduct detection")
@@ -424,130 +457,151 @@ def find_adducts(self, **kwargs):
 
  # Get parameters
  adducts_list = params.get_openms_adducts()
- charge_min = params.get("charge_min")
+ charge_min = params.get("charge_min")
  charge_max = params.get("charge_max")
  retention_max_diff = params.get("retention_max_diff")
  mass_max_diff = params.get("mass_max_diff")
  unit = params.get("unit")
  min_probability = params.get("min_probability")
 
- self.logger.debug(f"Processing {len(self.features_df)} features with {len(adducts_list)} base adducts")
- self.logger.debug(f"RT tolerance: {retention_max_diff}s, Mass tolerance: {mass_max_diff} {unit}")
+ self.logger.debug(
+ f"Processing {len(self.features_df)} features with {len(adducts_list)} base adducts",
+ )
+ self.logger.debug(
+ f"RT tolerance: {retention_max_diff}s, Mass tolerance: {mass_max_diff} {unit}",
+ )
  self.logger.debug(f"Min probability threshold: {min_probability}")
 
  # Generate comprehensive adduct specifications using the Sample method
  adducts_df = self._get_adducts(
  adducts_list=adducts_list,
- charge_min=charge_min,
+ charge_min=charge_min,
  charge_max=charge_max,
- max_combinations=4
+ max_combinations=4,
  )
-
+
  self.logger.debug(f"Generated {len(adducts_df)} total adduct combinations")
-
+
  # Filter adducts by minimum probability threshold
  if min_probability > 0.0:
  adducts_before_filter = len(adducts_df)
  adducts_df = adducts_df.filter(pl.col("probability") >= min_probability)
  adducts_after_filter = len(adducts_df)
  filtered_count = adducts_before_filter - adducts_after_filter
-
- self.logger.debug(f"Filtered {filtered_count} low-probability adducts (< {min_probability})")
+
+ self.logger.debug(
+ f"Filtered {filtered_count} low-probability adducts (< {min_probability})",
+ )
  self.logger.debug(f"Remaining adducts for analysis: {adducts_after_filter}")
-
+
  if len(adducts_df) == 0:
- self.logger.warning(f"No adducts remaining after probability filtering (min_probability={min_probability})")
+ self.logger.warning(
+ f"No adducts remaining after probability filtering (min_probability={min_probability})",
+ )
  return
-
+
  # Implement the adduct detection algorithm directly here
  import numpy as np
-
+
  # Get parameters
  charge_max = params.get("charge_max")
  retention_max_diff = params.get("retention_max_diff")
  mass_max_diff = params.get("mass_max_diff")
  unit = params.get("unit")
-
+
  # Sort features by RT for efficient RT-sweep processing (OpenMS approach)
  # Store original row positions before sorting for correct index mapping
  features_with_positions = self.features_df.with_row_index("original_position")
  features_sorted = features_with_positions.sort("rt")
  n_features = len(features_sorted)
-
+
  # Extract arrays for fast processing
  feature_mzs = features_sorted.select("mz").to_numpy().flatten()
  feature_rts = features_sorted.select("rt").to_numpy().flatten()
-
+
  # Convert adducts to arrays for vectorized operations
  adduct_mass_shifts = adducts_df.select("mass_shift").to_numpy().flatten()
  adduct_charges = adducts_df.select("charge").to_numpy().flatten()
  adduct_names = adducts_df.select("name").to_series().to_list()
  adduct_probs = adducts_df.select("probability").to_numpy().flatten()
 
- self.logger.debug(f"RT-sweep processing: {n_features} features × {len(adducts_df)} adduct combinations")
-
+ self.logger.debug(
+ f"RT-sweep processing: {n_features} features × {len(adducts_df)} adduct combinations",
+ )
+
  # Phase 1: RT-sweep line algorithm with early RT filtering (fixes OpenMS flaw #1)
  candidate_edges = []
-
+
  for i_rt in range(n_features):
  mz1 = feature_mzs[i_rt]
  rt1 = feature_rts[i_rt]
-
+
  # RT-window sweep: only check features within RT tolerance (early filtering)
  for j_rt in range(i_rt + 1, n_features):
  rt2 = feature_rts[j_rt]
  rt_diff = rt2 - rt1
-
+
  # Early RT constraint check (fixes OpenMS issue where RT was checked too late)
  if rt_diff > retention_max_diff:
  break # Features are RT-sorted, so no more valid pairs
-
+
  mz2 = feature_mzs[j_rt]
-
+
  # Phase 2: Check for valid mass relationships with strict tolerance (fixes OpenMS flaw #2)
  for adduct_idx, mass_shift in enumerate(adduct_mass_shifts):
  charge = adduct_charges[adduct_idx]
-
+
  # Calculate mass tolerance (per feature, as in OpenMS)
- if unit == 'ppm':
+ if unit == "ppm":
  tol1 = mass_max_diff * mz1 * 1e-6
  tol2 = mass_max_diff * mz2 * 1e-6
  combined_tolerance = tol1 + tol2
  else: # Da
- combined_tolerance = 2 * mass_max_diff # Combined tolerance for both features
-
+ combined_tolerance = (
+ 2 * mass_max_diff
+ ) # Combined tolerance for both features
+
  # Check both directions of mass relationship
  if charge != 0:
  # For charged adducts: m/z relationship
- mass_diff_12 = (mz2 * abs(charge)) - (mz1 * abs(charge))
+ mass_diff_12 = (mz2 * abs(charge)) - (mz1 * abs(charge))
  expected_mass_diff = mass_shift
-
+
  if abs(mass_diff_12 - expected_mass_diff) <= combined_tolerance:
  # Valid mass relationship found
- candidate_edges.append({
- 'i': i_rt, 'j': j_rt,
- 'rt_diff': rt_diff,
- 'mass_error': abs(mass_diff_12 - expected_mass_diff),
- 'adduct_idx': adduct_idx,
- 'charge1': charge if mass_diff_12 > 0 else -charge,
- 'charge2': -charge if mass_diff_12 > 0 else charge,
- 'probability': adduct_probs[adduct_idx]
- })
+ candidate_edges.append(
+ {
+ "i": i_rt,
+ "j": j_rt,
+ "rt_diff": rt_diff,
+ "mass_error": abs(mass_diff_12 - expected_mass_diff),
+ "adduct_idx": adduct_idx,
+ "charge1": charge if mass_diff_12 > 0 else -charge,
+ "charge2": -charge if mass_diff_12 > 0 else charge,
+ "probability": adduct_probs[adduct_idx],
+ },
+ )
  else:
  # For neutral adducts: direct mass shift
  mass_diff_12 = mz2 - mz1
  if abs(mass_diff_12 - mass_shift) <= combined_tolerance:
- candidate_edges.append({
- 'i': i_rt, 'j': j_rt,
- 'rt_diff': rt_diff,
- 'mass_error': abs(mass_diff_12 - mass_shift),
- 'adduct_idx': adduct_idx,
- 'charge1': 0, 'charge2': 0,
- 'probability': adduct_probs[adduct_idx]
- })
-
- self.logger.debug(f"Found {len(candidate_edges)} candidate edges after RT+mass filtering")
-
+ candidate_edges.append(
+ {
+ "i": i_rt,
+ "j": j_rt,
+ "rt_diff": rt_diff,
+ "mass_error": abs(mass_diff_12 - mass_shift),
+ "adduct_idx": adduct_idx,
+ "charge1": 0,
+ "charge2": 0,
+ "probability": adduct_probs[adduct_idx],
+ },
+ )
+
+ self.logger.debug(
+ f"Found {len(candidate_edges)} candidate edges after RT+mass filtering",
+ )
+
  if len(candidate_edges) == 0:
  self.logger.info("No adduct relationships found")
  return
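The candidate-edge loop above relies on the features being RT-sorted so that the inner loop can stop at the first pair whose RT gap exceeds the tolerance. A toy illustration of that early-exit sweep, with made-up (mz, rt) tuples rather than masster feature arrays:

features = sorted(
    [(100.05, 12.0), (101.05, 12.3), (122.03, 12.4), (150.00, 30.0)],
    key=lambda f: f[1],
)  # (mz, rt) pairs, sorted by retention time
rt_tolerance = 1.0  # seconds, arbitrary for the example

pairs = []
for i, (mz1, rt1) in enumerate(features):
    for mz2, rt2 in features[i + 1:]:
        if rt2 - rt1 > rt_tolerance:
            break  # RT-sorted, so every later feature is even further away in RT
        pairs.append(((mz1, rt1), (mz2, rt2)))

print(len(pairs), "RT-compatible pairs")  # 3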
@@ -557,23 +611,23 @@ def find_adducts(self, **kwargs):
  adjacency = {}
  for i in range(n_features):
  adjacency[i] = []
-
+
  for edge in candidate_edges:
- i, j = edge['i'], edge['j']
+ i, j = edge["i"], edge["j"]
  adjacency[i].append(j)
  adjacency[j].append(i)
-
+
  # Find connected components using DFS
  visited = [False] * n_features
  components = []
-
+
  def dfs(node, component):
  visited[node] = True
  component.append(node)
  for neighbor in adjacency[node]:
  if not visited[neighbor]:
  dfs(neighbor, component)
-
+
  for i in range(n_features):
  if not visited[i] and len(adjacency[i]) > 0:
  component = []
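The grouping step above builds an adjacency list from the candidate edges and collects connected components with a recursive DFS; each component later becomes one adduct_group. An equivalent standalone sketch with a made-up edge list (using an iterative DFS rather than the recursive one above, to avoid recursion limits on large groups):

edges = [(0, 1), (1, 2), (4, 5)]  # hypothetical candidate edges (feature index pairs)
n_features = 6

adjacency = {i: [] for i in range(n_features)}
for i, j in edges:
    adjacency[i].append(j)
    adjacency[j].append(i)

visited = [False] * n_features
components = []
for start in range(n_features):
    if visited[start] or not adjacency[start]:
        continue
    stack, component = [start], []
    while stack:
        node = stack.pop()
        if visited[node]:
            continue
        visited[node] = True
        component.append(node)
        stack.extend(adjacency[node])
    components.append(sorted(component))

print(components)  # [[0, 1, 2], [4, 5]]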
@@ -589,53 +643,60 @@ def find_adducts(self, **kwargs):
  group_assignments = [0] * n_features
  mass_shift_assignments = [0.0] * n_features
  neutral_mass_assignments = [0.0] * n_features
-
+
  for group_id, component in enumerate(components, 1):
  # Find the most likely base ion (highest intensity or lowest m/z as proxy)
  component_mzs = [feature_mzs[idx] for idx in component]
  base_idx_in_component = np.argmin(component_mzs) # Lowest m/z as base
  base_feature_idx = component[base_idx_in_component]
  base_mz = feature_mzs[base_feature_idx]
-
+
  # Assign base ion
  base_adduct = "[M+H]1+" if charge_max > 0 else "[M-H]1-"
  base_charge = 1 if charge_max > 0 else -1
  base_mass_shift = 1.007825 if charge_max > 0 else -1.007825 # H mass
-
+
  adduct_assignments[base_feature_idx] = base_adduct
  adduct_charges_assigned[base_feature_idx] = base_charge
  group_assignments[base_feature_idx] = group_id
  mass_shift_assignments[base_feature_idx] = base_mass_shift
-
+
  # Calculate neutral mass for base ion
  base_mz_measured = feature_mzs[base_feature_idx]
- neutral_mass_assignments[base_feature_idx] = base_mz_measured * abs(base_charge) - base_mass_shift
-
+ neutral_mass_assignments[base_feature_idx] = (
+ base_mz_measured * abs(base_charge) - base_mass_shift
+ )
+
  # Assign other features based on their relationships to base
  for feature_idx in component:
  if feature_idx == base_feature_idx:
  continue
-
+
  group_assignments[feature_idx] = group_id
-
+
  # Find best adduct assignment based on mass difference and probability
  feature_mz = feature_mzs[feature_idx]
  best_score = -np.inf
  best_assignment = "[M+?]1+"
  best_charge = 1
  best_mass_shift = 1.007825 # Default to H mass shift for [M+?]1+
-
+
  # Check all possible adducts
- for adduct_idx, (mass_shift, charge, name, prob) in enumerate(zip(
- adduct_mass_shifts, adduct_charges, adduct_names, adduct_probs)):
-
+ for adduct_idx, (mass_shift, charge, name, prob) in enumerate(
+ zip(
+ adduct_mass_shifts,
+ adduct_charges,
+ adduct_names,
+ adduct_probs,
+ ),
+ ):
  if charge != 0:
  expected_mz = base_mz + mass_shift / abs(charge)
  else:
  expected_mz = base_mz + mass_shift
-
+
  mass_error = abs(expected_mz - feature_mz)
-
+
  # Combined score: probability + mass accuracy
  if mass_error < mass_max_diff * 2: # Within tolerance
  score = prob - mass_error * 0.1 # Weight mass accuracy
@@ -648,48 +709,52 @@ def find_adducts(self, **kwargs):
  adduct_assignments[feature_idx] = best_assignment
  adduct_charges_assigned[feature_idx] = best_charge
  mass_shift_assignments[feature_idx] = best_mass_shift
-
+
  # Calculate neutral mass
- neutral_mass_assignments[feature_idx] = feature_mz * abs(best_charge) - best_mass_shift
+ neutral_mass_assignments[feature_idx] = (
+ feature_mz * abs(best_charge) - best_mass_shift
+ )
 
  # Assign fallback adduct for features not processed in connected components (isolated features)
  for i in range(n_features):
  if adduct_assignments[i] is None:
  fallback_charge = 1 if charge_max > 0 else -1
  fallback_mass_shift = 1.007825 if charge_max > 0 else -1.007825 # Assume H
-
+
  adduct_assignments[i] = "[M+?]1+"
  adduct_charges_assigned[i] = fallback_charge
  group_assignments[i] = 0 # No group assignment for isolated features
  mass_shift_assignments[i] = fallback_mass_shift
-
+
  # Calculate neutral mass for isolated features
  feature_mz = feature_mzs[i]
- neutral_mass_assignments[i] = feature_mz * abs(fallback_charge) - fallback_mass_shift
+ neutral_mass_assignments[i] = (
+ feature_mz * abs(fallback_charge) - fallback_mass_shift
+ )
 
  # Map back to original feature order using stored positions
  original_indices = features_sorted.select("original_position").to_numpy().flatten()
-
+
  # Create final assignments in original order (same size as original DataFrame)
  final_adducts = [None] * len(self.features_df)
  final_charges = [0] * len(self.features_df)
  final_groups = [0] * len(self.features_df)
  final_mass_shifts = [0.0] * len(self.features_df)
  final_neutral_masses = [0.0] * len(self.features_df)
-
+
  for sorted_idx, orig_idx in enumerate(original_indices):
  final_adducts[orig_idx] = adduct_assignments[sorted_idx]
  final_charges[orig_idx] = adduct_charges_assigned[sorted_idx]
  final_groups[orig_idx] = group_assignments[sorted_idx]
  final_mass_shifts[orig_idx] = mass_shift_assignments[sorted_idx]
  final_neutral_masses[orig_idx] = neutral_mass_assignments[sorted_idx]
-
+
  # Update features DataFrame with correct column ordering
  # Insert adduct columns in the specified order after iso_of column
-
+
  # Get current columns
  current_columns = self.features_df.columns
-
+
  # Find the position of iso_of column
  try:
  iso_of_index = current_columns.index("iso_of")
@@ -698,42 +763,51 @@ def find_adducts(self, **kwargs):
  # If iso_of doesn't exist, append at the end
  insert_position = len(current_columns)
  self.logger.warning("iso_of column not found, adding adduct columns at the end")
-
+
  # Remove any existing adduct columns first
- adduct_column_names = ["adduct", "adduct_charge", "adduct_mass_shift", "adduct_mass_neutral", "adduct_group"]
- df_without_adducts = self.features_df.select([col for col in current_columns if col not in adduct_column_names])
-
+ adduct_column_names = [
+ "adduct",
+ "adduct_charge",
+ "adduct_mass_shift",
+ "adduct_mass_neutral",
+ "adduct_group",
+ ]
+ df_without_adducts = self.features_df.select(
+ [col for col in current_columns if col not in adduct_column_names],
+ )
+
  # Split columns at insertion point
  columns_before = df_without_adducts.columns[:insert_position]
  columns_after = df_without_adducts.columns[insert_position:]
-
+
  # Create the new column order with adduct columns in the correct position
- new_column_order = (
- list(columns_before) +
- adduct_column_names +
- list(columns_after)
- )
-
+ new_column_order = list(columns_before) + adduct_column_names + list(columns_after)
+
  # Add adduct columns to the dataframe
- self.features_df = df_without_adducts.with_columns([
- pl.Series("adduct", final_adducts),
- pl.Series("adduct_charge", final_charges),
- pl.Series("adduct_mass_shift", final_mass_shifts),
- pl.Series("adduct_mass_neutral", final_neutral_masses),
- pl.Series("adduct_group", final_groups)
- ]).select(new_column_order)
+ self.features_df = df_without_adducts.with_columns(
+ [
+ pl.Series("adduct", final_adducts),
+ pl.Series("adduct_charge", final_charges),
+ pl.Series("adduct_mass_shift", final_mass_shifts),
+ pl.Series("adduct_mass_neutral", final_neutral_masses),
+ pl.Series("adduct_group", final_groups),
+ ],
+ ).select(new_column_order)
 
  # Summary statistics
  total_with_adducts = sum(1 for x in final_adducts if x is not None)
  total_groups = max(final_groups) if final_groups else 0
-
- self.logger.info(f"Adduct detection completed: {total_with_adducts} features with adducts in {total_groups} groups")
+
+ self.logger.info(
+ f"Adduct detection completed: {total_with_adducts} features with adducts in {total_groups} groups",
+ )
 
  # Store parameters including the actual processed adducts list
  history_params = params.to_dict()
  # Convert the filtered adducts dataframe to a list of adduct specifications for history
- history_params['adducts'] = adducts_df.select(['name', 'charge', 'mass_shift', 'probability']).to_dicts()
-
+ history_params["adducts"] = adducts_df.select(
+ ["name", "charge", "mass_shift", "probability"],
+ ).to_dicts()
+
  self.store_history(["find_adducts"], history_params)
  self.logger.debug("Parameters stored successfully")
-