masster-0.4.4-py3-none-any.whl → masster-0.4.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/chromatogram.py +2 -2
- masster/data/libs/urine.csv +3 -3
- masster/logger.py +8 -8
- masster/sample/adducts.py +337 -263
- masster/sample/defaults/find_adducts_def.py +21 -8
- masster/sample/h5.py +557 -278
- masster/sample/helpers.py +131 -75
- masster/sample/lib.py +2 -2
- masster/sample/load.py +25 -11
- masster/sample/plot.py +5 -5
- masster/sample/processing.py +115 -85
- masster/sample/sample.py +28 -15
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +34 -11
- masster/spectrum.py +2 -2
- masster/study/defaults/align_def.py +5 -1
- masster/study/defaults/identify_def.py +3 -1
- masster/study/defaults/study_def.py +58 -25
- masster/study/export.py +354 -204
- masster/study/h5.py +557 -155
- masster/study/helpers.py +487 -194
- masster/study/id.py +536 -347
- masster/study/load.py +228 -138
- masster/study/plot.py +68 -68
- masster/study/processing.py +455 -253
- masster/study/save.py +14 -4
- masster/study/study.py +122 -40
- masster/study/study5_schema.json +149 -149
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/METADATA +5 -3
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/RECORD +34 -34
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/WHEEL +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/entry_points.txt +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/licenses/LICENSE +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/top_level.txt +0 -0
masster/sample/adducts.py
CHANGED
|
@@ -14,7 +14,7 @@ Functions:
|
|
|
14
14
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
import polars as pl
|
|
17
|
-
from typing import List, Dict
|
|
17
|
+
from typing import List, Dict
|
|
18
18
|
from itertools import combinations
|
|
19
19
|
|
|
20
20
|
# Import defaults class for external use
|
|
@@ -24,14 +24,14 @@ from masster.sample.defaults.find_adducts_def import find_adducts_defaults
|
|
|
24
24
|
def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
25
25
|
"""
|
|
26
26
|
Generate comprehensive adduct specifications including multiply charged species and combinations.
|
|
27
|
-
|
|
27
|
+
|
|
28
28
|
This method consolidates all adduct generation logic into a single optimized helper
|
|
29
29
|
that produces a polars DataFrame with all possible adduct combinations, properly
|
|
30
30
|
formatted names like [M+H]1+ or [M-H2O+2H]2+, and respecting charge constraints.
|
|
31
|
-
|
|
31
|
+
|
|
32
32
|
Uses parameters from find_adducts_defaults() by default, which can be overridden
|
|
33
33
|
by providing keyword arguments.
|
|
34
|
-
|
|
34
|
+
|
|
35
35
|
Parameters
|
|
36
36
|
----------
|
|
37
37
|
adducts_list : List[str], optional
|
|
@@ -42,7 +42,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
|
42
42
|
- charge_min: Minimum charge to consider (default from find_adducts_defaults)
|
|
43
43
|
- charge_max: Maximum charge to consider (default from find_adducts_defaults)
|
|
44
44
|
- max_combinations: Maximum number of adduct components to combine (default 4)
|
|
45
|
-
|
|
45
|
+
|
|
46
46
|
Returns
|
|
47
47
|
-------
|
|
48
48
|
pl.DataFrame
|
|
@@ -56,177 +56,201 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
|
56
56
|
"""
|
|
57
57
|
# Get default parameters from find_adducts_defaults
|
|
58
58
|
defaults = self.find_adducts_defaults()
|
|
59
|
-
|
|
59
|
+
|
|
60
60
|
# Use provided parameters or defaults
|
|
61
61
|
if adducts_list is None:
|
|
62
62
|
adducts_list = defaults.get_openms_adducts()
|
|
63
|
-
|
|
64
|
-
charge_min = kwargs.get(
|
|
65
|
-
charge_max = kwargs.get(
|
|
66
|
-
max_combinations = kwargs.get(
|
|
67
|
-
|
|
63
|
+
|
|
64
|
+
charge_min = kwargs.get("charge_min", defaults.charge_min)
|
|
65
|
+
charge_max = kwargs.get("charge_max", defaults.charge_max)
|
|
66
|
+
max_combinations = kwargs.get("max_combinations", 4)
|
|
67
|
+
|
|
68
68
|
# Parse base adduct specifications
|
|
69
69
|
base_specs = []
|
|
70
|
-
|
|
70
|
+
|
|
71
71
|
for adduct_str in adducts_list:
|
|
72
|
-
if not isinstance(adduct_str, str) or
|
|
72
|
+
if not isinstance(adduct_str, str) or ":" not in adduct_str:
|
|
73
73
|
continue
|
|
74
|
-
|
|
74
|
+
|
|
75
75
|
try:
|
|
76
|
-
parts = adduct_str.split(
|
|
76
|
+
parts = adduct_str.split(":")
|
|
77
77
|
if len(parts) != 3:
|
|
78
78
|
continue
|
|
79
|
-
|
|
79
|
+
|
|
80
80
|
formula_part = parts[0]
|
|
81
|
-
charge = int(parts[1])
|
|
81
|
+
charge = int(parts[1])
|
|
82
82
|
probability = float(parts[2])
|
|
83
|
-
|
|
83
|
+
|
|
84
84
|
# Calculate mass shift from formula
|
|
85
85
|
mass_shift = _calculate_formula_mass_shift(formula_part)
|
|
86
|
-
|
|
87
|
-
base_specs.append(
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
86
|
+
|
|
87
|
+
base_specs.append(
|
|
88
|
+
{
|
|
89
|
+
"formula": formula_part,
|
|
90
|
+
"charge": charge,
|
|
91
|
+
"mass_shift": mass_shift,
|
|
92
|
+
"probability": probability,
|
|
93
|
+
"raw_string": adduct_str,
|
|
94
|
+
},
|
|
95
|
+
)
|
|
96
|
+
|
|
95
97
|
except (ValueError, IndexError):
|
|
96
98
|
continue
|
|
97
|
-
|
|
99
|
+
|
|
98
100
|
# Generate all valid combinations
|
|
99
101
|
combinations_list = []
|
|
100
|
-
|
|
102
|
+
|
|
101
103
|
# Separate specs by charge type
|
|
102
|
-
positive_specs = [spec for spec in base_specs if spec[
|
|
103
|
-
negative_specs = [spec for spec in base_specs if spec[
|
|
104
|
-
neutral_specs = [spec for spec in base_specs if spec[
|
|
105
|
-
|
|
104
|
+
positive_specs = [spec for spec in base_specs if spec["charge"] > 0]
|
|
105
|
+
negative_specs = [spec for spec in base_specs if spec["charge"] < 0]
|
|
106
|
+
neutral_specs = [spec for spec in base_specs if spec["charge"] == 0]
|
|
107
|
+
|
|
106
108
|
# 1. Single adducts
|
|
107
109
|
for spec in base_specs:
|
|
108
|
-
if charge_min <= spec[
|
|
110
|
+
if charge_min <= spec["charge"] <= charge_max:
|
|
109
111
|
formatted_name = _format_adduct_name([spec])
|
|
110
|
-
combinations_list.append(
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
112
|
+
combinations_list.append(
|
|
113
|
+
{
|
|
114
|
+
"components": [spec],
|
|
115
|
+
"formatted_name": formatted_name,
|
|
116
|
+
"total_mass_shift": spec["mass_shift"],
|
|
117
|
+
"total_charge": spec["charge"],
|
|
118
|
+
"combined_probability": spec["probability"],
|
|
119
|
+
"complexity": 1,
|
|
120
|
+
},
|
|
121
|
+
)
|
|
122
|
+
|
|
119
123
|
# 2. Generate multiply charged versions (2H+, 3H+, etc.)
|
|
120
124
|
for spec in positive_specs + negative_specs:
|
|
121
|
-
base_charge = spec[
|
|
125
|
+
base_charge = spec["charge"]
|
|
122
126
|
for multiplier in range(2, min(max_combinations + 1, 5)):
|
|
123
127
|
total_charge = base_charge * multiplier
|
|
124
128
|
if charge_min <= total_charge <= charge_max:
|
|
125
129
|
components = [spec] * multiplier
|
|
126
130
|
formatted_name = _format_adduct_name(components)
|
|
127
|
-
|
|
128
|
-
combinations_list.append(
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
131
|
+
|
|
132
|
+
combinations_list.append(
|
|
133
|
+
{
|
|
134
|
+
"components": components,
|
|
135
|
+
"formatted_name": formatted_name,
|
|
136
|
+
"total_mass_shift": spec["mass_shift"] * multiplier,
|
|
137
|
+
"total_charge": total_charge,
|
|
138
|
+
"combined_probability": spec["probability"] ** multiplier,
|
|
139
|
+
"complexity": multiplier,
|
|
140
|
+
},
|
|
141
|
+
)
|
|
142
|
+
|
|
137
143
|
# 3. Mixed combinations (2-component)
|
|
138
144
|
if max_combinations >= 2:
|
|
139
145
|
# Positive + Neutral
|
|
140
146
|
for pos_spec in positive_specs:
|
|
141
147
|
for neut_spec in neutral_specs:
|
|
142
|
-
total_charge = pos_spec[
|
|
148
|
+
total_charge = pos_spec["charge"] + neut_spec["charge"]
|
|
143
149
|
if charge_min <= total_charge <= charge_max:
|
|
144
150
|
components = [pos_spec, neut_spec]
|
|
145
151
|
formatted_name = _format_adduct_name(components)
|
|
146
|
-
combinations_list.append(
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
152
|
+
combinations_list.append(
|
|
153
|
+
{
|
|
154
|
+
"components": components,
|
|
155
|
+
"formatted_name": formatted_name,
|
|
156
|
+
"total_mass_shift": pos_spec["mass_shift"]
|
|
157
|
+
+ neut_spec["mass_shift"],
|
|
158
|
+
"total_charge": total_charge,
|
|
159
|
+
"combined_probability": pos_spec["probability"]
|
|
160
|
+
* neut_spec["probability"],
|
|
161
|
+
"complexity": 2,
|
|
162
|
+
},
|
|
163
|
+
)
|
|
164
|
+
|
|
155
165
|
# Different charged species
|
|
156
166
|
for combo in combinations(positive_specs, 2):
|
|
157
|
-
if combo[0][
|
|
158
|
-
total_charge = combo[0][
|
|
167
|
+
if combo[0]["formula"] != combo[1]["formula"]:
|
|
168
|
+
total_charge = combo[0]["charge"] + combo[1]["charge"]
|
|
159
169
|
if charge_min <= total_charge <= charge_max:
|
|
160
170
|
components = list(combo)
|
|
161
171
|
formatted_name = _format_adduct_name(components)
|
|
162
|
-
combinations_list.append(
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
172
|
+
combinations_list.append(
|
|
173
|
+
{
|
|
174
|
+
"components": components,
|
|
175
|
+
"formatted_name": formatted_name,
|
|
176
|
+
"total_mass_shift": combo[0]["mass_shift"]
|
|
177
|
+
+ combo[1]["mass_shift"],
|
|
178
|
+
"total_charge": total_charge,
|
|
179
|
+
"combined_probability": combo[0]["probability"]
|
|
180
|
+
* combo[1]["probability"],
|
|
181
|
+
"complexity": 2,
|
|
182
|
+
},
|
|
183
|
+
)
|
|
184
|
+
|
|
171
185
|
# 4. 3-component combinations (limited for performance)
|
|
172
186
|
if max_combinations >= 3:
|
|
173
187
|
for pos_spec in positive_specs[:2]:
|
|
174
188
|
for neut_combo in combinations(neutral_specs[:2], 2):
|
|
175
189
|
components = [pos_spec] + list(neut_combo)
|
|
176
|
-
total_charge = sum(spec[
|
|
177
|
-
|
|
190
|
+
total_charge = sum(spec["charge"] for spec in components)
|
|
191
|
+
|
|
178
192
|
if charge_min <= total_charge <= charge_max:
|
|
179
193
|
formatted_name = _format_adduct_name(components)
|
|
180
|
-
total_mass_shift = sum(spec[
|
|
181
|
-
combined_prob = np.prod(
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
194
|
+
total_mass_shift = sum(spec["mass_shift"] for spec in components)
|
|
195
|
+
combined_prob = np.prod(
|
|
196
|
+
[spec["probability"] for spec in components],
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
combinations_list.append(
|
|
200
|
+
{
|
|
201
|
+
"components": components,
|
|
202
|
+
"formatted_name": formatted_name,
|
|
203
|
+
"total_mass_shift": total_mass_shift,
|
|
204
|
+
"total_charge": total_charge,
|
|
205
|
+
"combined_probability": combined_prob,
|
|
206
|
+
"complexity": 3,
|
|
207
|
+
},
|
|
208
|
+
)
|
|
209
|
+
|
|
192
210
|
# Convert to polars DataFrame
|
|
193
211
|
if combinations_list:
|
|
194
|
-
combinations_list.sort(
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
212
|
+
combinations_list.sort(
|
|
213
|
+
key=lambda x: (-x["combined_probability"], x["complexity"]),
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
adducts_df = pl.DataFrame(
|
|
217
|
+
[
|
|
218
|
+
{
|
|
219
|
+
"name": combo["formatted_name"],
|
|
220
|
+
"charge": combo["total_charge"],
|
|
221
|
+
"mass_shift": combo["total_mass_shift"],
|
|
222
|
+
"probability": combo["combined_probability"],
|
|
223
|
+
"complexity": combo["complexity"],
|
|
224
|
+
"components": combo["components"],
|
|
225
|
+
}
|
|
226
|
+
for combo in combinations_list
|
|
227
|
+
],
|
|
228
|
+
)
|
|
207
229
|
else:
|
|
208
230
|
# Return empty DataFrame with correct schema
|
|
209
|
-
adducts_df = pl.DataFrame(
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
231
|
+
adducts_df = pl.DataFrame(
|
|
232
|
+
{
|
|
233
|
+
"name": [],
|
|
234
|
+
"charge": [],
|
|
235
|
+
"mass_shift": [],
|
|
236
|
+
"probability": [],
|
|
237
|
+
"complexity": [],
|
|
238
|
+
"components": [],
|
|
239
|
+
},
|
|
240
|
+
)
|
|
241
|
+
|
|
218
242
|
return adducts_df
|
|
219
243
|
|
|
220
244
|
|
|
221
245
|
def _calculate_formula_mass_shift(formula: str) -> float:
|
|
222
246
|
"""
|
|
223
247
|
Calculate mass shift from formula string like "+H", "-H2O", "+Na-H", etc.
|
|
224
|
-
|
|
248
|
+
|
|
225
249
|
Parameters
|
|
226
250
|
----------
|
|
227
251
|
formula : str
|
|
228
252
|
Formula string (e.g., "+H", "-H2O", "+Na-H")
|
|
229
|
-
|
|
253
|
+
|
|
230
254
|
Returns
|
|
231
255
|
-------
|
|
232
256
|
float
|
|
@@ -234,59 +258,59 @@ def _calculate_formula_mass_shift(formula: str) -> float:
|
|
|
234
258
|
"""
|
|
235
259
|
# Standard atomic masses
|
|
236
260
|
atomic_masses = {
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
261
|
+
"H": 1.007825,
|
|
262
|
+
"C": 12.0,
|
|
263
|
+
"N": 14.003074,
|
|
264
|
+
"O": 15.994915,
|
|
265
|
+
"Na": 22.989769,
|
|
266
|
+
"K": 38.963707,
|
|
267
|
+
"Li": 7.016003,
|
|
268
|
+
"Ca": 39.962591,
|
|
269
|
+
"Mg": 23.985042,
|
|
270
|
+
"Fe": 55.934938,
|
|
271
|
+
"Cl": 34.968853,
|
|
272
|
+
"Br": 78.918336,
|
|
273
|
+
"I": 126.904473,
|
|
274
|
+
"P": 30.973762,
|
|
275
|
+
"S": 31.972071,
|
|
252
276
|
}
|
|
253
|
-
|
|
277
|
+
|
|
254
278
|
total_mass = 0.0
|
|
255
|
-
|
|
279
|
+
|
|
256
280
|
# Parse formula by splitting on + and - while preserving the operators
|
|
257
281
|
parts = []
|
|
258
282
|
current_part = ""
|
|
259
283
|
current_sign = 1
|
|
260
|
-
|
|
284
|
+
|
|
261
285
|
for char in formula:
|
|
262
|
-
if char ==
|
|
286
|
+
if char == "+":
|
|
263
287
|
if current_part:
|
|
264
288
|
parts.append((current_sign, current_part))
|
|
265
289
|
current_part = ""
|
|
266
290
|
current_sign = 1
|
|
267
|
-
elif char ==
|
|
291
|
+
elif char == "-":
|
|
268
292
|
if current_part:
|
|
269
293
|
parts.append((current_sign, current_part))
|
|
270
294
|
current_part = ""
|
|
271
295
|
current_sign = -1
|
|
272
296
|
else:
|
|
273
297
|
current_part += char
|
|
274
|
-
|
|
298
|
+
|
|
275
299
|
if current_part:
|
|
276
300
|
parts.append((current_sign, current_part))
|
|
277
|
-
|
|
301
|
+
|
|
278
302
|
# Process each part
|
|
279
303
|
for sign, part in parts:
|
|
280
304
|
if not part:
|
|
281
305
|
continue
|
|
282
|
-
|
|
306
|
+
|
|
283
307
|
# Parse element and count (e.g., "H2O" -> H:2, O:1)
|
|
284
308
|
elements = _parse_element_counts(part)
|
|
285
|
-
|
|
309
|
+
|
|
286
310
|
for element, count in elements.items():
|
|
287
311
|
if element in atomic_masses:
|
|
288
312
|
total_mass += sign * atomic_masses[element] * count
|
|
289
|
-
|
|
313
|
+
|
|
290
314
|
return total_mass
|
|
291
315
|
|
|
292
316
|
|
|
@@ -294,25 +318,25 @@ def _parse_element_counts(formula_part: str) -> Dict[str, int]:
|
|
|
294
318
|
"""Parse element counts from a formula part like 'H2O' -> {'H': 2, 'O': 1}"""
|
|
295
319
|
elements = {}
|
|
296
320
|
i = 0
|
|
297
|
-
|
|
321
|
+
|
|
298
322
|
while i < len(formula_part):
|
|
299
323
|
# Get element (uppercase letter, possibly followed by lowercase)
|
|
300
324
|
element = formula_part[i]
|
|
301
325
|
i += 1
|
|
302
|
-
|
|
326
|
+
|
|
303
327
|
while i < len(formula_part) and formula_part[i].islower():
|
|
304
328
|
element += formula_part[i]
|
|
305
329
|
i += 1
|
|
306
|
-
|
|
330
|
+
|
|
307
331
|
# Get count (digits following element)
|
|
308
332
|
count_str = ""
|
|
309
333
|
while i < len(formula_part) and formula_part[i].isdigit():
|
|
310
334
|
count_str += formula_part[i]
|
|
311
335
|
i += 1
|
|
312
|
-
|
|
336
|
+
|
|
313
337
|
count = int(count_str) if count_str else 1
|
|
314
338
|
elements[element] = elements.get(element, 0) + count
|
|
315
|
-
|
|
339
|
+
|
|
316
340
|
return elements
|
|
317
341
|
|
|
318
342
|
|
|
@@ -320,51 +344,56 @@ def _format_adduct_name(components: List[Dict]) -> str:
|
|
|
320
344
|
"""Format adduct name from components like [M+H]1+ or [M+2H]2+ or [M+2(H+Na)]3+"""
|
|
321
345
|
if not components:
|
|
322
346
|
return "[M]"
|
|
323
|
-
|
|
347
|
+
|
|
324
348
|
# Count occurrences of each formula
|
|
325
349
|
from collections import Counter
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
350
|
+
|
|
351
|
+
formula_counts = Counter(comp["formula"] for comp in components)
|
|
352
|
+
total_charge = sum(comp["charge"] for comp in components)
|
|
353
|
+
|
|
329
354
|
# Build formula part with proper multipliers
|
|
330
355
|
formula_parts = []
|
|
331
|
-
for formula, count in sorted(
|
|
356
|
+
for formula, count in sorted(
|
|
357
|
+
formula_counts.items(),
|
|
358
|
+
): # Sort for consistent ordering
|
|
332
359
|
if count == 1:
|
|
333
360
|
formula_parts.append(formula)
|
|
334
361
|
else:
|
|
335
362
|
# For multiple occurrences, use count prefix (e.g., 2H, 3Na)
|
|
336
363
|
# Handle special case where formula might already start with + or -
|
|
337
|
-
if formula.startswith((
|
|
364
|
+
if formula.startswith(("+", "-")):
|
|
338
365
|
sign = formula[0]
|
|
339
366
|
base_formula = formula[1:]
|
|
340
367
|
formula_parts.append(f"{sign}{count}{base_formula}")
|
|
341
368
|
else:
|
|
342
369
|
formula_parts.append(f"{count}{formula}")
|
|
343
|
-
|
|
370
|
+
|
|
344
371
|
# Combine formula parts
|
|
345
372
|
formula = "".join(formula_parts)
|
|
346
|
-
|
|
373
|
+
|
|
347
374
|
# Format charge
|
|
348
375
|
if total_charge == 0:
|
|
349
376
|
charge_str = ""
|
|
350
377
|
elif abs(total_charge) == 1:
|
|
351
378
|
charge_str = "1+" if total_charge > 0 else "1-"
|
|
352
379
|
else:
|
|
353
|
-
charge_str =
|
|
354
|
-
|
|
380
|
+
charge_str = (
|
|
381
|
+
f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
|
|
382
|
+
)
|
|
383
|
+
|
|
355
384
|
return f"[M{formula}]{charge_str}"
|
|
356
385
|
|
|
357
386
|
|
|
358
387
|
def find_adducts(self, **kwargs):
|
|
359
388
|
"""Detect adduct relationships among detected features using improved OpenMS-like algorithm.
|
|
360
389
|
|
|
361
|
-
This method implements a corrected version of the OpenMS MetaboliteFeatureDeconvolution
|
|
390
|
+
This method implements a corrected version of the OpenMS MetaboliteFeatureDeconvolution
|
|
362
391
|
algorithm that properly enforces RT constraints and avoids the mass tolerance dominance
|
|
363
392
|
issues present in the original C++ implementation.
|
|
364
393
|
|
|
365
394
|
Key improvements over OpenMS:
|
|
366
395
|
- Early RT filtering prevents expensive mass calculations for temporally incompatible features
|
|
367
|
-
- Strict mass tolerance (0.01 Da default) prevents inappropriate mass relationships
|
|
396
|
+
- Strict mass tolerance (0.01 Da default) prevents inappropriate mass relationships
|
|
368
397
|
- RT constraints are properly enforced throughout the algorithm
|
|
369
398
|
- Connected components analysis respects both mass AND RT constraints
|
|
370
399
|
- Probability-based scoring for adduct assignment
|
|
@@ -381,9 +410,9 @@ def find_adducts(self, **kwargs):
|
|
|
381
410
|
Side effects:
|
|
382
411
|
Updates ``self.features_df`` with adduct information columns.
|
|
383
412
|
"""
|
|
384
|
-
# Initialize parameters
|
|
413
|
+
# Initialize parameters
|
|
385
414
|
params = find_adducts_defaults()
|
|
386
|
-
|
|
415
|
+
|
|
387
416
|
for key, value in kwargs.items():
|
|
388
417
|
if isinstance(value, find_adducts_defaults):
|
|
389
418
|
params = value
|
|
@@ -393,24 +422,28 @@ def find_adducts(self, **kwargs):
|
|
|
393
422
|
if params.set(key, value, validate=True):
|
|
394
423
|
self.logger.debug(f"Updated parameter {key} = {value}")
|
|
395
424
|
else:
|
|
396
|
-
self.logger.warning(
|
|
425
|
+
self.logger.warning(
|
|
426
|
+
f"Failed to set parameter {key} = {value} (validation failed)",
|
|
427
|
+
)
|
|
397
428
|
else:
|
|
398
429
|
self.logger.warning(f"Unknown parameter {key} ignored")
|
|
399
430
|
|
|
400
431
|
# Check if features_df exists and has data
|
|
401
|
-
if not hasattr(self,
|
|
402
|
-
self.logger.warning(
|
|
432
|
+
if not hasattr(self, "features_df") or len(self.features_df) == 0:
|
|
433
|
+
self.logger.warning(
|
|
434
|
+
"No features available for adduct detection. Run find_features() first.",
|
|
435
|
+
)
|
|
403
436
|
return
|
|
404
437
|
|
|
405
438
|
self.logger.info("Adduct detection...")
|
|
406
439
|
|
|
407
440
|
# Validate required columns
|
|
408
|
-
required_cols = [
|
|
441
|
+
required_cols = ["mz", "rt"]
|
|
409
442
|
missing_cols = [col for col in required_cols if col not in self.features_df.columns]
|
|
410
443
|
if missing_cols:
|
|
411
444
|
self.logger.error(f"Required columns missing from features_df: {missing_cols}")
|
|
412
445
|
return
|
|
413
|
-
|
|
446
|
+
|
|
414
447
|
# Check if we have any features to process
|
|
415
448
|
if len(self.features_df) == 0:
|
|
416
449
|
self.logger.warning("No features available for adduct detection")
|
|
@@ -424,130 +457,151 @@ def find_adducts(self, **kwargs):
|
|
|
424
457
|
|
|
425
458
|
# Get parameters
|
|
426
459
|
adducts_list = params.get_openms_adducts()
|
|
427
|
-
charge_min = params.get("charge_min")
|
|
460
|
+
charge_min = params.get("charge_min")
|
|
428
461
|
charge_max = params.get("charge_max")
|
|
429
462
|
retention_max_diff = params.get("retention_max_diff")
|
|
430
463
|
mass_max_diff = params.get("mass_max_diff")
|
|
431
464
|
unit = params.get("unit")
|
|
432
465
|
min_probability = params.get("min_probability")
|
|
433
466
|
|
|
434
|
-
self.logger.debug(
|
|
435
|
-
|
|
467
|
+
self.logger.debug(
|
|
468
|
+
f"Processing {len(self.features_df)} features with {len(adducts_list)} base adducts",
|
|
469
|
+
)
|
|
470
|
+
self.logger.debug(
|
|
471
|
+
f"RT tolerance: {retention_max_diff}s, Mass tolerance: {mass_max_diff} {unit}",
|
|
472
|
+
)
|
|
436
473
|
self.logger.debug(f"Min probability threshold: {min_probability}")
|
|
437
474
|
|
|
438
475
|
# Generate comprehensive adduct specifications using the Sample method
|
|
439
476
|
adducts_df = self._get_adducts(
|
|
440
477
|
adducts_list=adducts_list,
|
|
441
|
-
charge_min=charge_min,
|
|
478
|
+
charge_min=charge_min,
|
|
442
479
|
charge_max=charge_max,
|
|
443
|
-
max_combinations=4
|
|
480
|
+
max_combinations=4,
|
|
444
481
|
)
|
|
445
|
-
|
|
482
|
+
|
|
446
483
|
self.logger.debug(f"Generated {len(adducts_df)} total adduct combinations")
|
|
447
|
-
|
|
484
|
+
|
|
448
485
|
# Filter adducts by minimum probability threshold
|
|
449
486
|
if min_probability > 0.0:
|
|
450
487
|
adducts_before_filter = len(adducts_df)
|
|
451
488
|
adducts_df = adducts_df.filter(pl.col("probability") >= min_probability)
|
|
452
489
|
adducts_after_filter = len(adducts_df)
|
|
453
490
|
filtered_count = adducts_before_filter - adducts_after_filter
|
|
454
|
-
|
|
455
|
-
self.logger.debug(
|
|
491
|
+
|
|
492
|
+
self.logger.debug(
|
|
493
|
+
f"Filtered {filtered_count} low-probability adducts (< {min_probability})",
|
|
494
|
+
)
|
|
456
495
|
self.logger.debug(f"Remaining adducts for analysis: {adducts_after_filter}")
|
|
457
|
-
|
|
496
|
+
|
|
458
497
|
if len(adducts_df) == 0:
|
|
459
|
-
self.logger.warning(
|
|
498
|
+
self.logger.warning(
|
|
499
|
+
f"No adducts remaining after probability filtering (min_probability={min_probability})",
|
|
500
|
+
)
|
|
460
501
|
return
|
|
461
|
-
|
|
502
|
+
|
|
462
503
|
# Implement the adduct detection algorithm directly here
|
|
463
504
|
import numpy as np
|
|
464
|
-
|
|
505
|
+
|
|
465
506
|
# Get parameters
|
|
466
507
|
charge_max = params.get("charge_max")
|
|
467
508
|
retention_max_diff = params.get("retention_max_diff")
|
|
468
509
|
mass_max_diff = params.get("mass_max_diff")
|
|
469
510
|
unit = params.get("unit")
|
|
470
|
-
|
|
511
|
+
|
|
471
512
|
# Sort features by RT for efficient RT-sweep processing (OpenMS approach)
|
|
472
513
|
# Store original row positions before sorting for correct index mapping
|
|
473
514
|
features_with_positions = self.features_df.with_row_index("original_position")
|
|
474
515
|
features_sorted = features_with_positions.sort("rt")
|
|
475
516
|
n_features = len(features_sorted)
|
|
476
|
-
|
|
517
|
+
|
|
477
518
|
# Extract arrays for fast processing
|
|
478
519
|
feature_mzs = features_sorted.select("mz").to_numpy().flatten()
|
|
479
520
|
feature_rts = features_sorted.select("rt").to_numpy().flatten()
|
|
480
|
-
|
|
521
|
+
|
|
481
522
|
# Convert adducts to arrays for vectorized operations
|
|
482
523
|
adduct_mass_shifts = adducts_df.select("mass_shift").to_numpy().flatten()
|
|
483
524
|
adduct_charges = adducts_df.select("charge").to_numpy().flatten()
|
|
484
525
|
adduct_names = adducts_df.select("name").to_series().to_list()
|
|
485
526
|
adduct_probs = adducts_df.select("probability").to_numpy().flatten()
|
|
486
527
|
|
|
487
|
-
self.logger.debug(
|
|
488
|
-
|
|
528
|
+
self.logger.debug(
|
|
529
|
+
f"RT-sweep processing: {n_features} features × {len(adducts_df)} adduct combinations",
|
|
530
|
+
)
|
|
531
|
+
|
|
489
532
|
# Phase 1: RT-sweep line algorithm with early RT filtering (fixes OpenMS flaw #1)
|
|
490
533
|
candidate_edges = []
|
|
491
|
-
|
|
534
|
+
|
|
492
535
|
for i_rt in range(n_features):
|
|
493
536
|
mz1 = feature_mzs[i_rt]
|
|
494
537
|
rt1 = feature_rts[i_rt]
|
|
495
|
-
|
|
538
|
+
|
|
496
539
|
# RT-window sweep: only check features within RT tolerance (early filtering)
|
|
497
540
|
for j_rt in range(i_rt + 1, n_features):
|
|
498
541
|
rt2 = feature_rts[j_rt]
|
|
499
542
|
rt_diff = rt2 - rt1
|
|
500
|
-
|
|
543
|
+
|
|
501
544
|
# Early RT constraint check (fixes OpenMS issue where RT was checked too late)
|
|
502
545
|
if rt_diff > retention_max_diff:
|
|
503
546
|
break # Features are RT-sorted, so no more valid pairs
|
|
504
|
-
|
|
547
|
+
|
|
505
548
|
mz2 = feature_mzs[j_rt]
|
|
506
|
-
|
|
549
|
+
|
|
507
550
|
# Phase 2: Check for valid mass relationships with strict tolerance (fixes OpenMS flaw #2)
|
|
508
551
|
for adduct_idx, mass_shift in enumerate(adduct_mass_shifts):
|
|
509
552
|
charge = adduct_charges[adduct_idx]
|
|
510
|
-
|
|
553
|
+
|
|
511
554
|
# Calculate mass tolerance (per feature, as in OpenMS)
|
|
512
|
-
if unit ==
|
|
555
|
+
if unit == "ppm":
|
|
513
556
|
tol1 = mass_max_diff * mz1 * 1e-6
|
|
514
557
|
tol2 = mass_max_diff * mz2 * 1e-6
|
|
515
558
|
combined_tolerance = tol1 + tol2
|
|
516
559
|
else: # Da
|
|
517
|
-
combined_tolerance =
|
|
518
|
-
|
|
560
|
+
combined_tolerance = (
|
|
561
|
+
2 * mass_max_diff
|
|
562
|
+
) # Combined tolerance for both features
|
|
563
|
+
|
|
519
564
|
# Check both directions of mass relationship
|
|
520
565
|
if charge != 0:
|
|
521
566
|
# For charged adducts: m/z relationship
|
|
522
|
-
mass_diff_12 = (mz2 * abs(charge)) - (mz1 * abs(charge))
|
|
567
|
+
mass_diff_12 = (mz2 * abs(charge)) - (mz1 * abs(charge))
|
|
523
568
|
expected_mass_diff = mass_shift
|
|
524
|
-
|
|
569
|
+
|
|
525
570
|
if abs(mass_diff_12 - expected_mass_diff) <= combined_tolerance:
|
|
526
571
|
# Valid mass relationship found
|
|
527
|
-
candidate_edges.append(
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
572
|
+
candidate_edges.append(
|
|
573
|
+
{
|
|
574
|
+
"i": i_rt,
|
|
575
|
+
"j": j_rt,
|
|
576
|
+
"rt_diff": rt_diff,
|
|
577
|
+
"mass_error": abs(mass_diff_12 - expected_mass_diff),
|
|
578
|
+
"adduct_idx": adduct_idx,
|
|
579
|
+
"charge1": charge if mass_diff_12 > 0 else -charge,
|
|
580
|
+
"charge2": -charge if mass_diff_12 > 0 else charge,
|
|
581
|
+
"probability": adduct_probs[adduct_idx],
|
|
582
|
+
},
|
|
583
|
+
)
|
|
536
584
|
else:
|
|
537
585
|
# For neutral adducts: direct mass shift
|
|
538
586
|
mass_diff_12 = mz2 - mz1
|
|
539
587
|
if abs(mass_diff_12 - mass_shift) <= combined_tolerance:
|
|
540
|
-
candidate_edges.append(
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
588
|
+
candidate_edges.append(
|
|
589
|
+
{
|
|
590
|
+
"i": i_rt,
|
|
591
|
+
"j": j_rt,
|
|
592
|
+
"rt_diff": rt_diff,
|
|
593
|
+
"mass_error": abs(mass_diff_12 - mass_shift),
|
|
594
|
+
"adduct_idx": adduct_idx,
|
|
595
|
+
"charge1": 0,
|
|
596
|
+
"charge2": 0,
|
|
597
|
+
"probability": adduct_probs[adduct_idx],
|
|
598
|
+
},
|
|
599
|
+
)
|
|
600
|
+
|
|
601
|
+
self.logger.debug(
|
|
602
|
+
f"Found {len(candidate_edges)} candidate edges after RT+mass filtering",
|
|
603
|
+
)
|
|
604
|
+
|
|
551
605
|
if len(candidate_edges) == 0:
|
|
552
606
|
self.logger.info("No adduct relationships found")
|
|
553
607
|
return
|
|
@@ -557,23 +611,23 @@ def find_adducts(self, **kwargs):
|
|
|
557
611
|
adjacency = {}
|
|
558
612
|
for i in range(n_features):
|
|
559
613
|
adjacency[i] = []
|
|
560
|
-
|
|
614
|
+
|
|
561
615
|
for edge in candidate_edges:
|
|
562
|
-
i, j = edge[
|
|
616
|
+
i, j = edge["i"], edge["j"]
|
|
563
617
|
adjacency[i].append(j)
|
|
564
618
|
adjacency[j].append(i)
|
|
565
|
-
|
|
619
|
+
|
|
566
620
|
# Find connected components using DFS
|
|
567
621
|
visited = [False] * n_features
|
|
568
622
|
components = []
|
|
569
|
-
|
|
623
|
+
|
|
570
624
|
def dfs(node, component):
|
|
571
625
|
visited[node] = True
|
|
572
626
|
component.append(node)
|
|
573
627
|
for neighbor in adjacency[node]:
|
|
574
628
|
if not visited[neighbor]:
|
|
575
629
|
dfs(neighbor, component)
|
|
576
|
-
|
|
630
|
+
|
|
577
631
|
for i in range(n_features):
|
|
578
632
|
if not visited[i] and len(adjacency[i]) > 0:
|
|
579
633
|
component = []
|
|
@@ -589,53 +643,60 @@ def find_adducts(self, **kwargs):
|
|
|
589
643
|
group_assignments = [0] * n_features
|
|
590
644
|
mass_shift_assignments = [0.0] * n_features
|
|
591
645
|
neutral_mass_assignments = [0.0] * n_features
|
|
592
|
-
|
|
646
|
+
|
|
593
647
|
for group_id, component in enumerate(components, 1):
|
|
594
648
|
# Find the most likely base ion (highest intensity or lowest m/z as proxy)
|
|
595
649
|
component_mzs = [feature_mzs[idx] for idx in component]
|
|
596
650
|
base_idx_in_component = np.argmin(component_mzs) # Lowest m/z as base
|
|
597
651
|
base_feature_idx = component[base_idx_in_component]
|
|
598
652
|
base_mz = feature_mzs[base_feature_idx]
|
|
599
|
-
|
|
653
|
+
|
|
600
654
|
# Assign base ion
|
|
601
655
|
base_adduct = "[M+H]1+" if charge_max > 0 else "[M-H]1-"
|
|
602
656
|
base_charge = 1 if charge_max > 0 else -1
|
|
603
657
|
base_mass_shift = 1.007825 if charge_max > 0 else -1.007825 # H mass
|
|
604
|
-
|
|
658
|
+
|
|
605
659
|
adduct_assignments[base_feature_idx] = base_adduct
|
|
606
660
|
adduct_charges_assigned[base_feature_idx] = base_charge
|
|
607
661
|
group_assignments[base_feature_idx] = group_id
|
|
608
662
|
mass_shift_assignments[base_feature_idx] = base_mass_shift
|
|
609
|
-
|
|
663
|
+
|
|
610
664
|
# Calculate neutral mass for base ion
|
|
611
665
|
base_mz_measured = feature_mzs[base_feature_idx]
|
|
612
|
-
neutral_mass_assignments[base_feature_idx] =
|
|
613
|
-
|
|
666
|
+
neutral_mass_assignments[base_feature_idx] = (
|
|
667
|
+
base_mz_measured * abs(base_charge) - base_mass_shift
|
|
668
|
+
)
|
|
669
|
+
|
|
614
670
|
# Assign other features based on their relationships to base
|
|
615
671
|
for feature_idx in component:
|
|
616
672
|
if feature_idx == base_feature_idx:
|
|
617
673
|
continue
|
|
618
|
-
|
|
674
|
+
|
|
619
675
|
group_assignments[feature_idx] = group_id
|
|
620
|
-
|
|
676
|
+
|
|
621
677
|
# Find best adduct assignment based on mass difference and probability
|
|
622
678
|
feature_mz = feature_mzs[feature_idx]
|
|
623
679
|
best_score = -np.inf
|
|
624
680
|
best_assignment = "[M+?]1+"
|
|
625
681
|
best_charge = 1
|
|
626
682
|
best_mass_shift = 1.007825 # Default to H mass shift for [M+?]1+
|
|
627
|
-
|
|
683
|
+
|
|
628
684
|
# Check all possible adducts
|
|
629
|
-
for adduct_idx, (mass_shift, charge, name, prob) in enumerate(
|
|
630
|
-
|
|
631
|
-
|
|
685
|
+
for adduct_idx, (mass_shift, charge, name, prob) in enumerate(
|
|
686
|
+
zip(
|
|
687
|
+
adduct_mass_shifts,
|
|
688
|
+
adduct_charges,
|
|
689
|
+
adduct_names,
|
|
690
|
+
adduct_probs,
|
|
691
|
+
),
|
|
692
|
+
):
|
|
632
693
|
if charge != 0:
|
|
633
694
|
expected_mz = base_mz + mass_shift / abs(charge)
|
|
634
695
|
else:
|
|
635
696
|
expected_mz = base_mz + mass_shift
|
|
636
|
-
|
|
697
|
+
|
|
637
698
|
mass_error = abs(expected_mz - feature_mz)
|
|
638
|
-
|
|
699
|
+
|
|
639
700
|
# Combined score: probability + mass accuracy
|
|
640
701
|
if mass_error < mass_max_diff * 2: # Within tolerance
|
|
641
702
|
score = prob - mass_error * 0.1 # Weight mass accuracy
|
|
@@ -648,48 +709,52 @@ def find_adducts(self, **kwargs):
|
|
|
648
709
|
adduct_assignments[feature_idx] = best_assignment
|
|
649
710
|
adduct_charges_assigned[feature_idx] = best_charge
|
|
650
711
|
mass_shift_assignments[feature_idx] = best_mass_shift
|
|
651
|
-
|
|
712
|
+
|
|
652
713
|
# Calculate neutral mass
|
|
653
|
-
neutral_mass_assignments[feature_idx] =
|
|
714
|
+
neutral_mass_assignments[feature_idx] = (
|
|
715
|
+
feature_mz * abs(best_charge) - best_mass_shift
|
|
716
|
+
)
|
|
654
717
|
|
|
655
718
|
# Assign fallback adduct for features not processed in connected components (isolated features)
|
|
656
719
|
for i in range(n_features):
|
|
657
720
|
if adduct_assignments[i] is None:
|
|
658
721
|
fallback_charge = 1 if charge_max > 0 else -1
|
|
659
722
|
fallback_mass_shift = 1.007825 if charge_max > 0 else -1.007825 # Assume H
|
|
660
|
-
|
|
723
|
+
|
|
661
724
|
adduct_assignments[i] = "[M+?]1+"
|
|
662
725
|
adduct_charges_assigned[i] = fallback_charge
|
|
663
726
|
group_assignments[i] = 0 # No group assignment for isolated features
|
|
664
727
|
mass_shift_assignments[i] = fallback_mass_shift
|
|
665
|
-
|
|
728
|
+
|
|
666
729
|
# Calculate neutral mass for isolated features
|
|
667
730
|
feature_mz = feature_mzs[i]
|
|
668
|
-
neutral_mass_assignments[i] =
|
|
731
|
+
neutral_mass_assignments[i] = (
|
|
732
|
+
feature_mz * abs(fallback_charge) - fallback_mass_shift
|
|
733
|
+
)
|
|
669
734
|
|
|
670
735
|
# Map back to original feature order using stored positions
|
|
671
736
|
original_indices = features_sorted.select("original_position").to_numpy().flatten()
|
|
672
|
-
|
|
737
|
+
|
|
673
738
|
# Create final assignments in original order (same size as original DataFrame)
|
|
674
739
|
final_adducts = [None] * len(self.features_df)
|
|
675
740
|
final_charges = [0] * len(self.features_df)
|
|
676
741
|
final_groups = [0] * len(self.features_df)
|
|
677
742
|
final_mass_shifts = [0.0] * len(self.features_df)
|
|
678
743
|
final_neutral_masses = [0.0] * len(self.features_df)
|
|
679
|
-
|
|
744
|
+
|
|
680
745
|
for sorted_idx, orig_idx in enumerate(original_indices):
|
|
681
746
|
final_adducts[orig_idx] = adduct_assignments[sorted_idx]
|
|
682
747
|
final_charges[orig_idx] = adduct_charges_assigned[sorted_idx]
|
|
683
748
|
final_groups[orig_idx] = group_assignments[sorted_idx]
|
|
684
749
|
final_mass_shifts[orig_idx] = mass_shift_assignments[sorted_idx]
|
|
685
750
|
final_neutral_masses[orig_idx] = neutral_mass_assignments[sorted_idx]
|
|
686
|
-
|
|
751
|
+
|
|
687
752
|
# Update features DataFrame with correct column ordering
|
|
688
753
|
# Insert adduct columns in the specified order after iso_of column
|
|
689
|
-
|
|
754
|
+
|
|
690
755
|
# Get current columns
|
|
691
756
|
current_columns = self.features_df.columns
|
|
692
|
-
|
|
757
|
+
|
|
693
758
|
# Find the position of iso_of column
|
|
694
759
|
try:
|
|
695
760
|
iso_of_index = current_columns.index("iso_of")
|
|
@@ -698,42 +763,51 @@ def find_adducts(self, **kwargs):
|
|
|
698
763
|
# If iso_of doesn't exist, append at the end
|
|
699
764
|
insert_position = len(current_columns)
|
|
700
765
|
self.logger.warning("iso_of column not found, adding adduct columns at the end")
|
|
701
|
-
|
|
766
|
+
|
|
702
767
|
# Remove any existing adduct columns first
|
|
703
|
-
adduct_column_names = [
|
|
704
|
-
|
|
705
|
-
|
|
768
|
+
adduct_column_names = [
|
|
769
|
+
"adduct",
|
|
770
|
+
"adduct_charge",
|
|
771
|
+
"adduct_mass_shift",
|
|
772
|
+
"adduct_mass_neutral",
|
|
773
|
+
"adduct_group",
|
|
774
|
+
]
|
|
775
|
+
df_without_adducts = self.features_df.select(
|
|
776
|
+
[col for col in current_columns if col not in adduct_column_names],
|
|
777
|
+
)
|
|
778
|
+
|
|
706
779
|
# Split columns at insertion point
|
|
707
780
|
columns_before = df_without_adducts.columns[:insert_position]
|
|
708
781
|
columns_after = df_without_adducts.columns[insert_position:]
|
|
709
|
-
|
|
782
|
+
|
|
710
783
|
# Create the new column order with adduct columns in the correct position
|
|
711
|
-
new_column_order = (
|
|
712
|
-
|
|
713
|
-
adduct_column_names +
|
|
714
|
-
list(columns_after)
|
|
715
|
-
)
|
|
716
|
-
|
|
784
|
+
new_column_order = list(columns_before) + adduct_column_names + list(columns_after)
|
|
785
|
+
|
|
717
786
|
# Add adduct columns to the dataframe
|
|
718
|
-
self.features_df = df_without_adducts.with_columns(
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
787
|
+
self.features_df = df_without_adducts.with_columns(
|
|
788
|
+
[
|
|
789
|
+
pl.Series("adduct", final_adducts),
|
|
790
|
+
pl.Series("adduct_charge", final_charges),
|
|
791
|
+
pl.Series("adduct_mass_shift", final_mass_shifts),
|
|
792
|
+
pl.Series("adduct_mass_neutral", final_neutral_masses),
|
|
793
|
+
pl.Series("adduct_group", final_groups),
|
|
794
|
+
],
|
|
795
|
+
).select(new_column_order)
|
|
725
796
|
|
|
726
797
|
# Summary statistics
|
|
727
798
|
total_with_adducts = sum(1 for x in final_adducts if x is not None)
|
|
728
799
|
total_groups = max(final_groups) if final_groups else 0
|
|
729
|
-
|
|
730
|
-
self.logger.info(
|
|
800
|
+
|
|
801
|
+
self.logger.info(
|
|
802
|
+
f"Adduct detection completed: {total_with_adducts} features with adducts in {total_groups} groups",
|
|
803
|
+
)
|
|
731
804
|
|
|
732
805
|
# Store parameters including the actual processed adducts list
|
|
733
806
|
history_params = params.to_dict()
|
|
734
807
|
# Convert the filtered adducts dataframe to a list of adduct specifications for history
|
|
735
|
-
history_params[
|
|
736
|
-
|
|
808
|
+
history_params["adducts"] = adducts_df.select(
|
|
809
|
+
["name", "charge", "mass_shift", "probability"],
|
|
810
|
+
).to_dicts()
|
|
811
|
+
|
|
737
812
|
self.store_history(["find_adducts"], history_params)
|
|
738
813
|
self.logger.debug("Parameters stored successfully")
|
|
739
|
-
|