masster 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +3 -9
- masster/data/libs/README.md +1 -1
- masster/data/libs/ccm.csv +120 -120
- masster/data/libs/ccm.py +116 -62
- masster/data/libs/central_carbon_README.md +1 -1
- masster/data/libs/urine.py +161 -65
- masster/data/libs/urine_metabolites.csv +4693 -4693
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
- masster/logger.py +43 -78
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +264 -338
- masster/sample/defaults/find_adducts_def.py +8 -21
- masster/sample/defaults/find_features_def.py +1 -6
- masster/sample/defaults/get_spectrum_def.py +1 -5
- masster/sample/defaults/sample_def.py +1 -5
- masster/sample/h5.py +282 -561
- masster/sample/helpers.py +75 -131
- masster/sample/lib.py +17 -42
- masster/sample/load.py +17 -31
- masster/sample/parameters.py +2 -6
- masster/sample/plot.py +27 -88
- masster/sample/processing.py +87 -117
- masster/sample/quant.py +51 -57
- masster/sample/sample.py +90 -103
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +12 -35
- masster/sample/sciex.py +19 -66
- masster/spectrum.py +20 -58
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +1 -5
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/fill_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/integrate_def.py +1 -5
- masster/study/defaults/study_def.py +25 -58
- masster/study/export.py +207 -233
- masster/study/h5.py +136 -470
- masster/study/helpers.py +202 -495
- masster/study/helpers_optimized.py +13 -40
- masster/study/id.py +110 -213
- masster/study/load.py +143 -230
- masster/study/plot.py +257 -518
- masster/study/processing.py +257 -469
- masster/study/save.py +5 -15
- masster/study/study.py +276 -379
- masster/study/study5_schema.json +96 -96
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
- masster-0.4.1.dist-info/RECORD +67 -0
- masster-0.4.0.dist-info/RECORD +0 -67
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/sample/adducts.py
CHANGED
|
@@ -14,24 +14,24 @@ Functions:
|
|
|
14
14
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
import polars as pl
|
|
17
|
-
from typing import List, Dict
|
|
17
|
+
from typing import List, Dict, Any
|
|
18
18
|
from itertools import combinations
|
|
19
19
|
|
|
20
20
|
# Import defaults class for external use
|
|
21
|
-
from
|
|
21
|
+
from masster.sample.defaults.find_adducts_def import find_adducts_defaults
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
25
25
|
"""
|
|
26
26
|
Generate comprehensive adduct specifications including multiply charged species and combinations.
|
|
27
|
-
|
|
27
|
+
|
|
28
28
|
This method consolidates all adduct generation logic into a single optimized helper
|
|
29
29
|
that produces a polars DataFrame with all possible adduct combinations, properly
|
|
30
30
|
formatted names like [M+H]1+ or [M-H2O+2H]2+, and respecting charge constraints.
|
|
31
|
-
|
|
31
|
+
|
|
32
32
|
Uses parameters from find_adducts_defaults() by default, which can be overridden
|
|
33
33
|
by providing keyword arguments.
|
|
34
|
-
|
|
34
|
+
|
|
35
35
|
Parameters
|
|
36
36
|
----------
|
|
37
37
|
adducts_list : List[str], optional
|
|
@@ -42,7 +42,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
|
42
42
|
- charge_min: Minimum charge to consider (default from find_adducts_defaults)
|
|
43
43
|
- charge_max: Maximum charge to consider (default from find_adducts_defaults)
|
|
44
44
|
- max_combinations: Maximum number of adduct components to combine (default 4)
|
|
45
|
-
|
|
45
|
+
|
|
46
46
|
Returns
|
|
47
47
|
-------
|
|
48
48
|
pl.DataFrame
|
|
@@ -56,201 +56,177 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
|
56
56
|
"""
|
|
57
57
|
# Get default parameters from find_adducts_defaults
|
|
58
58
|
defaults = self.find_adducts_defaults()
|
|
59
|
-
|
|
59
|
+
|
|
60
60
|
# Use provided parameters or defaults
|
|
61
61
|
if adducts_list is None:
|
|
62
62
|
adducts_list = defaults.get_openms_adducts()
|
|
63
|
-
|
|
64
|
-
charge_min = kwargs.get(
|
|
65
|
-
charge_max = kwargs.get(
|
|
66
|
-
max_combinations = kwargs.get(
|
|
67
|
-
|
|
63
|
+
|
|
64
|
+
charge_min = kwargs.get('charge_min', defaults.charge_min)
|
|
65
|
+
charge_max = kwargs.get('charge_max', defaults.charge_max)
|
|
66
|
+
max_combinations = kwargs.get('max_combinations', 4)
|
|
67
|
+
|
|
68
68
|
# Parse base adduct specifications
|
|
69
69
|
base_specs = []
|
|
70
|
-
|
|
70
|
+
|
|
71
71
|
for adduct_str in adducts_list:
|
|
72
|
-
if not isinstance(adduct_str, str) or
|
|
72
|
+
if not isinstance(adduct_str, str) or ':' not in adduct_str:
|
|
73
73
|
continue
|
|
74
|
-
|
|
74
|
+
|
|
75
75
|
try:
|
|
76
|
-
parts = adduct_str.split(
|
|
76
|
+
parts = adduct_str.split(':')
|
|
77
77
|
if len(parts) != 3:
|
|
78
78
|
continue
|
|
79
|
-
|
|
79
|
+
|
|
80
80
|
formula_part = parts[0]
|
|
81
|
-
charge = int(parts[1])
|
|
81
|
+
charge = int(parts[1])
|
|
82
82
|
probability = float(parts[2])
|
|
83
|
-
|
|
83
|
+
|
|
84
84
|
# Calculate mass shift from formula
|
|
85
85
|
mass_shift = _calculate_formula_mass_shift(formula_part)
|
|
86
|
-
|
|
87
|
-
base_specs.append(
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
)
|
|
96
|
-
|
|
86
|
+
|
|
87
|
+
base_specs.append({
|
|
88
|
+
'formula': formula_part,
|
|
89
|
+
'charge': charge,
|
|
90
|
+
'mass_shift': mass_shift,
|
|
91
|
+
'probability': probability,
|
|
92
|
+
'raw_string': adduct_str
|
|
93
|
+
})
|
|
94
|
+
|
|
97
95
|
except (ValueError, IndexError):
|
|
98
96
|
continue
|
|
99
|
-
|
|
97
|
+
|
|
100
98
|
# Generate all valid combinations
|
|
101
99
|
combinations_list = []
|
|
102
|
-
|
|
100
|
+
|
|
103
101
|
# Separate specs by charge type
|
|
104
|
-
positive_specs = [spec for spec in base_specs if spec[
|
|
105
|
-
negative_specs = [spec for spec in base_specs if spec[
|
|
106
|
-
neutral_specs = [spec for spec in base_specs if spec[
|
|
107
|
-
|
|
102
|
+
positive_specs = [spec for spec in base_specs if spec['charge'] > 0]
|
|
103
|
+
negative_specs = [spec for spec in base_specs if spec['charge'] < 0]
|
|
104
|
+
neutral_specs = [spec for spec in base_specs if spec['charge'] == 0]
|
|
105
|
+
|
|
108
106
|
# 1. Single adducts
|
|
109
107
|
for spec in base_specs:
|
|
110
|
-
if charge_min <= spec[
|
|
108
|
+
if charge_min <= spec['charge'] <= charge_max:
|
|
111
109
|
formatted_name = _format_adduct_name([spec])
|
|
112
|
-
combinations_list.append(
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
)
|
|
122
|
-
|
|
110
|
+
combinations_list.append({
|
|
111
|
+
'components': [spec],
|
|
112
|
+
'formatted_name': formatted_name,
|
|
113
|
+
'total_mass_shift': spec['mass_shift'],
|
|
114
|
+
'total_charge': spec['charge'],
|
|
115
|
+
'combined_probability': spec['probability'],
|
|
116
|
+
'complexity': 1
|
|
117
|
+
})
|
|
118
|
+
|
|
123
119
|
# 2. Generate multiply charged versions (2H+, 3H+, etc.)
|
|
124
120
|
for spec in positive_specs + negative_specs:
|
|
125
|
-
base_charge = spec[
|
|
121
|
+
base_charge = spec['charge']
|
|
126
122
|
for multiplier in range(2, min(max_combinations + 1, 5)):
|
|
127
123
|
total_charge = base_charge * multiplier
|
|
128
124
|
if charge_min <= total_charge <= charge_max:
|
|
129
125
|
components = [spec] * multiplier
|
|
130
126
|
formatted_name = _format_adduct_name(components)
|
|
131
|
-
|
|
132
|
-
combinations_list.append(
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
)
|
|
142
|
-
|
|
127
|
+
|
|
128
|
+
combinations_list.append({
|
|
129
|
+
'components': components,
|
|
130
|
+
'formatted_name': formatted_name,
|
|
131
|
+
'total_mass_shift': spec['mass_shift'] * multiplier,
|
|
132
|
+
'total_charge': total_charge,
|
|
133
|
+
'combined_probability': spec['probability'] ** multiplier,
|
|
134
|
+
'complexity': multiplier
|
|
135
|
+
})
|
|
136
|
+
|
|
143
137
|
# 3. Mixed combinations (2-component)
|
|
144
138
|
if max_combinations >= 2:
|
|
145
139
|
# Positive + Neutral
|
|
146
140
|
for pos_spec in positive_specs:
|
|
147
141
|
for neut_spec in neutral_specs:
|
|
148
|
-
total_charge = pos_spec[
|
|
142
|
+
total_charge = pos_spec['charge'] + neut_spec['charge']
|
|
149
143
|
if charge_min <= total_charge <= charge_max:
|
|
150
144
|
components = [pos_spec, neut_spec]
|
|
151
145
|
formatted_name = _format_adduct_name(components)
|
|
152
|
-
combinations_list.append(
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
"complexity": 2,
|
|
162
|
-
},
|
|
163
|
-
)
|
|
164
|
-
|
|
146
|
+
combinations_list.append({
|
|
147
|
+
'components': components,
|
|
148
|
+
'formatted_name': formatted_name,
|
|
149
|
+
'total_mass_shift': pos_spec['mass_shift'] + neut_spec['mass_shift'],
|
|
150
|
+
'total_charge': total_charge,
|
|
151
|
+
'combined_probability': pos_spec['probability'] * neut_spec['probability'],
|
|
152
|
+
'complexity': 2
|
|
153
|
+
})
|
|
154
|
+
|
|
165
155
|
# Different charged species
|
|
166
156
|
for combo in combinations(positive_specs, 2):
|
|
167
|
-
if combo[0][
|
|
168
|
-
total_charge = combo[0][
|
|
157
|
+
if combo[0]['formula'] != combo[1]['formula']:
|
|
158
|
+
total_charge = combo[0]['charge'] + combo[1]['charge']
|
|
169
159
|
if charge_min <= total_charge <= charge_max:
|
|
170
160
|
components = list(combo)
|
|
171
161
|
formatted_name = _format_adduct_name(components)
|
|
172
|
-
combinations_list.append(
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
"complexity": 2,
|
|
182
|
-
},
|
|
183
|
-
)
|
|
184
|
-
|
|
162
|
+
combinations_list.append({
|
|
163
|
+
'components': components,
|
|
164
|
+
'formatted_name': formatted_name,
|
|
165
|
+
'total_mass_shift': combo[0]['mass_shift'] + combo[1]['mass_shift'],
|
|
166
|
+
'total_charge': total_charge,
|
|
167
|
+
'combined_probability': combo[0]['probability'] * combo[1]['probability'],
|
|
168
|
+
'complexity': 2
|
|
169
|
+
})
|
|
170
|
+
|
|
185
171
|
# 4. 3-component combinations (limited for performance)
|
|
186
172
|
if max_combinations >= 3:
|
|
187
173
|
for pos_spec in positive_specs[:2]:
|
|
188
174
|
for neut_combo in combinations(neutral_specs[:2], 2):
|
|
189
175
|
components = [pos_spec] + list(neut_combo)
|
|
190
|
-
total_charge = sum(spec[
|
|
191
|
-
|
|
176
|
+
total_charge = sum(spec['charge'] for spec in components)
|
|
177
|
+
|
|
192
178
|
if charge_min <= total_charge <= charge_max:
|
|
193
179
|
formatted_name = _format_adduct_name(components)
|
|
194
|
-
total_mass_shift = sum(spec[
|
|
195
|
-
combined_prob = np.prod(
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
"complexity": 3,
|
|
207
|
-
},
|
|
208
|
-
)
|
|
209
|
-
|
|
180
|
+
total_mass_shift = sum(spec['mass_shift'] for spec in components)
|
|
181
|
+
combined_prob = np.prod([spec['probability'] for spec in components])
|
|
182
|
+
|
|
183
|
+
combinations_list.append({
|
|
184
|
+
'components': components,
|
|
185
|
+
'formatted_name': formatted_name,
|
|
186
|
+
'total_mass_shift': total_mass_shift,
|
|
187
|
+
'total_charge': total_charge,
|
|
188
|
+
'combined_probability': combined_prob,
|
|
189
|
+
'complexity': 3
|
|
190
|
+
})
|
|
191
|
+
|
|
210
192
|
# Convert to polars DataFrame
|
|
211
193
|
if combinations_list:
|
|
212
|
-
combinations_list.sort(
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
}
|
|
226
|
-
for combo in combinations_list
|
|
227
|
-
],
|
|
228
|
-
)
|
|
194
|
+
combinations_list.sort(key=lambda x: (-x['combined_probability'], x['complexity']))
|
|
195
|
+
|
|
196
|
+
adducts_df = pl.DataFrame([
|
|
197
|
+
{
|
|
198
|
+
'name': combo['formatted_name'],
|
|
199
|
+
'charge': combo['total_charge'],
|
|
200
|
+
'mass_shift': combo['total_mass_shift'],
|
|
201
|
+
'probability': combo['combined_probability'],
|
|
202
|
+
'complexity': combo['complexity'],
|
|
203
|
+
'components': combo['components']
|
|
204
|
+
}
|
|
205
|
+
for combo in combinations_list
|
|
206
|
+
])
|
|
229
207
|
else:
|
|
230
208
|
# Return empty DataFrame with correct schema
|
|
231
|
-
adducts_df = pl.DataFrame(
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
)
|
|
241
|
-
|
|
209
|
+
adducts_df = pl.DataFrame({
|
|
210
|
+
'name': [],
|
|
211
|
+
'charge': [],
|
|
212
|
+
'mass_shift': [],
|
|
213
|
+
'probability': [],
|
|
214
|
+
'complexity': [],
|
|
215
|
+
'components': []
|
|
216
|
+
})
|
|
217
|
+
|
|
242
218
|
return adducts_df
|
|
243
219
|
|
|
244
220
|
|
|
245
221
|
def _calculate_formula_mass_shift(formula: str) -> float:
|
|
246
222
|
"""
|
|
247
223
|
Calculate mass shift from formula string like "+H", "-H2O", "+Na-H", etc.
|
|
248
|
-
|
|
224
|
+
|
|
249
225
|
Parameters
|
|
250
226
|
----------
|
|
251
227
|
formula : str
|
|
252
228
|
Formula string (e.g., "+H", "-H2O", "+Na-H")
|
|
253
|
-
|
|
229
|
+
|
|
254
230
|
Returns
|
|
255
231
|
-------
|
|
256
232
|
float
|
|
@@ -258,59 +234,59 @@ def _calculate_formula_mass_shift(formula: str) -> float:
|
|
|
258
234
|
"""
|
|
259
235
|
# Standard atomic masses
|
|
260
236
|
atomic_masses = {
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
237
|
+
'H': 1.007825,
|
|
238
|
+
'C': 12.0,
|
|
239
|
+
'N': 14.003074,
|
|
240
|
+
'O': 15.994915,
|
|
241
|
+
'Na': 22.989769,
|
|
242
|
+
'K': 38.963707,
|
|
243
|
+
'Li': 7.016003,
|
|
244
|
+
'Ca': 39.962591,
|
|
245
|
+
'Mg': 23.985042,
|
|
246
|
+
'Fe': 55.934938,
|
|
247
|
+
'Cl': 34.968853,
|
|
248
|
+
'Br': 78.918336,
|
|
249
|
+
'I': 126.904473,
|
|
250
|
+
'P': 30.973762,
|
|
251
|
+
'S': 31.972071
|
|
276
252
|
}
|
|
277
|
-
|
|
253
|
+
|
|
278
254
|
total_mass = 0.0
|
|
279
|
-
|
|
255
|
+
|
|
280
256
|
# Parse formula by splitting on + and - while preserving the operators
|
|
281
257
|
parts = []
|
|
282
258
|
current_part = ""
|
|
283
259
|
current_sign = 1
|
|
284
|
-
|
|
260
|
+
|
|
285
261
|
for char in formula:
|
|
286
|
-
if char ==
|
|
262
|
+
if char == '+':
|
|
287
263
|
if current_part:
|
|
288
264
|
parts.append((current_sign, current_part))
|
|
289
265
|
current_part = ""
|
|
290
266
|
current_sign = 1
|
|
291
|
-
elif char ==
|
|
267
|
+
elif char == '-':
|
|
292
268
|
if current_part:
|
|
293
269
|
parts.append((current_sign, current_part))
|
|
294
270
|
current_part = ""
|
|
295
271
|
current_sign = -1
|
|
296
272
|
else:
|
|
297
273
|
current_part += char
|
|
298
|
-
|
|
274
|
+
|
|
299
275
|
if current_part:
|
|
300
276
|
parts.append((current_sign, current_part))
|
|
301
|
-
|
|
277
|
+
|
|
302
278
|
# Process each part
|
|
303
279
|
for sign, part in parts:
|
|
304
280
|
if not part:
|
|
305
281
|
continue
|
|
306
|
-
|
|
282
|
+
|
|
307
283
|
# Parse element and count (e.g., "H2O" -> H:2, O:1)
|
|
308
284
|
elements = _parse_element_counts(part)
|
|
309
|
-
|
|
285
|
+
|
|
310
286
|
for element, count in elements.items():
|
|
311
287
|
if element in atomic_masses:
|
|
312
288
|
total_mass += sign * atomic_masses[element] * count
|
|
313
|
-
|
|
289
|
+
|
|
314
290
|
return total_mass
|
|
315
291
|
|
|
316
292
|
|
|
@@ -318,25 +294,25 @@ def _parse_element_counts(formula_part: str) -> Dict[str, int]:
|
|
|
318
294
|
"""Parse element counts from a formula part like 'H2O' -> {'H': 2, 'O': 1}"""
|
|
319
295
|
elements = {}
|
|
320
296
|
i = 0
|
|
321
|
-
|
|
297
|
+
|
|
322
298
|
while i < len(formula_part):
|
|
323
299
|
# Get element (uppercase letter, possibly followed by lowercase)
|
|
324
300
|
element = formula_part[i]
|
|
325
301
|
i += 1
|
|
326
|
-
|
|
302
|
+
|
|
327
303
|
while i < len(formula_part) and formula_part[i].islower():
|
|
328
304
|
element += formula_part[i]
|
|
329
305
|
i += 1
|
|
330
|
-
|
|
306
|
+
|
|
331
307
|
# Get count (digits following element)
|
|
332
308
|
count_str = ""
|
|
333
309
|
while i < len(formula_part) and formula_part[i].isdigit():
|
|
334
310
|
count_str += formula_part[i]
|
|
335
311
|
i += 1
|
|
336
|
-
|
|
312
|
+
|
|
337
313
|
count = int(count_str) if count_str else 1
|
|
338
314
|
elements[element] = elements.get(element, 0) + count
|
|
339
|
-
|
|
315
|
+
|
|
340
316
|
return elements
|
|
341
317
|
|
|
342
318
|
|
|
@@ -344,56 +320,51 @@ def _format_adduct_name(components: List[Dict]) -> str:
|
|
|
344
320
|
"""Format adduct name from components like [M+H]1+ or [M+2H]2+ or [M+2(H+Na)]3+"""
|
|
345
321
|
if not components:
|
|
346
322
|
return "[M]"
|
|
347
|
-
|
|
323
|
+
|
|
348
324
|
# Count occurrences of each formula
|
|
349
325
|
from collections import Counter
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
326
|
+
formula_counts = Counter(comp['formula'] for comp in components)
|
|
327
|
+
total_charge = sum(comp['charge'] for comp in components)
|
|
328
|
+
|
|
354
329
|
# Build formula part with proper multipliers
|
|
355
330
|
formula_parts = []
|
|
356
|
-
for formula, count in sorted(
|
|
357
|
-
formula_counts.items(),
|
|
358
|
-
): # Sort for consistent ordering
|
|
331
|
+
for formula, count in sorted(formula_counts.items()): # Sort for consistent ordering
|
|
359
332
|
if count == 1:
|
|
360
333
|
formula_parts.append(formula)
|
|
361
334
|
else:
|
|
362
335
|
# For multiple occurrences, use count prefix (e.g., 2H, 3Na)
|
|
363
336
|
# Handle special case where formula might already start with + or -
|
|
364
|
-
if formula.startswith((
|
|
337
|
+
if formula.startswith(('+', '-')):
|
|
365
338
|
sign = formula[0]
|
|
366
339
|
base_formula = formula[1:]
|
|
367
340
|
formula_parts.append(f"{sign}{count}{base_formula}")
|
|
368
341
|
else:
|
|
369
342
|
formula_parts.append(f"{count}{formula}")
|
|
370
|
-
|
|
343
|
+
|
|
371
344
|
# Combine formula parts
|
|
372
345
|
formula = "".join(formula_parts)
|
|
373
|
-
|
|
346
|
+
|
|
374
347
|
# Format charge
|
|
375
348
|
if total_charge == 0:
|
|
376
349
|
charge_str = ""
|
|
377
350
|
elif abs(total_charge) == 1:
|
|
378
351
|
charge_str = "1+" if total_charge > 0 else "1-"
|
|
379
352
|
else:
|
|
380
|
-
charge_str = (
|
|
381
|
-
|
|
382
|
-
)
|
|
383
|
-
|
|
353
|
+
charge_str = f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
|
|
354
|
+
|
|
384
355
|
return f"[M{formula}]{charge_str}"
|
|
385
356
|
|
|
386
357
|
|
|
387
358
|
def find_adducts(self, **kwargs):
|
|
388
359
|
"""Detect adduct relationships among detected features using improved OpenMS-like algorithm.
|
|
389
360
|
|
|
390
|
-
This method implements a corrected version of the OpenMS MetaboliteFeatureDeconvolution
|
|
361
|
+
This method implements a corrected version of the OpenMS MetaboliteFeatureDeconvolution
|
|
391
362
|
algorithm that properly enforces RT constraints and avoids the mass tolerance dominance
|
|
392
363
|
issues present in the original C++ implementation.
|
|
393
364
|
|
|
394
365
|
Key improvements over OpenMS:
|
|
395
366
|
- Early RT filtering prevents expensive mass calculations for temporally incompatible features
|
|
396
|
-
- Strict mass tolerance (0.01 Da default) prevents inappropriate mass relationships
|
|
367
|
+
- Strict mass tolerance (0.01 Da default) prevents inappropriate mass relationships
|
|
397
368
|
- RT constraints are properly enforced throughout the algorithm
|
|
398
369
|
- Connected components analysis respects both mass AND RT constraints
|
|
399
370
|
- Probability-based scoring for adduct assignment
|
|
@@ -410,9 +381,9 @@ def find_adducts(self, **kwargs):
|
|
|
410
381
|
Side effects:
|
|
411
382
|
Updates ``self.features_df`` with adduct information columns.
|
|
412
383
|
"""
|
|
413
|
-
# Initialize parameters
|
|
384
|
+
# Initialize parameters
|
|
414
385
|
params = find_adducts_defaults()
|
|
415
|
-
|
|
386
|
+
|
|
416
387
|
for key, value in kwargs.items():
|
|
417
388
|
if isinstance(value, find_adducts_defaults):
|
|
418
389
|
params = value
|
|
@@ -422,28 +393,24 @@ def find_adducts(self, **kwargs):
|
|
|
422
393
|
if params.set(key, value, validate=True):
|
|
423
394
|
self.logger.debug(f"Updated parameter {key} = {value}")
|
|
424
395
|
else:
|
|
425
|
-
self.logger.warning(
|
|
426
|
-
f"Failed to set parameter {key} = {value} (validation failed)",
|
|
427
|
-
)
|
|
396
|
+
self.logger.warning(f"Failed to set parameter {key} = {value} (validation failed)")
|
|
428
397
|
else:
|
|
429
398
|
self.logger.warning(f"Unknown parameter {key} ignored")
|
|
430
399
|
|
|
431
400
|
# Check if features_df exists and has data
|
|
432
|
-
if not hasattr(self,
|
|
433
|
-
self.logger.warning(
|
|
434
|
-
"No features available for adduct detection. Run find_features() first.",
|
|
435
|
-
)
|
|
401
|
+
if not hasattr(self, 'features_df') or len(self.features_df) == 0:
|
|
402
|
+
self.logger.warning("No features available for adduct detection. Run find_features() first.")
|
|
436
403
|
return
|
|
437
404
|
|
|
438
405
|
self.logger.info("Adduct detection...")
|
|
439
406
|
|
|
440
407
|
# Validate required columns
|
|
441
|
-
required_cols = [
|
|
408
|
+
required_cols = ['mz', 'rt']
|
|
442
409
|
missing_cols = [col for col in required_cols if col not in self.features_df.columns]
|
|
443
410
|
if missing_cols:
|
|
444
411
|
self.logger.error(f"Required columns missing from features_df: {missing_cols}")
|
|
445
412
|
return
|
|
446
|
-
|
|
413
|
+
|
|
447
414
|
# Check if we have any features to process
|
|
448
415
|
if len(self.features_df) == 0:
|
|
449
416
|
self.logger.warning("No features available for adduct detection")
|
|
@@ -457,151 +424,130 @@ def find_adducts(self, **kwargs):
|
|
|
457
424
|
|
|
458
425
|
# Get parameters
|
|
459
426
|
adducts_list = params.get_openms_adducts()
|
|
460
|
-
charge_min = params.get("charge_min")
|
|
427
|
+
charge_min = params.get("charge_min")
|
|
461
428
|
charge_max = params.get("charge_max")
|
|
462
429
|
retention_max_diff = params.get("retention_max_diff")
|
|
463
430
|
mass_max_diff = params.get("mass_max_diff")
|
|
464
431
|
unit = params.get("unit")
|
|
465
432
|
min_probability = params.get("min_probability")
|
|
466
433
|
|
|
467
|
-
self.logger.debug(
|
|
468
|
-
|
|
469
|
-
)
|
|
470
|
-
self.logger.debug(
|
|
471
|
-
f"RT tolerance: {retention_max_diff}s, Mass tolerance: {mass_max_diff} {unit}",
|
|
472
|
-
)
|
|
434
|
+
self.logger.debug(f"Processing {len(self.features_df)} features with {len(adducts_list)} base adducts")
|
|
435
|
+
self.logger.debug(f"RT tolerance: {retention_max_diff}s, Mass tolerance: {mass_max_diff} {unit}")
|
|
473
436
|
self.logger.debug(f"Min probability threshold: {min_probability}")
|
|
474
437
|
|
|
475
438
|
# Generate comprehensive adduct specifications using the Sample method
|
|
476
439
|
adducts_df = self._get_adducts(
|
|
477
440
|
adducts_list=adducts_list,
|
|
478
|
-
charge_min=charge_min,
|
|
441
|
+
charge_min=charge_min,
|
|
479
442
|
charge_max=charge_max,
|
|
480
|
-
max_combinations=4
|
|
443
|
+
max_combinations=4
|
|
481
444
|
)
|
|
482
|
-
|
|
445
|
+
|
|
483
446
|
self.logger.debug(f"Generated {len(adducts_df)} total adduct combinations")
|
|
484
|
-
|
|
447
|
+
|
|
485
448
|
# Filter adducts by minimum probability threshold
|
|
486
449
|
if min_probability > 0.0:
|
|
487
450
|
adducts_before_filter = len(adducts_df)
|
|
488
451
|
adducts_df = adducts_df.filter(pl.col("probability") >= min_probability)
|
|
489
452
|
adducts_after_filter = len(adducts_df)
|
|
490
453
|
filtered_count = adducts_before_filter - adducts_after_filter
|
|
491
|
-
|
|
492
|
-
self.logger.debug(
|
|
493
|
-
f"Filtered {filtered_count} low-probability adducts (< {min_probability})",
|
|
494
|
-
)
|
|
454
|
+
|
|
455
|
+
self.logger.debug(f"Filtered {filtered_count} low-probability adducts (< {min_probability})")
|
|
495
456
|
self.logger.debug(f"Remaining adducts for analysis: {adducts_after_filter}")
|
|
496
|
-
|
|
457
|
+
|
|
497
458
|
if len(adducts_df) == 0:
|
|
498
|
-
self.logger.warning(
|
|
499
|
-
f"No adducts remaining after probability filtering (min_probability={min_probability})",
|
|
500
|
-
)
|
|
459
|
+
self.logger.warning(f"No adducts remaining after probability filtering (min_probability={min_probability})")
|
|
501
460
|
return
|
|
502
|
-
|
|
461
|
+
|
|
503
462
|
# Implement the adduct detection algorithm directly here
|
|
504
463
|
import numpy as np
|
|
505
|
-
|
|
464
|
+
|
|
506
465
|
# Get parameters
|
|
507
466
|
charge_max = params.get("charge_max")
|
|
508
467
|
retention_max_diff = params.get("retention_max_diff")
|
|
509
468
|
mass_max_diff = params.get("mass_max_diff")
|
|
510
469
|
unit = params.get("unit")
|
|
511
|
-
|
|
470
|
+
|
|
512
471
|
# Sort features by RT for efficient RT-sweep processing (OpenMS approach)
|
|
513
472
|
# Store original row positions before sorting for correct index mapping
|
|
514
473
|
features_with_positions = self.features_df.with_row_index("original_position")
|
|
515
474
|
features_sorted = features_with_positions.sort("rt")
|
|
516
475
|
n_features = len(features_sorted)
|
|
517
|
-
|
|
476
|
+
|
|
518
477
|
# Extract arrays for fast processing
|
|
519
478
|
feature_mzs = features_sorted.select("mz").to_numpy().flatten()
|
|
520
479
|
feature_rts = features_sorted.select("rt").to_numpy().flatten()
|
|
521
|
-
|
|
480
|
+
|
|
522
481
|
# Convert adducts to arrays for vectorized operations
|
|
523
482
|
adduct_mass_shifts = adducts_df.select("mass_shift").to_numpy().flatten()
|
|
524
483
|
adduct_charges = adducts_df.select("charge").to_numpy().flatten()
|
|
525
484
|
adduct_names = adducts_df.select("name").to_series().to_list()
|
|
526
485
|
adduct_probs = adducts_df.select("probability").to_numpy().flatten()
|
|
527
486
|
|
|
528
|
-
self.logger.debug(
|
|
529
|
-
|
|
530
|
-
)
|
|
531
|
-
|
|
487
|
+
self.logger.debug(f"RT-sweep processing: {n_features} features × {len(adducts_df)} adduct combinations")
|
|
488
|
+
|
|
532
489
|
# Phase 1: RT-sweep line algorithm with early RT filtering (fixes OpenMS flaw #1)
|
|
533
490
|
candidate_edges = []
|
|
534
|
-
|
|
491
|
+
|
|
535
492
|
for i_rt in range(n_features):
|
|
536
493
|
mz1 = feature_mzs[i_rt]
|
|
537
494
|
rt1 = feature_rts[i_rt]
|
|
538
|
-
|
|
495
|
+
|
|
539
496
|
# RT-window sweep: only check features within RT tolerance (early filtering)
|
|
540
497
|
for j_rt in range(i_rt + 1, n_features):
|
|
541
498
|
rt2 = feature_rts[j_rt]
|
|
542
499
|
rt_diff = rt2 - rt1
|
|
543
|
-
|
|
500
|
+
|
|
544
501
|
# Early RT constraint check (fixes OpenMS issue where RT was checked too late)
|
|
545
502
|
if rt_diff > retention_max_diff:
|
|
546
503
|
break # Features are RT-sorted, so no more valid pairs
|
|
547
|
-
|
|
504
|
+
|
|
548
505
|
mz2 = feature_mzs[j_rt]
|
|
549
|
-
|
|
506
|
+
|
|
550
507
|
# Phase 2: Check for valid mass relationships with strict tolerance (fixes OpenMS flaw #2)
|
|
551
508
|
for adduct_idx, mass_shift in enumerate(adduct_mass_shifts):
|
|
552
509
|
charge = adduct_charges[adduct_idx]
|
|
553
|
-
|
|
510
|
+
|
|
554
511
|
# Calculate mass tolerance (per feature, as in OpenMS)
|
|
555
|
-
if unit ==
|
|
512
|
+
if unit == 'ppm':
|
|
556
513
|
tol1 = mass_max_diff * mz1 * 1e-6
|
|
557
514
|
tol2 = mass_max_diff * mz2 * 1e-6
|
|
558
515
|
combined_tolerance = tol1 + tol2
|
|
559
516
|
else: # Da
|
|
560
|
-
combined_tolerance =
|
|
561
|
-
|
|
562
|
-
) # Combined tolerance for both features
|
|
563
|
-
|
|
517
|
+
combined_tolerance = 2 * mass_max_diff # Combined tolerance for both features
|
|
518
|
+
|
|
564
519
|
# Check both directions of mass relationship
|
|
565
520
|
if charge != 0:
|
|
566
521
|
# For charged adducts: m/z relationship
|
|
567
|
-
mass_diff_12 = (mz2 * abs(charge)) - (mz1 * abs(charge))
|
|
522
|
+
mass_diff_12 = (mz2 * abs(charge)) - (mz1 * abs(charge))
|
|
568
523
|
expected_mass_diff = mass_shift
|
|
569
|
-
|
|
524
|
+
|
|
570
525
|
if abs(mass_diff_12 - expected_mass_diff) <= combined_tolerance:
|
|
571
526
|
# Valid mass relationship found
|
|
572
|
-
candidate_edges.append(
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
"probability": adduct_probs[adduct_idx],
|
|
582
|
-
},
|
|
583
|
-
)
|
|
527
|
+
candidate_edges.append({
|
|
528
|
+
'i': i_rt, 'j': j_rt,
|
|
529
|
+
'rt_diff': rt_diff,
|
|
530
|
+
'mass_error': abs(mass_diff_12 - expected_mass_diff),
|
|
531
|
+
'adduct_idx': adduct_idx,
|
|
532
|
+
'charge1': charge if mass_diff_12 > 0 else -charge,
|
|
533
|
+
'charge2': -charge if mass_diff_12 > 0 else charge,
|
|
534
|
+
'probability': adduct_probs[adduct_idx]
|
|
535
|
+
})
|
|
584
536
|
else:
|
|
585
537
|
# For neutral adducts: direct mass shift
|
|
586
538
|
mass_diff_12 = mz2 - mz1
|
|
587
539
|
if abs(mass_diff_12 - mass_shift) <= combined_tolerance:
|
|
588
|
-
candidate_edges.append(
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
)
|
|
600
|
-
|
|
601
|
-
self.logger.debug(
|
|
602
|
-
f"Found {len(candidate_edges)} candidate edges after RT+mass filtering",
|
|
603
|
-
)
|
|
604
|
-
|
|
540
|
+
candidate_edges.append({
|
|
541
|
+
'i': i_rt, 'j': j_rt,
|
|
542
|
+
'rt_diff': rt_diff,
|
|
543
|
+
'mass_error': abs(mass_diff_12 - mass_shift),
|
|
544
|
+
'adduct_idx': adduct_idx,
|
|
545
|
+
'charge1': 0, 'charge2': 0,
|
|
546
|
+
'probability': adduct_probs[adduct_idx]
|
|
547
|
+
})
|
|
548
|
+
|
|
549
|
+
self.logger.debug(f"Found {len(candidate_edges)} candidate edges after RT+mass filtering")
|
|
550
|
+
|
|
605
551
|
if len(candidate_edges) == 0:
|
|
606
552
|
self.logger.info("No adduct relationships found")
|
|
607
553
|
return
|
|
@@ -611,23 +557,23 @@ def find_adducts(self, **kwargs):
|
|
|
611
557
|
adjacency = {}
|
|
612
558
|
for i in range(n_features):
|
|
613
559
|
adjacency[i] = []
|
|
614
|
-
|
|
560
|
+
|
|
615
561
|
for edge in candidate_edges:
|
|
616
|
-
i, j = edge[
|
|
562
|
+
i, j = edge['i'], edge['j']
|
|
617
563
|
adjacency[i].append(j)
|
|
618
564
|
adjacency[j].append(i)
|
|
619
|
-
|
|
565
|
+
|
|
620
566
|
# Find connected components using DFS
|
|
621
567
|
visited = [False] * n_features
|
|
622
568
|
components = []
|
|
623
|
-
|
|
569
|
+
|
|
624
570
|
def dfs(node, component):
|
|
625
571
|
visited[node] = True
|
|
626
572
|
component.append(node)
|
|
627
573
|
for neighbor in adjacency[node]:
|
|
628
574
|
if not visited[neighbor]:
|
|
629
575
|
dfs(neighbor, component)
|
|
630
|
-
|
|
576
|
+
|
|
631
577
|
for i in range(n_features):
|
|
632
578
|
if not visited[i] and len(adjacency[i]) > 0:
|
|
633
579
|
component = []
|
|
@@ -643,60 +589,53 @@ def find_adducts(self, **kwargs):
|
|
|
643
589
|
group_assignments = [0] * n_features
|
|
644
590
|
mass_shift_assignments = [0.0] * n_features
|
|
645
591
|
neutral_mass_assignments = [0.0] * n_features
|
|
646
|
-
|
|
592
|
+
|
|
647
593
|
for group_id, component in enumerate(components, 1):
|
|
648
594
|
# Find the most likely base ion (highest intensity or lowest m/z as proxy)
|
|
649
595
|
component_mzs = [feature_mzs[idx] for idx in component]
|
|
650
596
|
base_idx_in_component = np.argmin(component_mzs) # Lowest m/z as base
|
|
651
597
|
base_feature_idx = component[base_idx_in_component]
|
|
652
598
|
base_mz = feature_mzs[base_feature_idx]
|
|
653
|
-
|
|
599
|
+
|
|
654
600
|
# Assign base ion
|
|
655
601
|
base_adduct = "[M+H]1+" if charge_max > 0 else "[M-H]1-"
|
|
656
602
|
base_charge = 1 if charge_max > 0 else -1
|
|
657
603
|
base_mass_shift = 1.007825 if charge_max > 0 else -1.007825 # H mass
|
|
658
|
-
|
|
604
|
+
|
|
659
605
|
adduct_assignments[base_feature_idx] = base_adduct
|
|
660
606
|
adduct_charges_assigned[base_feature_idx] = base_charge
|
|
661
607
|
group_assignments[base_feature_idx] = group_id
|
|
662
608
|
mass_shift_assignments[base_feature_idx] = base_mass_shift
|
|
663
|
-
|
|
609
|
+
|
|
664
610
|
# Calculate neutral mass for base ion
|
|
665
611
|
base_mz_measured = feature_mzs[base_feature_idx]
|
|
666
|
-
neutral_mass_assignments[base_feature_idx] = (
|
|
667
|
-
|
|
668
|
-
)
|
|
669
|
-
|
|
612
|
+
neutral_mass_assignments[base_feature_idx] = base_mz_measured * abs(base_charge) - base_mass_shift
|
|
613
|
+
|
|
670
614
|
# Assign other features based on their relationships to base
|
|
671
615
|
for feature_idx in component:
|
|
672
616
|
if feature_idx == base_feature_idx:
|
|
673
617
|
continue
|
|
674
|
-
|
|
618
|
+
|
|
675
619
|
group_assignments[feature_idx] = group_id
|
|
676
|
-
|
|
620
|
+
|
|
677
621
|
# Find best adduct assignment based on mass difference and probability
|
|
678
622
|
feature_mz = feature_mzs[feature_idx]
|
|
679
623
|
best_score = -np.inf
|
|
680
624
|
best_assignment = "[M+?]1+"
|
|
681
625
|
best_charge = 1
|
|
682
626
|
best_mass_shift = 1.007825 # Default to H mass shift for [M+?]1+
|
|
683
|
-
|
|
627
|
+
|
|
684
628
|
# Check all possible adducts
|
|
685
|
-
for adduct_idx, (mass_shift, charge, name, prob) in enumerate(
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
adduct_charges,
|
|
689
|
-
adduct_names,
|
|
690
|
-
adduct_probs,
|
|
691
|
-
),
|
|
692
|
-
):
|
|
629
|
+
for adduct_idx, (mass_shift, charge, name, prob) in enumerate(zip(
|
|
630
|
+
adduct_mass_shifts, adduct_charges, adduct_names, adduct_probs)):
|
|
631
|
+
|
|
693
632
|
if charge != 0:
|
|
694
633
|
expected_mz = base_mz + mass_shift / abs(charge)
|
|
695
634
|
else:
|
|
696
635
|
expected_mz = base_mz + mass_shift
|
|
697
|
-
|
|
636
|
+
|
|
698
637
|
mass_error = abs(expected_mz - feature_mz)
|
|
699
|
-
|
|
638
|
+
|
|
700
639
|
# Combined score: probability + mass accuracy
|
|
701
640
|
if mass_error < mass_max_diff * 2: # Within tolerance
|
|
702
641
|
score = prob - mass_error * 0.1 # Weight mass accuracy
|
|
@@ -709,52 +648,48 @@ def find_adducts(self, **kwargs):
|
|
|
709
648
|
adduct_assignments[feature_idx] = best_assignment
|
|
710
649
|
adduct_charges_assigned[feature_idx] = best_charge
|
|
711
650
|
mass_shift_assignments[feature_idx] = best_mass_shift
|
|
712
|
-
|
|
651
|
+
|
|
713
652
|
# Calculate neutral mass
|
|
714
|
-
neutral_mass_assignments[feature_idx] = (
|
|
715
|
-
feature_mz * abs(best_charge) - best_mass_shift
|
|
716
|
-
)
|
|
653
|
+
neutral_mass_assignments[feature_idx] = feature_mz * abs(best_charge) - best_mass_shift
|
|
717
654
|
|
|
718
655
|
# Assign fallback adduct for features not processed in connected components (isolated features)
|
|
719
656
|
for i in range(n_features):
|
|
720
657
|
if adduct_assignments[i] is None:
|
|
721
658
|
fallback_charge = 1 if charge_max > 0 else -1
|
|
722
659
|
fallback_mass_shift = 1.007825 if charge_max > 0 else -1.007825 # Assume H
|
|
723
|
-
|
|
660
|
+
|
|
724
661
|
adduct_assignments[i] = "[M+?]1+"
|
|
725
662
|
adduct_charges_assigned[i] = fallback_charge
|
|
726
663
|
group_assignments[i] = 0 # No group assignment for isolated features
|
|
727
664
|
mass_shift_assignments[i] = fallback_mass_shift
|
|
728
|
-
|
|
665
|
+
|
|
729
666
|
# Calculate neutral mass for isolated features
|
|
730
667
|
feature_mz = feature_mzs[i]
|
|
731
|
-
neutral_mass_assignments[i] = (
|
|
732
|
-
feature_mz * abs(fallback_charge) - fallback_mass_shift
|
|
733
|
-
)
|
|
668
|
+
neutral_mass_assignments[i] = feature_mz * abs(fallback_charge) - fallback_mass_shift
|
|
734
669
|
|
|
735
670
|
# Map back to original feature order using stored positions
|
|
736
671
|
original_indices = features_sorted.select("original_position").to_numpy().flatten()
|
|
737
|
-
|
|
672
|
+
|
|
738
673
|
# Create final assignments in original order (same size as original DataFrame)
|
|
739
674
|
final_adducts = [None] * len(self.features_df)
|
|
740
675
|
final_charges = [0] * len(self.features_df)
|
|
741
676
|
final_groups = [0] * len(self.features_df)
|
|
742
677
|
final_mass_shifts = [0.0] * len(self.features_df)
|
|
743
678
|
final_neutral_masses = [0.0] * len(self.features_df)
|
|
744
|
-
|
|
679
|
+
|
|
745
680
|
for sorted_idx, orig_idx in enumerate(original_indices):
|
|
746
681
|
final_adducts[orig_idx] = adduct_assignments[sorted_idx]
|
|
747
682
|
final_charges[orig_idx] = adduct_charges_assigned[sorted_idx]
|
|
748
683
|
final_groups[orig_idx] = group_assignments[sorted_idx]
|
|
749
684
|
final_mass_shifts[orig_idx] = mass_shift_assignments[sorted_idx]
|
|
750
685
|
final_neutral_masses[orig_idx] = neutral_mass_assignments[sorted_idx]
|
|
751
|
-
|
|
686
|
+
|
|
752
687
|
# Update features DataFrame with correct column ordering
|
|
753
688
|
# Insert adduct columns in the specified order after iso_of column
|
|
754
|
-
|
|
689
|
+
|
|
755
690
|
# Get current columns
|
|
756
691
|
current_columns = self.features_df.columns
|
|
757
|
-
|
|
692
|
+
|
|
758
693
|
# Find the position of iso_of column
|
|
759
694
|
try:
|
|
760
695
|
iso_of_index = current_columns.index("iso_of")
|
|
@@ -763,51 +698,42 @@ def find_adducts(self, **kwargs):
|
|
|
763
698
|
# If iso_of doesn't exist, append at the end
|
|
764
699
|
insert_position = len(current_columns)
|
|
765
700
|
self.logger.warning("iso_of column not found, adding adduct columns at the end")
|
|
766
|
-
|
|
701
|
+
|
|
767
702
|
# Remove any existing adduct columns first
|
|
768
|
-
adduct_column_names = [
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
"adduct_mass_shift",
|
|
772
|
-
"adduct_mass_neutral",
|
|
773
|
-
"adduct_group",
|
|
774
|
-
]
|
|
775
|
-
df_without_adducts = self.features_df.select(
|
|
776
|
-
[col for col in current_columns if col not in adduct_column_names],
|
|
777
|
-
)
|
|
778
|
-
|
|
703
|
+
adduct_column_names = ["adduct", "adduct_charge", "adduct_mass_shift", "adduct_mass_neutral", "adduct_group"]
|
|
704
|
+
df_without_adducts = self.features_df.select([col for col in current_columns if col not in adduct_column_names])
|
|
705
|
+
|
|
779
706
|
# Split columns at insertion point
|
|
780
707
|
columns_before = df_without_adducts.columns[:insert_position]
|
|
781
708
|
columns_after = df_without_adducts.columns[insert_position:]
|
|
782
|
-
|
|
709
|
+
|
|
783
710
|
# Create the new column order with adduct columns in the correct position
|
|
784
|
-
new_column_order =
|
|
785
|
-
|
|
711
|
+
new_column_order = (
|
|
712
|
+
list(columns_before) +
|
|
713
|
+
adduct_column_names +
|
|
714
|
+
list(columns_after)
|
|
715
|
+
)
|
|
716
|
+
|
|
786
717
|
# Add adduct columns to the dataframe
|
|
787
|
-
self.features_df = df_without_adducts.with_columns(
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
],
|
|
795
|
-
).select(new_column_order)
|
|
718
|
+
self.features_df = df_without_adducts.with_columns([
|
|
719
|
+
pl.Series("adduct", final_adducts),
|
|
720
|
+
pl.Series("adduct_charge", final_charges),
|
|
721
|
+
pl.Series("adduct_mass_shift", final_mass_shifts),
|
|
722
|
+
pl.Series("adduct_mass_neutral", final_neutral_masses),
|
|
723
|
+
pl.Series("adduct_group", final_groups)
|
|
724
|
+
]).select(new_column_order)
|
|
796
725
|
|
|
797
726
|
# Summary statistics
|
|
798
727
|
total_with_adducts = sum(1 for x in final_adducts if x is not None)
|
|
799
728
|
total_groups = max(final_groups) if final_groups else 0
|
|
800
|
-
|
|
801
|
-
self.logger.info(
|
|
802
|
-
f"Adduct detection completed: {total_with_adducts} features with adducts in {total_groups} groups",
|
|
803
|
-
)
|
|
729
|
+
|
|
730
|
+
self.logger.info(f"Adduct detection completed: {total_with_adducts} features with adducts in {total_groups} groups")
|
|
804
731
|
|
|
805
732
|
# Store parameters including the actual processed adducts list
|
|
806
733
|
history_params = params.to_dict()
|
|
807
734
|
# Convert the filtered adducts dataframe to a list of adduct specifications for history
|
|
808
|
-
history_params[
|
|
809
|
-
|
|
810
|
-
).to_dicts()
|
|
811
|
-
|
|
735
|
+
history_params['adducts'] = adducts_df.select(['name', 'charge', 'mass_shift', 'probability']).to_dicts()
|
|
736
|
+
|
|
812
737
|
self.store_history(["find_adducts"], history_params)
|
|
813
738
|
self.logger.debug("Parameters stored successfully")
|
|
739
|
+
|