masster 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/__init__.py +8 -8
- masster/chromatogram.py +1 -1
- masster/data/libs/urine.csv +3 -3
- masster/logger.py +11 -11
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +338 -264
- masster/sample/defaults/find_adducts_def.py +21 -8
- masster/sample/h5.py +561 -282
- masster/sample/helpers.py +131 -75
- masster/sample/lib.py +4 -4
- masster/sample/load.py +31 -17
- masster/sample/parameters.py +1 -1
- masster/sample/plot.py +7 -7
- masster/sample/processing.py +117 -87
- masster/sample/sample.py +103 -90
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +35 -12
- masster/spectrum.py +1 -1
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +5 -1
- masster/study/defaults/identify_def.py +3 -1
- masster/study/defaults/study_def.py +58 -25
- masster/study/export.py +360 -210
- masster/study/h5.py +560 -158
- masster/study/helpers.py +496 -203
- masster/study/helpers_optimized.py +1 -1
- masster/study/id.py +538 -349
- masster/study/load.py +233 -143
- masster/study/plot.py +71 -71
- masster/study/processing.py +456 -254
- masster/study/save.py +15 -5
- masster/study/study.py +213 -131
- masster/study/study5_schema.json +149 -149
- {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/METADATA +3 -1
- {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/RECORD +39 -39
- {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/WHEEL +0 -0
- {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/entry_points.txt +0 -0
- {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {masster-0.4.4.dist-info → masster-0.4.5.dist-info}/top_level.txt +0 -0
masster/study/id.py
CHANGED
@@ -3,14 +3,19 @@
 Identification helpers for Study: load a Lib and identify consensus features
 by matching m/z (and optionally RT).
 """
+
 from __future__ import annotations
 
-from typing import Optional
 
 import polars as pl
 
 
-def lib_load(
+def lib_load(
+    study,
+    lib_source,
+    polarity: str | None = None,
+    adducts: list | None = None,
+):
     """Load a co # Add compound and formula count columns
     if "consensus_uid" in result_df.columns:
         # Calculate counts per consensus_uid
@@ -31,7 +36,7 @@ def lib_load(study, lib_source, polarity: Optional[str] = None, adducts: Optiona
     """
     # Lazy import to avoid circular imports at module import time
     try:
-        from
+        from master.lib.lib import Lib
     except Exception:
         Lib = None
 
@@ -40,72 +45,93 @@ def lib_load(study, lib_source, polarity: Optional[str] = None, adducts: Optiona
 
     # Use study polarity if not explicitly provided
     if polarity is None:
-        study_polarity = getattr(study,
+        study_polarity = getattr(study, "polarity", "positive")
         # Normalize polarity names
-        if study_polarity in [
-            polarity =
-        elif study_polarity in [
-            polarity =
+        if study_polarity in ["pos", "positive"]:
+            polarity = "positive"
+        elif study_polarity in ["neg", "negative"]:
+            polarity = "negative"
         else:
-            polarity =
+            polarity = "positive"  # Default fallback
 
     # Handle string input (CSV file path)
     if isinstance(lib_source, str):
         if Lib is None:
-            raise ImportError(
-
+            raise ImportError(
+                "Could not import master.lib.lib.Lib - required for CSV loading",
+            )
+
         lib_obj = Lib()
         lib_obj.import_csv(lib_source, polarity=polarity, adducts=adducts)
-
+
     # Handle Lib instance
     elif Lib is not None and isinstance(lib_source, Lib):
         lib_obj = lib_source
-
+
    # Handle other objects with lib_df attribute
     elif hasattr(lib_source, "lib_df"):
         lib_obj = lib_source
-
+
     else:
-        raise TypeError(
+        raise TypeError(
+            "lib_source must be a CSV file path (str), a master.lib.Lib instance, or have a 'lib_df' attribute",
+        )
 
     # Ensure lib_df is populated
     lf = getattr(lib_obj, "lib_df", None)
-    if lf is None or (hasattr(lf,
+    if lf is None or (hasattr(lf, "is_empty") and lf.is_empty()):
         raise ValueError("Library has no data populated in lib_df")
 
     # Filter by polarity to match study
     # Map polarity to charge signs
-    if polarity ==
+    if polarity == "positive":
         target_charges = [1, 2]  # positive charges
-    elif polarity ==
+    elif polarity == "negative":
         target_charges = [-1, -2]  # negative charges
     else:
         target_charges = [-2, -1, 1, 2]  # all charges
 
     # Filter library entries by charge sign (which corresponds to polarity)
     filtered_lf = lf.filter(pl.col("z").is_in(target_charges))
-
+
     if filtered_lf.is_empty():
-        print(
+        print(
+            f"Warning: No library entries found for polarity '{polarity}'. Using all entries.",
+        )
         filtered_lf = lf
 
     # Store pointer and DataFrame on study
     study._lib = lib_obj
-
+
     # Add to existing lib_df instead of replacing
-    if
+    if (
+        hasattr(study, "lib_df")
+        and study.lib_df is not None
+        and not study.lib_df.is_empty()
+    ):
         # Concatenate with existing data
         study.lib_df = pl.concat([study.lib_df, filtered_lf])
     else:
         # First time loading - create new
         try:
-            study.lib_df =
+            study.lib_df = (
+                filtered_lf.clone()
+                if hasattr(filtered_lf, "clone")
+                else pl.DataFrame(filtered_lf)
+            )
         except Exception:
-            study.lib_df =
+            study.lib_df = (
+                pl.from_pandas(filtered_lf)
+                if hasattr(filtered_lf, "to_pandas")
+                else pl.DataFrame(filtered_lf)
+            )
 
     # Store this operation in history
-    if hasattr(study,
-        study.store_history(
+    if hasattr(study, "store_history"):
+        study.store_history(
+            ["lib_load"],
+            {"lib_source": str(lib_source), "polarity": polarity, "adducts": adducts},
+        )
 
 
 def identify(study, features=None, params=None, **kwargs):
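For orientation, here is a minimal usage sketch of the new `lib_load` signature. It assumes `study` is an existing masster Study instance (construction omitted); the CSV path and adduct spec are illustrative, and the adduct string follows the `formula:charge:probability` format parsed later in this diff.

```python
from masster.study.id import lib_load  # module changed in this diff

# study = ...  # an existing masster Study instance (construction not shown)
lib_load(
    study,
    "libs/urine.csv",        # illustrative CSV library path
    polarity="positive",     # or None to inherit study.polarity
    adducts=["+H:1:0.8"],    # "formula:charge:probability" spec
)
print(study.lib_df.height, "library entries after loading")
```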
@@ -121,7 +147,7 @@ def identify(study, features=None, params=None, **kwargs):
             If None, identifies all consensus features.
         params: Optional identify_defaults instance with matching tolerances and scoring parameters.
             If None, uses default parameters.
-        **kwargs: Individual parameter overrides (mz_tol, rt_tol, heteroatom_penalty,
+        **kwargs: Individual parameter overrides (mz_tol, rt_tol, heteroatom_penalty,
             multiple_formulas_penalty, multiple_compounds_penalty, heteroatoms)
 
     The resulting DataFrame is stored as study.id_df. Columns:
@@ -133,10 +159,10 @@ def identify(study, features=None, params=None, **kwargs):
     """
     # Import defaults class
     try:
-        from
+        from master.study.defaults.identify_def import identify_defaults
     except ImportError:
         identify_defaults = None
-
+
     # Use provided params or create defaults
     if params is None:
         if identify_defaults is not None:
@@ -150,52 +176,66 @@ def identify(study, features=None, params=None, **kwargs):
             multiple_formulas_penalty = 0.8
             multiple_compounds_penalty = 0.8
             heteroatoms = ["Cl", "Br", "F", "I"]
+
         params = FallbackParams()
-
+
     # Override parameters with any provided kwargs
     if kwargs:
         for param_name, value in kwargs.items():
             if hasattr(params, param_name):
                 setattr(params, param_name, value)
-
+
     # Get effective tolerances from params (now possibly overridden)
-    effective_mz_tol = getattr(params,
-    effective_rt_tol = getattr(params,
+    effective_mz_tol = getattr(params, "mz_tol", 0.01)
+    effective_rt_tol = getattr(params, "rt_tol", 2.0)
     # Get logger from study if available
-    logger = getattr(study,
-
+    logger = getattr(study, "logger", None)
+
     if logger:
-        logger.debug(
+        logger.debug(
+            f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
+        )
 
     # Determine which features to process
     target_uids = None
     if features is not None:
-        if hasattr(features,
-            if
-                target_uids = features[
+        if hasattr(features, "columns"):  # DataFrame-like
+            if "consensus_uid" in features.columns:
+                target_uids = features["consensus_uid"].unique().to_list()
             else:
-                raise ValueError(
-
+                raise ValueError(
+                    "features DataFrame must contain 'consensus_uid' column",
+                )
+        elif hasattr(features, "__iter__") and not isinstance(
+            features,
+            str,
+        ):  # List-like
             target_uids = list(features)
         else:
-            raise ValueError(
-
+            raise ValueError(
+                "features must be a DataFrame with 'consensus_uid' column or a list of UIDs",
+            )
+
         if logger:
             logger.debug(f"Identifying {len(target_uids)} specified features")
 
     # Clear previous identification results for target features only
-    if hasattr(study,
+    if hasattr(study, "id_df") and not study.id_df.is_empty():
         if target_uids is not None:
             # Keep results for features NOT being re-identified
-            study.id_df = study.id_df.filter(
+            study.id_df = study.id_df.filter(
+                ~pl.col("consensus_uid").is_in(target_uids),
+            )
             if logger:
-                logger.debug(
+                logger.debug(
+                    f"Cleared previous identification results for {len(target_uids)} features",
+                )
         else:
             # Clear all results if no specific features specified
             study.id_df = pl.DataFrame()
             if logger:
                 logger.debug("Cleared all previous identification results")
-    elif not hasattr(study,
+    elif not hasattr(study, "id_df"):
         study.id_df = pl.DataFrame()
         if logger:
             logger.debug("Initialized empty id_df")
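The override path above is plain attribute reflection: any kwarg that names an existing attribute on `params` replaces its value, and `getattr` with a default then supplies the effective tolerance. A self-contained sketch of that pattern:

```python
class FallbackParams:
    mz_tol = 0.01
    rt_tol = 2.0

params = FallbackParams()
for name, value in {"mz_tol": 0.005, "not_a_param": 1}.items():
    if hasattr(params, name):      # unknown names are silently ignored
        setattr(params, name, value)

print(getattr(params, "mz_tol", 0.01))  # 0.005 -> override applied
print(getattr(params, "rt_tol", 2.0))   # 2.0   -> default kept
```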
@@ -214,20 +254,28 @@ def identify(study, features=None, params=None, **kwargs):
     # Filter consensus features if target_uids specified
     consensus_to_process = study.consensus_df
     if target_uids is not None:
-        consensus_to_process = study.consensus_df.filter(
+        consensus_to_process = study.consensus_df.filter(
+            pl.col("consensus_uid").is_in(target_uids),
+        )
         if consensus_to_process.is_empty():
             if logger:
-                logger.warning(
+                logger.warning(
+                    "No consensus features found matching specified features",
+                )
             return
 
     consensus_count = len(consensus_to_process)
     lib_count = len(study.lib_df)
-
+
     if logger:
         if target_uids is not None:
-            logger.debug(
+            logger.debug(
+                f"Identifying {consensus_count} specified consensus features against {lib_count} library entries",
+            )
         else:
-            logger.debug(
+            logger.debug(
+                f"Identifying {consensus_count} consensus features against {lib_count} library entries",
+            )
 
     # Get adduct probabilities
     adducts_df = study._get_adducts()
@@ -235,7 +283,7 @@ def identify(study, features=None, params=None, **kwargs):
     if not adducts_df.is_empty():
         for row in adducts_df.iter_rows(named=True):
             adduct_prob_map[row.get("name")] = row.get("probability", 1.0)
-
+
     results = []
     features_with_matches = 0
     total_matches = 0
@@ -247,7 +295,7 @@ def identify(study, features=None, params=None, **kwargs):
         cons_mz = cons.get("mz")
         cons_rt = cons.get("rt")
         cons_uid = cons.get("consensus_uid")
-
+
         if cons_mz is None:
             if logger:
                 logger.debug(f"Skipping consensus feature {cons_uid} - no m/z value")
@@ -255,7 +303,8 @@ def identify(study, features=None, params=None, **kwargs):
 
         # Filter lib by mz window
         matches = study.lib_df.filter(
-            (pl.col("mz") >= cons_mz - effective_mz_tol)
+            (pl.col("mz") >= cons_mz - effective_mz_tol)
+            & (pl.col("mz") <= cons_mz + effective_mz_tol),
         )
 
         initial_matches = len(matches)
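The m/z window above is a symmetric tolerance band around the consensus m/z. A standalone polars check on toy values:

```python
import polars as pl

lib_df = pl.DataFrame({"mz": [180.0630, 180.0720, 181.0500]})
cons_mz, mz_tol = 180.0634, 0.01

matches = lib_df.filter(
    (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
)
print(matches["mz"].to_list())  # [180.063, 180.072]; 181.05 falls outside the band
```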
@@ -263,15 +312,21 @@ def identify(study, features=None, params=None, **kwargs):
         # If rt_tol provided and consensus RT present, prefer rt-filtered hits
         if effective_rt_tol is not None and cons_rt is not None:
             rt_matches = matches.filter(
-                pl.col("rt").is_not_null()
+                pl.col("rt").is_not_null()
+                & (pl.col("rt") >= cons_rt - effective_rt_tol)
+                & (pl.col("rt") <= cons_rt + effective_rt_tol),
             )
             if not rt_matches.is_empty():
                 matches = rt_matches
                 if logger:
-                    logger.debug(
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_matches} m/z matches, {len(matches)} after RT filter",
+                    )
             else:
                 if logger:
-                    logger.debug(
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_matches} m/z matches, 0 after RT filter - using m/z matches only",
+                    )
 
         # Apply scoring-based filtering system
         if not matches.is_empty():
@@ -283,19 +338,25 @@ def identify(study, features=None, params=None, **kwargs):
             features_with_matches += 1
             feature_match_count = len(filtered_matches)
             total_matches += feature_match_count
-
+
             if logger:
-                logger.debug(
+                logger.debug(
+                    f"Consensus {cons_uid} (mz={cons_mz:.5f}): {feature_match_count} library matches",
+                )
 
             for m in filtered_matches.iter_rows(named=True):
                 mz_delta = abs(cons_mz - m.get("mz")) if m.get("mz") is not None else None
                 lib_rt = m.get("rt")
-                rt_delta =
-
+                rt_delta = (
+                    abs(cons_rt - lib_rt)
+                    if (cons_rt is not None and lib_rt is not None)
+                    else None
+                )
+
                 # Get adduct probability from _get_adducts() results
                 adduct = m.get("adduct")
                 score = adduct_prob_map.get(adduct, 1.0) if adduct else 1.0
-
+
                 results.append(
                     {
                         "consensus_uid": cons.get("consensus_uid"),
@@ -304,51 +365,60 @@ def identify(study, features=None, params=None, **kwargs):
                         "rt_delta": rt_delta,
                         "matcher": "ms1",
                         "score": score,
-                    }
+                    },
                 )
 
     # Merge new results with existing results
     new_results_df = pl.DataFrame(results) if results else pl.DataFrame()
-
+
     if not new_results_df.is_empty():
-        if hasattr(study,
+        if hasattr(study, "id_df") and not study.id_df.is_empty():
             # Concatenate new results with existing results
             study.id_df = pl.concat([study.id_df, new_results_df])
         else:
             # First results
             study.id_df = new_results_df
-
+
     # Apply scoring adjustments based on compound and formula counts
-    if
+    if (
+        not study.id_df.is_empty()
+        and hasattr(study, "lib_df")
+        and not study.lib_df.is_empty()
+    ):
         # Join with lib_df to get compound and formula information
         id_with_lib = study.id_df.join(
             study.lib_df.select(["lib_uid", "cmpd_uid", "formula"]),
             on="lib_uid",
-            how="left"
+            how="left",
         )
-
+
         # Calculate counts per consensus_uid
-        count_stats = id_with_lib.group_by("consensus_uid").agg(
-
-
-
-
+        count_stats = id_with_lib.group_by("consensus_uid").agg(
+            [
+                pl.col("cmpd_uid").n_unique().alias("num_cmpds"),
+                pl.col("formula")
+                .filter(pl.col("formula").is_not_null())
+                .n_unique()
+                .alias("num_formulas"),
+            ],
+        )
+
         # Join counts back to id_df
         id_with_counts = study.id_df.join(count_stats, on="consensus_uid", how="left")
-
+
         # Join with lib_df again to get formula information for heteroatom penalty
         id_with_formula = id_with_counts.join(
             study.lib_df.select(["lib_uid", "formula"]),
             on="lib_uid",
-            how="left"
+            how="left",
         )
-
+
         # Apply scoring penalties
-        heteroatoms = getattr(params,
-        heteroatom_penalty = getattr(params,
-        formulas_penalty = getattr(params,
-        compounds_penalty = getattr(params,
-
+        heteroatoms = getattr(params, "heteroatoms", ["Cl", "Br", "F", "I"])
+        heteroatom_penalty = getattr(params, "heteroatom_penalty", 0.7)
+        formulas_penalty = getattr(params, "multiple_formulas_penalty", 0.8)
+        compounds_penalty = getattr(params, "multiple_compounds_penalty", 0.8)
+
         # Build heteroatom condition
         heteroatom_condition = None
         for atom in heteroatoms:
@@ -357,76 +427,104 @@ def identify(study, features=None, params=None, **kwargs):
                 heteroatom_condition = atom_condition
             else:
                 heteroatom_condition = heteroatom_condition | atom_condition
-
+
         # Apply penalties
-        study.id_df =
-
-
-
+        study.id_df = (
+            id_with_formula.with_columns(
+                [
+                    # Heteroatom penalty: if formula contains specified heteroatoms, apply penalty
+                    pl.when(
+                        pl.col("formula").is_not_null() & heteroatom_condition,
+                    )
+                    .then(pl.col("score") * heteroatom_penalty)
+                    .otherwise(pl.col("score"))
+                    .alias("score_temp0"),
+                ],
+            )
+            .with_columns(
+                [
+                    # If num_formulas > 1, apply multiple formulas penalty
+                    pl.when(pl.col("num_formulas") > 1)
+                    .then(pl.col("score_temp0") * formulas_penalty)
+                    .otherwise(pl.col("score_temp0"))
+                    .alias("score_temp1"),
+                ],
             )
-            .
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            .with_columns(
+                [
+                    # If num_cmpds > 1, apply multiple compounds penalty
+                    pl.when(pl.col("num_cmpds") > 1)
+                    .then(pl.col("score_temp1") * compounds_penalty)
+                    .otherwise(pl.col("score_temp1"))
+                    .round(4)  # Round to 4 decimal places
+                    .alias("score"),
+                ],
+            )
+            .select(
+                [
+                    "consensus_uid",
+                    "lib_uid",
+                    "mz_delta",
+                    "rt_delta",
+                    "matcher",
+                    "score",
+                ],
+            )
+        )
+
     # Store this operation in history
-    if hasattr(study,
+    if hasattr(study, "store_history"):
         history_params = {"mz_tol": effective_mz_tol, "rt_tol": effective_rt_tol}
         if features is not None:
             history_params["features"] = target_uids
-        if params is not None and hasattr(params,
+        if params is not None and hasattr(params, "to_dict"):
             history_params["params"] = params.to_dict()
         if kwargs:
             history_params["kwargs"] = kwargs
         study.store_history(["identify"], history_params)
-
+
     if logger:
         if rt_filtered_compounds > 0:
-            logger.debug(
-
+            logger.debug(
+                f"RT consistency filtering applied to {rt_filtered_compounds} compound groups",
+            )
+
         if multiply_charged_filtered > 0:
-            logger.debug(
-
-
-
-
+            logger.debug(
+                f"Excluded {multiply_charged_filtered} multiply charged adducts (no [M+H]+ or [M-H]- coeluting)",
+            )
+
+        logger.info(
+            f"Identification completed: {features_with_matches}/{consensus_count} features matched, {total_matches} total identifications",
+        )
+
        if total_matches > 0:
            # Calculate some statistics
            mz_deltas = [r["mz_delta"] for r in results if r["mz_delta"] is not None]
            rt_deltas = [r["rt_delta"] for r in results if r["rt_delta"] is not None]
            scores = [r["score"] for r in results if r["score"] is not None]
-
+
            if mz_deltas:
                avg_mz_delta = sum(mz_deltas) / len(mz_deltas)
                max_mz_delta = max(mz_deltas)
-                logger.debug(
-
+                logger.debug(
+                    f"m/z accuracy: average Δ={avg_mz_delta:.5f} Da, max Δ={max_mz_delta:.5f} Da",
+                )
+
            if rt_deltas:
                avg_rt_delta = sum(rt_deltas) / len(rt_deltas)
                max_rt_delta = max(rt_deltas)
-                logger.debug(
-
+                logger.debug(
+                    f"RT accuracy: average Δ={avg_rt_delta:.2f} min, max Δ={max_rt_delta:.2f} min",
+                )
+
            if scores:
                avg_score = sum(scores) / len(scores)
                min_score = min(scores)
                max_score = max(scores)
-                logger.debug(
-
+                logger.debug(
+                    f"Adduct probability scores: average={avg_score:.3f}, min={min_score:.3f}, max={max_score:.3f}",
+                )
 
 
 def get_id(study, features=None) -> pl.DataFrame:
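Given the defaults visible above (heteroatom 0.7, multiple formulas 0.8, multiple compounds 0.8), the penalties compound multiplicatively on the adduct probability. A worked example of the worst case:

```python
score = 1.0        # adduct probability from _get_adducts()
score *= 0.7       # formula contains one of ["Cl", "Br", "F", "I"]
score *= 0.8       # num_formulas > 1 for this consensus feature
score *= 0.8       # num_cmpds > 1 for this consensus feature
print(round(score, 4))  # 0.448, matching the final .round(4)
```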
@@ -443,7 +541,7 @@ def get_id(study, features=None) -> pl.DataFrame:
     Returns:
         Polars DataFrame with columns:
         - consensus_uid
-        - lib_uid
+        - lib_uid
         - mz (consensus feature m/z)
         - rt (consensus feature RT)
         - name (compound name from library)
@@ -459,7 +557,9 @@ def get_id(study, features=None) -> pl.DataFrame:
     """
     # Validate inputs
     if getattr(study, "id_df", None) is None or study.id_df.is_empty():
-        raise ValueError(
+        raise ValueError(
+            "Identification results (study.id_df) are empty; call identify() first",
+        )
 
     if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
         raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
@@ -472,35 +572,52 @@ def get_id(study, features=None) -> pl.DataFrame:
 
     # Filter by features if provided
     if features is not None:
-        if hasattr(features,
-            if
-                uids = features[
+        if hasattr(features, "columns"):  # DataFrame-like
+            if "consensus_uid" in features.columns:
+                uids = features["consensus_uid"].unique().to_list()
             else:
-                raise ValueError(
-
+                raise ValueError(
+                    "features DataFrame must contain 'consensus_uid' column",
+                )
+        elif hasattr(features, "__iter__") and not isinstance(
+            features,
+            str,
+        ):  # List-like
             uids = list(features)
         else:
-            raise ValueError(
-
+            raise ValueError(
+                "features must be a DataFrame with 'consensus_uid' column or a list of UIDs",
+            )
+
         result_df = result_df.filter(pl.col("consensus_uid").is_in(uids))
-
+
     if result_df.is_empty():
         return pl.DataFrame()
 
     # Join with consensus_df to get consensus feature m/z and RT
     consensus_cols = ["consensus_uid", "mz", "rt"]
     # Only select columns that exist in consensus_df
-    available_consensus_cols = [
-
+    available_consensus_cols = [
+        col for col in consensus_cols if col in study.consensus_df.columns
+    ]
+
     result_df = result_df.join(
         study.consensus_df.select(available_consensus_cols),
         on="consensus_uid",
         how="left",
-        suffix="_consensus"
+        suffix="_consensus",
     )
 
     # Join with lib_df to get library information
-    lib_cols = [
+    lib_cols = [
+        "lib_uid",
+        "name",
+        "formula",
+        "adduct",
+        "smiles",
+        "cmpd_uid",
+        "inchikey",
+    ]
     # Add optional columns if they exist
     optional_lib_cols = ["inchi", "db_id", "db"]
     for col in optional_lib_cols:
|
|
|
509
626
|
|
|
510
627
|
# Only select columns that exist in lib_df
|
|
511
628
|
available_lib_cols = [col for col in lib_cols if col in study.lib_df.columns]
|
|
512
|
-
|
|
629
|
+
|
|
513
630
|
result_df = result_df.join(
|
|
514
631
|
study.lib_df.select(available_lib_cols),
|
|
515
|
-
on="lib_uid",
|
|
632
|
+
on="lib_uid",
|
|
516
633
|
how="left",
|
|
517
|
-
suffix="_lib"
|
|
634
|
+
suffix="_lib",
|
|
518
635
|
)
|
|
519
636
|
|
|
520
637
|
# Reorder columns for better readability
|
|
521
638
|
column_order = [
|
|
522
639
|
"consensus_uid",
|
|
523
640
|
"cmpd_uid" if "cmpd_uid" in result_df.columns else None,
|
|
524
|
-
"lib_uid",
|
|
641
|
+
"lib_uid",
|
|
525
642
|
"name" if "name" in result_df.columns else None,
|
|
526
643
|
"formula" if "formula" in result_df.columns else None,
|
|
527
644
|
"adduct" if "adduct" in result_df.columns else None,
|
|
@@ -532,34 +649,57 @@ def get_id(study, features=None) -> pl.DataFrame:
|
|
|
532
649
|
"matcher" if "matcher" in result_df.columns else None,
|
|
533
650
|
"score" if "score" in result_df.columns else None,
|
|
534
651
|
"smiles" if "smiles" in result_df.columns else None,
|
|
535
|
-
"inchikey" if "inchikey" in result_df.columns else None
|
|
652
|
+
"inchikey" if "inchikey" in result_df.columns else None,
|
|
536
653
|
]
|
|
537
|
-
|
|
654
|
+
|
|
538
655
|
# Add any remaining columns
|
|
539
656
|
remaining_cols = [col for col in result_df.columns if col not in column_order]
|
|
540
657
|
column_order.extend(remaining_cols)
|
|
541
|
-
|
|
658
|
+
|
|
542
659
|
# Filter out None values and select existing columns
|
|
543
|
-
final_column_order = [
|
|
544
|
-
|
|
660
|
+
final_column_order = [
|
|
661
|
+
col for col in column_order if col is not None and col in result_df.columns
|
|
662
|
+
]
|
|
663
|
+
|
|
545
664
|
result_df = result_df.select(final_column_order)
|
|
546
|
-
|
|
665
|
+
|
|
547
666
|
# Add compound and formula count columns
|
|
548
667
|
if "consensus_uid" in result_df.columns:
|
|
549
668
|
# Calculate counts per consensus_uid
|
|
550
|
-
count_stats = result_df.group_by("consensus_uid").agg(
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
669
|
+
count_stats = result_df.group_by("consensus_uid").agg(
|
|
670
|
+
[
|
|
671
|
+
pl.col("cmpd_uid").n_unique().alias("num_cmpds")
|
|
672
|
+
if "cmpd_uid" in result_df.columns
|
|
673
|
+
else pl.lit(None).alias("num_cmpds"),
|
|
674
|
+
pl.col("formula")
|
|
675
|
+
.filter(pl.col("formula").is_not_null())
|
|
676
|
+
.n_unique()
|
|
677
|
+
.alias("num_formulas")
|
|
678
|
+
if "formula" in result_df.columns
|
|
679
|
+
else pl.lit(None).alias("num_formulas"),
|
|
680
|
+
],
|
|
681
|
+
)
|
|
682
|
+
|
|
555
683
|
# Join the counts back to the main dataframe
|
|
556
684
|
result_df = result_df.join(count_stats, on="consensus_uid", how="left")
|
|
557
|
-
|
|
685
|
+
|
|
558
686
|
# Reorder columns to put count columns in the right position
|
|
559
687
|
final_columns = []
|
|
560
688
|
for col in result_df.columns:
|
|
561
|
-
if col in [
|
|
562
|
-
|
|
689
|
+
if col in [
|
|
690
|
+
"consensus_uid",
|
|
691
|
+
"cmpd_uid",
|
|
692
|
+
"lib_uid",
|
|
693
|
+
"name",
|
|
694
|
+
"formula",
|
|
695
|
+
"adduct",
|
|
696
|
+
"mz",
|
|
697
|
+
"mz_delta",
|
|
698
|
+
"rt",
|
|
699
|
+
"rt_delta",
|
|
700
|
+
"matcher",
|
|
701
|
+
"score",
|
|
702
|
+
]:
|
|
563
703
|
final_columns.append(col)
|
|
564
704
|
# Add count columns
|
|
565
705
|
if "num_cmpds" in result_df.columns:
|
|
@@ -570,143 +710,160 @@ def get_id(study, features=None) -> pl.DataFrame:
|
|
|
570
710
|
for col in result_df.columns:
|
|
571
711
|
if col not in final_columns:
|
|
572
712
|
final_columns.append(col)
|
|
573
|
-
|
|
713
|
+
|
|
574
714
|
result_df = result_df.select(final_columns)
|
|
575
|
-
|
|
715
|
+
|
|
576
716
|
# Apply filtering logic (scores are already final from identify())
|
|
577
717
|
if "consensus_uid" in result_df.columns and len(result_df) > 0:
|
|
578
718
|
# (v) Rank by score, assume that highest score has the correct rt
|
|
579
719
|
# (vi) Remove all lower-scoring ids with a different rt (group by cmpd_uid)
|
|
580
720
|
# (vii) Remove multiply charged ids if not in line with [M+H]+ or [M-H]- (group by cmpd_uid)
|
|
581
|
-
|
|
721
|
+
|
|
582
722
|
# Group by cmpd_uid and apply filtering logic
|
|
583
723
|
if "cmpd_uid" in result_df.columns:
|
|
584
724
|
filtered_dfs = []
|
|
585
725
|
for cmpd_uid, group_df in result_df.group_by("cmpd_uid"):
|
|
586
726
|
# Sort by score descending to get highest score first
|
|
587
727
|
group_df = group_df.sort("score", descending=True)
|
|
588
|
-
|
|
728
|
+
|
|
589
729
|
if len(group_df) == 0:
|
|
590
730
|
continue
|
|
591
|
-
|
|
731
|
+
|
|
592
732
|
# Get the highest scoring entry's RT as reference
|
|
593
|
-
reference_rt =
|
|
594
|
-
|
|
733
|
+
reference_rt = (
|
|
734
|
+
group_df["rt"][0]
|
|
735
|
+
if "rt" in group_df.columns and group_df["rt"][0] is not None
|
|
736
|
+
else None
|
|
737
|
+
)
|
|
738
|
+
|
|
595
739
|
# Filter entries: keep those with same RT as highest scoring entry
|
|
596
740
|
if reference_rt is not None and "rt" in group_df.columns:
|
|
597
741
|
# Keep entries with the same RT or null RT
|
|
598
742
|
rt_filtered = group_df.filter(
|
|
599
|
-
(pl.col("rt") == reference_rt) | pl.col("rt").is_null()
|
|
743
|
+
(pl.col("rt") == reference_rt) | pl.col("rt").is_null(),
|
|
600
744
|
)
|
|
601
745
|
else:
|
|
602
746
|
# No reference RT, keep all
|
|
603
747
|
rt_filtered = group_df
|
|
604
|
-
|
|
748
|
+
|
|
605
749
|
# Check multiply charged constraint
|
|
606
|
-
if
|
|
750
|
+
if (
|
|
751
|
+
"z" in rt_filtered.columns
|
|
752
|
+
and "adduct" in rt_filtered.columns
|
|
753
|
+
and len(rt_filtered) > 0
|
|
754
|
+
):
|
|
607
755
|
# Check if there are multiply charged adducts
|
|
608
|
-
multiply_charged = rt_filtered.filter(
|
|
609
|
-
|
|
610
|
-
|
|
756
|
+
multiply_charged = rt_filtered.filter(
|
|
757
|
+
(pl.col("z") > 1) | (pl.col("z") < -1),
|
|
758
|
+
)
|
|
759
|
+
singly_charged = rt_filtered.filter(
|
|
760
|
+
(pl.col("z") == 1) | (pl.col("z") == -1),
|
|
761
|
+
)
|
|
762
|
+
|
|
611
763
|
if not multiply_charged.is_empty():
|
|
612
764
|
# Check if [M+H]+ or [M-H]- are present
|
|
613
765
|
reference_adducts = ["[M+H]+", "[M-H]-"]
|
|
614
|
-
has_reference = any(
|
|
615
|
-
|
|
766
|
+
has_reference = any(
|
|
767
|
+
singly_charged.filter(
|
|
768
|
+
pl.col("adduct").is_in(reference_adducts),
|
|
769
|
+
).height
|
|
770
|
+
> 0,
|
|
771
|
+
)
|
|
772
|
+
|
|
616
773
|
if not has_reference:
|
|
617
774
|
# Remove multiply charged adducts
|
|
618
775
|
rt_filtered = singly_charged
|
|
619
|
-
|
|
776
|
+
|
|
620
777
|
if len(rt_filtered) > 0:
|
|
621
778
|
filtered_dfs.append(rt_filtered)
|
|
622
|
-
|
|
779
|
+
|
|
623
780
|
if filtered_dfs:
|
|
624
781
|
result_df = pl.concat(filtered_dfs)
|
|
625
782
|
else:
|
|
626
783
|
result_df = pl.DataFrame()
|
|
627
|
-
|
|
784
|
+
|
|
628
785
|
# Sort by cmpd_uid if available
|
|
629
786
|
if "cmpd_uid" in result_df.columns:
|
|
630
787
|
result_df = result_df.sort("cmpd_uid")
|
|
631
|
-
|
|
788
|
+
|
|
632
789
|
return result_df
|
|
633
790
|
|
|
634
791
|
|
|
635
792
|
def id_reset(study):
|
|
636
793
|
"""Reset identification data and remove from history.
|
|
637
|
-
|
|
794
|
+
|
|
638
795
|
Removes:
|
|
639
796
|
- study.id_df (identification results DataFrame)
|
|
640
797
|
- 'identify' from study.history
|
|
641
|
-
|
|
798
|
+
|
|
642
799
|
Args:
|
|
643
800
|
study: Study instance to reset
|
|
644
801
|
"""
|
|
645
802
|
# Get logger from study if available
|
|
646
|
-
logger = getattr(study,
|
|
647
|
-
|
|
803
|
+
logger = getattr(study, "logger", None)
|
|
804
|
+
|
|
648
805
|
# Remove id_df
|
|
649
|
-
if hasattr(study,
|
|
806
|
+
if hasattr(study, "id_df"):
|
|
650
807
|
if logger:
|
|
651
808
|
logger.debug("Removing id_df")
|
|
652
|
-
delattr(study,
|
|
653
|
-
|
|
809
|
+
delattr(study, "id_df")
|
|
810
|
+
|
|
654
811
|
# Remove identify from history
|
|
655
|
-
if hasattr(study,
|
|
812
|
+
if hasattr(study, "history") and "identify" in study.history:
|
|
656
813
|
if logger:
|
|
657
814
|
logger.debug("Removing 'identify' from history")
|
|
658
|
-
del study.history[
|
|
659
|
-
|
|
815
|
+
del study.history["identify"]
|
|
816
|
+
|
|
660
817
|
if logger:
|
|
661
818
|
logger.info("Identification data reset completed")
|
|
662
819
|
|
|
663
820
|
|
|
664
821
|
def lib_reset(study):
|
|
665
822
|
"""Reset library and identification data and remove from history.
|
|
666
|
-
|
|
823
|
+
|
|
667
824
|
Removes:
|
|
668
|
-
- study.id_df (identification results DataFrame)
|
|
825
|
+
- study.id_df (identification results DataFrame)
|
|
669
826
|
- study.lib_df (library DataFrame)
|
|
670
827
|
- study._lib (library object reference)
|
|
671
828
|
- 'identify' from study.history
|
|
672
829
|
- 'lib_load' from study.history (if exists)
|
|
673
|
-
|
|
830
|
+
|
|
674
831
|
Args:
|
|
675
832
|
study: Study instance to reset
|
|
676
833
|
"""
|
|
677
834
|
# Get logger from study if available
|
|
678
|
-
logger = getattr(study,
|
|
679
|
-
|
|
835
|
+
logger = getattr(study, "logger", None)
|
|
836
|
+
|
|
680
837
|
# Remove id_df
|
|
681
|
-
if hasattr(study,
|
|
838
|
+
if hasattr(study, "id_df"):
|
|
682
839
|
if logger:
|
|
683
840
|
logger.debug("Removing id_df")
|
|
684
|
-
delattr(study,
|
|
685
|
-
|
|
686
|
-
# Remove lib_df
|
|
687
|
-
if hasattr(study,
|
|
841
|
+
delattr(study, "id_df")
|
|
842
|
+
|
|
843
|
+
# Remove lib_df
|
|
844
|
+
if hasattr(study, "lib_df"):
|
|
688
845
|
if logger:
|
|
689
846
|
logger.debug("Removing lib_df")
|
|
690
|
-
delattr(study,
|
|
691
|
-
|
|
847
|
+
delattr(study, "lib_df")
|
|
848
|
+
|
|
692
849
|
# Remove lib object reference
|
|
693
|
-
if hasattr(study,
|
|
850
|
+
if hasattr(study, "_lib"):
|
|
694
851
|
if logger:
|
|
695
852
|
logger.debug("Removing _lib reference")
|
|
696
|
-
delattr(study,
|
|
697
|
-
|
|
853
|
+
delattr(study, "_lib")
|
|
854
|
+
|
|
698
855
|
# Remove from history
|
|
699
|
-
if hasattr(study,
|
|
700
|
-
if
|
|
856
|
+
if hasattr(study, "history"):
|
|
857
|
+
if "identify" in study.history:
|
|
701
858
|
if logger:
|
|
702
859
|
logger.debug("Removing 'identify' from history")
|
|
703
|
-
del study.history[
|
|
704
|
-
|
|
705
|
-
if
|
|
860
|
+
del study.history["identify"]
|
|
861
|
+
|
|
862
|
+
if "lib_load" in study.history:
|
|
706
863
|
if logger:
|
|
707
864
|
logger.debug("Removing 'lib_load' from history")
|
|
708
|
-
del study.history[
|
|
709
|
-
|
|
865
|
+
del study.history["lib_load"]
|
|
866
|
+
|
|
710
867
|
if logger:
|
|
711
868
|
logger.info("Library and identification data reset completed")
|
|
712
869
|
|
|
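Taken together, the functions in this file form a small workflow. A hypothetical end-to-end call sequence, using the module path from the file list above; `study` is assumed to be an existing masster Study, and the arguments are examples only:

```python
from masster.study.id import lib_load, identify, get_id, lib_reset

# study = ...  # an existing masster Study instance (construction not shown)
lib_load(study, "libs/urine.csv")          # load and polarity-filter a library
identify(study, mz_tol=0.005, rt_tol=1.0)  # kwargs override the defaults
ids = get_id(study)                        # annotated, filtered results
print(ids.select(["consensus_uid", "name", "adduct", "score"]).head())

lib_reset(study)                           # drop lib_df/id_df and history entries
```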
@@ -714,11 +871,11 @@ def lib_reset(study):
 def _get_adducts(self, adducts_list: list = None, **kwargs):
     """
     Generate comprehensive adduct specifications for study-level adduct filtering.
-
+
     This method creates a DataFrame of adduct combinations that will be used to filter
     and score adducts at the study level. Similar to sample._get_adducts() but uses
     study-level parameters and constraints.
-
+
     Parameters
     ----------
     adducts_list : List[str], optional
|
|
|
727
884
|
**kwargs : dict
|
|
728
885
|
Override parameters, including:
|
|
729
886
|
- charge_min: Minimum charge to consider (default 1)
|
|
730
|
-
- charge_max: Maximum charge to consider (default 3)
|
|
887
|
+
- charge_max: Maximum charge to consider (default 3)
|
|
731
888
|
- max_combinations: Maximum number of adduct components to combine (default 3)
|
|
732
889
|
- min_probability: Minimum probability threshold (default from study parameters)
|
|
733
|
-
|
|
890
|
+
|
|
734
891
|
Returns
|
|
735
892
|
-------
|
|
736
893
|
pl.DataFrame
|
|
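A hypothetical call matching the documented kwargs; the adduct strings use the `formula:charge:probability` format that the parser below expects, and `study` stands in for the Study instance that owns `_get_adducts()`:

```python
# study = ...  # an existing masster Study instance (construction not shown)
adducts_df = study._get_adducts(
    adducts_list=["+H:1:0.8", "+Na:1:0.2", "-H2O:0:0.3"],  # illustrative specs
    charge_min=-2,
    charge_max=2,
    min_probability=0.05,
)
print(adducts_df.select(["name", "charge", "probability"]))
```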
@@ -742,270 +899,302 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
     - complexity: Number of adduct components (1-3)
     """
     # Import required modules
-
-    from itertools import combinations
-    import numpy as np
-
+
     # Use provided adducts list or get from study parameters
     if adducts_list is None:
-        adducts_list =
-
+        adducts_list = (
+            self.parameters.adducts
+            if hasattr(self.parameters, "adducts") and self.parameters.adducts
+            else []
+        )
+
     # Get parameters with study-specific defaults
-    charge_min = kwargs.get(
-    charge_max = kwargs.get(
-    max_combinations = kwargs.get(
-    min_probability = kwargs.get(
-
+    charge_min = kwargs.get("charge_min", -3)  # Allow negative charges
+    charge_max = kwargs.get("charge_max", 3)  # Study uses up to charge ±3
+    max_combinations = kwargs.get("max_combinations", 3)  # Up to 3 combinations
+    min_probability = kwargs.get(
+        "min_probability",
+        getattr(self.parameters, "adduct_min_probability", 0.04),
+    )
+
     # Parse base adduct specifications
     base_specs = []
-
+
     for adduct_str in adducts_list:
-        if not isinstance(adduct_str, str) or
+        if not isinstance(adduct_str, str) or ":" not in adduct_str:
             continue
-
+
         try:
-            parts = adduct_str.split(
+            parts = adduct_str.split(":")
             if len(parts) != 3:
                 continue
-
+
             formula_part = parts[0]
-            charge = int(parts[1])
+            charge = int(parts[1])
             probability = float(parts[2])
-
+
             # Calculate mass shift from formula
             mass_shift = self._calculate_formula_mass_shift(formula_part)
-
-            base_specs.append(
-
-
-
-
-
-
+
+            base_specs.append(
+                {
+                    "formula": formula_part,
+                    "charge": charge,
+                    "mass_shift": mass_shift,
+                    "probability": probability,
+                    "raw_string": adduct_str,
+                },
+            )
+
         except (ValueError, IndexError):
             continue
-
+
     if not base_specs:
         # Return empty DataFrame with correct schema
-        return pl.DataFrame(
-
-
-
-
-
-
+        return pl.DataFrame(
+            {
+                "name": [],
+                "charge": [],
+                "mass_shift": [],
+                "probability": [],
+                "complexity": [],
+            },
+        )
+
     # Generate all valid combinations
     combinations_list = []
-
+
     # Separate specs by charge type
-    positive_specs = [spec for spec in base_specs if spec[
-    negative_specs = [spec for spec in base_specs if spec[
-    neutral_specs = [spec for spec in base_specs if spec[
-
+    positive_specs = [spec for spec in base_specs if spec["charge"] > 0]
+    negative_specs = [spec for spec in base_specs if spec["charge"] < 0]
+    neutral_specs = [spec for spec in base_specs if spec["charge"] == 0]
+
     # 1. Single adducts (filter out neutral adducts with charge == 0)
     for spec in base_specs:
-        if charge_min <= spec[
+        if charge_min <= spec["charge"] <= charge_max and spec["charge"] != 0:
             formatted_name = self._format_adduct_name([spec])
-            combinations_list.append(
-
-
-
-
-
-
-
+            combinations_list.append(
+                {
+                    "components": [spec],
+                    "formatted_name": formatted_name,
+                    "total_mass_shift": spec["mass_shift"],
+                    "total_charge": spec["charge"],
+                    "combined_probability": spec["probability"],
+                    "complexity": 1,
+                },
+            )
+
     # 2. Generate multiply charged versions (2H+, 3H+, etc.) - already excludes charge==0
     for spec in positive_specs + negative_specs:
-        base_charge = spec[
+        base_charge = spec["charge"]
         for multiplier in range(2, min(max_combinations + 1, 4)):  # Up to 3x multiplier
             total_charge = base_charge * multiplier
             if charge_min <= total_charge <= charge_max and total_charge != 0:
                 components = [spec] * multiplier
                 formatted_name = self._format_adduct_name(components)
-
-                combinations_list.append(
-
-
-
-
-
-
-
+
+                combinations_list.append(
+                    {
+                        "components": components,
+                        "formatted_name": formatted_name,
+                        "total_mass_shift": spec["mass_shift"] * multiplier,
+                        "total_charge": total_charge,
+                        "combined_probability": spec["probability"] ** multiplier,
+                        "complexity": multiplier,
+                    },
+                )
+
     # 3. Mixed combinations (2-component) - limited for study level, filter out charge==0
     if max_combinations >= 2:
         # Positive + Neutral (1 neutral loss only) - but exclude if total charge == 0
         for pos_spec in positive_specs[:2]:  # Limit to first 2 positive specs
             for neut_spec in neutral_specs[:1]:  # Only 1 neutral loss
-                total_charge = pos_spec[
+                total_charge = pos_spec["charge"] + neut_spec["charge"]
                 if charge_min <= total_charge <= charge_max and total_charge != 0:
                     components = [pos_spec, neut_spec]
                     formatted_name = self._format_adduct_name(components)
-                    combinations_list.append(
-
-
-
-
-
-
-
+                    combinations_list.append(
+                        {
+                            "components": components,
+                            "formatted_name": formatted_name,
+                            "total_mass_shift": pos_spec["mass_shift"]
+                            + neut_spec["mass_shift"],
+                            "total_charge": total_charge,
+                            "combined_probability": pos_spec["probability"]
+                            * neut_spec["probability"],
+                            "complexity": 2,
+                        },
+                    )
+
     # Convert to polars DataFrame
     if combinations_list:
-        combinations_list.sort(
-
-
-
-
-
-
-
-
-
-
-
+        combinations_list.sort(
+            key=lambda x: (-x["combined_probability"], x["complexity"]),
+        )
+
+        adducts_df = pl.DataFrame(
+            [
+                {
+                    "name": combo["formatted_name"],
+                    "charge": combo["total_charge"],
+                    "mass_shift": combo["total_mass_shift"],
+                    "probability": combo["combined_probability"],
+                    "complexity": combo["complexity"],
+                }
+                for combo in combinations_list
+            ],
+        )
+
         # Filter by minimum probability threshold
         if min_probability > 0.0:
             adducts_before_filter = len(adducts_df)
             adducts_df = adducts_df.filter(pl.col("probability") >= min_probability)
             adducts_after_filter = len(adducts_df)
-
-            self.logger.debug(
-
+
+            self.logger.debug(
+                f"Study adducts: generated {adducts_before_filter}, filtered to {adducts_after_filter} (min_prob={min_probability})",
+            )
+
     else:
         # Return empty DataFrame with correct schema
-        adducts_df = pl.DataFrame(
-
-
-
-
-
-
+        adducts_df = pl.DataFrame(
+            {
+                "name": [],
+                "charge": [],
+                "mass_shift": [],
+                "probability": [],
+                "complexity": [],
+            },
+        )
+
     return adducts_df
 
+
 def _calculate_formula_mass_shift(self, formula: str) -> float:
     """Calculate mass shift from formula string like "+H", "-H2O", "+Na-H", etc."""
     # Standard atomic masses
     atomic_masses = {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        "H": 1.007825,
+        "C": 12.0,
+        "N": 14.003074,
+        "O": 15.994915,
+        "Na": 22.989769,
+        "K": 38.963707,
+        "Li": 7.016003,
+        "Ca": 39.962591,
+        "Mg": 23.985042,
+        "Fe": 55.934938,
+        "Cl": 34.968853,
+        "Br": 78.918336,
+        "I": 126.904473,
+        "P": 30.973762,
+        "S": 31.972071,
     }
-
+
     total_mass = 0.0
-
+
     # Parse formula by splitting on + and - while preserving the operators
     parts = []
     current_part = ""
     current_sign = 1
-
+
     for char in formula:
-        if char ==
+        if char == "+":
             if current_part:
                 parts.append((current_sign, current_part))
                 current_part = ""
             current_sign = 1
-        elif char ==
+        elif char == "-":
             if current_part:
                 parts.append((current_sign, current_part))
                 current_part = ""
             current_sign = -1
         else:
             current_part += char
-
+
     if current_part:
         parts.append((current_sign, current_part))
-
+
     # Process each part
     for sign, part in parts:
         if not part:
             continue
-
+
         # Parse element and count (e.g., "H2O" -> H:2, O:1)
         elements = self._parse_element_counts(part)
-
+
         for element, count in elements.items():
             if element in atomic_masses:
                 total_mass += sign * atomic_masses[element] * count
-
+
     return total_mass
 
+
 def _parse_element_counts(self, formula_part: str) -> dict[str, int]:
     """Parse element counts from a formula part like 'H2O' -> {'H': 2, 'O': 1}"""
     elements = {}
     i = 0
-
+
     while i < len(formula_part):
         # Get element (uppercase letter, possibly followed by lowercase)
         element = formula_part[i]
         i += 1
-
+
         while i < len(formula_part) and formula_part[i].islower():
             element += formula_part[i]
             i += 1
-
+
         # Get count (digits following element)
         count_str = ""
         while i < len(formula_part) and formula_part[i].isdigit():
             count_str += formula_part[i]
             i += 1
-
+
         count = int(count_str) if count_str else 1
         elements[element] = elements.get(element, 0) + count
-
+
     return elements
 
+
 def _format_adduct_name(self, components: list[dict]) -> str:
     """Format adduct name from components like [M+H]1+ or [M+2H]2+"""
     if not components:
         return "[M]"
-
+
     # Count occurrences of each formula
     from collections import Counter
-
-
-
+
+    formula_counts = Counter(comp["formula"] for comp in components)
+    total_charge = sum(comp["charge"] for comp in components)
+
     # Build formula part with proper multipliers
     formula_parts = []
-    for formula, count in sorted(
+    for formula, count in sorted(
+        formula_counts.items(),
+    ):  # Sort for consistent ordering
         if count == 1:
             formula_parts.append(formula)
         else:
             # For multiple occurrences, use count prefix (e.g., 2H, 3Na)
             # Handle special case where formula might already start with + or -
-            if formula.startswith((
+            if formula.startswith(("+", "-")):
                 sign = formula[0]
                 base_formula = formula[1:]
                 formula_parts.append(f"{sign}{count}{base_formula}")
             else:
                 formula_parts.append(f"{count}{formula}")
-
+
     # Combine formula parts
     formula = "".join(formula_parts)
-
+
     # Format charge
     if total_charge == 0:
         charge_str = ""
     elif abs(total_charge) == 1:
         charge_str = "1+" if total_charge > 0 else "1-"
     else:
-        charge_str =
-
+        charge_str = (
+            f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
+        )
+
+    return f"[M{formula}]{charge_str}"