masster 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/__init__.py +27 -27
- masster/_version.py +17 -17
- masster/chromatogram.py +497 -503
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
- masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
- masster/logger.py +318 -244
- masster/sample/__init__.py +9 -9
- masster/sample/defaults/__init__.py +15 -15
- masster/sample/defaults/find_adducts_def.py +325 -325
- masster/sample/defaults/find_features_def.py +366 -366
- masster/sample/defaults/find_ms2_def.py +285 -285
- masster/sample/defaults/get_spectrum_def.py +314 -318
- masster/sample/defaults/sample_def.py +374 -378
- masster/sample/h5.py +1321 -1297
- masster/sample/helpers.py +833 -364
- masster/sample/lib.py +762 -0
- masster/sample/load.py +1220 -1187
- masster/sample/parameters.py +131 -131
- masster/sample/plot.py +1610 -1622
- masster/sample/processing.py +1402 -1416
- masster/sample/quant.py +209 -0
- masster/sample/sample.py +391 -387
- masster/sample/sample5_schema.json +181 -181
- masster/sample/save.py +737 -719
- masster/sample/sciex.py +1213 -0
- masster/spectrum.py +1287 -1319
- masster/study/__init__.py +9 -9
- masster/study/defaults/__init__.py +21 -19
- masster/study/defaults/align_def.py +267 -267
- masster/study/defaults/export_def.py +41 -40
- masster/study/defaults/fill_chrom_def.py +264 -264
- masster/study/defaults/fill_def.py +260 -0
- masster/study/defaults/find_consensus_def.py +256 -256
- masster/study/defaults/find_ms2_def.py +163 -163
- masster/study/defaults/integrate_chrom_def.py +225 -225
- masster/study/defaults/integrate_def.py +221 -0
- masster/study/defaults/merge_def.py +256 -0
- masster/study/defaults/study_def.py +272 -269
- masster/study/export.py +674 -287
- masster/study/h5.py +1398 -886
- masster/study/helpers.py +1650 -433
- masster/study/helpers_optimized.py +317 -0
- masster/study/load.py +1201 -1078
- masster/study/parameters.py +99 -99
- masster/study/plot.py +632 -645
- masster/study/processing.py +1057 -1046
- masster/study/save.py +149 -134
- masster/study/study.py +606 -522
- masster/study/study5_schema.json +247 -241
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
- masster-0.3.0.dist-info/RECORD +59 -0
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
- masster-0.2.4.dist-info/RECORD +0 -50
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
- {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/sample/lib.py
ADDED
|
@@ -0,0 +1,762 @@
|
|
|
1
|
+
"""
|
|
2
|
+
lib.py
|
|
3
|
+
|
|
4
|
+
This module provides utility functions and algorithms for mass spectrometry data processing.
|
|
5
|
+
It contains core functionality for compound library management, target identification,
|
|
6
|
+
adduct handling, and various analytical operations used throughout the masster package.
|
|
7
|
+
|
|
8
|
+
Key Features:
|
|
9
|
+
- **Compound Libraries**: Load and manage compound databases with metadata.
|
|
10
|
+
- **Adduct Calculations**: Handle various ionization adducts and charge states.
|
|
11
|
+
- **Mass Calculations**: Precise mass calculations with adduct corrections.
|
|
12
|
+
- **Target Matching**: Match detected features against compound libraries.
|
|
13
|
+
- **Polarity Handling**: Support for positive and negative ionization modes.
|
|
14
|
+
- **Database Integration**: Interface with various compound database formats.
|
|
15
|
+
|
|
16
|
+
Dependencies:
|
|
17
|
+
- `pyopenms`: For mass spectrometry algorithms and data structures.
|
|
18
|
+
- `polars` and `pandas`: For efficient data manipulation and analysis.
|
|
19
|
+
- `numpy`: For numerical computations and array operations.
|
|
20
|
+
- `tqdm`: For progress tracking during batch operations.
|
|
21
|
+
|
|
22
|
+
Functions:
|
|
23
|
+
- `lib_load()`: Load compound libraries from CSV files.
|
|
24
|
+
- `load_lib()`: Alias for lib_load function.
|
|
25
|
+
- Various utility functions for mass calculations and library management.
|
|
26
|
+
|
|
27
|
+
Supported Adducts:
|
|
28
|
+
- Positive mode: [M+H]+, [M+Na]+, [M+K]+, [M+NH4]+, [M-H2O+H]+
|
|
29
|
+
- Negative mode: [M-H]-, [M+CH3COO]-, [M+HCOO]-, [M+Cl]-
|
|
30
|
+
|
|
31
|
+
Example Usage:
|
|
32
|
+
```python
|
|
33
|
+
from masster.sample.lib import lib_load
|
|
34
|
+
|
|
35
|
+
# Load compound library
|
|
36
|
+
lib_load(self, csvfile="compounds.csv", polarity="positive")
|
|
37
|
+
|
|
38
|
+
# Access loaded library data
|
|
39
|
+
print(f"Loaded {len(self.lib_df)} compounds")
|
|
40
|
+
print(self.lib_df.head())
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
See Also:
|
|
44
|
+
- `parameters._lib_parameters`: For library-specific parameter configuration.
|
|
45
|
+
- `single.py`: For applying library matching to detected features.
|
|
46
|
+
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
import os
|
|
50
|
+
import re
|
|
51
|
+
|
|
52
|
+
import numpy as np
|
|
53
|
+
import pandas as pd
|
|
54
|
+
import polars as pl
|
|
55
|
+
import pyopenms as oms
|
|
56
|
+
|
|
57
|
+
from tqdm import tqdm
|
|
58
|
+
|
|
59
|
+
from masster.chromatogram import Chromatogram
|
|
60
|
+
# Parameters removed - using hardcoded defaults
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def load_lib(self, *args, **kwargs):
    """Alias for ``lib_load``: forward every argument unchanged.

    Nothing is returned; ``lib_load`` stores its result on ``self``.
    """
    lib_load(self, *args, **kwargs)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def lib_load(self, csvfile=None, polarity="positive"):
    """Load target compounds from a CSV file into ``self.lib``.

    Every compound row is expanded into one library entry per supported
    adduct.  The monoisotopic mass is computed from the formula column with
    pyopenms and shifted by the adduct mass difference.

    Recognised columns (first spelling found wins):
        * name: ``Name``, ``name``, ``Compound``, ``compound`` (required)
        * formula: ``Formula``, ``formula`` (required)
        * SMILES: ``SMILES``, ``smiles`` (required)
        * retention time: ``rt`` / ``RT`` (optional)
        * second retention time: ``rt2`` / ``RT2`` (optional); rows that have
          a value here get an additional entry named ``"<name> II"``
        * identifier: ``id`` / ``ID`` (optional, falls back to the name column)
        * set: ``set`` / ``Set`` (optional)

    Parameters:
        csvfile (str or file-like): Path (or buffer) accepted by
            ``pandas.read_csv``.
        polarity (str or None): ``"positive"`` keeps adducts with z > 0,
            ``"negative"`` keeps adducts with z < 0 (case-insensitive),
            ``None`` keeps both.

    Returns:
        None.  The library is stored in ``self.lib`` as a pandas DataFrame
        with the columns ``libid, set, name, id, smiles, formula, adduct, m,
        z, mz, rt, MS2spec``.  Missing values are stored as ``None``.

    Raises:
        ValueError: If ``csvfile`` is None, a required column is missing, or
            ``polarity`` is neither ``"positive"`` nor ``"negative"``.
    """
    if csvfile is None:
        raise ValueError("csvfile must be provided.")
    if polarity is not None and polarity.lower() not in ("positive", "negative"):
        raise ValueError("Polarity must be 'positive' or 'negative'.")

    # adduct -> (mass shift in Da relative to neutral M, signed charge).
    # The shift is the true signed mass difference of the ion, so anion
    # attachment ([M+CH3COO]-, [M+HCOO]-, [M+Cl]-) is positive.  Values
    # include the electron mass correction.
    adducts = {
        "[M+H]+": (1.007276, 1),
        "[M+Na]+": (22.989218, 1),
        "[M+K]+": (38.963158, 1),
        "[M+NH4]+": (18.033823, 1),
        "[M-H2O+H]+": (-17.00329, 1),
        "[M-H]-": (-1.007276, -1),
        "[M+CH3COO]-": (59.013852, -1),
        "[M+HCOO]-": (44.998203, -1),
        "[M+Cl]-": (34.969402, -1),
    }
    lib_columns = [
        "libid",
        "set",
        "name",
        "id",
        "smiles",
        "formula",
        "adduct",
        "m",
        "z",
        "mz",
        "rt",
        "MS2spec",
    ]

    self.lib = None
    df = pd.read_csv(csvfile)
    available = set(df.columns)

    def pick(*candidates):
        # Return the first candidate column that exists in the CSV, else None.
        return next((name for name in candidates if name in available), None)

    name_col = pick("Name", "name", "Compound", "compound")
    if name_col is None:
        raise ValueError(
            "No column named 'Name', 'name', or 'Compound' found in the CSV file.",
        )
    formula_col = pick("Formula", "formula")
    if formula_col is None:
        raise ValueError(
            "No column named 'Formula' or 'formula' found in the CSV file.",
        )
    smiles_col = pick("SMILES", "smiles")
    if smiles_col is None:
        raise ValueError("No column named 'SMILES' or 'smiles' found in the CSV file.")
    rt_col = pick("rt", "RT")
    rt_col2 = pick("rt2", "RT2")
    id_col = pick("id", "ID") or name_col
    set_col = pick("set", "Set")
    if set_col is None:
        print(
            "No column named 'set' or 'Set' found in the CSV file. Using all targets.",
        )

    def entries(row, libid, accurate_mass, name, rt):
        # One library entry per adduct for a single compound / retention time.
        result = []
        for adduct, (shift, charge) in adducts.items():
            ion_mass = accurate_mass + shift
            result.append(
                {
                    "libid": libid,
                    "set": row[set_col] if set_col is not None else None,
                    "name": name,
                    "id": row[id_col],
                    "smiles": row[smiles_col],
                    "formula": row[formula_col],
                    "adduct": adduct,
                    "m": ion_mass,
                    "z": charge,
                    "mz": ion_mass / abs(charge),
                    "rt": rt,
                    "MS2spec": None,
                },
            )
        return result

    targets = []
    libid = 0
    for _index, row in df.iterrows():
        # Formula parsing happens in the constructor, so it must sit inside
        # the try block for malformed formulas to be skipped.
        try:
            accurate_mass = oms.EmpiricalFormula(row[formula_col]).getMonoWeight()
        except Exception as e:
            print(f"Error calculating accurate mass for {row[formula_col]}: {e}")
            continue

        rt = row[rt_col] if rt_col is not None else None
        targets.extend(entries(row, libid, accurate_mass, row[name_col], rt))

        # Only compounds that actually carry a second retention time get the
        # "II" entry; otherwise an RT-less duplicate would match on m/z alone.
        if rt_col2 is not None and pd.notna(row[rt_col2]):
            targets.extend(
                entries(row, libid, accurate_mass, f"{row[name_col]} II", row[rt_col2]),
            )
        libid += 1

    if not targets:
        # Keep the expected schema even when no compound could be loaded.
        self.lib = pd.DataFrame(columns=lib_columns)
        return

    lib = pd.DataFrame(targets, columns=lib_columns)
    # Store missing values as None rather than NaN.
    lib = lib.replace({np.nan: None})

    if polarity is not None:
        if polarity.lower() == "positive":
            lib = lib[lib["z"] > 0]
        else:
            lib = lib[lib["z"] < 0]
    self.lib = lib
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def link_lib(self, *args, **kwargs):
    """Alias for ``self.lib_link``: forward every argument unchanged.

    Nothing is returned; the result of the delegated call is discarded.
    """
    delegate = self.lib_link
    delegate(*args, **kwargs)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def lib_link(
    self,
    mz_tol=0.01,
    mz_tol_factor_lib=0.5,
    rt_tol=6.0,
    rt_tol_factor_lib=0.5,
    level=1,
):
    """Match library entries against detected features by m/z and, if known, RT.

    For every row of ``self.lib`` all rows of ``self.features_df`` whose m/z
    lies within ``mz_tol * mz_tol_factor_lib`` of the library m/z are
    selected.  If the library row has a retention time, the feature must also
    lie within ``rt_tol * rt_tol_factor_lib`` of it.  Library rows without a
    retention time (``None`` or NaN) are matched on m/z alone.

    The matches are stored in ``self.lib_match`` (pandas DataFrame, one row
    per library-entry/feature pair) and ``self.lib_eic`` is then called with
    the unscaled ``mz_tol`` and ``rt_tol``.

    Parameters:
        mz_tol (float): Base m/z tolerance.
        mz_tol_factor_lib (float): Factor applied to ``mz_tol`` for matching.
        rt_tol (float): Base retention time tolerance.
        rt_tol_factor_lib (float): Factor applied to ``rt_tol`` for matching.
        level (int): Only ``1`` produces matches; other values yield an
            empty match table.

    Returns:
        None.
    """
    if getattr(self, "lib", None) is None:
        print("Please load the library first.")
        return

    mz_tol_lib = mz_tol * mz_tol_factor_lib
    rt_tol_lib = rt_tol * rt_tol_factor_lib
    # Identity checks against None / np.nan do not detect NaN values, so use
    # pandas' missing-value test instead.
    rt_filter_enabled = pd.notna(rt_tol_lib)

    features = self.features_df
    has_ms2 = "ms2_scans" in features.columns
    value_columns = ["feature_uid", "inty", "quality", "mz", "rt"]

    lib_matches = []
    if level == 1:
        for _index, row in self.lib.iterrows():
            lib_rt = row["rt"]
            rt_known = pd.notna(lib_rt)

            mask = (features["mz"] >= row["mz"] - mz_tol_lib) & (features["mz"] <= row["mz"] + mz_tol_lib)
            if rt_known and rt_filter_enabled:
                mask &= (features["rt"] >= lib_rt - rt_tol_lib) & (features["rt"] <= lib_rt + rt_tol_lib)

            hits = features[mask]
            if hits.empty:
                continue

            # Read the matched rows directly instead of looking every
            # feature_uid up again in the full table (assumes feature_uid
            # identifies a single row).
            ms2_values = hits["ms2_scans"].to_list() if has_ms2 else [None] * len(hits)
            per_column = [hits[column].to_list() for column in value_columns]
            for (feature_uid, inty, quality, feat_mz, feat_rt), ms2_scans in zip(
                zip(*per_column),
                ms2_values,
            ):
                lib_matches.append(
                    {
                        "libid": row["libid"],
                        "set": row["set"],
                        "name": row["name"],
                        "id": row["id"],
                        "formula": row["formula"],
                        "adduct": row["adduct"],
                        "smiles": row["smiles"],
                        "z": row["z"],
                        "match_level": 1,
                        "feature_uid": feature_uid,
                        "inty": inty,
                        "quality": quality,
                        "mz": feat_mz,
                        "delta_mz": row["mz"] - feat_mz,
                        "rt": feat_rt,
                        "delta_rt": lib_rt - feat_rt if rt_known else None,
                        "ms2_scans": ms2_scans,
                        "eic": None,
                    },
                )

    self.lib_match = pd.DataFrame(lib_matches)
    self.lib_eic(mz_tol=mz_tol, rt_tol=rt_tol)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def lib_eic(
    self,
    mz_tol=0.01,
    rt_tol=6.0,
):
    """Extract an ion chromatogram (EIC) for every matched feature.

    For each row of ``self.lib_match`` the feature with the same
    ``feature_uid`` is looked up in ``self.features_df``.  All MS1 points of
    ``self.ms1_df`` (a polars DataFrame) within ``mz_tol`` of the feature m/z
    and between ``rt_start - rt_tol`` and ``rt_end + rt_tol`` are collected;
    per retention time the maximum intensity is kept.  The resulting
    ``Chromatogram`` is stored in the ``eic`` column of ``self.lib_match``.
    Rows whose feature cannot be found are left unchanged.

    Parameters:
        mz_tol (float): Half-width of the m/z extraction window.
        rt_tol (float): Extra retention time added on both sides of the
            feature's ``rt_start``/``rt_end`` range.

    Returns:
        None.
    """
    lib_match = getattr(self, "lib_match", None)
    if lib_match is None:
        print("Please load and match the library first.")
        return
    if len(lib_match) == 0:
        print("No matches found.")
        return

    # The column must exist and be of object dtype to hold Chromatogram objects.
    if "eic" not in lib_match.columns:
        lib_match["eic"] = None
    if lib_match["eic"].dtype != object:
        lib_match["eic"] = lib_match["eic"].astype(object)

    for index, row in lib_match.iterrows():
        candidates = self.features_df[self.features_df["feature_uid"] == row["feature_uid"]]
        if candidates.empty:
            continue
        feature = candidates.iloc[0]
        rt_start = feature["rt_start"] - rt_tol
        rt_end = feature["rt_end"] + rt_tol

        window = self.ms1_df.filter(
            (pl.col("rt") >= rt_start)
            & (pl.col("rt") <= rt_end)
            & (pl.col("mz") >= feature["mz"] - mz_tol)
            & (pl.col("mz") <= feature["mz"] + mz_tol),
        )
        # group_by does not guarantee row order; sort so the chromatogram is
        # monotonic in retention time.
        eic_rt = window.group_by("rt").agg(pl.col("inty").max()).sort("rt")
        eic = Chromatogram(
            eic_rt["rt"].to_numpy(),
            eic_rt["inty"].to_numpy(),
            label=f"EIC mz={feature['mz']:.4f}; {row['name']} {row['adduct']}",
            feature_start=feature["rt_start"],
            feature_end=feature["rt_end"],
            lib_rt=row["rt"],
        )
        # .at stores the object as a single cell value; .loc may try to
        # interpret list-like objects as multiple values.
        lib_match.at[index, "eic"] = eic
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# TODO Should go in _export? (Almost the same method already there)
|
|
326
|
+
def save_lib_mgf(
    self,
    filename="lib_export.mgf",
    selection="best",
    split_energy=True,
    merge=False,
    centroid=True,
    inty_min=float("-inf"),
    q1_ratio_min=None,
    q1_ratio_max=None,
    eic_corr_min=None,
    deisotope=True,
    verbose=False,
    precursor_trim=-10.0,
    centroid_algo=None,
):
    """Export the spectra of all library-matched features to an MGF file.

    For every row of ``self.lib_match`` whose feature has MS2 scans, the MS1
    spectrum closest to the feature retention time is written (trimmed to
    -2 / +10 Da around the precursor), followed by MS2 spectra:

    * ``split_energy=True``: scans are grouped by their ``energy`` (read from
      the feature's ``ms2_specs``); per energy either the first scan
      (``selection="best"``) or all scans are written.
    * ``split_energy=False, selection="best"``: the first entry of
      ``ms2_scans`` is written.
    * ``split_energy=False, selection="all"``: every scan is written, or one
      merged spectrum if ``merge`` is true.

    Parameters:
        filename (str): Output path.
        selection (str): ``"best"`` or ``"all"``.
        split_energy (bool): Export per collision energy.
        merge (bool): Merge all MS2 scans (only with ``selection="all"`` and
            ``split_energy=False``).
        centroid (bool): Passed to ``self.get_spectrum``; also triggers
            denoising/centroiding of a merged spectrum.
        inty_min (float): Minimum peak intensity (only applied if > 0).
        q1_ratio_min, q1_ratio_max, eic_corr_min (float or None): Optional
            peak filters, applied only if the spectrum carries the
            corresponding attribute.
        deisotope (bool): Passed to ``self.get_spectrum``.
        verbose (bool): Print the filter thresholds after export.
        precursor_trim (float): Passed to ``self.get_spectrum`` for MS2.
        centroid_algo (str or None): Centroiding algorithm; defaults to
            ``self.parameters["centroid_algo"]`` if present, else ``"cr"``.

    Returns:
        None.
    """
    if self.lib_match is None:
        print("Please load and match the library first.")
        return

    if len(self.lib_match) == 0:
        print("No matches found.")
        return

    if centroid_algo is None:
        if "centroid_algo" in self.parameters:
            centroid_algo = self.parameters["centroid_algo"]
        else:
            centroid_algo = "cr"

    # Name of the raw data file; kept separate from the output ``filename``.
    source_name = os.path.basename(self.file_path)
    peak_filters = {
        "inty_min": inty_min,
        "q1_min": q1_ratio_min,
        "eic_min": eic_corr_min,
        "q1_max": q1_ratio_max,
    }
    ms2_options = {
        "centroid": centroid,
        "deisotope": deisotope,
        "precursor_trim": precursor_trim,
        "centroid_algo": centroid_algo,
    }

    skip = 0
    with open(filename, "w", encoding="utf-8") as f:
        for _index, matchrow in tqdm(
            self.lib_match.iterrows(),
            total=len(self.lib_match),
            desc="Export MGF",
        ):
            row = self.features_df[self.features_df["feature_uid"] == matchrow["feature_uid"]].iloc[0]
            if _mgf_is_missing(row["ms2_scans"]):
                skip += 1
                continue

            # MS1 spectrum closest to the feature apex.
            ms1_scan_uid = self.select_closest_scan(rt=row["rt"])["scan_uid"][0]
            ms1_spec = self.get_spectrum(
                ms1_scan_uid,
                centroid=centroid,
                deisotope=deisotope,
                centroid_algo=centroid_algo,
            )
            if ms1_spec is not None:
                # trim spectrum 2 Da lower and 10 Da higher than precursor m/z
                ms1_spec = ms1_spec.mz_trim(mz_min=row["mz"] - 2.0, mz_max=row["mz"] + 10.0)
            ms1_spec = _mgf_filter_peaks(ms1_spec, inty_min=inty_min)
            _mgf_export(f, row, matchrow, ms1_spec, source_name, ms1_scan_uid)

            if split_energy:
                ms2_specs = row["ms2_specs"]
                # dict.fromkeys keeps first-seen order, giving deterministic output.
                for energy in dict.fromkeys(s.energy for s in ms2_specs):
                    scan_uids = [s.scan_uid for s in ms2_specs if s.energy == energy]
                    if selection == "best":
                        # Keep a one-element list so the loop below still works.
                        scan_uids = scan_uids[:1]
                    for scan_uid in scan_uids:
                        spec = self.get_spectrum(scan_uid, **ms2_options)
                        spec = _mgf_filter_peaks(spec, **peak_filters)
                        _mgf_export(f, row, matchrow, spec, source_name, ms1_scan_uid)
            elif selection == "best":
                spec = self.get_spectrum(row["ms2_scans"][0], **ms2_options)
                spec = _mgf_filter_peaks(spec, **peak_filters)
                _mgf_export(f, row, matchrow, spec, source_name, ms1_scan_uid)
            elif selection == "all" and merge:
                specs = [
                    self.get_spectrum(
                        scan_uid,
                        centroid=centroid,
                        deisotope=deisotope,
                        precursor_trim=precursor_trim,
                    )
                    for scan_uid in row["ms2_scans"]
                ]
                # NOTE(review): merge_peaks is invoked on the (trimmed) MS1
                # spectrum, as in the original code - confirm this is intended.
                spec = ms1_spec.merge_peaks(specs) if ms1_spec is not None else None
                if spec is not None:
                    if centroid:
                        spec = spec.denoise()
                        if spec.ms_level == 1:
                            spec = spec.centroid(
                                tolerance=self.parameters["mz_tol_ms1_da"],
                                ppm=self.parameters["mz_tol_ms1_ppm"],
                                min_points=self.parameters["centroid_min_points_ms1"],
                                algo=centroid_algo,
                            )
                        elif spec.ms_level == 2:
                            spec = spec.centroid(
                                tolerance=self.parameters["mz_tol_ms2_da"],
                                ppm=self.parameters["mz_tol_ms2_ppm"],
                                min_points=self.parameters["centroid_min_points_ms2"],
                                algo=centroid_algo,
                            )
                    if deisotope:
                        spec = spec.deisotope()
                spec = _mgf_filter_peaks(spec, **peak_filters)
                _mgf_export(f, row, matchrow, spec, source_name, ms1_scan_uid)
            elif selection == "all":
                for scan_uid in row["ms2_scans"]:
                    spec = self.get_spectrum(scan_uid, **ms2_options)
                    spec = _mgf_filter_peaks(spec, **peak_filters)
                    _mgf_export(f, row, matchrow, spec, source_name, ms1_scan_uid)

    if verbose:
        print(
            f"MGF created with int>{_mgf_fmt(inty_min)}, q1_ratio>{_mgf_fmt(q1_ratio_min)}, "
            f"eic_corr>{_mgf_fmt(eic_corr_min)}",
        )
    print(
        f"- Skipped {skip} features because no MS2 peaks were left after filtering.",
    )


def _mgf_is_missing(value):
    """Return True if *value* is None or a float NaN."""
    return value is None or (isinstance(value, float) and value != value)


def _mgf_fmt(value):
    """Format an optional numeric threshold for the verbose summary."""
    return "n/a" if value is None else f"{value:.3f}"


def _mgf_filter_peaks(spec, inty_min=None, q1_min=None, eic_min=None, q1_max=None):
    """Return a filtered copy of *spec*; None is passed through unchanged.

    Every list or array attribute of the copy whose length equals the number
    of peaks is reduced with the same mask.
    """
    if spec is None:
        return None
    spec = spec.copy()
    n_peaks = len(spec.mz)
    keep = np.ones(n_peaks, dtype=bool)
    if inty_min is not None and inty_min > 0:
        keep &= np.asarray(spec.inty) >= inty_min
    q1_ratio = getattr(spec, "q1_ratio", None)
    if q1_ratio is not None:
        if q1_min is not None:
            keep &= np.asarray(q1_ratio) >= q1_min
        if q1_max is not None:
            keep &= np.asarray(q1_ratio) <= q1_max
    eic_corr = getattr(spec, "eic_corr", None)
    if eic_min is not None and eic_corr is not None:
        keep &= np.asarray(eic_corr) >= eic_min

    for attr, value in list(vars(spec).items()):
        if isinstance(value, np.ndarray):
            if value.ndim >= 1 and len(value) == n_peaks:
                setattr(spec, attr, value[keep])
        elif isinstance(value, list) and len(value) == n_peaks:
            # Plain lists cannot be indexed with a boolean mask.
            setattr(spec, attr, [item for item, flag in zip(value, keep) if flag])
    return spec


def _mgf_activation(source_name):
    """Derive (activation, kinetic energy) from tokens in the raw file name.

    ``CID``/``ZTS`` and ``EAD`` select the activation label; for EAD the
    kinetic energy is parsed from ``<digits>KE`` or, failing that,
    ``<digits>EV``.  Matching is case-insensitive.
    """
    upper = source_name.upper()
    has_ead = "EAD" in upper
    if "CID" in upper or "ZTS" in upper:
        activation = "CID-EAD" if has_ead else "CID"
    elif has_ead:
        activation = "EAD"
    else:
        return None, None

    kinetic_energy = None
    if has_ead:
        found = re.search(r"(\d+)KE", upper) or re.search(r"(\d+)EV", upper)
        if found:
            kinetic_energy = int(found.group(1))
    return activation, kinetic_energy


def _mgf_header(row, matchrow, spec, source_name, ms1_scan_uid):
    """Build the ordered MGF header fields for one spectrum."""
    ms_level = 1 if spec.ms_level is None else spec.ms_level
    activation, kinetic_energy = (None, None)
    if ms_level > 1:
        activation, kinetic_energy = _mgf_activation(source_name)
    adduct = matchrow["adduct"]
    return {
        "PEPMASS": row["mz"],
        "RTINSECONDS": row["rt"],
        "IONMODE": "positive" if adduct[-1] == "+" else "negative",
        "CHARGE": "1" + adduct.split("]")[1],
        "NAME": f"{matchrow['name']}",
        "SMILES": matchrow["smiles"],
        "FORMULA": matchrow["formula"],
        "ADDUCT": adduct,
        "LIBID": matchrow["libid"],
        "ACTIVATION": activation,
        "COLLISIONENERGY": spec.energy if hasattr(spec, "energy") else None,
        "KINETICENERGY": kinetic_energy,
        "FILENAME": source_name,
        # NOTE(review): the MS1 scan uid is written for MS2 spectra as well,
        # as in the original code - confirm whether the MS2 uid is wanted.
        "SCANS": ms1_scan_uid,
        "FID": row["feature_uid"],
        "MSLEVEL": ms_level,
    }


def _mgf_export(handle, row, matchrow, spec, source_name, ms1_scan_uid):
    """Write one BEGIN IONS / END IONS record; does nothing if *spec* is None."""
    if spec is None:
        return
    header = _mgf_header(row, matchrow, spec, source_name, ms1_scan_uid)
    lines = ["BEGIN IONS\n"]
    lines.extend(f"{key.upper()}={value}\n" for key, value in header.items())
    lines.extend(f"{mz:.5f} {inty:.0f}\n" for mz, inty in zip(spec.mz, spec.inty))
    lines.append("END IONS\n\n")
    handle.writelines(lines)
|