pycompound 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +470 -144
- pycompound/build_library.py +2 -9
- pycompound/plot_spectra.py +17 -42
- pycompound/processing.py +0 -9
- pycompound/similarity_measures.py +0 -3
- pycompound/spec_lib_matching.py +295 -102
- pycompound/spec_lib_matching_CLI.py +2 -7
- pycompound/tuning_CLI.py +2 -3
- {pycompound-0.1.0.dist-info → pycompound-0.1.2.dist-info}/METADATA +1 -1
- pycompound-0.1.2.dist-info/RECORD +14 -0
- pycompound-0.1.0.dist-info/RECORD +0 -14
- {pycompound-0.1.0.dist-info → pycompound-0.1.2.dist-info}/WHEEL +0 -0
- {pycompound-0.1.0.dist-info → pycompound-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {pycompound-0.1.0.dist-info → pycompound-0.1.2.dist-info}/top_level.txt +0 -0
pycompound/spec_lib_matching.py
CHANGED
@@ -1,6 +1,4 @@
 
-# this script's function runs spectral library matching to identify unknown query compound(s)
-
 from pycompound.build_library import build_library_from_raw_data
 from .processing import *
 from .similarity_measures import *
@@ -9,6 +7,13 @@ from pathlib import Path
 import json
 from itertools import product
 from joblib import Parallel, delayed
+import csv
+import sys, csv
+
+
+default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
+default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
+
 
 def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
                    similarity_measure_tmp, weight,
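The two module-level default grids added above are plain dicts of lists that the tuning functions later expand into every parameter combination with itertools.product. A minimal sketch of that expansion, trimmed to two keys for brevity (illustrative only, not code from the package):

    from itertools import product

    # Each key maps to a list of candidate values; product() walks the cross product.
    grid = {'similarity_measure': ['cosine', 'shannon'], 'noise_threshold': [0.0, 0.01]}
    for combo in product(*grid.values()):
        params = dict(zip(grid.keys(), combo))
        print(params)   # 2 x 2 = 4 parameter sets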
@@ -71,22 +76,23 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
     )
 
 
-
+
+def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
     """
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a
+    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
 
     --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
     --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
     --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
     """
 
+    grid = {**default_HRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
 
-    # load query and reference libraries
     if query_data is None:
-        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
         sys.exit()
     else:
         extension = query_data.rsplit('.',1)
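The grid = {**default_HRMS_grid, **(grid or {})} line added above is what makes every tuning parameter optional: caller-supplied keys override the defaults key by key, and passing grid=None keeps the defaults untouched. A minimal sketch of the merge semantics (illustrative, not the package's code):

    defaults = {'noise_threshold': [0.0], 'wf_int': [1.0]}
    user_grid = {'noise_threshold': [0.0, 0.01]}   # caller sweeps one key only

    # Later entries win in a dict literal, so user values replace defaults per key,
    # while any key the caller omits keeps its default list.
    merged = {**defaults, **(user_grid or {})}
    assert merged == {'noise_threshold': [0.0, 0.01], 'wf_int': [1.0]}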
@@ -118,37 +124,157 @@ def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid={'simila
     print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
 
     if output_path is None:
-        output_path = f'{Path.cwd()}/tuning_param_output.
+        output_path = f'{Path.cwd()}/tuning_param_output.txt'
         print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
 
-    # build parameter grid out of the lists you already set
     param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold,
                          window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
-    # run in parallel on all CPUs
     results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
 
     df_out = pd.DataFrame(results, columns=[
         'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX','NOISE.THRESHOLD',
         'WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING', 'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
     ])
-    df_out = df_out
-    df_out.
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(":","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Cosine","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Shannon","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
+    df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
+
+    if return_output is False:
+        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
+    else:
+        return df_out
 
 
-
+
+def tune_params_on_HRMS_data_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
+    """
+    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible
+    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
+    and prints top-performing parameters
+
+    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
+    should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
+    other columns should correspond to a single mass/charge ratio. Mandatory argument.
+    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
+    to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
+    compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
+    --grid: dict with all possible parameter values to try.
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
     """
-
+
+    local_grid = {**default_HRMS_grid, **(grid or {})}
+    for key, value in local_grid.items():
+        globals()[key] = value
+
+    if query_data is None:
+        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
+        sys.exit()
+    else:
+        extension = query_data.rsplit('.', 1)[-1]
+        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
+            output_path_tmp = query_data[:-3] + 'csv'
+            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
+            df_query = pd.read_csv(output_path_tmp)
+        elif extension in ('csv','CSV'):
+            df_query = pd.read_csv(query_data)
+        else:
+            print(f'\nError: Unsupported query_data extension: {extension}')
+            sys.exit()
+        unique_query_ids = df_query.iloc[:, 0].unique()
+
+    if reference_data is None:
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
+        sys.exit()
+    else:
+        if isinstance(reference_data, str):
+            df_reference = get_reference_df(reference_data=reference_data)
+            unique_reference_ids = df_reference.iloc[:, 0].unique()
+        else:
+            dfs = []
+            unique_reference_ids = []
+            for f in reference_data:
+                tmp = get_reference_df(reference_data=f)
+                dfs.append(tmp)
+                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
+            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
+
+    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
+          f'{len(unique_reference_ids)} unique reference spectra, and '
+          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
+
+    if output_path is None:
+        output_path = f'{Path.cwd()}/tuning_param_output.txt'
+        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
+
+    param_grid = product(
+        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
+        noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
+        entropy_dimension, high_quality_reference_library
+    )
+
+    results = []
+    total = (
+        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
+        len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
+        len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
+        len(entropy_dimension) * len(high_quality_reference_library)
+    )
+    done = 0
+
+    for params in param_grid:
+        res = _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
+        results.append(res)
+        done += 1
+        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
+
+    df_out = pd.DataFrame(results, columns=[
+        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
+        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
+        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
+    ])
+
+    if 'WEIGHT' in df_out.columns:
+        df_out['WEIGHT'] = (
+            df_out['WEIGHT'].astype(str)
+            .str.replace("\"","",regex=False)
+            .str.replace("{","",regex=False)
+            .str.replace("}","",regex=False)
+            .str.replace(":","",regex=False)
+            .str.replace("Cosine","",regex=False)
+            .str.replace("Shannon","",regex=False)
+            .str.replace("Renyi","",regex=False)
+            .str.replace("Tsallis","",regex=False)
+            .str.replace(" ","",regex=False)
+        )
+
+    if return_output:
+        return df_out
+    else:
+        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
+        print(f'Wrote results to {output_path}')
+
+
+def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
+    """
+    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
 
     --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
     --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
     --grid: dict with all possible parameter values to try
-    --output_path: accuracy from each choice of parameter set is saved to a
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here
     """
 
+    grid = {**default_NRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
 
-    # load query and reference libraries
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
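Both tuning paths scrub the WEIGHT column with nine consecutive str.replace(..., regex=False) calls before writing the tab-separated output. The same cleanup can be done in a single regex pass; a sketch of that equivalent (an illustrative alternative, not what the wheel ships):

    import pandas as pd

    df_out = pd.DataFrame({'WEIGHT': ['{"Cosine": 0.25, "Shannon": 0.25, "Renyi": 0.25, "Tsallis": 0.25}']})

    # Drop braces, quotes, colons, spaces, and the four entropy names in one pass,
    # leaving only the comma-separated weights, e.g. '0.25,0.25,0.25,0.25'.
    df_out['WEIGHT'] = df_out['WEIGHT'].astype(str).str.replace(
        r'["{}: ]|Cosine|Shannon|Renyi|Tsallis', '', regex=True)
    print(df_out['WEIGHT'].iloc[0])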
@@ -182,75 +308,184 @@ def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid={'simila
     print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
 
     if output_path is None:
-        output_path = f'{Path.cwd()}/tuning_param_output.
+        output_path = f'{Path.cwd()}/tuning_param_output.txt'
         print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
 
-    # build parameter grid out of the lists you already set
     param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
                          noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
-    # run in parallel on all CPUs
     results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
 
     df_out = pd.DataFrame(results, columns=[
         'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
         'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
     ])
-    df_out = df_out
-    df_out.
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(":","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Cosine","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Shannon","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
+    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
+    if return_output is False:
+        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
+    else:
+        return df_out
+
+
+
+def tune_params_on_NRMS_data_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
+    """
+    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible
+    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
+    and prints top-performing parameters
+
+    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
+    should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
+    other columns should correspond to a single mass/charge ratio. Mandatory argument.
+    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
+    to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
+    compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
+    --grid: dict with all possible parameter values to try.
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
+    """
+
+    local_grid = {**default_NRMS_grid, **(grid or {})}
+    for key, value in local_grid.items():
+        globals()[key] = value
+
+    if query_data is None:
+        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
+        sys.exit()
+    else:
+        extension = query_data.rsplit('.', 1)[-1]
+        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
+            output_path_tmp = query_data[:-3] + 'csv'
+            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
+            df_query = pd.read_csv(output_path_tmp)
+        elif extension in ('csv','CSV'):
+            df_query = pd.read_csv(query_data)
+        else:
+            print(f'\nError: Unsupported query_data extension: {extension}')
+            sys.exit()
+        unique_query_ids = df_query.iloc[:, 0].unique()
+
+    if reference_data is None:
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
+        sys.exit()
+    else:
+        if isinstance(reference_data, str):
+            df_reference = get_reference_df(reference_data=reference_data)
+            unique_reference_ids = df_reference.iloc[:, 0].unique()
+        else:
+            dfs = []
+            unique_reference_ids = []
+            for f in reference_data:
+                tmp = get_reference_df(reference_data=f)
+                dfs.append(tmp)
+                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
+            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
+
+    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
+          f'{len(unique_reference_ids)} unique reference spectra, and '
+          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
+
+    if output_path is None:
+        output_path = f'{Path.cwd()}/tuning_param_output.txt'
+        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
+
+    param_grid = product(
+        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
+        noise_threshold, wf_mz, wf_int, LET_threshold,
+        entropy_dimension, high_quality_reference_library
+    )
+
+    results = []
+    total = (
+        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
+        len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
+    )
+    done = 0
+    for params in param_grid:
+        res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
+        results.append(res)
+        done += 1
+        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
+
+    df_out = pd.DataFrame(results, columns=[
+        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
+        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
+    ])
+
+    if 'WEIGHT' in df_out.columns:
+        df_out['WEIGHT'] = (
+            df_out['WEIGHT'].astype(str)
+            .str.replace("\"","",regex=False)
+            .str.replace("{","",regex=False)
+            .str.replace("}","",regex=False)
+            .str.replace(":","",regex=False)
+            .str.replace("Cosine","",regex=False)
+            .str.replace("Shannon","",regex=False)
+            .str.replace("Renyi","",regex=False)
+            .str.replace("Tsallis","",regex=False)
+            .str.replace(" ","",regex=False)
+        )
+
+    if return_output:
+        return df_out
+    else:
+        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
+        print(f'Wrote results to {output_path}')
 
 
 
 
 def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
-    # returns accuracy for a given set of parameters
 
     n_top_matches_to_save = 1
 
-    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
     all_similarity_scores = []
     for query_idx in range(0,len(unique_query_ids)):
        print(f'query spectrum #{query_idx} is being identified')
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-        # compute the similarity score between the given query spectrum and all spectra in the reference library
        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            q_spec = q_spec_tmp
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-            # apply spectrum preprocessing transformation in the order specified by user
            is_matched = False
            for transformation in spectrum_preprocessing_order:
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
-                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                    q_spec = m_spec[:,0:2]
                    r_spec = m_spec[:,[0,2]]
                    is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
 
-            # query and reference spectrum intensities
            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
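The new _shiny variants run the grid sequentially so they can print progress, and they pre-compute total with a long chain of len(...) factors because product() is lazy and has no length. A more compact way to get the same count (illustrative alternative, not the code above):

    import math
    from itertools import product

    grid = {'similarity_measure': ['cosine', 'shannon'], 'noise_threshold': [0.0, 0.01], 'wf_int': [0.5, 1.0]}

    # The number of combinations is simply the product of the list lengths.
    total = math.prod(len(v) for v in grid.values())
    for done, combo in enumerate(product(*grid.values()), start=1):
        print(f'Completed {done}/{total} grid combinations.')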
@@ -261,12 +496,10 @@ def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -297,7 +530,6 @@ def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
 
 
 def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
-    # returns accuracy for a given set of parameters
 
     n_top_matches_to_save = 1
 
@@ -320,32 +552,29 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)
 
-            # apply spectrum preprocessing transformation in the order specified by user
            for transformation in spectrum_preprocessing_order:
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'W':
+                if transformation == 'W':
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
-                if transformation == 'L':
+                if transformation == 'L':
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
-                if transformation == 'N':
+                if transformation == 'N':
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F':
+                if transformation == 'F':
                    q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
 
-            # query and reference spectrum intensities
            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]
 
-            # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
            else:
@@ -354,12 +583,10 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -371,7 +598,6 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
 
-            #preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
@@ -413,11 +639,10 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
     --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
     --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
     --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output
-    --output_similarity_scores: Output
+    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
+    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
     '''
 
-    # load query and reference libraries
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
@@ -449,7 +674,6 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
 
-    ##### process input parameters and ensure they are in a valid format #####
     if spectrum_preprocessing_order is not None:
         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
     else:
@@ -517,7 +741,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
     else:
         q = entropy_dimension
 
-    normalization_method = 'standard'
+    normalization_method = 'standard'
 
     if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
         print('\nError: n_top_matches_to_save should be a positive integer')
@@ -528,23 +752,20 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
         sys.exit()
 
     if output_identification is None:
-        output_identification = f'{Path.cwd()}/output_identification.
+        output_identification = f'{Path.cwd()}/output_identification.txt'
         print(f'Warning: writing identification output to {output_identification}')
 
     if output_similarity_scores is None:
-        output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.
+        output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
         print(f'Warning: writing similarity scores to {output_similarity_scores}')
 
 
-    ####################################### begin spectral library matching #######################################
-    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
     all_similarity_scores = []
     for query_idx in range(0,len(unique_query_ids)):
        print(f'query spectrum #{query_idx} is being identified')
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-        # compute the similarity score between the given query spectrum and all spectra in the reference library
        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            #if ref_idx % 100 == 0:
@@ -553,37 +774,35 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-            # apply spectrum preprocessing transformation in the order specified by user
            is_matched = False
            for transformation in spectrum_preprocessing_order:
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
-                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                    q_spec = m_spec[:,0:2]
                    r_spec = m_spec[:,[0,2]]
                    is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
 
-            # query and reference spectrum intensities
            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]
 
@@ -595,12 +814,10 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -624,30 +841,24 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
     scores = np.array(scores)
     out = np.c_[preds,scores]
 
-    # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
     cnames_preds = []
     cnames_scores = []
     for i in range(0,n_top_matches_to_save):
         cnames_preds.append(f'RANK.{i+1}.PRED')
         cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
 
-    # get pandas dataframe with identifcation results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
     df_top_ref_specs.index.names = ['Query Spectrum ID']
 
     df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
 
-    # print the identification results if the user desires
     if print_id_results == True:
         print(df_top_ref_specs.to_string())
 
     if return_ID_output is False:
-
-
-
-        # write all similarity scores to disk
-        df_scores.to_csv(output_similarity_scores)
+        df_top_ref_specs.to_csv(output_identification, sep='\t')
+        df_scores.to_csv(output_similarity_scores, sep='\t')
     else:
         return df_top_ref_specs
 
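From 0.1.2 the identification and similarity-score tables are written tab-separated, matching the .txt default file names set earlier in the function. A small sketch of reading them back (the file names are just the documented defaults; adjust to whatever paths were actually passed):

    import pandas as pd

    # 'Query Spectrum ID' is the index written by to_csv, so restore it on read.
    df_id = pd.read_csv('output_identification.txt', sep='\t', index_col='Query Spectrum ID')
    df_scores = pd.read_csv('output_all_similarity_scores.txt', sep='\t', index_col='Query Spectrum ID')
    print(df_id.head())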
@@ -678,11 +889,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     --normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since the objects entropy quantifies the uncertainy of must be probability distributions, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
     --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
     --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output
-    --output_similarity_scores: Output
+    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
+    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
     '''
 
-    # load query and reference libraries
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
@@ -714,7 +924,6 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
 
-    ##### process input parameters and ensure they are in a valid format #####
     if spectrum_preprocessing_order is not None:
         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
     else:
@@ -767,7 +976,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     else:
         q = entropy_dimension
 
-    normalization_method = 'standard'
+    normalization_method = 'standard'
 
     if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
         print('\nError: n_top_matches_to_save should be a positive integer')
@@ -778,23 +987,19 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
         sys.exit()
 
     if output_identification is None:
-        output_identification = f'{Path.cwd()}/output_identification.
+        output_identification = f'{Path.cwd()}/output_identification.txt'
         print(f'Warning: writing identification output to {output_identification}')
 
     if output_similarity_scores is None:
-        output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.
+        output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
        print(f'Warning: writing similarity scores to {output_similarity_scores}')
 
 
 
-    ####################################### begin spectral library matching #######################################
-    # get the range of m/z values
     min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
     max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
     mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
 
-    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
-    # for each query spectrum, compute its similarity with all reference spectra
     all_similarity_scores = []
     for query_idx in range(0,len(unique_query_ids)):
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
@@ -810,32 +1015,29 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)
 
-            # apply spectrum preprocessing transformation in the order specified by user
            for transformation in spectrum_preprocessing_order:
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'W':
+                if transformation == 'W':
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-                if transformation == 'L':
+                if transformation == 'L':
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                if transformation == 'N':
+                if transformation == 'N':
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F':
+                if transformation == 'F':
                    q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
 
-            # query and reference spectrum intensities
            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]
 
-            # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
            else:
@@ -844,12 +1046,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -861,7 +1061,6 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
 
-            #preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
@@ -874,31 +1073,25 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     scores = np.array(scores)
     out = np.c_[preds,scores]
 
-    # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
     cnames_preds = []
     cnames_scores = []
     for i in range(0,n_top_matches_to_save):
         cnames_preds.append(f'RANK.{i+1}.PRED')
         cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
 
-    # get pandas dataframe with identifcation results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
     df_top_ref_specs.index.names = ['Query Spectrum ID']
 
-    # print the identification results if the user desires
     if print_id_results == True:
         print(df_top_ref_specs.to_string())
 
     df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
 
     if return_ID_output is False:
-
-        df_top_ref_specs.to_csv(output_identification)
-
-        # write all similarity scores to disk
+        df_top_ref_specs.to_csv(output_identification, sep='\t')
         df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
-        df_scores.to_csv(output_similarity_scores)
+        df_scores.to_csv(output_similarity_scores, sep='\t')
     else:
         return df_top_ref_specs
 
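Overall, 0.1.2 moves the tuning defaults into module-level default_HRMS_grid/default_NRMS_grid dicts, adds sequential *_shiny tuning variants that print per-combination progress, and standardizes the outputs on tab-separated .txt files. A minimal usage sketch based only on the signatures shown in this diff (the file paths are placeholders):

    from pycompound.spec_lib_matching import tune_params_on_NRMS_data

    # Override only the parameters worth sweeping; the rest fall back to
    # default_NRMS_grid through the {**defaults, **(grid or {})} merge.
    grid = {'similarity_measure': ['cosine', 'shannon'], 'noise_threshold': [0.0, 0.01]}

    df_results = tune_params_on_NRMS_data(
        query_data='query_library.csv',          # placeholder path
        reference_data='reference_library.csv',  # placeholder path
        grid=grid,
        output_path='tuning_param_output.txt',
        return_output=True,                      # return the DataFrame instead of only writing it
    )
    print(df_results.sort_values('ACC', ascending=False).head())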