pycompound-0.1.1-py3-none-any.whl → pycompound-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +155 -194
- pycompound/build_library.py +2 -9
- pycompound/plot_spectra.py +10 -38
- pycompound/processing.py +0 -9
- pycompound/similarity_measures.py +0 -3
- pycompound/spec_lib_matching.py +246 -81
- pycompound/spec_lib_matching_CLI.py +2 -7
- pycompound/tuning_CLI.py +1 -1
- {pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/METADATA +1 -1
- pycompound-0.1.2.dist-info/RECORD +14 -0
- pycompound-0.1.1.dist-info/RECORD +0 -14
- {pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/WHEEL +0 -0
- {pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/top_level.txt +0 -0
pycompound/spec_lib_matching.py
CHANGED
@@ -1,6 +1,4 @@
 
-# this script's function runs spectral library matching to identify unknown query compound(s)
-
 from pycompound.build_library import build_library_from_raw_data
 from .processing import *
 from .similarity_measures import *
@@ -10,6 +8,7 @@ import json
 from itertools import product
 from joblib import Parallel, delayed
 import csv
+import sys, csv
 
 
 default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
@@ -80,21 +79,20 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
 
 def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
     """
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a
+    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
 
     --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
     --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
     --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
     """
 
     grid = {**default_HRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
 
-    # load query and reference libraries
     if query_data is None:
-        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
         sys.exit()
     else:
         extension = query_data.rsplit('.',1)
@@ -154,14 +152,123 @@ def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid=None, ou
         return df_out
 
 
+
+def tune_params_on_HRMS_data_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
+    """
+    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible
+    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
+    and prints top-performing parameters
+
+    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
+    should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
+    other columns should correspond to a single mass/charge ratio. Mandatory argument.
+    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
+    to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
+    compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
+    --grid: dict with all possible parameter values to try.
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
+    """
+
+    local_grid = {**default_HRMS_grid, **(grid or {})}
+    for key, value in local_grid.items():
+        globals()[key] = value
+
+    if query_data is None:
+        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
+        sys.exit()
+    else:
+        extension = query_data.rsplit('.', 1)[-1]
+        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
+            output_path_tmp = query_data[:-3] + 'csv'
+            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
+            df_query = pd.read_csv(output_path_tmp)
+        elif extension in ('csv','CSV'):
+            df_query = pd.read_csv(query_data)
+        else:
+            print(f'\nError: Unsupported query_data extension: {extension}')
+            sys.exit()
+        unique_query_ids = df_query.iloc[:, 0].unique()
+
+    if reference_data is None:
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
+        sys.exit()
+    else:
+        if isinstance(reference_data, str):
+            df_reference = get_reference_df(reference_data=reference_data)
+            unique_reference_ids = df_reference.iloc[:, 0].unique()
+        else:
+            dfs = []
+            unique_reference_ids = []
+            for f in reference_data:
+                tmp = get_reference_df(reference_data=f)
+                dfs.append(tmp)
+                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
+            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
+
+    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
+          f'{len(unique_reference_ids)} unique reference spectra, and '
+          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
+
+    if output_path is None:
+        output_path = f'{Path.cwd()}/tuning_param_output.txt'
+        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
+
+    param_grid = product(
+        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
+        noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
+        entropy_dimension, high_quality_reference_library
+    )
+
+    results = []
+    total = (
+        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
+        len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
+        len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
+        len(entropy_dimension) * len(high_quality_reference_library)
+    )
+    done = 0
+
+    for params in param_grid:
+        res = _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
+        results.append(res)
+        done += 1
+        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
+
+    df_out = pd.DataFrame(results, columns=[
+        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
+        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
+        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
+    ])
+
+    if 'WEIGHT' in df_out.columns:
+        df_out['WEIGHT'] = (
+            df_out['WEIGHT'].astype(str)
+            .str.replace("\"","",regex=False)
+            .str.replace("{","",regex=False)
+            .str.replace("}","",regex=False)
+            .str.replace(":","",regex=False)
+            .str.replace("Cosine","",regex=False)
+            .str.replace("Shannon","",regex=False)
+            .str.replace("Renyi","",regex=False)
+            .str.replace("Tsallis","",regex=False)
+            .str.replace(" ","",regex=False)
+        )
+
+    if return_output:
+        return df_out
+    else:
+        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
+        print(f'Wrote results to {output_path}')
+
+
 def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
     """
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a
+    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
 
     --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
     --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
     --grid: dict with all possible parameter values to try
-    --output_path: accuracy from each choice of parameter set is saved to a
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here
     """
 
     grid = {**default_NRMS_grid, **(grid or {})}
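Note: the hunk above adds tune_params_on_HRMS_data_shiny. As a quick orientation, below is a minimal, hypothetical usage sketch based only on the signature, grid keys, and output columns visible in this diff; the file paths and grid values are placeholders, not taken from the package's documentation.

from pycompound.spec_lib_matching import tune_params_on_HRMS_data_shiny

# Hypothetical input paths; per the docstring above, mgf/mzML/cdf/csv query data and csv reference data are accepted.
grid = {
    'similarity_measure': ['cosine', 'shannon'],   # keys mirror default_HRMS_grid defined in this file
    'window_size_matching': [0.25, 0.5],
    'noise_threshold': [0.0, 0.01],
}
df = tune_params_on_HRMS_data_shiny(
    query_data='query_spectra.csv',
    reference_data='reference_library.csv',
    grid=grid,
    return_output=True,   # return the results DataFrame instead of writing the TXT file
)
print(df.sort_values('ACC', ascending=False).head())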
@@ -228,57 +335,157 @@ def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid=None, ou
 
 
 
+def tune_params_on_NRMS_data_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
+    """
+    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible
+    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
+    and prints top-performing parameters
+
+    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
+    should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
+    other columns should correspond to a single mass/charge ratio. Mandatory argument.
+    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
+    to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
+    compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
+    --grid: dict with all possible parameter values to try.
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
+    """
+
+    local_grid = {**default_NRMS_grid, **(grid or {})}
+    for key, value in local_grid.items():
+        globals()[key] = value
+
+    if query_data is None:
+        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
+        sys.exit()
+    else:
+        extension = query_data.rsplit('.', 1)[-1]
+        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
+            output_path_tmp = query_data[:-3] + 'csv'
+            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
+            df_query = pd.read_csv(output_path_tmp)
+        elif extension in ('csv','CSV'):
+            df_query = pd.read_csv(query_data)
+        else:
+            print(f'\nError: Unsupported query_data extension: {extension}')
+            sys.exit()
+        unique_query_ids = df_query.iloc[:, 0].unique()
+
+    if reference_data is None:
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
+        sys.exit()
+    else:
+        if isinstance(reference_data, str):
+            df_reference = get_reference_df(reference_data=reference_data)
+            unique_reference_ids = df_reference.iloc[:, 0].unique()
+        else:
+            dfs = []
+            unique_reference_ids = []
+            for f in reference_data:
+                tmp = get_reference_df(reference_data=f)
+                dfs.append(tmp)
+                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
+            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
+
+    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
+          f'{len(unique_reference_ids)} unique reference spectra, and '
+          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
+
+    if output_path is None:
+        output_path = f'{Path.cwd()}/tuning_param_output.txt'
+        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
+
+    param_grid = product(
+        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
+        noise_threshold, wf_mz, wf_int, LET_threshold,
+        entropy_dimension, high_quality_reference_library
+    )
+
+    results = []
+    total = (
+        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
+        len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
+    )
+    done = 0
+    for params in param_grid:
+        res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
+        results.append(res)
+        done += 1
+        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
+
+    df_out = pd.DataFrame(results, columns=[
+        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
+        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
+    ])
+
+    if 'WEIGHT' in df_out.columns:
+        df_out['WEIGHT'] = (
+            df_out['WEIGHT'].astype(str)
+            .str.replace("\"","",regex=False)
+            .str.replace("{","",regex=False)
+            .str.replace("}","",regex=False)
+            .str.replace(":","",regex=False)
+            .str.replace("Cosine","",regex=False)
+            .str.replace("Shannon","",regex=False)
+            .str.replace("Renyi","",regex=False)
+            .str.replace("Tsallis","",regex=False)
+            .str.replace(" ","",regex=False)
+        )
+
+    if return_output:
+        return df_out
+    else:
+        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
+        print(f'Wrote results to {output_path}')
+
+
+
 
 def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
-    # returns accuracy for a given set of parameters
 
     n_top_matches_to_save = 1
 
-    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
     all_similarity_scores = []
     for query_idx in range(0,len(unique_query_ids)):
         print(f'query spectrum #{query_idx} is being identified')
         q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
         q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-        # compute the similarity score between the given query spectrum and all spectra in the reference library
         similarity_scores = []
         for ref_idx in range(0,len(unique_reference_ids)):
             q_spec = q_spec_tmp
             r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
             r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-            # apply spectrum preprocessing transformation in the order specified by user
             is_matched = False
             for transformation in spectrum_preprocessing_order:
                 if np.isinf(q_spec[:,1]).sum() > 0:
                     q_spec[:,1] = np.zeros(q_spec.shape[0])
                 if np.isinf(r_spec[:,1]).sum() > 0:
                     r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                     r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
-                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                     q_spec = m_spec[:,0:2]
                     r_spec = m_spec[:,[0,2]]
                     is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                     r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                     r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = remove_noise(q_spec, nr = noise_threshold)
                     if high_quality_reference_library == False:
                         r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
                     if high_quality_reference_library == False:
                         r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
 
-            # query and reference spectrum intensities
             q_ints = q_spec[:,1]
             r_ints = r_spec[:,1]
             if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
@@ -289,12 +496,10 @@ def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
             similarity_scores.append(similarity_score)
         all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -325,7 +530,6 @@ def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
 
 
 def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
-    # returns accuracy for a given set of parameters
 
     n_top_matches_to_save = 1
 
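Note: when return_output is left False, the tuning functions above write a tab-separated table (columns shown in the hunks, including ACC) to output_path, defaulting to tuning_param_output.txt in the current working directory. A small, hypothetical sketch for inspecting that file afterwards; the path is an assumption based on the default seen in this diff:

import pandas as pd

# Adjust the path if a custom output_path was passed to the tuning function.
results = pd.read_csv('tuning_param_output.txt', sep='\t')
best = results.sort_values('ACC', ascending=False).iloc[0]
print('best accuracy:', best['ACC'])
print(best.drop('ACC'))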
@@ -348,32 +552,29 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
             r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
             r_spec = convert_spec(r_spec_tmp,mzs)
 
-            # apply spectrum preprocessing transformation in the order specified by user
             for transformation in spectrum_preprocessing_order:
                 if np.isinf(q_spec[:,1]).sum() > 0:
                     q_spec[:,1] = np.zeros(q_spec.shape[0])
                 if np.isinf(r_spec[:,1]).sum() > 0:
                     r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'W':
+                if transformation == 'W':
                     q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                     r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
-                if transformation == 'L':
+                if transformation == 'L':
                     q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                     r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
-                if transformation == 'N':
+                if transformation == 'N':
                     q_spec = remove_noise(q_spec, nr = noise_threshold)
                     if high_quality_reference_library == False:
                         r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F':
+                if transformation == 'F':
                     q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                     if high_quality_reference_library == False:
                         r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
 
-            # query and reference spectrum intensities
             q_ints = q_spec[:,1]
             r_ints = r_spec[:,1]
 
-            # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
             if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                 similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
             else:
@@ -382,12 +583,10 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
             similarity_scores.append(similarity_score)
         all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -399,7 +598,6 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
         cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
         df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
 
-        #preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
         preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
         if len(top_ref_specs_tmp.values) == 0:
             scores_tmp.append(0)
@@ -441,11 +639,10 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
     --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
     --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
     --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output
-    --output_similarity_scores: Output
+    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
+    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
     '''
 
-    # load query and reference libraries
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
@@ -477,7 +674,6 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
 
-    ##### process input parameters and ensure they are in a valid format #####
     if spectrum_preprocessing_order is not None:
         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
     else:
@@ -545,7 +741,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
     else:
         q = entropy_dimension
 
-    normalization_method = 'standard'
+    normalization_method = 'standard'
 
     if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
         print('\nError: n_top_matches_to_save should be a positive integer')
@@ -564,15 +760,12 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
         print(f'Warning: writing similarity scores to {output_similarity_scores}')
 
 
-    ####################################### begin spectral library matching #######################################
-    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
     all_similarity_scores = []
     for query_idx in range(0,len(unique_query_ids)):
         print(f'query spectrum #{query_idx} is being identified')
         q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
         q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-        # compute the similarity score between the given query spectrum and all spectra in the reference library
         similarity_scores = []
         for ref_idx in range(0,len(unique_reference_ids)):
             #if ref_idx % 100 == 0:
@@ -581,37 +774,35 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
             r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
             r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-            # apply spectrum preprocessing transformation in the order specified by user
             is_matched = False
             for transformation in spectrum_preprocessing_order:
                 if np.isinf(q_spec[:,1]).sum() > 0:
                     q_spec[:,1] = np.zeros(q_spec.shape[0])
                 if np.isinf(r_spec[:,1]).sum() > 0:
                     r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                     r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
-                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                     q_spec = m_spec[:,0:2]
                     r_spec = m_spec[:,[0,2]]
                     is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
                     r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
                     r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = remove_noise(q_spec, nr = noise_threshold)
                     if high_quality_reference_library == False:
                         r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
                     if high_quality_reference_library == False:
                         r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
 
-            # query and reference spectrum intensities
             q_ints = q_spec[:,1]
             r_ints = r_spec[:,1]
 
@@ -623,12 +814,10 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
             similarity_scores.append(similarity_score)
         all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -652,29 +841,23 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
     scores = np.array(scores)
     out = np.c_[preds,scores]
 
-    # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
     cnames_preds = []
     cnames_scores = []
     for i in range(0,n_top_matches_to_save):
         cnames_preds.append(f'RANK.{i+1}.PRED')
         cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
 
-    # get pandas dataframe with identifcation results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
     df_top_ref_specs.index.names = ['Query Spectrum ID']
 
     df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
 
-    # print the identification results if the user desires
     if print_id_results == True:
         print(df_top_ref_specs.to_string())
 
     if return_ID_output is False:
-        # write spectral library matching results to disk
         df_top_ref_specs.to_csv(output_identification, sep='\t')
-
-        # write all similarity scores to disk
         df_scores.to_csv(output_similarity_scores, sep='\t')
     else:
         return df_top_ref_specs
@@ -706,11 +889,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     --normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since the objects entropy quantifies the uncertainy of must be probability distributions, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
     --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
     --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output
-    --output_similarity_scores: Output
+    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
+    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
     '''
 
-    # load query and reference libraries
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
@@ -742,7 +924,6 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
 
-    ##### process input parameters and ensure they are in a valid format #####
     if spectrum_preprocessing_order is not None:
         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
     else:
@@ -795,7 +976,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     else:
         q = entropy_dimension
 
-    normalization_method = 'standard'
+    normalization_method = 'standard'
 
     if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
         print('\nError: n_top_matches_to_save should be a positive integer')
@@ -815,14 +996,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
 
 
 
-    ####################################### begin spectral library matching #######################################
-    # get the range of m/z values
     min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
     max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
     mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
 
-    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
-    # for each query spectrum, compute its similarity with all reference spectra
     all_similarity_scores = []
     for query_idx in range(0,len(unique_query_ids)):
         q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
@@ -838,32 +1015,29 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
             r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
             r_spec = convert_spec(r_spec_tmp,mzs)
 
-            # apply spectrum preprocessing transformation in the order specified by user
             for transformation in spectrum_preprocessing_order:
                 if np.isinf(q_spec[:,1]).sum() > 0:
                     q_spec[:,1] = np.zeros(q_spec.shape[0])
                 if np.isinf(r_spec[:,1]).sum() > 0:
                     r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'W':
+                if transformation == 'W':
                     q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
                     r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-                if transformation == 'L':
+                if transformation == 'L':
                     q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
                     r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                if transformation == 'N':
+                if transformation == 'N':
                     q_spec = remove_noise(q_spec, nr = noise_threshold)
                     if high_quality_reference_library == False:
                         r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F':
+                if transformation == 'F':
                     q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                     if high_quality_reference_library == False:
                         r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
 
-            # query and reference spectrum intensities
             q_ints = q_spec[:,1]
             r_ints = r_spec[:,1]
 
-            # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
             if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                 similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
             else:
@@ -872,12 +1046,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
             similarity_scores.append(similarity_score)
         all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -889,7 +1061,6 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
         cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
         df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
 
-        #preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
         preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
         if len(top_ref_specs_tmp.values) == 0:
             scores_tmp.append(0)
@@ -902,29 +1073,23 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     scores = np.array(scores)
     out = np.c_[preds,scores]
 
-    # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
     cnames_preds = []
     cnames_scores = []
     for i in range(0,n_top_matches_to_save):
         cnames_preds.append(f'RANK.{i+1}.PRED')
         cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
 
-    # get pandas dataframe with identifcation results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
     df_top_ref_specs.index.names = ['Query Spectrum ID']
 
-    # print the identification results if the user desires
     if print_id_results == True:
         print(df_top_ref_specs.to_string())
 
     df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
 
     if return_ID_output is False:
-        # write spectral library matching results to disk
         df_top_ref_specs.to_csv(output_identification, sep='\t')
-
-        # write all similarity scores to disk
         df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
         df_scores.to_csv(output_similarity_scores, sep='\t')
     else: