pycompound 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycompound-0.0.1.dist-info/METADATA +26 -0
- pycompound-0.0.1.dist-info/RECORD +14 -0
- pycompound-0.0.1.dist-info/WHEEL +5 -0
- pycompound-0.0.1.dist-info/licenses/LICENSE +21 -0
- pycompound-0.0.1.dist-info/top_level.txt +1 -0
- pycompound_fy7392/app.py +301 -0
- pycompound_fy7392/build_library.py +135 -0
- pycompound_fy7392/plot_spectra.py +636 -0
- pycompound_fy7392/plot_spectra_CLI.py +51 -0
- pycompound_fy7392/processing.py +316 -0
- pycompound_fy7392/similarity_measures.py +100 -0
- pycompound_fy7392/spec_lib_matching.py +943 -0
- pycompound_fy7392/spec_lib_matching_CLI.py +50 -0
- pycompound_fy7392/tuning_CLI.py +68 -0
@@ -0,0 +1,943 @@

# this script's function runs spectral library matching to identify unknown query compound(s)

from pycompound_fy7392.build_library import build_library_from_raw_data
from .processing import *
from .similarity_measures import *
import numpy as np
import pandas as pd
from pathlib import Path
import sys


def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid={'similarity_measure':['cosine'], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}, output_path=None):
    '''
    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a CSV file, and prints top-performing parameters

    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier (i.e. the CAS registry number or the compound name), and each of the remaining columns should correspond to a single mass/charge ratio. Mandatory argument.
    --grid: dict with all possible parameter values to try.
    --output_path: accuracy from each choice of parameter set is saved to a CSV file here.
    '''

    for key, value in grid.items():
        globals()[key] = value

    # load query and reference libraries
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
            output_path_tmp = query_data[:-3] + 'csv'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp)
        if extension == 'csv' or extension == 'CSV':
            df_query = pd.read_csv(query_data)
        unique_query_ids = df_query.iloc[:,0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data,str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:,0].unique()
        else:
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:,0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.csv'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    accs = []
    similarity_measures = []
    spectrum_preprocessing_orders = []
    mz_mins = []
    mz_maxs = []
    int_mins = []
    int_maxs = []
    noise_thresholds = []
    window_size_centroidings = []
    window_size_matchings = []
    wf_mzs = []
    wf_ints = []
    LET_thresholds = []
    entropy_dimensions = []
    high_quality_reference_libraries = []
    for similarity_measure_tmp in similarity_measure:
        for spectrum_preprocessing_order_tmp in spectrum_preprocessing_order:
            for mz_min_tmp in mz_min:
                for mz_max_tmp in mz_max:
                    for int_min_tmp in int_min:
                        for int_max_tmp in int_max:
                            for noise_threshold_tmp in noise_threshold:
                                for window_size_centroiding_tmp in window_size_centroiding:
                                    for window_size_matching_tmp in window_size_matching:
                                        for wf_mz_tmp in wf_mz:
                                            for wf_int_tmp in wf_int:
                                                for LET_threshold_tmp in LET_threshold:
                                                    for entropy_dimension_tmp in entropy_dimension:
                                                        for high_quality_reference_library_tmp in high_quality_reference_library:
                                                            acc = get_acc_HRMS(df_query=df_query, df_reference=df_reference, unique_query_ids=unique_query_ids, unique_reference_ids=unique_reference_ids, similarity_measure=similarity_measure_tmp, spectrum_preprocessing_order=spectrum_preprocessing_order_tmp, mz_min=mz_min_tmp, mz_max=mz_max_tmp, int_min=int_min_tmp, int_max=int_max_tmp, window_size_centroiding=window_size_centroiding_tmp, window_size_matching=window_size_matching_tmp, noise_threshold=noise_threshold_tmp, wf_mz=wf_mz_tmp, wf_int=wf_int_tmp, LET_threshold=LET_threshold_tmp, entropy_dimension=entropy_dimension_tmp, high_quality_reference_library=high_quality_reference_library_tmp)
                                                            accs.append(acc)
                                                            similarity_measures.append(similarity_measure_tmp)
                                                            spectrum_preprocessing_orders.append(spectrum_preprocessing_order_tmp)
                                                            mz_mins.append(mz_min_tmp)
                                                            mz_maxs.append(mz_max_tmp)
                                                            int_mins.append(int_min_tmp)
                                                            int_maxs.append(int_max_tmp)
                                                            noise_thresholds.append(noise_threshold_tmp)
                                                            window_size_centroidings.append(window_size_centroiding_tmp)
                                                            window_size_matchings.append(window_size_matching_tmp)
                                                            wf_mzs.append(wf_mz_tmp)
                                                            wf_ints.append(wf_int_tmp)
                                                            LET_thresholds.append(LET_threshold_tmp)
                                                            entropy_dimensions.append(entropy_dimension_tmp)
                                                            high_quality_reference_libraries.append(high_quality_reference_library_tmp)
    df_out = pd.DataFrame({'ACC':accs, 'SIMILARITY.MEASURE':similarity_measures, 'SPECTRUM.PROCESSING.ORDER':spectrum_preprocessing_orders, 'MZ.MIN':mz_mins, 'MZ.MAX':mz_maxs, 'INT.MIN':int_mins, 'INT.MAX':int_maxs, 'NOISE.THRESHOLD':noise_thresholds, 'WINDOW.SIZE.CENTROIDING':window_size_centroidings, 'WINDOW.SIZE.MATCHING':window_size_matchings, 'WF.MZ':wf_mzs, 'WF.INT':wf_ints, 'LET.THRESHOLD':LET_thresholds, 'ENTROPY.DIMENSION':entropy_dimensions, 'HIGH.QUALITY.REFERENCE.LIBRARY':high_quality_reference_libraries})
    df_out.to_csv(output_path, index=False)
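
# Illustrative usage only (not part of the published module): a minimal sketch of how the grid
# search above might be invoked, assuming 'query_HRMS.csv' and 'reference_HRMS.csv' are
# hypothetical CSV libraries in the layout described in the docstring. Every grid key is listed,
# since the function reads each key back as a module-level name.
#
# tune_params_on_HRMS_data(
#     query_data='query_HRMS.csv',
#     reference_data='reference_HRMS.csv',
#     grid={'similarity_measure': ['cosine', 'shannon'],
#           'spectrum_preprocessing_order': ['FCNMWL'],
#           'mz_min': [0], 'mz_max': [9999999],
#           'int_min': [0], 'int_max': [99999999],
#           'window_size_centroiding': [0.25, 0.5],
#           'window_size_matching': [0.25, 0.5],
#           'noise_threshold': [0.0], 'wf_mz': [0.0], 'wf_int': [1.0],
#           'LET_threshold': [0.0], 'entropy_dimension': [1.1],
#           'high_quality_reference_library': [False]},
#     output_path='tuning_param_output.csv')
#
# This would evaluate 2*2*2 = 8 parameter combinations and write one accuracy row per combination.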


def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid={'similarity_measure':['cosine'], 'spectrum_preprocessing_order':['FNLW'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}, output_path=None):
    '''
    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a CSV file, and prints top-performing parameters

    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier (i.e. the CAS registry number or the compound name), and each of the remaining columns should correspond to a single mass/charge ratio. Mandatory argument.
    --grid: dict with all possible parameter values to try
    --output_path: accuracy from each choice of parameter set is saved to a CSV file here
    '''

    for key, value in grid.items():
        globals()[key] = value

    # load query and reference libraries
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
            output_path_tmp = query_data[:-3] + 'csv'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp)
        if extension == 'csv' or extension == 'CSV':
            df_query = pd.read_csv(query_data)
        unique_query_ids = df_query.iloc[:,0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data,str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:,0].unique()
        else:
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:,0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.csv'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    accs = []
    similarity_measures = []
    spectrum_preprocessing_orders = []
    mz_mins = []
    mz_maxs = []
    int_mins = []
    int_maxs = []
    noise_thresholds = []
    wf_mzs = []
    wf_ints = []
    LET_thresholds = []
    entropy_dimensions = []
    high_quality_reference_libraries = []
    for similarity_measure_tmp in similarity_measure:
        for spectrum_preprocessing_order_tmp in spectrum_preprocessing_order:
            for mz_min_tmp in mz_min:
                for mz_max_tmp in mz_max:
                    for int_min_tmp in int_min:
                        for int_max_tmp in int_max:
                            for noise_threshold_tmp in noise_threshold:
                                for wf_mz_tmp in wf_mz:
                                    for wf_int_tmp in wf_int:
                                        for LET_threshold_tmp in LET_threshold:
                                            for entropy_dimension_tmp in entropy_dimension:
                                                for high_quality_reference_library_tmp in high_quality_reference_library:
                                                    acc = get_acc_NRMS(df_query=df_query, df_reference=df_reference, unique_query_ids=unique_query_ids, unique_reference_ids=unique_reference_ids, similarity_measure=similarity_measure_tmp, spectrum_preprocessing_order=spectrum_preprocessing_order_tmp, mz_min=mz_min_tmp, mz_max=mz_max_tmp, int_min=int_min_tmp, int_max=int_max_tmp, noise_threshold=noise_threshold_tmp, wf_mz=wf_mz_tmp, wf_int=wf_int_tmp, LET_threshold=LET_threshold_tmp, entropy_dimension=entropy_dimension_tmp, high_quality_reference_library=high_quality_reference_library_tmp)
                                                    accs.append(acc)
                                                    similarity_measures.append(similarity_measure_tmp)
                                                    spectrum_preprocessing_orders.append(spectrum_preprocessing_order_tmp)
                                                    mz_mins.append(mz_min_tmp)
                                                    mz_maxs.append(mz_max_tmp)
                                                    int_mins.append(int_min_tmp)
                                                    int_maxs.append(int_max_tmp)
                                                    noise_thresholds.append(noise_threshold_tmp)
                                                    wf_mzs.append(wf_mz_tmp)
                                                    wf_ints.append(wf_int_tmp)
                                                    LET_thresholds.append(LET_threshold_tmp)
                                                    entropy_dimensions.append(entropy_dimension_tmp)
                                                    high_quality_reference_libraries.append(high_quality_reference_library_tmp)
    df_out = pd.DataFrame({'ACC':accs, 'SIMILARITY.MEASURE':similarity_measures, 'SPECTRUM.PROCESSING.ORDER':spectrum_preprocessing_orders, 'MZ.MIN':mz_mins, 'MZ.MAX':mz_maxs, 'INT.MIN':int_mins, 'INT.MAX':int_maxs, 'NOISE.THRESHOLD':noise_thresholds, 'WF.MZ':wf_mzs, 'WF.INT':wf_ints, 'LET.THRESHOLD':LET_thresholds, 'ENTROPY.DIMENSION':entropy_dimensions, 'HIGH.QUALITY.REFERENCE.LIBRARY':high_quality_reference_libraries})
    df_out.to_csv(output_path, index=False)
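
# Illustrative follow-up only (not part of the published module): once either tuning function has
# written its CSV, the top-performing parameter set could be recovered with pandas along these
# lines (assumes the default output filename used above).
#
# df_tuning = pd.read_csv('tuning_param_output.csv')
# best_row = df_tuning.sort_values('ACC', ascending=False).iloc[0]
# print(best_row['ACC'], best_row['SIMILARITY.MEASURE'], best_row['SPECTRUM.PROCESSING.ORDER'])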


def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
    # returns accuracy for a given set of parameters

    n_top_matches_to_save = 1

    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
    all_similarity_scores = []
    for query_idx in range(0,len(unique_query_ids)):
        print(f'query spectrum #{query_idx} is being identified')
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))

        # compute the similarity score between the given query spectrum and all spectra in the reference library
        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            q_spec = q_spec_tmp
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))

            # apply spectrum preprocessing transformation in the order specified by user
            is_matched = False
            for transformation in spectrum_preprocessing_order:
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # centroiding
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # matching
                    m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                    q_spec = m_spec[:,0:2]
                    r_spec = m_spec[:,[0,2]]
                    is_matched = True
                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # weight factor transformation
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # low-entropy transformation
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # noise removal
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # filter with respect to mz and/or intensity
                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)

            # query and reference spectrum intensities
            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                if similarity_measure == 'cosine':
                    similarity_score = S_cos(q_ints, r_ints)
                else:
                    q_ints = normalize(q_ints, method='standard')
                    r_ints = normalize(r_ints, method='standard')

                    if similarity_measure == 'shannon':
                        similarity_score = S_shannon(q_ints, r_ints)
                    elif similarity_measure == 'renyi':
                        similarity_score = S_renyi(q_ints, r_ints, entropy_dimension)
                    elif similarity_measure == 'tsallis':
                        similarity_score = S_tsallis(q_ints, r_ints, entropy_dimension)
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['Query Spectrum ID']

    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[unique_query_ids,preds,scores]
    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
    return acc
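
# Toy illustration only (not part of the published module): with n_top_matches_to_save = 1, the
# value returned above is top-1 accuracy, i.e. the fraction of query spectra whose best-scoring
# reference ID equals the query's own ID. A self-contained sketch of the same bookkeeping:
#
# scores = pd.DataFrame([[0.9, 0.4], [0.2, 0.1]], index=['A', 'B'], columns=['A', 'B'])
# predicted = scores.idxmax(axis=1)                    # query A -> 'A' (correct), query B -> 'A' (incorrect)
# acc = (predicted.index == predicted.values).mean()   # 0.5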


def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
    # returns accuracy for a given set of parameters

    n_top_matches_to_save = 1

    min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
    max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
    mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

    all_similarity_scores = []
    for query_idx in range(0,len(unique_query_ids)):
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        q_spec_tmp = convert_spec(q_spec_tmp,mzs)

        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            q_spec = q_spec_tmp
            if ref_idx % 1000 == 0:
                print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)

            # apply spectrum preprocessing transformation in the order specified by user
            for transformation in spectrum_preprocessing_order:
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
                if transformation == 'W': # weight factor transformation
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
                if transformation == 'L': # low-entropy transformation
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
                if transformation == 'N': # noise removal
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                if transformation == 'F': # filter with respect to mz and/or intensity
                    q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)

            # query and reference spectrum intensities
            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]

            # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                if similarity_measure == 'cosine':
                    similarity_score = S_cos(q_ints, r_ints)
                else:
                    # normalize intensities of each spectrum so they sum to 1 so that they represent a probability distribution
                    q_ints = normalize(q_ints, method = 'standard')
                    r_ints = normalize(r_ints, method = 'standard')

                    if similarity_measure == 'shannon':
                        similarity_score = S_shannon(q_ints, r_ints)
                    elif similarity_measure == 'renyi':
                        similarity_score = S_renyi(q_ints, r_ints, entropy_dimension)
                    elif similarity_measure == 'tsallis':
                        similarity_score = S_tsallis(q_ints, r_ints, entropy_dimension)
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['Query Spectrum ID']

    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            #preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[unique_query_ids,preds,scores]
    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
    return acc
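
# Note on the m/z grid used above (illustration only, not part of the published module):
# np.linspace(min_mz, max_mz, max_mz - min_mz + 1) produces unit-spaced integer bins, so both
# query and reference spectra are projected onto the same nominal-mass axis before any similarity
# is computed. For instance, with min_mz = 50 and max_mz = 500:
#
# mzs = np.linspace(50, 500, 451)   # array([ 50.,  51., ..., 500.]), step size 1.0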


def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None):
    '''
    runs spectral library matching on high-resolution mass spectrometry (HRMS) data

    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
    --reference_data: either a string or a list of strings with path(s) to mgf, mzML, sdf, and/or csv file(s) of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier (i.e. the CAS registry number or the compound name), and each of the remaining columns should correspond to a single mass/charge ratio. Mandatory argument.
    --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
    --similarity_measure: \'cosine\', \'shannon\', \'renyi\', and \'tsallis\'. Default: cosine.
    --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy transformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of HRMS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL
    --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False
    --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
    --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
    --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
    --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
    --window_size_centroiding: Window size parameter used in centroiding a given spectrum. Default: 0.5
    --window_size_matching: Window size parameter used in matching a query spectrum and a reference library spectrum. Default: 0.5
    --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
    --wf_mz: Mass/charge weight factor parameter. Default: 0.0
    --wf_intensity: Intensity weight factor parameter. Default: 1.0
    --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
    --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
    --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
    --print_id_results: Flag that prints identification results if True. Default: False
    --output_identification: Output CSV file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.csv\'.
    --output_similarity_scores: Output CSV file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining columns contain the similarity scores with respect to all reference library spectra. If no argument passed, then this CSV file is written to the current working directory with filename \'output_all_similarity_scores.csv\'.
    '''

    # load query and reference libraries
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
            output_path_tmp = query_data[:-3] + 'csv'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp)
        if extension == 'csv' or extension == 'CSV':
            df_query = pd.read_csv(query_data)
        unique_query_ids = df_query.iloc[:,0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data,str):
            df_reference = get_reference_df(reference_data,likely_reference_ids)
            unique_reference_ids = df_reference.iloc[:,0].unique()
        else:
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(f,likely_reference_ids)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:,0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)


    ##### process input parameters and ensure they are in a valid format #####
    if spectrum_preprocessing_order is not None:
        spectrum_preprocessing_order = list(spectrum_preprocessing_order)
    else:
        spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
    if 'M' not in spectrum_preprocessing_order:
        print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
        sys.exit()
    if 'C' in spectrum_preprocessing_order:
        if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
            print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
            sys.exit()
    if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
        print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
        sys.exit()
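
    # Examples of how the checks above behave (illustration only, not part of the published module):
    # 'FCNMWL' and 'CMWL' pass (centroiding precedes matching); 'WLM' also passes since 'C' is
    # optional; 'MC' is rejected because 'C' appears after 'M'; and 'FNLW' is rejected because
    # matching ('M') is mandatory for HRMS data.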

    if similarity_measure not in ['cosine','shannon','renyi','tsallis']:
        print('\nError: similarity_measure must be either \'cosine\', \'shannon\', \'renyi\', or \'tsallis\'')
        sys.exit()

    if isinstance(int_min,int) is True:
        int_min = float(int_min)
    if isinstance(int_max,int) is True:
        int_max = float(int_max)
    if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
        print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
        sys.exit()
    if mz_min < 0:
        print('\nError: mz_min should be a non-negative integer')
        sys.exit()
    if mz_max <= 0:
        print('\nError: mz_max should be a positive integer')
        sys.exit()
    if int_min < 0:
        print('\nError: int_min should be a non-negative float')
        sys.exit()
    if int_max <= 0:
        print('\nError: int_max should be a positive float')
        sys.exit()

    if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
        print('Error: window_size_centroiding must be a positive float.')
        sys.exit()
    if isinstance(window_size_matching,float) is False or window_size_matching <= 0.0:
        print('Error: window_size_matching must be a positive float.')
        sys.exit()

    if isinstance(noise_threshold,int) is True:
        noise_threshold = float(noise_threshold)
    if isinstance(noise_threshold,float) is False or noise_threshold < 0:
        print('Error: noise_threshold must be a positive float.')
        sys.exit()

    if isinstance(wf_intensity,int) is True:
        wf_intensity = float(wf_intensity)
    if isinstance(wf_mz,int) is True:
        wf_mz = float(wf_mz)
    if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
        print('Error: wf_mz and wf_intensity must be integers or floats')
        sys.exit()

    if entropy_dimension <= 0:
        print('\nError: entropy_dimension should be a positive float')
        sys.exit()
    else:
        q = entropy_dimension

    normalization_method = 'standard' # consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings

    if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
        print('\nError: n_top_matches_to_save should be a positive integer')
        sys.exit()

    if isinstance(print_id_results,bool)==False:
        print('\nError: print_id_results must be either True or False')
        sys.exit()

    if output_identification is None:
        output_identification = f'{Path.cwd()}/output_identification.csv'
        print(f'Warning: writing identification output to {output_identification}')

    if output_similarity_scores is None:
        output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.csv'
        print(f'Warning: writing similarity scores to {output_similarity_scores}')


    ####################################### begin spectral library matching #######################################
    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
    all_similarity_scores = []
    for query_idx in range(0,len(unique_query_ids)):
        print(f'query spectrum #{query_idx} is being identified')
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))

        # compute the similarity score between the given query spectrum and all spectra in the reference library
        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            #if ref_idx % 100 == 0:
            #    print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
            q_spec = q_spec_tmp
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))

            # apply spectrum preprocessing transformation in the order specified by user
            is_matched = False
            for transformation in spectrum_preprocessing_order:
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # centroiding
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # matching
                    m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                    q_spec = m_spec[:,0:2]
                    r_spec = m_spec[:,[0,2]]
                    is_matched = True
                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # weight factor transformation
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # low-entropy transformation
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # noise removal
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # filter with respect to mz and/or intensity
                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)

            # query and reference spectrum intensities
            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]

            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                if similarity_measure == 'cosine':
                    similarity_score = S_cos(q_ints, r_ints)
                else:
                    q_ints = normalize(q_ints, method = normalization_method)
                    r_ints = normalize(r_ints, method = normalization_method)

                    if similarity_measure == 'shannon':
                        similarity_score = S_shannon(q_ints, r_ints)
                    elif similarity_measure == 'renyi':
                        similarity_score = S_renyi(q_ints, r_ints, q)
                    elif similarity_measure == 'tsallis':
                        similarity_score = S_tsallis(q_ints, r_ints, q)
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['Query Spectrum ID']

    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[preds,scores]

    # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
    cnames_preds = []
    cnames_scores = []
    for i in range(0,n_top_matches_to_save):
        cnames_preds.append(f'RANK.{i+1}.PRED')
        cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')

    # get pandas dataframe with identification results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
    df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
    df_top_ref_specs.index = unique_query_ids
    df_top_ref_specs.index.names = ['Query Spectrum ID']

    # print the identification results if the user desires
    if print_id_results == True:
        print(df_top_ref_specs.to_string())

    # write spectral library matching results to disk
    df_top_ref_specs.to_csv(output_identification)

    # write all similarity scores to disk
    df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
    df_scores.to_csv(output_similarity_scores)
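
# Illustrative usage only (not part of the published module): a minimal sketch of a call to the
# HRMS matching routine above, assuming hypothetical files 'query.mzML' and 'reference_library.csv'.
#
# run_spec_lib_matching_on_HRMS_data(
#     query_data='query.mzML',
#     reference_data='reference_library.csv',
#     similarity_measure='shannon',
#     spectrum_preprocessing_order='FCNMWL',
#     n_top_matches_to_save=3,
#     print_id_results=True,
#     output_identification='output_identification.csv',
#     output_similarity_scores='output_all_similarity_scores.csv')
#
# With n_top_matches_to_save=3 the identification CSV gains columns RANK.1.PRED ... RANK.3.PRED and
# RANK.1.SIMILARITY.SCORE ... RANK.3.SIMILARITY.SCORE, one row per query spectrum.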
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None):
|
|
691
|
+
'''
|
|
692
|
+
runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data
|
|
693
|
+
|
|
694
|
+
--query_data: cdf or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
|
|
695
|
+
--reference_data: cdf of csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
|
|
696
|
+
--likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
|
|
697
|
+
--similarity_measure: \'cosine\', \'shannon\', \'renyi\', and \'tsallis\'. Default: cosine.
|
|
698
|
+
--spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-4 characters chosen from F, N, L, W representing filtering based on mass/charge and intensity values, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WN\' is passed, then each spectrum will undergo a weight factor transformation and then noise removal. Default: FNLW')
|
|
699
|
+
--high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
|
|
700
|
+
--mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
|
|
701
|
+
--mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
|
|
702
|
+
--int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
|
|
703
|
+
--int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
|
|
704
|
+
--noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
|
|
705
|
+
--wf_mz: Mass/charge weight factor parameter. Default: 0.0
|
|
706
|
+
--wf_intensity: Intensity weight factor parameter. Default: 0.0
|
|
707
|
+
--LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
|
|
708
|
+
--entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
|
|
709
|
+
--normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since the objects entropy quantifies the uncertainy of must be probability distributions, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
|
|
710
|
+
--n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
|
|
711
|
+
--print_id_results: Flag that prints identification results if True. Default: False
|
|
712
|
+
--output_identification: Output CSV file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.csv\'.
|
|
713
|
+
--output_similarity_scores: Output CSV file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this CSV file is written to the current working directory with filename \'output_all_similarity_scores\'.csv.')
|
|
714
|
+
'''
|
|
715
|
+
|
|
716
|
+
# load query and reference libraries
|
|
717
|
+
if query_data is None:
|
|
718
|
+
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
|
|
719
|
+
sys.exit()
|
|
720
|
+
else:
|
|
721
|
+
extension = query_data.rsplit('.',1)
|
|
722
|
+
extension = extension[(len(extension)-1)]
|
|
723
|
+
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
|
|
724
|
+
output_path_tmp = query_data[:-3] + 'csv'
|
|
725
|
+
build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
|
|
726
|
+
df_query = pd.read_csv(output_path_tmp)
|
|
727
|
+
if extension == 'csv' or extension == 'CSV':
|
|
728
|
+
df_query = pd.read_csv(query_data)
|
|
729
|
+
unique_query_ids = df_query.iloc[:,0].unique()
|
|
730
|
+
|
|
731
|
+
if reference_data is None:
|
|
732
|
+
print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
|
|
733
|
+
sys.exit()
|
|
734
|
+
else:
|
|
735
|
+
if isinstance(reference_data,str):
|
|
736
|
+
df_reference = get_reference_df(reference_data,likely_reference_ids)
|
|
737
|
+
unique_reference_ids = df_reference.iloc[:,0].unique()
|
|
738
|
+
else:
|
|
739
|
+
dfs = []
|
|
740
|
+
unique_reference_ids = []
|
|
741
|
+
for f in reference_data:
|
|
742
|
+
tmp = get_reference_df(f,likely_reference_ids)
|
|
743
|
+
dfs.append(tmp)
|
|
744
|
+
unique_reference_ids.extend(tmp.iloc[:,0].unique())
|
|
745
|
+
df_reference = pd.concat(dfs, axis=0, ignore_index=True)
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
##### process input parameters and ensure they are in a valid format #####
|
|
749
|
+
if spectrum_preprocessing_order is not None:
|
|
750
|
+
spectrum_preprocessing_order = list(spectrum_preprocessing_order)
|
|
751
|
+
else:
|
|
752
|
+
spectrum_preprocessing_order = ['F','N','W','L']
|
|
753
|
+
if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
|
|
754
|
+
print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
|
|
755
|
+
sys.exit()
|
|
756
|
+
|
|
757
|
+
if similarity_measure not in ['cosine','shannon','renyi','tsallis']:
|
|
758
|
+
print('\nError: similarity_measure must be either \'cosine\', \'shannon\', \'renyi\', or \'tsallis\'')
|
|
759
|
+
sys.exit()
|
|
760
|
+
|
|
761
|
+
if isinstance(int_min,int) is True:
|
|
762
|
+
int_min = float(int_min)
|
|
763
|
+
if isinstance(int_max,int) is True:
|
|
764
|
+
int_max = float(int_max)
|
|
765
|
+
if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
|
|
766
|
+
print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
|
|
767
|
+
sys.exit()
|
|
768
|
+
if mz_min < 0:
|
|
769
|
+
print('\nError: mz_min should be a non-negative integer')
|
|
770
|
+
sys.exit()
|
|
771
|
+
if mz_max <= 0:
|
|
772
|
+
print('\nError: mz_max should be a positive integer')
|
|
773
|
+
sys.exit()
|
|
774
|
+
if int_min < 0:
|
|
775
|
+
print('\nError: int_min should be a non-negative float')
|
|
776
|
+
sys.exit()
|
|
777
|
+
if int_max <= 0:
|
|
778
|
+
print('\nError: int_max should be a positive float')
|
|
779
|
+
sys.exit()
|
|
780
|
+
|
|
781
|
+
if isinstance(noise_threshold,int) is True:
|
|
782
|
+
noise_threshold = float(noise_threshold)
|
|
783
|
+
if isinstance(noise_threshold,float) is False or noise_threshold < 0:
|
|
784
|
+
print('Error: noise_threshold must be a positive float.')
|
|
785
|
+
sys.exit()
|
|
786
|
+
|
|
787
|
+
if isinstance(wf_intensity,int) is True:
|
|
788
|
+
wf_intensity = float(wf_intensity)
|
|
789
|
+
if isinstance(wf_mz,int) is True:
|
|
790
|
+
wf_mz = float(wf_mz)
|
|
791
|
+
if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
|
|
792
|
+
print('Error: wf_mz and wf_intensity must be integers or floats')
|
|
793
|
+
sys.exit()
|
|
794
|
+
|
|
795
|
+
if entropy_dimension <= 0:
|
|
796
|
+
print('\nError: entropy_dimension should be a positive float')
|
|
797
|
+
sys.exit()
|
|
798
|
+
else:
|
|
799
|
+
q = entropy_dimension
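# q is the entropy order passed to the Renyi and Tsallis similarity measures below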
|
|
800
|
+
|
|
801
|
+
normalization_method = 'standard' # consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
|
|
802
|
+
|
|
803
|
+
if isinstance(n_top_matches_to_save,int) is False or n_top_matches_to_save <= 0: # check the type first so the comparison cannot raise a TypeError
|
|
804
|
+
print('\nError: n_top_matches_to_save should be a positive integer')
|
|
805
|
+
sys.exit()
|
|
806
|
+
|
|
807
|
+
if isinstance(print_id_results,bool)==False:
|
|
808
|
+
print('\nError: print_id_results must be either True or False')
|
|
809
|
+
sys.exit()
|
|
810
|
+
|
|
811
|
+
if output_identification is None:
|
|
812
|
+
output_identification = f'{Path.cwd()}/output_identification.csv'
|
|
813
|
+
print(f'Warning: no output_identification path was given; writing identification output to {output_identification}')
|
|
814
|
+
|
|
815
|
+
if output_similarity_scores is None:
|
|
816
|
+
output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.csv'
|
|
817
|
+
print(f'Warning: no output_similarity_scores path was given; writing similarity scores to {output_similarity_scores}')
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
####################################### begin spectral library matching #######################################
|
|
822
|
+
# get the range of m/z values
|
|
823
|
+
min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
|
|
824
|
+
max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
|
|
825
|
+
mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
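# mzs is a unit-spaced m/z grid spanning both libraries; convert_spec is used below to place every query and reference spectrum onto this common grid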
|
|
826
|
+
|
|
827
|
+
# compute the similarity score between each query spectrum and every reference library spectrum
|
|
828
|
+
|
|
829
|
+
all_similarity_scores = []
|
|
830
|
+
for query_idx in range(0,len(unique_query_ids)):
|
|
831
|
+
q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
|
|
832
|
+
q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
833
|
+
q_spec_tmp = convert_spec(q_spec_tmp,mzs)
|
|
834
|
+
|
|
835
|
+
similarity_scores = []
|
|
836
|
+
for ref_idx in range(0,len(unique_reference_ids)):
|
|
837
|
+
q_spec = q_spec_tmp.copy() # work on a copy so the preprocessing below does not accumulate on the stored query spectrum across reference iterations
|
|
838
|
+
if ref_idx % 1000 == 0:
|
|
839
|
+
print(f'Query spectrum #{query_idx}: similarity computed against {ref_idx} of {len(unique_reference_ids)} reference library spectra')
|
|
840
|
+
r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
|
|
841
|
+
r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
842
|
+
r_spec = convert_spec(r_spec_tmp,mzs)
|
|
843
|
+
|
|
844
|
+
# apply the spectrum preprocessing transformations in the order specified by the user
|
|
845
|
+
for transformation in spectrum_preprocessing_order:
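# spectra containing non-finite (infinite) intensities are zeroed out before applying each transformation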
|
|
846
|
+
if np.isinf(q_spec[:,1]).sum() > 0:
|
|
847
|
+
q_spec[:,1] = np.zeros(q_spec.shape[0])
|
|
848
|
+
if np.isinf(r_spec[:,1]).sum() > 0:
|
|
849
|
+
r_spec[:,1] = np.zeros(r_spec.shape[0])
|
|
850
|
+
if transformation == 'W': # weight factor transformation
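# wf_transform applies the m/z and intensity weight factors to each peak (presumably of the usual form (m/z)**wf_mz * intensity**wf_intensity; see processing.py for the exact definition)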
|
|
851
|
+
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
852
|
+
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
853
|
+
if transformation == 'L': # low-entropy transformation
|
|
854
|
+
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
855
|
+
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
856
|
+
if transformation == 'N': # noise removal
|
|
857
|
+
q_spec = remove_noise(q_spec, nr = noise_threshold)
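# the reference spectrum is only denoised here (and only filtered in the 'F' step) when the reference library is not flagged as high quality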
|
|
858
|
+
if high_quality_reference_library == False:
|
|
859
|
+
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
860
|
+
if transformation == 'F': # filter with respect to mz and/or intensity
|
|
861
|
+
q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
862
|
+
if high_quality_reference_library == False:
|
|
863
|
+
r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
864
|
+
|
|
865
|
+
# query and reference spectrum intensities
|
|
866
|
+
q_ints = q_spec[:,1]
|
|
867
|
+
r_ints = r_spec[:,1]
|
|
868
|
+
|
|
869
|
+
# if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
|
|
870
|
+
if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
|
|
871
|
+
if similarity_measure == 'cosine':
|
|
872
|
+
similarity_score = S_cos(q_ints, r_ints)
|
|
873
|
+
else:
|
|
874
|
+
# normalize the intensities of each spectrum to sum to 1 so that they represent a probability distribution
|
|
875
|
+
q_ints = normalize(q_ints, method = normalization_method)
|
|
876
|
+
r_ints = normalize(r_ints, method = normalization_method)
|
|
877
|
+
|
|
878
|
+
if similarity_measure == 'shannon':
|
|
879
|
+
similarity_score = S_shannon(q_ints, r_ints)
|
|
880
|
+
elif similarity_measure == 'renyi':
|
|
881
|
+
similarity_score = S_renyi(q_ints, r_ints, q)
|
|
882
|
+
elif similarity_measure == 'tsallis':
|
|
883
|
+
similarity_score = S_tsallis(q_ints, r_ints, q)
|
|
884
|
+
else:
|
|
885
|
+
similarity_score = 0
|
|
886
|
+
|
|
887
|
+
similarity_scores.append(similarity_score)
|
|
888
|
+
all_similarity_scores.append(similarity_scores)
|
|
889
|
+
|
|
890
|
+
# create a pandas dataframe containing all computed similarity scores, with one row for each query spectrum and one column for each reference spectrum
|
|
891
|
+
df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
|
|
892
|
+
df_scores.index = unique_query_ids
|
|
893
|
+
df_scores.index.names = ['Query Spectrum ID']
|
|
894
|
+
|
|
895
|
+
# get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
|
|
896
|
+
preds = []
|
|
897
|
+
scores = []
|
|
898
|
+
for i in range(0, df_scores.shape[0]):
|
|
899
|
+
df_scores_tmp = df_scores
|
|
900
|
+
preds_tmp = []
|
|
901
|
+
scores_tmp = []
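# repeatedly take the highest-scoring remaining reference spectrum (spectra tied for the top score are joined with ';'), record it, and drop those columns so the next iteration yields the next-best match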
|
|
902
|
+
for j in range(0, n_top_matches_to_save):
|
|
903
|
+
top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
|
|
904
|
+
cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
|
|
905
|
+
df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
|
|
906
|
+
|
|
907
|
+
#preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
|
|
908
|
+
preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
|
|
909
|
+
if len(top_ref_specs_tmp.values) == 0:
|
|
910
|
+
scores_tmp.append(0)
|
|
911
|
+
else:
|
|
912
|
+
scores_tmp.append(top_ref_specs_tmp.values[0])
|
|
913
|
+
preds.append(preds_tmp)
|
|
914
|
+
scores.append(scores_tmp)
|
|
915
|
+
|
|
916
|
+
preds = np.array(preds)
|
|
917
|
+
scores = np.array(scores)
|
|
918
|
+
out = np.c_[preds,scores]
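# out has one row per query spectrum: the first n_top_matches_to_save entries are the predicted identities and the remaining entries are the corresponding similarity scores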
|
|
919
|
+
|
|
920
|
+
# get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
|
|
921
|
+
cnames_preds = []
|
|
922
|
+
cnames_scores = []
|
|
923
|
+
for i in range(0,n_top_matches_to_save):
|
|
924
|
+
cnames_preds.append(f'RANK.{i+1}.PRED')
|
|
925
|
+
cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
|
|
926
|
+
|
|
927
|
+
# build a pandas dataframe with the identification results: one row per query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns with the corresponding similarity scores
|
|
928
|
+
df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
|
|
929
|
+
df_top_ref_specs.index = unique_query_ids
|
|
930
|
+
df_top_ref_specs.index.names = ['Query Spectrum ID']
|
|
931
|
+
|
|
932
|
+
# print the identification results if the user desires
|
|
933
|
+
if print_id_results == True:
|
|
934
|
+
print(df_top_ref_specs.to_string())
|
|
935
|
+
|
|
936
|
+
# write spectral library matching results to disk
|
|
937
|
+
df_top_ref_specs.to_csv(output_identification)
|
|
938
|
+
|
|
939
|
+
# write all similarity scores to disk
|
|
940
|
+
df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
|
|
941
|
+
df_scores.to_csv(output_similarity_scores)
|
|
942
|
+
|
|
943
|
+
|