pycompound 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,943 @@
1
+
2
+ # this script's function runs spectral library matching to identify unknown query compound(s)
3
+
4
+ from pycompound_fy7392.build_library import build_library_from_raw_data
5
+ from .processing import *
6
+ from .similarity_measures import *
7
+ import pandas as pd
8
+ from pathlib import Path
9
+ import sys
10
+
11
+
12
# Parameter grid searched when the caller supplies none. It also provides the value
# list for every key that a caller-supplied grid leaves out.
_HRMS_TUNING_DEFAULT_GRID = {'similarity_measure':['cosine'], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}

# (grid key, output CSV column) pairs. The order fixes both the traversal of the grid
# (the first key varies slowest, the last key fastest) and the CSV column order.
_HRMS_TUNING_COLUMNS = (
    ('similarity_measure', 'SIMILARITY.MEASURE'),
    ('spectrum_preprocessing_order', 'SPECTRUM.PROCESSING.ORDER'),
    ('mz_min', 'MZ.MIN'),
    ('mz_max', 'MZ.MAX'),
    ('int_min', 'INT.MIN'),
    ('int_max', 'INT.MAX'),
    ('noise_threshold', 'NOISE.THRESHOLD'),
    ('window_size_centroiding', 'WINDOW.SIZE.CENTROIDING'),
    ('window_size_matching', 'WINDOW.SIZE.MATCHING'),
    ('wf_mz', 'WF.MZ'),
    ('wf_int', 'WF.INT'),
    ('LET_threshold', 'LET.THRESHOLD'),
    ('entropy_dimension', 'ENTROPY.DIMENSION'),
    ('high_quality_reference_library', 'HIGH.QUALITY.REFERENCE.LIBRARY'),
)


def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid=_HRMS_TUNING_DEFAULT_GRID, output_path=None):
    '''
    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with every combination of the parameter values in the grid dict and saves the accuracy obtained with each combination to a CSV file

    --query_data: path (string) to an mgf, mzML, cdf, or csv file of the query spectra. Non-csv files are first converted with build_library_from_raw_data to a csv file written next to the input (same name, 'csv' extension). The left-most column of the csv is used as the spectrum identifier. Mandatory argument.
    --reference_data: path (string) or iterable of paths to the reference library file(s); each one is loaded with get_reference_df. The left-most column is used as the spectrum identifier. Mandatory argument.
    --grid: dict mapping a parameter name to the list of values to try. Any key that is missing falls back to its default single-value list; unrecognized keys are reported and ignored. The grid is only read, never modified.
    --output_path: the accuracy of each parameter combination is saved to a CSV file here. Default: 'tuning_param_output.csv' in the current working directory.

    Exits via sys.exit() after printing an error if query_data or reference_data is missing or if query_data has an unsupported extension.
    '''
    from itertools import product

    # Resolve the grid locally instead of injecting it into module globals: a partial
    # grid then uses defaults rather than raising NameError or reusing stale values
    # left behind by an earlier call.
    if grid is None:
        grid = {}
    unknown_keys = [key for key in grid if key not in _HRMS_TUNING_DEFAULT_GRID]
    if unknown_keys:
        print(f'Warning: ignoring unrecognized grid key(s): {unknown_keys}')
    search_grid = {key: grid.get(key, default) for key, default in _HRMS_TUNING_DEFAULT_GRID.items()}

    # load query library
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
        sys.exit()
    extension = query_data.rsplit('.',1)[-1]
    if extension in ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF'):
        # strip the full extension (not a fixed 3 characters) so 'x.mzML' becomes 'x.csv' rather than 'x.mcsv'
        output_path_tmp = query_data[:-len(extension)] + 'csv'
        build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
        df_query = pd.read_csv(output_path_tmp)
    elif extension in ('csv', 'CSV'):
        df_query = pd.read_csv(query_data)
    else:
        print(f'\nError: query_data must be an mgf, mzML, cdf, or csv file, but the following was passed: {query_data}')
        sys.exit()
    unique_query_ids = df_query.iloc[:,0].unique()

    # load reference library (a single path or several paths that are concatenated)
    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
        sys.exit()
    if isinstance(reference_data,str):
        df_reference = get_reference_df(reference_data=reference_data)
        unique_reference_ids = df_reference.iloc[:,0].unique()
    else:
        dfs = []
        unique_reference_ids = []
        for f in reference_data:
            tmp = get_reference_df(reference_data=f)
            dfs.append(tmp)
            unique_reference_ids.extend(tmp.iloc[:,0].unique())
        df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.csv'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # one list per output column, filled in lock-step (one entry per parameter combination)
    results = {'ACC': []}
    for _, column in _HRMS_TUNING_COLUMNS:
        results[column] = []

    # product() walks the grid in the same order as nested for-loops over the keys
    keys = [key for key, _ in _HRMS_TUNING_COLUMNS]
    for combination in product(*(search_grid[key] for key in keys)):
        params = dict(zip(keys, combination))
        acc = get_acc_HRMS(df_query=df_query, df_reference=df_reference, unique_query_ids=unique_query_ids, unique_reference_ids=unique_reference_ids, **params)
        results['ACC'].append(acc)
        for key, column in _HRMS_TUNING_COLUMNS:
            results[column].append(params[key])

    df_out = pd.DataFrame(results)
    df_out.to_csv(output_path, index=False)
109
+
110
+
111
+
112
+
113
# Parameter grid searched when the caller supplies none. It also provides the value
# list for every key that a caller-supplied grid leaves out.
_NRMS_TUNING_DEFAULT_GRID = {'similarity_measure':['cosine'], 'spectrum_preprocessing_order':['FNLW'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}

# (grid key, output CSV column) pairs. The order fixes both the traversal of the grid
# (the first key varies slowest, the last key fastest) and the CSV column order.
_NRMS_TUNING_COLUMNS = (
    ('similarity_measure', 'SIMILARITY.MEASURE'),
    ('spectrum_preprocessing_order', 'SPECTRUM.PROCESSING.ORDER'),
    ('mz_min', 'MZ.MIN'),
    ('mz_max', 'MZ.MAX'),
    ('int_min', 'INT.MIN'),
    ('int_max', 'INT.MAX'),
    ('noise_threshold', 'NOISE.THRESHOLD'),
    ('wf_mz', 'WF.MZ'),
    ('wf_int', 'WF.INT'),
    ('LET_threshold', 'LET.THRESHOLD'),
    ('entropy_dimension', 'ENTROPY.DIMENSION'),
    ('high_quality_reference_library', 'HIGH.QUALITY.REFERENCE.LIBRARY'),
)


def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid=_NRMS_TUNING_DEFAULT_GRID, output_path=None):
    '''
    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with every combination of the parameter values in the grid dict and saves the accuracy obtained with each combination to a CSV file

    --query_data: path (string) to an mgf, mzML, cdf, or csv file of the query spectra. Non-csv files are first converted with build_library_from_raw_data to a csv file written next to the input (same name, 'csv' extension). The left-most column of the csv is used as the spectrum identifier. Mandatory argument.
    --reference_data: path (string) or iterable of paths to the reference library file(s); each one is loaded with get_reference_df. The left-most column is used as the spectrum identifier. Mandatory argument.
    --grid: dict mapping a parameter name to the list of values to try. Any key that is missing falls back to its default single-value list; unrecognized keys are reported and ignored. The grid is only read, never modified.
    --output_path: the accuracy of each parameter combination is saved to a CSV file here. Default: 'tuning_param_output.csv' in the current working directory.

    Exits via sys.exit() after printing an error if query_data or reference_data is missing or if query_data has an unsupported extension.
    '''
    from itertools import product

    # Resolve the grid locally instead of injecting it into module globals: a partial
    # grid then uses defaults rather than raising NameError or reusing stale values
    # left behind by an earlier call.
    if grid is None:
        grid = {}
    unknown_keys = [key for key in grid if key not in _NRMS_TUNING_DEFAULT_GRID]
    if unknown_keys:
        print(f'Warning: ignoring unrecognized grid key(s): {unknown_keys}')
    search_grid = {key: grid.get(key, default) for key, default in _NRMS_TUNING_DEFAULT_GRID.items()}

    # load query library
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
        sys.exit()
    extension = query_data.rsplit('.',1)[-1]
    if extension in ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF'):
        # strip the full extension (not a fixed 3 characters) so 'x.mzML' becomes 'x.csv' rather than 'x.mcsv'
        output_path_tmp = query_data[:-len(extension)] + 'csv'
        build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
        df_query = pd.read_csv(output_path_tmp)
    elif extension in ('csv', 'CSV'):
        df_query = pd.read_csv(query_data)
    else:
        print(f'\nError: query_data must be an mgf, mzML, cdf, or csv file, but the following was passed: {query_data}')
        sys.exit()
    unique_query_ids = df_query.iloc[:,0].unique()

    # load reference library (a single path or several paths that are concatenated)
    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
        sys.exit()
    if isinstance(reference_data,str):
        df_reference = get_reference_df(reference_data=reference_data)
        unique_reference_ids = df_reference.iloc[:,0].unique()
    else:
        dfs = []
        unique_reference_ids = []
        for f in reference_data:
            tmp = get_reference_df(reference_data=f)
            dfs.append(tmp)
            unique_reference_ids.extend(tmp.iloc[:,0].unique())
        df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.csv'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # one list per output column, filled in lock-step (one entry per parameter combination)
    results = {'ACC': []}
    for _, column in _NRMS_TUNING_COLUMNS:
        results[column] = []

    # product() walks the grid in the same order as nested for-loops over the keys
    keys = [key for key, _ in _NRMS_TUNING_COLUMNS]
    for combination in product(*(search_grid[key] for key in keys)):
        params = dict(zip(keys, combination))
        acc = get_acc_NRMS(df_query=df_query, df_reference=df_reference, unique_query_ids=unique_query_ids, unique_reference_ids=unique_reference_ids, **params)
        results['ACC'].append(acc)
        for key, column in _NRMS_TUNING_COLUMNS:
            results[column].append(params[key])

    df_out = pd.DataFrame(results)
    df_out.to_csv(output_path, index=False)
204
+
205
+
206
+
207
+
208
def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
    '''
    returns the identification accuracy obtained with one set of HRMS parameters

    Every query spectrum is scored against every reference spectrum. The predicted ID of a query is the ID of its top-scoring reference (tied top references are joined with ';'), and the accuracy is the fraction of queries whose ID, compared as a string, equals that prediction.

    --df_query / df_reference: dataframes whose first column is the spectrum ID; the second and third columns are read as the spectrum (presumably mass/charge and intensity -- confirm against the library builder).
    --unique_query_ids / unique_reference_ids: the IDs to score, in output order.
    --similarity_measure: 'cosine', 'shannon', 'renyi', or 'tsallis'. Any other value raises ValueError as soon as a non-degenerate pair of spectra has to be scored.
    --spectrum_preprocessing_order: iterable of transformation codes applied in order: C (centroiding), M (matching), W (weight factor), L (low-entropy), N (noise removal), F (filtering). Unrecognized codes are skipped.
    --high_quality_reference_library: if not False, noise removal and filtering are applied to the query spectrum only.
    --remaining arguments: passed through to the corresponding preprocessing / similarity helpers.
    '''

    n_top_matches_to_save = 1

    # compute the similarity score between each query spectrum and all reference library spectra
    all_similarity_scores = []
    for query_idx, query_id in enumerate(unique_query_ids):
        print(f'query spectrum #{query_idx} is being identified')
        q_idxs_tmp = np.where(df_query.iloc[:,0] == query_id)[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))

        similarity_scores = []
        for reference_id in unique_reference_ids:
            # Work on a copy: several steps below write into q_spec in place, and without
            # the copy those writes would carry over into the comparison with the next reference.
            q_spec = q_spec_tmp.copy()
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == reference_id)[0]
            r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))

            # apply spectrum preprocessing transformations in the order specified by user
            is_matched = False
            for transformation in spectrum_preprocessing_order:
                # a spectrum containing any infinite intensity has all of its intensities zeroed
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])

                # NOTE(review): r_spec.shape[1] is the number of columns, not the number of peaks;
                # shape[0] may have been intended. Left unchanged because switching it would
                # alter scores for single-peak reference spectra -- confirm before changing.
                if not (q_spec.shape[0] > 1 and r_spec.shape[1] > 1):
                    continue

                if transformation == 'C': # centroiding
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
                elif transformation == 'M': # matching
                    m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                    q_spec = m_spec[:,0:2]
                    r_spec = m_spec[:,[0,2]]
                    is_matched = True
                elif transformation == 'W': # weight factor transformation
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
                elif transformation == 'L': # low-entropy transformation
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
                elif transformation == 'N': # noise removal
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                elif transformation == 'F': # filter with respect to mz and/or intensity
                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)

            # query and reference spectrum intensities
            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]

            # degenerate spectra (all-zero intensities or a single-row query) get a similarity of 0
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                if similarity_measure == 'cosine':
                    similarity_score = S_cos(q_ints, r_ints)
                else:
                    q_ints = normalize(q_ints, method='standard')
                    r_ints = normalize(r_ints, method='standard')
                    if similarity_measure == 'shannon':
                        similarity_score = S_shannon(q_ints, r_ints)
                    elif similarity_measure == 'renyi':
                        similarity_score = S_renyi(q_ints, r_ints, entropy_dimension)
                    elif similarity_measure == 'tsallis':
                        similarity_score = S_tsallis(q_ints, r_ints, entropy_dimension)
                    else:
                        # previously fell through with similarity_score unbound or stale
                        raise ValueError(f'similarity_measure must be either \'cosine\', \'shannon\', \'renyi\', or \'tsallis\', but got: {similarity_measure!r}')
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    # dataframe of all similarity scores with one row for each query spectrum and one column for each reference spectrum
    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['Query Spectrum ID']

    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for _ in range(0, n_top_matches_to_save):
            row = df_scores_tmp.iloc[i,:]
            row_max = np.max(row)
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(row == row_max)[0]]
            # drop the references just reported so that a further pass would yield the next-best ones
            df_scores_tmp = df_scores_tmp.iloc[:,np.where(row != row_max)[0]]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    # np.c_ stacks IDs, predictions and scores into one array, so the IDs are compared as strings
    out = np.c_[unique_query_ids,preds,scores]
    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
    return acc
310
+
311
+
312
+
313
+
314
def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
    '''
    returns the identification accuracy obtained with one set of NRMS parameters

    Every query spectrum is scored against every reference spectrum. The predicted ID of a query is the ID of its top-scoring reference (tied top references are joined with ';'), and the accuracy is the fraction of queries whose ID, compared as a string, equals that prediction.

    --df_query / df_reference: dataframes whose first column is the spectrum ID; the second and third columns are read as the spectrum (presumably mass/charge and intensity -- confirm against the library builder).
    --unique_query_ids / unique_reference_ids: the IDs to score, in output order.
    --similarity_measure: 'cosine', 'shannon', 'renyi', or 'tsallis'. Any other value raises ValueError as soon as a non-degenerate pair of spectra has to be scored.
    --spectrum_preprocessing_order: iterable of transformation codes applied in order: W (weight factor), L (low-entropy), N (noise removal), F (filtering). Unrecognized codes are skipped.
    --high_quality_reference_library: if not False, noise removal and filtering are applied to the query spectrum only.
    --remaining arguments: passed through to the corresponding preprocessing / similarity helpers.
    '''

    n_top_matches_to_save = 1

    # common integer grid spanning the values of the second column of both libraries;
    # every spectrum is passed through convert_spec together with this grid
    min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
    max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
    mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

    # compute the similarity score between each query spectrum and all reference library spectra
    all_similarity_scores = []
    for query_idx, query_id in enumerate(unique_query_ids):
        q_idxs_tmp = np.where(df_query.iloc[:,0] == query_id)[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        q_spec_tmp = convert_spec(q_spec_tmp,mzs)

        similarity_scores = []
        for ref_idx, reference_id in enumerate(unique_reference_ids):
            # Work on a copy: several steps below write into q_spec in place, and without
            # the copy those writes would carry over into the comparison with the next reference.
            q_spec = np.array(q_spec_tmp)
            if ref_idx % 1000 == 0:
                print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == reference_id)[0]
            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)

            # apply spectrum preprocessing transformations in the order specified by user
            for transformation in spectrum_preprocessing_order:
                # a spectrum containing any infinite intensity has all of its intensities zeroed
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])

                if transformation == 'W': # weight factor transformation
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
                elif transformation == 'L': # low-entropy transformation
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
                elif transformation == 'N': # noise removal
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                elif transformation == 'F': # filter with respect to mz and/or intensity
                    q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)

            # query and reference spectrum intensities
            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]

            # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                if similarity_measure == 'cosine':
                    similarity_score = S_cos(q_ints, r_ints)
                else:
                    # normalize intensities of each spectrum before the entropy-based measures
                    q_ints = normalize(q_ints, method = 'standard')
                    r_ints = normalize(r_ints, method = 'standard')
                    if similarity_measure == 'shannon':
                        similarity_score = S_shannon(q_ints, r_ints)
                    elif similarity_measure == 'renyi':
                        similarity_score = S_renyi(q_ints, r_ints, entropy_dimension)
                    elif similarity_measure == 'tsallis':
                        similarity_score = S_tsallis(q_ints, r_ints, entropy_dimension)
                    else:
                        # previously fell through with similarity_score unbound or stale
                        raise ValueError(f'similarity_measure must be either \'cosine\', \'shannon\', \'renyi\', or \'tsallis\', but got: {similarity_measure!r}')
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    # dataframe of all similarity scores with one row for each query spectrum and one column for each reference spectrum
    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['Query Spectrum ID']

    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for _ in range(0, n_top_matches_to_save):
            row = df_scores_tmp.iloc[i,:]
            row_max = np.max(row)
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(row == row_max)[0]]
            # drop the references just reported so that a further pass would yield the next-best ones
            df_scores_tmp = df_scores_tmp.iloc[:,np.where(row != row_max)[0]]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    # np.c_ stacks IDs, predictions and scores into one array, so the IDs are compared as strings
    out = np.c_[unique_query_ids,preds,scores]
    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
    return acc
416
+
417
+
418
+
419
+ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None):
420
+ '''
421
+ runs spectral library matching on high-resolution mass spectrometry (HRMS) data
422
+
423
+ --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
424
+ --reference_data: either string or list of strings with pass to mgf, mzML, sdf, and/or csv file(s) of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
425
+ --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
426
+ --similarity_measure: \'cosine\', \'shannon\', \'renyi\', and \'tsallis\'. Default: cosine.
427
+ --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of HRMS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL')
428
+ --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
429
+ --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
430
+ --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
431
+ --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
432
+ --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
433
+ --window_size_centroiding: Window size parameter used in centroiding a given spectrum. Default: 0.5
434
+ --window_size_matching: Window size parameter used in matching a query spectrum and a reference library spectrum. Default: 0.5
435
+ --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
436
+ --wf_mz: Mass/charge weight factor parameter. Default: 0.0
437
+ --wf_intensity: Intensity weight factor parameter. Default: 0.0
438
+ --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
439
+ --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
440
+ --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
441
+ --print_id_results: Flag that prints identification results if True. Default: False
442
+ --output_identification: Output CSV file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.csv\'.
443
+ --output_similarity_scores: Output CSV file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this CSV file is written to the current working directory with filename \'output_all_similarity_scores\'.csv.')
444
+ '''
445
+
446
+ # load query and reference libraries
447
+ if query_data is None:
448
+ print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
449
+ sys.exit()
450
+ else:
451
+ extension = query_data.rsplit('.',1)
452
+ extension = extension[(len(extension)-1)]
453
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
454
+ output_path_tmp = query_data[:-3] + 'csv'
455
+ build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
456
+ df_query = pd.read_csv(output_path_tmp)
457
+ if extension == 'csv' or extension == 'CSV':
458
+ df_query = pd.read_csv(query_data)
459
+ unique_query_ids = df_query.iloc[:,0].unique()
460
+
461
+ if reference_data is None:
462
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
463
+ sys.exit()
464
+ else:
465
+ if isinstance(reference_data,str):
466
+ df_reference = get_reference_df(reference_data,likely_reference_ids)
467
+ unique_reference_ids = df_reference.iloc[:,0].unique()
468
+ else:
469
+ dfs = []
470
+ unique_reference_ids = []
471
+ for f in reference_data:
472
+ tmp = get_reference_df(f,likely_reference_ids)
473
+ dfs.append(tmp)
474
+ unique_reference_ids.extend(tmp.iloc[:,0].unique())
475
+ df_reference = pd.concat(dfs, axis=0, ignore_index=True)
476
+
477
+
478
+ ##### process input parameters and ensure they are in a valid format #####
479
+ if spectrum_preprocessing_order is not None:
480
+ spectrum_preprocessing_order = list(spectrum_preprocessing_order)
481
+ else:
482
+ spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
483
+ if 'M' not in spectrum_preprocessing_order:
484
+ print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
485
+ sys.exit()
486
+ if 'C' in spectrum_preprocessing_order:
487
+ if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
488
+ print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
489
+ sys.exit()
490
+ if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
491
+ print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
492
+ sys.exit()
493
+
494
+ if similarity_measure not in ['cosine','shannon','renyi','tsallis']:
495
+ print('\nError: similarity_measure must be either \'cosine\', \'shannon\', \'renyi\', or \'tsallis\'')
496
+ sys.exit()
497
+
498
+ if isinstance(int_min,int) is True:
499
+ int_min = float(int_min)
500
+ if isinstance(int_max,int) is True:
501
+ int_max = float(int_max)
502
+ if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
503
+ print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
504
+ sys.exit()
505
+ if mz_min < 0:
506
+ print('\nError: mz_min should be a non-negative integer')
507
+ sys.exit()
508
+ if mz_max <= 0:
509
+ print('\nError: mz_max should be a positive integer')
510
+ sys.exit()
511
+ if int_min < 0:
512
+ print('\nError: int_min should be a non-negative float')
513
+ sys.exit()
514
+ if int_max <= 0:
515
+ print('\nError: int_max should be a positive float')
516
+ sys.exit()
517
+
518
+ if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
519
+ print('Error: window_size_centroiding must be a positive float.')
520
+ sys.exit()
521
+ if isinstance(window_size_matching,float) is False or window_size_matching<= 0.0:
522
+ print('Error: window_size_matching must be a positive float.')
523
+ sys.exit()
524
+
525
+ if isinstance(noise_threshold,int) is True:
526
+ noise_threshold = float(noise_threshold)
527
+ if isinstance(noise_threshold,float) is False or noise_threshold < 0:
528
+ print('Error: noise_threshold must be a positive float.')
529
+ sys.exit()
530
+
531
+ if isinstance(wf_intensity,int) is True:
532
+ wf_intensity = float(wf_intensity)
533
+ if isinstance(wf_mz,int) is True:
534
+ wf_mz = float(wf_mz)
535
+ if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
536
+ print('Error: wf_mz and wf_intensity must be integers or floats')
537
+ sys.exit()
538
+
539
+ if entropy_dimension <= 0:
540
+ print('\nError: entropy_dimension should be a positive float')
541
+ sys.exit()
542
+ else:
543
+ q = entropy_dimension
544
+
545
+ normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
546
+
547
+ if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
548
+ print('\nError: n_top_matches_to_save should be a positive integer')
549
+ sys.exit()
550
+
551
+ if isinstance(print_id_results,bool)==False:
552
+ print('\nError: print_id_results must be either True or False')
553
+ sys.exit()
554
+
555
+ if output_identification is None:
556
+ output_identification = f'{Path.cwd()}/output_identification.csv'
557
+ print(f'Warning: writing identification output to {output_identification}')
558
+
559
+ if output_similarity_scores is None:
560
+ output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.csv'
561
+ print(f'Warning: writing similarity scores to {output_similarity_scores}')
562
+
563
+
564
+ ####################################### begin spectral library matching #######################################
565
+ # compute the similarity score between each query library spectrum/spectra and all reference library spectra
566
+ all_similarity_scores = []
567
+ for query_idx in range(0,len(unique_query_ids)):
568
+ print(f'query spectrum #{query_idx} is being identified')
569
+ q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
570
+ q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
571
+
572
+ # compute the similarity score between the given query spectrum and all spectra in the reference library
573
+ similarity_scores = []
574
+ for ref_idx in range(0,len(unique_reference_ids)):
575
+ #if ref_idx % 100 == 0:
576
+ # print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
577
+ q_spec = q_spec_tmp
578
+ r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
579
+ r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
580
+
581
+ # apply spectrum preprocessing transformation in the order specified by user
582
+ is_matched = False
583
+ for transformation in spectrum_preprocessing_order:
584
+ if np.isinf(q_spec[:,1]).sum() > 0:
585
+ q_spec[:,1] = np.zeros(q_spec.shape[0])
586
+ if np.isinf(r_spec[:,1]).sum() > 0:
587
+ r_spec[:,1] = np.zeros(r_spec.shape[0])
588
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # centroiding
589
+ q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
590
+ r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
591
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # matching
592
+ m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
593
+ q_spec = m_spec[:,0:2]
594
+ r_spec = m_spec[:,[0,2]]
595
+ is_matched = True
596
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # weight factor transformation
597
+ q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
598
+ r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
599
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # low-entropy tranformation
600
+ q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
601
+ r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
602
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # noise removal
603
+ q_spec = remove_noise(q_spec, nr = noise_threshold)
604
+ if high_quality_reference_library == False:
605
+ r_spec = remove_noise(r_spec, nr = noise_threshold)
606
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # filter with respect to mz and/or intensity
607
+ q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
608
+ if high_quality_reference_library == False:
609
+ r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
610
+
611
+ # query and reference spectrum intensities
612
+ q_ints = q_spec[:,1]
613
+ r_ints = r_spec[:,1]
614
+
615
+ if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
616
+ if similarity_measure == 'cosine':
617
+ similarity_score = S_cos(q_ints, r_ints)
618
+ else:
619
+ q_ints = normalize(q_ints, method = normalization_method)
620
+ r_ints = normalize(r_ints, method = normalization_method)
621
+
622
+ if similarity_measure == 'shannon':
623
+ similarity_score = S_shannon(q_ints, r_ints)
624
+ elif similarity_measure == 'renyi':
625
+ similarity_score = S_renyi(q_ints, r_ints, q)
626
+ elif similarity_measure == 'tsallis':
627
+ similarity_score = S_tsallis(q_ints, r_ints, q)
628
+ else:
629
+ similarity_score = 0
630
+
631
+ similarity_scores.append(similarity_score)
632
+ all_similarity_scores.append(similarity_scores)
633
+
634
+ # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
635
+ df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
636
+ df_scores.index = unique_query_ids
637
+ df_scores.index.names = ['Query Spectrum ID']
638
+
639
+ # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
640
+ preds = []
641
+ scores = []
642
+ for i in range(0, df_scores.shape[0]):
643
+ df_scores_tmp = df_scores
644
+ preds_tmp = []
645
+ scores_tmp = []
646
+ for j in range(0, n_top_matches_to_save):
647
+ top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
648
+ cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
649
+ df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
650
+
651
+ preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
652
+ if len(top_ref_specs_tmp.values) == 0:
653
+ scores_tmp.append(0)
654
+ else:
655
+ scores_tmp.append(top_ref_specs_tmp.values[0])
656
+ preds.append(preds_tmp)
657
+ scores.append(scores_tmp)
658
+
659
+ preds = np.array(preds)
660
+ scores = np.array(scores)
661
+ out = np.c_[preds,scores]
662
+
663
+ # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
664
+ cnames_preds = []
665
+ cnames_scores = []
666
+ for i in range(0,n_top_matches_to_save):
667
+ cnames_preds.append(f'RANK.{i+1}.PRED')
668
+ cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
669
+
670
+ # get pandas dataframe with identifcation results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
671
+ df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
672
+ df_top_ref_specs.index = unique_query_ids
673
+ df_top_ref_specs.index.names = ['Query Spectrum ID']
674
+
675
+ # print the identification results if the user desires
676
+ if print_id_results == True:
677
+ print(df_top_ref_specs.to_string())
678
+
679
+ # write spectral library matching results to disk
680
+ df_top_ref_specs.to_csv(output_identification)
681
+
682
+ # write all similarity scores to disk
683
+ df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
684
+ df_scores.to_csv(output_similarity_scores)
685
+
686
+
687
+
688
+
689
+
690
def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None):
    '''
    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data

    Every query spectrum is scored against every reference spectrum, the full score matrix and the top-ranked matches are written to CSV files, and nothing is returned. Invalid arguments are reported with a printed message followed by sys.exit().

    --query_data: path (string) to an mgf, mzML, cdf, or csv file of query mass spectrum/spectra to be identified; the extension is matched case-insensitively. Non-csv files are first converted with build_library_from_raw_data to a csv file with the same stem as the input, and that csv file is then read. The csv file is read in long format: rows are grouped into spectra by the first column (identifier), the second column is used as the mass/charge value, and the third column as the intensity. Mandatory argument.
    --reference_data: path (string) to the reference library, or an iterable of such paths whose spectra are concatenated into a single library. Each path is loaded with get_reference_df; the resulting table is read with the same identifier / mass/charge / intensity column layout as the query data. Mandatory argument.
    --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Passed through to get_reference_df. Default: None (i.e. default is to use entire reference library)
    --similarity_measure: 'cosine', 'shannon', 'renyi', or 'tsallis'. Default: cosine.
    --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string of characters chosen from F, N, L, W representing filtering based on mass/charge and intensity values, noise removal, low-entropy transformation, and weight-factor-transformation, respectively. For example, if 'WN' is passed, then each spectrum will undergo a weight factor transformation and then noise removal. If None is passed, the order F, N, W, L is used. Default: FNLW
    --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False
    --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Must be a non-negative integer. Default: 0
    --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Must be a positive integer. Default: 9999999
    --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Must be non-negative; integers are converted to floats. Default: 0
    --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Must be positive; integers are converted to floats. Default: 9999999
    --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Must be non-negative. Default: 0.0
    --wf_mz: Mass/charge weight factor parameter. Default: 0.0
    --wf_intensity: Intensity weight factor parameter. Default: 1.0
    --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
    --entropy_dimension: Entropy dimension parameter. Must be positive; it is passed only to the renyi and tsallis similarity measures and is ignored for cosine and shannon. Default: 1.1
    --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Reference spectra with tied scores share one rank and are joined with a semicolon. Default: 1
    --print_id_results: Flag that prints identification results if True. Default: False
    --output_identification: Output CSV file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename 'output_identification.csv'.
    --output_similarity_scores: Output CSV file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining columns contain the similarity scores with respect to all reference library spectra. If no argument passed, then this CSV file is written to the current working directory with filename 'output_all_similarity_scores.csv'.
    '''

    # load query and reference libraries
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
        sys.exit()

    # split on the last dot so that extensions of any length (e.g. 'mzML') yield the correct stem
    stem, dot, extension = query_data.rpartition('.')
    extension = extension.lower() if dot else ''
    if extension in ('mgf', 'mzml', 'cdf'):
        output_path_tmp = f'{stem}.csv'
        build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
        df_query = pd.read_csv(output_path_tmp)
    elif extension == 'csv':
        df_query = pd.read_csv(query_data)
    else:
        # previously this case fell through and crashed later on the undefined df_query
        print('\nError: query_data must be a path to a file with extension mgf, mzML, cdf, or csv.')
        sys.exit()
    unique_query_ids = df_query.iloc[:,0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
        sys.exit()
    elif isinstance(reference_data,str):
        df_reference = get_reference_df(reference_data,likely_reference_ids)
        unique_reference_ids = df_reference.iloc[:,0].unique()
    else:
        # several reference libraries: load each one and stack them into a single table
        dfs = []
        unique_reference_ids = []
        for f in reference_data:
            tmp = get_reference_df(f,likely_reference_ids)
            dfs.append(tmp)
            unique_reference_ids.extend(tmp.iloc[:,0].unique())
        df_reference = pd.concat(dfs, axis=0, ignore_index=True)


    ##### process input parameters and ensure they are in a valid format #####
    if spectrum_preprocessing_order is not None:
        spectrum_preprocessing_order = list(spectrum_preprocessing_order)
    else:
        spectrum_preprocessing_order = ['F','N','W','L']
    if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
        print('Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
        sys.exit()

    if similarity_measure not in ['cosine','shannon','renyi','tsallis']:
        print('\nError: similarity_measure must be either \'cosine\', \'shannon\', \'renyi\', or \'tsallis\'')
        sys.exit()

    if isinstance(int_min,int):
        int_min = float(int_min)
    if isinstance(int_max,int):
        int_max = float(int_max)
    if not isinstance(mz_min,int) or not isinstance(mz_max,int) or not isinstance(int_min,float) or not isinstance(int_max,float):
        print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
        sys.exit()
    if mz_min < 0:
        print('\nError: mz_min should be a non-negative integer')
        sys.exit()
    if mz_max <= 0:
        print('\nError: mz_max should be a positive integer')
        sys.exit()
    if int_min < 0:
        print('\nError: int_min should be a non-negative float')
        sys.exit()
    if int_max <= 0:
        print('\nError: int_max should be a positive float')
        sys.exit()

    if isinstance(noise_threshold,int):
        noise_threshold = float(noise_threshold)
    if not isinstance(noise_threshold,float) or noise_threshold < 0:
        print('Error: noise_threshold must be a positive float.')
        sys.exit()

    if isinstance(wf_intensity,int):
        wf_intensity = float(wf_intensity)
    if isinstance(wf_mz,int):
        wf_mz = float(wf_mz)
    if not isinstance(wf_intensity,float) or not isinstance(wf_mz,float):
        print('Error: wf_mz and wf_intensity must be integers or floats')
        sys.exit()

    if entropy_dimension <= 0:
        print('\nError: entropy_dimension should be a positive float')
        sys.exit()
    q = entropy_dimension

    normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings

    # check the type first so that a non-numeric value reports the intended message instead of raising TypeError on the comparison
    if not isinstance(n_top_matches_to_save,int) or n_top_matches_to_save <= 0:
        print('\nError: n_top_matches_to_save should be a positive integer')
        sys.exit()

    if not isinstance(print_id_results,bool):
        print('\nError: print_id_results must be either True or False')
        sys.exit()

    if output_identification is None:
        output_identification = f'{Path.cwd()}/output_identification.csv'
        print(f'Warning: writing identification output to {output_identification}')

    if output_similarity_scores is None:
        output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.csv'
        print(f'Warning: writing similarity scores to {output_similarity_scores}')

    # filtering and noise removal are skipped for the reference spectra when the library is flagged as high quality
    preprocess_reference = (high_quality_reference_library == False)


    ####################################### begin spectral library matching #######################################
    # get the range of m/z values; every spectrum is converted onto this common integer m/z grid
    min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
    max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
    mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
    # for each query spectrum, compute its similarity with all reference spectra
    all_similarity_scores = []
    for query_idx in range(0,len(unique_query_ids)):
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        q_spec_tmp = convert_spec(q_spec_tmp,mzs)

        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            # work on a private copy: the transformations below write into q_spec in place, and without
            # the copy the query spectrum would be re-transformed cumulatively for every reference spectrum
            q_spec = np.copy(q_spec_tmp)
            if ref_idx % 1000 == 0:
                print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)

            # apply spectrum preprocessing transformation in the order specified by user
            for transformation in spectrum_preprocessing_order:
                # a spectrum containing an infinite intensity is zeroed out so that it ends up with similarity 0
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
                if transformation == 'W': # weight factor transformation
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
                if transformation == 'L': # low-entropy transformation
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
                if transformation == 'N': # noise removal
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    if preprocess_reference:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                if transformation == 'F': # filter with respect to mz and/or intensity
                    q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                    if preprocess_reference:
                        r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)

            # query and reference spectrum intensities
            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]

            # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                if similarity_measure == 'cosine':
                    similarity_score = S_cos(q_ints, r_ints)
                else:
                    # normalize intensities of each spectrum so they sum to 1 so that they represent a probability distribution
                    q_ints = normalize(q_ints, method = normalization_method)
                    r_ints = normalize(r_ints, method = normalization_method)

                    if similarity_measure == 'shannon':
                        similarity_score = S_shannon(q_ints, r_ints)
                    elif similarity_measure == 'renyi':
                        similarity_score = S_renyi(q_ints, r_ints, q)
                    elif similarity_measure == 'tsallis':
                        similarity_score = S_tsallis(q_ints, r_ints, q)
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['Query Spectrum ID']

    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            # all reference spectra tied at the current maximum form one rank; they are then dropped
            # so that the next iteration finds the next-best score
            row_scores = df_scores_tmp.iloc[i,:]
            best_score = np.max(row_scores)
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(row_scores == best_score)[0]]
            cols_to_keep = np.where(row_scores != best_score)[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            # once every reference spectrum has been ranked, the remaining ranks are padded with score 0
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[preds,scores]

    # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
    cnames_preds = [f'RANK.{rank}.PRED' for rank in range(1, n_top_matches_to_save+1)]
    cnames_scores = [f'RANK.{rank}.SIMILARITY.SCORE' for rank in range(1, n_top_matches_to_save+1)]

    # get pandas dataframe with identification results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
    df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
    df_top_ref_specs.index = unique_query_ids
    df_top_ref_specs.index.names = ['Query Spectrum ID']

    # print the identification results if the user desires
    if print_id_results == True:
        print(df_top_ref_specs.to_string())

    # write spectral library matching results to disk
    df_top_ref_specs.to_csv(output_identification)

    # write all similarity scores to disk
    df_scores.columns = ['Reference Spectrum ID: ' + str(col) for col in df_scores.columns.tolist()]
    df_scores.to_csv(output_similarity_scores)
942
+
943
+