pycompound 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,4 @@
1
1
 
2
- # this script's function runs spectral library matching to identify unknown query compound(s)
3
-
4
2
  from pycompound.build_library import build_library_from_raw_data
5
3
  from .processing import *
6
4
  from .similarity_measures import *
@@ -9,6 +7,13 @@ from pathlib import Path
9
7
  import json
10
8
  from itertools import product
11
9
  from joblib import Parallel, delayed
10
+ import csv
11
+ import sys, csv
12
+
13
+
14
# Default parameter grid for HRMS tuning: every value is a list of candidates,
# and the tuning functions evaluate the Cartesian product of these lists.
default_HRMS_grid = {
    'similarity_measure': ['cosine'],
    'weight': [{'Cosine': 0.25, 'Shannon': 0.25, 'Renyi': 0.25, 'Tsallis': 0.25}],
    'spectrum_preprocessing_order': ['FCNMWL'],
    'mz_min': [0],
    'mz_max': [9999999],
    'int_min': [0],
    'int_max': [99999999],
    'window_size_centroiding': [0.5],
    'window_size_matching': [0.5],
    'noise_threshold': [0.0],
    'wf_mz': [0.0],
    'wf_int': [1.0],
    'LET_threshold': [0.0],
    'entropy_dimension': [1.1],
    'high_quality_reference_library': [False],
}

# Default parameter grid for NRMS tuning: same keys as the HRMS grid except
# that the two window-size entries are absent.
default_NRMS_grid = {
    'similarity_measure': ['cosine'],
    'weight': [{'Cosine': 0.25, 'Shannon': 0.25, 'Renyi': 0.25, 'Tsallis': 0.25}],
    'spectrum_preprocessing_order': ['FCNMWL'],
    'mz_min': [0],
    'mz_max': [9999999],
    'int_min': [0],
    'int_max': [99999999],
    'noise_threshold': [0.0],
    'wf_mz': [0.0],
    'wf_int': [1.0],
    'LET_threshold': [0.0],
    'entropy_dimension': [1.1],
    'high_quality_reference_library': [False],
}
16
+
12
17
 
13
18
  def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
14
19
  similarity_measure_tmp, weight,
@@ -71,22 +76,23 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
71
76
  )
72
77
 
73
78
 
74
- def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid={'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}, output_path=None):
79
+
80
+ def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
75
81
  """
76
- runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a CSV file, and prints top-performing parameters
82
+ runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
77
83
 
78
84
  --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
79
85
  --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier (i.e. the CAS registry number or the compound name), and each of the remaining columns should correspond to a single mass/charge ratio. Mandatory argument.
80
86
  --grid: dict with all possible parameter values to try.
81
- --output_path: accuracy from each choice of parameter set is saved to a CSV file here.
87
+ --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
82
88
  """
83
89
 
90
+ grid = {**default_HRMS_grid, **(grid or {})}
84
91
  for key, value in grid.items():
85
92
  globals()[key] = value
86
93
 
87
- # load query and reference libraries
88
94
  if query_data is None:
89
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
95
+ print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
90
96
  sys.exit()
91
97
  else:
92
98
  extension = query_data.rsplit('.',1)
@@ -118,37 +124,157 @@ def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid={'simila
118
124
  print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
119
125
 
120
126
  if output_path is None:
121
- output_path = f'{Path.cwd()}/tuning_param_output.csv'
127
+ output_path = f'{Path.cwd()}/tuning_param_output.txt'
122
128
  print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
123
129
 
124
- # build parameter grid out of the lists you already set
125
130
  param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold,
126
131
  window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
127
- # run in parallel on all CPUs
128
132
  results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
129
133
 
130
134
  df_out = pd.DataFrame(results, columns=[
131
135
  'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX','NOISE.THRESHOLD',
132
136
  'WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING', 'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
133
137
  ])
134
- df_out = df_out.drop(columns=['WEIGHT'])
135
- df_out.to_csv(output_path, index=False)
138
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
139
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
140
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
141
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(":","",regex=False)
142
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Cosine","",regex=False)
143
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Shannon","",regex=False)
144
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
145
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
146
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
147
+ df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
148
+
149
+ if return_output is False:
150
+ df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
151
+ else:
152
+ return df_out
136
153
 
137
154
 
138
- def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid={'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FNLW'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}, output_path=None):
155
+
156
def tune_params_on_HRMS_data_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """
    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible
    combinations of parameters in the grid dict, one combination at a time in the calling process
    (printing a progress line after each), and either returns the results or saves them to a
    tab-separated TXT file

    --query_data: mgf, mzML, cdf, or csv file of query mass spectrum/spectra to be identified. Non-csv input is
        first converted with build_library_from_raw_data into a csv written next to the input file. Mandatory argument.
    --reference_data: path to a reference file, or an iterable of such paths whose contents are concatenated.
        Each path is loaded with get_reference_df. Mandatory argument.
    --grid: dict with all possible parameter values to try. Keys that are missing fall back to default_HRMS_grid.
    --output_path: accuracy from each choice of parameter set is saved to a TXT file here when return_output is
        False. Default: tuning_param_output.txt in the current working directory.
    --return_output: if True, the results DataFrame is returned and nothing is written to disk. Default: False

    Returns the results DataFrame when return_output is True, otherwise None. Calls sys.exit() after printing an
    error message if query_data or reference_data is missing or query_data has an unsupported extension.
    """

    # Parameter lists are read from this merged dict; nothing is written into module globals,
    # so one call cannot leak its grid into another call or shadow module-level names.
    local_grid = {**default_HRMS_grid, **(grid or {})}

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()

    stem, _, extension = query_data.rpartition('.')
    if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
        # Replace the whole extension (which may be 3 or 4 characters long) rather than
        # slicing off a fixed 3 characters, so 'x.mzML' maps to 'x.csv'.
        output_path_tmp = f'{stem}.csv'
        build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
        df_query = pd.read_csv(output_path_tmp)
    elif extension in ('csv','CSV'):
        df_query = pd.read_csv(query_data)
    else:
        print(f'\nError: Unsupported query_data extension: {extension}')
        sys.exit()
    unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()

    if isinstance(reference_data, str):
        df_reference = get_reference_df(reference_data=reference_data)
        unique_reference_ids = df_reference.iloc[:, 0].unique()
    else:
        reference_frames = []
        unique_reference_ids = []
        for reference_file in reference_data:
            frame = get_reference_df(reference_data=reference_file)
            reference_frames.append(frame)
            # NOTE(review): IDs are unique per file only; an ID present in several
            # files appears several times in this list - confirm that is intended.
            unique_reference_ids.extend(frame.iloc[:, 0].unique())
        df_reference = pd.concat(reference_frames, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    # Positional order expected by _eval_one_HRMS after the four data arguments.
    param_names = (
        'similarity_measure', 'weight', 'spectrum_preprocessing_order', 'mz_min', 'mz_max', 'int_min', 'int_max',
        'noise_threshold', 'window_size_centroiding', 'window_size_matching', 'wf_mz', 'wf_int', 'LET_threshold',
        'entropy_dimension', 'high_quality_reference_library',
    )
    # Materialize the grid once so the progress total is derived from the same
    # object that is iterated, instead of a separately maintained product of lengths.
    combinations = list(product(*(local_grid[name] for name in param_names)))
    total = len(combinations)

    results = []
    for done, params in enumerate(combinations, start=1):
        results.append(_eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params))
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Strip quotes, braces, colons, the weight names and spaces from the WEIGHT text,
    # which is then written to a tab-separated file with quoting disabled.
    weight_text = df_out['WEIGHT'].astype(str)
    for token in ("\"", "{", "}", ":", "Cosine", "Shannon", "Renyi", "Tsallis", " "):
        weight_text = weight_text.str.replace(token, "", regex=False)
    df_out['WEIGHT'] = weight_text

    if return_output:
        return df_out

    # The default path (and its warning) only matters when a file is actually written.
    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
    print(f'Wrote results to {output_path}')
262
+
263
+
264
+ def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
265
+ """
266
+ runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
141
267
 
142
268
  --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
143
269
  --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier (i.e. the CAS registry number or the compound name), and each of the remaining columns should correspond to a single mass/charge ratio. Mandatory argument.
144
270
  --grid: dict with all possible parameter values to try
145
- --output_path: accuracy from each choice of parameter set is saved to a CSV file here
271
+ --output_path: accuracy from each choice of parameter set is saved to a TXT file here
146
272
  """
147
273
 
274
+ grid = {**default_NRMS_grid, **(grid or {})}
148
275
  for key, value in grid.items():
149
276
  globals()[key] = value
150
277
 
151
- # load query and reference libraries
152
278
  if query_data is None:
153
279
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
154
280
  sys.exit()
@@ -182,75 +308,184 @@ def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid={'simila
182
308
  print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
183
309
 
184
310
  if output_path is None:
185
- output_path = f'{Path.cwd()}/tuning_param_output.csv'
311
+ output_path = f'{Path.cwd()}/tuning_param_output.txt'
186
312
  print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
187
313
 
188
- # build parameter grid out of the lists you already set
189
314
  param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
190
315
  noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
191
- # run in parallel on all CPUs
192
316
  results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
193
317
 
194
318
  df_out = pd.DataFrame(results, columns=[
195
319
  'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
196
320
  'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
197
321
  ])
198
- df_out = df_out.drop(columns=['WEIGHT'])
199
- df_out.to_csv(output_path, index=False)
322
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
323
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
324
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
325
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(":","",regex=False)
326
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Cosine","",regex=False)
327
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Shannon","",regex=False)
328
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
329
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
330
+ df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
331
+ if return_output is False:
332
+ df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
333
+ else:
334
+ return df_out
335
+
336
+
337
+
338
def tune_params_on_NRMS_data_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """
    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible
    combinations of parameters in the grid dict, one combination at a time in the calling process
    (printing a progress line after each), and either returns the results or saves them to a
    tab-separated TXT file

    --query_data: mgf, mzML, cdf, or csv file of query mass spectrum/spectra to be identified. Non-csv input is
        first converted with build_library_from_raw_data into a csv written next to the input file. Mandatory argument.
    --reference_data: path to a reference file, or an iterable of such paths whose contents are concatenated.
        Each path is loaded with get_reference_df. Mandatory argument.
    --grid: dict with all possible parameter values to try. Keys that are missing fall back to default_NRMS_grid.
    --output_path: accuracy from each choice of parameter set is saved to a TXT file here when return_output is
        False. Default: tuning_param_output.txt in the current working directory.
    --return_output: if True, the results DataFrame is returned and nothing is written to disk. Default: False

    Returns the results DataFrame when return_output is True, otherwise None. Calls sys.exit() after printing an
    error message if query_data or reference_data is missing or query_data has an unsupported extension.
    """

    # Parameter lists are read from this merged dict; nothing is written into module globals,
    # so one call cannot leak its grid into another call or shadow module-level names.
    local_grid = {**default_NRMS_grid, **(grid or {})}

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()

    stem, _, extension = query_data.rpartition('.')
    if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
        # Replace the whole extension (which may be 3 or 4 characters long) rather than
        # slicing off a fixed 3 characters, so 'x.mzML' maps to 'x.csv'.
        output_path_tmp = f'{stem}.csv'
        build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
        df_query = pd.read_csv(output_path_tmp)
    elif extension in ('csv','CSV'):
        df_query = pd.read_csv(query_data)
    else:
        print(f'\nError: Unsupported query_data extension: {extension}')
        sys.exit()
    unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()

    if isinstance(reference_data, str):
        df_reference = get_reference_df(reference_data=reference_data)
        unique_reference_ids = df_reference.iloc[:, 0].unique()
    else:
        reference_frames = []
        unique_reference_ids = []
        for reference_file in reference_data:
            frame = get_reference_df(reference_data=reference_file)
            reference_frames.append(frame)
            # NOTE(review): IDs are unique per file only; an ID present in several
            # files appears several times in this list - confirm that is intended.
            unique_reference_ids.extend(frame.iloc[:, 0].unique())
        df_reference = pd.concat(reference_frames, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    # Positional order expected by _eval_one_NRMS after the four data arguments.
    param_names = (
        'similarity_measure', 'weight', 'spectrum_preprocessing_order', 'mz_min', 'mz_max', 'int_min', 'int_max',
        'noise_threshold', 'wf_mz', 'wf_int', 'LET_threshold',
        'entropy_dimension', 'high_quality_reference_library',
    )
    # Materialize the grid once so the progress total is derived from the same
    # object that is iterated, instead of a separately maintained product of lengths.
    combinations = list(product(*(local_grid[name] for name in param_names)))
    total = len(combinations)

    results = []
    for done, params in enumerate(combinations, start=1):
        results.append(_eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params))
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Strip quotes, braces, colons, the weight names and spaces from the WEIGHT text,
    # which is then written to a tab-separated file with quoting disabled.
    weight_text = df_out['WEIGHT'].astype(str)
    for token in ("\"", "{", "}", ":", "Cosine", "Shannon", "Renyi", "Tsallis", " "):
        weight_text = weight_text.str.replace(token, "", regex=False)
    df_out['WEIGHT'] = weight_text

    if return_output:
        return df_out

    # The default path (and its warning) only matters when a file is actually written.
    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
    print(f'Wrote results to {output_path}')
200
440
 
201
441
 
202
442
 
203
443
 
204
444
  def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
205
- # returns accuracy for a given set of parameters
206
445
 
207
446
  n_top_matches_to_save = 1
208
447
 
209
- # compute the similarity score between each query library spectrum/spectra and all reference library spectra
210
448
  all_similarity_scores = []
211
449
  for query_idx in range(0,len(unique_query_ids)):
212
450
  print(f'query spectrum #{query_idx} is being identified')
213
451
  q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
214
452
  q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
215
453
 
216
- # compute the similarity score between the given query spectrum and all spectra in the reference library
217
454
  similarity_scores = []
218
455
  for ref_idx in range(0,len(unique_reference_ids)):
219
456
  q_spec = q_spec_tmp
220
457
  r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
221
458
  r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
222
459
 
223
- # apply spectrum preprocessing transformation in the order specified by user
224
460
  is_matched = False
225
461
  for transformation in spectrum_preprocessing_order:
226
462
  if np.isinf(q_spec[:,1]).sum() > 0:
227
463
  q_spec[:,1] = np.zeros(q_spec.shape[0])
228
464
  if np.isinf(r_spec[:,1]).sum() > 0:
229
465
  r_spec[:,1] = np.zeros(r_spec.shape[0])
230
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # centroiding
466
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
231
467
  q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
232
468
  r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
233
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # matching
469
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
234
470
  m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
235
471
  q_spec = m_spec[:,0:2]
236
472
  r_spec = m_spec[:,[0,2]]
237
473
  is_matched = True
238
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # weight factor transformation
474
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
239
475
  q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
240
476
  r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
241
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # low-entropy tranformation
477
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
242
478
  q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
243
479
  r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
244
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # noise removal
480
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
245
481
  q_spec = remove_noise(q_spec, nr = noise_threshold)
246
482
  if high_quality_reference_library == False:
247
483
  r_spec = remove_noise(r_spec, nr = noise_threshold)
248
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # filter with respect to mz and/or intensity
484
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
249
485
  q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
250
486
  if high_quality_reference_library == False:
251
487
  r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
252
488
 
253
- # query and reference spectrum intensities
254
489
  q_ints = q_spec[:,1]
255
490
  r_ints = r_spec[:,1]
256
491
  if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
@@ -261,12 +496,10 @@ def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
261
496
  similarity_scores.append(similarity_score)
262
497
  all_similarity_scores.append(similarity_scores)
263
498
 
264
- # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
265
499
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
266
500
  df_scores.index = unique_query_ids
267
501
  df_scores.index.names = ['Query Spectrum ID']
268
502
 
269
- # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
270
503
  preds = []
271
504
  scores = []
272
505
  for i in range(0, df_scores.shape[0]):
@@ -297,7 +530,6 @@ def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
297
530
 
298
531
 
299
532
  def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
300
- # returns accuracy for a given set of parameters
301
533
 
302
534
  n_top_matches_to_save = 1
303
535
 
@@ -320,32 +552,29 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
320
552
  r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
321
553
  r_spec = convert_spec(r_spec_tmp,mzs)
322
554
 
323
- # apply spectrum preprocessing transformation in the order specified by user
324
555
  for transformation in spectrum_preprocessing_order:
325
556
  if np.isinf(q_spec[:,1]).sum() > 0:
326
557
  q_spec[:,1] = np.zeros(q_spec.shape[0])
327
558
  if np.isinf(r_spec[:,1]).sum() > 0:
328
559
  r_spec[:,1] = np.zeros(r_spec.shape[0])
329
- if transformation == 'W': # weight factor transformation
560
+ if transformation == 'W':
330
561
  q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
331
562
  r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
332
- if transformation == 'L': # low-entropy transformation
563
+ if transformation == 'L':
333
564
  q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
334
565
  r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
335
- if transformation == 'N': # noise removal
566
+ if transformation == 'N':
336
567
  q_spec = remove_noise(q_spec, nr = noise_threshold)
337
568
  if high_quality_reference_library == False:
338
569
  r_spec = remove_noise(r_spec, nr = noise_threshold)
339
- if transformation == 'F': # filter with respect to mz and/or intensity
570
+ if transformation == 'F':
340
571
  q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
341
572
  if high_quality_reference_library == False:
342
573
  r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
343
574
 
344
- # query and reference spectrum intensities
345
575
  q_ints = q_spec[:,1]
346
576
  r_ints = r_spec[:,1]
347
577
 
348
- # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
349
578
  if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
350
579
  similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
351
580
  else:
@@ -354,12 +583,10 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
354
583
  similarity_scores.append(similarity_score)
355
584
  all_similarity_scores.append(similarity_scores)
356
585
 
357
- # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
358
586
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
359
587
  df_scores.index = unique_query_ids
360
588
  df_scores.index.names = ['Query Spectrum ID']
361
589
 
362
- # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
363
590
  preds = []
364
591
  scores = []
365
592
  for i in range(0, df_scores.shape[0]):
@@ -371,7 +598,6 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
371
598
  cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
372
599
  df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
373
600
 
374
- #preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
375
601
  preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
376
602
  if len(top_ref_specs_tmp.values) == 0:
377
603
  scores_tmp.append(0)
@@ -413,11 +639,10 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
413
639
  --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
414
640
  --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
415
641
  --print_id_results: Flag that prints identification results if True. Default: False
416
- --output_identification: Output CSV file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.csv\'.
417
- --output_similarity_scores: Output CSV file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this CSV file is written to the current working directory with filename \'output_all_similarity_scores\'.csv.')
642
+ --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
643
+ --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining columns contain the similarity scores with respect to all reference library spectra. If no argument is passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores.txt\'.
418
644
  '''
419
645
 
420
- # load query and reference libraries
421
646
  if query_data is None:
422
647
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
423
648
  sys.exit()
@@ -449,7 +674,6 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
449
674
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
450
675
 
451
676
 
452
- ##### process input parameters and ensure they are in a valid format #####
453
677
  if spectrum_preprocessing_order is not None:
454
678
  spectrum_preprocessing_order = list(spectrum_preprocessing_order)
455
679
  else:
@@ -517,7 +741,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
517
741
  else:
518
742
  q = entropy_dimension
519
743
 
520
- normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
744
+ normalization_method = 'standard'
521
745
 
522
746
  if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
523
747
  print('\nError: n_top_matches_to_save should be a positive integer')
@@ -528,23 +752,20 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
528
752
  sys.exit()
529
753
 
530
754
  if output_identification is None:
531
- output_identification = f'{Path.cwd()}/output_identification.csv'
755
+ output_identification = f'{Path.cwd()}/output_identification.txt'
532
756
  print(f'Warning: writing identification output to {output_identification}')
533
757
 
534
758
  if output_similarity_scores is None:
535
- output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.csv'
759
+ output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
536
760
  print(f'Warning: writing similarity scores to {output_similarity_scores}')
537
761
 
538
762
 
539
- ####################################### begin spectral library matching #######################################
540
- # compute the similarity score between each query library spectrum/spectra and all reference library spectra
541
763
  all_similarity_scores = []
542
764
  for query_idx in range(0,len(unique_query_ids)):
543
765
  print(f'query spectrum #{query_idx} is being identified')
544
766
  q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
545
767
  q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
546
768
 
547
- # compute the similarity score between the given query spectrum and all spectra in the reference library
548
769
  similarity_scores = []
549
770
  for ref_idx in range(0,len(unique_reference_ids)):
550
771
  #if ref_idx % 100 == 0:
@@ -553,37 +774,35 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
553
774
  r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
554
775
  r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
555
776
 
556
- # apply spectrum preprocessing transformation in the order specified by user
557
777
  is_matched = False
558
778
  for transformation in spectrum_preprocessing_order:
559
779
  if np.isinf(q_spec[:,1]).sum() > 0:
560
780
  q_spec[:,1] = np.zeros(q_spec.shape[0])
561
781
  if np.isinf(r_spec[:,1]).sum() > 0:
562
782
  r_spec[:,1] = np.zeros(r_spec.shape[0])
563
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # centroiding
783
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
564
784
  q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
565
785
  r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
566
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # matching
786
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
567
787
  m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
568
788
  q_spec = m_spec[:,0:2]
569
789
  r_spec = m_spec[:,[0,2]]
570
790
  is_matched = True
571
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # weight factor transformation
791
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
572
792
  q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
573
793
  r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
574
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # low-entropy tranformation
794
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
575
795
  q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
576
796
  r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
577
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # noise removal
797
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
578
798
  q_spec = remove_noise(q_spec, nr = noise_threshold)
579
799
  if high_quality_reference_library == False:
580
800
  r_spec = remove_noise(r_spec, nr = noise_threshold)
581
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # filter with respect to mz and/or intensity
801
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
582
802
  q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
583
803
  if high_quality_reference_library == False:
584
804
  r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
585
805
 
586
- # query and reference spectrum intensities
587
806
  q_ints = q_spec[:,1]
588
807
  r_ints = r_spec[:,1]
589
808
 
@@ -595,12 +814,10 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
595
814
  similarity_scores.append(similarity_score)
596
815
  all_similarity_scores.append(similarity_scores)
597
816
 
598
- # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
599
817
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
600
818
  df_scores.index = unique_query_ids
601
819
  df_scores.index.names = ['Query Spectrum ID']
602
820
 
603
- # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
604
821
  preds = []
605
822
  scores = []
606
823
  for i in range(0, df_scores.shape[0]):
@@ -624,30 +841,24 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
624
841
  scores = np.array(scores)
625
842
  out = np.c_[preds,scores]
626
843
 
627
- # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
628
844
  cnames_preds = []
629
845
  cnames_scores = []
630
846
  for i in range(0,n_top_matches_to_save):
631
847
  cnames_preds.append(f'RANK.{i+1}.PRED')
632
848
  cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
633
849
 
634
- # get pandas dataframe with identifcation results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
635
850
  df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
636
851
  df_top_ref_specs.index = unique_query_ids
637
852
  df_top_ref_specs.index.names = ['Query Spectrum ID']
638
853
 
639
854
  df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
640
855
 
641
- # print the identification results if the user desires
642
856
  if print_id_results == True:
643
857
  print(df_top_ref_specs.to_string())
644
858
 
645
859
  if return_ID_output is False:
646
- # write spectral library matching results to disk
647
- df_top_ref_specs.to_csv(output_identification)
648
-
649
- # write all similarity scores to disk
650
- df_scores.to_csv(output_similarity_scores)
860
+ df_top_ref_specs.to_csv(output_identification, sep='\t')
861
+ df_scores.to_csv(output_similarity_scores, sep='\t')
651
862
  else:
652
863
  return df_top_ref_specs
653
864
 
@@ -678,11 +889,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
678
889
  --normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since entropy quantifies the uncertainty of a probability distribution, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
679
890
  --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
680
891
  --print_id_results: Flag that prints identification results if True. Default: False
681
- --output_identification: Output CSV file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.csv\'.
682
- --output_similarity_scores: Output CSV file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this CSV file is written to the current working directory with filename \'output_all_similarity_scores\'.csv.')
892
+ --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
893
+ --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining columns contain the similarity scores with respect to all reference library spectra. If no argument is passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores.txt\'.
683
894
  '''
684
895
 
685
- # load query and reference libraries
686
896
  if query_data is None:
687
897
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
688
898
  sys.exit()
@@ -714,7 +924,6 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
714
924
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
715
925
 
716
926
 
717
- ##### process input parameters and ensure they are in a valid format #####
718
927
  if spectrum_preprocessing_order is not None:
719
928
  spectrum_preprocessing_order = list(spectrum_preprocessing_order)
720
929
  else:
@@ -767,7 +976,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
767
976
  else:
768
977
  q = entropy_dimension
769
978
 
770
- normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
979
+ normalization_method = 'standard'
771
980
 
772
981
  if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
773
982
  print('\nError: n_top_matches_to_save should be a positive integer')
@@ -778,23 +987,19 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
778
987
  sys.exit()
779
988
 
780
989
  if output_identification is None:
781
- output_identification = f'{Path.cwd()}/output_identification.csv'
990
+ output_identification = f'{Path.cwd()}/output_identification.txt'
782
991
  print(f'Warning: writing identification output to {output_identification}')
783
992
 
784
993
  if output_similarity_scores is None:
785
- output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.csv'
994
+ output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
786
995
  print(f'Warning: writing similarity scores to {output_similarity_scores}')
787
996
 
788
997
 
789
998
 
790
- ####################################### begin spectral library matching #######################################
791
- # get the range of m/z values
792
999
  min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
793
1000
  max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
794
1001
  mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
795
1002
 
796
- # compute the similarity score between each query library spectrum/spectra and all reference library spectra
797
- # for each query spectrum, compute its similarity with all reference spectra
798
1003
  all_similarity_scores = []
799
1004
  for query_idx in range(0,len(unique_query_ids)):
800
1005
  q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
@@ -810,32 +1015,29 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
810
1015
  r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
811
1016
  r_spec = convert_spec(r_spec_tmp,mzs)
812
1017
 
813
- # apply spectrum preprocessing transformation in the order specified by user
814
1018
  for transformation in spectrum_preprocessing_order:
815
1019
  if np.isinf(q_spec[:,1]).sum() > 0:
816
1020
  q_spec[:,1] = np.zeros(q_spec.shape[0])
817
1021
  if np.isinf(r_spec[:,1]).sum() > 0:
818
1022
  r_spec[:,1] = np.zeros(r_spec.shape[0])
819
- if transformation == 'W': # weight factor transformation
1023
+ if transformation == 'W':
820
1024
  q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
821
1025
  r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
822
- if transformation == 'L': # low-entropy transformation
1026
+ if transformation == 'L':
823
1027
  q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
824
1028
  r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
825
- if transformation == 'N': # noise removal
1029
+ if transformation == 'N':
826
1030
  q_spec = remove_noise(q_spec, nr = noise_threshold)
827
1031
  if high_quality_reference_library == False:
828
1032
  r_spec = remove_noise(r_spec, nr = noise_threshold)
829
- if transformation == 'F': # filter with respect to mz and/or intensity
1033
+ if transformation == 'F':
830
1034
  q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
831
1035
  if high_quality_reference_library == False:
832
1036
  r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
833
1037
 
834
- # query and reference spectrum intensities
835
1038
  q_ints = q_spec[:,1]
836
1039
  r_ints = r_spec[:,1]
837
1040
 
838
- # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
839
1041
  if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
840
1042
  similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
841
1043
  else:
@@ -844,12 +1046,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
844
1046
  similarity_scores.append(similarity_score)
845
1047
  all_similarity_scores.append(similarity_scores)
846
1048
 
847
- # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
848
1049
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
849
1050
  df_scores.index = unique_query_ids
850
1051
  df_scores.index.names = ['Query Spectrum ID']
851
1052
 
852
- # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
853
1053
  preds = []
854
1054
  scores = []
855
1055
  for i in range(0, df_scores.shape[0]):
@@ -861,7 +1061,6 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
861
1061
  cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
862
1062
  df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
863
1063
 
864
- #preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
865
1064
  preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
866
1065
  if len(top_ref_specs_tmp.values) == 0:
867
1066
  scores_tmp.append(0)
@@ -874,31 +1073,25 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
874
1073
  scores = np.array(scores)
875
1074
  out = np.c_[preds,scores]
876
1075
 
877
- # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
878
1076
  cnames_preds = []
879
1077
  cnames_scores = []
880
1078
  for i in range(0,n_top_matches_to_save):
881
1079
  cnames_preds.append(f'RANK.{i+1}.PRED')
882
1080
  cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
883
1081
 
884
- # get pandas dataframe with identifcation results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
885
1082
  df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
886
1083
  df_top_ref_specs.index = unique_query_ids
887
1084
  df_top_ref_specs.index.names = ['Query Spectrum ID']
888
1085
 
889
- # print the identification results if the user desires
890
1086
  if print_id_results == True:
891
1087
  print(df_top_ref_specs.to_string())
892
1088
 
893
1089
  df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
894
1090
 
895
1091
  if return_ID_output is False:
896
- # write spectral library matching results to disk
897
- df_top_ref_specs.to_csv(output_identification)
898
-
899
- # write all similarity scores to disk
1092
+ df_top_ref_specs.to_csv(output_identification, sep='\t')
900
1093
  df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
901
- df_scores.to_csv(output_similarity_scores)
1094
+ df_scores.to_csv(output_similarity_scores, sep='\t')
902
1095
  else:
903
1096
  return df_top_ref_specs
904
1097