pycompound 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,4 @@
1
1
 
2
- # this script's function runs spectral library matching to identify unknown query compound(s)
3
-
4
2
  from pycompound.build_library import build_library_from_raw_data
5
3
  from .processing import *
6
4
  from .similarity_measures import *
@@ -10,6 +8,7 @@ import json
10
8
  from itertools import product
11
9
  from joblib import Parallel, delayed
12
10
  import csv
11
+ import sys, csv
13
12
 
14
13
 
15
14
  default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
@@ -80,21 +79,20 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
80
79
 
81
80
  def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
82
81
  """
83
- runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a CSV file, and prints top-performing parameters
82
+ runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
84
83
 
85
84
  --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
86
85
  --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier (i.e. the CAS registry number or the compound name), and each of the remaining columns should correspond to a single mass/charge ratio. Mandatory argument.
87
86
  --grid: dict with all possible parameter values to try.
88
- --output_path: accuracy from each choice of parameter set is saved to a CSV file here.
87
+ --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
89
88
  """
90
89
 
91
90
  grid = {**default_HRMS_grid, **(grid or {})}
92
91
  for key, value in grid.items():
93
92
  globals()[key] = value
94
93
 
95
- # load query and reference libraries
96
94
  if query_data is None:
97
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
95
+ print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
98
96
  sys.exit()
99
97
  else:
100
98
  extension = query_data.rsplit('.',1)
@@ -154,14 +152,123 @@ def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid=None, ou
154
152
  return df_out
155
153
 
156
154
 
155
+
156
def tune_params_on_HRMS_data_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """
    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with every combination of
    parameters in the grid dict, prints a progress line after each combination, and either returns the
    results as a pandas DataFrame or writes them to a tab-separated TXT file

    --query_data: mgf, mzML, cdf, or csv file of query mass spectrum/spectra to be identified. Raw files are
        first converted with build_library_from_raw_data to a CSV written next to the input file. If csv file,
        each row should correspond to a mass spectrum, the left-most column should contain an identifier, and
        each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
    --reference_data: path (str) of the reference library, or an iterable of such paths whose contents are
        concatenated. Each path is loaded with get_reference_df. Mandatory argument.
    --grid: dict mapping parameter names to lists of values to try. Entries override default_HRMS_grid.
    --output_path: accuracy from each choice of parameter set is saved to a TXT file here. Only used when
        return_output is False. Default: tuning_param_output.txt in the current working directory.
    --return_output: if True, the results DataFrame is returned and nothing is written to disk.

    Exits via sys.exit() if query_data or reference_data is None, or if query_data has an unsupported
    extension.
    """

    local_grid = {**default_HRMS_grid, **(grid or {})}
    # Kept for backward compatibility: every grid entry is still published as a module-level global.
    # The grid search below reads from local_grid so it cannot pick up stale globals from an earlier call.
    for key, value in local_grid.items():
        globals()[key] = value

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()

    path_parts = query_data.rsplit('.', 1)
    extension = path_parts[-1]
    if extension in ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF'):
        # Strip the real extension rather than a fixed 3 characters, so 'x.mzML' maps to 'x.csv' (not 'x.mcsv').
        output_path_tmp = f'{path_parts[0]}.csv'
        build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
        df_query = pd.read_csv(output_path_tmp)
    elif extension in ('csv', 'CSV'):
        df_query = pd.read_csv(query_data)
    else:
        print(f'\nError: Unsupported query_data extension: {extension}')
        sys.exit()
    unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()

    if isinstance(reference_data, str):
        df_reference = get_reference_df(reference_data=reference_data)
        unique_reference_ids = df_reference.iloc[:, 0].unique()
    else:
        # NOTE(review): IDs that occur in more than one reference file are not de-duplicated here - confirm
        # whether reference libraries are expected to be disjoint.
        dfs = []
        unique_reference_ids = []
        for f in reference_data:
            tmp = get_reference_df(reference_data=f)
            dfs.append(tmp)
            unique_reference_ids.extend(tmp.iloc[:, 0].unique())
        df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    # Only resolve (and warn about) the default output file when something will actually be written.
    if output_path is None and not return_output:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # Order matters: each combination is unpacked positionally into _eval_one_HRMS, and the output columns
    # below follow the same order.
    axis_names = (
        'similarity_measure', 'weight', 'spectrum_preprocessing_order', 'mz_min', 'mz_max', 'int_min', 'int_max',
        'noise_threshold', 'window_size_centroiding', 'window_size_matching', 'wf_mz', 'wf_int', 'LET_threshold',
        'entropy_dimension', 'high_quality_reference_library',
    )
    axes = [local_grid[name] for name in axis_names]

    total = 1
    for axis in axes:
        total *= len(axis)

    results = []
    for done, params in enumerate(product(*axes), start=1):
        results.append(_eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params))
        # Flushed so a front end reading stdout sees progress as it happens.
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC', 'SIMILARITY.MEASURE', 'WEIGHT', 'SPECTRUM.PROCESSING.ORDER', 'MZ.MIN', 'MZ.MAX',
        'INT.MIN', 'INT.MAX', 'NOISE.THRESHOLD', 'WINDOW.SIZE.CENTROIDING', 'WINDOW.SIZE.MATCHING',
        'WF.MZ', 'WF.INT', 'LET.THRESHOLD', 'ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Reduce the weight dict's text form to its bare numbers; double quotes must go because the file is
    # written with csv.QUOTE_NONE.
    weight_text = df_out['WEIGHT'].astype(str)
    for token in ('"', '{', '}', ':', 'Cosine', 'Shannon', 'Renyi', 'Tsallis', ' '):
        weight_text = weight_text.str.replace(token, '', regex=False)
    df_out['WEIGHT'] = weight_text

    if return_output:
        return df_out

    df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
    print(f'Wrote results to {output_path}')
262
+
263
+
157
264
  def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
158
265
  """
159
- runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a CSV file, and prints top-performing parameters
266
+ runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
160
267
 
161
268
  --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
162
269
  --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier (i.e. the CAS registry number or the compound name), and each of the remaining columns should correspond to a single mass/charge ratio. Mandatory argument.
163
270
  --grid: dict with all possible parameter values to try
164
- --output_path: accuracy from each choice of parameter set is saved to a CSV file here
271
+ --output_path: accuracy from each choice of parameter set is saved to a TXT file here
165
272
  """
166
273
 
167
274
  grid = {**default_NRMS_grid, **(grid or {})}
@@ -228,57 +335,157 @@ def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid=None, ou
228
335
 
229
336
 
230
337
 
338
def tune_params_on_NRMS_data_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """
    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with every combination
    of parameters in the grid dict, prints a progress line after each combination, and either returns the
    results as a pandas DataFrame or writes them to a tab-separated TXT file

    --query_data: mgf, mzML, cdf, or csv file of query mass spectrum/spectra to be identified. Raw files are
        first converted with build_library_from_raw_data to a CSV written next to the input file. If csv file,
        each row should correspond to a mass spectrum, the left-most column should contain an identifier, and
        each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
    --reference_data: path (str) of the reference library, or an iterable of such paths whose contents are
        concatenated. Each path is loaded with get_reference_df. Mandatory argument.
    --grid: dict mapping parameter names to lists of values to try. Entries override default_NRMS_grid.
    --output_path: accuracy from each choice of parameter set is saved to a TXT file here. Only used when
        return_output is False. Default: tuning_param_output.txt in the current working directory.
    --return_output: if True, the results DataFrame is returned and nothing is written to disk.

    Exits via sys.exit() if query_data or reference_data is None, or if query_data has an unsupported
    extension.
    """

    local_grid = {**default_NRMS_grid, **(grid or {})}
    # Kept for backward compatibility: every grid entry is still published as a module-level global.
    # The grid search below reads from local_grid so it cannot pick up stale globals from an earlier call.
    for key, value in local_grid.items():
        globals()[key] = value

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()

    path_parts = query_data.rsplit('.', 1)
    extension = path_parts[-1]
    if extension in ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF'):
        # Strip the real extension rather than a fixed 3 characters, so 'x.mzML' maps to 'x.csv' (not 'x.mcsv').
        output_path_tmp = f'{path_parts[0]}.csv'
        build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
        df_query = pd.read_csv(output_path_tmp)
    elif extension in ('csv', 'CSV'):
        df_query = pd.read_csv(query_data)
    else:
        print(f'\nError: Unsupported query_data extension: {extension}')
        sys.exit()
    unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()

    if isinstance(reference_data, str):
        df_reference = get_reference_df(reference_data=reference_data)
        unique_reference_ids = df_reference.iloc[:, 0].unique()
    else:
        # NOTE(review): IDs that occur in more than one reference file are not de-duplicated here - confirm
        # whether reference libraries are expected to be disjoint.
        dfs = []
        unique_reference_ids = []
        for f in reference_data:
            tmp = get_reference_df(reference_data=f)
            dfs.append(tmp)
            unique_reference_ids.extend(tmp.iloc[:, 0].unique())
        df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    # Only resolve (and warn about) the default output file when something will actually be written.
    if output_path is None and not return_output:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # Order matters: each combination is unpacked positionally into _eval_one_NRMS, and the output columns
    # below follow the same order.
    axis_names = (
        'similarity_measure', 'weight', 'spectrum_preprocessing_order', 'mz_min', 'mz_max', 'int_min', 'int_max',
        'noise_threshold', 'wf_mz', 'wf_int', 'LET_threshold',
        'entropy_dimension', 'high_quality_reference_library',
    )
    axes = [local_grid[name] for name in axis_names]

    total = 1
    for axis in axes:
        total *= len(axis)

    results = []
    for done, params in enumerate(product(*axes), start=1):
        results.append(_eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params))
        # Flushed so a front end reading stdout sees progress as it happens.
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC', 'SIMILARITY.MEASURE', 'WEIGHT', 'SPECTRUM.PROCESSING.ORDER', 'MZ.MIN', 'MZ.MAX',
        'INT.MIN', 'INT.MAX', 'NOISE.THRESHOLD', 'WF.MZ', 'WF.INT', 'LET.THRESHOLD', 'ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Reduce the weight dict's text form to its bare numbers; double quotes must go because the file is
    # written with csv.QUOTE_NONE.
    weight_text = df_out['WEIGHT'].astype(str)
    for token in ('"', '{', '}', ':', 'Cosine', 'Shannon', 'Renyi', 'Tsallis', ' '):
        weight_text = weight_text.str.replace(token, '', regex=False)
    df_out['WEIGHT'] = weight_text

    if return_output:
        return df_out

    df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
    print(f'Wrote results to {output_path}')
440
+
441
+
442
+
231
443
 
232
444
  def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
233
- # returns accuracy for a given set of parameters
234
445
 
235
446
  n_top_matches_to_save = 1
236
447
 
237
- # compute the similarity score between each query library spectrum/spectra and all reference library spectra
238
448
  all_similarity_scores = []
239
449
  for query_idx in range(0,len(unique_query_ids)):
240
450
  print(f'query spectrum #{query_idx} is being identified')
241
451
  q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
242
452
  q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
243
453
 
244
- # compute the similarity score between the given query spectrum and all spectra in the reference library
245
454
  similarity_scores = []
246
455
  for ref_idx in range(0,len(unique_reference_ids)):
247
456
  q_spec = q_spec_tmp
248
457
  r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
249
458
  r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
250
459
 
251
- # apply spectrum preprocessing transformation in the order specified by user
252
460
  is_matched = False
253
461
  for transformation in spectrum_preprocessing_order:
254
462
  if np.isinf(q_spec[:,1]).sum() > 0:
255
463
  q_spec[:,1] = np.zeros(q_spec.shape[0])
256
464
  if np.isinf(r_spec[:,1]).sum() > 0:
257
465
  r_spec[:,1] = np.zeros(r_spec.shape[0])
258
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # centroiding
466
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
259
467
  q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
260
468
  r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
261
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # matching
469
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
262
470
  m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
263
471
  q_spec = m_spec[:,0:2]
264
472
  r_spec = m_spec[:,[0,2]]
265
473
  is_matched = True
266
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # weight factor transformation
474
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
267
475
  q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
268
476
  r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
269
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # low-entropy tranformation
477
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
270
478
  q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
271
479
  r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
272
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # noise removal
480
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
273
481
  q_spec = remove_noise(q_spec, nr = noise_threshold)
274
482
  if high_quality_reference_library == False:
275
483
  r_spec = remove_noise(r_spec, nr = noise_threshold)
276
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # filter with respect to mz and/or intensity
484
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
277
485
  q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
278
486
  if high_quality_reference_library == False:
279
487
  r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
280
488
 
281
- # query and reference spectrum intensities
282
489
  q_ints = q_spec[:,1]
283
490
  r_ints = r_spec[:,1]
284
491
  if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
@@ -289,12 +496,10 @@ def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
289
496
  similarity_scores.append(similarity_score)
290
497
  all_similarity_scores.append(similarity_scores)
291
498
 
292
- # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
293
499
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
294
500
  df_scores.index = unique_query_ids
295
501
  df_scores.index.names = ['Query Spectrum ID']
296
502
 
297
- # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
298
503
  preds = []
299
504
  scores = []
300
505
  for i in range(0, df_scores.shape[0]):
@@ -325,7 +530,6 @@ def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
325
530
 
326
531
 
327
532
  def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
328
- # returns accuracy for a given set of parameters
329
533
 
330
534
  n_top_matches_to_save = 1
331
535
 
@@ -348,32 +552,29 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
348
552
  r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
349
553
  r_spec = convert_spec(r_spec_tmp,mzs)
350
554
 
351
- # apply spectrum preprocessing transformation in the order specified by user
352
555
  for transformation in spectrum_preprocessing_order:
353
556
  if np.isinf(q_spec[:,1]).sum() > 0:
354
557
  q_spec[:,1] = np.zeros(q_spec.shape[0])
355
558
  if np.isinf(r_spec[:,1]).sum() > 0:
356
559
  r_spec[:,1] = np.zeros(r_spec.shape[0])
357
- if transformation == 'W': # weight factor transformation
560
+ if transformation == 'W':
358
561
  q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
359
562
  r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
360
- if transformation == 'L': # low-entropy transformation
563
+ if transformation == 'L':
361
564
  q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
362
565
  r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
363
- if transformation == 'N': # noise removal
566
+ if transformation == 'N':
364
567
  q_spec = remove_noise(q_spec, nr = noise_threshold)
365
568
  if high_quality_reference_library == False:
366
569
  r_spec = remove_noise(r_spec, nr = noise_threshold)
367
- if transformation == 'F': # filter with respect to mz and/or intensity
570
+ if transformation == 'F':
368
571
  q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
369
572
  if high_quality_reference_library == False:
370
573
  r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
371
574
 
372
- # query and reference spectrum intensities
373
575
  q_ints = q_spec[:,1]
374
576
  r_ints = r_spec[:,1]
375
577
 
376
- # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
377
578
  if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
378
579
  similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
379
580
  else:
@@ -382,12 +583,10 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
382
583
  similarity_scores.append(similarity_score)
383
584
  all_similarity_scores.append(similarity_scores)
384
585
 
385
- # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
386
586
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
387
587
  df_scores.index = unique_query_ids
388
588
  df_scores.index.names = ['Query Spectrum ID']
389
589
 
390
- # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
391
590
  preds = []
392
591
  scores = []
393
592
  for i in range(0, df_scores.shape[0]):
@@ -399,7 +598,6 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
399
598
  cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
400
599
  df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
401
600
 
402
- #preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
403
601
  preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
404
602
  if len(top_ref_specs_tmp.values) == 0:
405
603
  scores_tmp.append(0)
@@ -441,11 +639,10 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
441
639
  --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
442
640
  --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
443
641
  --print_id_results: Flag that prints identification results if True. Default: False
444
- --output_identification: Output CSV file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
445
- --output_similarity_scores: Output CSV file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this CSV file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
642
+ --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
643
+ --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining columns contain the similarity scores with respect to all reference library spectra. If no argument is passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores.txt\'.
446
644
  '''
447
645
 
448
- # load query and reference libraries
449
646
  if query_data is None:
450
647
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
451
648
  sys.exit()
@@ -477,7 +674,6 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
477
674
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
478
675
 
479
676
 
480
- ##### process input parameters and ensure they are in a valid format #####
481
677
  if spectrum_preprocessing_order is not None:
482
678
  spectrum_preprocessing_order = list(spectrum_preprocessing_order)
483
679
  else:
@@ -545,7 +741,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
545
741
  else:
546
742
  q = entropy_dimension
547
743
 
548
- normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
744
+ normalization_method = 'standard'
549
745
 
550
746
  if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
551
747
  print('\nError: n_top_matches_to_save should be a positive integer')
@@ -564,15 +760,12 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
564
760
  print(f'Warning: writing similarity scores to {output_similarity_scores}')
565
761
 
566
762
 
567
- ####################################### begin spectral library matching #######################################
568
- # compute the similarity score between each query library spectrum/spectra and all reference library spectra
569
763
  all_similarity_scores = []
570
764
  for query_idx in range(0,len(unique_query_ids)):
571
765
  print(f'query spectrum #{query_idx} is being identified')
572
766
  q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
573
767
  q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
574
768
 
575
- # compute the similarity score between the given query spectrum and all spectra in the reference library
576
769
  similarity_scores = []
577
770
  for ref_idx in range(0,len(unique_reference_ids)):
578
771
  #if ref_idx % 100 == 0:
@@ -581,37 +774,35 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
581
774
  r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
582
775
  r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
583
776
 
584
- # apply spectrum preprocessing transformation in the order specified by user
585
777
  is_matched = False
586
778
  for transformation in spectrum_preprocessing_order:
587
779
  if np.isinf(q_spec[:,1]).sum() > 0:
588
780
  q_spec[:,1] = np.zeros(q_spec.shape[0])
589
781
  if np.isinf(r_spec[:,1]).sum() > 0:
590
782
  r_spec[:,1] = np.zeros(r_spec.shape[0])
591
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # centroiding
783
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
592
784
  q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
593
785
  r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
594
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # matching
786
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
595
787
  m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
596
788
  q_spec = m_spec[:,0:2]
597
789
  r_spec = m_spec[:,[0,2]]
598
790
  is_matched = True
599
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # weight factor transformation
791
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
600
792
  q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
601
793
  r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
602
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # low-entropy tranformation
794
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
603
795
  q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
604
796
  r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
605
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # noise removal
797
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
606
798
  q_spec = remove_noise(q_spec, nr = noise_threshold)
607
799
  if high_quality_reference_library == False:
608
800
  r_spec = remove_noise(r_spec, nr = noise_threshold)
609
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # filter with respect to mz and/or intensity
801
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
610
802
  q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
611
803
  if high_quality_reference_library == False:
612
804
  r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
613
805
 
614
- # query and reference spectrum intensities
615
806
  q_ints = q_spec[:,1]
616
807
  r_ints = r_spec[:,1]
617
808
 
@@ -623,12 +814,10 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
623
814
  similarity_scores.append(similarity_score)
624
815
  all_similarity_scores.append(similarity_scores)
625
816
 
626
- # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
627
817
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
628
818
  df_scores.index = unique_query_ids
629
819
  df_scores.index.names = ['Query Spectrum ID']
630
820
 
631
- # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
632
821
  preds = []
633
822
  scores = []
634
823
  for i in range(0, df_scores.shape[0]):
@@ -652,29 +841,23 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
652
841
  scores = np.array(scores)
653
842
  out = np.c_[preds,scores]
654
843
 
655
- # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
656
844
  cnames_preds = []
657
845
  cnames_scores = []
658
846
  for i in range(0,n_top_matches_to_save):
659
847
  cnames_preds.append(f'RANK.{i+1}.PRED')
660
848
  cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
661
849
 
662
- # get pandas dataframe with identifcation results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
663
850
  df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
664
851
  df_top_ref_specs.index = unique_query_ids
665
852
  df_top_ref_specs.index.names = ['Query Spectrum ID']
666
853
 
667
854
  df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
668
855
 
669
- # print the identification results if the user desires
670
856
  if print_id_results == True:
671
857
  print(df_top_ref_specs.to_string())
672
858
 
673
859
  if return_ID_output is False:
674
- # write spectral library matching results to disk
675
860
  df_top_ref_specs.to_csv(output_identification, sep='\t')
676
-
677
- # write all similarity scores to disk
678
861
  df_scores.to_csv(output_similarity_scores, sep='\t')
679
862
  else:
680
863
  return df_top_ref_specs
@@ -706,11 +889,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
706
889
  --normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since entropy quantifies the uncertainty of a probability distribution, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
707
890
  --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
708
891
  --print_id_results: Flag that prints identification results if True. Default: False
709
- --output_identification: Output CSV file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
710
- --output_similarity_scores: Output CSV file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this CSV file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
892
+ --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
893
+ --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining columns contain the similarity scores with respect to all reference library spectra. If no argument is passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores.txt\'.
711
894
  '''
712
895
 
713
- # load query and reference libraries
714
896
  if query_data is None:
715
897
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
716
898
  sys.exit()
@@ -742,7 +924,6 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
742
924
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
743
925
 
744
926
 
745
- ##### process input parameters and ensure they are in a valid format #####
746
927
  if spectrum_preprocessing_order is not None:
747
928
  spectrum_preprocessing_order = list(spectrum_preprocessing_order)
748
929
  else:
@@ -795,7 +976,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
795
976
  else:
796
977
  q = entropy_dimension
797
978
 
798
- normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
979
+ normalization_method = 'standard'
799
980
 
800
981
  if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
801
982
  print('\nError: n_top_matches_to_save should be a positive integer')
@@ -815,14 +996,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
815
996
 
816
997
 
817
998
 
818
- ####################################### begin spectral library matching #######################################
819
- # get the range of m/z values
820
999
  min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
821
1000
  max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
822
1001
  mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
823
1002
 
824
- # compute the similarity score between each query library spectrum/spectra and all reference library spectra
825
- # for each query spectrum, compute its similarity with all reference spectra
826
1003
  all_similarity_scores = []
827
1004
  for query_idx in range(0,len(unique_query_ids)):
828
1005
  q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
@@ -838,32 +1015,29 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
838
1015
  r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
839
1016
  r_spec = convert_spec(r_spec_tmp,mzs)
840
1017
 
841
- # apply spectrum preprocessing transformation in the order specified by user
842
1018
  for transformation in spectrum_preprocessing_order:
843
1019
  if np.isinf(q_spec[:,1]).sum() > 0:
844
1020
  q_spec[:,1] = np.zeros(q_spec.shape[0])
845
1021
  if np.isinf(r_spec[:,1]).sum() > 0:
846
1022
  r_spec[:,1] = np.zeros(r_spec.shape[0])
847
- if transformation == 'W': # weight factor transformation
1023
+ if transformation == 'W':
848
1024
  q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
849
1025
  r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
850
- if transformation == 'L': # low-entropy transformation
1026
+ if transformation == 'L':
851
1027
  q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
852
1028
  r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
853
- if transformation == 'N': # noise removal
1029
+ if transformation == 'N':
854
1030
  q_spec = remove_noise(q_spec, nr = noise_threshold)
855
1031
  if high_quality_reference_library == False:
856
1032
  r_spec = remove_noise(r_spec, nr = noise_threshold)
857
- if transformation == 'F': # filter with respect to mz and/or intensity
1033
+ if transformation == 'F':
858
1034
  q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
859
1035
  if high_quality_reference_library == False:
860
1036
  r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
861
1037
 
862
- # query and reference spectrum intensities
863
1038
  q_ints = q_spec[:,1]
864
1039
  r_ints = r_spec[:,1]
865
1040
 
866
- # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
867
1041
  if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
868
1042
  similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
869
1043
  else:
@@ -872,12 +1046,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
872
1046
  similarity_scores.append(similarity_score)
873
1047
  all_similarity_scores.append(similarity_scores)
874
1048
 
875
- # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
876
1049
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
877
1050
  df_scores.index = unique_query_ids
878
1051
  df_scores.index.names = ['Query Spectrum ID']
879
1052
 
880
- # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
881
1053
  preds = []
882
1054
  scores = []
883
1055
  for i in range(0, df_scores.shape[0]):
@@ -889,7 +1061,6 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
889
1061
  cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
890
1062
  df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
891
1063
 
892
- #preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
893
1064
  preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
894
1065
  if len(top_ref_specs_tmp.values) == 0:
895
1066
  scores_tmp.append(0)
@@ -902,29 +1073,23 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
902
1073
  scores = np.array(scores)
903
1074
  out = np.c_[preds,scores]
904
1075
 
905
- # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
906
1076
  cnames_preds = []
907
1077
  cnames_scores = []
908
1078
  for i in range(0,n_top_matches_to_save):
909
1079
  cnames_preds.append(f'RANK.{i+1}.PRED')
910
1080
  cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
911
1081
 
912
- # get pandas dataframe with identifcation results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
913
1082
  df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
914
1083
  df_top_ref_specs.index = unique_query_ids
915
1084
  df_top_ref_specs.index.names = ['Query Spectrum ID']
916
1085
 
917
- # print the identification results if the user desires
918
1086
  if print_id_results == True:
919
1087
  print(df_top_ref_specs.to_string())
920
1088
 
921
1089
  df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
922
1090
 
923
1091
  if return_ID_output is False:
924
- # write spectral library matching results to disk
925
1092
  df_top_ref_specs.to_csv(output_identification, sep='\t')
926
-
927
- # write all similarity scores to disk
928
1093
  df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
929
1094
  df_scores.to_csv(output_similarity_scores, sep='\t')
930
1095
  else: