pycompound 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,8 +22,9 @@ def _vector_to_full_params(X, default_params, optimize_params):
22
22
  def objective_function_HRMS(X, ctx):
23
23
  p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
24
24
  acc = get_acc_HRMS(
25
- ctx["df_query"], ctx["df_reference"],
26
- ctx["unique_query_ids"], ctx["unique_reference_ids"],
25
+ ctx["df_query"],
26
+ ctx["df_reference"],
27
+ ctx["precursor_ion_mz_tolerance"], ctx["ionization_mode"], ctx["adduct"],
27
28
  ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
28
29
  ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
29
30
  p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
@@ -35,11 +36,11 @@ def objective_function_HRMS(X, ctx):
35
36
  print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
36
37
  return 1.0 - acc
37
38
 
39
+
38
40
  def objective_function_NRMS(X, ctx):
39
41
  p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
40
42
  acc = get_acc_NRMS(
41
- ctx["df_query"], ctx["df_reference"],
42
- ctx["unique_query_ids"], ctx["unique_reference_ids"],
43
+ ctx["df_query"], ctx["df_reference"], ctx['unique_query_ids'], ctx['unique_reference_ids'],
43
44
  ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
44
45
  ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
45
46
  p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
@@ -51,16 +52,8 @@ def objective_function_NRMS(X, ctx):
51
52
 
52
53
 
53
54
 
54
- def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1, de_updating='immediate', log_hook=None):
55
-
56
- def _log(msg):
57
- if log_hook:
58
- try: log_hook(msg if msg.endswith("\n") else msg + "\n")
59
- except: pass
60
55
 
61
- def callback(xk, conv):
62
- _log(f"iter callback: conv={conv:.4g}, x={xk}")
63
- return False
56
+ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1):
64
57
 
65
58
  if query_data is None:
66
59
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
@@ -68,21 +61,19 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
68
61
  else:
69
62
  extension = query_data.rsplit('.',1)
70
63
  extension = extension[(len(extension)-1)]
71
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
72
- output_path_tmp = query_data[:-3] + 'csv'
64
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
65
+ output_path_tmp = query_data[:-3] + 'txt'
73
66
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
74
- df_query = pd.read_csv(output_path_tmp)
75
- if extension == 'csv' or extension == 'CSV':
76
- df_query = pd.read_csv(query_data)
77
- unique_query_ids = df_query.iloc[:,0].unique()
67
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
68
+ if extension == 'txt' or extension == 'TXT':
69
+ df_query = pd.read_csv(query_data, sep='\t')
78
70
 
79
71
  if reference_data is None:
80
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
72
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
81
73
  sys.exit()
82
74
  else:
83
75
  if isinstance(reference_data,str):
84
76
  df_reference = get_reference_df(reference_data=reference_data)
85
- unique_reference_ids = df_reference.iloc[:,0].unique()
86
77
  else:
87
78
  dfs = []
88
79
  unique_reference_ids = []
@@ -92,6 +83,11 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
92
83
  unique_reference_ids.extend(tmp.iloc[:,0].unique())
93
84
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
94
85
 
86
+ if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
87
+ df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
88
+ if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
89
+ df_reference = df_reference.loc[df_reference['adduct']==adduct]
90
+
95
91
  unique_query_ids = df_query['id'].unique().tolist()
96
92
  unique_reference_ids = df_reference['id'].unique().tolist()
97
93
 
@@ -100,6 +96,9 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
100
96
  df_reference=df_reference,
101
97
  unique_query_ids=unique_query_ids,
102
98
  unique_reference_ids=unique_reference_ids,
99
+ precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
100
+ ionization_mode=ionization_mode,
101
+ adduct=adduct,
103
102
  similarity_measure=similarity_measure,
104
103
  weights=weights,
105
104
  spectrum_preprocessing_order=spectrum_preprocessing_order,
@@ -111,13 +110,10 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
111
110
 
112
111
  bounds = [param_bounds[p] for p in optimize_params]
113
112
 
114
- print('here!!!!!!!!!!!!!!!')
115
- print(de_workers)
116
- print('here!!!!!!!!!!!!!!!')
117
113
  if chromatography_platform == 'HRMS':
118
- result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
114
+ result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
119
115
  else:
120
- result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
116
+ result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
121
117
 
122
118
  best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
123
119
  best_acc = 100.0 - (result.fun * 100.0)
@@ -131,14 +127,17 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
131
127
  for k, v in best_full_params.items():
132
128
  print(f" {k}: {v}")
133
129
  print(f"\nBest accuracy: {best_acc:.3f}%")
134
- _log(f"best = {result.x}, acc={100*(1-result.fun):.3f}%")
130
+
131
+
132
+
135
133
 
136
134
 
137
135
  default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
138
136
  default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
139
137
 
140
138
 
141
- def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
139
+ def _eval_one_HRMS(df_query, df_reference,
140
+ precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
142
141
  similarity_measure_tmp, weight,
143
142
  spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
144
143
  int_min_tmp, int_max_tmp, noise_threshold_tmp,
@@ -148,7 +147,8 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
148
147
 
149
148
  acc = get_acc_HRMS(
150
149
  df_query=df_query, df_reference=df_reference,
151
- unique_query_ids=unique_query_ids, unique_reference_ids=unique_reference_ids,
150
+ precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
151
+ ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
152
152
  similarity_measure=similarity_measure_tmp, weights=weight,
153
153
  spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
154
154
  mz_min=mz_min_tmp, mz_max=mz_max_tmp,
@@ -160,7 +160,7 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
160
160
  LET_threshold=LET_threshold_tmp,
161
161
  entropy_dimension=entropy_dimension_tmp,
162
162
  high_quality_reference_library=high_quality_reference_library_tmp,
163
- verbose=True
163
+ verbose=False
164
164
  )
165
165
 
166
166
  return (
@@ -191,6 +191,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
191
191
  LET_threshold=LET_threshold_tmp,
192
192
  entropy_dimension=entropy_dimension_tmp,
193
193
  high_quality_reference_library=high_quality_reference_library_tmp,
194
+ verbose=False
194
195
  )
195
196
 
196
197
  return (
@@ -201,16 +202,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
201
202
 
202
203
 
203
204
 
204
- def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
205
- """
206
- runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
207
-
208
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
209
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
210
- --grid: dict with all possible parameter values to try.
211
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
212
- """
213
-
205
+ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
214
206
  grid = {**default_HRMS_grid, **(grid or {})}
215
207
  for key, value in grid.items():
216
208
  globals()[key] = value
@@ -221,31 +213,37 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
221
213
  else:
222
214
  extension = query_data.rsplit('.',1)
223
215
  extension = extension[(len(extension)-1)]
224
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
225
- output_path_tmp = query_data[:-3] + 'csv'
216
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
217
+ output_path_tmp = query_data[:-3] + 'txt'
226
218
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
227
- df_query = pd.read_csv(output_path_tmp)
228
- if extension == 'csv' or extension == 'CSV':
229
- df_query = pd.read_csv(query_data)
230
- unique_query_ids = df_query.iloc[:,0].unique()
219
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
220
+ if extension == 'txt' or extension == 'TXT':
221
+ df_query = pd.read_csv(query_data, sep='\t')
222
+ unique_query_ids = df_query['id'].unique()
231
223
 
232
224
  if reference_data is None:
233
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
225
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
234
226
  sys.exit()
235
227
  else:
236
228
  if isinstance(reference_data,str):
237
229
  df_reference = get_reference_df(reference_data=reference_data)
238
- unique_reference_ids = df_reference.iloc[:,0].unique()
230
+ unique_reference_ids = df_reference['id'].unique()
239
231
  else:
240
232
  dfs = []
241
233
  unique_reference_ids = []
242
234
  for f in reference_data:
243
235
  tmp = get_reference_df(reference_data=f)
244
236
  dfs.append(tmp)
245
- unique_reference_ids.extend(tmp.iloc[:,0].unique())
237
+ unique_reference_ids.extend(tmp['id'].unique())
246
238
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
247
239
 
248
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
240
+ if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
241
+ df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode].copy()
242
+ if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
243
+ df_reference = df_reference.loc[df_reference['adduct']==adduct].copy()
244
+ unique_reference_ids_tmp2 = df_reference['id'].unique()
245
+
246
+ print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids_tmp2))} of the query and reference spectra IDs are in common.\n')
249
247
 
250
248
  if output_path is None:
251
249
  output_path = f'{Path.cwd()}/tuning_param_output.txt'
@@ -253,7 +251,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
253
251
 
254
252
  param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold,
255
253
  window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
256
- results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
254
+ results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params) for params in param_grid)
257
255
 
258
256
  df_out = pd.DataFrame(results, columns=[
259
257
  'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX','NOISE.THRESHOLD',
@@ -277,124 +275,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
277
275
 
278
276
 
279
277
 
280
- def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
281
- """
282
- runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible
283
- combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
284
- and prints top-performing parameters
285
-
286
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
287
- should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
288
- other columns should correspond to a single mass/charge ratio. Mandatory argument.
289
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
290
- to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
291
- compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
292
- --grid: dict with all possible parameter values to try.
293
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
294
- """
295
-
296
- local_grid = {**default_HRMS_grid, **(grid or {})}
297
- for key, value in local_grid.items():
298
- globals()[key] = value
299
-
300
- if query_data is None:
301
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
302
- sys.exit()
303
- else:
304
- extension = query_data.rsplit('.', 1)[-1]
305
- if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
306
- output_path_tmp = query_data[:-3] + 'csv'
307
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
308
- df_query = pd.read_csv(output_path_tmp)
309
- elif extension in ('csv','CSV'):
310
- df_query = pd.read_csv(query_data)
311
- else:
312
- print(f'\nError: Unsupported query_data extension: {extension}')
313
- sys.exit()
314
- unique_query_ids = df_query.iloc[:, 0].unique()
315
-
316
- if reference_data is None:
317
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
318
- sys.exit()
319
- else:
320
- if isinstance(reference_data, str):
321
- df_reference = get_reference_df(reference_data=reference_data)
322
- unique_reference_ids = df_reference.iloc[:, 0].unique()
323
- else:
324
- dfs = []
325
- unique_reference_ids = []
326
- for f in reference_data:
327
- tmp = get_reference_df(reference_data=f)
328
- dfs.append(tmp)
329
- unique_reference_ids.extend(tmp.iloc[:, 0].unique())
330
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
331
-
332
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
333
- f'{len(unique_reference_ids)} unique reference spectra, and '
334
- f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
335
-
336
- if output_path is None:
337
- output_path = f'{Path.cwd()}/tuning_param_output.txt'
338
- print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
339
-
340
- param_grid = product(
341
- similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
342
- noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
343
- entropy_dimension, high_quality_reference_library
344
- )
345
-
346
- results = []
347
- total = (
348
- len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
349
- len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
350
- len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
351
- len(entropy_dimension) * len(high_quality_reference_library)
352
- )
353
- done = 0
354
-
355
- for params in param_grid:
356
- res = _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
357
- results.append(res)
358
- done += 1
359
- print(f'Completed {done}/{total} grid combinations.\n', flush=True)
360
-
361
- df_out = pd.DataFrame(results, columns=[
362
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
363
- 'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
364
- 'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
365
- ])
366
-
367
- if 'WEIGHT' in df_out.columns:
368
- df_out['WEIGHT'] = (
369
- df_out['WEIGHT'].astype(str)
370
- .str.replace("\"","",regex=False)
371
- .str.replace("{","",regex=False)
372
- .str.replace("}","",regex=False)
373
- .str.replace(":","",regex=False)
374
- .str.replace("Cosine","",regex=False)
375
- .str.replace("Shannon","",regex=False)
376
- .str.replace("Renyi","",regex=False)
377
- .str.replace("Tsallis","",regex=False)
378
- .str.replace(" ","",regex=False)
379
- )
380
-
381
- if return_output:
382
- return df_out
383
- else:
384
- df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
385
- print(f'Wrote results to {output_path}')
386
-
387
-
388
278
  def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
389
- """
390
- runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
391
-
392
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
393
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
394
- --grid: dict with all possible parameter values to try
395
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here
396
- """
397
-
398
279
  grid = {**default_NRMS_grid, **(grid or {})}
399
280
  for key, value in grid.items():
400
281
  globals()[key] = value
@@ -405,13 +286,13 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
405
286
  else:
406
287
  extension = query_data.rsplit('.',1)
407
288
  extension = extension[(len(extension)-1)]
408
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
409
- output_path_tmp = query_data[:-3] + 'csv'
289
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
290
+ output_path_tmp = query_data[:-3] + 'txt'
410
291
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
411
- df_query = pd.read_csv(output_path_tmp)
412
- if extension == 'csv' or extension == 'CSV':
413
- df_query = pd.read_csv(query_data)
414
- unique_query_ids = df_query.iloc[:,0].unique()
292
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
293
+ if extension == 'txt' or extension == 'TXT':
294
+ df_query = pd.read_csv(query_data, sep='\t')
295
+ unique_query_ids = df_query['id'].unique()
415
296
 
416
297
  if reference_data is None:
417
298
  print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
@@ -419,7 +300,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
419
300
  else:
420
301
  if isinstance(reference_data,str):
421
302
  df_reference = get_reference_df(reference_data=reference_data)
422
- unique_reference_ids = df_reference.iloc[:,0].unique()
303
+ unique_reference_ids = df_reference['id'].unique()
423
304
  else:
424
305
  dfs = []
425
306
  unique_reference_ids = []
@@ -439,10 +320,8 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
439
320
  noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
440
321
  results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
441
322
 
442
- df_out = pd.DataFrame(results, columns=[
443
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
444
- 'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
445
- ])
323
+ df_out = pd.DataFrame(results, columns=['ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
324
+ 'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'])
446
325
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
447
326
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
448
327
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
@@ -452,6 +331,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
452
331
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
453
332
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
454
333
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
334
+
455
335
  if return_output is False:
456
336
  df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
457
337
  else:
@@ -459,225 +339,137 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
459
339
 
460
340
 
461
341
 
462
- def tune_params_on_NRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
463
- """
464
- runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible
465
- combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
466
- and prints top-performing parameters
467
-
468
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
469
- should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
470
- other columns should correspond to a single mass/charge ratio. Mandatory argument.
471
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
472
- to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
473
- compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
474
- --grid: dict with all possible parameter values to try.
475
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
476
- """
477
-
478
- local_grid = {**default_NRMS_grid, **(grid or {})}
479
- for key, value in local_grid.items():
480
- globals()[key] = value
481
-
482
- if query_data is None:
483
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
484
- sys.exit()
485
- else:
486
- extension = query_data.rsplit('.', 1)[-1]
487
- if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
488
- output_path_tmp = query_data[:-3] + 'csv'
489
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
490
- df_query = pd.read_csv(output_path_tmp)
491
- elif extension in ('csv','CSV'):
492
- df_query = pd.read_csv(query_data)
493
- else:
494
- print(f'\nError: Unsupported query_data extension: {extension}')
495
- sys.exit()
496
- unique_query_ids = df_query.iloc[:, 0].unique()
497
-
498
- if reference_data is None:
499
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
500
- sys.exit()
501
- else:
502
- if isinstance(reference_data, str):
503
- df_reference = get_reference_df(reference_data=reference_data)
504
- unique_reference_ids = df_reference.iloc[:, 0].unique()
505
- else:
506
- dfs = []
507
- unique_reference_ids = []
508
- for f in reference_data:
509
- tmp = get_reference_df(reference_data=f)
510
- dfs.append(tmp)
511
- unique_reference_ids.extend(tmp.iloc[:, 0].unique())
512
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
513
-
514
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
515
- f'{len(unique_reference_ids)} unique reference spectra, and '
516
- f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
517
-
518
- if output_path is None:
519
- output_path = f'{Path.cwd()}/tuning_param_output.txt'
520
- print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
521
-
522
- param_grid = product(
523
- similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
524
- noise_threshold, wf_mz, wf_int, LET_threshold,
525
- entropy_dimension, high_quality_reference_library
526
- )
527
-
528
- results = []
529
- total = (
530
- len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
531
- len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
532
- )
533
- done = 0
534
- for params in param_grid:
535
- res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
536
- results.append(res)
537
- done += 1
538
- print(f'Completed {done}/{total} grid combinations.\n', flush=True)
342
+ def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
539
343
 
540
- df_out = pd.DataFrame(results, columns=[
541
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
542
- 'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
543
- ])
344
+ n_top_matches_to_save = 1
345
+ unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
346
+ unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
347
+ all_similarity_rows = []
544
348
 
545
- if 'WEIGHT' in df_out.columns:
546
- df_out['WEIGHT'] = (
547
- df_out['WEIGHT'].astype(str)
548
- .str.replace("\"","",regex=False)
549
- .str.replace("{","",regex=False)
550
- .str.replace("}","",regex=False)
551
- .str.replace(":","",regex=False)
552
- .str.replace("Cosine","",regex=False)
553
- .str.replace("Shannon","",regex=False)
554
- .str.replace("Renyi","",regex=False)
555
- .str.replace("Tsallis","",regex=False)
556
- .str.replace(" ","",regex=False)
557
- )
558
-
559
- if return_output:
560
- return df_out
561
- else:
562
- df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
563
- print(f'Wrote results to {output_path}')
349
+ for query_idx, qid in enumerate(unique_query_ids):
350
+ if verbose:
351
+ print(f'query spectrum #{query_idx} is being identified')
564
352
 
353
+ q_mask = (df_query['id'] == qid)
354
+ q_idxs = np.where(q_mask)[0]
355
+ if q_idxs.size == 0:
356
+ all_similarity_rows.append([0.0]*len(unique_reference_ids))
357
+ continue
565
358
 
359
+ q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))
566
360
 
361
+ if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
362
+ precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
363
+ df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
364
+ else:
365
+ df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()
567
366
 
568
- def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
367
+ if df_reference_tmp.empty:
368
+ all_similarity_rows.append([0.0]*len(unique_reference_ids))
369
+ continue
569
370
 
570
- n_top_matches_to_save = 1
371
+ ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
571
372
 
572
- all_similarity_scores = []
573
- for query_idx in range(0,len(unique_query_ids)):
574
- if verbose is True:
575
- print(f'query spectrum #{query_idx} is being identified')
576
- q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
577
- q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
578
- #q_spec_tmp = q_spec_tmp.astype(float)
373
+ similarity_by_ref = {}
579
374
 
580
- similarity_scores = []
581
- for ref_idx in range(0,len(unique_reference_ids)):
582
- q_spec = q_spec_tmp
583
- r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
584
- r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
585
- #print(r_spec)
586
- #r_spec = r_spec.astype(float)
375
+ for ref_id, r_df in ref_groups.items():
376
+ q_spec = q_spec_base.copy()
377
+ r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
587
378
 
588
379
  is_matched = False
589
380
  for transformation in spectrum_preprocessing_order:
590
- if np.isinf(q_spec[:,1]).sum() > 0:
591
- q_spec[:,1] = np.zeros(q_spec.shape[0])
592
- if np.isinf(r_spec[:,1]).sum() > 0:
593
- r_spec[:,1] = np.zeros(r_spec.shape[0])
594
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
595
- q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
596
- r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
597
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
598
- m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
599
- q_spec = m_spec[:,0:2]
600
- r_spec = m_spec[:,[0,2]]
381
+ if np.isinf(q_spec[:, 1]).any():
382
+ q_spec[:, 1] = 0.0
383
+ if np.isinf(r_spec[:, 1]).any():
384
+ r_spec[:, 1] = 0.0
385
+
386
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
387
+ q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
388
+ r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
389
+
390
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
391
+ m_spec = match_peaks_in_spectra(
392
+ spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
393
+ )
394
+ if m_spec.size == 0:
395
+ q_spec = np.empty((0,2))
396
+ r_spec = np.empty((0,2))
397
+ else:
398
+ q_spec = m_spec[:, 0:2]
399
+ r_spec = m_spec[:, [0, 2]]
601
400
  is_matched = True
602
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
603
- q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
604
- r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
605
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
606
- q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
607
- r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
608
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
609
- q_spec = remove_noise(q_spec, nr = noise_threshold)
610
- if high_quality_reference_library == False:
611
- r_spec = remove_noise(r_spec, nr = noise_threshold)
612
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
613
- q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
614
- if high_quality_reference_library == False:
615
- r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
616
401
 
617
- q_ints = q_spec[:,1]
618
- r_ints = r_spec[:,1]
619
- if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
620
- similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
402
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
403
+ q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
404
+ r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)
405
+
406
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
407
+ q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
408
+ r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')
409
+
410
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
411
+ q_spec = remove_noise(q_spec, nr=noise_threshold)
412
+ if not high_quality_reference_library:
413
+ r_spec = remove_noise(r_spec, nr=noise_threshold)
414
+
415
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
416
+ q_spec = filter_spec_lcms(
417
+ q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
418
+ )
419
+ if not high_quality_reference_library:
420
+ r_spec = filter_spec_lcms(
421
+ r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
422
+ )
423
+
424
+ if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
425
+ q_ints = q_spec[:, 1]
426
+ r_ints = r_spec[:, 1]
427
+ if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
428
+ sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
429
+ else:
430
+ sim = 0.0
621
431
  else:
622
- similarity_score = 0
432
+ sim = 0.0
623
433
 
624
- similarity_scores.append(similarity_score)
625
- all_similarity_scores.append(similarity_scores)
434
+ similarity_by_ref[str(ref_id)] = float(sim)
626
435
 
627
- df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
628
- df_scores.index = unique_query_ids
629
- df_scores.index.names = ['Query Spectrum ID']
436
+ row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
437
+ all_similarity_rows.append(row)
630
438
 
631
- preds = []
632
- scores = []
633
- for i in range(0, df_scores.shape[0]):
634
- df_scores_tmp = df_scores
635
- preds_tmp = []
636
- scores_tmp = []
637
- for j in range(0, n_top_matches_to_save):
638
- top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
639
- cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
640
- df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
641
-
642
- preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
643
- if len(top_ref_specs_tmp.values) == 0:
644
- scores_tmp.append(0)
645
- else:
646
- scores_tmp.append(top_ref_specs_tmp.values[0])
647
- preds.append(preds_tmp)
648
- scores.append(scores_tmp)
439
+ df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
440
+ df_scores.index.name = 'QUERY.SPECTRUM.ID'
649
441
 
650
- preds = np.array(preds)
651
- scores = np.array(scores)
652
- out = np.c_[unique_query_ids,preds,scores]
653
- df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
654
- acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
442
+ top_idx = df_scores.values.argmax(axis=1)
443
+ top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
444
+ top_ids = [df_scores.columns[i] for i in top_idx]
445
+ df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
446
+ #if verbose:
447
+ # print(df_tmp)
448
+ acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
655
449
  return acc
656
450
 
657
451
 
658
-
659
-
660
452
  def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
661
453
 
662
454
  n_top_matches_to_save = 1
663
455
 
664
- min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
665
- max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
456
+ min_mz = int(np.min([np.min(df_query['mz_ratio']), np.min(df_reference['mz_ratio'])]))
457
+ max_mz = int(np.max([np.max(df_query['mz_ratio']), np.max(df_reference['mz_ratio'])]))
666
458
  mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
667
459
 
668
460
  all_similarity_scores = []
669
461
  for query_idx in range(0,len(unique_query_ids)):
670
- q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
671
- q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
462
+ q_idxs_tmp = np.where(df_query['id'] == unique_query_ids[query_idx])[0]
463
+ q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
672
464
  q_spec_tmp = convert_spec(q_spec_tmp,mzs)
673
465
 
674
466
  similarity_scores = []
675
467
  for ref_idx in range(0,len(unique_reference_ids)):
676
468
  q_spec = q_spec_tmp
677
- if verbose is True and ref_idx % 1000 == 0:
678
- print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
679
- r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
680
- r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
469
+ #if verbose is True and ref_idx % 1000 == 0:
470
+ # print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
471
+ r_idxs_tmp = np.where(df_reference['id'] == unique_reference_ids[ref_idx])[0]
472
+ r_spec_tmp = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
681
473
  r_spec = convert_spec(r_spec_tmp,mzs)
682
474
 
683
475
  for transformation in spectrum_preprocessing_order:
@@ -713,7 +505,7 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
713
505
 
714
506
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
715
507
  df_scores.index = unique_query_ids
716
- df_scores.index.names = ['Query Spectrum ID']
508
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
717
509
 
718
510
  preds = []
719
511
  scores = []
@@ -738,69 +530,45 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
738
530
  scores = np.array(scores)
739
531
  out = np.c_[unique_query_ids,preds,scores]
740
532
  df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
533
+ #if verbose:
534
+ # print(df_tmp)
741
535
  acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
742
536
  return acc
743
537
 
744
538
 
745
539
 
746
- def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
747
- '''
748
- runs spectral library matching on high-resolution mass spectrometry (HRMS) data
749
-
750
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
751
- --reference_data: either string or list of strings with pass to mgf, mzML, sdf, and/or csv file(s) of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
752
- --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
753
- --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
754
- --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
755
- --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of HRMS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL')
756
- --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
757
- --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
758
- --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
759
- --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
760
- --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
761
- --window_size_centroiding: Window size parameter used in centroiding a given spectrum. Default: 0.5
762
- --window_size_matching: Window size parameter used in matching a query spectrum and a reference library spectrum. Default: 0.5
763
- --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
764
- --wf_mz: Mass/charge weight factor parameter. Default: 0.0
765
- --wf_intensity: Intensity weight factor parameter. Default: 0.0
766
- --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
767
- --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
768
- --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
769
- --print_id_results: Flag that prints identification results if True. Default: False
770
- --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
771
- --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
772
- '''
773
-
540
+ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
774
541
  if query_data is None:
775
542
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
776
543
  sys.exit()
777
544
  else:
778
545
  extension = query_data.rsplit('.',1)
779
546
  extension = extension[(len(extension)-1)]
780
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
781
- output_path_tmp = query_data[:-3] + 'csv'
547
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'json' or extension == 'JSON' or extension == 'msp' or extension == 'MSP':
548
+ output_path_tmp = query_data[:-3] + 'txt'
782
549
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
783
- df_query = pd.read_csv(output_path_tmp)
784
- if extension == 'csv' or extension == 'CSV':
785
- df_query = pd.read_csv(query_data)
786
- unique_query_ids = df_query.iloc[:,0].unique()
550
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
551
+ if extension == 'txt' or extension == 'TXT':
552
+ df_query = pd.read_csv(query_data, sep='\t')
553
+ unique_query_ids = df_query['id'].unique()
787
554
 
788
555
  if reference_data is None:
789
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
556
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
790
557
  sys.exit()
791
558
  else:
792
559
  if isinstance(reference_data,str):
793
560
  df_reference = get_reference_df(reference_data,likely_reference_ids)
794
- unique_reference_ids = df_reference.iloc[:,0].unique()
795
561
  else:
796
562
  dfs = []
797
- unique_reference_ids = []
798
563
  for f in reference_data:
799
564
  tmp = get_reference_df(f,likely_reference_ids)
800
565
  dfs.append(tmp)
801
- unique_reference_ids.extend(tmp.iloc[:,0].unique())
802
566
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
803
567
 
568
+ if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
569
+ df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
570
+ if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
571
+ df_reference = df_reference.loc[df_reference['adduct']==adduct]
804
572
 
805
573
  if spectrum_preprocessing_order is not None:
806
574
  spectrum_preprocessing_order = list(spectrum_preprocessing_order)
@@ -888,62 +656,91 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
888
656
  print(f'Warning: writing similarity scores to {output_similarity_scores}')
889
657
 
890
658
 
891
- all_similarity_scores = []
892
- for query_idx in range(0,len(unique_query_ids)):
893
- if verbose is True:
659
+ unique_reference_ids = df_reference['id'].unique().tolist()
660
+ all_similarity_scores = []
661
+
662
+ for query_idx in range(len(unique_query_ids)):
663
+ if verbose:
894
664
  print(f'query spectrum #{query_idx} is being identified')
895
- q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
896
- q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
897
665
 
898
- similarity_scores = []
899
- for ref_idx in range(0,len(unique_reference_ids)):
900
- q_spec = q_spec_tmp
901
- r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
902
- r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
666
+ q_mask = (df_query['id'] == unique_query_ids[query_idx])
667
+ q_idxs_tmp = np.where(q_mask)[0]
668
+ q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
669
+
670
+ if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
671
+ precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
672
+ df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
673
+ else:
674
+ df_reference_tmp = df_reference.copy()
675
+
676
+ ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
677
+ unique_reference_ids_tmp = list(ref_groups.keys())
678
+
679
+ similarity_by_ref = {}
680
+ for ref_id in unique_reference_ids_tmp:
681
+ q_spec = q_spec_tmp.copy()
682
+ r_df = ref_groups[ref_id]
683
+ r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
684
+ #print('\nhere!!!!!!!!!!!!!!!')
685
+ #print(r_spec)
903
686
 
904
687
  is_matched = False
688
+
905
689
  for transformation in spectrum_preprocessing_order:
906
- if np.isinf(q_spec[:,1]).sum() > 0:
907
- q_spec[:,1] = np.zeros(q_spec.shape[0])
908
- if np.isinf(r_spec[:,1]).sum() > 0:
909
- r_spec[:,1] = np.zeros(r_spec.shape[0])
910
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
911
- q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
912
- r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
913
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
690
+ if np.isinf(q_spec[:, 1]).sum() > 0:
691
+ q_spec[:, 1] = np.zeros(q_spec.shape[0])
692
+ if np.isinf(r_spec[:, 1]).sum() > 0:
693
+ r_spec[:, 1] = np.zeros(r_spec.shape[0])
694
+
695
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
696
+ q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
697
+ r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
698
+
699
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
914
700
  m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
915
- q_spec = m_spec[:,0:2]
916
- r_spec = m_spec[:,[0,2]]
701
+ q_spec = m_spec[:, 0:2]
702
+ r_spec = m_spec[:, [0, 2]]
917
703
  is_matched = True
918
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
919
- q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
920
- r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
921
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
922
- q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
923
- r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
924
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
925
- q_spec = remove_noise(q_spec, nr = noise_threshold)
926
- if high_quality_reference_library == False:
927
- r_spec = remove_noise(r_spec, nr = noise_threshold)
928
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
929
- q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
930
- if high_quality_reference_library == False:
931
- r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
932
-
933
- q_ints = q_spec[:,1]
934
- r_ints = r_spec[:,1]
935
704
 
936
- if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
937
- similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
705
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
706
+ q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
707
+ r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)
708
+
709
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
710
+ q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
711
+ r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)
712
+
713
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
714
+ q_spec = remove_noise(q_spec, nr=noise_threshold)
715
+ if not high_quality_reference_library:
716
+ r_spec = remove_noise(r_spec, nr=noise_threshold)
717
+
718
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
719
+ q_spec = filter_spec_lcms(
720
+ q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
721
+ )
722
+ if not high_quality_reference_library:
723
+ r_spec = filter_spec_lcms(
724
+ r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
725
+ )
726
+
727
+ q_ints = q_spec[:, 1]
728
+ r_ints = r_spec[:, 1]
729
+
730
+ if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
731
+ sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
938
732
  else:
939
- similarity_score = 0
733
+ sim = 0.0
940
734
 
941
- similarity_scores.append(similarity_score)
942
- all_similarity_scores.append(similarity_scores)
735
+ similarity_by_ref[ref_id] = sim
943
736
 
944
- df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
737
+ row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
738
+ all_similarity_scores.append(row_scores)
739
+
740
+ df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
945
741
  df_scores.index = unique_query_ids
946
- df_scores.index.names = ['Query Spectrum ID']
742
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
743
+
947
744
 
948
745
  preds = []
949
746
  scores = []
@@ -976,7 +773,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
976
773
 
977
774
  df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
978
775
  df_top_ref_specs.index = unique_query_ids
979
- df_top_ref_specs.index.names = ['Query Spectrum ID']
776
+ df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
980
777
 
981
778
  df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
982
779
 
@@ -993,33 +790,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
993
790
 
994
791
 
995
792
 
996
- def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False):
997
- '''
998
- runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data
999
-
1000
- --query_data: cdf or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
1001
- --reference_data: cdf of csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
1002
- --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
1003
- --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
1004
- --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
1005
- --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-4 characters chosen from F, N, L, W representing filtering based on mass/charge and intensity values, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WN\' is passed, then each spectrum will undergo a weight factor transformation and then noise removal. Default: FNLW')
1006
- --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
1007
- --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
1008
- --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
1009
- --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
1010
- --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
1011
- --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
1012
- --wf_mz: Mass/charge weight factor parameter. Default: 0.0
1013
- --wf_intensity: Intensity weight factor parameter. Default: 0.0
1014
- --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
1015
- --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
1016
- --normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since the objects entropy quantifies the uncertainy of must be probability distributions, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
1017
- --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
1018
- --print_id_results: Flag that prints identification results if True. Default: False
1019
- --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
1020
- --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
1021
- '''
1022
-
793
+ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
1023
794
  if query_data is None:
1024
795
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
1025
796
  sys.exit()
@@ -1027,12 +798,12 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
1027
798
  extension = query_data.rsplit('.',1)
1028
799
  extension = extension[(len(extension)-1)]
1029
800
  if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
1030
- output_path_tmp = query_data[:-3] + 'csv'
801
+ output_path_tmp = query_data[:-3] + 'txt'
1031
802
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
1032
- df_query = pd.read_csv(output_path_tmp)
1033
- if extension == 'csv' or extension == 'CSV':
1034
- df_query = pd.read_csv(query_data)
1035
- unique_query_ids = df_query.iloc[:,0].unique()
803
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
804
+ if extension == 'txt' or extension == 'TXT':
805
+ df_query = pd.read_csv(query_data, sep='\t')
806
+ unique_query_ids = df_query['id'].unique()
1036
807
 
1037
808
  if reference_data is None:
1038
809
  print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
@@ -1040,14 +811,14 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
1040
811
  else:
1041
812
  if isinstance(reference_data,str):
1042
813
  df_reference = get_reference_df(reference_data,likely_reference_ids)
1043
- unique_reference_ids = df_reference.iloc[:,0].unique()
814
+ unique_reference_ids = df_reference['id'].unique()
1044
815
  else:
1045
816
  dfs = []
1046
817
  unique_reference_ids = []
1047
818
  for f in reference_data:
1048
819
  tmp = get_reference_df(f,likely_reference_ids)
1049
820
  dfs.append(tmp)
1050
- unique_reference_ids.extend(tmp.iloc[:,0].unique())
821
+ unique_reference_ids.extend(tmp['id'].unique())
1051
822
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
1052
823
 
1053
824
 
@@ -1123,23 +894,23 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
1123
894
 
1124
895
 
1125
896
 
1126
- min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
1127
- max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
897
+ min_mz = int(np.min([np.min(df_query['mz_ratio']), np.min(df_reference['mz_ratio'])]))
898
+ max_mz = int(np.max([np.max(df_query['mz_ratio']), np.max(df_reference['mz_ratio'])]))
1128
899
  mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
1129
900
 
1130
901
  all_similarity_scores = []
1131
902
  for query_idx in range(0,len(unique_query_ids)):
1132
- q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
1133
- q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
903
+ q_idxs_tmp = np.where(df_query['id'] == unique_query_ids[query_idx])[0]
904
+ q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
1134
905
  q_spec_tmp = convert_spec(q_spec_tmp,mzs)
1135
906
 
1136
907
  similarity_scores = []
1137
908
  for ref_idx in range(0,len(unique_reference_ids)):
1138
- if verbose is True and ref_idx % 1000 == 0:
1139
- print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
909
+ #if verbose is True and ref_idx % 1000 == 0:
910
+ # print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
1140
911
  q_spec = q_spec_tmp
1141
- r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
1142
- r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
912
+ r_idxs_tmp = np.where(df_reference['id'] == unique_reference_ids[ref_idx])[0]
913
+ r_spec_tmp = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
1143
914
  r_spec = convert_spec(r_spec_tmp,mzs)
1144
915
 
1145
916
  for transformation in spectrum_preprocessing_order:
@@ -1175,7 +946,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
1175
946
 
1176
947
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
1177
948
  df_scores.index = unique_query_ids
1178
- df_scores.index.names = ['Query Spectrum ID']
949
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
1179
950
 
1180
951
  preds = []
1181
952
  scores = []
@@ -1208,7 +979,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
1208
979
 
1209
980
  df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
1210
981
  df_top_ref_specs.index = unique_query_ids
1211
- df_top_ref_specs.index.names = ['Query Spectrum ID']
982
+ df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
1212
983
 
1213
984
  if print_id_results == True:
1214
985
  print(df_top_ref_specs.to_string())