pycompound 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,8 +22,10 @@ def _vector_to_full_params(X, default_params, optimize_params):
22
22
  def objective_function_HRMS(X, ctx):
23
23
  p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
24
24
  acc = get_acc_HRMS(
25
- ctx["df_query"], ctx["df_reference"],
26
- ctx["unique_query_ids"], ctx["unique_reference_ids"],
25
+ ctx["df_query"],
26
+ ctx["df_reference"],
27
+ ctx["precursor_ion_mz_tolerance"],
28
+ ctx["ionization_mode"], ctx["adduct"],
27
29
  ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
28
30
  ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
29
31
  p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
@@ -35,11 +37,11 @@ def objective_function_HRMS(X, ctx):
35
37
  print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
36
38
  return 1.0 - acc
37
39
 
40
+
38
41
  def objective_function_NRMS(X, ctx):
39
42
  p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
40
43
  acc = get_acc_NRMS(
41
- ctx["df_query"], ctx["df_reference"],
42
- ctx["unique_query_ids"], ctx["unique_reference_ids"],
44
+ ctx["df_query"], ctx["df_reference"], ctx['unique_query_ids'], ctx['unique_reference_ids'],
43
45
  ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
44
46
  ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
45
47
  p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
@@ -51,16 +53,8 @@ def objective_function_NRMS(X, ctx):
51
53
 
52
54
 
53
55
 
54
- def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1, de_updating='immediate', log_hook=None):
55
-
56
- def _log(msg):
57
- if log_hook:
58
- try: log_hook(msg if msg.endswith("\n") else msg + "\n")
59
- except: pass
60
56
 
61
- def callback(xk, conv):
62
- _log(f"iter callback: conv={conv:.4g}, x={xk}")
63
- return False
57
+ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1):
64
58
 
65
59
  if query_data is None:
66
60
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
@@ -68,21 +62,19 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
68
62
  else:
69
63
  extension = query_data.rsplit('.',1)
70
64
  extension = extension[(len(extension)-1)]
71
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
72
- output_path_tmp = query_data[:-3] + 'csv'
65
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
66
+ output_path_tmp = query_data[:-3] + 'txt'
73
67
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
74
- df_query = pd.read_csv(output_path_tmp)
75
- if extension == 'csv' or extension == 'CSV':
76
- df_query = pd.read_csv(query_data)
77
- unique_query_ids = df_query.iloc[:,0].unique()
68
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
69
+ if extension == 'txt' or extension == 'TXT':
70
+ df_query = pd.read_csv(query_data, sep='\t')
78
71
 
79
72
  if reference_data is None:
80
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
73
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
81
74
  sys.exit()
82
75
  else:
83
76
  if isinstance(reference_data,str):
84
77
  df_reference = get_reference_df(reference_data=reference_data)
85
- unique_reference_ids = df_reference.iloc[:,0].unique()
86
78
  else:
87
79
  dfs = []
88
80
  unique_reference_ids = []
@@ -92,6 +84,11 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
92
84
  unique_reference_ids.extend(tmp.iloc[:,0].unique())
93
85
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
94
86
 
87
+ if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
88
+ df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
89
+ if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
90
+ df_reference = df_reference.loc[df_reference['adduct']==adduct]
91
+
95
92
  unique_query_ids = df_query['id'].unique().tolist()
96
93
  unique_reference_ids = df_reference['id'].unique().tolist()
97
94
 
@@ -100,6 +97,9 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
100
97
  df_reference=df_reference,
101
98
  unique_query_ids=unique_query_ids,
102
99
  unique_reference_ids=unique_reference_ids,
100
+ precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
101
+ ionization_mode=ionization_mode,
102
+ adduct=adduct,
103
103
  similarity_measure=similarity_measure,
104
104
  weights=weights,
105
105
  spectrum_preprocessing_order=spectrum_preprocessing_order,
@@ -111,13 +111,10 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
111
111
 
112
112
  bounds = [param_bounds[p] for p in optimize_params]
113
113
 
114
- print('here!!!!!!!!!!!!!!!')
115
- print(de_workers)
116
- print('here!!!!!!!!!!!!!!!')
117
114
  if chromatography_platform == 'HRMS':
118
- result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
115
+ result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
119
116
  else:
120
- result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
117
+ result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
121
118
 
122
119
  best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
123
120
  best_acc = 100.0 - (result.fun * 100.0)
@@ -131,14 +128,17 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
131
128
  for k, v in best_full_params.items():
132
129
  print(f" {k}: {v}")
133
130
  print(f"\nBest accuracy: {best_acc:.3f}%")
134
- _log(f"best = {result.x}, acc={100*(1-result.fun):.3f}%")
131
+
132
+
133
+
135
134
 
136
135
 
137
136
  default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
138
137
  default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
139
138
 
140
139
 
141
- def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
140
+ def _eval_one_HRMS(df_query, df_reference,
141
+ precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
142
142
  similarity_measure_tmp, weight,
143
143
  spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
144
144
  int_min_tmp, int_max_tmp, noise_threshold_tmp,
@@ -148,7 +148,8 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
148
148
 
149
149
  acc = get_acc_HRMS(
150
150
  df_query=df_query, df_reference=df_reference,
151
- unique_query_ids=unique_query_ids, unique_reference_ids=unique_reference_ids,
151
+ precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
152
+ ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
152
153
  similarity_measure=similarity_measure_tmp, weights=weight,
153
154
  spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
154
155
  mz_min=mz_min_tmp, mz_max=mz_max_tmp,
@@ -160,7 +161,7 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
160
161
  LET_threshold=LET_threshold_tmp,
161
162
  entropy_dimension=entropy_dimension_tmp,
162
163
  high_quality_reference_library=high_quality_reference_library_tmp,
163
- verbose=True
164
+ verbose=False
164
165
  )
165
166
 
166
167
  return (
@@ -191,6 +192,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
191
192
  LET_threshold=LET_threshold_tmp,
192
193
  entropy_dimension=entropy_dimension_tmp,
193
194
  high_quality_reference_library=high_quality_reference_library_tmp,
195
+ verbose=False
194
196
  )
195
197
 
196
198
  return (
@@ -201,16 +203,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
201
203
 
202
204
 
203
205
 
204
- def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
205
- """
206
- runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
207
-
208
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
209
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
210
- --grid: dict with all possible parameter values to try.
211
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
212
- """
213
-
206
+ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
214
207
  grid = {**default_HRMS_grid, **(grid or {})}
215
208
  for key, value in grid.items():
216
209
  globals()[key] = value
@@ -221,31 +214,37 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
221
214
  else:
222
215
  extension = query_data.rsplit('.',1)
223
216
  extension = extension[(len(extension)-1)]
224
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
225
- output_path_tmp = query_data[:-3] + 'csv'
217
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
218
+ output_path_tmp = query_data[:-3] + 'txt'
226
219
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
227
- df_query = pd.read_csv(output_path_tmp)
228
- if extension == 'csv' or extension == 'CSV':
229
- df_query = pd.read_csv(query_data)
230
- unique_query_ids = df_query.iloc[:,0].unique()
220
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
221
+ if extension == 'txt' or extension == 'TXT':
222
+ df_query = pd.read_csv(query_data, sep='\t')
223
+ unique_query_ids = df_query['id'].unique()
231
224
 
232
225
  if reference_data is None:
233
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
226
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
234
227
  sys.exit()
235
228
  else:
236
229
  if isinstance(reference_data,str):
237
230
  df_reference = get_reference_df(reference_data=reference_data)
238
- unique_reference_ids = df_reference.iloc[:,0].unique()
231
+ unique_reference_ids = df_reference['id'].unique()
239
232
  else:
240
233
  dfs = []
241
234
  unique_reference_ids = []
242
235
  for f in reference_data:
243
236
  tmp = get_reference_df(reference_data=f)
244
237
  dfs.append(tmp)
245
- unique_reference_ids.extend(tmp.iloc[:,0].unique())
238
+ unique_reference_ids.extend(tmp['id'].unique())
246
239
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
247
240
 
248
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
241
+ if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
242
+ df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode].copy()
243
+ if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
244
+ df_reference = df_reference.loc[df_reference['adduct']==adduct].copy()
245
+ unique_reference_ids_tmp2 = df_reference['id'].unique()
246
+
247
+ print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids_tmp2))} of the query and reference spectra IDs are in common.\n')
249
248
 
250
249
  if output_path is None:
251
250
  output_path = f'{Path.cwd()}/tuning_param_output.txt'
@@ -253,7 +252,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
253
252
 
254
253
  param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold,
255
254
  window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
256
- results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
255
+ results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params) for params in param_grid)
257
256
 
258
257
  df_out = pd.DataFrame(results, columns=[
259
258
  'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX','NOISE.THRESHOLD',
@@ -277,124 +276,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
277
276
 
278
277
 
279
278
 
280
- def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
281
- """
282
- runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible
283
- combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
284
- and prints top-performing parameters
285
-
286
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
287
- should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
288
- other columns should correspond to a single mass/charge ratio. Mandatory argument.
289
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
290
- to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
291
- compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
292
- --grid: dict with all possible parameter values to try.
293
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
294
- """
295
-
296
- local_grid = {**default_HRMS_grid, **(grid or {})}
297
- for key, value in local_grid.items():
298
- globals()[key] = value
299
-
300
- if query_data is None:
301
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
302
- sys.exit()
303
- else:
304
- extension = query_data.rsplit('.', 1)[-1]
305
- if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
306
- output_path_tmp = query_data[:-3] + 'csv'
307
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
308
- df_query = pd.read_csv(output_path_tmp)
309
- elif extension in ('csv','CSV'):
310
- df_query = pd.read_csv(query_data)
311
- else:
312
- print(f'\nError: Unsupported query_data extension: {extension}')
313
- sys.exit()
314
- unique_query_ids = df_query.iloc[:, 0].unique()
315
-
316
- if reference_data is None:
317
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
318
- sys.exit()
319
- else:
320
- if isinstance(reference_data, str):
321
- df_reference = get_reference_df(reference_data=reference_data)
322
- unique_reference_ids = df_reference.iloc[:, 0].unique()
323
- else:
324
- dfs = []
325
- unique_reference_ids = []
326
- for f in reference_data:
327
- tmp = get_reference_df(reference_data=f)
328
- dfs.append(tmp)
329
- unique_reference_ids.extend(tmp.iloc[:, 0].unique())
330
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
331
-
332
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
333
- f'{len(unique_reference_ids)} unique reference spectra, and '
334
- f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
335
-
336
- if output_path is None:
337
- output_path = f'{Path.cwd()}/tuning_param_output.txt'
338
- print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
339
-
340
- param_grid = product(
341
- similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
342
- noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
343
- entropy_dimension, high_quality_reference_library
344
- )
345
-
346
- results = []
347
- total = (
348
- len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
349
- len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
350
- len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
351
- len(entropy_dimension) * len(high_quality_reference_library)
352
- )
353
- done = 0
354
-
355
- for params in param_grid:
356
- res = _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
357
- results.append(res)
358
- done += 1
359
- print(f'Completed {done}/{total} grid combinations.\n', flush=True)
360
-
361
- df_out = pd.DataFrame(results, columns=[
362
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
363
- 'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
364
- 'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
365
- ])
366
-
367
- if 'WEIGHT' in df_out.columns:
368
- df_out['WEIGHT'] = (
369
- df_out['WEIGHT'].astype(str)
370
- .str.replace("\"","",regex=False)
371
- .str.replace("{","",regex=False)
372
- .str.replace("}","",regex=False)
373
- .str.replace(":","",regex=False)
374
- .str.replace("Cosine","",regex=False)
375
- .str.replace("Shannon","",regex=False)
376
- .str.replace("Renyi","",regex=False)
377
- .str.replace("Tsallis","",regex=False)
378
- .str.replace(" ","",regex=False)
379
- )
380
-
381
- if return_output:
382
- return df_out
383
- else:
384
- df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
385
- print(f'Wrote results to {output_path}')
386
-
387
-
388
279
  def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
389
- """
390
- runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
391
-
392
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
393
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
394
- --grid: dict with all possible parameter values to try
395
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here
396
- """
397
-
398
280
  grid = {**default_NRMS_grid, **(grid or {})}
399
281
  for key, value in grid.items():
400
282
  globals()[key] = value
@@ -405,13 +287,13 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
405
287
  else:
406
288
  extension = query_data.rsplit('.',1)
407
289
  extension = extension[(len(extension)-1)]
408
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
409
- output_path_tmp = query_data[:-3] + 'csv'
290
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
291
+ output_path_tmp = query_data[:-3] + 'txt'
410
292
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
411
- df_query = pd.read_csv(output_path_tmp)
412
- if extension == 'csv' or extension == 'CSV':
413
- df_query = pd.read_csv(query_data)
414
- unique_query_ids = df_query.iloc[:,0].unique()
293
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
294
+ if extension == 'txt' or extension == 'TXT':
295
+ df_query = pd.read_csv(query_data, sep='\t')
296
+ unique_query_ids = df_query['id'].unique()
415
297
 
416
298
  if reference_data is None:
417
299
  print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
@@ -419,7 +301,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
419
301
  else:
420
302
  if isinstance(reference_data,str):
421
303
  df_reference = get_reference_df(reference_data=reference_data)
422
- unique_reference_ids = df_reference.iloc[:,0].unique()
304
+ unique_reference_ids = df_reference['id'].unique()
423
305
  else:
424
306
  dfs = []
425
307
  unique_reference_ids = []
@@ -439,10 +321,8 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
439
321
  noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
440
322
  results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
441
323
 
442
- df_out = pd.DataFrame(results, columns=[
443
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
444
- 'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
445
- ])
324
+ df_out = pd.DataFrame(results, columns=['ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
325
+ 'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'])
446
326
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
447
327
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
448
328
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
@@ -452,6 +332,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
452
332
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
453
333
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
454
334
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
335
+
455
336
  if return_output is False:
456
337
  df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
457
338
  else:
@@ -459,202 +340,116 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
459
340
 
460
341
 
461
342
 
462
- def tune_params_on_NRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
463
- """
464
- runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible
465
- combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
466
- and prints top-performing parameters
467
-
468
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
469
- should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
470
- other columns should correspond to a single mass/charge ratio. Mandatory argument.
471
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
472
- to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
473
- compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
474
- --grid: dict with all possible parameter values to try.
475
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
476
- """
477
-
478
- local_grid = {**default_NRMS_grid, **(grid or {})}
479
- for key, value in local_grid.items():
480
- globals()[key] = value
481
-
482
- if query_data is None:
483
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
484
- sys.exit()
485
- else:
486
- extension = query_data.rsplit('.', 1)[-1]
487
- if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
488
- output_path_tmp = query_data[:-3] + 'csv'
489
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
490
- df_query = pd.read_csv(output_path_tmp)
491
- elif extension in ('csv','CSV'):
492
- df_query = pd.read_csv(query_data)
493
- else:
494
- print(f'\nError: Unsupported query_data extension: {extension}')
495
- sys.exit()
496
- unique_query_ids = df_query.iloc[:, 0].unique()
343
+ def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
497
344
 
498
- if reference_data is None:
499
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
500
- sys.exit()
501
- else:
502
- if isinstance(reference_data, str):
503
- df_reference = get_reference_df(reference_data=reference_data)
504
- unique_reference_ids = df_reference.iloc[:, 0].unique()
505
- else:
506
- dfs = []
507
- unique_reference_ids = []
508
- for f in reference_data:
509
- tmp = get_reference_df(reference_data=f)
510
- dfs.append(tmp)
511
- unique_reference_ids.extend(tmp.iloc[:, 0].unique())
512
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
513
-
514
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
515
- f'{len(unique_reference_ids)} unique reference spectra, and '
516
- f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
517
-
518
- if output_path is None:
519
- output_path = f'{Path.cwd()}/tuning_param_output.txt'
520
- print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
521
-
522
- param_grid = product(
523
- similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
524
- noise_threshold, wf_mz, wf_int, LET_threshold,
525
- entropy_dimension, high_quality_reference_library
526
- )
527
-
528
- results = []
529
- total = (
530
- len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
531
- len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
532
- )
533
- done = 0
534
- for params in param_grid:
535
- res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
536
- results.append(res)
537
- done += 1
538
- print(f'Completed {done}/{total} grid combinations.\n', flush=True)
539
-
540
- df_out = pd.DataFrame(results, columns=[
541
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
542
- 'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
543
- ])
345
+ n_top_matches_to_save = 1
346
+ unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
347
+ unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
348
+ all_similarity_rows = []
544
349
 
545
- if 'WEIGHT' in df_out.columns:
546
- df_out['WEIGHT'] = (
547
- df_out['WEIGHT'].astype(str)
548
- .str.replace("\"","",regex=False)
549
- .str.replace("{","",regex=False)
550
- .str.replace("}","",regex=False)
551
- .str.replace(":","",regex=False)
552
- .str.replace("Cosine","",regex=False)
553
- .str.replace("Shannon","",regex=False)
554
- .str.replace("Renyi","",regex=False)
555
- .str.replace("Tsallis","",regex=False)
556
- .str.replace(" ","",regex=False)
557
- )
558
-
559
- if return_output:
560
- return df_out
561
- else:
562
- df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
563
- print(f'Wrote results to {output_path}')
350
+ for query_idx, qid in enumerate(unique_query_ids):
351
+ if verbose:
352
+ print(f'query spectrum #{query_idx} is being identified')
564
353
 
354
+ q_mask = (df_query['id'] == qid)
355
+ q_idxs = np.where(q_mask)[0]
356
+ if q_idxs.size == 0:
357
+ all_similarity_rows.append([0.0]*len(unique_reference_ids))
358
+ continue
565
359
 
360
+ q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))
566
361
 
362
+ if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
363
+ precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
364
+ df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
365
+ else:
366
+ df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()
567
367
 
568
- def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
368
+ if df_reference_tmp.empty:
369
+ all_similarity_rows.append([0.0]*len(unique_reference_ids))
370
+ continue
569
371
 
570
- n_top_matches_to_save = 1
372
+ ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
571
373
 
572
- all_similarity_scores = []
573
- for query_idx in range(0,len(unique_query_ids)):
574
- if verbose is True:
575
- print(f'query spectrum #{query_idx} is being identified')
576
- q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
577
- q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
578
- #q_spec_tmp = q_spec_tmp.astype(float)
374
+ similarity_by_ref = {}
579
375
 
580
- similarity_scores = []
581
- for ref_idx in range(0,len(unique_reference_ids)):
582
- q_spec = q_spec_tmp
583
- r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
584
- r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
585
- #print(r_spec)
586
- #r_spec = r_spec.astype(float)
376
+ for ref_id, r_df in ref_groups.items():
377
+ q_spec = q_spec_base.copy()
378
+ r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
587
379
 
588
380
  is_matched = False
589
381
  for transformation in spectrum_preprocessing_order:
590
- if np.isinf(q_spec[:,1]).sum() > 0:
591
- q_spec[:,1] = np.zeros(q_spec.shape[0])
592
- if np.isinf(r_spec[:,1]).sum() > 0:
593
- r_spec[:,1] = np.zeros(r_spec.shape[0])
594
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
595
- q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
596
- r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
597
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
598
- m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
599
- q_spec = m_spec[:,0:2]
600
- r_spec = m_spec[:,[0,2]]
382
+ if np.isinf(q_spec[:, 1]).any():
383
+ q_spec[:, 1] = 0.0
384
+ if np.isinf(r_spec[:, 1]).any():
385
+ r_spec[:, 1] = 0.0
386
+
387
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
388
+ q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
389
+ r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
390
+
391
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
392
+ m_spec = match_peaks_in_spectra(
393
+ spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
394
+ )
395
+ if m_spec.size == 0:
396
+ q_spec = np.empty((0,2))
397
+ r_spec = np.empty((0,2))
398
+ else:
399
+ q_spec = m_spec[:, 0:2]
400
+ r_spec = m_spec[:, [0, 2]]
601
401
  is_matched = True
602
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
603
- q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
604
- r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
605
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
606
- q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
607
- r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
608
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
609
- q_spec = remove_noise(q_spec, nr = noise_threshold)
610
- if high_quality_reference_library == False:
611
- r_spec = remove_noise(r_spec, nr = noise_threshold)
612
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
613
- q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
614
- if high_quality_reference_library == False:
615
- r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
616
402
 
617
- q_ints = q_spec[:,1]
618
- r_ints = r_spec[:,1]
619
- if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
620
- similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
403
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
404
+ q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
405
+ r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)
406
+
407
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
408
+ q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
409
+ r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')
410
+
411
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
412
+ q_spec = remove_noise(q_spec, nr=noise_threshold)
413
+ if not high_quality_reference_library:
414
+ r_spec = remove_noise(r_spec, nr=noise_threshold)
415
+
416
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
417
+ q_spec = filter_spec_lcms(
418
+ q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
419
+ )
420
+ if not high_quality_reference_library:
421
+ r_spec = filter_spec_lcms(
422
+ r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
423
+ )
424
+
425
+ if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
426
+ q_ints = q_spec[:, 1]
427
+ r_ints = r_spec[:, 1]
428
+ if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
429
+ sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
430
+ else:
431
+ sim = 0.0
621
432
  else:
622
- similarity_score = 0
433
+ sim = 0.0
623
434
 
624
- similarity_scores.append(similarity_score)
625
- all_similarity_scores.append(similarity_scores)
435
+ similarity_by_ref[str(ref_id)] = float(sim)
626
436
 
627
- df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
628
- df_scores.index = unique_query_ids
629
- df_scores.index.names = ['Query Spectrum ID']
437
+ row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
438
+ all_similarity_rows.append(row)
630
439
 
631
- preds = []
632
- scores = []
633
- for i in range(0, df_scores.shape[0]):
634
- df_scores_tmp = df_scores
635
- preds_tmp = []
636
- scores_tmp = []
637
- for j in range(0, n_top_matches_to_save):
638
- top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
639
- cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
640
- df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
440
+ df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
441
+ df_scores.index.name = 'QUERY.SPECTRUM.ID'
641
442
 
642
- preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
643
- if len(top_ref_specs_tmp.values) == 0:
644
- scores_tmp.append(0)
645
- else:
646
- scores_tmp.append(top_ref_specs_tmp.values[0])
647
- preds.append(preds_tmp)
648
- scores.append(scores_tmp)
649
-
650
- preds = np.array(preds)
651
- scores = np.array(scores)
652
- out = np.c_[unique_query_ids,preds,scores]
653
- df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
654
- acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
655
- return acc
443
+ top_idx = df_scores.values.argmax(axis=1)
444
+ top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
445
+ top_ids = [df_scores.columns[i] for i in top_idx]
656
446
 
447
+ df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
448
+ if verbose:
449
+ print(df_tmp)
657
450
 
451
+ acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
452
+ return acc
658
453
 
659
454
 
660
455
  def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
@@ -713,7 +508,7 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
713
508
 
714
509
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
715
510
  df_scores.index = unique_query_ids
716
- df_scores.index.names = ['Query Spectrum ID']
511
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
717
512
 
718
513
  preds = []
719
514
  scores = []
@@ -743,64 +538,40 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
743
538
 
744
539
 
745
540
 
746
- def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
747
- '''
748
- runs spectral library matching on high-resolution mass spectrometry (HRMS) data
749
-
750
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
751
- --reference_data: either string or list of strings with pass to mgf, mzML, sdf, and/or csv file(s) of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
752
- --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
753
- --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
754
- --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
755
- --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of HRMS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL')
756
- --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
757
- --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
758
- --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
759
- --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
760
- --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
761
- --window_size_centroiding: Window size parameter used in centroiding a given spectrum. Default: 0.5
762
- --window_size_matching: Window size parameter used in matching a query spectrum and a reference library spectrum. Default: 0.5
763
- --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
764
- --wf_mz: Mass/charge weight factor parameter. Default: 0.0
765
- --wf_intensity: Intensity weight factor parameter. Default: 0.0
766
- --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
767
- --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
768
- --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
769
- --print_id_results: Flag that prints identification results if True. Default: False
770
- --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
771
- --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
772
- '''
773
-
541
+ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
774
542
  if query_data is None:
775
543
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
776
544
  sys.exit()
777
545
  else:
778
546
  extension = query_data.rsplit('.',1)
779
547
  extension = extension[(len(extension)-1)]
780
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
781
- output_path_tmp = query_data[:-3] + 'csv'
548
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'json' or extension == 'JSON' or extension == 'msp' or extension == 'MSP':
549
+ output_path_tmp = query_data[:-3] + 'txt'
782
550
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
783
- df_query = pd.read_csv(output_path_tmp)
784
- if extension == 'csv' or extension == 'CSV':
785
- df_query = pd.read_csv(query_data)
786
- unique_query_ids = df_query.iloc[:,0].unique()
551
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
552
+ if extension == 'txt' or extension == 'TXT':
553
+ df_query = pd.read_csv(query_data, sep='\t')
554
+ unique_query_ids = df_query['id'].unique()
787
555
 
788
556
  if reference_data is None:
789
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
557
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
790
558
  sys.exit()
791
559
  else:
792
560
  if isinstance(reference_data,str):
793
561
  df_reference = get_reference_df(reference_data,likely_reference_ids)
794
- unique_reference_ids = df_reference.iloc[:,0].unique()
795
562
  else:
796
563
  dfs = []
797
- unique_reference_ids = []
798
564
  for f in reference_data:
799
565
  tmp = get_reference_df(f,likely_reference_ids)
800
566
  dfs.append(tmp)
801
- unique_reference_ids.extend(tmp.iloc[:,0].unique())
802
567
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
803
568
 
569
+ if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
570
+ df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
571
+ if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
572
+ df_reference = df_reference.loc[df_reference['adduct']==adduct]
573
+
574
+ print(df_reference.loc[df_reference['id']=='Hectochlorin M+H'])
804
575
 
805
576
  if spectrum_preprocessing_order is not None:
806
577
  spectrum_preprocessing_order = list(spectrum_preprocessing_order)
@@ -888,62 +659,91 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
888
659
  print(f'Warning: writing similarity scores to {output_similarity_scores}')
889
660
 
890
661
 
891
- all_similarity_scores = []
892
- for query_idx in range(0,len(unique_query_ids)):
893
- if verbose is True:
662
+ unique_reference_ids = df_reference['id'].unique().tolist()
663
+ all_similarity_scores = []
664
+
665
+ for query_idx in range(len(unique_query_ids)):
666
+ if verbose:
894
667
  print(f'query spectrum #{query_idx} is being identified')
895
- q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
896
- q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
897
668
 
898
- similarity_scores = []
899
- for ref_idx in range(0,len(unique_reference_ids)):
900
- q_spec = q_spec_tmp
901
- r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
902
- r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
669
+ q_mask = (df_query['id'] == unique_query_ids[query_idx])
670
+ q_idxs_tmp = np.where(q_mask)[0]
671
+ q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
672
+
673
+ if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
674
+ precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
675
+ df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
676
+ else:
677
+ df_reference_tmp = df_reference.copy()
678
+
679
+ ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
680
+ unique_reference_ids_tmp = list(ref_groups.keys())
681
+
682
+ similarity_by_ref = {}
683
+ for ref_id in unique_reference_ids_tmp:
684
+ q_spec = q_spec_tmp.copy()
685
+ r_df = ref_groups[ref_id]
686
+ r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
687
+ #print('\nhere!!!!!!!!!!!!!!!')
688
+ #print(r_spec)
903
689
 
904
690
  is_matched = False
691
+
905
692
  for transformation in spectrum_preprocessing_order:
906
- if np.isinf(q_spec[:,1]).sum() > 0:
907
- q_spec[:,1] = np.zeros(q_spec.shape[0])
908
- if np.isinf(r_spec[:,1]).sum() > 0:
909
- r_spec[:,1] = np.zeros(r_spec.shape[0])
910
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
911
- q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
912
- r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
913
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
693
+ if np.isinf(q_spec[:, 1]).sum() > 0:
694
+ q_spec[:, 1] = np.zeros(q_spec.shape[0])
695
+ if np.isinf(r_spec[:, 1]).sum() > 0:
696
+ r_spec[:, 1] = np.zeros(r_spec.shape[0])
697
+
698
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
699
+ q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
700
+ r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
701
+
702
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
914
703
  m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
915
- q_spec = m_spec[:,0:2]
916
- r_spec = m_spec[:,[0,2]]
704
+ q_spec = m_spec[:, 0:2]
705
+ r_spec = m_spec[:, [0, 2]]
917
706
  is_matched = True
918
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
919
- q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
920
- r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
921
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
922
- q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
923
- r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
924
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
925
- q_spec = remove_noise(q_spec, nr = noise_threshold)
926
- if high_quality_reference_library == False:
927
- r_spec = remove_noise(r_spec, nr = noise_threshold)
928
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
929
- q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
930
- if high_quality_reference_library == False:
931
- r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
932
707
 
933
- q_ints = q_spec[:,1]
934
- r_ints = r_spec[:,1]
935
-
936
- if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
937
- similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
708
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
709
+ q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
710
+ r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)
711
+
712
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
713
+ q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
714
+ r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)
715
+
716
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
717
+ q_spec = remove_noise(q_spec, nr=noise_threshold)
718
+ if not high_quality_reference_library:
719
+ r_spec = remove_noise(r_spec, nr=noise_threshold)
720
+
721
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
722
+ q_spec = filter_spec_lcms(
723
+ q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
724
+ )
725
+ if not high_quality_reference_library:
726
+ r_spec = filter_spec_lcms(
727
+ r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
728
+ )
729
+
730
+ q_ints = q_spec[:, 1]
731
+ r_ints = r_spec[:, 1]
732
+
733
+ if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
734
+ sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
938
735
  else:
939
- similarity_score = 0
736
+ sim = 0.0
940
737
 
941
- similarity_scores.append(similarity_score)
942
- all_similarity_scores.append(similarity_scores)
738
+ similarity_by_ref[ref_id] = sim
943
739
 
944
- df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
740
+ row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
741
+ all_similarity_scores.append(row_scores)
742
+
743
+ df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
945
744
  df_scores.index = unique_query_ids
946
- df_scores.index.names = ['Query Spectrum ID']
745
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
746
+
947
747
 
948
748
  preds = []
949
749
  scores = []
@@ -976,7 +776,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
976
776
 
977
777
  df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
978
778
  df_top_ref_specs.index = unique_query_ids
979
- df_top_ref_specs.index.names = ['Query Spectrum ID']
779
+ df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
980
780
 
981
781
  df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
982
782
 
@@ -993,33 +793,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
993
793
 
994
794
 
995
795
 
996
- def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False):
997
- '''
998
- runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data
999
-
1000
- --query_data: cdf or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
1001
- --reference_data: cdf of csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
1002
- --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
1003
- --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
1004
- --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
1005
- --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-4 characters chosen from F, N, L, W representing filtering based on mass/charge and intensity values, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WN\' is passed, then each spectrum will undergo a weight factor transformation and then noise removal. Default: FNLW')
1006
- --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
1007
- --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
1008
- --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
1009
- --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
1010
- --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
1011
- --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
1012
- --wf_mz: Mass/charge weight factor parameter. Default: 0.0
1013
- --wf_intensity: Intensity weight factor parameter. Default: 0.0
1014
- --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
1015
- --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
1016
- --normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since the objects entropy quantifies the uncertainy of must be probability distributions, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
1017
- --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
1018
- --print_id_results: Flag that prints identification results if True. Default: False
1019
- --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
1020
- --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
1021
- '''
1022
-
796
+ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
1023
797
  if query_data is None:
1024
798
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
1025
799
  sys.exit()
@@ -1027,11 +801,11 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
1027
801
  extension = query_data.rsplit('.',1)
1028
802
  extension = extension[(len(extension)-1)]
1029
803
  if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
1030
- output_path_tmp = query_data[:-3] + 'csv'
804
+ output_path_tmp = query_data[:-3] + 'txt'
1031
805
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
1032
- df_query = pd.read_csv(output_path_tmp)
1033
- if extension == 'csv' or extension == 'CSV':
1034
- df_query = pd.read_csv(query_data)
806
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
807
+ if extension == 'txt' or extension == 'TXT':
808
+ df_query = pd.read_csv(query_data, sep='\t')
1035
809
  unique_query_ids = df_query.iloc[:,0].unique()
1036
810
 
1037
811
  if reference_data is None:
@@ -1175,7 +949,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
1175
949
 
1176
950
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
1177
951
  df_scores.index = unique_query_ids
1178
- df_scores.index.names = ['Query Spectrum ID']
952
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
1179
953
 
1180
954
  preds = []
1181
955
  scores = []
@@ -1208,7 +982,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
1208
982
 
1209
983
  df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
1210
984
  df_top_ref_specs.index = unique_query_ids
1211
- df_top_ref_specs.index.names = ['Query Spectrum ID']
985
+ df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
1212
986
 
1213
987
  if print_id_results == True:
1214
988
  print(df_top_ref_specs.to_string())