pycompound 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,46 +21,40 @@ def _vector_to_full_params(X, default_params, optimize_params):
21
21
 
22
22
  def objective_function_HRMS(X, ctx):
23
23
  p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
24
- if 'window_size_centroiding' in ctx.keys():
25
- acc = get_acc_HRMS(
26
- ctx["df_query"], ctx["df_reference"],
27
- ctx["unique_query_ids"], ctx["unique_reference_ids"],
28
- ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
29
- ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
30
- p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
31
- p["wf_mz"], p["wf_int"], p["LET_threshold"],
32
- p["entropy_dimension"],
33
- ctx["high_quality_reference_library"],
34
- verbose=False
35
- )
36
- else:
37
- acc = get_acc_NRMS(
38
- ctx["df_query"], ctx["df_reference"],
39
- ctx["unique_query_ids"], ctx["unique_reference_ids"],
40
- ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
41
- ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
42
- p["noise_threshold"],
43
- p["wf_mz"], p["wf_int"], p["LET_threshold"],
44
- p["entropy_dimension"],
45
- ctx["high_quality_reference_library"],
46
- verbose=False
47
- )
24
+ acc = get_acc_HRMS(
25
+ ctx["df_query"],
26
+ ctx["df_reference"],
27
+ ctx["precursor_ion_mz_tolerance"],
28
+ ctx["ionization_mode"], ctx["adduct"],
29
+ ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
30
+ ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
31
+ p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
32
+ p["wf_mz"], p["wf_int"], p["LET_threshold"],
33
+ p["entropy_dimension"],
34
+ ctx["high_quality_reference_library"],
35
+ verbose=False
36
+ )
48
37
  print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
49
38
  return 1.0 - acc
50
39
 
51
40
 
41
def objective_function_NRMS(X, ctx):
    """Objective for NRMS parameter tuning: returns one minus the accuracy.

    Args:
        X: Candidate vector. ``_vector_to_full_params`` turns it into a full
            parameter dict using ``ctx["default_params"]`` and
            ``ctx["optimize_params"]``.
        ctx: Dict of fixed inputs forwarded to ``get_acc_NRMS`` (see
            ``context_keys`` below for the keys that are read).

    Returns:
        ``1.0 - acc``, where ``acc`` is the value returned by ``get_acc_NRMS``.
    """
    full_params = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])

    # Argument order of the positional get_acc_NRMS call: context values,
    # then tunable parameters, then the reference-library flag.
    context_keys = (
        "df_query", "df_reference", "unique_query_ids", "unique_reference_ids",
        "similarity_measure", "weights", "spectrum_preprocessing_order",
        "mz_min", "mz_max", "int_min", "int_max",
    )
    tunable_keys = ("noise_threshold", "wf_mz", "wf_int", "LET_threshold", "entropy_dimension")

    positional = [ctx[name] for name in context_keys]
    positional += [full_params[name] for name in tunable_keys]
    positional.append(ctx["high_quality_reference_library"])

    acc = get_acc_NRMS(*positional, verbose=False)

    # Progress trace: one report per objective evaluation.
    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
    return 1.0 - acc
53
+
52
54
 
53
55
 
54
- def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}):
55
56
 
56
- '''
57
- print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
58
- print(param_bounds)
59
- print(default_params)
60
- print(type(param_bounds['noise_threshold'][0]))
61
- print(type(param_bounds['noise_threshold'][1]))
62
- print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
63
- '''
57
+ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1):
64
58
 
65
59
  if query_data is None:
66
60
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
@@ -68,21 +62,19 @@ def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cos
68
62
  else:
69
63
  extension = query_data.rsplit('.',1)
70
64
  extension = extension[(len(extension)-1)]
71
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
72
- output_path_tmp = query_data[:-3] + 'csv'
65
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
66
+ output_path_tmp = query_data[:-3] + 'txt'
73
67
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
74
- df_query = pd.read_csv(output_path_tmp)
75
- if extension == 'csv' or extension == 'CSV':
76
- df_query = pd.read_csv(query_data)
77
- unique_query_ids = df_query.iloc[:,0].unique()
68
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
69
+ if extension == 'txt' or extension == 'TXT':
70
+ df_query = pd.read_csv(query_data, sep='\t')
78
71
 
79
72
  if reference_data is None:
80
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
73
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
81
74
  sys.exit()
82
75
  else:
83
76
  if isinstance(reference_data,str):
84
77
  df_reference = get_reference_df(reference_data=reference_data)
85
- unique_reference_ids = df_reference.iloc[:,0].unique()
86
78
  else:
87
79
  dfs = []
88
80
  unique_reference_ids = []
@@ -92,6 +84,11 @@ def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cos
92
84
  unique_reference_ids.extend(tmp.iloc[:,0].unique())
93
85
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
94
86
 
87
+ if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
88
+ df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
89
+ if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
90
+ df_reference = df_reference.loc[df_reference['adduct']==adduct]
91
+
95
92
  unique_query_ids = df_query['id'].unique().tolist()
96
93
  unique_reference_ids = df_reference['id'].unique().tolist()
97
94
 
@@ -100,6 +97,9 @@ def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cos
100
97
  df_reference=df_reference,
101
98
  unique_query_ids=unique_query_ids,
102
99
  unique_reference_ids=unique_reference_ids,
100
+ precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
101
+ ionization_mode=ionization_mode,
102
+ adduct=adduct,
103
103
  similarity_measure=similarity_measure,
104
104
  weights=weights,
105
105
  spectrum_preprocessing_order=spectrum_preprocessing_order,
@@ -111,22 +111,10 @@ def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cos
111
111
 
112
112
  bounds = [param_bounds[p] for p in optimize_params]
113
113
 
114
- #print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
115
- #print(df_query.head())
116
- #print(df_reference.head())
117
- #print(bounds)
118
- #print(ctx)
119
- #print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
120
-
121
- result = differential_evolution(
122
- objective_function_HRMS,
123
- bounds=bounds,
124
- args=(ctx,),
125
- maxiter=3,
126
- tol=0.0,
127
- workers=-1,
128
- seed=1,
129
- )
114
+ if chromatography_platform == 'HRMS':
115
+ result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
116
+ else:
117
+ result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
130
118
 
131
119
  best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
132
120
  best_acc = 100.0 - (result.fun * 100.0)
@@ -144,11 +132,13 @@ def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cos
144
132
 
145
133
 
146
134
 
135
+
147
136
  default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
148
137
  default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
149
138
 
150
139
 
151
- def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
140
+ def _eval_one_HRMS(df_query, df_reference,
141
+ precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
152
142
  similarity_measure_tmp, weight,
153
143
  spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
154
144
  int_min_tmp, int_max_tmp, noise_threshold_tmp,
@@ -158,7 +148,8 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
158
148
 
159
149
  acc = get_acc_HRMS(
160
150
  df_query=df_query, df_reference=df_reference,
161
- unique_query_ids=unique_query_ids, unique_reference_ids=unique_reference_ids,
151
+ precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
152
+ ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
162
153
  similarity_measure=similarity_measure_tmp, weights=weight,
163
154
  spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
164
155
  mz_min=mz_min_tmp, mz_max=mz_max_tmp,
@@ -170,7 +161,7 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
170
161
  LET_threshold=LET_threshold_tmp,
171
162
  entropy_dimension=entropy_dimension_tmp,
172
163
  high_quality_reference_library=high_quality_reference_library_tmp,
173
- verbose=True
164
+ verbose=False
174
165
  )
175
166
 
176
167
  return (
@@ -201,6 +192,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
201
192
  LET_threshold=LET_threshold_tmp,
202
193
  entropy_dimension=entropy_dimension_tmp,
203
194
  high_quality_reference_library=high_quality_reference_library_tmp,
195
+ verbose=False
204
196
  )
205
197
 
206
198
  return (
@@ -211,16 +203,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
211
203
 
212
204
 
213
205
 
214
- def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
215
- """
216
- runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
217
-
218
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
219
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
220
- --grid: dict with all possible parameter values to try.
221
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
222
- """
223
-
206
+ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
224
207
  grid = {**default_HRMS_grid, **(grid or {})}
225
208
  for key, value in grid.items():
226
209
  globals()[key] = value
@@ -231,31 +214,37 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
231
214
  else:
232
215
  extension = query_data.rsplit('.',1)
233
216
  extension = extension[(len(extension)-1)]
234
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
235
- output_path_tmp = query_data[:-3] + 'csv'
217
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
218
+ output_path_tmp = query_data[:-3] + 'txt'
236
219
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
237
- df_query = pd.read_csv(output_path_tmp)
238
- if extension == 'csv' or extension == 'CSV':
239
- df_query = pd.read_csv(query_data)
240
- unique_query_ids = df_query.iloc[:,0].unique()
220
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
221
+ if extension == 'txt' or extension == 'TXT':
222
+ df_query = pd.read_csv(query_data, sep='\t')
223
+ unique_query_ids = df_query['id'].unique()
241
224
 
242
225
  if reference_data is None:
243
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
226
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
244
227
  sys.exit()
245
228
  else:
246
229
  if isinstance(reference_data,str):
247
230
  df_reference = get_reference_df(reference_data=reference_data)
248
- unique_reference_ids = df_reference.iloc[:,0].unique()
231
+ unique_reference_ids = df_reference['id'].unique()
249
232
  else:
250
233
  dfs = []
251
234
  unique_reference_ids = []
252
235
  for f in reference_data:
253
236
  tmp = get_reference_df(reference_data=f)
254
237
  dfs.append(tmp)
255
- unique_reference_ids.extend(tmp.iloc[:,0].unique())
238
+ unique_reference_ids.extend(tmp['id'].unique())
256
239
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
257
240
 
258
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
241
+ if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
242
+ df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode].copy()
243
+ if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
244
+ df_reference = df_reference.loc[df_reference['adduct']==adduct].copy()
245
+ unique_reference_ids_tmp2 = df_reference['id'].unique()
246
+
247
+ print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids_tmp2))} of the query and reference spectra IDs are in common.\n')
259
248
 
260
249
  if output_path is None:
261
250
  output_path = f'{Path.cwd()}/tuning_param_output.txt'
@@ -263,7 +252,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
263
252
 
264
253
  param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold,
265
254
  window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
266
- results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
255
+ results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params) for params in param_grid)
267
256
 
268
257
  df_out = pd.DataFrame(results, columns=[
269
258
  'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX','NOISE.THRESHOLD',
@@ -287,124 +276,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
287
276
 
288
277
 
289
278
 
290
- def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
291
- """
292
- runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible
293
- combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
294
- and prints top-performing parameters
295
-
296
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
297
- should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
298
- other columns should correspond to a single mass/charge ratio. Mandatory argument.
299
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
300
- to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
301
- compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
302
- --grid: dict with all possible parameter values to try.
303
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
304
- """
305
-
306
- local_grid = {**default_HRMS_grid, **(grid or {})}
307
- for key, value in local_grid.items():
308
- globals()[key] = value
309
-
310
- if query_data is None:
311
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
312
- sys.exit()
313
- else:
314
- extension = query_data.rsplit('.', 1)[-1]
315
- if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
316
- output_path_tmp = query_data[:-3] + 'csv'
317
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
318
- df_query = pd.read_csv(output_path_tmp)
319
- elif extension in ('csv','CSV'):
320
- df_query = pd.read_csv(query_data)
321
- else:
322
- print(f'\nError: Unsupported query_data extension: {extension}')
323
- sys.exit()
324
- unique_query_ids = df_query.iloc[:, 0].unique()
325
-
326
- if reference_data is None:
327
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
328
- sys.exit()
329
- else:
330
- if isinstance(reference_data, str):
331
- df_reference = get_reference_df(reference_data=reference_data)
332
- unique_reference_ids = df_reference.iloc[:, 0].unique()
333
- else:
334
- dfs = []
335
- unique_reference_ids = []
336
- for f in reference_data:
337
- tmp = get_reference_df(reference_data=f)
338
- dfs.append(tmp)
339
- unique_reference_ids.extend(tmp.iloc[:, 0].unique())
340
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
341
-
342
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
343
- f'{len(unique_reference_ids)} unique reference spectra, and '
344
- f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
345
-
346
- if output_path is None:
347
- output_path = f'{Path.cwd()}/tuning_param_output.txt'
348
- print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
349
-
350
- param_grid = product(
351
- similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
352
- noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
353
- entropy_dimension, high_quality_reference_library
354
- )
355
-
356
- results = []
357
- total = (
358
- len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
359
- len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
360
- len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
361
- len(entropy_dimension) * len(high_quality_reference_library)
362
- )
363
- done = 0
364
-
365
- for params in param_grid:
366
- res = _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
367
- results.append(res)
368
- done += 1
369
- print(f'Completed {done}/{total} grid combinations.\n', flush=True)
370
-
371
- df_out = pd.DataFrame(results, columns=[
372
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
373
- 'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
374
- 'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
375
- ])
376
-
377
- if 'WEIGHT' in df_out.columns:
378
- df_out['WEIGHT'] = (
379
- df_out['WEIGHT'].astype(str)
380
- .str.replace("\"","",regex=False)
381
- .str.replace("{","",regex=False)
382
- .str.replace("}","",regex=False)
383
- .str.replace(":","",regex=False)
384
- .str.replace("Cosine","",regex=False)
385
- .str.replace("Shannon","",regex=False)
386
- .str.replace("Renyi","",regex=False)
387
- .str.replace("Tsallis","",regex=False)
388
- .str.replace(" ","",regex=False)
389
- )
390
-
391
- if return_output:
392
- return df_out
393
- else:
394
- df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
395
- print(f'Wrote results to {output_path}')
396
-
397
-
398
279
  def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
399
- """
400
- runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
401
-
402
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
403
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
404
- --grid: dict with all possible parameter values to try
405
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here
406
- """
407
-
408
280
  grid = {**default_NRMS_grid, **(grid or {})}
409
281
  for key, value in grid.items():
410
282
  globals()[key] = value
@@ -415,13 +287,13 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
415
287
  else:
416
288
  extension = query_data.rsplit('.',1)
417
289
  extension = extension[(len(extension)-1)]
418
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
419
- output_path_tmp = query_data[:-3] + 'csv'
290
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
291
+ output_path_tmp = query_data[:-3] + 'txt'
420
292
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
421
- df_query = pd.read_csv(output_path_tmp)
422
- if extension == 'csv' or extension == 'CSV':
423
- df_query = pd.read_csv(query_data)
424
- unique_query_ids = df_query.iloc[:,0].unique()
293
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
294
+ if extension == 'txt' or extension == 'TXT':
295
+ df_query = pd.read_csv(query_data, sep='\t')
296
+ unique_query_ids = df_query['id'].unique()
425
297
 
426
298
  if reference_data is None:
427
299
  print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
@@ -429,7 +301,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
429
301
  else:
430
302
  if isinstance(reference_data,str):
431
303
  df_reference = get_reference_df(reference_data=reference_data)
432
- unique_reference_ids = df_reference.iloc[:,0].unique()
304
+ unique_reference_ids = df_reference['id'].unique()
433
305
  else:
434
306
  dfs = []
435
307
  unique_reference_ids = []
@@ -449,10 +321,8 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
449
321
  noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
450
322
  results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
451
323
 
452
- df_out = pd.DataFrame(results, columns=[
453
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
454
- 'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
455
- ])
324
+ df_out = pd.DataFrame(results, columns=['ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
325
+ 'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'])
456
326
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
457
327
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
458
328
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
@@ -462,6 +332,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
462
332
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
463
333
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
464
334
  df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
335
+
465
336
  if return_output is False:
466
337
  df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
467
338
  else:
@@ -469,203 +340,116 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
469
340
 
470
341
 
471
342
 
472
- def tune_params_on_NRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
473
- """
474
- runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible
475
- combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
476
- and prints top-performing parameters
343
+ def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
477
344
 
478
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
479
- should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
480
- other columns should correspond to a single mass/charge ratio. Mandatory argument.
481
- --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
482
- to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
483
- compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
484
- --grid: dict with all possible parameter values to try.
485
- --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
486
- """
487
-
488
- local_grid = {**default_NRMS_grid, **(grid or {})}
489
- for key, value in local_grid.items():
490
- globals()[key] = value
491
-
492
- if query_data is None:
493
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
494
- sys.exit()
495
- else:
496
- extension = query_data.rsplit('.', 1)[-1]
497
- if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
498
- output_path_tmp = query_data[:-3] + 'csv'
499
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
500
- df_query = pd.read_csv(output_path_tmp)
501
- elif extension in ('csv','CSV'):
502
- df_query = pd.read_csv(query_data)
503
- else:
504
- print(f'\nError: Unsupported query_data extension: {extension}')
505
- sys.exit()
506
- unique_query_ids = df_query.iloc[:, 0].unique()
507
-
508
- if reference_data is None:
509
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
510
- sys.exit()
511
- else:
512
- if isinstance(reference_data, str):
513
- df_reference = get_reference_df(reference_data=reference_data)
514
- unique_reference_ids = df_reference.iloc[:, 0].unique()
515
- else:
516
- dfs = []
517
- unique_reference_ids = []
518
- for f in reference_data:
519
- tmp = get_reference_df(reference_data=f)
520
- dfs.append(tmp)
521
- unique_reference_ids.extend(tmp.iloc[:, 0].unique())
522
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
523
-
524
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
525
- f'{len(unique_reference_ids)} unique reference spectra, and '
526
- f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
527
-
528
- if output_path is None:
529
- output_path = f'{Path.cwd()}/tuning_param_output.txt'
530
- print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
531
-
532
- param_grid = product(
533
- similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
534
- noise_threshold, wf_mz, wf_int, LET_threshold,
535
- entropy_dimension, high_quality_reference_library
536
- )
537
-
538
- results = []
539
- total = (
540
- len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
541
- len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
542
- )
543
- done = 0
544
- for params in param_grid:
545
- res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
546
- results.append(res)
547
- done += 1
548
- print(f'Completed {done}/{total} grid combinations.\n', flush=True)
549
-
550
- df_out = pd.DataFrame(results, columns=[
551
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
552
- 'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
553
- ])
345
+ n_top_matches_to_save = 1
346
+ unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
347
+ unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
348
+ all_similarity_rows = []
554
349
 
555
- if 'WEIGHT' in df_out.columns:
556
- df_out['WEIGHT'] = (
557
- df_out['WEIGHT'].astype(str)
558
- .str.replace("\"","",regex=False)
559
- .str.replace("{","",regex=False)
560
- .str.replace("}","",regex=False)
561
- .str.replace(":","",regex=False)
562
- .str.replace("Cosine","",regex=False)
563
- .str.replace("Shannon","",regex=False)
564
- .str.replace("Renyi","",regex=False)
565
- .str.replace("Tsallis","",regex=False)
566
- .str.replace(" ","",regex=False)
567
- )
568
-
569
- if return_output:
570
- return df_out
571
- else:
572
- df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
573
- print(f'Wrote results to {output_path}')
350
+ for query_idx, qid in enumerate(unique_query_ids):
351
+ if verbose:
352
+ print(f'query spectrum #{query_idx} is being identified')
574
353
 
354
+ q_mask = (df_query['id'] == qid)
355
+ q_idxs = np.where(q_mask)[0]
356
+ if q_idxs.size == 0:
357
+ all_similarity_rows.append([0.0]*len(unique_reference_ids))
358
+ continue
575
359
 
360
+ q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))
576
361
 
362
+ if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
363
+ precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
364
+ df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
365
+ else:
366
+ df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()
577
367
 
578
- def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
368
+ if df_reference_tmp.empty:
369
+ all_similarity_rows.append([0.0]*len(unique_reference_ids))
370
+ continue
579
371
 
580
- #print('\n\n\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n\n\n')
581
- n_top_matches_to_save = 1
372
+ ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
582
373
 
583
- all_similarity_scores = []
584
- for query_idx in range(0,len(unique_query_ids)):
585
- if verbose is True:
586
- print(f'query spectrum #{query_idx} is being identified')
587
- q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
588
- q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
589
- #q_spec_tmp = q_spec_tmp.astype(float)
374
+ similarity_by_ref = {}
590
375
 
591
- similarity_scores = []
592
- for ref_idx in range(0,len(unique_reference_ids)):
593
- q_spec = q_spec_tmp
594
- r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
595
- r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
596
- #print(r_spec)
597
- #r_spec = r_spec.astype(float)
376
+ for ref_id, r_df in ref_groups.items():
377
+ q_spec = q_spec_base.copy()
378
+ r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
598
379
 
599
380
  is_matched = False
600
381
  for transformation in spectrum_preprocessing_order:
601
- if np.isinf(q_spec[:,1]).sum() > 0:
602
- q_spec[:,1] = np.zeros(q_spec.shape[0])
603
- if np.isinf(r_spec[:,1]).sum() > 0:
604
- r_spec[:,1] = np.zeros(r_spec.shape[0])
605
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
606
- q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
607
- r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
608
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
609
- m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
610
- q_spec = m_spec[:,0:2]
611
- r_spec = m_spec[:,[0,2]]
382
+ if np.isinf(q_spec[:, 1]).any():
383
+ q_spec[:, 1] = 0.0
384
+ if np.isinf(r_spec[:, 1]).any():
385
+ r_spec[:, 1] = 0.0
386
+
387
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
388
+ q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
389
+ r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
390
+
391
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
392
+ m_spec = match_peaks_in_spectra(
393
+ spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
394
+ )
395
+ if m_spec.size == 0:
396
+ q_spec = np.empty((0,2))
397
+ r_spec = np.empty((0,2))
398
+ else:
399
+ q_spec = m_spec[:, 0:2]
400
+ r_spec = m_spec[:, [0, 2]]
612
401
  is_matched = True
613
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
614
- q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
615
- r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
616
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
617
- q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
618
- r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
619
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
620
- q_spec = remove_noise(q_spec, nr = noise_threshold)
621
- if high_quality_reference_library == False:
622
- r_spec = remove_noise(r_spec, nr = noise_threshold)
623
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
624
- q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
625
- if high_quality_reference_library == False:
626
- r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
627
402
 
628
- q_ints = q_spec[:,1]
629
- r_ints = r_spec[:,1]
630
- if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
631
- similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
403
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
404
+ q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
405
+ r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)
406
+
407
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
408
+ q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
409
+ r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')
410
+
411
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
412
+ q_spec = remove_noise(q_spec, nr=noise_threshold)
413
+ if not high_quality_reference_library:
414
+ r_spec = remove_noise(r_spec, nr=noise_threshold)
415
+
416
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
417
+ q_spec = filter_spec_lcms(
418
+ q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
419
+ )
420
+ if not high_quality_reference_library:
421
+ r_spec = filter_spec_lcms(
422
+ r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
423
+ )
424
+
425
+ if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
426
+ q_ints = q_spec[:, 1]
427
+ r_ints = r_spec[:, 1]
428
+ if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
429
+ sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
430
+ else:
431
+ sim = 0.0
632
432
  else:
633
- similarity_score = 0
433
+ sim = 0.0
634
434
 
635
- similarity_scores.append(similarity_score)
636
- all_similarity_scores.append(similarity_scores)
435
+ similarity_by_ref[str(ref_id)] = float(sim)
637
436
 
638
- df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
639
- df_scores.index = unique_query_ids
640
- df_scores.index.names = ['Query Spectrum ID']
437
+ row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
438
+ all_similarity_rows.append(row)
641
439
 
642
- preds = []
643
- scores = []
644
- for i in range(0, df_scores.shape[0]):
645
- df_scores_tmp = df_scores
646
- preds_tmp = []
647
- scores_tmp = []
648
- for j in range(0, n_top_matches_to_save):
649
- top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
650
- cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
651
- df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
440
+ df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
441
+ df_scores.index.name = 'QUERY.SPECTRUM.ID'
652
442
 
653
- preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
654
- if len(top_ref_specs_tmp.values) == 0:
655
- scores_tmp.append(0)
656
- else:
657
- scores_tmp.append(top_ref_specs_tmp.values[0])
658
- preds.append(preds_tmp)
659
- scores.append(scores_tmp)
660
-
661
- preds = np.array(preds)
662
- scores = np.array(scores)
663
- out = np.c_[unique_query_ids,preds,scores]
664
- df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
665
- acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
666
- return acc
443
+ top_idx = df_scores.values.argmax(axis=1)
444
+ top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
445
+ top_ids = [df_scores.columns[i] for i in top_idx]
667
446
 
447
+ df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
448
+ if verbose:
449
+ print(df_tmp)
668
450
 
451
+ acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
452
+ return acc
669
453
 
670
454
 
671
455
  def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
@@ -724,7 +508,7 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
724
508
 
725
509
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
726
510
  df_scores.index = unique_query_ids
727
- df_scores.index.names = ['Query Spectrum ID']
511
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
728
512
 
729
513
  preds = []
730
514
  scores = []
@@ -754,64 +538,40 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
754
538
 
755
539
 
756
540
 
757
- def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
758
- '''
759
- runs spectral library matching on high-resolution mass spectrometry (HRMS) data
760
-
761
- --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
762
- --reference_data: either string or list of strings with pass to mgf, mzML, sdf, and/or csv file(s) of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
763
- --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
764
- --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
765
- --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
766
- --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of HRMS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL')
767
- --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
768
- --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
769
- --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
770
- --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
771
- --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
772
- --window_size_centroiding: Window size parameter used in centroiding a given spectrum. Default: 0.5
773
- --window_size_matching: Window size parameter used in matching a query spectrum and a reference library spectrum. Default: 0.5
774
- --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
775
- --wf_mz: Mass/charge weight factor parameter. Default: 0.0
776
- --wf_intensity: Intensity weight factor parameter. Default: 0.0
777
- --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
778
- --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
779
- --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
780
- --print_id_results: Flag that prints identification results if True. Default: False
781
- --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
782
- --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
783
- '''
784
-
541
+ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
785
542
  if query_data is None:
786
543
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
787
544
  sys.exit()
788
545
  else:
789
546
  extension = query_data.rsplit('.',1)
790
547
  extension = extension[(len(extension)-1)]
791
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
792
- output_path_tmp = query_data[:-3] + 'csv'
548
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'json' or extension == 'JSON' or extension == 'msp' or extension == 'MSP':
549
+ output_path_tmp = query_data[:-3] + 'txt'
793
550
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
794
- df_query = pd.read_csv(output_path_tmp)
795
- if extension == 'csv' or extension == 'CSV':
796
- df_query = pd.read_csv(query_data)
797
- unique_query_ids = df_query.iloc[:,0].unique()
551
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
552
+ if extension == 'txt' or extension == 'TXT':
553
+ df_query = pd.read_csv(query_data, sep='\t')
554
+ unique_query_ids = df_query['id'].unique()
798
555
 
799
556
  if reference_data is None:
800
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
557
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
801
558
  sys.exit()
802
559
  else:
803
560
  if isinstance(reference_data,str):
804
561
  df_reference = get_reference_df(reference_data,likely_reference_ids)
805
- unique_reference_ids = df_reference.iloc[:,0].unique()
806
562
  else:
807
563
  dfs = []
808
- unique_reference_ids = []
809
564
  for f in reference_data:
810
565
  tmp = get_reference_df(f,likely_reference_ids)
811
566
  dfs.append(tmp)
812
- unique_reference_ids.extend(tmp.iloc[:,0].unique())
813
567
  df_reference = pd.concat(dfs, axis=0, ignore_index=True)
814
568
 
569
+ if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
570
+ df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
571
+ if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
572
+ df_reference = df_reference.loc[df_reference['adduct']==adduct]
573
+
574
+ print(df_reference.loc[df_reference['id']=='Hectochlorin M+H'])
815
575
 
816
576
  if spectrum_preprocessing_order is not None:
817
577
  spectrum_preprocessing_order = list(spectrum_preprocessing_order)
@@ -899,62 +659,91 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
899
659
  print(f'Warning: writing similarity scores to {output_similarity_scores}')
900
660
 
901
661
 
902
- all_similarity_scores = []
903
- for query_idx in range(0,len(unique_query_ids)):
904
- if verbose is True:
662
+ unique_reference_ids = df_reference['id'].unique().tolist()
663
+ all_similarity_scores = []
664
+
665
+ for query_idx in range(len(unique_query_ids)):
666
+ if verbose:
905
667
  print(f'query spectrum #{query_idx} is being identified')
906
- q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
907
- q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
908
668
 
909
- similarity_scores = []
910
- for ref_idx in range(0,len(unique_reference_ids)):
911
- q_spec = q_spec_tmp
912
- r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
913
- r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
669
+ q_mask = (df_query['id'] == unique_query_ids[query_idx])
670
+ q_idxs_tmp = np.where(q_mask)[0]
671
+ q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
672
+
673
+ if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
674
+ precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
675
+ df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
676
+ else:
677
+ df_reference_tmp = df_reference.copy()
678
+
679
+ ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
680
+ unique_reference_ids_tmp = list(ref_groups.keys())
681
+
682
+ similarity_by_ref = {}
683
+ for ref_id in unique_reference_ids_tmp:
684
+ q_spec = q_spec_tmp.copy()
685
+ r_df = ref_groups[ref_id]
686
+ r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
687
+ #print('\nhere!!!!!!!!!!!!!!!')
688
+ #print(r_spec)
914
689
 
915
690
  is_matched = False
691
+
916
692
  for transformation in spectrum_preprocessing_order:
917
- if np.isinf(q_spec[:,1]).sum() > 0:
918
- q_spec[:,1] = np.zeros(q_spec.shape[0])
919
- if np.isinf(r_spec[:,1]).sum() > 0:
920
- r_spec[:,1] = np.zeros(r_spec.shape[0])
921
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
922
- q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
923
- r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
924
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
693
+ if np.isinf(q_spec[:, 1]).sum() > 0:
694
+ q_spec[:, 1] = np.zeros(q_spec.shape[0])
695
+ if np.isinf(r_spec[:, 1]).sum() > 0:
696
+ r_spec[:, 1] = np.zeros(r_spec.shape[0])
697
+
698
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
699
+ q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
700
+ r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
701
+
702
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
925
703
  m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
926
- q_spec = m_spec[:,0:2]
927
- r_spec = m_spec[:,[0,2]]
704
+ q_spec = m_spec[:, 0:2]
705
+ r_spec = m_spec[:, [0, 2]]
928
706
  is_matched = True
929
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
930
- q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
931
- r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
932
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
933
- q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
934
- r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
935
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
936
- q_spec = remove_noise(q_spec, nr = noise_threshold)
937
- if high_quality_reference_library == False:
938
- r_spec = remove_noise(r_spec, nr = noise_threshold)
939
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
940
- q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
941
- if high_quality_reference_library == False:
942
- r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
943
707
 
944
- q_ints = q_spec[:,1]
945
- r_ints = r_spec[:,1]
946
-
947
- if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
948
- similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
708
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
709
+ q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
710
+ r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)
711
+
712
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
713
+ q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
714
+ r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)
715
+
716
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
717
+ q_spec = remove_noise(q_spec, nr=noise_threshold)
718
+ if not high_quality_reference_library:
719
+ r_spec = remove_noise(r_spec, nr=noise_threshold)
720
+
721
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
722
+ q_spec = filter_spec_lcms(
723
+ q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
724
+ )
725
+ if not high_quality_reference_library:
726
+ r_spec = filter_spec_lcms(
727
+ r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
728
+ )
729
+
730
+ q_ints = q_spec[:, 1]
731
+ r_ints = r_spec[:, 1]
732
+
733
+ if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
734
+ sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
949
735
  else:
950
- similarity_score = 0
736
+ sim = 0.0
951
737
 
952
- similarity_scores.append(similarity_score)
953
- all_similarity_scores.append(similarity_scores)
738
+ similarity_by_ref[ref_id] = sim
954
739
 
955
- df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
740
+ row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
741
+ all_similarity_scores.append(row_scores)
742
+
743
+ df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
956
744
  df_scores.index = unique_query_ids
957
- df_scores.index.names = ['Query Spectrum ID']
745
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
746
+
958
747
 
959
748
  preds = []
960
749
  scores = []
@@ -987,7 +776,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
987
776
 
988
777
  df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
989
778
  df_top_ref_specs.index = unique_query_ids
990
- df_top_ref_specs.index.names = ['Query Spectrum ID']
779
+ df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
991
780
 
992
781
  df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
993
782
 
@@ -1004,33 +793,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
1004
793
 
1005
794
 
1006
795
 
1007
- def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False):
1008
- '''
1009
- runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data
1010
-
1011
- --query_data: cdf or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
1012
- --reference_data: cdf or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier (i.e. the CAS registry number or the compound name), and each of the remaining columns should correspond to a single mass/charge ratio. Mandatory argument.
1013
- --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
1014
- --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
1015
- --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
1016
- --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-4 characters chosen from F, N, L, W representing filtering based on mass/charge and intensity values, noise removal, low-entropy transformation, and weight-factor transformation, respectively. For example, if \'WN\' is passed, then each spectrum will undergo a weight factor transformation and then noise removal. Default: FNLW
1017
- --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False
1018
- --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
1019
- --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
1020
- --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
1021
- --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
1022
- --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
1023
- --wf_mz: Mass/charge weight factor parameter. Default: 0.0
1024
- --wf_intensity: Intensity weight factor parameter. Default: 1.0
1025
- --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
1026
- --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
1027
- --normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since entropy quantifies the uncertainty of a probability distribution, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
1028
- --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
1029
- --print_id_results: Flag that prints identification results if True. Default: False
1030
- --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
1031
- --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining columns contain the similarity scores with respect to all reference library spectra. If no argument is passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores.txt\'.
1032
- '''
1033
-
796
+ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
1034
797
  if query_data is None:
1035
798
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
1036
799
  sys.exit()
@@ -1038,11 +801,11 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
1038
801
  extension = query_data.rsplit('.',1)
1039
802
  extension = extension[(len(extension)-1)]
1040
803
  if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
1041
- output_path_tmp = query_data[:-3] + 'csv'
804
+ output_path_tmp = query_data[:-3] + 'txt'
1042
805
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
1043
- df_query = pd.read_csv(output_path_tmp)
1044
- if extension == 'csv' or extension == 'CSV':
1045
- df_query = pd.read_csv(query_data)
806
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
807
+ if extension == 'txt' or extension == 'TXT':
808
+ df_query = pd.read_csv(query_data, sep='\t')
1046
809
  unique_query_ids = df_query.iloc[:,0].unique()
1047
810
 
1048
811
  if reference_data is None:
@@ -1186,7 +949,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
1186
949
 
1187
950
  df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
1188
951
  df_scores.index = unique_query_ids
1189
- df_scores.index.names = ['Query Spectrum ID']
952
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
1190
953
 
1191
954
  preds = []
1192
955
  scores = []
@@ -1219,7 +982,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
1219
982
 
1220
983
  df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
1221
984
  df_top_ref_specs.index = unique_query_ids
1222
- df_top_ref_specs.index.names = ['Query Spectrum ID']
985
+ df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
1223
986
 
1224
987
  if print_id_results == True:
1225
988
  print(df_top_ref_specs.to_string())