pycompound 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycompound/build_library.py +77 -20
- pycompound/plot_spectra.py +73 -111
- pycompound/processing.py +5 -5
- pycompound/spec_lib_matching.py +262 -491
- pycompound/spec_lib_matching_CLI.py +48 -2
- pycompound/tuning_CLI_DE.py +22 -22
- pycompound/tuning_CLI_grid.py +22 -6
- pycompound-0.1.8.dist-info/METADATA +824 -0
- pycompound-0.1.8.dist-info/RECORD +14 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.8.dist-info}/top_level.txt +0 -1
- app.py +0 -1519
- pycompound-0.1.6.dist-info/METADATA +0 -27
- pycompound-0.1.6.dist-info/RECORD +0 -15
- {pycompound-0.1.6.dist-info → pycompound-0.1.8.dist-info}/WHEEL +0 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.8.dist-info}/licenses/LICENSE +0 -0
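Most of the change lands in pycompound/spec_lib_matching.py, whose tuning and matching entry points gain precursor_ion_mz_tolerance, ionization_mode, and adduct arguments (see the per-file diff below). As a minimal usage sketch of the 0.1.8 tune_params_DE signature (file paths and parameter values below are hypothetical, and the import assumes the function is exposed from pycompound.spec_lib_matching):

    # Hypothetical example against the 0.1.8 signature shown in the diff below;
    # paths, tolerance, ionization mode, and adduct values are placeholders.
    from pycompound.spec_lib_matching import tune_params_DE

    tune_params_DE(
        query_data='query_spectra.txt',            # tab-separated table built by build_library_from_raw_data
        reference_data='reference_library.txt',
        chromatography_platform='HRMS',
        precursor_ion_mz_tolerance=0.01,           # new in 0.1.8: restrict candidate references by precursor m/z
        ionization_mode='positive',                # new in 0.1.8: keep only reference rows with this ionization_mode
        adduct='[M+H]+',                           # new in 0.1.8: keep only reference rows with this adduct
        similarity_measure='cosine',
        maxiters=3,
        de_workers=4,                              # de_workers != 1 now runs differential_evolution with updating='deferred'
    )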
pycompound/spec_lib_matching.py
CHANGED
@@ -22,8 +22,9 @@ def _vector_to_full_params(X, default_params, optimize_params):
 def objective_function_HRMS(X, ctx):
     p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
     acc = get_acc_HRMS(
-        ctx["df_query"],
-        ctx["
+        ctx["df_query"],
+        ctx["df_reference"],
+        ctx["precursor_ion_mz_tolerance"], ctx["ionization_mode"], ctx["adduct"],
         ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
         ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
         p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
@@ -35,11 +36,11 @@ def objective_function_HRMS(X, ctx):
     print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
     return 1.0 - acc
 
+
 def objective_function_NRMS(X, ctx):
     p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
     acc = get_acc_NRMS(
-        ctx["df_query"], ctx["df_reference"],
-        ctx["unique_query_ids"], ctx["unique_reference_ids"],
+        ctx["df_query"], ctx["df_reference"], ctx['unique_query_ids'], ctx['unique_reference_ids'],
         ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
         ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
         p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
@@ -51,16 +52,8 @@ def objective_function_NRMS(X, ctx):
 
 
 
-def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1, de_updating='immediate', log_hook=None):
-
-    def _log(msg):
-        if log_hook:
-            try: log_hook(msg if msg.endswith("\n") else msg + "\n")
-            except: pass
 
-
-        _log(f"iter callback: conv={conv:.4g}, x={xk}")
-        return False
+def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1):
 
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
@@ -68,21 +61,19 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + 'csv'
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == 'csv' or extension == 'CSV':
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query.iloc[:,0].unique()
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
 
     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:,0].unique()
         else:
             dfs = []
             unique_reference_ids = []
@@ -92,6 +83,11 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
                 unique_reference_ids.extend(tmp.iloc[:,0].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
+    if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
+        df_reference = df_reference.loc[df_reference['adduct']==adduct]
+
     unique_query_ids = df_query['id'].unique().tolist()
     unique_reference_ids = df_reference['id'].unique().tolist()
 
@@ -100,6 +96,9 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
         df_reference=df_reference,
         unique_query_ids=unique_query_ids,
         unique_reference_ids=unique_reference_ids,
+        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
+        ionization_mode=ionization_mode,
+        adduct=adduct,
         similarity_measure=similarity_measure,
         weights=weights,
         spectrum_preprocessing_order=spectrum_preprocessing_order,
@@ -111,13 +110,10 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
 
     bounds = [param_bounds[p] for p in optimize_params]
 
-    print('here!!!!!!!!!!!!!!!')
-    print(de_workers)
-    print('here!!!!!!!!!!!!!!!')
     if chromatography_platform == 'HRMS':
-        result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
+        result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
     else:
-        result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
+        result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
 
     best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
     best_acc = 100.0 - (result.fun * 100.0)
@@ -131,14 +127,17 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
     for k, v in best_full_params.items():
         print(f" {k}: {v}")
     print(f"\nBest accuracy: {best_acc:.3f}%")
-
+
+
+
 
 
 default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
 default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
 
 
-def _eval_one_HRMS(df_query, df_reference,
+def _eval_one_HRMS(df_query, df_reference,
+                   precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
                    similarity_measure_tmp, weight,
                    spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                    int_min_tmp, int_max_tmp, noise_threshold_tmp,
@@ -148,7 +147,8 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
 
     acc = get_acc_HRMS(
         df_query=df_query, df_reference=df_reference,
-
+        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
+        ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
         similarity_measure=similarity_measure_tmp, weights=weight,
         spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
         mz_min=mz_min_tmp, mz_max=mz_max_tmp,
@@ -160,7 +160,7 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
         LET_threshold=LET_threshold_tmp,
         entropy_dimension=entropy_dimension_tmp,
         high_quality_reference_library=high_quality_reference_library_tmp,
-        verbose=
+        verbose=False
     )
 
     return (
@@ -191,6 +191,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
         LET_threshold=LET_threshold_tmp,
         entropy_dimension=entropy_dimension_tmp,
         high_quality_reference_library=high_quality_reference_library_tmp,
+        verbose=False
     )
 
     return (
@@ -201,16 +202,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
 
 
 
-def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
+def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
     grid = {**default_HRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
@@ -221,31 +213,37 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + 'csv'
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == 'csv' or extension == 'CSV':
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query.iloc[:,0].unique()
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()
 
     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:,0].unique()
+            unique_reference_ids = df_reference['id'].unique()
         else:
             dfs = []
             unique_reference_ids = []
             for f in reference_data:
                 tmp = get_reference_df(reference_data=f)
                 dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:,0].unique())
+                unique_reference_ids.extend(tmp['id'].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
-
+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode].copy()
+    if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
+        df_reference = df_reference.loc[df_reference['adduct']==adduct].copy()
+    unique_reference_ids_tmp2 = df_reference['id'].unique()
+
+    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids_tmp2))} of the query and reference spectra IDs are in common.\n')
 
     if output_path is None:
         output_path = f'{Path.cwd()}/tuning_param_output.txt'
@@ -253,7 +251,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
 
     param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold,
                          window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
-    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
+    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params) for params in param_grid)
 
     df_out = pd.DataFrame(results, columns=[
         'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX','NOISE.THRESHOLD',
@@ -277,124 +275,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
 
 
 
-def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible
-    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
-    and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
-    should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
-    other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
-    to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
-    compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
-    local_grid = {**default_HRMS_grid, **(grid or {})}
-    for key, value in local_grid.items():
-        globals()[key] = value
-
-    if query_data is None:
-        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
-        sys.exit()
-    else:
-        extension = query_data.rsplit('.', 1)[-1]
-        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
-            output_path_tmp = query_data[:-3] + 'csv'
-            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        elif extension in ('csv','CSV'):
-            df_query = pd.read_csv(query_data)
-        else:
-            print(f'\nError: Unsupported query_data extension: {extension}')
-            sys.exit()
-        unique_query_ids = df_query.iloc[:, 0].unique()
-
-    if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
-        sys.exit()
-    else:
-        if isinstance(reference_data, str):
-            df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:, 0].unique()
-        else:
-            dfs = []
-            unique_reference_ids = []
-            for f in reference_data:
-                tmp = get_reference_df(reference_data=f)
-                dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
-            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
-
-    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
-          f'{len(unique_reference_ids)} unique reference spectra, and '
-          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
-
-    if output_path is None:
-        output_path = f'{Path.cwd()}/tuning_param_output.txt'
-        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
-
-    param_grid = product(
-        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
-        noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
-        entropy_dimension, high_quality_reference_library
-    )
-
-    results = []
-    total = (
-        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
-        len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
-        len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
-        len(entropy_dimension) * len(high_quality_reference_library)
-    )
-    done = 0
-
-    for params in param_grid:
-        res = _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
-        results.append(res)
-        done += 1
-        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
-
-    df_out = pd.DataFrame(results, columns=[
-        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
-        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
-        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
-    ])
-
-    if 'WEIGHT' in df_out.columns:
-        df_out['WEIGHT'] = (
-            df_out['WEIGHT'].astype(str)
-            .str.replace("\"","",regex=False)
-            .str.replace("{","",regex=False)
-            .str.replace("}","",regex=False)
-            .str.replace(":","",regex=False)
-            .str.replace("Cosine","",regex=False)
-            .str.replace("Shannon","",regex=False)
-            .str.replace("Renyi","",regex=False)
-            .str.replace("Tsallis","",regex=False)
-            .str.replace(" ","",regex=False)
-        )
-
-    if return_output:
-        return df_out
-    else:
-        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
-        print(f'Wrote results to {output_path}')
-
-
 def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here
-    """
-
     grid = {**default_NRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
@@ -405,13 +286,13 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + 'csv'
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == 'csv' or extension == 'CSV':
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query.iloc[:,0].unique()
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()
 
     if reference_data is None:
         print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
@@ -419,7 +300,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:,0].unique()
+            unique_reference_ids = df_reference['id'].unique()
         else:
             dfs = []
             unique_reference_ids = []
@@ -439,10 +320,8 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
                          noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
     results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
 
-    df_out = pd.DataFrame(results, columns=[
-
-        'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
-    ])
+    df_out = pd.DataFrame(results, columns=['ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
+                                            'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'])
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
@@ -452,6 +331,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
+
     if return_output is False:
         df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
     else:
@@ -459,225 +339,137 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
 
 
 
-def tune_params_on_NRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible
-    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
-    and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
-    should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
-    other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
-    to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
-    compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
-    local_grid = {**default_NRMS_grid, **(grid or {})}
-    for key, value in local_grid.items():
-        globals()[key] = value
-
-    if query_data is None:
-        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
-        sys.exit()
-    else:
-        extension = query_data.rsplit('.', 1)[-1]
-        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
-            output_path_tmp = query_data[:-3] + 'csv'
-            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        elif extension in ('csv','CSV'):
-            df_query = pd.read_csv(query_data)
-        else:
-            print(f'\nError: Unsupported query_data extension: {extension}')
-            sys.exit()
-        unique_query_ids = df_query.iloc[:, 0].unique()
-
-    if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
-        sys.exit()
-    else:
-        if isinstance(reference_data, str):
-            df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:, 0].unique()
-        else:
-            dfs = []
-            unique_reference_ids = []
-            for f in reference_data:
-                tmp = get_reference_df(reference_data=f)
-                dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
-            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
-
-    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
-          f'{len(unique_reference_ids)} unique reference spectra, and '
-          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
-
-    if output_path is None:
-        output_path = f'{Path.cwd()}/tuning_param_output.txt'
-        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
-
-    param_grid = product(
-        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
-        noise_threshold, wf_mz, wf_int, LET_threshold,
-        entropy_dimension, high_quality_reference_library
-    )
-
-    results = []
-    total = (
-        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
-        len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
-    )
-    done = 0
-    for params in param_grid:
-        res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
-        results.append(res)
-        done += 1
-        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
+def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
 
-
-
-
-    ]
+    n_top_matches_to_save = 1
+    unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
+    unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
+    all_similarity_rows = []
 
-
-
-
-            .str.replace("\"","",regex=False)
-            .str.replace("{","",regex=False)
-            .str.replace("}","",regex=False)
-            .str.replace(":","",regex=False)
-            .str.replace("Cosine","",regex=False)
-            .str.replace("Shannon","",regex=False)
-            .str.replace("Renyi","",regex=False)
-            .str.replace("Tsallis","",regex=False)
-            .str.replace(" ","",regex=False)
-        )
-
-    if return_output:
-        return df_out
-    else:
-        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
-        print(f'Wrote results to {output_path}')
+    for query_idx, qid in enumerate(unique_query_ids):
+        if verbose:
+            print(f'query spectrum #{query_idx} is being identified')
 
+        q_mask = (df_query['id'] == qid)
+        q_idxs = np.where(q_mask)[0]
+        if q_idxs.size == 0:
+            all_similarity_rows.append([0.0]*len(unique_reference_ids))
+            continue
 
+        q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))
 
+        if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
+            precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
+            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
+        else:
+            df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()
 
-
+        if df_reference_tmp.empty:
+            all_similarity_rows.append([0.0]*len(unique_reference_ids))
+            continue
 
-
+        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
 
-
-    for query_idx in range(0,len(unique_query_ids)):
-        if verbose is True:
-            print(f'query spectrum #{query_idx} is being identified')
-        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
-        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
-        #q_spec_tmp = q_spec_tmp.astype(float)
+        similarity_by_ref = {}
 
-
-
-
-            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
-            r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
-            #print(r_spec)
-            #r_spec = r_spec.astype(float)
+        for ref_id, r_df in ref_groups.items():
+            q_spec = q_spec_base.copy()
+            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
 
             is_matched = False
             for transformation in spectrum_preprocessing_order:
-                if np.isinf(q_spec[:,1]).sum() > 0:
-                    q_spec[:,1] = np.zeros(q_spec.shape[0])
-                if np.isinf(r_spec[:,1]).sum() > 0:
-                    r_spec[:,1] = np.zeros(r_spec.shape[0])
-
-
-
-
-
-
-
+                if np.isinf(q_spec[:, 1]).any():
+                    q_spec[:, 1] = 0.0
+                if np.isinf(r_spec[:, 1]).any():
+                    r_spec[:, 1] = 0.0
+
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
+                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
+
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    m_spec = match_peaks_in_spectra(
+                        spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
+                    )
+                    if m_spec.size == 0:
+                        q_spec = np.empty((0,2))
+                        r_spec = np.empty((0,2))
+                    else:
+                        q_spec = m_spec[:, 0:2]
+                        r_spec = m_spec[:, [0, 2]]
                     is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
-                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
-                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = remove_noise(q_spec, nr = noise_threshold)
-                    if high_quality_reference_library == False:
-                        r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
-                    if high_quality_reference_library == False:
-                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
 
-
-
-
-
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
+                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)
+
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
+                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')
+
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = remove_noise(q_spec, nr=noise_threshold)
+                    if not high_quality_reference_library:
+                        r_spec = remove_noise(r_spec, nr=noise_threshold)
+
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = filter_spec_lcms(
+                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                    )
+                    if not high_quality_reference_library:
+                        r_spec = filter_spec_lcms(
+                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                        )
+
+            if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                q_ints = q_spec[:, 1]
+                r_ints = r_spec[:, 1]
+                if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
+                    sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
+                else:
+                    sim = 0.0
             else:
-
+                sim = 0.0
 
-
-            all_similarity_scores.append(similarity_scores)
+            similarity_by_ref[str(ref_id)] = float(sim)
 
-
-
-    df_scores.index.names = ['Query Spectrum ID']
+        row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
+        all_similarity_rows.append(row)
 
-
-
-    for i in range(0, df_scores.shape[0]):
-        df_scores_tmp = df_scores
-        preds_tmp = []
-        scores_tmp = []
-        for j in range(0, n_top_matches_to_save):
-            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
-            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
-            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
-
-            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
-            if len(top_ref_specs_tmp.values) == 0:
-                scores_tmp.append(0)
-            else:
-                scores_tmp.append(top_ref_specs_tmp.values[0])
-        preds.append(preds_tmp)
-        scores.append(scores_tmp)
+    df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
+    df_scores.index.name = 'QUERY.SPECTRUM.ID'
 
-
-
-
-    df_tmp = pd.DataFrame(
-
+    top_idx = df_scores.values.argmax(axis=1)
+    top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
+    top_ids = [df_scores.columns[i] for i in top_idx]
+    df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
+    #if verbose:
+    #    print(df_tmp)
+    acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
     return acc
 
 
-
-
 def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
 
     n_top_matches_to_save = 1
 
-    min_mz = int(np.min([np.min(df_query
-    max_mz = int(np.max([np.max(df_query
+    min_mz = int(np.min([np.min(df_query['mz_ratio']), np.min(df_reference['mz_ratio'])]))
+    max_mz = int(np.max([np.max(df_query['mz_ratio']), np.max(df_reference['mz_ratio'])]))
     mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
 
     all_similarity_scores = []
     for query_idx in range(0,len(unique_query_ids)):
-        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
-        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
+        q_idxs_tmp = np.where(df_query['id'] == unique_query_ids[query_idx])[0]
+        q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
         q_spec_tmp = convert_spec(q_spec_tmp,mzs)
 
         similarity_scores = []
         for ref_idx in range(0,len(unique_reference_ids)):
            q_spec = q_spec_tmp
-            if verbose is True and ref_idx % 1000 == 0:
-                print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
-            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
-            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
+            #if verbose is True and ref_idx % 1000 == 0:
+            #    print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
+            r_idxs_tmp = np.where(df_reference['id'] == unique_reference_ids[ref_idx])[0]
+            r_spec_tmp = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)
 
            for transformation in spectrum_preprocessing_order:
@@ -713,7 +505,7 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
 
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
-    df_scores.index.names = ['Query Spectrum ID']
+    df_scores.index.names = ['QUERY.SPECTRUM.ID']
 
     preds = []
     scores = []
@@ -738,69 +530,45 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
     scores = np.array(scores)
     out = np.c_[unique_query_ids,preds,scores]
     df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
+    #if verbose:
+    #    print(df_tmp)
     acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
     return acc
 
 
 
-def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
-    '''
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: either string or list of strings with pass to mgf, mzML, sdf, and/or csv file(s) of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
-    --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
-    --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
-    --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of HRMS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL')
-    --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
-    --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
-    --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
-    --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
-    --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
-    --window_size_centroiding: Window size parameter used in centroiding a given spectrum. Default: 0.5
-    --window_size_matching: Window size parameter used in matching a query spectrum and a reference library spectrum. Default: 0.5
-    --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
-    --wf_mz: Mass/charge weight factor parameter. Default: 0.0
-    --wf_intensity: Intensity weight factor parameter. Default: 0.0
-    --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
-    --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
-    --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
-    --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
-    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
-    '''
-
+def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + 'csv'
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'json' or extension == 'JSON' or extension == 'msp' or extension == 'MSP':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == 'csv' or extension == 'CSV':
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query.iloc[:,0].unique()
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()
 
     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data,likely_reference_ids)
-            unique_reference_ids = df_reference.iloc[:,0].unique()
         else:
             dfs = []
-            unique_reference_ids = []
             for f in reference_data:
                 tmp = get_reference_df(f,likely_reference_ids)
                 dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:,0].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
+    if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
+        df_reference = df_reference.loc[df_reference['adduct']==adduct]
 
     if spectrum_preprocessing_order is not None:
         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
@@ -888,62 +656,91 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
         print(f'Warning: writing similarity scores to {output_similarity_scores}')


-
-
-
+    unique_reference_ids = df_reference['id'].unique().tolist()
+    all_similarity_scores = []
+
+    for query_idx in range(len(unique_query_ids)):
+        if verbose:
             print(f'query spectrum #{query_idx} is being identified')
-        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
-        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))

-
-
-
-
-
+        q_mask = (df_query['id'] == unique_query_ids[query_idx])
+        q_idxs_tmp = np.where(q_mask)[0]
+        q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
+
+        if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
+            precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
+            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
+        else:
+            df_reference_tmp = df_reference.copy()
+
+        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
+        unique_reference_ids_tmp = list(ref_groups.keys())
+
+        similarity_by_ref = {}
+        for ref_id in unique_reference_ids_tmp:
+            q_spec = q_spec_tmp.copy()
+            r_df = ref_groups[ref_id]
+            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
+            #print('\nhere!!!!!!!!!!!!!!!')
+            #print(r_spec)

             is_matched = False
+
             for transformation in spectrum_preprocessing_order:
-                if np.isinf(q_spec[:,1]).sum() > 0:
-                    q_spec[:,1] = np.zeros(q_spec.shape[0])
-                if np.isinf(r_spec[:,1]).sum() > 0:
-                    r_spec[:,1] = np.zeros(r_spec.shape[0])
-
-
-
-
+                if np.isinf(q_spec[:, 1]).sum() > 0:
+                    q_spec[:, 1] = np.zeros(q_spec.shape[0])
+                if np.isinf(r_spec[:, 1]).sum() > 0:
+                    r_spec[:, 1] = np.zeros(r_spec.shape[0])
+
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
+                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
+
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                     m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
-                    q_spec = m_spec[:,0:2]
-                    r_spec = m_spec[:,[0,2]]
+                    q_spec = m_spec[:, 0:2]
+                    r_spec = m_spec[:, [0, 2]]
                     is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
-                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = remove_noise(q_spec, nr = noise_threshold)
-                    if high_quality_reference_library == False:
-                        r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
-                    if high_quality_reference_library == False:
-                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
-
-            q_ints = q_spec[:,1]
-            r_ints = r_spec[:,1]

-
-
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
+                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)
+
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
+                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)
+
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = remove_noise(q_spec, nr=noise_threshold)
+                    if not high_quality_reference_library:
+                        r_spec = remove_noise(r_spec, nr=noise_threshold)
+
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = filter_spec_lcms(
+                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                    )
+                    if not high_quality_reference_library:
+                        r_spec = filter_spec_lcms(
+                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                        )
+
+            q_ints = q_spec[:, 1]
+            r_ints = r_spec[:, 1]
+
+            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
             else:
-
+                sim = 0.0

-
-        all_similarity_scores.append(similarity_scores)
+            similarity_by_ref[ref_id] = sim

-
+        row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
+        all_similarity_scores.append(row_scores)
+
+    df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
     df_scores.index = unique_query_ids
-    df_scores.index.names = ['
+    df_scores.index.names = ['QUERY.SPECTRUM.ID']
+

     preds = []
     scores = []
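The added loop above narrows the candidate reference spectra to a precursor-ion m/z window and then groups the surviving peaks by compound ID before scoring. A standalone sketch of that pattern, reusing the column names from the diff ('id', 'precursor_ion_mz', 'mz_ratio', 'intensity') with invented toy data and an invented tolerance:

import pandas as pd

df_reference = pd.DataFrame({
    'id': ['A', 'A', 'B', 'B'],                       # toy compound IDs
    'precursor_ion_mz': [180.063, 180.063, 255.232, 255.232],
    'mz_ratio': [85.03, 127.04, 97.10, 209.19],
    'intensity': [120.0, 455.0, 80.0, 300.0],
})

query_precursor_mz = 180.060                          # invented query precursor m/z
tolerance = 0.01                                      # invented precursor_ion_mz_tolerance

# Keep only reference peaks whose precursor m/z lies inside the +/- tolerance window.
in_window = df_reference['precursor_ion_mz'].between(
    query_precursor_mz - tolerance, query_precursor_mz + tolerance, inclusive='both'
)
candidates = df_reference.loc[in_window, ['id', 'mz_ratio', 'intensity']]

# Group the surviving peaks by compound ID so each candidate spectrum is scored once.
ref_groups = dict(tuple(candidates.groupby('id', sort=False)))
for ref_id, r_df in ref_groups.items():
    r_spec = r_df[['mz_ratio', 'intensity']].to_numpy()
    print(ref_id, r_spec.shape)                       # here 'A' survives the window, 'B' does not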
@@ -976,7 +773,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik

     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
-    df_top_ref_specs.index.names = ['
+    df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']

     df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]

@@ -993,33 +790,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik



-def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False):
-    '''
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data
-
-    --query_data: cdf or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: cdf of csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
-    --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
-    --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
-    --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-4 characters chosen from F, N, L, W representing filtering based on mass/charge and intensity values, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WN\' is passed, then each spectrum will undergo a weight factor transformation and then noise removal. Default: FNLW')
-    --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
-    --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
-    --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
-    --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
-    --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
-    --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
-    --wf_mz: Mass/charge weight factor parameter. Default: 0.0
-    --wf_intensity: Intensity weight factor parameter. Default: 0.0
-    --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
-    --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
-    --normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since the objects entropy quantifies the uncertainy of must be probability distributions, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
-    --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
-    --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
-    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
-    '''
-
+def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
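The docstring removed above notes that the weights dict is only consulted when similarity_measure='mixture' is selected, with the four non-binary measures weighted 0.25 each by default. A hedged sketch of overriding those weights (the keys follow the defaults in the signatures above; the values and file paths are arbitrary examples, not recommended settings):

custom_weights = {'Cosine': 0.4, 'Shannon': 0.3, 'Renyi': 0.2, 'Tsallis': 0.1}

# run_spec_lib_matching_on_NRMS_data(query_data='query_library.txt',         # hypothetical path
#                                    reference_data='reference_library.txt',  # hypothetical path
#                                    similarity_measure='mixture',
#                                    weights=custom_weights)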
@@ -1027,12 +798,12 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
         if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()

     if reference_data is None:
         print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
@@ -1040,14 +811,14 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data,likely_reference_ids)
-            unique_reference_ids = df_reference
+            unique_reference_ids = df_reference['id'].unique()
         else:
             dfs = []
             unique_reference_ids = []
             for f in reference_data:
                 tmp = get_reference_df(f,likely_reference_ids)
                 dfs.append(tmp)
-                unique_reference_ids.extend(tmp
+                unique_reference_ids.extend(tmp['id'].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)


@@ -1123,23 +894,23 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik



-    min_mz = int(np.min([np.min(df_query
-    max_mz = int(np.max([np.max(df_query
+    min_mz = int(np.min([np.min(df_query['mz_ratio']), np.min(df_reference['mz_ratio'])]))
+    max_mz = int(np.max([np.max(df_query['mz_ratio']), np.max(df_reference['mz_ratio'])]))
     mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

     all_similarity_scores = []
     for query_idx in range(0,len(unique_query_ids)):
-        q_idxs_tmp = np.where(df_query
-        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp
+        q_idxs_tmp = np.where(df_query['id'] == unique_query_ids[query_idx])[0]
+        q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
         q_spec_tmp = convert_spec(q_spec_tmp,mzs)

         similarity_scores = []
         for ref_idx in range(0,len(unique_reference_ids)):
-            if verbose is True and ref_idx % 1000 == 0:
-
+            #if verbose is True and ref_idx % 1000 == 0:
+            #    print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
             q_spec = q_spec_tmp
-            r_idxs_tmp = np.where(df_reference
-            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp
+            r_idxs_tmp = np.where(df_reference['id'] == unique_reference_ids[ref_idx])[0]
+            r_spec_tmp = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
             r_spec = convert_spec(r_spec_tmp,mzs)

             for transformation in spectrum_preprocessing_order:
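The NRMS hunk above places every spectrum onto a shared integer m/z grid (np.linspace from min_mz to max_mz) before comparison; the convert_spec helper itself is not part of this diff. A rough stand-in using plain NumPy and invented spectra, only to show why the shared grid makes two spectra directly comparable:

import numpy as np

def bin_to_grid(spec, mzs):
    # Sum each (mz, intensity) row into the nearest integer m/z bin of the shared grid.
    grid = np.zeros(len(mzs))
    offset = int(mzs[0])
    for mz, intensity in spec:
        idx = int(round(mz)) - offset
        if 0 <= idx < len(mzs):
            grid[idx] += intensity
    return grid

query_spec = np.array([[41.0, 10.0], [43.0, 55.0], [57.0, 100.0]])   # invented nominal-mass peaks
ref_spec = np.array([[43.0, 60.0], [57.0, 95.0], [71.0, 20.0]])

min_mz = int(min(query_spec[:, 0].min(), ref_spec[:, 0].min()))
max_mz = int(max(query_spec[:, 0].max(), ref_spec[:, 0].max()))
mzs = np.linspace(min_mz, max_mz, (max_mz - min_mz + 1))

q = bin_to_grid(query_spec, mzs)
r = bin_to_grid(ref_spec, mzs)
print(q.shape == r.shape)   # True: equal-length vectors can be fed to a similarity measure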
@@ -1175,7 +946,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik

     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
-    df_scores.index.names = ['
+    df_scores.index.names = ['QUERY.SPECTRUM.ID']

     preds = []
     scores = []
@@ -1208,7 +979,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik

     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
-    df_top_ref_specs.index.names = ['
+    df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']

     if print_id_results == True:
         print(df_top_ref_specs.to_string())
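For reference, the score tables produced above carry a 'QUERY.SPECTRUM.ID' index and, for the full score matrix, 'Reference Spectrum ID: ...' column labels. A toy sketch of that layout with invented IDs and scores, only to show how a top match could be read off such a table:

import pandas as pd

df_scores = pd.DataFrame(
    [[0.91, 0.12], [0.08, 0.87]],                                     # invented similarity scores
    index=pd.Index(['query_1', 'query_2'], name='QUERY.SPECTRUM.ID'),
    columns=['Reference Spectrum ID: caffeine', 'Reference Spectrum ID: glucose'],
)
print(df_scores.idxmax(axis=1))   # highest-scoring reference per query spectrum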