pycompound 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +2772 -243
- pycompound/build_library.py +77 -20
- pycompound/plot_spectra.py +1 -1
- pycompound/processing.py +5 -5
- pycompound/spec_lib_matching.py +265 -502
- pycompound/spec_lib_matching_CLI.py +48 -2
- pycompound/tuning_CLI_DE.py +22 -22
- pycompound/tuning_CLI_grid.py +22 -6
- {pycompound-0.1.5.dist-info → pycompound-0.1.7.dist-info}/METADATA +1 -1
- pycompound-0.1.7.dist-info/RECORD +15 -0
- {pycompound-0.1.5.dist-info → pycompound-0.1.7.dist-info}/top_level.txt +0 -1
- app2.py +0 -101
- pycompound-0.1.5.dist-info/RECORD +0 -16
- {pycompound-0.1.5.dist-info → pycompound-0.1.7.dist-info}/WHEEL +0 -0
- {pycompound-0.1.5.dist-info → pycompound-0.1.7.dist-info}/licenses/LICENSE +0 -0
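The largest change is in pycompound/spec_lib_matching.py (diff below): the HRMS tuning and matching functions gain precursor_ion_mz_tolerance, ionization_mode, and adduct arguments, and tune_params_DE additionally gains chromatography_platform, maxiters, and de_workers. As a rough orientation only, a hypothetical call using the new tune_params_DE signature visible in the diff might look like the sketch below; the file paths, tolerance, ionization mode, and adduct values are illustrative placeholders, and the function's return value is not asserted because it is not shown in these hunks.

```python
# Hypothetical sketch based on the 0.1.7 tune_params_DE signature shown in the
# diff below; every value here is a placeholder, not something shipped with pycompound.
from pycompound.spec_lib_matching import tune_params_DE

tune_params_DE(
    query_data='query_spectra.txt',            # tab-separated TXT library of query spectra
    reference_data='reference_library.txt',    # single path or list of paths
    chromatography_platform='HRMS',            # 'HRMS' optimizes objective_function_HRMS, otherwise objective_function_NRMS
    precursor_ion_mz_tolerance=0.01,           # restricts candidate reference spectra by precursor m/z
    ionization_mode='positive',                # filters the reference library if that column exists
    adduct='M+H',                              # likewise filtered if an 'adduct' column exists
    optimize_params=['noise_threshold', 'wf_mz', 'wf_int'],
    maxiters=3,                                # forwarded to scipy's differential_evolution as maxiter
    de_workers=1,                              # any value other than 1 switches updating to 'deferred'
)
```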
pycompound/spec_lib_matching.py
CHANGED
@@ -21,46 +21,40 @@ def _vector_to_full_params(X, default_params, optimize_params):
 
 
 def objective_function_HRMS(X, ctx):
     p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
-
-
-
-
-
-
-
-
-
-
-
-
-
-    acc = get_acc_NRMS(
-        ctx["df_query"], ctx["df_reference"],
-        ctx["unique_query_ids"], ctx["unique_reference_ids"],
-        ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
-        ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
-        p["noise_threshold"],
-        p["wf_mz"], p["wf_int"], p["LET_threshold"],
-        p["entropy_dimension"],
-        ctx["high_quality_reference_library"],
-        verbose=False
-    )
+    acc = get_acc_HRMS(
+        ctx["df_query"],
+        ctx["df_reference"],
+        ctx["precursor_ion_mz_tolerance"],
+        ctx["ionization_mode"], ctx["adduct"],
+        ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
+        ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
+        p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
+        p["wf_mz"], p["wf_int"], p["LET_threshold"],
+        p["entropy_dimension"],
+        ctx["high_quality_reference_library"],
+        verbose=False
+    )
     print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
     return 1.0 - acc
 
 
+def objective_function_NRMS(X, ctx):
+    p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
+    acc = get_acc_NRMS(
+        ctx["df_query"], ctx["df_reference"], ctx['unique_query_ids'], ctx['unique_reference_ids'],
+        ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
+        ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
+        p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
+        ctx["high_quality_reference_library"],
+        verbose=False
+    )
+    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
+    return 1.0 - acc
+
 
 
-def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}):
 
-
-    print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
-    print(param_bounds)
-    print(default_params)
-    print(type(param_bounds['noise_threshold'][0]))
-    print(type(param_bounds['noise_threshold'][1]))
-    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
-    '''
+def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1):
 
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
@@ -68,21 +62,19 @@ def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cos
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query.iloc[:,0].unique()
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
 
     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:,0].unique()
         else:
             dfs = []
             unique_reference_ids = []
@@ -92,6 +84,11 @@ def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cos
                 unique_reference_ids.extend(tmp.iloc[:,0].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
+    if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
+        df_reference = df_reference.loc[df_reference['adduct']==adduct]
+
     unique_query_ids = df_query['id'].unique().tolist()
     unique_reference_ids = df_reference['id'].unique().tolist()
 
@@ -100,6 +97,9 @@ def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cos
         df_reference=df_reference,
         unique_query_ids=unique_query_ids,
         unique_reference_ids=unique_reference_ids,
+        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
+        ionization_mode=ionization_mode,
+        adduct=adduct,
         similarity_measure=similarity_measure,
         weights=weights,
         spectrum_preprocessing_order=spectrum_preprocessing_order,
@@ -111,22 +111,10 @@ def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cos
 
     bounds = [param_bounds[p] for p in optimize_params]
 
-
-
-
-
-    #print(ctx)
-    #print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
-
-    result = differential_evolution(
-        objective_function_HRMS,
-        bounds=bounds,
-        args=(ctx,),
-        maxiter=3,
-        tol=0.0,
-        workers=-1,
-        seed=1,
-    )
+    if chromatography_platform == 'HRMS':
+        result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
+    else:
+        result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
 
     best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
     best_acc = 100.0 - (result.fun * 100.0)
@@ -144,11 +132,13 @@ def tune_params_DE(query_data=None, reference_data=None, similarity_measure='cos
 
 
 
+
 default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
 default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
 
 
-def _eval_one_HRMS(df_query, df_reference,
+def _eval_one_HRMS(df_query, df_reference,
+                   precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
                    similarity_measure_tmp, weight,
                    spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                    int_min_tmp, int_max_tmp, noise_threshold_tmp,
@@ -158,7 +148,8 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
 
     acc = get_acc_HRMS(
         df_query=df_query, df_reference=df_reference,
-
+        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
+        ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
         similarity_measure=similarity_measure_tmp, weights=weight,
         spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
         mz_min=mz_min_tmp, mz_max=mz_max_tmp,
@@ -170,7 +161,7 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
         LET_threshold=LET_threshold_tmp,
         entropy_dimension=entropy_dimension_tmp,
         high_quality_reference_library=high_quality_reference_library_tmp,
-        verbose=
+        verbose=False
     )
 
     return (
@@ -201,6 +192,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
         LET_threshold=LET_threshold_tmp,
         entropy_dimension=entropy_dimension_tmp,
         high_quality_reference_library=high_quality_reference_library_tmp,
+        verbose=False
     )
 
     return (
@@ -211,16 +203,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
 
 
 
-def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
+def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
     grid = {**default_HRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
@@ -231,31 +214,37 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()
 
     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference
+            unique_reference_ids = df_reference['id'].unique()
         else:
             dfs = []
             unique_reference_ids = []
             for f in reference_data:
                 tmp = get_reference_df(reference_data=f)
                 dfs.append(tmp)
-                unique_reference_ids.extend(tmp
+                unique_reference_ids.extend(tmp['id'].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
-
+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode].copy()
+    if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
+        df_reference = df_reference.loc[df_reference['adduct']==adduct].copy()
+    unique_reference_ids_tmp2 = df_reference['id'].unique()
+
+    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids_tmp2))} of the query and reference spectra IDs are in common.\n')
 
     if output_path is None:
         output_path = f'{Path.cwd()}/tuning_param_output.txt'
@@ -263,7 +252,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
 
     param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold,
                          window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
-    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference,
+    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params) for params in param_grid)
 
     df_out = pd.DataFrame(results, columns=[
         'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX','NOISE.THRESHOLD',
@@ -287,124 +276,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
 
 
 
-def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible
-    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
-    and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
-    should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
-    other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
-    to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
-    compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
-    local_grid = {**default_HRMS_grid, **(grid or {})}
-    for key, value in local_grid.items():
-        globals()[key] = value
-
-    if query_data is None:
-        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
-        sys.exit()
-    else:
-        extension = query_data.rsplit('.', 1)[-1]
-        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
-            output_path_tmp = query_data[:-3] + 'csv'
-            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        elif extension in ('csv','CSV'):
-            df_query = pd.read_csv(query_data)
-        else:
-            print(f'\nError: Unsupported query_data extension: {extension}')
-            sys.exit()
-        unique_query_ids = df_query.iloc[:, 0].unique()
-
-    if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
-        sys.exit()
-    else:
-        if isinstance(reference_data, str):
-            df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:, 0].unique()
-        else:
-            dfs = []
-            unique_reference_ids = []
-            for f in reference_data:
-                tmp = get_reference_df(reference_data=f)
-                dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
-            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
-
-    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
-          f'{len(unique_reference_ids)} unique reference spectra, and '
-          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
-
-    if output_path is None:
-        output_path = f'{Path.cwd()}/tuning_param_output.txt'
-        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
-
-    param_grid = product(
-        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
-        noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
-        entropy_dimension, high_quality_reference_library
-    )
-
-    results = []
-    total = (
-        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
-        len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
-        len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
-        len(entropy_dimension) * len(high_quality_reference_library)
-    )
-    done = 0
-
-    for params in param_grid:
-        res = _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
-        results.append(res)
-        done += 1
-        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
-
-    df_out = pd.DataFrame(results, columns=[
-        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
-        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
-        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
-    ])
-
-    if 'WEIGHT' in df_out.columns:
-        df_out['WEIGHT'] = (
-            df_out['WEIGHT'].astype(str)
-            .str.replace("\"","",regex=False)
-            .str.replace("{","",regex=False)
-            .str.replace("}","",regex=False)
-            .str.replace(":","",regex=False)
-            .str.replace("Cosine","",regex=False)
-            .str.replace("Shannon","",regex=False)
-            .str.replace("Renyi","",regex=False)
-            .str.replace("Tsallis","",regex=False)
-            .str.replace(" ","",regex=False)
-        )
-
-    if return_output:
-        return df_out
-    else:
-        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
-        print(f'Wrote results to {output_path}')
-
-
 def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here
-    """
-
     grid = {**default_NRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
@@ -415,13 +287,13 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()
 
     if reference_data is None:
         print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
@@ -429,7 +301,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference
+            unique_reference_ids = df_reference['id'].unique()
         else:
             dfs = []
             unique_reference_ids = []
@@ -449,10 +321,8 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
                          noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
     results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
 
-    df_out = pd.DataFrame(results, columns=[
-
-        'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
-    ])
+    df_out = pd.DataFrame(results, columns=['ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
+                                            'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'])
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
@@ -462,6 +332,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
+
    if return_output is False:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
    else:
@@ -469,203 +340,116 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
 
 
 
-def
-    """
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible
-    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
-    and prints top-performing parameters
+def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
 
-
-
-
-
-    to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
-    compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
-    local_grid = {**default_NRMS_grid, **(grid or {})}
-    for key, value in local_grid.items():
-        globals()[key] = value
-
-    if query_data is None:
-        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
-        sys.exit()
-    else:
-        extension = query_data.rsplit('.', 1)[-1]
-        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
-            output_path_tmp = query_data[:-3] + 'csv'
-            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        elif extension in ('csv','CSV'):
-            df_query = pd.read_csv(query_data)
-        else:
-            print(f'\nError: Unsupported query_data extension: {extension}')
-            sys.exit()
-        unique_query_ids = df_query.iloc[:, 0].unique()
-
-    if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
-        sys.exit()
-    else:
-        if isinstance(reference_data, str):
-            df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:, 0].unique()
-        else:
-            dfs = []
-            unique_reference_ids = []
-            for f in reference_data:
-                tmp = get_reference_df(reference_data=f)
-                dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
-            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
-
-    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
-          f'{len(unique_reference_ids)} unique reference spectra, and '
-          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
-
-    if output_path is None:
-        output_path = f'{Path.cwd()}/tuning_param_output.txt'
-        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
-
-    param_grid = product(
-        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
-        noise_threshold, wf_mz, wf_int, LET_threshold,
-        entropy_dimension, high_quality_reference_library
-    )
-
-    results = []
-    total = (
-        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
-        len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
-    )
-    done = 0
-    for params in param_grid:
-        res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
-        results.append(res)
-        done += 1
-        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
-
-    df_out = pd.DataFrame(results, columns=[
-        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
-        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
-    ])
+    n_top_matches_to_save = 1
+    unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
+    unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
+    all_similarity_rows = []
 
-
-
-
-            .str.replace("\"","",regex=False)
-            .str.replace("{","",regex=False)
-            .str.replace("}","",regex=False)
-            .str.replace(":","",regex=False)
-            .str.replace("Cosine","",regex=False)
-            .str.replace("Shannon","",regex=False)
-            .str.replace("Renyi","",regex=False)
-            .str.replace("Tsallis","",regex=False)
-            .str.replace(" ","",regex=False)
-        )
-
-    if return_output:
-        return df_out
-    else:
-        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
-        print(f'Wrote results to {output_path}')
+    for query_idx, qid in enumerate(unique_query_ids):
+        if verbose:
+            print(f'query spectrum #{query_idx} is being identified')
 
+        q_mask = (df_query['id'] == qid)
+        q_idxs = np.where(q_mask)[0]
+        if q_idxs.size == 0:
+            all_similarity_rows.append([0.0]*len(unique_reference_ids))
+            continue
 
+        q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))
 
+        if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
+            precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
+            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
+        else:
+            df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()
 
-
+        if df_reference_tmp.empty:
+            all_similarity_rows.append([0.0]*len(unique_reference_ids))
+            continue
 
-
-    n_top_matches_to_save = 1
+        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
 
-
-    for query_idx in range(0,len(unique_query_ids)):
-        if verbose is True:
-            print(f'query spectrum #{query_idx} is being identified')
-        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
-        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
-        #q_spec_tmp = q_spec_tmp.astype(float)
+        similarity_by_ref = {}
 
-
-
-
-            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
-            r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
-            #print(r_spec)
-            #r_spec = r_spec.astype(float)
+        for ref_id, r_df in ref_groups.items():
+            q_spec = q_spec_base.copy()
+            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
 
             is_matched = False
             for transformation in spectrum_preprocessing_order:
-                if np.isinf(q_spec[:,1]).
-                    q_spec[:,1] =
-                if np.isinf(r_spec[:,1]).
-                    r_spec[:,1] =
-
-
-
-
-
-
-
+                if np.isinf(q_spec[:, 1]).any():
+                    q_spec[:, 1] = 0.0
+                if np.isinf(r_spec[:, 1]).any():
+                    r_spec[:, 1] = 0.0
+
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
+                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
+
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    m_spec = match_peaks_in_spectra(
+                        spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
+                    )
+                    if m_spec.size == 0:
+                        q_spec = np.empty((0,2))
+                        r_spec = np.empty((0,2))
+                    else:
+                        q_spec = m_spec[:, 0:2]
+                        r_spec = m_spec[:, [0, 2]]
                    is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
-                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
-                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = remove_noise(q_spec, nr = noise_threshold)
-                    if high_quality_reference_library == False:
-                        r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
-                    if high_quality_reference_library == False:
-                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
 
-
-
-
-
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
+                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)
+
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
+                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')
+
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = remove_noise(q_spec, nr=noise_threshold)
+                    if not high_quality_reference_library:
+                        r_spec = remove_noise(r_spec, nr=noise_threshold)
+
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = filter_spec_lcms(
+                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                    )
+                    if not high_quality_reference_library:
+                        r_spec = filter_spec_lcms(
+                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                        )
+
+            if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                q_ints = q_spec[:, 1]
+                r_ints = r_spec[:, 1]
+                if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
+                    sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
+                else:
+                    sim = 0.0
            else:
-
+                sim = 0.0
 
-
-        all_similarity_scores.append(similarity_scores)
+            similarity_by_ref[str(ref_id)] = float(sim)
 
-
-
-    df_scores.index.names = ['Query Spectrum ID']
+        row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
+        all_similarity_rows.append(row)
 
-
-
-    for i in range(0, df_scores.shape[0]):
-        df_scores_tmp = df_scores
-        preds_tmp = []
-        scores_tmp = []
-        for j in range(0, n_top_matches_to_save):
-            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
-            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
-            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
+    df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
+    df_scores.index.name = 'QUERY.SPECTRUM.ID'
 
-
-
-
-            else:
-                scores_tmp.append(top_ref_specs_tmp.values[0])
-        preds.append(preds_tmp)
-        scores.append(scores_tmp)
-
-    preds = np.array(preds)
-    scores = np.array(scores)
-    out = np.c_[unique_query_ids,preds,scores]
-    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
-    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
-    return acc
+    top_idx = df_scores.values.argmax(axis=1)
+    top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
+    top_ids = [df_scores.columns[i] for i in top_idx]
 
+    df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
+    if verbose:
+        print(df_tmp)
 
+    acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
+    return acc
 
 
 def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
@@ -724,7 +508,7 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
 
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
-    df_scores.index.names = ['
+    df_scores.index.names = ['QUERY.SPECTRUM.ID']
 
     preds = []
     scores = []
@@ -754,64 +538,40 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
 
 
 
-def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
-    '''
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: either string or list of strings with pass to mgf, mzML, sdf, and/or csv file(s) of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
-    --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
-    --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
-    --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of HRMS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL')
-    --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
-    --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
-    --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
-    --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
-    --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
-    --window_size_centroiding: Window size parameter used in centroiding a given spectrum. Default: 0.5
-    --window_size_matching: Window size parameter used in matching a query spectrum and a reference library spectrum. Default: 0.5
-    --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
-    --wf_mz: Mass/charge weight factor parameter. Default: 0.0
-    --wf_intensity: Intensity weight factor parameter. Default: 0.0
-    --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
-    --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
-    --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
-    --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
-    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
-    '''
-
+def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'json' or extension == 'JSON' or extension == 'msp' or extension == 'MSP':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()
 
     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data,likely_reference_ids)
-            unique_reference_ids = df_reference.iloc[:,0].unique()
        else:
            dfs = []
-            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(f,likely_reference_ids)
                dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:,0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
+    if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
+        df_reference = df_reference.loc[df_reference['adduct']==adduct]
+
+    print(df_reference.loc[df_reference['id']=='Hectochlorin M+H'])
 
     if spectrum_preprocessing_order is not None:
         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
@@ -899,62 +659,91 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
|
|
|
899
659
|
print(f'Warning: writing similarity scores to {output_similarity_scores}')
|
|
900
660
|
|
|
901
661
|
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
662
|
+
unique_reference_ids = df_reference['id'].unique().tolist()
|
|
663
|
+
all_similarity_scores = []
|
|
664
|
+
|
|
665
|
+
for query_idx in range(len(unique_query_ids)):
|
|
666
|
+
if verbose:
|
|
905
667
|
print(f'query spectrum #{query_idx} is being identified')
|
|
906
|
-
q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
|
|
907
|
-
q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
908
668
|
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
669
|
+
q_mask = (df_query['id'] == unique_query_ids[query_idx])
|
|
670
|
+
q_idxs_tmp = np.where(q_mask)[0]
|
|
671
|
+
q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
|
|
672
|
+
|
|
673
|
+
if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
|
|
674
|
+
precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
|
|
675
|
+
df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
|
|
676
|
+
else:
|
|
677
|
+
df_reference_tmp = df_reference.copy()
|
|
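The branch above pre-filters the reference library to compounds whose precursor ion m/z falls within +/- precursor_ion_mz_tolerance of the query's precursor, using Series.between with inclusive='both'. A self-contained sketch of that windowing step on invented values:

import pandas as pd

df_reference = pd.DataFrame({
    'id': ['ref_A', 'ref_B'],
    'precursor_ion_mz': [301.10, 455.30],
    'mz_ratio': [120.0, 180.0],
    'intensity': [40.0, 60.0],
})

precursor_ion_mz_tmp = 301.12      # precursor m/z of the current query spectrum
precursor_ion_mz_tolerance = 0.05  # +/- window

# Keep only reference rows whose precursor falls inside the tolerance window.
window = df_reference['precursor_ion_mz'].between(
    precursor_ion_mz_tmp - precursor_ion_mz_tolerance,
    precursor_ion_mz_tmp + precursor_ion_mz_tolerance,
    inclusive='both',
)
df_reference_tmp = df_reference.loc[window, ['id', 'mz_ratio', 'intensity']].copy()
print(df_reference_tmp)  # only ref_A remains in this toy example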
678
|
+
|
|
679
|
+
ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
|
|
680
|
+
unique_reference_ids_tmp = list(ref_groups.keys())
|
|
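Grouping the (possibly pre-filtered) reference table once per query avoids re-filtering it inside the per-compound loop; dict(tuple(groupby(...))) yields one peak table per reference ID in first-appearance order. A small sketch on toy data:

import pandas as pd

df_reference_tmp = pd.DataFrame({
    'id': ['ref_A', 'ref_A', 'ref_B'],
    'mz_ratio': [120.0, 130.0, 180.0],
    'intensity': [40.0, 10.0, 60.0],
})

# One DataFrame of peaks per reference compound, keyed by its ID (sort=False keeps input order).
ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
unique_reference_ids_tmp = list(ref_groups.keys())
print(unique_reference_ids_tmp)                                   # ['ref_A', 'ref_B']
print(ref_groups['ref_A'][['mz_ratio', 'intensity']].to_numpy())  # [[120. 40.] [130. 10.]]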
681
|
+
|
|
682
|
+
similarity_by_ref = {}
|
|
683
|
+
for ref_id in unique_reference_ids_tmp:
|
|
684
|
+
q_spec = q_spec_tmp.copy()
|
|
685
|
+
r_df = ref_groups[ref_id]
|
|
686
|
+
r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
|
|
687
|
+
#print('\nhere!!!!!!!!!!!!!!!')
|
|
688
|
+
#print(r_spec)
|
|
914
689
|
|
|
915
690
|
is_matched = False
|
|
691
|
+
|
|
916
692
|
for transformation in spectrum_preprocessing_order:
|
|
917
|
-
if np.isinf(q_spec[:,1]).sum() > 0:
|
|
918
|
-
q_spec[:,1] = np.zeros(q_spec.shape[0])
|
|
919
|
-
if np.isinf(r_spec[:,1]).sum() > 0:
|
|
920
|
-
r_spec[:,1] = np.zeros(r_spec.shape[0])
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
693
|
+
if np.isinf(q_spec[:, 1]).sum() > 0:
|
|
694
|
+
q_spec[:, 1] = np.zeros(q_spec.shape[0])
|
|
695
|
+
if np.isinf(r_spec[:, 1]).sum() > 0:
|
|
696
|
+
r_spec[:, 1] = np.zeros(r_spec.shape[0])
|
|
697
|
+
|
|
698
|
+
if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
|
|
699
|
+
q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
|
|
700
|
+
r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
|
|
701
|
+
|
|
702
|
+
if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
|
|
925
703
|
m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
|
|
926
|
-
q_spec = m_spec[:,0:2]
|
|
927
|
-
r_spec = m_spec[:,[0,2]]
|
|
704
|
+
q_spec = m_spec[:, 0:2]
|
|
705
|
+
r_spec = m_spec[:, [0, 2]]
|
|
928
706
|
is_matched = True
|
|
929
|
-
if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
930
|
-
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
931
|
-
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
932
|
-
if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
933
|
-
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
934
|
-
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
935
|
-
if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
936
|
-
q_spec = remove_noise(q_spec, nr = noise_threshold)
|
|
937
|
-
if high_quality_reference_library == False:
|
|
938
|
-
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
939
|
-
if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
940
|
-
q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
|
|
941
|
-
if high_quality_reference_library == False:
|
|
942
|
-
r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
|
|
943
707
|
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
708
|
+
if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
|
|
709
|
+
q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
|
|
710
|
+
r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)
|
|
711
|
+
|
|
712
|
+
if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
|
|
713
|
+
q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
|
|
714
|
+
r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)
|
|
715
|
+
|
|
716
|
+
if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
|
|
717
|
+
q_spec = remove_noise(q_spec, nr=noise_threshold)
|
|
718
|
+
if not high_quality_reference_library:
|
|
719
|
+
r_spec = remove_noise(r_spec, nr=noise_threshold)
|
|
720
|
+
|
|
721
|
+
if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
|
|
722
|
+
q_spec = filter_spec_lcms(
|
|
723
|
+
q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
|
|
724
|
+
)
|
|
725
|
+
if not high_quality_reference_library:
|
|
726
|
+
r_spec = filter_spec_lcms(
|
|
727
|
+
r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
|
|
728
|
+
)
|
|
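Note that the rewritten guards test q_spec.shape[0] and r_spec.shape[0] (the number of peaks), whereas the removed lines tested r_spec.shape[1], which is the column count and is always 2 for these two-column (m/z, intensity) arrays. A tiny illustration of the difference:

import numpy as np

r_spec = np.array([[100.0, 50.0]])  # a single-peak spectrum, shape (1, 2)

print(r_spec.shape[1] > 1)  # True: the column count is always 2, so the old check never failed
print(r_spec.shape[0] > 1)  # False: the peak count is what the guard is meant to test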
729
|
+
|
|
730
|
+
q_ints = q_spec[:, 1]
|
|
731
|
+
r_ints = r_spec[:, 1]
|
|
732
|
+
|
|
733
|
+
if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
|
|
734
|
+
sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
|
|
949
735
|
else:
|
|
950
|
-
|
|
736
|
+
sim = 0.0
|
|
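get_similarity (defined elsewhere in this module) dispatches on similarity_measure; the guard above skips it and falls back to 0.0 whenever either spectrum has no intensity left after preprocessing. Purely as an illustration of why that fallback matters, a plain cosine score with the same zero-signal behaviour might look like this (a sketch, not the package's implementation):

import numpy as np

def cosine_similarity(q_ints, r_ints):
    # Mirror the fallback: no signal in either spectrum means similarity 0.0.
    q_norm, r_norm = np.linalg.norm(q_ints), np.linalg.norm(r_ints)
    if q_norm == 0 or r_norm == 0:
        return 0.0
    return float(np.dot(q_ints, r_ints) / (q_norm * r_norm))

print(cosine_similarity(np.array([1.0, 0.5, 0.0]), np.array([0.9, 0.4, 0.1])))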
951
737
|
|
|
952
|
-
|
|
953
|
-
all_similarity_scores.append(similarity_scores)
|
|
738
|
+
similarity_by_ref[ref_id] = sim
|
|
954
739
|
|
|
955
|
-
|
|
740
|
+
row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
|
|
741
|
+
all_similarity_scores.append(row_scores)
|
|
742
|
+
|
|
743
|
+
df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
|
|
956
744
|
df_scores.index = unique_query_ids
|
|
957
|
-
df_scores.index.names = ['
|
|
745
|
+
df_scores.index.names = ['QUERY.SPECTRUM.ID']
|
|
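Because the precursor pre-filter can shrink the candidate set for each query, the per-query scores are re-aligned to the full reference ID list with similarity_by_ref.get(ref_id, 0.0) before the matrix is assembled. A toy version of the resulting scores table:

import pandas as pd

unique_reference_ids = ['ref_A', 'ref_B', 'ref_C']
unique_query_ids = ['query_1', 'query_2']
all_similarity_scores = [
    [0.91, 0.12, 0.00],  # query_1: ref_C was filtered out, so it defaults to 0.0
    [0.05, 0.88, 0.47],  # query_2
]

df_scores = pd.DataFrame(all_similarity_scores,
                         index=unique_query_ids,
                         columns=unique_reference_ids)
df_scores.index.names = ['QUERY.SPECTRUM.ID']
print(df_scores)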
746
|
+
|
|
958
747
|
|
|
959
748
|
preds = []
|
|
960
749
|
scores = []
|
|
@@ -987,7 +776,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
|
|
|
987
776
|
|
|
988
777
|
df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
|
|
989
778
|
df_top_ref_specs.index = unique_query_ids
|
|
990
|
-
df_top_ref_specs.index.names = ['
|
|
779
|
+
df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
|
|
991
780
|
|
|
992
781
|
df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
|
|
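The df_top_ref_specs table above is built from helpers (out, cnames_preds, cnames_scores) defined outside this hunk. Independently of those names, the top-n reference matches per query spectrum can be pulled from a scores matrix like so (column labels here are illustrative, not the package's):

import pandas as pd

df_scores = pd.DataFrame(
    {'ref_A': [0.91, 0.05], 'ref_B': [0.12, 0.88], 'ref_C': [0.00, 0.47]},
    index=['query_1', 'query_2'],
)

n_top_matches_to_save = 2
rows = []
for query_id, row in df_scores.iterrows():
    top = row.nlargest(n_top_matches_to_save)  # highest-scoring reference IDs first
    rows.append({'QUERY.SPECTRUM.ID': query_id,
                 **{f'RANK.{i+1}.ID': ref for i, ref in enumerate(top.index)},
                 **{f'RANK.{i+1}.SCORE': s for i, s in enumerate(top.values)}})
print(pd.DataFrame(rows).set_index('QUERY.SPECTRUM.ID'))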
993
782
|
|
|
@@ -1004,33 +793,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
|
|
|
1004
793
|
|
|
1005
794
|
|
|
1006
795
|
|
|
1007
|
-
def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False):
|
|
1008
|
-
'''
|
|
1009
|
-
runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data
|
|
1010
|
-
|
|
1011
|
-
--query_data: cdf or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
|
|
1012
|
-
--reference_data: cdf of csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
|
|
1013
|
-
--likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
|
|
1014
|
-
--similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
|
|
1015
|
-
--weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
|
|
1016
|
-
--spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-4 characters chosen from F, N, L, W representing filtering based on mass/charge and intensity values, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WN\' is passed, then each spectrum will undergo a weight factor transformation and then noise removal. Default: FNLW')
|
|
1017
|
-
--high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
|
|
1018
|
-
--mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
|
|
1019
|
-
--mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
|
|
1020
|
-
--int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
|
|
1021
|
-
--int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
|
|
1022
|
-
--noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
|
|
1023
|
-
--wf_mz: Mass/charge weight factor parameter. Default: 0.0
|
|
1024
|
-
--wf_intensity: Intensity weight factor parameter. Default: 0.0
|
|
1025
|
-
--LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
|
|
1026
|
-
--entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
|
|
1027
|
-
--normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since the objects entropy quantifies the uncertainy of must be probability distributions, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
|
|
1028
|
-
--n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
|
|
1029
|
-
--print_id_results: Flag that prints identification results if True. Default: False
|
|
1030
|
-
--output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
|
|
1031
|
-
--output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
|
|
1032
|
-
'''
|
|
1033
|
-
|
|
796
|
+
def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
|
|
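The removed docstring above documented the arguments of run_spec_lib_matching_on_NRMS_data; the new signature keeps them and adds verbose. A hedged usage sketch built only from the keyword names visible in that signature (the module path and file names are assumptions):

from pycompound.spec_lib_matching import run_spec_lib_matching_on_NRMS_data

# Placeholder paths; the query may also be a raw cdf file, which the function converts first.
run_spec_lib_matching_on_NRMS_data(
    query_data='query_library.txt',
    reference_data='reference_library.txt',
    similarity_measure='cosine',
    spectrum_preprocessing_order='FNLW',
    n_top_matches_to_save=1,
    print_id_results=True,
    verbose=True,
)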
1034
797
|
if query_data is None:
|
|
1035
798
|
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
|
|
1036
799
|
sys.exit()
|
|
@@ -1038,11 +801,11 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
|
|
|
1038
801
|
extension = query_data.rsplit('.',1)
|
|
1039
802
|
extension = extension[(len(extension)-1)]
|
|
1040
803
|
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
|
|
1041
|
-
output_path_tmp = query_data[:-3] + '
|
|
804
|
+
output_path_tmp = query_data[:-3] + 'txt'
|
|
1042
805
|
build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
|
|
1043
|
-
df_query = pd.read_csv(output_path_tmp)
|
|
1044
|
-
if extension == '
|
|
1045
|
-
df_query = pd.read_csv(query_data)
|
|
806
|
+
df_query = pd.read_csv(output_path_tmp, sep='\t')
|
|
807
|
+
if extension == 'txt' or extension == 'TXT':
|
|
808
|
+
df_query = pd.read_csv(query_data, sep='\t')
|
|
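For context, the surrounding extension handling takes the last '.'-separated token via rsplit and then builds the temporary library path as query_data[:-3] + 'txt'; that slice only yields a name ending in '.txt' for three-character extensions, so a '.mzML' input is written to and read back from a file such as 'run1.mtxt' (consistent, but easy to misread). An alternative, more defensive sketch (not the package's code) using os.path.splitext:

import os

def query_txt_path(query_data):
    # Split the extension once, compare case-insensitively, and swap it for '.txt'
    # when the input is a raw-data format that first needs conversion.
    root, ext = os.path.splitext(query_data)
    if ext.lower() in ('.mgf', '.mzml', '.cdf'):
        return root + '.txt'
    return query_data  # already a text library

print(query_txt_path('run1.mzML'))  # run1.txt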
1046
809
|
unique_query_ids = df_query.iloc[:,0].unique()
|
|
1047
810
|
|
|
1048
811
|
if reference_data is None:
|
|
@@ -1186,7 +949,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
|
|
|
1186
949
|
|
|
1187
950
|
df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
|
|
1188
951
|
df_scores.index = unique_query_ids
|
|
1189
|
-
df_scores.index.names = ['
|
|
952
|
+
df_scores.index.names = ['QUERY.SPECTRUM.ID']
|
|
1190
953
|
|
|
1191
954
|
preds = []
|
|
1192
955
|
scores = []
|
|
@@ -1219,7 +982,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
|
|
|
1219
982
|
|
|
1220
983
|
df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
|
|
1221
984
|
df_top_ref_specs.index = unique_query_ids
|
|
1222
|
-
df_top_ref_specs.index.names = ['
|
|
985
|
+
df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
|
|
1223
986
|
|
|
1224
987
|
if print_id_results == True:
|
|
1225
988
|
print(df_top_ref_specs.to_string())
|