pycompound-0.1.6-py3-none-any.whl → pycompound-0.1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +2589 -237
- pycompound/build_library.py +77 -20
- pycompound/plot_spectra.py +1 -1
- pycompound/processing.py +5 -5
- pycompound/spec_lib_matching.py +245 -471
- pycompound/spec_lib_matching_CLI.py +48 -2
- pycompound/tuning_CLI_DE.py +22 -22
- pycompound/tuning_CLI_grid.py +22 -6
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/METADATA +1 -1
- pycompound-0.1.7.dist-info/RECORD +15 -0
- pycompound-0.1.6.dist-info/RECORD +0 -15
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/WHEEL +0 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/top_level.txt +0 -0
pycompound/spec_lib_matching.py
CHANGED
@@ -22,8 +22,10 @@ def _vector_to_full_params(X, default_params, optimize_params):
 def objective_function_HRMS(X, ctx):
     p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
     acc = get_acc_HRMS(
-        ctx["df_query"],
-        ctx["
+        ctx["df_query"],
+        ctx["df_reference"],
+        ctx["precursor_ion_mz_tolerance"],
+        ctx["ionization_mode"], ctx["adduct"],
         ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
         ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
         p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
@@ -35,11 +37,11 @@ def objective_function_HRMS(X, ctx):
     print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
     return 1.0 - acc

+
 def objective_function_NRMS(X, ctx):
     p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
     acc = get_acc_NRMS(
-        ctx["df_query"], ctx["df_reference"],
-        ctx["unique_query_ids"], ctx["unique_reference_ids"],
+        ctx["df_query"], ctx["df_reference"], ctx['unique_query_ids'], ctx['unique_reference_ids'],
         ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
         ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
         p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
@@ -51,16 +53,8 @@ def objective_function_NRMS(X, ctx):



-def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1, de_updating='immediate', log_hook=None):
-
-    def _log(msg):
-        if log_hook:
-            try: log_hook(msg if msg.endswith("\n") else msg + "\n")
-            except: pass

-
-        _log(f"iter callback: conv={conv:.4g}, x={xk}")
-        return False
+def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1):

     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
@@ -68,21 +62,19 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query.iloc[:,0].unique()
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')

     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:,0].unique()
         else:
             dfs = []
             unique_reference_ids = []
@@ -92,6 +84,11 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
                 unique_reference_ids.extend(tmp.iloc[:,0].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)

+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
+    if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
+        df_reference = df_reference.loc[df_reference['adduct']==adduct]
+
     unique_query_ids = df_query['id'].unique().tolist()
     unique_reference_ids = df_reference['id'].unique().tolist()

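Note on the reference-library filter added in this hunk: the new ionization_mode/adduct arguments are applied only when the corresponding column exists in the reference table and the caller passed a concrete value (not None or 'N/A'). A minimal, self-contained sketch of the same pandas pattern; the helper name and the toy frame below are illustrative, not part of pycompound:

    import pandas as pd

    def filter_reference(df_reference, ionization_mode=None, adduct=None):
        # Apply each filter only if the column is present and a concrete value was supplied.
        if 'ionization_mode' in df_reference.columns and ionization_mode not in (None, 'N/A'):
            df_reference = df_reference.loc[df_reference['ionization_mode'] == ionization_mode]
        if 'adduct' in df_reference.columns and adduct not in (None, 'N/A'):
            df_reference = df_reference.loc[df_reference['adduct'] == adduct]
        return df_reference

    df = pd.DataFrame({'id': ['a', 'b'],
                       'ionization_mode': ['positive', 'negative'],
                       'adduct': ['M+H', 'M-H']})
    print(filter_reference(df, ionization_mode='positive', adduct='M+H'))  # keeps only row 'a'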
@@ -100,6 +97,9 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
         df_reference=df_reference,
         unique_query_ids=unique_query_ids,
         unique_reference_ids=unique_reference_ids,
+        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
+        ionization_mode=ionization_mode,
+        adduct=adduct,
         similarity_measure=similarity_measure,
         weights=weights,
         spectrum_preprocessing_order=spectrum_preprocessing_order,
@@ -111,13 +111,10 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform

     bounds = [param_bounds[p] for p in optimize_params]

-    print('here!!!!!!!!!!!!!!!')
-    print(de_workers)
-    print('here!!!!!!!!!!!!!!!')
     if chromatography_platform == 'HRMS':
-        result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
+        result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
     else:
-        result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
+        result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')

     best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
     best_acc = 100.0 - (result.fun * 100.0)
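Background for the updating argument introduced above: scipy.optimize.differential_evolution only evaluates the population in parallel when updating='deferred'; if workers != 1 is combined with the default 'immediate', SciPy overrides it to 'deferred' and emits a warning. Deriving the value from de_workers keeps the call explicit. A minimal sketch of the same call pattern on a toy objective (the objective, bounds, and ctx here are illustrative, not pycompound's):

    import numpy as np
    from scipy.optimize import differential_evolution

    def objective(x, ctx):
        # toy objective: squared distance to a target point carried in ctx
        return float(np.sum((np.asarray(x) - ctx["target"]) ** 2))

    if __name__ == '__main__':
        ctx = {"target": np.array([0.1, 0.2])}
        bounds = [(0.0, 1.0), (0.0, 1.0)]
        de_workers = 2  # any value != 1 requires deferred updating

        result = differential_evolution(
            objective, bounds=bounds, args=(ctx,), maxiter=3, tol=0.0, seed=1,
            workers=de_workers,
            updating='deferred' if de_workers != 1 else 'immediate',
        )
        print(result.x, result.fun)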
@@ -131,14 +128,17 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
     for k, v in best_full_params.items():
         print(f" {k}: {v}")
     print(f"\nBest accuracy: {best_acc:.3f}%")
-
+
+
+


 default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
 default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}


-def _eval_one_HRMS(df_query, df_reference,
+def _eval_one_HRMS(df_query, df_reference,
+                   precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
                    similarity_measure_tmp, weight,
                    spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                    int_min_tmp, int_max_tmp, noise_threshold_tmp,
@@ -148,7 +148,8 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id

     acc = get_acc_HRMS(
         df_query=df_query, df_reference=df_reference,
-
+        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
+        ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
         similarity_measure=similarity_measure_tmp, weights=weight,
         spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
         mz_min=mz_min_tmp, mz_max=mz_max_tmp,
@@ -160,7 +161,7 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
         LET_threshold=LET_threshold_tmp,
         entropy_dimension=entropy_dimension_tmp,
         high_quality_reference_library=high_quality_reference_library_tmp,
-        verbose=
+        verbose=False
     )

     return (
@@ -191,6 +192,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
         LET_threshold=LET_threshold_tmp,
         entropy_dimension=entropy_dimension_tmp,
         high_quality_reference_library=high_quality_reference_library_tmp,
+        verbose=False
     )

     return (
@@ -201,16 +203,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id



-def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
+def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
     grid = {**default_HRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
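A usage sketch of the updated grid-tuning entry point, using only the keyword arguments visible in this diff; the file paths, tolerance, and grid values are placeholders:

    from pycompound.spec_lib_matching import tune_params_on_HRMS_data_grid

    tune_params_on_HRMS_data_grid(
        query_data='query_library.txt',          # hypothetical path; raw mgf/mzML/cdf/msp/json inputs are converted per the extension check below
        reference_data='reference_library.txt',  # hypothetical path
        precursor_ion_mz_tolerance=0.01,         # window (in m/z units) around each query precursor ion
        ionization_mode='positive',
        adduct='M+H',
        grid={'noise_threshold': [0.0, 0.1], 'wf_mz': [0.0, 1.0]},
        output_path='tuning_param_output.txt',
    )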
@@ -221,31 +214,37 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()

     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference
+            unique_reference_ids = df_reference['id'].unique()
         else:
             dfs = []
             unique_reference_ids = []
             for f in reference_data:
                 tmp = get_reference_df(reference_data=f)
                 dfs.append(tmp)
-                unique_reference_ids.extend(tmp
+                unique_reference_ids.extend(tmp['id'].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)

-
+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode].copy()
+    if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
+        df_reference = df_reference.loc[df_reference['adduct']==adduct].copy()
+    unique_reference_ids_tmp2 = df_reference['id'].unique()
+
+    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids_tmp2))} of the query and reference spectra IDs are in common.\n')

     if output_path is None:
         output_path = f'{Path.cwd()}/tuning_param_output.txt'
@@ -253,7 +252,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non

     param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold,
                          window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
-    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference,
+    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params) for params in param_grid)

     df_out = pd.DataFrame(results, columns=[
         'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX','NOISE.THRESHOLD',
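The grid search in this hunk fans every parameter combination out with joblib. A self-contained sketch of the itertools.product plus Parallel/delayed pattern; the worker function below is a stand-in, not the package's _eval_one_HRMS:

    from itertools import product
    from joblib import Parallel, delayed

    def evaluate(noise_threshold, wf_mz):
        # stand-in for a real scoring function; returns one number per combination
        return noise_threshold + wf_mz

    noise_thresholds = [0.0, 0.1]
    wf_mzs = [0.0, 1.0, 2.0]

    param_grid = product(noise_thresholds, wf_mzs)
    results = Parallel(n_jobs=-1, verbose=10)(delayed(evaluate)(*params) for params in param_grid)
    print(results)  # six scores, one per (noise_threshold, wf_mz) pair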
@@ -277,124 +276,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non



-def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible
-    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
-    and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
-      should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
-      other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
-      to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
-      compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
-    local_grid = {**default_HRMS_grid, **(grid or {})}
-    for key, value in local_grid.items():
-        globals()[key] = value
-
-    if query_data is None:
-        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
-        sys.exit()
-    else:
-        extension = query_data.rsplit('.', 1)[-1]
-        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
-            output_path_tmp = query_data[:-3] + 'csv'
-            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        elif extension in ('csv','CSV'):
-            df_query = pd.read_csv(query_data)
-        else:
-            print(f'\nError: Unsupported query_data extension: {extension}')
-            sys.exit()
-        unique_query_ids = df_query.iloc[:, 0].unique()
-
-    if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
-        sys.exit()
-    else:
-        if isinstance(reference_data, str):
-            df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:, 0].unique()
-        else:
-            dfs = []
-            unique_reference_ids = []
-            for f in reference_data:
-                tmp = get_reference_df(reference_data=f)
-                dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
-            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
-
-    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
-          f'{len(unique_reference_ids)} unique reference spectra, and '
-          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
-
-    if output_path is None:
-        output_path = f'{Path.cwd()}/tuning_param_output.txt'
-        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
-
-    param_grid = product(
-        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
-        noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
-        entropy_dimension, high_quality_reference_library
-    )
-
-    results = []
-    total = (
-        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
-        len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
-        len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
-        len(entropy_dimension) * len(high_quality_reference_library)
-    )
-    done = 0
-
-    for params in param_grid:
-        res = _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
-        results.append(res)
-        done += 1
-        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
-
-    df_out = pd.DataFrame(results, columns=[
-        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
-        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
-        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
-    ])
-
-    if 'WEIGHT' in df_out.columns:
-        df_out['WEIGHT'] = (
-            df_out['WEIGHT'].astype(str)
-            .str.replace("\"","",regex=False)
-            .str.replace("{","",regex=False)
-            .str.replace("}","",regex=False)
-            .str.replace(":","",regex=False)
-            .str.replace("Cosine","",regex=False)
-            .str.replace("Shannon","",regex=False)
-            .str.replace("Renyi","",regex=False)
-            .str.replace("Tsallis","",regex=False)
-            .str.replace(" ","",regex=False)
-        )
-
-    if return_output:
-        return df_out
-    else:
-        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
-        print(f'Wrote results to {output_path}')
-
-
 def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here
-    """
-
     grid = {**default_NRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
@@ -405,13 +287,13 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()

     if reference_data is None:
         print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
@@ -419,7 +301,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference
+            unique_reference_ids = df_reference['id'].unique()
         else:
             dfs = []
             unique_reference_ids = []
@@ -439,10 +321,8 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
                          noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
     results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)

-    df_out = pd.DataFrame(results, columns=[
-
-        'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
-    ])
+    df_out = pd.DataFrame(results, columns=['ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
+                                            'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'])
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
@@ -452,6 +332,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
+
     if return_output is False:
         df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
     else:
@@ -459,202 +340,116 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non



-def
-    """
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible
-    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
-    and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
-      should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
-      other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
-      to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
-      compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
-    local_grid = {**default_NRMS_grid, **(grid or {})}
-    for key, value in local_grid.items():
-        globals()[key] = value
-
-    if query_data is None:
-        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
-        sys.exit()
-    else:
-        extension = query_data.rsplit('.', 1)[-1]
-        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
-            output_path_tmp = query_data[:-3] + 'csv'
-            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        elif extension in ('csv','CSV'):
-            df_query = pd.read_csv(query_data)
-        else:
-            print(f'\nError: Unsupported query_data extension: {extension}')
-            sys.exit()
-        unique_query_ids = df_query.iloc[:, 0].unique()
+def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):

-
-
-
-
-        if isinstance(reference_data, str):
-            df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:, 0].unique()
-        else:
-            dfs = []
-            unique_reference_ids = []
-            for f in reference_data:
-                tmp = get_reference_df(reference_data=f)
-                dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
-            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
-
-    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
-          f'{len(unique_reference_ids)} unique reference spectra, and '
-          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
-
-    if output_path is None:
-        output_path = f'{Path.cwd()}/tuning_param_output.txt'
-        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
-
-    param_grid = product(
-        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
-        noise_threshold, wf_mz, wf_int, LET_threshold,
-        entropy_dimension, high_quality_reference_library
-    )
-
-    results = []
-    total = (
-        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
-        len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
-    )
-    done = 0
-    for params in param_grid:
-        res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
-        results.append(res)
-        done += 1
-        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
-
-    df_out = pd.DataFrame(results, columns=[
-        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
-        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
-    ])
+    n_top_matches_to_save = 1
+    unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
+    unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
+    all_similarity_rows = []

-
-
-
-            .str.replace("\"","",regex=False)
-            .str.replace("{","",regex=False)
-            .str.replace("}","",regex=False)
-            .str.replace(":","",regex=False)
-            .str.replace("Cosine","",regex=False)
-            .str.replace("Shannon","",regex=False)
-            .str.replace("Renyi","",regex=False)
-            .str.replace("Tsallis","",regex=False)
-            .str.replace(" ","",regex=False)
-        )
-
-    if return_output:
-        return df_out
-    else:
-        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
-        print(f'Wrote results to {output_path}')
+    for query_idx, qid in enumerate(unique_query_ids):
+        if verbose:
+            print(f'query spectrum #{query_idx} is being identified')

+        q_mask = (df_query['id'] == qid)
+        q_idxs = np.where(q_mask)[0]
+        if q_idxs.size == 0:
+            all_similarity_rows.append([0.0]*len(unique_reference_ids))
+            continue

+        q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))

+        if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
+            precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
+            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
+        else:
+            df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()

-
+        if df_reference_tmp.empty:
+            all_similarity_rows.append([0.0]*len(unique_reference_ids))
+            continue

-
+        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))

-
-    for query_idx in range(0,len(unique_query_ids)):
-        if verbose is True:
-            print(f'query spectrum #{query_idx} is being identified')
-        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
-        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
-        #q_spec_tmp = q_spec_tmp.astype(float)
+        similarity_by_ref = {}

-
-
-
-            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
-            r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
-            #print(r_spec)
-            #r_spec = r_spec.astype(float)
+        for ref_id, r_df in ref_groups.items():
+            q_spec = q_spec_base.copy()
+            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))

            is_matched = False
            for transformation in spectrum_preprocessing_order:
-                if np.isinf(q_spec[:,1]).
-                    q_spec[:,1] =
-                if np.isinf(r_spec[:,1]).
-                    r_spec[:,1] =
-
-
-
-
-
-
-
+                if np.isinf(q_spec[:, 1]).any():
+                    q_spec[:, 1] = 0.0
+                if np.isinf(r_spec[:, 1]).any():
+                    r_spec[:, 1] = 0.0
+
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
+                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
+
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    m_spec = match_peaks_in_spectra(
+                        spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
+                    )
+                    if m_spec.size == 0:
+                        q_spec = np.empty((0,2))
+                        r_spec = np.empty((0,2))
+                    else:
+                        q_spec = m_spec[:, 0:2]
+                        r_spec = m_spec[:, [0, 2]]
                    is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
-                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
-                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = remove_noise(q_spec, nr = noise_threshold)
-                    if high_quality_reference_library == False:
-                        r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
-                    if high_quality_reference_library == False:
-                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)

-
-
-
-
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
+                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)
+
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
+                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')
+
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = remove_noise(q_spec, nr=noise_threshold)
+                    if not high_quality_reference_library:
+                        r_spec = remove_noise(r_spec, nr=noise_threshold)
+
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = filter_spec_lcms(
+                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                    )
+                    if not high_quality_reference_library:
+                        r_spec = filter_spec_lcms(
+                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                        )
+
+            if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                q_ints = q_spec[:, 1]
+                r_ints = r_spec[:, 1]
+                if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
+                    sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
+                else:
+                    sim = 0.0
            else:
-
+                sim = 0.0

-
-        all_similarity_scores.append(similarity_scores)
+            similarity_by_ref[str(ref_id)] = float(sim)

-
-
-    df_scores.index.names = ['Query Spectrum ID']
+        row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
+        all_similarity_rows.append(row)

-
-
-    for i in range(0, df_scores.shape[0]):
-        df_scores_tmp = df_scores
-        preds_tmp = []
-        scores_tmp = []
-        for j in range(0, n_top_matches_to_save):
-            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
-            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
-            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
+    df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
+    df_scores.index.name = 'QUERY.SPECTRUM.ID'

-
-
-
-            else:
-                scores_tmp.append(top_ref_specs_tmp.values[0])
-        preds.append(preds_tmp)
-        scores.append(scores_tmp)
-
-    preds = np.array(preds)
-    scores = np.array(scores)
-    out = np.c_[unique_query_ids,preds,scores]
-    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
-    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
-    return acc
+    top_idx = df_scores.values.argmax(axis=1)
+    top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
+    top_ids = [df_scores.columns[i] for i in top_idx]

+    df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
+    if verbose:
+        print(df_tmp)

+    acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
+    return acc


 def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
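The rewritten get_acc_HRMS above narrows the reference library per query spectrum with a precursor-ion m/z window and then walks the candidates through a groupby on 'id'. A compact sketch of just that selection step; the column names follow the diff, the data is made up:

    import pandas as pd

    df_reference = pd.DataFrame({
        'id': ['ref1', 'ref1', 'ref2'],
        'precursor_ion_mz': [150.05, 150.05, 300.10],
        'mz_ratio': [50.0, 75.0, 120.0],
        'intensity': [10.0, 20.0, 5.0],
    })

    precursor = 150.06   # precursor m/z of the current query spectrum
    tolerance = 0.02     # precursor_ion_mz_tolerance

    window = df_reference['precursor_ion_mz'].between(precursor - tolerance, precursor + tolerance, inclusive='both')
    candidates = df_reference.loc[window, ['id', 'mz_ratio', 'intensity']]

    # one (m/z, intensity) block per surviving reference spectrum
    ref_groups = dict(tuple(candidates.groupby('id', sort=False)))
    for ref_id, r_df in ref_groups.items():
        print(ref_id, r_df[['mz_ratio', 'intensity']].to_numpy())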
@@ -713,7 +508,7 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,

     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
-    df_scores.index.names = ['
+    df_scores.index.names = ['QUERY.SPECTRUM.ID']

     preds = []
     scores = []
@@ -743,64 +538,40 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,



-def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
-    '''
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: either string or list of strings with pass to mgf, mzML, sdf, and/or csv file(s) of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
-    --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
-    --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
-    --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of HRMS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL')
-    --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
-    --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
-    --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
-    --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
-    --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
-    --window_size_centroiding: Window size parameter used in centroiding a given spectrum. Default: 0.5
-    --window_size_matching: Window size parameter used in matching a query spectrum and a reference library spectrum. Default: 0.5
-    --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
-    --wf_mz: Mass/charge weight factor parameter. Default: 0.0
-    --wf_intensity: Intensity weight factor parameter. Default: 0.0
-    --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
-    --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
-    --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
-    --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
-    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
-    '''
-
+def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'json' or extension == 'JSON' or extension == 'msp' or extension == 'MSP':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()

     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data,likely_reference_ids)
-            unique_reference_ids = df_reference.iloc[:,0].unique()
         else:
             dfs = []
-            unique_reference_ids = []
             for f in reference_data:
                 tmp = get_reference_df(f,likely_reference_ids)
                 dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:,0].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)

+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
+    if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
+        df_reference = df_reference.loc[df_reference['adduct']==adduct]
+
+    print(df_reference.loc[df_reference['id']=='Hectochlorin M+H'])

     if spectrum_preprocessing_order is not None:
         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
@@ -888,62 +659,91 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
         print(f'Warning: writing similarity scores to {output_similarity_scores}')
 
 
-
-
-
+    unique_reference_ids = df_reference['id'].unique().tolist()
+    all_similarity_scores = []
+
+    for query_idx in range(len(unique_query_ids)):
+        if verbose:
             print(f'query spectrum #{query_idx} is being identified')
-        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
-        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-
-
-
-
-
+        q_mask = (df_query['id'] == unique_query_ids[query_idx])
+        q_idxs_tmp = np.where(q_mask)[0]
+        q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
+
+        if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
+            precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
+            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
+        else:
+            df_reference_tmp = df_reference.copy()
+
+        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
+        unique_reference_ids_tmp = list(ref_groups.keys())
+
+        similarity_by_ref = {}
+        for ref_id in unique_reference_ids_tmp:
+            q_spec = q_spec_tmp.copy()
+            r_df = ref_groups[ref_id]
+            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
+            #print('\nhere!!!!!!!!!!!!!!!')
+            #print(r_spec)
 
             is_matched = False
+
             for transformation in spectrum_preprocessing_order:
-                if np.isinf(q_spec[:,1]).sum() > 0:
-                    q_spec[:,1] = np.zeros(q_spec.shape[0])
-                if np.isinf(r_spec[:,1]).sum() > 0:
-                    r_spec[:,1] = np.zeros(r_spec.shape[0])
-
-
-
-
+                if np.isinf(q_spec[:, 1]).sum() > 0:
+                    q_spec[:, 1] = np.zeros(q_spec.shape[0])
+                if np.isinf(r_spec[:, 1]).sum() > 0:
+                    r_spec[:, 1] = np.zeros(r_spec.shape[0])
+
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
+                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
+
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                     m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
-                    q_spec = m_spec[:,0:2]
-                    r_spec = m_spec[:,[0,2]]
+                    q_spec = m_spec[:, 0:2]
+                    r_spec = m_spec[:, [0, 2]]
                     is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
-                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = remove_noise(q_spec, nr = noise_threshold)
-                    if high_quality_reference_library == False:
-                        r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
-                    if high_quality_reference_library == False:
-                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
 
-
-
-
-
-
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
+                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)
+
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
+                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)
+
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = remove_noise(q_spec, nr=noise_threshold)
+                    if not high_quality_reference_library:
+                        r_spec = remove_noise(r_spec, nr=noise_threshold)
+
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = filter_spec_lcms(
+                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                    )
+                    if not high_quality_reference_library:
+                        r_spec = filter_spec_lcms(
+                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                        )
+
+            q_ints = q_spec[:, 1]
+            r_ints = r_spec[:, 1]
+
+            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
             else:
-
+                sim = 0.0
 
-
-        all_similarity_scores.append(similarity_scores)
+            similarity_by_ref[ref_id] = sim
 
-
+        row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
+        all_similarity_scores.append(row_scores)
+
+    df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
     df_scores.index = unique_query_ids
-    df_scores.index.names = ['
+    df_scores.index.names = ['QUERY.SPECTRUM.ID']
+
 
     preds = []
     scores = []
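The rewritten loop above scores each query spectrum against reference spectra grouped by 'id': candidates are optionally restricted to references whose precursor_ion_mz lies within precursor_ion_mz_tolerance of the query's precursor, each query/reference pair is run through the preprocessing order (C centroiding, M peak matching, W weight-factor transform, L low-entropy transform, N noise removal, F m/z and intensity filtering), and the scores fill a query-by-reference matrix in which references outside the precursor window default to 0.0. The sketch below mirrors that flow with a stand-in cosine similarity in place of pycompound's preprocessing and get_similarity helpers; the table layout follows the diff, but the data and cosine_sim are hypothetical.

import numpy as np
import pandas as pd

def cosine_sim(q_ints, r_ints):
    # stand-in for get_similarity(similarity_measure, ...)
    denom = np.linalg.norm(q_ints) * np.linalg.norm(r_ints)
    return float(np.dot(q_ints, r_ints) / denom) if denom else 0.0

df_query = pd.DataFrame({'id': ['Q1', 'Q1'], 'mz_ratio': [100.0, 200.0],
                         'intensity': [1.0, 0.5], 'precursor_ion_mz': [201.0, 201.0]})
df_reference = pd.DataFrame({'id': ['R1', 'R1', 'R2'], 'mz_ratio': [100.0, 200.0, 400.0],
                             'intensity': [0.9, 0.6, 1.0], 'precursor_ion_mz': [201.0, 201.0, 401.0]})
precursor_ion_mz_tolerance = 0.5

unique_query_ids = df_query['id'].unique().tolist()
unique_reference_ids = df_reference['id'].unique().tolist()
all_similarity_scores = []
for qid in unique_query_ids:
    q_rows = df_query.loc[df_query['id'] == qid]
    q_ints = q_rows['intensity'].to_numpy()
    # keep only references whose precursor m/z falls inside the tolerance window
    pre = q_rows['precursor_ion_mz'].iloc[0]
    candidates = df_reference.loc[df_reference['precursor_ion_mz'].between(
        pre - precursor_ion_mz_tolerance, pre + precursor_ion_mz_tolerance)]
    similarity_by_ref = {}
    for ref_id, r_rows in candidates.groupby('id', sort=False):
        r_ints = r_rows['intensity'].to_numpy()
        # the real code first applies the C/M/W/L/N/F preprocessing order to both spectra;
        # here the peaks already line up, so the intensity vectors are scored directly
        similarity_by_ref[ref_id] = cosine_sim(q_ints, r_ints) if len(q_ints) == len(r_ints) else 0.0
    all_similarity_scores.append([similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids])

df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
print(df_scores)   # R2 falls outside the precursor window, so its score defaults to 0.0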
@@ -976,7 +776,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
 
     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
-    df_top_ref_specs.index.names = ['
+    df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
 
     df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
 
@@ -993,33 +793,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
 
 
 
-def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False):
-    '''
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data
-
-    --query_data: cdf or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: cdf of csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
-    --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
-    --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
-    --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-4 characters chosen from F, N, L, W representing filtering based on mass/charge and intensity values, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WN\' is passed, then each spectrum will undergo a weight factor transformation and then noise removal. Default: FNLW')
-    --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
-    --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
-    --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
-    --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
-    --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
-    --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
-    --wf_mz: Mass/charge weight factor parameter. Default: 0.0
-    --wf_intensity: Intensity weight factor parameter. Default: 0.0
-    --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
-    --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
-    --normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since the objects entropy quantifies the uncertainy of must be probability distributions, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
-    --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
-    --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
-    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
-    '''
-
+def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
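For orientation, a hypothetical call to the updated NRMS entry point; the keyword names, defaults, and the new verbose flag come from the signature above, while the file paths and chosen values are placeholders.

from pycompound.spec_lib_matching import run_spec_lib_matching_on_NRMS_data

# paths are hypothetical; the query/reference file formats are assumed, not taken from the diff
run_spec_lib_matching_on_NRMS_data(
    query_data='query_spectra.txt',          # hypothetical path
    reference_data='reference_library.csv',  # hypothetical path
    spectrum_preprocessing_order='FNLW',
    similarity_measure='cosine',
    noise_threshold=0.05,
    n_top_matches_to_save=3,
    print_id_results=True,
    verbose=True,
)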
@@ -1027,11 +801,11 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     extension = query_data.rsplit('.',1)
     extension = extension[(len(extension)-1)]
     if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-        output_path_tmp = query_data[:-3] + '
+        output_path_tmp = query_data[:-3] + 'txt'
         build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-        df_query = pd.read_csv(output_path_tmp)
-    if extension == '
-        df_query = pd.read_csv(query_data)
+        df_query = pd.read_csv(output_path_tmp, sep='\t')
+    if extension == 'txt' or extension == 'TXT':
+        df_query = pd.read_csv(query_data, sep='\t')
     unique_query_ids = df_query.iloc[:,0].unique()
 
     if reference_data is None:
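The change above makes the NRMS path read both converted raw files and plain 'txt'/'TXT' query files as tab-separated tables (sep='\t'), with the spectrum identifier still taken from the left-most column. A small sketch of reading such a file, assuming the wide one-column-per-m/z layout described in the docstring removed in the previous hunk; the header names and values are invented.

import io
import pandas as pd

# hypothetical tab-separated query table: identifier first, then one column per m/z value
query_txt = "id\t50\t51\t52\nQ1\t0.0\t10.0\t3.5\nQ2\t1.2\t0.0\t7.8\n"
df_query = pd.read_csv(io.StringIO(query_txt), sep='\t')
unique_query_ids = df_query.iloc[:, 0].unique()
print(unique_query_ids)   # ['Q1' 'Q2']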
@@ -1175,7 +949,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
 
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
-    df_scores.index.names = ['
+    df_scores.index.names = ['QUERY.SPECTRUM.ID']
 
     preds = []
     scores = []
@@ -1208,7 +982,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
 
     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
-    df_top_ref_specs.index.names = ['
+    df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
 
     if print_id_results == True:
         print(df_top_ref_specs.to_string())