pycompound 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +2772 -243
- pycompound/build_library.py +77 -20
- pycompound/plot_spectra.py +1 -1
- pycompound/processing.py +5 -5
- pycompound/spec_lib_matching.py +265 -502
- pycompound/spec_lib_matching_CLI.py +48 -2
- pycompound/tuning_CLI_DE.py +22 -22
- pycompound/tuning_CLI_grid.py +22 -6
- {pycompound-0.1.5.dist-info → pycompound-0.1.7.dist-info}/METADATA +1 -1
- pycompound-0.1.7.dist-info/RECORD +15 -0
- {pycompound-0.1.5.dist-info → pycompound-0.1.7.dist-info}/top_level.txt +0 -1
- app2.py +0 -101
- pycompound-0.1.5.dist-info/RECORD +0 -16
- {pycompound-0.1.5.dist-info → pycompound-0.1.7.dist-info}/WHEEL +0 -0
- {pycompound-0.1.5.dist-info → pycompound-0.1.7.dist-info}/licenses/LICENSE +0 -0
app.py
CHANGED
|
@@ -1,16 +1,9 @@
|
|
|
1
1
|
|
|
2
2
|
from shiny import App, ui, reactive, render, req
|
|
3
|
-
from
|
|
4
|
-
from pycompound.spec_lib_matching import run_spec_lib_matching_on_NRMS_data
|
|
5
|
-
from pycompound.spec_lib_matching import tune_params_on_HRMS_data_grid
|
|
6
|
-
from pycompound.spec_lib_matching import tune_params_on_NRMS_data_grid
|
|
7
|
-
from pycompound.spec_lib_matching import tune_params_on_HRMS_data_grid_shiny
|
|
8
|
-
from pycompound.spec_lib_matching import tune_params_on_NRMS_data_grid_shiny
|
|
9
|
-
from pycompound.spec_lib_matching import tune_params_DE
|
|
10
|
-
from pycompound.plot_spectra import generate_plots_on_HRMS_data
|
|
11
|
-
from pycompound.plot_spectra import generate_plots_on_NRMS_data
|
|
3
|
+
from shiny.types import SilentException
|
|
12
4
|
from pathlib import Path
|
|
13
5
|
from contextlib import redirect_stdout, redirect_stderr
|
|
6
|
+
import contextlib
|
|
14
7
|
import subprocess
|
|
15
8
|
import traceback
|
|
16
9
|
import asyncio
|
|
@@ -24,10 +17,2320 @@ import netCDF4 as nc
|
|
|
24
17
|
from pyteomics import mgf, mzml
|
|
25
18
|
import ast
|
|
26
19
|
from numbers import Real
|
|
27
|
-
|
|
20
|
+
import logging
|
|
21
|
+
from scipy.optimize import differential_evolution
|
|
22
|
+
import scipy
|
|
23
|
+
import scipy.stats
|
|
24
|
+
from itertools import product
|
|
25
|
+
import json
|
|
26
|
+
import re
|
|
27
|
+
import urllib.parse
|
|
28
|
+
import urllib.request
|
|
29
|
+
import matplotlib
|
|
30
|
+
|
|
31
|
+
matplotlib.rcParams['svg.fonttype'] = 'none'
|
|
28
32
|
|
|
29
33
|
_LOG_QUEUE: asyncio.Queue[str] = asyncio.Queue()
|
|
30
34
|
|
|
35
|
+
_ADDUCT_PAT = re.compile(r"\s*(?:\[(M[^\]]+)\]|(M[+-][A-Za-z0-9]+)\+?)\s*$", re.IGNORECASE)
|
|
36
|
+
|
|
37
|
+
def start_log_consumer():
|
|
38
|
+
if getattr(start_log_consumer, "_started", False):
|
|
39
|
+
return
|
|
40
|
+
start_log_consumer._started = True
|
|
41
|
+
|
|
42
|
+
async def _consume():
|
|
43
|
+
while True:
|
|
44
|
+
s = await _LOG_QUEUE.get()
|
|
45
|
+
match_log_rv.set(match_log_rv.get() + s)
|
|
46
|
+
await reactive.flush()
|
|
47
|
+
|
|
48
|
+
asyncio.create_task(_consume())
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def start_log_consumer():
|
|
52
|
+
if getattr(start_log_consumer, "_started", False):
|
|
53
|
+
return
|
|
54
|
+
start_log_consumer._started = True
|
|
55
|
+
|
|
56
|
+
async def _consume():
|
|
57
|
+
while True:
|
|
58
|
+
s = await _LOG_QUEUE.get()
|
|
59
|
+
match_log_rv.set(match_log_rv.get() + s)
|
|
60
|
+
await reactive.flush()
|
|
61
|
+
|
|
62
|
+
asyncio.create_task(_consume())
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _strip_adduct(name: str) -> str:
|
|
67
|
+
return _ADDUCT_PAT.sub("", name).strip()
|
|
68
|
+
|
|
69
|
+
def get_pubchem_url(query: str) -> str:
|
|
70
|
+
base_name = _strip_adduct(query)
|
|
71
|
+
endpoint = ("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/" + urllib.parse.quote(base_name) + "/cids/TXT")
|
|
72
|
+
try:
|
|
73
|
+
with urllib.request.urlopen(endpoint, timeout=10) as r:
|
|
74
|
+
txt = r.read().decode("utf-8").strip()
|
|
75
|
+
cid = txt.splitlines()[0].strip()
|
|
76
|
+
if cid.isdigit():
|
|
77
|
+
return f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
|
|
78
|
+
except Exception:
|
|
79
|
+
pass
|
|
80
|
+
q = urllib.parse.quote(base_name)
|
|
81
|
+
return f"https://pubchem.ncbi.nlm.nih.gov/#query={q}"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def build_library_from_raw_data(input_path=None, output_path=None, is_reference=False):
|
|
86
|
+
if input_path is None:
|
|
87
|
+
print('Error: please specify input_path (i.e. the path to the input mgf, mzML, cdf, json, or msp file). Mandatory argument.')
|
|
88
|
+
sys.exit()
|
|
89
|
+
|
|
90
|
+
if output_path is None:
|
|
91
|
+
tmp = input_path.split('/')
|
|
92
|
+
tmp = tmp[(len(tmp)-1)]
|
|
93
|
+
basename = tmp.split('.')[0]
|
|
94
|
+
output_path = f'{Path.cwd()}/{basename}.csv'
|
|
95
|
+
print(f'Warning: no output_path specified, so library is written to {output_path}')
|
|
96
|
+
|
|
97
|
+
if is_reference not in [True,False]:
|
|
98
|
+
print('Error: is_reference must be either \'True\' or \'False\'.')
|
|
99
|
+
sys.exit()
|
|
100
|
+
|
|
101
|
+
last_three_chars = input_path[(len(input_path)-3):len(input_path)]
|
|
102
|
+
last_four_chars = input_path[(len(input_path)-4):len(input_path)]
|
|
103
|
+
if last_three_chars == 'mgf' or last_three_chars == 'MGF':
|
|
104
|
+
input_file_type = 'mgf'
|
|
105
|
+
elif last_four_chars == 'mzML' or last_four_chars == 'mzml' or last_four_chars == 'MZML':
|
|
106
|
+
input_file_type = 'mzML'
|
|
107
|
+
elif last_four_chars == 'json' or last_four_chars == 'JSON':
|
|
108
|
+
input_file_type = 'json'
|
|
109
|
+
elif last_three_chars == 'cdf' or last_three_chars == 'CDF':
|
|
110
|
+
input_file_type = 'cdf'
|
|
111
|
+
elif last_three_chars == 'msp' or last_three_chars == 'MSP':
|
|
112
|
+
input_file_type = 'msp'
|
|
113
|
+
else:
|
|
114
|
+
print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'json\', or \'msp\' file must be passed to --input_path')
|
|
115
|
+
sys.exit()
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def generate_plots_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz=None, precursor_ion_mz_tolerance=None, ionization_mode=None, collision_energy=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
|
|
120
|
+
|
|
121
|
+
if query_data is None:
|
|
122
|
+
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
|
|
123
|
+
sys.exit()
|
|
124
|
+
else:
|
|
125
|
+
extension = query_data.rsplit('.',1)
|
|
126
|
+
extension = extension[(len(extension)-1)]
|
|
127
|
+
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
|
|
128
|
+
output_path_tmp = query_data[:-3] + 'txt'
|
|
129
|
+
#build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=True)
|
|
130
|
+
build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
|
|
131
|
+
df_query = pd.read_csv(output_path_tmp, sep='\t')
|
|
132
|
+
if extension == 'txt' or extension == 'TXT':
|
|
133
|
+
df_query = pd.read_csv(query_data, sep='\t')
|
|
134
|
+
unique_query_ids = df_query['id'].unique().tolist()
|
|
135
|
+
unique_query_ids = [str(tmp) for tmp in unique_query_ids]
|
|
136
|
+
|
|
137
|
+
if reference_data is None:
|
|
138
|
+
print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
|
|
139
|
+
sys.exit()
|
|
140
|
+
else:
|
|
141
|
+
extension = reference_data.rsplit('.',1)
|
|
142
|
+
extension = extension[(len(extension)-1)]
|
|
143
|
+
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
|
|
144
|
+
output_path_tmp = reference_data[:-3] + 'txt'
|
|
145
|
+
build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
|
|
146
|
+
df_reference = pd.read_csv(output_path_tmp, sep='\t')
|
|
147
|
+
if extension == 'txt' or extension == 'TXT':
|
|
148
|
+
df_reference = pd.read_csv(reference_data, sep='\t')
|
|
149
|
+
cols_tmp = df_reference.columns.tolist()
|
|
150
|
+
if 'precursor_ion_mz' in cols_tmp and 'ionization_mode' in cols_tmp and 'collision_energy' in cols_tmp:
|
|
151
|
+
if precursor_ion_mz is not None and precursor_ion_mz_tolerance is not None:
|
|
152
|
+
df_reference = df_reference.loc[(df_reference['precursor_ion_mz'] > (precursor_ion_mz-precursor_ion_mz_tolerance) & df_reference['precursor_ion_mz'] < (precursor_ion_mz+precursor_ion_mz_tolerance))]
|
|
153
|
+
if ionization_mode is not None:
|
|
154
|
+
df_reference = df_reference.loc[df_reference['ionization_mode'==ionization_mode]]
|
|
155
|
+
if collision_energy is not None:
|
|
156
|
+
df_reference = df_reference.loc[df_reference['collision_energy'==collision_energy]]
|
|
157
|
+
df_reference = df_reference.drop(columns=['precursor_ion_mz','ionization_mode','collision_energy'])
|
|
158
|
+
unique_reference_ids = df_reference['id'].unique().tolist()
|
|
159
|
+
unique_reference_ids = [str(tmp) for tmp in unique_reference_ids]
|
|
160
|
+
|
|
161
|
+
if spectrum_ID1 is not None:
|
|
162
|
+
spectrum_ID1 = str(spectrum_ID1)
|
|
163
|
+
else:
|
|
164
|
+
spectrum_ID1 = str(df_query['id'].iloc[0])
|
|
165
|
+
print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')
|
|
166
|
+
|
|
167
|
+
if spectrum_ID2 is not None:
|
|
168
|
+
spectrum_ID2 = str(spectrum_ID2)
|
|
169
|
+
else:
|
|
170
|
+
spectrum_ID2 = str(df_reference['id'].iloc[0])
|
|
171
|
+
print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')
|
|
172
|
+
|
|
173
|
+
if spectrum_preprocessing_order is not None:
|
|
174
|
+
spectrum_preprocessing_order = list(spectrum_preprocessing_order)
|
|
175
|
+
else:
|
|
176
|
+
spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
|
|
177
|
+
if 'M' not in spectrum_preprocessing_order:
|
|
178
|
+
print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
|
|
179
|
+
sys.exit()
|
|
180
|
+
if 'C' in spectrum_preprocessing_order:
|
|
181
|
+
if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
|
|
182
|
+
print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
|
|
183
|
+
sys.exit()
|
|
184
|
+
if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
|
|
185
|
+
print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
|
|
186
|
+
sys.exit()
|
|
187
|
+
|
|
188
|
+
if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
|
|
189
|
+
print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
|
|
190
|
+
sys.exit()
|
|
191
|
+
|
|
192
|
+
if isinstance(int_min,int) is True:
|
|
193
|
+
int_min = float(int_min)
|
|
194
|
+
if isinstance(int_max,int) is True:
|
|
195
|
+
int_max = float(int_max)
|
|
196
|
+
if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
|
|
197
|
+
print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
|
|
198
|
+
sys.exit()
|
|
199
|
+
if mz_min < 0:
|
|
200
|
+
print('\nError: mz_min should be a non-negative integer')
|
|
201
|
+
sys.exit()
|
|
202
|
+
if mz_max <= 0:
|
|
203
|
+
print('\nError: mz_max should be a positive integer')
|
|
204
|
+
sys.exit()
|
|
205
|
+
if int_min < 0:
|
|
206
|
+
print('\nError: int_min should be a non-negative float')
|
|
207
|
+
sys.exit()
|
|
208
|
+
if int_max <= 0:
|
|
209
|
+
print('\nError: int_max should be a positive float')
|
|
210
|
+
sys.exit()
|
|
211
|
+
|
|
212
|
+
if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
|
|
213
|
+
print('Error: window_size_centroiding must be a positive float.')
|
|
214
|
+
sys.exit()
|
|
215
|
+
if isinstance(window_size_matching,float) is False or window_size_matching<= 0.0:
|
|
216
|
+
print('Error: window_size_matching must be a positive float.')
|
|
217
|
+
sys.exit()
|
|
218
|
+
|
|
219
|
+
if isinstance(noise_threshold,int) is True:
|
|
220
|
+
noise_threshold = float(noise_threshold)
|
|
221
|
+
if isinstance(noise_threshold,float) is False or noise_threshold < 0:
|
|
222
|
+
print('Error: noise_threshold must be a positive float.')
|
|
223
|
+
sys.exit()
|
|
224
|
+
|
|
225
|
+
if isinstance(wf_intensity,int) is True:
|
|
226
|
+
wf_intensity = float(wf_intensity)
|
|
227
|
+
if isinstance(wf_mz,int) is True:
|
|
228
|
+
wf_mz = float(wf_mz)
|
|
229
|
+
if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
|
|
230
|
+
print('Error: wf_mz and wf_intensity must be integers or floats')
|
|
231
|
+
sys.exit()
|
|
232
|
+
|
|
233
|
+
if entropy_dimension <= 0:
|
|
234
|
+
print('\nError: entropy_dimension should be a positive float')
|
|
235
|
+
sys.exit()
|
|
236
|
+
else:
|
|
237
|
+
q = entropy_dimension
|
|
238
|
+
|
|
239
|
+
normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
|
|
240
|
+
|
|
241
|
+
if y_axis_transformation not in ['normalized','none','log10','sqrt']:
|
|
242
|
+
print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
|
|
243
|
+
sys.exit()
|
|
244
|
+
|
|
245
|
+
if output_path is None:
|
|
246
|
+
print(f'Warning: plots will be saved to the svg ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
|
|
247
|
+
output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
|
|
251
|
+
query_idx = unique_query_ids.index(spectrum_ID1)
|
|
252
|
+
reference_idx = unique_query_ids.index(spectrum_ID2)
|
|
253
|
+
q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[query_idx])[0]
|
|
254
|
+
r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[reference_idx])[0]
|
|
255
|
+
q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
256
|
+
r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
257
|
+
elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
|
|
258
|
+
query_idx = unique_reference_ids.index(spectrum_ID1)
|
|
259
|
+
reference_idx = unique_reference_ids.index(spectrum_ID2)
|
|
260
|
+
q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[query_idx])[0]
|
|
261
|
+
r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[reference_idx])[0]
|
|
262
|
+
q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
263
|
+
r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
264
|
+
else:
|
|
265
|
+
if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
|
|
266
|
+
spec_tmp = spectrum_ID1
|
|
267
|
+
spectrum_ID1 = spectrum_ID2
|
|
268
|
+
spectrum_ID2 = spec_tmp
|
|
269
|
+
query_idx = unique_query_ids.index(spectrum_ID1)
|
|
270
|
+
reference_idx = unique_reference_ids.index(spectrum_ID2)
|
|
271
|
+
q_idxs_tmp = np.where(df_query['id'].astype(str) == unique_query_ids[query_idx])[0]
|
|
272
|
+
r_idxs_tmp = np.where(df_reference['id'].astype(str) == unique_reference_ids[reference_idx])[0]
|
|
273
|
+
q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
|
|
274
|
+
r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
q_spec_pre_trans = q_spec.copy()
|
|
278
|
+
r_spec_pre_trans = r_spec.copy()
|
|
279
|
+
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
|
|
280
|
+
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
|
|
281
|
+
|
|
282
|
+
if y_axis_transformation == 'normalized':
|
|
283
|
+
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
|
|
284
|
+
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
|
|
285
|
+
ylab = 'Normalized Intensity'
|
|
286
|
+
elif y_axis_transformation == 'log10':
|
|
287
|
+
q_spec_pre_trans[:,1] = np.log10(np.array(q_spec_pre_trans[:,1]+1,dtype=float))
|
|
288
|
+
r_spec_pre_trans[:,1] = np.log10(np.array(r_spec_pre_trans[:,1]+1,dtype=float))
|
|
289
|
+
ylab = 'log10(Intensity)'
|
|
290
|
+
elif y_axis_transformation == 'sqrt':
|
|
291
|
+
q_spec_pre_trans[:,1] = np.sqrt(np.array(q_spec_pre_trans[:,1],dtype=float))
|
|
292
|
+
r_spec_pre_trans[:,1] = np.sqrt(np.array(r_spec_pre_trans[:,1],dtype=float))
|
|
293
|
+
ylab = 'sqrt(Intensity)'
|
|
294
|
+
else:
|
|
295
|
+
ylab = 'Raw Intensity'
|
|
296
|
+
|
|
297
|
+
fig, axes = plt.subplots(nrows=2, ncols=1)
|
|
298
|
+
|
|
299
|
+
plt.subplot(2,1,1)
|
|
300
|
+
plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*q_spec_pre_trans.shape[0], ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID 1: {spectrum_ID1}')
|
|
301
|
+
plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*r_spec_pre_trans.shape[0], ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID 2: {spectrum_ID2}')
|
|
302
|
+
plt.xlabel('m/z',fontsize=7)
|
|
303
|
+
plt.ylabel(ylab, fontsize=7)
|
|
304
|
+
plt.xticks(fontsize=7)
|
|
305
|
+
plt.yticks(fontsize=7)
|
|
306
|
+
plt.title('Untransformed Spectra', fontsize=10)
|
|
307
|
+
|
|
308
|
+
mz_min_tmp_q = round(q_spec[:,0].min(),1)
|
|
309
|
+
mz_min_tmp_r = round(r_spec[:,0].min(),1)
|
|
310
|
+
int_min_tmp_q = round(q_spec[:,1].min(),1)
|
|
311
|
+
int_min_tmp_r = round(r_spec[:,1].min(),1)
|
|
312
|
+
mz_max_tmp_q = round(q_spec[:,0].max(),1)
|
|
313
|
+
mz_max_tmp_r = round(r_spec[:,0].max(),1)
|
|
314
|
+
int_max_tmp_q = round(q_spec[:,1].max(),1)
|
|
315
|
+
int_max_tmp_r = round(r_spec[:,1].max(),1)
|
|
316
|
+
mz_min_tmp = min([mz_min_tmp_q,mz_min_tmp_r])
|
|
317
|
+
mz_max_tmp = min([mz_max_tmp_q,mz_max_tmp_r])
|
|
318
|
+
int_min_tmp = min([int_min_tmp_q,int_min_tmp_r])
|
|
319
|
+
int_max_tmp = max([int_max_tmp_q,int_max_tmp_r])
|
|
320
|
+
|
|
321
|
+
is_matched = False
|
|
322
|
+
for transformation in spectrum_preprocessing_order:
|
|
323
|
+
if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
324
|
+
q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
|
|
325
|
+
r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
|
|
326
|
+
if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
327
|
+
m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
|
|
328
|
+
q_spec = m_spec[:,0:2]
|
|
329
|
+
r_spec = m_spec[:,[0,2]]
|
|
330
|
+
is_matched = True
|
|
331
|
+
if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
332
|
+
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
333
|
+
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
334
|
+
if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
335
|
+
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
336
|
+
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
337
|
+
if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
338
|
+
q_spec = remove_noise(q_spec, nr = noise_threshold)
|
|
339
|
+
if high_quality_reference_library == False or high_quality_reference_library == 'False':
|
|
340
|
+
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
341
|
+
if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
342
|
+
q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
|
|
343
|
+
if high_quality_reference_library == False or high_quality_reference_library == 'False':
|
|
344
|
+
r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
|
|
345
|
+
|
|
346
|
+
q_ints = q_spec[:,1]
|
|
347
|
+
r_ints = r_spec[:,1]
|
|
348
|
+
|
|
349
|
+
if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
350
|
+
similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
|
|
351
|
+
else:
|
|
352
|
+
similarity_score = 0
|
|
353
|
+
|
|
354
|
+
plt.subplot(2,1,2)
|
|
355
|
+
|
|
356
|
+
if q_spec.shape[0] > 1:
|
|
357
|
+
if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
|
|
358
|
+
plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
359
|
+
plt.xticks([])
|
|
360
|
+
plt.yticks([])
|
|
361
|
+
else:
|
|
362
|
+
if y_axis_transformation == 'normalized':
|
|
363
|
+
q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
|
|
364
|
+
r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
|
|
365
|
+
ylab='Normalized Intensity'
|
|
366
|
+
elif y_axis_transformation == 'log10':
|
|
367
|
+
q_spec[:,1] = np.log10(q_spec[:,1]+1)
|
|
368
|
+
r_spec[:,1] = np.log10(r_spec[:,1]+1)
|
|
369
|
+
ylab='log10(Intensity)'
|
|
370
|
+
elif y_axis_transformation == 'sqrt':
|
|
371
|
+
q_spec[:,1] = np.sqrt(q_spec[:,1])
|
|
372
|
+
r_spec[:,1] = np.sqrt(r_spec[:,1])
|
|
373
|
+
ylab='sqrt(Intensity)'
|
|
374
|
+
else:
|
|
375
|
+
ylab = 'Raw Intensity'
|
|
376
|
+
plt.vlines(x=q_spec[:,0], ymin=[0]*q_spec.shape[0], ymax=q_spec[:,1], linewidth=3, color='blue')
|
|
377
|
+
plt.vlines(x=r_spec[:,0], ymin=[0]*r_spec.shape[0], ymax=-r_spec[:,1], linewidth=3, color='red')
|
|
378
|
+
plt.xlabel('m/z', fontsize=7)
|
|
379
|
+
plt.ylabel(ylab, fontsize=7)
|
|
380
|
+
plt.xticks(fontsize=7)
|
|
381
|
+
plt.yticks(fontsize=7)
|
|
382
|
+
plt.title(f'Transformed Spectra', fontsize=10)
|
|
383
|
+
else:
|
|
384
|
+
plt.text(0.5, 0.5, 'All points in the spectra were removed during preprocessing. \nChange the spectrum_preprocesing_order and/or change other spectrum-preprocessing parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
385
|
+
plt.xticks([])
|
|
386
|
+
plt.yticks([])
|
|
387
|
+
|
|
388
|
+
plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
|
|
389
|
+
plt.figlegend(loc='upper center')
|
|
390
|
+
|
|
391
|
+
fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
|
|
392
|
+
fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
|
|
393
|
+
fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
|
|
394
|
+
fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
|
|
395
|
+
fig.text(0.05, 0.08, f'Window Size (Centroiding): {window_size_centroiding}', fontsize=7)
|
|
396
|
+
fig.text(0.05, 0.05, f'Window Size (Matching): {window_size_matching}', fontsize=7)
|
|
397
|
+
if similarity_measure == 'mixture':
|
|
398
|
+
fig.text(0.05, 0.02, f'Weights for mixture similarity: {weights}', fontsize=7)
|
|
399
|
+
|
|
400
|
+
fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{mz_min_tmp},{mz_max_tmp}]', fontsize=7)
|
|
401
|
+
fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
|
|
402
|
+
fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
|
|
403
|
+
fig.text(0.40, 0.11, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
|
|
404
|
+
fig.text(0.40, 0.08, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
|
|
405
|
+
|
|
406
|
+
if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
|
|
407
|
+
url_tmp1 = get_pubchem_url(query=spectrum_ID1)
|
|
408
|
+
url_tmp2 = get_pubchem_url(query=spectrum_ID2)
|
|
409
|
+
t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
|
|
410
|
+
t2 = fig.text(0.40, 0.02, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
|
|
411
|
+
t1.set_url(url_tmp1)
|
|
412
|
+
t2.set_url(url_tmp2)
|
|
413
|
+
|
|
414
|
+
if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
|
|
415
|
+
url_tmp1 = get_pubchem_url(query=spectrum_ID1)
|
|
416
|
+
t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
|
|
417
|
+
t1.set_url(url_tmp1)
|
|
418
|
+
|
|
419
|
+
if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
|
|
420
|
+
url_tmp2 = get_pubchem_url(query=spectrum_ID2)
|
|
421
|
+
t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
|
|
422
|
+
t2.set_url(url_tmp2)
|
|
423
|
+
|
|
424
|
+
fig.savefig(output_path, format='svg')
|
|
425
|
+
|
|
426
|
+
if return_plot == True:
|
|
427
|
+
return fig
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FNLW', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
    """Plot a query and a reference NRMS spectrum head-to-head, before and after preprocessing.

    Builds a two-panel figure: top panel shows the untransformed spectra
    (query pointing up in blue, reference pointing down in red), bottom panel
    shows the spectra after the requested preprocessing steps, and the margins
    are annotated with the parameters used plus the resulting similarity score.
    The figure is always written to ``output_path`` as SVG; it is additionally
    returned when ``return_plot`` is True.

    query_data / reference_data: paths to spectral libraries. Raw formats
        (mgf/mzML/cdf/msp/json, any case) are first converted to TXT via
        build_library_from_raw_data; TXT files are read directly with
        ``pd.read_csv(..., sep='\\t')`` and are expected to expose columns
        'id', 'mz_ratio' and 'intensity' (the code indexes all three).
    spectrum_ID1 / spectrum_ID2: IDs of the two spectra to compare; default
        to the first spectrum of the query and reference library respectively.
    spectrum_preprocessing_order: iterable over {'F','N','L','W'} — Filtering,
        Noise removal, Low-entropy transform, Weight-factor transform,
        applied in the given order.
    Exits the process (sys.exit) on any invalid argument.
    """

    # --- load the query library (converting raw formats to TXT first) ---
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
            # raw data: convert to the tab-separated library format next to the input file
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_query = pd.read_csv(query_data, sep='\t')
        unique_query_ids = df_query['id'].unique()

    # --- load the reference library the same way ---
    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        extension = reference_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
            output_path_tmp = reference_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
            df_reference = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_reference = pd.read_csv(reference_data, sep='\t')
        unique_reference_ids = df_reference['id'].unique()

    # --- resolve the two spectrum IDs (default to the first row of each library) ---
    if spectrum_ID1 is not None:
        spectrum_ID1 = str(spectrum_ID1)
    else:
        spectrum_ID1 = str(df_query.iloc[0,0])
        print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')

    if spectrum_ID2 is not None:
        spectrum_ID2 = str(spectrum_ID2)
    else:
        spectrum_ID2 = str(df_reference.iloc[0,0])
        print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')

    # --- validate preprocessing order and similarity measure ---
    if spectrum_preprocessing_order is not None:
        spectrum_preprocessing_order = list(spectrum_preprocessing_order)
    else:
        spectrum_preprocessing_order = ['F','N','W','L']
    if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
        print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
        sys.exit()

    if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
        print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
        sys.exit()

    # --- validate/normalize numeric range parameters (ints coerced to floats where required) ---
    if isinstance(int_min,int) is True:
        int_min = float(int_min)
    if isinstance(int_max,int) is True:
        int_max = float(int_max)
    if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
        print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
        sys.exit()
    if mz_min < 0:
        print('\nError: mz_min should be a non-negative integer')
        sys.exit()
    if mz_max <= 0:
        print('\nError: mz_max should be a positive integer')
        sys.exit()
    if int_min < 0:
        print('\nError: int_min should be a non-negative float')
        sys.exit()
    if int_max <= 0:
        print('\nError: int_max should be a positive float')
        sys.exit()

    if isinstance(noise_threshold,int) is True:
        noise_threshold = float(noise_threshold)
    if isinstance(noise_threshold,float) is False or noise_threshold < 0:
        print('Error: noise_threshold must be a positive float.')
        sys.exit()

    if isinstance(wf_intensity,int) is True:
        wf_intensity = float(wf_intensity)
    if isinstance(wf_mz,int) is True:
        wf_mz = float(wf_mz)
    if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
        print('Error: wf_mz and wf_intensity must be integers or floats')
        sys.exit()

    if entropy_dimension <= 0:
        print('\nError: entropy_dimension should be a positive float')
        sys.exit()
    else:
        q = entropy_dimension

    normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings

    if y_axis_transformation not in ['normalized','none','log10','sqrt']:
        print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
        sys.exit()

    if output_path is None:
        print(f'Warning: plots will be saved to the svg ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
        output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'

    # --- build the common unit m/z grid spanning both libraries ---
    min_mz = np.min([df_query['mz_ratio'].min(), df_reference['mz_ratio'].min()])
    max_mz = np.max([df_query['mz_ratio'].max(), df_reference['mz_ratio'].max()])
    mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

    unique_query_ids = df_query['id'].unique().tolist()
    unique_reference_ids = df_reference['id'].unique().tolist()
    unique_query_ids = [str(ID) for ID in unique_query_ids]
    unique_reference_ids = [str(ID) for ID in unique_reference_ids]
    common_IDs = np.intersect1d([str(ID) for ID in unique_query_ids], [str(ID) for ID in unique_reference_ids])
    if len(common_IDs) > 0:
        print(f'Warning: the query and reference library have overlapping IDs: {common_IDs}')

    # --- pull the two spectra; both IDs may live in the same library, otherwise
    #     ID1 is taken from the query and ID2 from the reference (swapping if needed) ---
    if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
        q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID1)[0]
        r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID2)[0]
        q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
    elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
        q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID1)[0]
        r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID2)[0]
        q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
    else:
        if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
            # swap so that ID1 indexes the query library and ID2 the reference library
            spec_tmp = spectrum_ID1
            spectrum_ID1 = spectrum_ID2
            spectrum_ID2 = spec_tmp
        q_idxs_tmp = np.where(df_query['id'].astype(str) == spectrum_ID1)[0]
        r_idxs_tmp = np.where(df_reference['id'].astype(str) == spectrum_ID2)[0]
        q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))

    # project both spectra onto the shared m/z grid (missing m/z -> intensity 0)
    q_spec = convert_spec(q_spec,mzs)
    r_spec = convert_spec(r_spec,mzs)

    # raw-scale intensity range over the non-zero peaks, reported in the figure margin
    int_min_tmp_q = min(q_spec[q_spec[:,1].nonzero(),1][0])
    int_min_tmp_r = min(r_spec[r_spec[:,1].nonzero(),1][0])
    int_max_tmp_q = max(q_spec[q_spec[:,1].nonzero(),1][0])
    int_max_tmp_r = max(r_spec[r_spec[:,1].nonzero(),1][0])
    int_min_tmp = int(min([int_min_tmp_q,int_min_tmp_r]))
    int_max_tmp = int(max([int_max_tmp_q,int_max_tmp_r]))

    fig, axes = plt.subplots(nrows=2, ncols=1)

    # --- top panel: untransformed spectra ---
    plt.subplot(2,1,1)

    if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
        plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
        plt.xticks([])
        plt.yticks([])
    else:
        # plot copies so the y-axis display transformation never touches the data
        # used for the similarity computation below
        q_spec_pre_trans = q_spec.copy()
        r_spec_pre_trans = r_spec.copy()
        q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
        r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)

        if y_axis_transformation == 'normalized':
            q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
            r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
            ylab = 'Normalized Intensity'
        elif y_axis_transformation == 'log10':
            q_spec_pre_trans[:,1] = np.log10(q_spec_pre_trans[:,1]+1)
            r_spec_pre_trans[:,1] = np.log10(r_spec_pre_trans[:,1]+1)
            ylab = 'log10(Intensity)'
        elif y_axis_transformation == 'sqrt':
            q_spec_pre_trans[:,1] = np.sqrt(q_spec_pre_trans[:,1])
            r_spec_pre_trans[:,1] = np.sqrt(r_spec_pre_trans[:,1])
            ylab = 'sqrt(Intensity)'
        else:
            ylab = 'Raw Intensity'
        # query up (blue), reference mirrored down (red)
        plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*len(q_spec_pre_trans[:,0]), ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID1: {spectrum_ID1}')
        plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*len(r_spec_pre_trans[:,0]), ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID2: {spectrum_ID2}')
        plt.xlabel('m/z',fontsize=7)
        plt.ylabel(ylab, fontsize=7)
        plt.xticks(fontsize=7)
        plt.yticks(fontsize=7)
        plt.title('Untransformed Query and Reference Spectra', fontsize=10)

    # --- apply the preprocessing pipeline in the requested order ---
    # N and F skip the reference spectrum when the reference library is flagged high-quality
    for transformation in spectrum_preprocessing_order:
        if transformation == 'W':
            q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
            r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
        if transformation == 'L':
            q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method)
            r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method)
        if transformation == 'N':
            q_spec = remove_noise(q_spec, nr = noise_threshold)
            if high_quality_reference_library == False or high_quality_reference_library == 'False':
                r_spec = remove_noise(r_spec, nr = noise_threshold)
        if transformation == 'F':
            q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
            if high_quality_reference_library == False or high_quality_reference_library == 'False':
                r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)

    # similarity is computed on the transformed intensities; degenerate spectra score 0
    if q_spec.shape[0] > 1:
        similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
    else:
        similarity_score = 0

    # --- bottom panel: transformed spectra ---
    plt.subplot(2,1,2)

    if q_spec.shape[0] == 0 or r_spec.shape[0] == 0:
        plt.text(0.5, 0.5, 'The query and/or reference spectrum has no ion fragments left after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
        plt.xticks([])
        plt.yticks([])
    elif np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
        plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
        plt.xticks([])
        plt.yticks([])
    else:
        if y_axis_transformation == 'normalized':
            q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
            r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
            ylab='Normalized Intensity'
        elif y_axis_transformation == 'log10':
            q_spec[:,1] = np.log10(q_spec[:,1]+1)
            r_spec[:,1] = np.log10(r_spec[:,1]+1)
            ylab='log10(Intensity)'
        elif y_axis_transformation == 'sqrt':
            q_spec[:,1] = np.sqrt(q_spec[:,1])
            r_spec[:,1] = np.sqrt(r_spec[:,1])
            ylab='sqrt(Intensity)'
        else:
            ylab = 'Raw Intensity'
        plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=q_spec[:,1], linewidth=3, color='blue')
        plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=-r_spec[:,1], linewidth=3, color='red')
        plt.xlabel('m/z', fontsize=7)
        plt.ylabel(ylab, fontsize=7)
        plt.xticks(fontsize=7)
        plt.yticks(fontsize=7)
        plt.title(f'Transformed Query and Reference Spectra', fontsize=10)

    # --- margin annotations: parameters and score ---
    plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
    plt.figlegend(loc='upper center')

    fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
    fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
    fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
    fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
    fig.text(0.05, 0.08, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
    if similarity_measure == 'mixture':
        fig.text(0.05, 0.05, f'Weights for mixture similarity: {weights}', fontsize=7)

    fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{min_mz},{max_mz}]', fontsize=7)
    fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
    fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
    fig.text(0.40, 0.11, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)

    # optional clickable PubChem URLs for either spectrum
    if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
        url_tmp1 = get_pubchem_url(query=spectrum_ID1)
        url_tmp2 = get_pubchem_url(query=spectrum_ID2)
        t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
        t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
        t1.set_url(url_tmp1)
        t2.set_url(url_tmp2)

    if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
        url_tmp1 = get_pubchem_url(query=spectrum_ID1)
        t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
        t1.set_url(url_tmp1)

    if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
        url_tmp2 = get_pubchem_url(query=spectrum_ID2)
        t2 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
        t2.set_url(url_tmp2)

    fig.savefig(output_path, format='svg')

    if return_plot == True:
        return fig
|
|
709
|
+
|
|
710
|
+
def wf_transform(spec_mzs, spec_ints, wf_mz, wf_int):
    """Weight-factor transform: intensity_i -> mz_i**wf_mz * intensity_i**wf_int."""
    weighted = np.power(spec_mzs, wf_mz) * np.power(spec_ints, wf_int)
    return weighted
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
def LE_transform(intensity, thresh, normalization_method):
    """Low-entropy transform.

    Normalize the intensities to a probability distribution; when its Shannon
    entropy S lies strictly between 0 and thresh, re-weight every probability
    by the exponent (1 + S) / (1 + thresh). A spectrum whose normalized sum is
    not positive is replaced by zeros.
    """
    probs = normalize(intensity, method=normalization_method)
    if np.sum(probs) > 0:
        entropy_val = scipy.stats.entropy(probs.astype('float'))
        if 0 < entropy_val < thresh:
            exponent = (1 + entropy_val) / (1 + thresh)
            intensity = np.power(probs, exponent)
    else:
        intensity = np.zeros(len(intensity))
    return intensity
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
def normalize(intensities,method='standard'):
    """Normalize an intensity vector into a probability distribution.

    method='standard' divides by the total sum (in place for float arrays).
    method='softmax' exponentiates first; intensities > 700 would overflow
    np.exp, so standard normalization is applied instead with a warning.
    A vector whose sum is not positive is returned unchanged.

    Bug fix: the softmax branch previously divided the RAW intensities by the
    sum of the exponentials (intensities / sum(exp(intensities))), which is
    not a softmax and does not sum to 1. It now returns
    exp(intensities) / sum(exp(intensities)).
    """
    if np.sum(intensities) > 0:
        if method == 'softmax':
            if np.any(intensities > 700):
                print("Warning: some intensities are too large to exponentiate. Applying standard normalization.")
                intensities /= np.sum(intensities)
            else:
                intensities2 = np.exp(intensities)
                if np.isinf(intensities2).sum() == 0:
                    # softmax: exponentiated values divided by their own sum
                    intensities = intensities2 / np.sum(intensities2)
        elif method == 'standard':
            intensities /= np.sum(intensities)
    return(intensities)
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
def filter_spec_lcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999, is_matched = False):
    """Restrict an LC-MS spectrum to the given m/z and intensity windows.

    When is_matched is False, rows outside either window are dropped.
    When is_matched is True (peak positions must be preserved for alignment),
    rows outside the m/z window are dropped but rows outside the intensity
    window are zeroed in place instead of removed.

    Bug fix: the matched branch previously used inverted comparisons
    (``>= int_min`` / ``<= int_max``), zeroing exactly the peaks INSIDE the
    intensity window. It now zeroes the out-of-window peaks, consistent with
    the unmatched branch and with filter_spec_gcms.
    """
    if is_matched == False:
        spec = spec[spec[:,0] >= mz_min]
        spec = spec[spec[:,0] <= mz_max]
        spec = spec[spec[:,1] >= int_min]
        spec = spec[spec[:,1] <= int_max]
    else:
        spec = spec[spec[:,0] >= mz_min]
        spec = spec[spec[:,0] <= mz_max]
        # zero (not drop) rows whose intensity falls outside [int_min, int_max]
        spec[spec[:,1] < int_min] = 0
        spec[spec[:,1] > int_max] = 0
    return(spec)
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
def filter_spec_gcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999):
    """Zero the intensity of peaks outside the m/z window [mz_min, mz_max]
    or the intensity window [int_min, int_max]; peak positions are kept."""
    mz_col = spec[:, 0]
    int_col = spec[:, 1]
    outside = (mz_col < mz_min) | (mz_col > mz_max) | (int_col < int_min) | (int_col > int_max)
    spec[outside, 1] = 0
    return(spec)
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
def remove_noise(spec, nr):
    """Zero every peak whose intensity is below nr * (max intensity).

    Spectra with at most one row, or a None noise ratio, pass through
    untouched. Zeroing is applied to the whole row, in place.
    """
    if spec.shape[0] > 1 and nr is not None:
        cutoff = np.max(spec[:, 1]) * nr
        spec[spec[:, 1] < cutoff] = 0
    return(spec)
|
|
770
|
+
|
|
771
|
+
|
|
772
|
+
def centroid_spectrum(spec, window_size):
    """Merge profile-mode peaks closer than window_size into centroids.

    spec: (n, 2) array of [m/z, intensity], any order; it is sorted by m/z
    first. If no adjacent pair of m/z values is within window_size the
    spectrum is already centroided and returned unchanged. Otherwise peaks
    are consumed in decreasing-intensity order: each seed peak absorbs every
    neighbour within window_size on both sides, emitting one centroid at the
    intensity-weighted mean m/z with the summed intensity. NOTE: this mutates
    the input array (absorbed intensities are zeroed in place).
    """
    spec = spec[np.argsort(spec[:,0])]

    mz_array = spec[:, 0]
    need_centroid = 0
    if mz_array.shape[0] > 1:
        # centroiding is needed only if some adjacent peaks fall within the window
        mz_delta = mz_array[1:] - mz_array[:-1]
        if np.min(mz_delta) <= window_size:
            need_centroid = 1

    if need_centroid:
        # process the most intense remaining peak first
        intensity_order = np.argsort(-spec[:, 1])
        spec_new = []
        for i in intensity_order:
            mz_delta_allowed = window_size

            # skip peaks already absorbed (intensity zeroed) by a stronger neighbour
            if spec[i, 1] > 0:
                # walk left while still within the window of the seed peak
                i_left = i - 1
                while i_left >= 0:
                    mz_delta_left = spec[i, 0] - spec[i_left, 0]
                    if mz_delta_left <= mz_delta_allowed:
                        i_left -= 1
                    else:
                        break
                i_left += 1

                # walk right the same way; [i_left, i_right) is the merge group
                i_right = i + 1
                while i_right < spec.shape[0]:
                    mz_delta_right = spec[i_right, 0] - spec[i, 0]
                    if mz_delta_right <= mz_delta_allowed:
                        i_right += 1
                    else:
                        break

                # centroid m/z = intensity-weighted mean of the group
                intensity_sum = np.sum(spec[i_left:i_right, 1])
                intensity_weighted_sum = np.sum(spec[i_left:i_right, 0] * spec[i_left:i_right, 1])

                spec_new.append([intensity_weighted_sum / intensity_sum, intensity_sum])
                # mark the whole group as consumed
                spec[i_left:i_right, 1] = 0

        spec_new = np.array(spec_new)
        spec_new = spec_new[np.argsort(spec_new[:, 0])]
        if spec_new.shape[0] > 1:
            # sort is repeated here; result is unchanged by the second sort
            spec_new = spec_new[np.argsort(spec_new[:, 0])]
            return spec_new
        else:
            # fewer than two centroids survive: return a single dummy peak
            return np.array([[0,0]])
    else:
        return spec
|
|
821
|
+
|
|
822
|
+
|
|
823
|
+
|
|
824
|
+
def match_peaks_in_spectra(spec_a, spec_b, window_size):
    """Align two m/z-sorted spectra into a joint peak table.

    Two-pointer merge over spec_a and spec_b (both (n, 2) arrays of
    [m/z, intensity], assumed sorted by m/z). Peaks of B within
    +/- window_size of the current A peak are accumulated onto that A peak;
    unmatched peaks from either side are emitted with 0 for the other column.
    Returns an (n, 3) float64 array of rows [m/z, intensity_a, intensity_b]
    ([[0, 0, 0]] when both inputs are empty).
    """
    a = 0
    b = 0

    spec_merged = []
    peak_b_int = 0.  # B intensity accumulated against the current A peak
    while a < spec_a.shape[0] and b < spec_b.shape[0]:
        mass_delta = spec_a[a, 0] - spec_b[b, 0]

        if mass_delta < -window_size:
            # current A peak is below the window of the current B peak:
            # flush it together with whatever B intensity it has collected
            spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
            peak_b_int = 0.
            a += 1
        elif mass_delta > window_size:
            # current B peak is below the window of the current A peak: unmatched B
            spec_merged.append([spec_b[b, 0], 0., spec_b[b, 1]])
            b += 1
        else:
            # within the window: fold this B peak into the current A peak
            peak_b_int += spec_b[b, 1]
            b += 1

    # flush a pending A peak that collected B intensity before either side ran out
    if peak_b_int > 0.:
        spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
        peak_b_int = 0.
        a += 1

    # append the leftover tail of whichever spectrum was not exhausted
    if b < spec_b.shape[0]:
        spec_merged += [[x[0], 0., x[1]] for x in spec_b[b:]]

    if a < spec_a.shape[0]:
        spec_merged += [[x[0], x[1], 0.] for x in spec_a[a:]]

    if spec_merged:
        spec_merged = np.array(spec_merged, dtype=np.float64)
    else:
        spec_merged = np.array([[0., 0., 0.]], dtype=np.float64)
    return spec_merged
|
|
860
|
+
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
def convert_spec(spec, mzs):
    """Project a (m/z, intensity) spectrum onto a fixed m/z grid.

    For every grid value in mzs, take the intensity of the first row of spec
    with that exact m/z (0 when absent). Returns a (len(mzs), 2) array of
    [m/z, intensity].

    Improvement: the original scanned spec[:,0] for every grid point
    (O(n*m)); a single dict pass makes the lookup O(n + m). Iterating spec
    in reverse makes the FIRST occurrence of a duplicate m/z win, matching
    the original np.where(...)[0][0] semantics.
    """
    intensity_at = {}
    for mz_val, int_val in reversed(np.asarray(spec).tolist()):
        intensity_at[mz_val] = int_val
    ints_tmp = [intensity_at.get(mz, 0) for mz in mzs]
    out = np.transpose(np.array([mzs, ints_tmp]))
    return out
|
|
873
|
+
|
|
874
|
+
|
|
875
|
+
def get_reference_df(reference_data, likely_reference_IDs=None):
    """Load a reference library as a DataFrame.

    Raw formats (mgf/mzML/cdf/msp/json, any case) are first converted to a
    TXT library next to the input file via build_library_from_raw_data; TXT
    files are read directly. When likely_reference_IDs (path to a headerless
    CSV of IDs) is given, the library is restricted to those IDs.
    """
    ext = reference_data.rsplit('.', 1)[-1]
    raw_extensions = ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF', 'msp', 'MSP', 'json', 'JSON')
    if ext in raw_extensions:
        txt_path = reference_data[:-3] + 'txt'
        build_library_from_raw_data(input_path=reference_data, output_path=txt_path, is_reference=True)
        df_reference = pd.read_csv(txt_path, sep='\t')
    if ext in ('txt', 'TXT'):
        df_reference = pd.read_csv(reference_data, sep='\t')
    if likely_reference_IDs is not None:
        wanted = pd.read_csv(likely_reference_IDs, header=None)
        df_reference = df_reference.loc[df_reference.iloc[:, 0].isin(wanted.iloc[:, 0].tolist())]
    return df_reference
|
|
888
|
+
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
def S_cos(ints_a, ints_b):
    """Cosine similarity of two intensity vectors; 0 when either sums to zero."""
    sum_a = np.sum(ints_a)
    sum_b = np.sum(ints_b)
    if sum_a == 0 or sum_b == 0:
        return(0)
    norm_a = np.sqrt(sum(np.power(ints_a, 2)))
    norm_b = np.sqrt(sum(np.power(ints_b, 2)))
    return np.dot(ints_a, ints_b) / (norm_a * norm_b)
|
|
896
|
+
|
|
897
|
+
|
|
898
|
+
def ent_renyi(ints, q):
    """Renyi entropy of order q (q != 1) of an intensity distribution."""
    power_sum = np.sum(np.power(ints, q))
    return np.log(power_sum) / (1 - q)
|
|
900
|
+
|
|
901
|
+
|
|
902
|
+
def ent_tsallis(ints, q):
    """Tsallis entropy of order q (q != 1) of an intensity distribution."""
    power_sum = np.sum(np.power(ints, q))
    return (power_sum - 1) / (1 - q)
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
def S_shannon(ints_a, ints_b):
    """Shannon-entropy similarity: 1 - (2*H(a+b) - H(a) - H(b)) / log(4)."""
    mixed_entropy = scipy.stats.entropy(ints_a + ints_b)
    excess = 2 * mixed_entropy - scipy.stats.entropy(ints_a) - scipy.stats.entropy(ints_b)
    return(1 - excess / np.log(4))
|
|
911
|
+
|
|
912
|
+
|
|
913
|
+
def S_renyi(ints_a, ints_b, q):
    """Renyi-entropy similarity of order q; falls back to Shannon at q == 1."""
    if q == 1:
        print('Warning: the Renyi Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
        return S_shannon(ints_a, ints_b)
    half_a = ints_a / 2
    half_b = ints_b / 2
    ent_a = ent_renyi(ints_a, q)
    ent_b = ent_renyi(ints_b, q)
    ent_merg = ent_renyi(half_a + half_b, q)
    # normalizer: maximum possible entropy excess for distributions a and b
    N = (1 / (1 - q)) * (2 * np.log(np.sum(np.power(half_a, q)) + np.sum(np.power(half_b, q))) - np.log(np.sum(np.power(ints_a, q))) - np.log(np.sum(np.power(ints_b, q))))
    return 1 - (2 * ent_merg - ent_a - ent_b) / N
|
|
923
|
+
|
|
924
|
+
|
|
925
|
+
def S_tsallis(ints_a, ints_b, q):
    """Tsallis-entropy similarity of order q; falls back to Shannon at q == 1."""
    if q == 1:
        print('Warning: the Tsallis Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
        return S_shannon(ints_a, ints_b)
    half_a = ints_a / 2
    half_b = ints_b / 2
    ent_a = ent_tsallis(ints_a, q)
    ent_b = ent_tsallis(ints_b, q)
    ent_merg = ent_tsallis(half_a + half_b, q)
    # normalizer: maximum possible entropy excess for distributions a and b
    N = np.sum(2 * np.power(half_a, q) + 2 * np.power(half_b, q) - np.power(ints_a, q) - np.power(ints_b, q)) / (1 - q)
    return 1 - (2 * ent_merg - ent_a - ent_b) / N
|
|
935
|
+
|
|
936
|
+
def S_mixture(ints_a, ints_b, weights={'Cosine':0.25, 'Shannon':0.25, 'Renyi':0.25, 'Tsallis':0.25}, q=1.1):
    """Weighted combination of the cosine and entropy-based similarity scores.

    weights maps a subset of {'Cosine','Shannon','Renyi','Tsallis'} to their
    mixture weights; q is the entropy dimension for the Renyi/Tsallis terms.
    Exits the process on unknown weight keys.
    """
    if set(weights.keys()).issubset(set(['Cosine','Shannon','Renyi','Tsallis'])) is False:
        print('Error: the keys to the weight parameter dict of the function S_mixture must be one of the four: Cosine, Shannon, Renyi, Tsallis')
        sys.exit()

    components = {
        'Cosine': lambda: S_cos(ints_a, ints_b),
        'Shannon': lambda: S_shannon(ints_a, ints_b),
        'Renyi': lambda: S_renyi(ints_a, ints_b, q),
        'Tsallis': lambda: S_tsallis(ints_a, ints_b, q),
    }
    similarity = 0
    for name, weight in weights.items():
        similarity += weight * components[name]()
    return similarity
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
def get_contingency_entries(ints_a, ints_b):
    """Presence/absence contingency counts over aligned intensity vectors.

    Returns [a, b, c] where a = positions non-zero only in ints_a,
    b = positions non-zero only in ints_b, c = positions non-zero in both.
    """
    only_a = 0
    only_b = 0
    shared = 0

    for x, y in zip(ints_a, ints_b):
        x_present = x != 0
        y_present = y != 0
        if x_present and y_present:
            shared += 1
        elif x_present:
            only_a += 1
        elif y_present:
            only_b += 1
    return [only_a, only_b, shared]
|
|
967
|
+
|
|
968
|
+
|
|
969
|
+
def S_jaccard(ints_a, ints_b):
    """Jaccard similarity on peak presence/absence: c / (a + b + c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = only_a + only_b + shared
    return 0 if denom == 0 else shared / denom
|
|
980
|
+
|
|
981
|
+
|
|
982
|
+
def S_dice(ints_a, ints_b):
    """Dice similarity on peak presence/absence: 2c / (a + b + 2c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = only_a + only_b + 2 * shared
    return 0 if denom == 0 else 2 * shared / denom
|
|
993
|
+
|
|
994
|
+
|
|
995
|
+
def S_3w_jaccard(ints_a, ints_b):
    """3W-Jaccard similarity on peak presence/absence: 3c / (a + b + 3c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = only_a + only_b + 3 * shared
    return 0 if denom == 0 else 3 * shared / denom
|
|
1006
|
+
|
|
1007
|
+
|
|
1008
|
+
def S_sokal_sneath(ints_a, ints_b):
    """Sokal-Sneath similarity on peak presence/absence: c / (2a + 2b + c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = 2 * only_a + 2 * only_b + shared
    return 0 if denom == 0 else shared / denom
|
|
1019
|
+
|
|
1020
|
+
|
|
1021
|
+
def S_binary_cosine(ints_a, ints_b):
    """Binary (Ochiai) cosine on peak presence/absence: c / sqrt((a+c)(b+c))."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = np.sqrt((only_a + shared) * (only_b + shared))
    return 0 if denom == 0 else shared / denom
|
|
1032
|
+
|
|
1033
|
+
|
|
1034
|
+
def S_mountford(ints_a, ints_b):
    """Mountford similarity on peak presence/absence: 2c / (c(a+b) + 2ab);
    defined as 1 when the denominator vanishes."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = shared * (only_a + only_b) + 2 * only_a * only_b
    return 1 if denom == 0 else 2 * shared / denom
|
|
1045
|
+
|
|
1046
|
+
|
|
1047
|
+
def S_mcconnaughey(ints_a, ints_b):
    """McConnaughey similarity on peak presence/absence: (c^2 - ab) / ((a+c)(b+c))."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = (only_a + shared) * (only_b + shared)
    return 0 if denom == 0 else (shared**2 - only_a * only_b) / denom
|
|
1058
|
+
|
|
1059
|
+
|
|
1060
|
+
def S_driver_kroeber(ints_a, ints_b):
    """Driver-Kroeber similarity on peak presence/absence:
    c(a + b + 2c) / (2(a+c)(b+c))."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = 2 * (only_a + shared) * (only_b + shared)
    return 0 if denom == 0 else shared * (only_a + only_b + 2 * shared) / denom
|
|
1071
|
+
|
|
1072
|
+
|
|
1073
|
+
def S_simpson(ints_a, ints_b):
    """Simpson (overlap) similarity on peak presence/absence: c / min(a+c, b+c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = min(only_a + shared, only_b + shared)
    return 0 if denom == 0 else shared / denom
|
|
1084
|
+
|
|
1085
|
+
|
|
1086
|
+
def S_braun_banquet(ints_a, ints_b):
    """Braun-Blanquet similarity on peak presence/absence: c / max(a+c, b+c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = max(only_a + shared, only_b + shared)
    return 0 if denom == 0 else shared / denom
|
|
1097
|
+
|
|
1098
|
+
|
|
1099
|
+
def S_fager_mcgowan(ints_a, ints_b):
    """Fager-McGowan similarity from the binary contingency entries of two intensity vectors.

    Returns c/sqrt((a+c)(b+c)) - 1/(2*sqrt(max(a+c, b+c))),
    or 0 when either denominator is zero.
    """
    a, b, c = get_contingency_entries(ints_a, ints_b)[:3]
    geometric_term = np.sqrt((a + c) * (b + c))
    correction_term = 2 * np.sqrt(max(a + c, b + c))
    if geometric_term == 0 or correction_term == 0:
        return 0
    return c / geometric_term - 1 / correction_term
|
|
1111
|
+
|
|
1112
|
+
|
|
1113
|
+
def S_kulczynski(ints_a, ints_b):
    """Kulczynski similarity from the binary contingency entries of two intensity vectors.

    Returns c / (a + b), or 1 when a + b == 0 (no mismatched peaks at all).
    """
    a, b, c = get_contingency_entries(ints_a, ints_b)[:3]
    mismatches = a + b
    return 1 if mismatches == 0 else c / mismatches
|
|
1124
|
+
|
|
1125
|
+
|
|
1126
|
+
def S_intersection(ints_a, ints_b):
    """Intersection similarity: the raw count of shared peaks (contingency entry c)."""
    return get_contingency_entries(ints_a, ints_b)[2]
|
|
1130
|
+
|
|
1131
|
+
|
|
1132
|
+
def S_hamming(ints_a, ints_b):
    """Hamming-style similarity from the binary contingency entries of two intensity vectors.

    Returns 1 / (a + b), or 1 when a + b == 0 (no mismatched peaks).
    NOTE(review): the value depends only on the mismatch counts a and b,
    not on the shared-peak count c — confirm this is the intended formula.
    """
    a, b, _ = get_contingency_entries(ints_a, ints_b)[:3]
    mismatches = a + b
    return 1 if mismatches == 0 else 1 / mismatches
|
|
1142
|
+
|
|
1143
|
+
|
|
1144
|
+
def S_hellinger(ints_a, ints_b):
    """Hellinger-type similarity from the binary contingency entries of two intensity vectors.

    Returns 1 - sqrt(1 - c / sqrt((a+c)*(b+c))).

    Fixes: unlike every sibling measure, the original had no guard for a
    zero denominator, so two empty spectra produced a divide-by-zero nan;
    we return 0 in that case, consistent with the other measures. The
    radicand is also clamped at 0 to absorb tiny negative values from
    floating-point rounding (mathematically c <= sqrt((a+c)(b+c))).
    """
    a, b, c = get_contingency_entries(ints_a, ints_b)[:3]
    denom = np.sqrt((a + c) * (b + c))
    if denom == 0:
        # No peaks in one or both spectra: report no similarity instead of nan.
        return 0
    return 1 - np.sqrt(max(0.0, 1 - c / denom))
|
|
1151
|
+
|
|
1152
|
+
|
|
1153
|
+
def get_similarity(similarity_measure, q_ints, r_ints, weights, q):
    """Compute the similarity between a query and reference intensity vector.

    Args:
        similarity_measure: name of the measure to use (e.g. 'cosine', 'shannon',
            'jaccard', ...). Dispatches to the corresponding S_* function.
        q_ints, r_ints: query / reference intensity vectors.
        weights: only used by the 'mixture' measure.
        q: entropy order; only used by 'renyi', 'tsallis' and 'mixture'.

    Returns:
        The similarity score produced by the selected measure.

    Raises:
        ValueError: if `similarity_measure` is not a recognized name.
            (Previously an unknown name fell through to an unbound local and
            raised a confusing UnboundLocalError.)
    """
    if similarity_measure == 'cosine':
        return S_cos(q_ints, r_ints)

    if similarity_measure in ('shannon', 'renyi', 'tsallis'):
        # Entropy-based measures operate on probability-normalized intensities.
        q_ints = normalize(q_ints, method='standard')
        r_ints = normalize(r_ints, method='standard')
        if similarity_measure == 'shannon':
            return S_shannon(q_ints, r_ints)
        if similarity_measure == 'renyi':
            return S_renyi(q_ints, r_ints, q)
        return S_tsallis(q_ints, r_ints, q)

    if similarity_measure == 'mixture':
        return S_mixture(q_ints, r_ints, weights, q)

    # Remaining measures are all two-argument functions of the intensity vectors.
    binary_measures = {
        'jaccard': S_jaccard,
        'dice': S_dice,
        '3w_jaccard': S_3w_jaccard,
        'sokal_sneath': S_sokal_sneath,
        'binary_cosine': S_binary_cosine,
        'mountford': S_mountford,
        'mcconnaughey': S_mcconnaughey,
        'driver_kroeber': S_driver_kroeber,
        'simpson': S_simpson,
        'braun_banquet': S_braun_banquet,
        'fager_mcgowan': S_fager_mcgowan,
        'kulczynski': S_kulczynski,
        'intersection': S_intersection,
        'hamming': S_hamming,
        'hellinger': S_hellinger,
    }
    try:
        return binary_measures[similarity_measure](q_ints, r_ints)
    except KeyError:
        raise ValueError(f"Unknown similarity_measure: {similarity_measure!r}") from None
|
|
1217
|
+
|
|
1218
|
+
|
|
1219
|
+
def _vector_to_full_params(X, default_params, optimize_params):
|
|
1220
|
+
params = default_params.copy()
|
|
1221
|
+
for name, val in zip(optimize_params, X):
|
|
1222
|
+
params[name] = float(val)
|
|
1223
|
+
return params
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
def objective_function_HRMS(X, ctx):
    """Differential-evolution objective for HRMS data.

    Expands the candidate vector X into a full parameter set, scores
    identification accuracy via get_acc_HRMS, and returns 1 - accuracy
    (DE minimizes). Prints the candidate and its accuracy as a side effect.
    """
    candidate = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
    accuracy = get_acc_HRMS(
        ctx["df_query"], ctx["df_reference"],
        ctx["precursor_ion_mz_tolerance"], ctx["ionization_mode"], ctx["adduct"],
        ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
        ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
        candidate["window_size_centroiding"], candidate["window_size_matching"],
        candidate["noise_threshold"],
        candidate["wf_mz"], candidate["wf_int"], candidate["LET_threshold"],
        candidate["entropy_dimension"],
        ctx["high_quality_reference_library"],
        verbose=False,
    )
    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {accuracy*100}%")
    return 1.0 - accuracy
|
|
1241
|
+
|
|
1242
|
+
def objective_function_NRMS(X, ctx):
    """Differential-evolution objective for NRMS data.

    Expands the candidate vector X into a full parameter set, scores
    identification accuracy via get_acc_NRMS, and returns 1 - accuracy
    (DE minimizes). Prints the candidate and its accuracy as a side effect.
    """
    candidate = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
    accuracy = get_acc_NRMS(
        ctx["df_query"], ctx["df_reference"],
        ctx["unique_query_ids"], ctx["unique_reference_ids"],
        ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
        ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
        candidate["noise_threshold"], candidate["wf_mz"], candidate["wf_int"],
        candidate["LET_threshold"], candidate["entropy_dimension"],
        ctx["high_quality_reference_library"],
        verbose=False,
    )
    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {accuracy*100}%")
    return 1.0 - accuracy
|
|
1255
|
+
|
|
1256
|
+
|
|
1257
|
+
|
|
1258
|
+
def tune_params_DE(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, chromatography_platform='HRMS', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=None, param_bounds=None, default_params=None, maxiters=3, de_workers=1):
    """Tune spectrum-preprocessing parameters with SciPy differential evolution.

    Loads the query and reference libraries, optionally filters the reference
    library by ionization mode / adduct, and minimizes (1 - accuracy) over the
    parameters named in `optimize_params` within `param_bounds`. Prints a
    summary of the best parameter set found.

    Fixes relative to the previous version:
      * ctx now carries unique_query_ids / unique_reference_ids, which
        objective_function_NRMS reads — the NRMS branch previously raised
        KeyError.
      * An unsupported query_data extension now fails with a clear message
        instead of leaving df_query unbound.
      * The temporary TXT path strips the real extension length (the old
        `[:-3]` slice mangled 4-character extensions such as 'mzML'/'json').
      * Mutable default arguments replaced with None sentinels (same
        effective defaults).
    """
    # Resolve mutable defaults here rather than in the signature.
    if optimize_params is None:
        optimize_params = ["window_size_centroiding", "window_size_matching", "noise_threshold",
                           "wf_mz", "wf_int", "LET_threshold", "entropy_dimension"]
    if param_bounds is None:
        param_bounds = {"window_size_centroiding": (0.0, 0.5), "window_size_matching": (0.0, 0.5),
                        "noise_threshold": (0.0, 0.25), "wf_mz": (0.0, 5.0), "wf_int": (0.0, 5.0),
                        "LET_threshold": (0.0, 5.0), "entropy_dimension": (1.0, 3.0)}
    if default_params is None:
        default_params = {"window_size_centroiding": 0.5, "window_size_matching": 0.5,
                          "noise_threshold": 0.10, "wf_mz": 0.0, "wf_int": 1.0,
                          "LET_threshold": 0.0, "entropy_dimension": 1.1}

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    extension = query_data.rsplit('.', 1)[-1]
    if extension.lower() in ('mgf', 'mzml', 'cdf', 'msp', 'json'):
        # Raw instrument/export formats are first converted to the TSV library format.
        output_path_tmp = query_data[:-len(extension)] + 'txt'
        build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
        df_query = pd.read_csv(output_path_tmp, sep='\t')
    elif extension.lower() == 'txt':
        df_query = pd.read_csv(query_data, sep='\t')
    else:
        print(f'\nError: Unsupported query_data extension: {extension}')
        sys.exit()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    if isinstance(reference_data, str):
        df_reference = get_reference_df(reference_data=reference_data)
    else:
        # Multiple reference files: load each and concatenate into one library.
        df_reference = pd.concat([get_reference_df(reference_data=f) for f in reference_data],
                                 axis=0, ignore_index=True)

    # Optionally restrict the reference library to one ionization mode / adduct.
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode is not None and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode'] == ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct is not None and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct'] == adduct]

    unique_query_ids = df_query['id'].unique().tolist()
    unique_reference_ids = df_reference['id'].unique().tolist()

    ctx = dict(
        df_query=df_query,
        df_reference=df_reference,
        # objective_function_NRMS reads these two entries; they were
        # previously missing from ctx, breaking the NRMS platform.
        unique_query_ids=unique_query_ids,
        unique_reference_ids=unique_reference_ids,
        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
        ionization_mode=ionization_mode,
        adduct=adduct,
        similarity_measure=similarity_measure,
        weights=weights,
        spectrum_preprocessing_order=spectrum_preprocessing_order,
        mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max,
        high_quality_reference_library=high_quality_reference_library,
        default_params=default_params,
        optimize_params=optimize_params,
    )

    bounds = [param_bounds[p] for p in optimize_params]

    objective = objective_function_HRMS if chromatography_platform == 'HRMS' else objective_function_NRMS
    result = differential_evolution(objective, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)

    best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
    best_acc = 100.0 - (result.fun * 100.0)

    print("\n=== Differential Evolution Result ===")
    print(f"Optimized over: {optimize_params}")
    print("Best values (selected params):")
    for name in optimize_params:
        print(f" {name}: {best_full_params[name]}")
    print("\nFull parameter set used in final evaluation:")
    for k, v in best_full_params.items():
        print(f" {k}: {v}")
    print(f"\nBest accuracy: {best_acc:.3f}%")
    _log(f"best = {result.x}, acc={100*(1-result.fun):.3f}%")
|
|
1333
|
+
|
|
1334
|
+
|
|
1335
|
+
# Fallback grids for the grid-search tuners: each key maps to a LIST of
# candidate values, and the tuners take the Cartesian product of all lists.
# Callers may override any subset via their `grid` argument
# (merged as {**default_*_grid, **grid}).
default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
# NRMS grid: same settings minus the HRMS-only centroiding/matching window sizes.
default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
|
|
1337
|
+
|
|
1338
|
+
|
|
1339
|
+
def _eval_one_HRMS(df_query, df_reference,
                   precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
                   similarity_measure_tmp, weight,
                   spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                   int_min_tmp, int_max_tmp, noise_threshold_tmp,
                   window_size_centroiding_tmp, window_size_matching_tmp,
                   wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
                   entropy_dimension_tmp, high_quality_reference_library_tmp):
    """Score one HRMS grid point and return a flat result row.

    Computes identification accuracy via get_acc_HRMS for a single
    combination of preprocessing/similarity settings. The returned tuple
    starts with the accuracy followed by the settings; `weight` is
    JSON-serialized so the row is plain text.
    """
    settings = dict(
        df_query=df_query, df_reference=df_reference,
        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
        ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
        similarity_measure=similarity_measure_tmp, weights=weight,
        spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
        mz_min=mz_min_tmp, mz_max=mz_max_tmp,
        int_min=int_min_tmp, int_max=int_max_tmp,
        window_size_centroiding=window_size_centroiding_tmp,
        window_size_matching=window_size_matching_tmp,
        noise_threshold=noise_threshold_tmp,
        wf_mz=wf_mz_tmp, wf_int=wf_int_tmp,
        LET_threshold=LET_threshold_tmp,
        entropy_dimension=entropy_dimension_tmp,
        high_quality_reference_library=high_quality_reference_library_tmp,
        verbose=False,  # suppress per-spectrum logging during grid search
    )
    accuracy = get_acc_HRMS(**settings)

    row = (
        accuracy, similarity_measure_tmp, json.dumps(weight), spectrum_preprocessing_order_tmp,
        mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp,
        noise_threshold_tmp, window_size_centroiding_tmp, window_size_matching_tmp,
        wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp,
        high_quality_reference_library_tmp,
    )
    return row
|
|
1373
|
+
|
|
1374
|
+
|
|
1375
|
+
def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
                   similarity_measure_tmp, weight,
                   spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                   int_min_tmp, int_max_tmp, noise_threshold_tmp,
                   wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
                   entropy_dimension_tmp, high_quality_reference_library_tmp):
    """Score one NRMS grid point and return a flat result row.

    Computes identification accuracy via get_acc_NRMS for a single
    combination of preprocessing/similarity settings. The returned tuple
    starts with the accuracy followed by the settings; `weight` is
    JSON-serialized so the row is plain text.

    Fix: pass verbose=False to get_acc_NRMS, matching _eval_one_HRMS and
    objective_function_NRMS — previously this was the only caller that let
    get_acc_NRMS emit its per-spectrum output during a grid search.
    """
    acc = get_acc_NRMS(
        df_query=df_query, df_reference=df_reference,
        unique_query_ids=unique_query_ids, unique_reference_ids=unique_reference_ids,
        similarity_measure=similarity_measure_tmp, weights=weight,
        spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
        mz_min=mz_min_tmp, mz_max=mz_max_tmp,
        int_min=int_min_tmp, int_max=int_max_tmp,
        noise_threshold=noise_threshold_tmp,
        wf_mz=wf_mz_tmp, wf_int=wf_int_tmp,
        LET_threshold=LET_threshold_tmp,
        entropy_dimension=entropy_dimension_tmp,
        high_quality_reference_library=high_quality_reference_library_tmp,
        verbose=False,
    )

    return (
        acc, similarity_measure_tmp, json.dumps(weight), spectrum_preprocessing_order_tmp,
        mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp, noise_threshold_tmp,
        wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp, high_quality_reference_library_tmp
    )
|
|
1401
|
+
|
|
1402
|
+
|
|
1403
|
+
|
|
1404
|
+
|
|
1405
|
+
def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
    """Grid-search HRMS tuning variant for the Shiny app (sequential, with progress prints).

    Loads query/reference libraries, evaluates every combination in the
    (defaults-merged) grid via _eval_one_HRMS, and either returns the result
    DataFrame (return_output=True) or writes it as TSV to output_path.

    NOTE(review): the merged grid values are injected into module globals and
    read back below as bare names (similarity_measure, weight, ...) — this is
    not thread-safe across concurrent calls; confirm the Shiny server only
    runs one tuning job at a time.
    """
    local_grid = {**default_HRMS_grid, **(grid or {})}
    for key, value in local_grid.items():
        # Inject each grid entry as a module-level name used by the code below.
        globals()[key] = value

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
            # Raw formats are converted to the TSV library format first.
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
        # First column is assumed to hold the spectrum IDs.
        unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # Multiple reference files: load each and concatenate.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    # Optionally restrict the reference library to one ionization mode / adduct.
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct']==adduct]

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # Cartesian product over every grid dimension (names injected above).
    param_grid = product(
        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
        noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
        entropy_dimension, high_quality_reference_library
    )

    results = []
    # Total combination count, for the progress messages below.
    total = (
        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
        len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
        len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
        len(entropy_dimension) * len(high_quality_reference_library)
    )
    done = 0
    for params in param_grid:
        res = _eval_one_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params)
        results.append(res)
        done += 1
        # flush=True so progress is visible live in the Shiny log.
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Strip the JSON decoration from the weight dicts so the column holds
    # only the four comma-separated weight values.
    if 'WEIGHT' in df_out.columns:
        df_out['WEIGHT'] = (
            df_out['WEIGHT'].astype(str)
            .str.replace("\"","",regex=False)
            .str.replace("{","",regex=False)
            .str.replace("}","",regex=False)
            .str.replace(":","",regex=False)
            .str.replace("Cosine","",regex=False)
            .str.replace("Shannon","",regex=False)
            .str.replace("Renyi","",regex=False)
            .str.replace("Tsallis","",regex=False)
            .str.replace(" ","",regex=False)
        )

    if return_output:
        return df_out
    else:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
        print(f'Wrote results to {output_path}')
|
|
1500
|
+
|
|
1501
|
+
|
|
1502
|
+
|
|
1503
|
+
def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """Grid-search tuning for NRMS data, evaluated in parallel with joblib.

    Loads query/reference libraries, evaluates every combination in the
    (defaults-merged) grid via _eval_one_NRMS, and either returns the result
    DataFrame (return_output=True) or writes it as TSV to output_path.

    NOTE(review): the merged grid values are injected into module globals and
    read back below as bare names (similarity_measure, weight, ...) — not
    safe for concurrent calls within one process.
    """
    grid = {**default_NRMS_grid, **(grid or {})}
    for key, value in grid.items():
        # Inject each grid entry as a module-level name used by the code below.
        globals()[key] = value

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
            # Raw formats are converted to the TSV library format first.
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_query = pd.read_csv(query_data, sep='\t')
        # NOTE(review): an unsupported extension leaves df_query unbound here
        # and the next line raises NameError — confirm callers pre-validate.
        unique_query_ids = df_query.iloc[:,0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data,str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:,0].unique()
        else:
            # Multiple reference files: load each and concatenate.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:,0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # Cartesian product over every grid dimension (names injected above);
    # evaluate all combinations across every available core.
    param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
                         noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
        'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
    ])
    # Strip the JSON decoration from the weight dicts so the column holds
    # only the four comma-separated weight values.
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(":","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Cosine","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Shannon","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
    if return_output is False:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
    else:
        return df_out
|
|
1565
|
+
|
|
1566
|
+
|
|
1567
|
+
|
|
1568
|
+
def tune_params_on_NRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """Grid-search NRMS tuning variant for the Shiny app (sequential, with progress prints).

    Loads query/reference libraries, evaluates every combination in the
    (defaults-merged) grid via _eval_one_NRMS, and either returns the result
    DataFrame (return_output=True) or writes it as TSV to output_path.

    NOTE(review): the merged grid values are injected into module globals and
    read back below as bare names (similarity_measure, weight, ...) — this is
    not thread-safe across concurrent calls; confirm the Shiny server only
    runs one tuning job at a time.
    """
    local_grid = {**default_NRMS_grid, **(grid or {})}
    for key, value in local_grid.items():
        # Inject each grid entry as a module-level name used by the code below.
        globals()[key] = value

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
            # Raw formats are converted to the TSV library format first.
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
        # First column is assumed to hold the spectrum IDs.
        unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # Multiple reference files: load each and concatenate.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # Cartesian product over every grid dimension (names injected above).
    param_grid = product(
        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
        noise_threshold, wf_mz, wf_int, LET_threshold,
        entropy_dimension, high_quality_reference_library
    )

    results = []
    # Total combination count, for the progress messages below.
    total = (
        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
        len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
    )
    done = 0
    for params in param_grid:
        res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
        results.append(res)
        done += 1
        # flush=True so progress is visible live in the Shiny log.
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Strip the JSON decoration from the weight dicts so the column holds
    # only the four comma-separated weight values.
    if 'WEIGHT' in df_out.columns:
        df_out['WEIGHT'] = (
            df_out['WEIGHT'].astype(str)
            .str.replace("\"","",regex=False)
            .str.replace("{","",regex=False)
            .str.replace("}","",regex=False)
            .str.replace(":","",regex=False)
            .str.replace("Cosine","",regex=False)
            .str.replace("Shannon","",regex=False)
            .str.replace("Renyi","",regex=False)
            .str.replace("Tsallis","",regex=False)
            .str.replace(" ","",regex=False)
        )

    if return_output:
        return df_out
    else:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
        print(f'Wrote results to {output_path}')
|
|
1655
|
+
|
|
1656
|
+
|
|
1657
|
+
|
|
1658
|
+
|
|
1659
|
+
def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
    """Compute top-1 identification accuracy of HRMS spectral library matching.

    Each unique query spectrum in ``df_query`` is compared against every
    reference spectrum in ``df_reference`` (optionally restricted by a
    precursor-ion m/z window).  The reference spectrum with the highest
    similarity score is taken as the prediction, and accuracy is the fraction
    of queries whose predicted reference id equals the query id.

    Parameters
    ----------
    df_query, df_reference : pandas.DataFrame
        Long-format spectra with at least columns ``id``, ``mz_ratio``,
        ``intensity`` (optionally ``precursor_ion_mz``).
    precursor_ion_mz_tolerance : float or None
        If not None and both frames carry ``precursor_ion_mz``, references are
        pre-filtered to +/- this tolerance around the query precursor.
    ionization_mode, adduct :
        Unused here; kept for signature compatibility with callers that pass
        the full tuning parameter set.
    spectrum_preprocessing_order : iterable of str
        Sequence over {'F','C','N','M','W','L'} applied in order to each
        query/reference pair.
    verbose : bool, optional
        When True, prints per-query progress and the prediction table.

    Returns
    -------
    float
        Fraction of query spectra whose top-scoring reference id matches the
        query id.  Ties are broken by the first (left-most) reference column.
    """
    # Canonicalize ids to strings so row/column labels line up later.
    unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
    unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
    all_similarity_rows = []

    for query_idx, qid in enumerate(unique_query_ids):
        if verbose:
            print(f'query spectrum #{query_idx} is being identified')

        q_mask = (df_query['id'] == qid)
        q_idxs = np.where(q_mask)[0]
        if q_idxs.size == 0:
            # No peaks found for this id (e.g. dtype mismatch after astype(str));
            # record an all-zero score row so output shape stays consistent.
            all_similarity_rows.append([0.0]*len(unique_reference_ids))
            continue

        # (n_peaks, 2) array of [m/z, intensity] for this query spectrum.
        q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))

        # Optionally restrict candidate references to a precursor m/z window.
        if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
            precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
        else:
            df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()

        if df_reference_tmp.empty:
            all_similarity_rows.append([0.0]*len(unique_reference_ids))
            continue

        # Group reference peaks by spectrum id, preserving file order.
        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))

        similarity_by_ref = {}

        for ref_id, r_df in ref_groups.items():
            # Copy so per-pair in-place transforms never touch q_spec_base.
            q_spec = q_spec_base.copy()
            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))

            is_matched = False
            for transformation in spectrum_preprocessing_order:
                # Any infinite intensity invalidates the whole spectrum.
                if np.isinf(q_spec[:, 1]).any():
                    q_spec[:, 1] = 0.0
                if np.isinf(r_spec[:, 1]).any():
                    r_spec[:, 1] = 0.0

                # Every step is skipped once a spectrum degenerates to <2 peaks.
                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)

                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    m_spec = match_peaks_in_spectra(
                        spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
                    )
                    if m_spec.size == 0:
                        # No overlapping peaks: force similarity to 0 below.
                        q_spec = np.empty((0,2))
                        r_spec = np.empty((0,2))
                    else:
                        q_spec = m_spec[:, 0:2]
                        r_spec = m_spec[:, [0, 2]]
                    is_matched = True

                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)

                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')

                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = remove_noise(q_spec, nr=noise_threshold)
                    # A curated library is assumed noise-free already.
                    if not high_quality_reference_library:
                        r_spec = remove_noise(r_spec, nr=noise_threshold)

                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = filter_spec_lcms(
                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                    )
                    if not high_quality_reference_library:
                        r_spec = filter_spec_lcms(
                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                        )

            # Similarity only defined for non-degenerate, non-zero spectra.
            if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                q_ints = q_spec[:, 1]
                r_ints = r_spec[:, 1]
                if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                    sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
                else:
                    sim = 0.0
            else:
                sim = 0.0

            similarity_by_ref[str(ref_id)] = float(sim)

        # References filtered out by the precursor window default to 0.0.
        row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
        all_similarity_rows.append(row)

    df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
    df_scores.index.name = 'QUERY.SPECTRUM.ID'

    # Top-1 prediction per query; argmax breaks ties toward the first column.
    top_idx = df_scores.values.argmax(axis=1)
    top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
    top_ids = [df_scores.columns[i] for i in top_idx]

    df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
    if verbose:
        print(df_tmp)

    acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
    return acc
|
|
1768
|
+
|
|
1769
|
+
|
|
1770
|
+
|
|
1771
|
+
def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
    """Compute top-1 identification accuracy of NRMS spectral library matching.

    Spectra are first projected onto a common integer m/z grid spanning both
    datasets, then each query spectrum is scored against every reference
    spectrum after applying the requested preprocessing steps.  Accuracy is
    the fraction of queries whose best-scoring reference id equals the query
    id.

    Parameters
    ----------
    df_query, df_reference : pandas.DataFrame
        Long-format spectra; column 0 is the spectrum id, column 1 the m/z
        value, column 2 the intensity.
    unique_query_ids, unique_reference_ids : sequence
        Spectrum ids to iterate over, in order.
    spectrum_preprocessing_order : iterable of str
        Sequence over {'W','L','N','F'} applied in order to each pair.
    verbose : bool, optional
        When True, prints progress every 1000 reference comparisons.

    Returns
    -------
    float
        Fraction of query spectra correctly identified.  Ties among the top
        scores are joined with ';' in the prediction and therefore count as
        incorrect.
    """
    n_top_matches_to_save = 1

    # Common integer m/z grid covering both datasets.
    min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
    max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
    mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

    all_similarity_scores = []
    for query_idx in range(0,len(unique_query_ids)):
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        q_spec_tmp = convert_spec(q_spec_tmp,mzs)

        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            # BUGFIX: copy instead of aliasing q_spec_tmp.  The 'W' and 'L'
            # steps below assign into q_spec[:,1] in place, so without the
            # copy they would mutate q_spec_tmp and corrupt the query
            # spectrum for every subsequent reference comparison (the HRMS
            # variants of this loop already use .copy()).
            q_spec = q_spec_tmp.copy()
            if verbose is True and ref_idx % 1000 == 0:
                print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)

            for transformation in spectrum_preprocessing_order:
                # Any infinite intensity invalidates the whole spectrum.
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
                if transformation == 'W':
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
                if transformation == 'L':
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
                if transformation == 'N':
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    # A curated library is assumed noise-free already.
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                if transformation == 'F':
                    q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)

            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]

            # All-zero spectra have undefined similarity; score them 0.
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['QUERY.SPECTRUM.ID']

    # Extract the top match per query; ties share the max score and their ids
    # are ';'-joined into one prediction string.
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[unique_query_ids,preds,scores]
    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
    return acc
|
|
1854
|
+
|
|
1855
|
+
|
|
1856
|
+
|
|
1857
|
+
def run_spec_lib_matching_on_HRMS_data_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
    """Run HRMS spectral library matching (Shiny app entry point).

    Loads query and reference spectra from disk, validates all parameters
    (invalid input prints an error and calls ``sys.exit()``), scores every
    query spectrum against the reference library using the configured
    preprocessing pipeline and similarity measure, and either writes the
    top-match table and full score matrix to tab-separated files or returns
    the top-match table when ``return_ID_output`` is True.

    NOTE(review): the mutable dict default for ``weights`` is shared across
    calls; safe only because this function never mutates it.
    """
    # --- Load query spectra: raw formats are converted to a tab-separated
    # --- library first; .txt files are read directly.
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'json' or extension == 'JSON':
            # Replaces the last three characters of the path with 'txt';
            # assumes a 3-character extension (note 'mzML' has four).
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            #build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=True)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_query = pd.read_csv(query_data, sep='\t')
        # NOTE(review): an unrecognized extension leaves df_query undefined
        # and the next line raises NameError.
        unique_query_ids = df_query['id'].unique()

    # --- Load reference spectra: a single path or a list of paths.
    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data,str):
            df_reference = get_reference_df(reference_data,likely_reference_ids)
        else:
            dfs = []
            for f in reference_data:
                tmp = get_reference_df(f,likely_reference_ids)
                dfs.append(tmp)
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    # Optional metadata filters; 'N/A' means "do not filter".
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct']==adduct]

    # --- Validate the preprocessing order: 'M' (peak matching) is mandatory
    # --- and centroiding ('C') must precede it.
    if spectrum_preprocessing_order is not None:
        spectrum_preprocessing_order = list(spectrum_preprocessing_order)
    else:
        spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
    if 'M' not in spectrum_preprocessing_order:
        print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
        sys.exit()
    if 'C' in spectrum_preprocessing_order:
        if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
            print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
            sys.exit()
    if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
        print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
        sys.exit()


    if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kuldzynski','intersection','hamming','hellinger']:
        print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
        sys.exit()

    # --- Numeric parameter validation: ints are promoted to floats where a
    # --- float is expected, then types and ranges are checked.
    if isinstance(int_min,int) is True:
        int_min = float(int_min)
    if isinstance(int_max,int) is True:
        int_max = float(int_max)
    if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
        print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
        sys.exit()
    if mz_min < 0:
        print('\nError: mz_min should be a non-negative integer')
        sys.exit()
    if mz_max <= 0:
        print('\nError: mz_max should be a positive integer')
        sys.exit()
    if int_min < 0:
        print('\nError: int_min should be a non-negative float')
        sys.exit()
    if int_max <= 0:
        print('\nError: int_max should be a positive float')
        sys.exit()

    if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
        print('Error: window_size_centroiding must be a positive float.')
        sys.exit()
    if isinstance(window_size_matching,float) is False or window_size_matching<= 0.0:
        print('Error: window_size_matching must be a positive float.')
        sys.exit()

    if isinstance(noise_threshold,int) is True:
        noise_threshold = float(noise_threshold)
    if isinstance(noise_threshold,float) is False or noise_threshold < 0:
        print('Error: noise_threshold must be a positive float.')
        sys.exit()

    if isinstance(wf_intensity,int) is True:
        wf_intensity = float(wf_intensity)
    if isinstance(wf_mz,int) is True:
        wf_mz = float(wf_mz)
    if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
        print('Error: wf_mz and wf_intensity must be integers or floats')
        sys.exit()

    if entropy_dimension <= 0:
        print('\nError: entropy_dimension should be a positive float')
        sys.exit()
    else:
        # Alias kept for parity with the CLI version; unused below.
        q = entropy_dimension

    normalization_method = 'standard'

    if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
        print('\nError: n_top_matches_to_save should be a positive integer')
        sys.exit()

    if isinstance(print_id_results,bool)==False:
        print('\nError: print_id_results must be either True or False')
        sys.exit()

    # Default output paths fall back to the current working directory.
    if output_identification is None:
        output_identification = f'{Path.cwd()}/output_identification.txt'
        print(f'Warning: writing identification output to {output_identification}')

    if output_similarity_scores is None:
        output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
        print(f'Warning: writing similarity scores to {output_similarity_scores}')


    # --- Score every query spectrum against every (eligible) reference.
    unique_reference_ids = df_reference['id'].unique().tolist()
    all_similarity_scores = []

    for query_idx in range(len(unique_query_ids)):
        if verbose:
            print(f'query spectrum #{query_idx} is being identified')

        q_mask = (df_query['id'] == unique_query_ids[query_idx])
        q_idxs_tmp = np.where(q_mask)[0]
        # (n_peaks, 2) array of [m/z, intensity] for this query spectrum.
        q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))

        # Optionally restrict candidate references to a precursor m/z window.
        if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
            precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
        else:
            df_reference_tmp = df_reference.copy()

        # Group reference peaks by spectrum id, preserving file order.
        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
        unique_reference_ids_tmp = list(ref_groups.keys())

        similarity_by_ref = {}
        for ref_id in unique_reference_ids_tmp:
            # Copy so per-pair in-place transforms never touch q_spec_tmp.
            q_spec = q_spec_tmp.copy()
            r_df = ref_groups[ref_id]
            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))

            is_matched = False

            for transformation in spectrum_preprocessing_order:
                # Any infinite intensity invalidates the whole spectrum.
                if np.isinf(q_spec[:, 1]).sum() > 0:
                    q_spec[:, 1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:, 1]).sum() > 0:
                    r_spec[:, 1] = np.zeros(r_spec.shape[0])

                # Each step is skipped once either spectrum has <2 peaks.
                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)

                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    # After matching, both spectra share the m/z axis
                    # (column 0) with paired intensities in columns 1 and 2.
                    m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                    q_spec = m_spec[:, 0:2]
                    r_spec = m_spec[:, [0, 2]]
                    is_matched = True

                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)

                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)

                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = remove_noise(q_spec, nr=noise_threshold)
                    # A curated library is assumed noise-free already.
                    if not high_quality_reference_library:
                        r_spec = remove_noise(r_spec, nr=noise_threshold)

                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = filter_spec_lcms(
                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                    )
                    if not high_quality_reference_library:
                        r_spec = filter_spec_lcms(
                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                        )

            q_ints = q_spec[:, 1]
            r_ints = r_spec[:, 1]

            # Similarity only defined for non-degenerate, non-zero spectra.
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
            else:
                sim = 0.0

            similarity_by_ref[ref_id] = sim

        # References filtered out by the precursor window default to 0.0.
        row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
        all_similarity_scores.append(row_scores)

    df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['QUERY.SPECTRUM.ID']


    # --- Extract the n_top_matches_to_save best references per query.
    # Ties share a rank: their ids are ';'-joined and the tied score saved.
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            # Drop the tied top columns so the next rank sees the remainder.
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[preds,scores]

    # Column names: RANK.k.PRED followed by RANK.k.SIMILARITY.SCORE.
    cnames_preds = []
    cnames_scores = []
    for i in range(0,n_top_matches_to_save):
        cnames_preds.append(f'RANK.{i+1}.PRED')
        cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')

    df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
    df_top_ref_specs.index = unique_query_ids
    df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']

    # Prefix score-matrix columns for the written report.
    df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]

    if print_id_results == True:
        print(df_top_ref_specs.to_string())

    # Either write both outputs to disk or hand the table back to the app.
    if return_ID_output is False:
        df_top_ref_specs.to_csv(output_identification, sep='\t')
        df_scores.to_csv(output_similarity_scores, sep='\t')
    else:
        return df_top_ref_specs
|
|
2104
|
+
|
|
2105
|
+
|
|
2106
|
+
|
|
2107
|
+
|
|
2108
|
+
def run_spec_lib_matching_on_NRMS_data_shiny(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
|
|
2109
|
+
if query_data is None:
|
|
2110
|
+
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
|
|
2111
|
+
sys.exit()
|
|
2112
|
+
else:
|
|
2113
|
+
extension = query_data.rsplit('.',1)
|
|
2114
|
+
extension = extension[(len(extension)-1)]
|
|
2115
|
+
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
|
|
2116
|
+
output_path_tmp = query_data[:-3] + 'txt'
|
|
2117
|
+
build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
|
|
2118
|
+
df_query = pd.read_csv(output_path_tmp, sep='\t')
|
|
2119
|
+
if extension == 'txt' or extension == 'TXT':
|
|
2120
|
+
df_query = pd.read_csv(query_data, sep='\t')
|
|
2121
|
+
unique_query_ids = df_query.iloc[:,0].unique()
|
|
2122
|
+
|
|
2123
|
+
if reference_data is None:
|
|
2124
|
+
print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
|
|
2125
|
+
sys.exit()
|
|
2126
|
+
else:
|
|
2127
|
+
if isinstance(reference_data,str):
|
|
2128
|
+
df_reference = get_reference_df(reference_data,likely_reference_ids)
|
|
2129
|
+
unique_reference_ids = df_reference.iloc[:,0].unique()
|
|
2130
|
+
else:
|
|
2131
|
+
dfs = []
|
|
2132
|
+
unique_reference_ids = []
|
|
2133
|
+
for f in reference_data:
|
|
2134
|
+
tmp = get_reference_df(f,likely_reference_ids)
|
|
2135
|
+
dfs.append(tmp)
|
|
2136
|
+
unique_reference_ids.extend(tmp.iloc[:,0].unique())
|
|
2137
|
+
df_reference = pd.concat(dfs, axis=0, ignore_index=True)
|
|
2138
|
+
|
|
2139
|
+
|
|
2140
|
+
if spectrum_preprocessing_order is not None:
|
|
2141
|
+
spectrum_preprocessing_order = list(spectrum_preprocessing_order)
|
|
2142
|
+
else:
|
|
2143
|
+
spectrum_preprocessing_order = ['F','N','W','L']
|
|
2144
|
+
if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
|
|
2145
|
+
print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
|
|
2146
|
+
sys.exit()
|
|
2147
|
+
|
|
2148
|
+
if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kuldzynski','intersection','hamming','hellinger']:
|
|
2149
|
+
print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
|
|
2150
|
+
sys.exit()
|
|
2151
|
+
|
|
2152
|
+
if isinstance(int_min,int) is True:
|
|
2153
|
+
int_min = float(int_min)
|
|
2154
|
+
if isinstance(int_max,int) is True:
|
|
2155
|
+
int_max = float(int_max)
|
|
2156
|
+
if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
|
|
2157
|
+
print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
|
|
2158
|
+
sys.exit()
|
|
2159
|
+
if mz_min < 0:
|
|
2160
|
+
print('\nError: mz_min should be a non-negative integer')
|
|
2161
|
+
sys.exit()
|
|
2162
|
+
if mz_max <= 0:
|
|
2163
|
+
print('\nError: mz_max should be a positive integer')
|
|
2164
|
+
sys.exit()
|
|
2165
|
+
if int_min < 0:
|
|
2166
|
+
print('\nError: int_min should be a non-negative float')
|
|
2167
|
+
sys.exit()
|
|
2168
|
+
if int_max <= 0:
|
|
2169
|
+
print('\nError: int_max should be a positive float')
|
|
2170
|
+
sys.exit()
|
|
2171
|
+
|
|
2172
|
+
if isinstance(noise_threshold,int) is True:
|
|
2173
|
+
noise_threshold = float(noise_threshold)
|
|
2174
|
+
if isinstance(noise_threshold,float) is False or noise_threshold < 0:
|
|
2175
|
+
print('Error: noise_threshold must be a positive float.')
|
|
2176
|
+
sys.exit()
|
|
2177
|
+
|
|
2178
|
+
if isinstance(wf_intensity,int) is True:
|
|
2179
|
+
wf_intensity = float(wf_intensity)
|
|
2180
|
+
if isinstance(wf_mz,int) is True:
|
|
2181
|
+
wf_mz = float(wf_mz)
|
|
2182
|
+
if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
|
|
2183
|
+
print('Error: wf_mz and wf_intensity must be integers or floats')
|
|
2184
|
+
sys.exit()
|
|
2185
|
+
|
|
2186
|
+
if entropy_dimension <= 0:
|
|
2187
|
+
print('\nError: entropy_dimension should be a positive float')
|
|
2188
|
+
sys.exit()
|
|
2189
|
+
else:
|
|
2190
|
+
q = entropy_dimension
|
|
2191
|
+
|
|
2192
|
+
normalization_method = 'standard'
|
|
2193
|
+
|
|
2194
|
+
if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
|
|
2195
|
+
print('\nError: n_top_matches_to_save should be a positive integer')
|
|
2196
|
+
sys.exit()
|
|
2197
|
+
|
|
2198
|
+
if isinstance(print_id_results,bool)==False:
|
|
2199
|
+
print('\nError: print_id_results must be either True or False')
|
|
2200
|
+
sys.exit()
|
|
2201
|
+
|
|
2202
|
+
if output_identification is None:
|
|
2203
|
+
output_identification = f'{Path.cwd()}/output_identification.txt'
|
|
2204
|
+
print(f'Warning: writing identification output to {output_identification}')
|
|
2205
|
+
|
|
2206
|
+
if output_similarity_scores is None:
|
|
2207
|
+
output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
|
|
2208
|
+
print(f'Warning: writing similarity scores to {output_similarity_scores}')
|
|
2209
|
+
|
|
2210
|
+
|
|
2211
|
+
|
|
2212
|
+
min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
|
|
2213
|
+
max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
|
|
2214
|
+
mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
|
|
2215
|
+
|
|
2216
|
+
all_similarity_scores = []
|
|
2217
|
+
for query_idx in range(0,len(unique_query_ids)):
|
|
2218
|
+
q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
|
|
2219
|
+
q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
2220
|
+
q_spec_tmp = convert_spec(q_spec_tmp,mzs)
|
|
2221
|
+
|
|
2222
|
+
similarity_scores = []
|
|
2223
|
+
for ref_idx in range(0,len(unique_reference_ids)):
|
|
2224
|
+
if verbose is True and ref_idx % 1000 == 0:
|
|
2225
|
+
print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
|
|
2226
|
+
q_spec = q_spec_tmp
|
|
2227
|
+
r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
|
|
2228
|
+
r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
2229
|
+
r_spec = convert_spec(r_spec_tmp,mzs)
|
|
2230
|
+
|
|
2231
|
+
for transformation in spectrum_preprocessing_order:
|
|
2232
|
+
if np.isinf(q_spec[:,1]).sum() > 0:
|
|
2233
|
+
q_spec[:,1] = np.zeros(q_spec.shape[0])
|
|
2234
|
+
if np.isinf(r_spec[:,1]).sum() > 0:
|
|
2235
|
+
r_spec[:,1] = np.zeros(r_spec.shape[0])
|
|
2236
|
+
if transformation == 'W':
|
|
2237
|
+
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
2238
|
+
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
2239
|
+
if transformation == 'L':
|
|
2240
|
+
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
2241
|
+
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
2242
|
+
if transformation == 'N':
|
|
2243
|
+
q_spec = remove_noise(q_spec, nr = noise_threshold)
|
|
2244
|
+
if high_quality_reference_library == False:
|
|
2245
|
+
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
2246
|
+
if transformation == 'F':
|
|
2247
|
+
q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
2248
|
+
if high_quality_reference_library == False:
|
|
2249
|
+
r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
2250
|
+
|
|
2251
|
+
q_ints = q_spec[:,1]
|
|
2252
|
+
r_ints = r_spec[:,1]
|
|
2253
|
+
|
|
2254
|
+
if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
|
|
2255
|
+
similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
|
|
2256
|
+
else:
|
|
2257
|
+
similarity_score = 0
|
|
2258
|
+
|
|
2259
|
+
similarity_scores.append(similarity_score)
|
|
2260
|
+
all_similarity_scores.append(similarity_scores)
|
|
2261
|
+
|
|
2262
|
+
df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
|
|
2263
|
+
df_scores.index = unique_query_ids
|
|
2264
|
+
df_scores.index.names = ['QUERY.SPECTRUM.ID']
|
|
2265
|
+
|
|
2266
|
+
preds = []
|
|
2267
|
+
scores = []
|
|
2268
|
+
for i in range(0, df_scores.shape[0]):
|
|
2269
|
+
df_scores_tmp = df_scores
|
|
2270
|
+
preds_tmp = []
|
|
2271
|
+
scores_tmp = []
|
|
2272
|
+
for j in range(0, n_top_matches_to_save):
|
|
2273
|
+
top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
|
|
2274
|
+
cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
|
|
2275
|
+
df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
|
|
2276
|
+
|
|
2277
|
+
preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
|
|
2278
|
+
if len(top_ref_specs_tmp.values) == 0:
|
|
2279
|
+
scores_tmp.append(0)
|
|
2280
|
+
else:
|
|
2281
|
+
scores_tmp.append(top_ref_specs_tmp.values[0])
|
|
2282
|
+
preds.append(preds_tmp)
|
|
2283
|
+
scores.append(scores_tmp)
|
|
2284
|
+
|
|
2285
|
+
preds = np.array(preds)
|
|
2286
|
+
scores = np.array(scores)
|
|
2287
|
+
out = np.c_[preds,scores]
|
|
2288
|
+
|
|
2289
|
+
cnames_preds = []
|
|
2290
|
+
cnames_scores = []
|
|
2291
|
+
for i in range(0,n_top_matches_to_save):
|
|
2292
|
+
cnames_preds.append(f'RANK.{i+1}.PRED')
|
|
2293
|
+
cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
|
|
2294
|
+
|
|
2295
|
+
df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
|
|
2296
|
+
df_top_ref_specs.index = unique_query_ids
|
|
2297
|
+
df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
|
|
2298
|
+
|
|
2299
|
+
if print_id_results == True:
|
|
2300
|
+
print(df_top_ref_specs.to_string())
|
|
2301
|
+
|
|
2302
|
+
df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
|
|
2303
|
+
|
|
2304
|
+
if return_ID_output is False:
|
|
2305
|
+
df_top_ref_specs.to_csv(output_identification, sep='\t')
|
|
2306
|
+
df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
|
|
2307
|
+
df_scores.to_csv(output_similarity_scores, sep='\t')
|
|
2308
|
+
else:
|
|
2309
|
+
return df_top_ref_specs
|
|
2310
|
+
|
|
2311
|
+
|
|
2312
|
+
class _UIWriter:
|
|
2313
|
+
def __init__(self, loop, q: asyncio.Queue[str]):
|
|
2314
|
+
self._loop = loop
|
|
2315
|
+
self._q = q
|
|
2316
|
+
def write(self, s: str):
|
|
2317
|
+
if s:
|
|
2318
|
+
self._loop.call_soon_threadsafe(self._q.put_nowait, s)
|
|
2319
|
+
return len(s)
|
|
2320
|
+
def flush(self):
|
|
2321
|
+
pass
|
|
2322
|
+
|
|
2323
|
+
|
|
2324
|
+
def attach_logging_to_writer(writer):
|
|
2325
|
+
handler = logging.StreamHandler(writer)
|
|
2326
|
+
handler.setLevel(logging.INFO)
|
|
2327
|
+
root = logging.getLogger()
|
|
2328
|
+
root.addHandler(handler)
|
|
2329
|
+
root.setLevel(logging.INFO)
|
|
2330
|
+
return handler, root
|
|
2331
|
+
|
|
2332
|
+
|
|
2333
|
+
|
|
31
2334
|
def _run_with_redirects(fn, writer, *args, **kwargs):
|
|
32
2335
|
with redirect_stdout(writer), redirect_stderr(writer):
|
|
33
2336
|
return fn(*args, **kwargs)
|
|
@@ -64,19 +2367,21 @@ def strip_weights(s):
|
|
|
64
2367
|
def build_library(input_path=None, output_path=None):
|
|
65
2368
|
last_three_chars = input_path[(len(input_path)-3):len(input_path)]
|
|
66
2369
|
last_four_chars = input_path[(len(input_path)-4):len(input_path)]
|
|
67
|
-
if last_three_chars == '
|
|
68
|
-
return pd.read_csv(input_path)
|
|
2370
|
+
if last_three_chars == 'txt' or last_three_chars == 'TXT':
|
|
2371
|
+
return pd.read_csv(input_path, sep='\t')
|
|
69
2372
|
else:
|
|
70
2373
|
if last_three_chars == 'mgf' or last_three_chars == 'MGF':
|
|
71
2374
|
input_file_type = 'mgf'
|
|
72
2375
|
elif last_four_chars == 'mzML' or last_four_chars == 'mzml' or last_four_chars == 'MZML':
|
|
73
2376
|
input_file_type = 'mzML'
|
|
2377
|
+
elif last_four_chars == 'json' or last_four_chars == 'JSON':
|
|
2378
|
+
input_file_type = 'json'
|
|
74
2379
|
elif last_three_chars == 'cdf' or last_three_chars == 'CDF':
|
|
75
2380
|
input_file_type = 'cdf'
|
|
76
2381
|
elif last_three_chars == 'msp' or last_three_chars == 'MSP':
|
|
77
2382
|
input_file_type = 'msp'
|
|
78
2383
|
else:
|
|
79
|
-
print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', or \'
|
|
2384
|
+
print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'msp\', \'json\', or \'txt\' file must be passed to --input_path')
|
|
80
2385
|
sys.exit()
|
|
81
2386
|
|
|
82
2387
|
spectra = []
|
|
@@ -146,6 +2451,23 @@ def build_library(input_path=None, output_path=None):
|
|
|
146
2451
|
except ValueError:
|
|
147
2452
|
continue
|
|
148
2453
|
|
|
2454
|
+
if input_file_type == 'json':
|
|
2455
|
+
data = json.load(open(input_path))
|
|
2456
|
+
ids = []
|
|
2457
|
+
mzs = []
|
|
2458
|
+
ints = []
|
|
2459
|
+
for i in range(0,len(data)):
|
|
2460
|
+
spec_ID_tmp = data[i]['spectrum_id']
|
|
2461
|
+
tmp = data[i]['peaks_json']
|
|
2462
|
+
tmp = tmp[1:-1].split(",")
|
|
2463
|
+
tmp = [a.replace("[","") for a in tmp]
|
|
2464
|
+
tmp = [a.replace("]","") for a in tmp]
|
|
2465
|
+
mzs_tmp = tmp[0::2]
|
|
2466
|
+
ints_tmp = tmp[1::2]
|
|
2467
|
+
ids.extend([spec_ID_tmp] * len(mzs_tmp))
|
|
2468
|
+
mzs.extend(mzs_tmp)
|
|
2469
|
+
ints.extend(ints_tmp)
|
|
2470
|
+
|
|
149
2471
|
df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
|
|
150
2472
|
return df
|
|
151
2473
|
|
|
@@ -154,9 +2476,12 @@ def build_library(input_path=None, output_path=None):
|
|
|
154
2476
|
def extract_first_column_ids(file_path: str, max_ids: int = 20000):
|
|
155
2477
|
suffix = Path(file_path).suffix.lower()
|
|
156
2478
|
|
|
157
|
-
if suffix == ".
|
|
158
|
-
df = pd.read_csv(file_path,
|
|
159
|
-
|
|
2479
|
+
if suffix == ".txt":
|
|
2480
|
+
df = pd.read_csv(file_path, sep='\t')
|
|
2481
|
+
if 'id' in df.columns.tolist():
|
|
2482
|
+
ids = df['id'].astype(str).dropna()
|
|
2483
|
+
else:
|
|
2484
|
+
ids = df.iloc[:, 0].astype(str).dropna()
|
|
160
2485
|
ids = [x for x in ids if x.strip() != ""]
|
|
161
2486
|
seen = set()
|
|
162
2487
|
uniq = []
|
|
@@ -191,17 +2516,17 @@ def extract_first_column_ids(file_path: str, max_ids: int = 20000):
|
|
|
191
2516
|
return []
|
|
192
2517
|
|
|
193
2518
|
|
|
194
|
-
def _open_plot_window(session,
|
|
195
|
-
"""Send
|
|
196
|
-
b64 = base64.b64encode(
|
|
197
|
-
data_url = f"data:image/
|
|
198
|
-
session.send_custom_message("open-plot-window", {"
|
|
2519
|
+
def _open_plot_window(session, svg_bytes: bytes, title: str = "plot.svg"):
|
|
2520
|
+
"""Send SVG bytes to browser and open in a new window as a data URL."""
|
|
2521
|
+
b64 = base64.b64encode(svg_bytes).decode("ascii")
|
|
2522
|
+
data_url = f"data:image/svg;base64,{b64}"
|
|
2523
|
+
session.send_custom_message("open-plot-window", {"svg": data_url, "title": title})
|
|
199
2524
|
|
|
200
2525
|
|
|
201
2526
|
def plot_spectra_ui(platform: str):
|
|
202
2527
|
base_inputs = [
|
|
203
|
-
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or
|
|
204
|
-
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or
|
|
2528
|
+
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
2529
|
+
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
205
2530
|
ui.input_selectize(
|
|
206
2531
|
"spectrum_ID1",
|
|
207
2532
|
"Select spectrum ID 1 (default is the first spectrum in the library):",
|
|
@@ -216,6 +2541,8 @@ def plot_spectra_ui(platform: str):
|
|
|
216
2541
|
multiple=False,
|
|
217
2542
|
options={"placeholder": "Upload a library..."},
|
|
218
2543
|
),
|
|
2544
|
+
ui.input_select('print_url_spectrum1', 'Print PubChem URL for spectrum 1:', ['No', 'Yes']),
|
|
2545
|
+
ui.input_select('print_url_spectrum2', 'Print PubChem URL for spectrum 2:', ['No', 'Yes']),
|
|
219
2546
|
ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
|
|
220
2547
|
ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
|
|
221
2548
|
ui.input_select(
|
|
@@ -227,21 +2554,13 @@ def plot_spectra_ui(platform: str):
|
|
|
227
2554
|
|
|
228
2555
|
if platform == "HRMS":
|
|
229
2556
|
extra_inputs = [
|
|
230
|
-
ui.input_text(
|
|
231
|
-
"spectrum_preprocessing_order",
|
|
232
|
-
"Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.",
|
|
233
|
-
"FCNMWL",
|
|
234
|
-
),
|
|
2557
|
+
ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL",),
|
|
235
2558
|
ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
|
|
236
2559
|
ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
|
|
237
2560
|
]
|
|
238
2561
|
else:
|
|
239
2562
|
extra_inputs = [
|
|
240
|
-
ui.input_text(
|
|
241
|
-
"spectrum_preprocessing_order",
|
|
242
|
-
"Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
|
|
243
|
-
"FNLW",
|
|
244
|
-
)
|
|
2563
|
+
ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW",)
|
|
245
2564
|
]
|
|
246
2565
|
|
|
247
2566
|
numeric_inputs = [
|
|
@@ -256,11 +2575,7 @@ def plot_spectra_ui(platform: str):
|
|
|
256
2575
|
ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
|
|
257
2576
|
]
|
|
258
2577
|
|
|
259
|
-
select_input = ui.input_select(
|
|
260
|
-
"y_axis_transformation",
|
|
261
|
-
"Transformation to apply to intensity axis:",
|
|
262
|
-
["normalized", "none", "log10", "sqrt"],
|
|
263
|
-
)
|
|
2578
|
+
select_input = ui.input_select("y_axis_transformation", "Transformation to apply to intensity axis:", ["normalized", "none", "log10", "sqrt"])
|
|
264
2579
|
|
|
265
2580
|
run_button_plot_spectra = ui.download_button("run_btn_plot_spectra", "Run", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
|
|
266
2581
|
back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
|
|
@@ -268,15 +2583,15 @@ def plot_spectra_ui(platform: str):
|
|
|
268
2583
|
if platform == "HRMS":
|
|
269
2584
|
inputs_columns = ui.layout_columns(
|
|
270
2585
|
ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
|
|
271
|
-
ui.div([base_inputs[6:
|
|
272
|
-
ui.div(numeric_inputs[0:
|
|
273
|
-
ui.div([numeric_inputs[
|
|
2586
|
+
ui.div([base_inputs[6:9], extra_inputs[0]], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2587
|
+
ui.div(extra_inputs[1:3], numeric_inputs[0:3], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2588
|
+
ui.div([numeric_inputs[3:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
|
|
274
2589
|
col_widths=(3,3,3,3),
|
|
275
2590
|
)
|
|
276
2591
|
elif platform == "NRMS":
|
|
277
2592
|
inputs_columns = ui.layout_columns(
|
|
278
2593
|
ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
|
|
279
|
-
ui.div([base_inputs[6:
|
|
2594
|
+
ui.div([base_inputs[6:9], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
|
|
280
2595
|
ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
|
|
281
2596
|
ui.div([numeric_inputs[5:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
|
|
282
2597
|
col_widths=(3,3,3,3),
|
|
@@ -297,49 +2612,29 @@ def plot_spectra_ui(platform: str):
|
|
|
297
2612
|
|
|
298
2613
|
def run_spec_lib_matching_ui(platform: str):
|
|
299
2614
|
base_inputs = [
|
|
300
|
-
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or
|
|
301
|
-
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or
|
|
2615
|
+
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
2616
|
+
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
302
2617
|
ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
|
|
303
2618
|
ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
|
|
304
|
-
ui.
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
),
|
|
311
|
-
ui.input_selectize(
|
|
312
|
-
"spectrum_ID2",
|
|
313
|
-
"Select spectrum ID 2 (only applicable for plotting; default is the first spectrum in the reference library):",
|
|
314
|
-
choices=[],
|
|
315
|
-
multiple=False,
|
|
316
|
-
options={"placeholder": "Upload a library..."},
|
|
317
|
-
),
|
|
318
|
-
ui.input_select(
|
|
319
|
-
"high_quality_reference_library",
|
|
320
|
-
"Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.",
|
|
321
|
-
[False, True],
|
|
322
|
-
)
|
|
2619
|
+
ui.input_file('compound_ID_output_file', 'Upload output from spectral library matching to plot top matches (optional)'),
|
|
2620
|
+
ui.input_selectize("q_spec", "Select query spectrum (only applicable for plotting; default is the first spectrum in the compound ID output):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
|
|
2621
|
+
ui.input_selectize("r_spec", "Select reference spectrum (only applicable for plotting; default is the rank 1 reference spectrum):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
|
|
2622
|
+
ui.input_select('print_url_spectrum1', 'Print PubChem URL for query spectrum (only applicable for plotting):', ['No', 'Yes']),
|
|
2623
|
+
ui.input_select('print_url_spectrum2', 'Print PubChem URL for reference spectrum (only applicable for plotting):', ['No', 'Yes']),
|
|
2624
|
+
ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])
|
|
323
2625
|
]
|
|
324
2626
|
|
|
325
2627
|
if platform == "HRMS":
|
|
326
2628
|
extra_inputs = [
|
|
327
|
-
ui.
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
),
|
|
2629
|
+
ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
|
|
2630
|
+
ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
|
|
2631
|
+
ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
|
|
2632
|
+
ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.","FCNMWL"),
|
|
332
2633
|
ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
|
|
333
2634
|
ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
|
|
334
2635
|
]
|
|
335
2636
|
else:
|
|
336
|
-
extra_inputs = [
|
|
337
|
-
ui.input_text(
|
|
338
|
-
"spectrum_preprocessing_order",
|
|
339
|
-
"Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
|
|
340
|
-
"FNLW",
|
|
341
|
-
)
|
|
342
|
-
]
|
|
2637
|
+
extra_inputs = [ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).","FNLW")]
|
|
343
2638
|
|
|
344
2639
|
numeric_inputs = [
|
|
345
2640
|
ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
|
|
@@ -361,16 +2656,16 @@ def run_spec_lib_matching_ui(platform: str):
|
|
|
361
2656
|
|
|
362
2657
|
if platform == "HRMS":
|
|
363
2658
|
inputs_columns = ui.layout_columns(
|
|
364
|
-
ui.div(base_inputs[0:
|
|
365
|
-
ui.div([base_inputs[
|
|
366
|
-
ui.div(numeric_inputs[0:
|
|
367
|
-
ui.div(numeric_inputs[
|
|
2659
|
+
ui.div([base_inputs[0:2], extra_inputs[0:3], base_inputs[2:4]], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2660
|
+
ui.div([base_inputs[4:10]], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2661
|
+
ui.div([extra_inputs[3:6], numeric_inputs[0:3]], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2662
|
+
ui.div(numeric_inputs[3:10], style="display:flex; flex-direction:column; gap:10px;"),
|
|
368
2663
|
col_widths=(3,3,3,3)
|
|
369
2664
|
)
|
|
370
2665
|
elif platform == "NRMS":
|
|
371
2666
|
inputs_columns = ui.layout_columns(
|
|
372
2667
|
ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
|
|
373
|
-
ui.div([base_inputs[6:
|
|
2668
|
+
ui.div([base_inputs[6:10], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
|
|
374
2669
|
ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
|
|
375
2670
|
ui.div(numeric_inputs[5:10], style="display:flex; flex-direction:column; gap:10px;"),
|
|
376
2671
|
col_widths=(3,3,3,3)
|
|
@@ -397,8 +2692,8 @@ def run_spec_lib_matching_ui(platform: str):
|
|
|
397
2692
|
|
|
398
2693
|
def run_parameter_tuning_grid_ui(platform: str):
|
|
399
2694
|
base_inputs = [
|
|
400
|
-
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or
|
|
401
|
-
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or
|
|
2695
|
+
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
2696
|
+
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
402
2697
|
ui.input_selectize("similarity_measure", "Select similarity measure(s):", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"], multiple=True, selected='cosine'),
|
|
403
2698
|
ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '((0.25, 0.25, 0.25, 0.25))'),
|
|
404
2699
|
ui.input_text("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", '[True]')
|
|
@@ -406,11 +2701,10 @@ def run_parameter_tuning_grid_ui(platform: str):
|
|
|
406
2701
|
|
|
407
2702
|
if platform == "HRMS":
|
|
408
2703
|
extra_inputs = [
|
|
409
|
-
ui.
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
),
|
|
2704
|
+
ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
|
|
2705
|
+
ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
|
|
2706
|
+
ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
|
|
2707
|
+
ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "[FCNMWL,CWM]"),
|
|
414
2708
|
ui.input_text("window_size_centroiding", "Centroiding window-size:", "[0.5]"),
|
|
415
2709
|
ui.input_text("window_size_matching", "Matching window-size:", "[0.1,0.5]"),
|
|
416
2710
|
]
|
|
@@ -464,7 +2758,7 @@ def run_parameter_tuning_grid_ui(platform: str):
|
|
|
464
2758
|
|
|
465
2759
|
return ui.div(
|
|
466
2760
|
ui.TagList(
|
|
467
|
-
ui.h2("Tune parameters"),
|
|
2761
|
+
ui.h2("Tune parameters (grid search)"),
|
|
468
2762
|
inputs_columns,
|
|
469
2763
|
run_button_parameter_tuning_grid,
|
|
470
2764
|
back_button,
|
|
@@ -492,83 +2786,71 @@ PARAMS_NRMS = {
|
|
|
492
2786
|
"entropy_dimension": (1.0, 3.0)
|
|
493
2787
|
}
|
|
494
2788
|
|
|
2789
|
+
|
|
495
2790
|
def run_parameter_tuning_DE_ui(platform: str):
|
|
496
|
-
|
|
497
|
-
|
|
2791
|
+
# Pick param set per platform
|
|
2792
|
+
if platform == "HRMS":
|
|
2793
|
+
PARAMS = PARAMS_HRMS
|
|
498
2794
|
else:
|
|
499
|
-
PARAMS=PARAMS_NRMS
|
|
2795
|
+
PARAMS = PARAMS_NRMS
|
|
500
2796
|
|
|
501
2797
|
base_inputs = [
|
|
502
|
-
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or
|
|
503
|
-
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or
|
|
2798
|
+
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
2799
|
+
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
504
2800
|
ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
|
|
505
|
-
ui.input_text(
|
|
506
|
-
ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True]
|
|
507
|
-
]
|
|
2801
|
+
ui.input_text("weights", "Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):", "0.25, 0.25, 0.25, 0.25"),
|
|
2802
|
+
ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])]
|
|
508
2803
|
|
|
509
2804
|
if platform == "HRMS":
|
|
510
2805
|
extra_inputs = [
|
|
511
|
-
ui.
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
),
|
|
2806
|
+
ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
|
|
2807
|
+
ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
|
|
2808
|
+
ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
|
|
2809
|
+
ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL"),
|
|
516
2810
|
ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
|
|
517
2811
|
ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
|
|
518
2812
|
]
|
|
519
2813
|
else:
|
|
520
|
-
extra_inputs = [
|
|
521
|
-
ui.input_text(
|
|
522
|
-
"spectrum_preprocessing_order",
|
|
523
|
-
"Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
|
|
524
|
-
"FNLW",
|
|
525
|
-
)
|
|
526
|
-
]
|
|
2814
|
+
extra_inputs = [ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW")]
|
|
527
2815
|
|
|
528
2816
|
numeric_inputs = [
|
|
529
2817
|
ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
|
|
530
|
-
ui.input_numeric("mz_max", "Maximum m/z for filtering:",
|
|
2818
|
+
ui.input_numeric("mz_max", "Maximum m/z for filtering:", 99_999_999),
|
|
531
2819
|
ui.input_numeric("int_min", "Minimum intensity for filtering:", 0),
|
|
532
|
-
ui.input_numeric("int_max", "Maximum intensity for filtering:",
|
|
2820
|
+
ui.input_numeric("int_max", "Maximum intensity for filtering:", 999_999_999),
|
|
533
2821
|
ui.input_numeric("noise_threshold", "Noise removal threshold:", 0.0),
|
|
534
2822
|
ui.input_numeric("wf_mz", "Mass/charge weight factor:", 0.0),
|
|
535
2823
|
ui.input_numeric("wf_int", "Intensity weight factor:", 1.0),
|
|
536
2824
|
ui.input_numeric("LET_threshold", "Low-entropy threshold:", 0.0),
|
|
537
2825
|
ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
|
|
2826
|
+
ui.input_numeric("max_iterations", "Maximum number of iterations:", 5),
|
|
538
2827
|
]
|
|
539
2828
|
|
|
540
|
-
|
|
541
|
-
#run_button_parameter_tuning_DE = ui.download_button("run_btn_parameter_tuning_DE", "Tune parameters (differential evolution optimization)", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
|
|
542
2829
|
run_button_parameter_tuning_DE = ui.input_action_button("run_btn_parameter_tuning_DE", "Tune parameters (differential evolution optimization)", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
|
|
543
2830
|
back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
|
|
544
2831
|
|
|
545
2832
|
if platform == "HRMS":
|
|
546
2833
|
inputs_columns = ui.layout_columns(
|
|
547
|
-
ui.div(base_inputs
|
|
548
|
-
ui.div(
|
|
549
|
-
ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
|
|
550
|
-
ui.div(
|
|
551
|
-
col_widths=(3,3,3,3),
|
|
2834
|
+
ui.div(*base_inputs, style="display:flex; flex-direction:column; gap:10px;"),
|
|
2835
|
+
ui.div(*extra_inputs, style="display:flex; flex-direction:column; gap:10px;"),
|
|
2836
|
+
ui.div(*numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2837
|
+
ui.div(*numeric_inputs[5:11], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2838
|
+
col_widths=(3, 3, 3, 3),
|
|
552
2839
|
)
|
|
553
|
-
|
|
2840
|
+
else:
|
|
554
2841
|
inputs_columns = ui.layout_columns(
|
|
555
|
-
ui.div(base_inputs
|
|
556
|
-
ui.div(
|
|
557
|
-
ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
|
|
558
|
-
ui.div(
|
|
559
|
-
col_widths=(3,3,3,3),
|
|
2842
|
+
ui.div(*base_inputs, style="display:flex; flex-direction:column; gap:10px;"),
|
|
2843
|
+
ui.div(*extra_inputs, style="display:flex; flex-direction:column; gap:10px;"),
|
|
2844
|
+
ui.div(*numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2845
|
+
ui.div(*numeric_inputs[5:11], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2846
|
+
col_widths=(3, 3, 3, 3),
|
|
560
2847
|
)
|
|
561
2848
|
|
|
562
2849
|
return ui.page_fillable(
|
|
563
2850
|
ui.layout_sidebar(
|
|
564
2851
|
ui.sidebar(
|
|
565
|
-
ui.h3("Select parameters"),
|
|
566
|
-
ui.input_checkbox_group(
|
|
567
|
-
"params",
|
|
568
|
-
None,
|
|
569
|
-
choices=list(PARAMS.keys()),
|
|
570
|
-
selected=["noise_threshold","LET_threshold"],
|
|
571
|
-
),
|
|
2852
|
+
ui.h3("Select continuous parameters to optimize"),
|
|
2853
|
+
ui.input_checkbox_group("params", None, choices=list(PARAMS.keys()), selected=["noise_threshold", "LET_threshold"]),
|
|
572
2854
|
ui.hr(),
|
|
573
2855
|
ui.h4("Bounds for selected parameters"),
|
|
574
2856
|
ui.output_ui("bounds_inputs"),
|
|
@@ -576,23 +2858,30 @@ def run_parameter_tuning_DE_ui(platform: str):
|
|
|
576
2858
|
),
|
|
577
2859
|
ui.div(
|
|
578
2860
|
ui.h2("Tune parameters (differential evolution optimization)"),
|
|
579
|
-
|
|
580
|
-
run_button_parameter_tuning_DE,
|
|
581
|
-
|
|
2861
|
+
inputs_columns,
|
|
2862
|
+
ui.div(run_button_parameter_tuning_DE, back_button, style=("display:flex; flex-direction:row; gap:12px; align-items:center; flex-wrap:wrap;")),
|
|
2863
|
+
ui.br(),
|
|
2864
|
+
ui.card(
|
|
2865
|
+
ui.card_header("Live log"),
|
|
2866
|
+
ui.output_text_verbatim("run_log"),
|
|
2867
|
+
),
|
|
2868
|
+
style="display:flex; flex-direction:column; gap:16px;",
|
|
582
2869
|
),
|
|
583
2870
|
)
|
|
584
2871
|
)
|
|
585
2872
|
|
|
586
2873
|
|
|
587
2874
|
|
|
588
|
-
|
|
589
2875
|
app_ui = ui.page_fluid(
|
|
590
2876
|
ui.head_content(ui.tags.link(rel="icon", href="emblem.png")),
|
|
2877
|
+
ui.div(ui.output_image("image"), style=("display:block; margin:20px auto; max-width:320px; height:auto; text-align:center")),
|
|
591
2878
|
ui.output_ui("main_ui"),
|
|
592
|
-
ui.output_text("status_output")
|
|
2879
|
+
ui.output_text("status_output"),
|
|
593
2880
|
)
|
|
594
2881
|
|
|
595
2882
|
|
|
2883
|
+
|
|
2884
|
+
|
|
596
2885
|
def server(input, output, session):
|
|
597
2886
|
|
|
598
2887
|
current_page = reactive.Value("main_menu")
|
|
@@ -611,7 +2900,7 @@ def server(input, output, session):
|
|
|
611
2900
|
match_log_rv = reactive.Value("")
|
|
612
2901
|
is_matching_rv = reactive.Value(False)
|
|
613
2902
|
is_any_job_running = reactive.Value(False)
|
|
614
|
-
|
|
2903
|
+
latest_txt_path_rv = reactive.Value("")
|
|
615
2904
|
latest_df_rv = reactive.Value(None)
|
|
616
2905
|
is_running_rv = reactive.Value(False)
|
|
617
2906
|
|
|
@@ -627,6 +2916,106 @@ def server(input, output, session):
|
|
|
627
2916
|
converted_query_path_rv = reactive.Value(None)
|
|
628
2917
|
converted_reference_path_rv = reactive.Value(None)
|
|
629
2918
|
|
|
2919
|
+
df_rv = reactive.Value(None)
|
|
2920
|
+
|
|
2921
|
+
|
|
2922
|
+
def _discover_rank_cols(df: pd.DataFrame):
|
|
2923
|
+
pred_pat = re.compile(r"^RANK\.(\d+)\.PRED$")
|
|
2924
|
+
score_pat = re.compile(r"^RANK\.(\d+)\.SIMILARITY\.SCORE$")
|
|
2925
|
+
pred_map, score_map = {}, {}
|
|
2926
|
+
for c in df.columns:
|
|
2927
|
+
m = pred_pat.match(c)
|
|
2928
|
+
if m: pred_map[int(m.group(1))] = c
|
|
2929
|
+
m = score_pat.match(c)
|
|
2930
|
+
if m: score_map[int(m.group(1))] = c
|
|
2931
|
+
return [(k, pred_map[k], score_map.get(k)) for k in sorted(pred_map)]
|
|
2932
|
+
|
|
2933
|
+
|
|
2934
|
+
def _rank_choices_for_query(df: pd.DataFrame, qid: str):
|
|
2935
|
+
sub = df.loc[df["QUERY.SPECTRUM.ID"].astype(str) == str(qid)]
|
|
2936
|
+
if sub.empty:
|
|
2937
|
+
return {}, None
|
|
2938
|
+
row = sub.iloc[0]
|
|
2939
|
+
rank_cols = _discover_rank_cols(df)
|
|
2940
|
+
if not rank_cols:
|
|
2941
|
+
return {}, None
|
|
2942
|
+
|
|
2943
|
+
choices = {}
|
|
2944
|
+
default_value = None
|
|
2945
|
+
for (k, pred_col, score_col) in rank_cols:
|
|
2946
|
+
pred = row.get(pred_col, None)
|
|
2947
|
+
if pd.isna(pred):
|
|
2948
|
+
continue
|
|
2949
|
+
pred = str(pred)
|
|
2950
|
+
score = row.get(score_col, None) if score_col else None
|
|
2951
|
+
score_str = f"{float(score):.6f}" if (score is not None and pd.notna(score)) else "NA"
|
|
2952
|
+
label = f"Rank {k} — {score_str} — {pred}"
|
|
2953
|
+
choices[label] = pred # values are plain names
|
|
2954
|
+
if k == 1:
|
|
2955
|
+
default_value = pred # default = Rank 1 name
|
|
2956
|
+
|
|
2957
|
+
if default_value is None and choices:
|
|
2958
|
+
default_value = next(iter(choices.values()))
|
|
2959
|
+
return choices, default_value
|
|
2960
|
+
|
|
2961
|
+
|
|
2962
|
+
@reactive.effect
|
|
2963
|
+
@reactive.event(input.compound_ID_output_file)
|
|
2964
|
+
async def _populate_ids_from_compound_ID_output_upload():
|
|
2965
|
+
files = input.compound_ID_output_file()
|
|
2966
|
+
if not files:
|
|
2967
|
+
return
|
|
2968
|
+
|
|
2969
|
+
in_path = Path(files[0]["datapath"])
|
|
2970
|
+
try:
|
|
2971
|
+
query_status_rv.set(f"Reading table from: {in_path.name} …")
|
|
2972
|
+
await reactive.flush()
|
|
2973
|
+
|
|
2974
|
+
df = await asyncio.to_thread(pd.read_csv, in_path, sep="\t", header=0)
|
|
2975
|
+
|
|
2976
|
+
if "QUERY.SPECTRUM.ID" not in df.columns:
|
|
2977
|
+
raise ValueError("Missing required column: QUERY.SPECTRUM.ID")
|
|
2978
|
+
if not _discover_rank_cols(df):
|
|
2979
|
+
raise ValueError("No columns matching RANK.<k>.PRED found.")
|
|
2980
|
+
|
|
2981
|
+
df_rv.set(df)
|
|
2982
|
+
|
|
2983
|
+
ids = df["QUERY.SPECTRUM.ID"].astype(str).tolist()
|
|
2984
|
+
unique_ids_in_order = list(dict.fromkeys(ids))
|
|
2985
|
+
|
|
2986
|
+
choices_dict, default_rank_value = _rank_choices_for_query(df, ids[0])
|
|
2987
|
+
choices_values = [str(v).strip() for v in choices_dict.values()]
|
|
2988
|
+
default_rank_value = str(default_rank_value).strip() if default_rank_value is not None else None
|
|
2989
|
+
|
|
2990
|
+
ui.update_selectize("q_spec", choices=unique_ids_in_order, selected=ids[0])
|
|
2991
|
+
await reactive.flush()
|
|
2992
|
+
|
|
2993
|
+
ui.update_selectize("r_spec", choices=choices_values, selected=choices_values[0])
|
|
2994
|
+
await reactive.flush()
|
|
2995
|
+
|
|
2996
|
+
except Exception as e:
|
|
2997
|
+
query_status_rv.set(f"❌ Failed: {e}")
|
|
2998
|
+
await reactive.flush()
|
|
2999
|
+
raise
|
|
3000
|
+
|
|
3001
|
+
|
|
3002
|
+
@reactive.effect
|
|
3003
|
+
@reactive.event(input.q_spec)
|
|
3004
|
+
async def _update_rank_choices_on_compound_ID_change():
|
|
3005
|
+
df = df_rv.get()
|
|
3006
|
+
if df is None:
|
|
3007
|
+
return
|
|
3008
|
+
qid = input.q_spec()
|
|
3009
|
+
if not qid:
|
|
3010
|
+
return
|
|
3011
|
+
|
|
3012
|
+
choices, default_rank_value = _rank_choices_for_query(df, qid)
|
|
3013
|
+
choices = list(choices.values())
|
|
3014
|
+
ui.update_selectize('r_spec', choices=choices, selected=default_rank_value)
|
|
3015
|
+
await reactive.flush()
|
|
3016
|
+
|
|
3017
|
+
|
|
3018
|
+
|
|
630
3019
|
@output
|
|
631
3020
|
@render.ui
|
|
632
3021
|
def bounds_inputs():
|
|
@@ -769,6 +3158,11 @@ def server(input, output, session):
|
|
|
769
3158
|
def flush(self):
|
|
770
3159
|
pass
|
|
771
3160
|
|
|
3161
|
+
def _run_with_redirects(func, writer: ReactiveWriter, **kwargs):
|
|
3162
|
+
with contextlib.redirect_stdout(writer), contextlib.redirect_stderr(writer):
|
|
3163
|
+
return func(**kwargs)
|
|
3164
|
+
|
|
3165
|
+
|
|
772
3166
|
|
|
773
3167
|
@reactive.effect
|
|
774
3168
|
async def _pump_logs():
|
|
@@ -865,7 +3259,7 @@ def server(input, output, session):
|
|
|
865
3259
|
@render.image
|
|
866
3260
|
def image():
|
|
867
3261
|
dir = Path(__file__).resolve().parent
|
|
868
|
-
img: ImgData = {"src": str(dir / "www/emblem.png"), "width": "
|
|
3262
|
+
img: ImgData = {"src": str(dir / "www/emblem.png"), "width": "250px", "height": "250px"}
|
|
869
3263
|
return img
|
|
870
3264
|
|
|
871
3265
|
@output
|
|
@@ -874,30 +3268,10 @@ def server(input, output, session):
|
|
|
874
3268
|
if current_page() == "main_menu":
|
|
875
3269
|
return ui.page_fluid(
|
|
876
3270
|
ui.h2("Main Menu"),
|
|
877
|
-
ui.div(
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
"position:fixed; top:0; left:50%; transform:translateX(-50%); "
|
|
882
|
-
"z-index:1000; text-align:center; padding:10px; background-color:white;"
|
|
883
|
-
),
|
|
884
|
-
),
|
|
885
|
-
ui.div(
|
|
886
|
-
"Overview:",
|
|
887
|
-
style="text-align:left; font-size:24px; font-weight:bold; margin-top:350px"
|
|
888
|
-
),
|
|
889
|
-
ui.div(
|
|
890
|
-
"PyCompound is a Python-based tool designed for performing spectral library matching on either high-resolution mass spectrometry data (HRMS) or low-resolution mass spectrometry data (NRMS). PyCompound offers a range of spectrum preprocessing transformations and similarity measures. These spectrum preprocessing transformations include filtering on mass/charge and/or intensity values, weight factor transformation, low-entropy transformation, centroiding, noise removal, and matching. The available similarity measures include the canonical Cosine similarity measure, three entropy-based similarity measures, and a variety of binary similarity measures: Jaccard, Dice, 3W-Jaccard, Sokal-Sneath, Binary Cosine, Mountford, McConnaughey, Driver-Kroeber, Simpson, Braun-Banquet, Fager-McGowan, Kulczynski, Intersection, Hamming, and Hellinger.",
|
|
891
|
-
style="margin-top:10px; text-align:left; font-size:16px; font-weight:500"
|
|
892
|
-
),
|
|
893
|
-
ui.div(
|
|
894
|
-
"Select options:",
|
|
895
|
-
style="margin-top:30px; text-align:left; font-size:24px; font-weight:bold"
|
|
896
|
-
),
|
|
897
|
-
ui.div(
|
|
898
|
-
ui.input_radio_buttons("chromatography_platform", "Specify chromatography platform:", ["HRMS","NRMS"]),
|
|
899
|
-
style="font-size:18px; margin-top:10px; max-width:none"
|
|
900
|
-
),
|
|
3271
|
+
ui.div("Overview:", style="text-align:left; font-size:24px; font-weight:bold"),
|
|
3272
|
+
ui.div("PyCompound is a Python-based tool designed for performing spectral library matching on either high-resolution mass spectrometry data (HRMS) or low-resolution mass spectrometry data (NRMS). PyCompound offers a range of spectrum preprocessing transformations and similarity measures. These spectrum preprocessing transformations include filtering on mass/charge and/or intensity values, weight factor transformation, low-entropy transformation, centroiding, noise removal, and matching. The available similarity measures include the canonical Cosine similarity measure, three entropy-based similarity measures, and a variety of binary similarity measures: Jaccard, Dice, 3W-Jaccard, Sokal-Sneath, Binary Cosine, Mountford, McConnaughey, Driver-Kroeber, Simpson, Braun-Banquet, Fager-McGowan, Kulczynski, Intersection, Hamming, and Hellinger.", style="margin-top:10px; text-align:left; font-size:16px; font-weight:500"),
|
|
3273
|
+
ui.div("Select options:", style="margin-top:30px; text-align:left; font-size:24px; font-weight:bold"),
|
|
3274
|
+
ui.div(ui.input_radio_buttons("chromatography_platform", "Specify chromatography platform:", ["HRMS","NRMS"]), style="font-size:18px; margin-top:10px; max-width:none"),
|
|
901
3275
|
ui.input_action_button("plot_spectra", "Plot two spectra before and after preprocessing transformations.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
|
|
902
3276
|
ui.input_action_button("run_spec_lib_matching", "Run spectral library matching to perform compound identification on a query library of spectra.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
|
|
903
3277
|
ui.input_action_button("run_parameter_tuning_grid", "Grid search: Tune parameters to maximize accuracy of compound identification given a query library with known spectrum IDs.", style="font-size:18px; padding:20px 40px; width:450px; height:120px; margin-top:10px; margin-right:50px"),
|
|
@@ -970,36 +3344,36 @@ def server(input, output, session):
|
|
|
970
3344
|
suffix = in_path.suffix.lower()
|
|
971
3345
|
|
|
972
3346
|
try:
|
|
973
|
-
if suffix == ".
|
|
974
|
-
|
|
975
|
-
converted_query_path_rv.set(str(
|
|
3347
|
+
if suffix == ".txt":
|
|
3348
|
+
txt_path = in_path
|
|
3349
|
+
converted_query_path_rv.set(str(txt_path))
|
|
976
3350
|
else:
|
|
977
|
-
query_status_rv.set(f"Converting {in_path.name} →
|
|
3351
|
+
query_status_rv.set(f"Converting {in_path.name} → TXT…")
|
|
978
3352
|
await reactive.flush()
|
|
979
3353
|
|
|
980
|
-
|
|
3354
|
+
tmp_txt_path = in_path.with_suffix(".converted.txt")
|
|
981
3355
|
|
|
982
|
-
out_obj = await asyncio.to_thread(build_library, str(in_path), str(
|
|
3356
|
+
out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
|
|
983
3357
|
|
|
984
3358
|
if isinstance(out_obj, (str, os.PathLike, Path)):
|
|
985
|
-
|
|
3359
|
+
txt_path = Path(out_obj)
|
|
986
3360
|
elif isinstance(out_obj, pd.DataFrame):
|
|
987
|
-
out_obj.to_csv(
|
|
988
|
-
|
|
3361
|
+
out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
|
|
3362
|
+
txt_path = tmp_txt_path
|
|
989
3363
|
else:
|
|
990
3364
|
raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
|
|
991
3365
|
|
|
992
|
-
converted_query_path_rv.set(str(
|
|
3366
|
+
converted_query_path_rv.set(str(txt_path))
|
|
993
3367
|
|
|
994
|
-
query_status_rv.set(f"Reading IDs from: {
|
|
3368
|
+
query_status_rv.set(f"Reading IDs from: {txt_path.name} …")
|
|
995
3369
|
await reactive.flush()
|
|
996
3370
|
|
|
997
|
-
ids = await asyncio.to_thread(extract_first_column_ids, str(
|
|
3371
|
+
ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
|
|
998
3372
|
query_ids_rv.set(ids)
|
|
999
3373
|
|
|
1000
3374
|
ui.update_selectize("spectrum_ID1", choices=ids, selected=(ids[0] if ids else None))
|
|
1001
3375
|
|
|
1002
|
-
query_status_rv.set(f"✅ Loaded {len(ids)} IDs from {
|
|
3376
|
+
query_status_rv.set(f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}")
|
|
1003
3377
|
await reactive.flush()
|
|
1004
3378
|
|
|
1005
3379
|
except Exception as e:
|
|
@@ -1019,37 +3393,37 @@ def server(input, output, session):
|
|
|
1019
3393
|
suffix = in_path.suffix.lower()
|
|
1020
3394
|
|
|
1021
3395
|
try:
|
|
1022
|
-
if suffix == ".
|
|
1023
|
-
|
|
1024
|
-
converted_reference_path_rv.set(str(
|
|
3396
|
+
if suffix == ".txt":
|
|
3397
|
+
txt_path = in_path
|
|
3398
|
+
converted_reference_path_rv.set(str(txt_path))
|
|
1025
3399
|
else:
|
|
1026
|
-
reference_status_rv.set(f"Converting {in_path.name} →
|
|
3400
|
+
reference_status_rv.set(f"Converting {in_path.name} → TXT…")
|
|
1027
3401
|
await reactive.flush()
|
|
1028
3402
|
|
|
1029
|
-
|
|
3403
|
+
tmp_txt_path = in_path.with_suffix(".converted.txt")
|
|
1030
3404
|
|
|
1031
|
-
out_obj = await asyncio.to_thread(build_library, str(in_path), str(
|
|
3405
|
+
out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
|
|
1032
3406
|
|
|
1033
3407
|
if isinstance(out_obj, (str, os.PathLike, Path)):
|
|
1034
|
-
|
|
3408
|
+
txt_path = Path(out_obj)
|
|
1035
3409
|
elif isinstance(out_obj, pd.DataFrame):
|
|
1036
|
-
out_obj.to_csv(
|
|
1037
|
-
|
|
3410
|
+
out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
|
|
3411
|
+
txt_path = tmp_txt_path
|
|
1038
3412
|
else:
|
|
1039
3413
|
raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
|
|
1040
3414
|
|
|
1041
|
-
converted_reference_path_rv.set(str(
|
|
3415
|
+
converted_reference_path_rv.set(str(txt_path))
|
|
1042
3416
|
|
|
1043
|
-
reference_status_rv.set(f"Reading IDs from: {
|
|
3417
|
+
reference_status_rv.set(f"Reading IDs from: {txt_path.name} …")
|
|
1044
3418
|
await reactive.flush()
|
|
1045
3419
|
|
|
1046
|
-
ids = await asyncio.to_thread(extract_first_column_ids, str(
|
|
3420
|
+
ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
|
|
1047
3421
|
reference_ids_rv.set(ids)
|
|
1048
3422
|
|
|
1049
3423
|
ui.update_selectize("spectrum_ID2", choices=ids, selected=(ids[0] if ids else None))
|
|
1050
3424
|
|
|
1051
3425
|
reference_status_rv.set(
|
|
1052
|
-
f"✅ Loaded {len(ids)} IDs from {
|
|
3426
|
+
f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}"
|
|
1053
3427
|
)
|
|
1054
3428
|
await reactive.flush()
|
|
1055
3429
|
|
|
@@ -1059,7 +3433,7 @@ def server(input, output, session):
|
|
|
1059
3433
|
raise
|
|
1060
3434
|
|
|
1061
3435
|
|
|
1062
|
-
@render.download(filename=lambda: f"plot.
|
|
3436
|
+
@render.download(filename=lambda: f"plot.svg")
|
|
1063
3437
|
def run_btn_plot_spectra():
|
|
1064
3438
|
spectrum_ID1 = input.spectrum_ID1() or None
|
|
1065
3439
|
spectrum_ID2 = input.spectrum_ID2() or None
|
|
@@ -1071,22 +3445,20 @@ def server(input, output, session):
|
|
|
1071
3445
|
if input.high_quality_reference_library() != 'False':
|
|
1072
3446
|
high_quality_reference_library_tmp2 = True
|
|
1073
3447
|
|
|
1074
|
-
print(input.high_quality_reference_library())
|
|
1075
|
-
print(high_quality_reference_library_tmp2)
|
|
1076
|
-
|
|
1077
3448
|
if input.chromatography_platform() == "HRMS":
|
|
1078
|
-
fig = generate_plots_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, similarity_measure=input.similarity_measure(), weights=weights, spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
|
|
3449
|
+
fig = generate_plots_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), weights=weights, spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
|
|
1079
3450
|
plt.show()
|
|
1080
3451
|
elif input.chromatography_platform() == "NRMS":
|
|
1081
|
-
fig = generate_plots_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
|
|
3452
|
+
fig = generate_plots_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
|
|
1082
3453
|
plt.show()
|
|
1083
3454
|
with io.BytesIO() as buf:
|
|
1084
|
-
fig.savefig(buf, format="
|
|
3455
|
+
fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
|
|
1085
3456
|
plt.close()
|
|
1086
3457
|
yield buf.getvalue()
|
|
1087
3458
|
|
|
1088
3459
|
|
|
1089
3460
|
|
|
3461
|
+
|
|
1090
3462
|
@render.download(filename="identification_output.txt")
|
|
1091
3463
|
async def run_btn_spec_lib_matching():
|
|
1092
3464
|
match_log_rv.set("Running identification...\n")
|
|
@@ -1099,7 +3471,7 @@ def server(input, output, session):
|
|
|
1099
3471
|
hq = bool(hq)
|
|
1100
3472
|
|
|
1101
3473
|
weights = [float(weight.strip()) for weight in input.weights().split(",") if weight.strip()]
|
|
1102
|
-
weights = {'Cosine':weights[0], 'Shannon':weights[1], 'Renyi':weights[2], 'Tsallis':weights[3]}
|
|
3474
|
+
weights = {'Cosine': weights[0], 'Shannon': weights[1], 'Renyi': weights[2], 'Tsallis': weights[3]}
|
|
1103
3475
|
|
|
1104
3476
|
common_kwargs = dict(
|
|
1105
3477
|
query_data=input.query_data()[0]["datapath"],
|
|
@@ -1121,37 +3493,81 @@ def server(input, output, session):
|
|
|
1121
3493
|
return_ID_output=True,
|
|
1122
3494
|
)
|
|
1123
3495
|
|
|
3496
|
+
# --- streaming setup (same pattern as your DE block) ---
|
|
1124
3497
|
loop = asyncio.get_running_loop()
|
|
1125
|
-
|
|
3498
|
+
q: asyncio.Queue[str | None] = asyncio.Queue()
|
|
3499
|
+
|
|
3500
|
+
class UIWriter(io.TextIOBase):
|
|
3501
|
+
def write(self, s: str):
|
|
3502
|
+
if s:
|
|
3503
|
+
loop.call_soon_threadsafe(q.put_nowait, s)
|
|
3504
|
+
return len(s)
|
|
3505
|
+
def flush(self): pass
|
|
3506
|
+
|
|
3507
|
+
async def _drain():
|
|
3508
|
+
while True:
|
|
3509
|
+
msg = await q.get()
|
|
3510
|
+
if msg is None:
|
|
3511
|
+
break
|
|
3512
|
+
match_log_rv.set(match_log_rv.get() + msg)
|
|
3513
|
+
await reactive.flush()
|
|
3514
|
+
|
|
3515
|
+
drain_task = asyncio.create_task(_drain())
|
|
3516
|
+
writer = UIWriter()
|
|
3517
|
+
|
|
3518
|
+
# --- worker wrappers that install redirects INSIDE the thread ---
|
|
3519
|
+
def _run_hrms():
|
|
3520
|
+
with redirect_stdout(writer), redirect_stderr(writer):
|
|
3521
|
+
# optional heartbeat
|
|
3522
|
+
print(">> Starting HRMS identification ...", flush=True)
|
|
3523
|
+
return run_spec_lib_matching_on_HRMS_data_shiny(
|
|
3524
|
+
precursor_ion_mz_tolerance=input.precursor_ion_mz_tolerance(),
|
|
3525
|
+
ionization_mode=input.ionization_mode(),
|
|
3526
|
+
adduct=input.adduct(),
|
|
3527
|
+
window_size_centroiding=input.window_size_centroiding(),
|
|
3528
|
+
window_size_matching=input.window_size_matching(),
|
|
3529
|
+
**common_kwargs
|
|
3530
|
+
)
|
|
3531
|
+
|
|
3532
|
+
def _run_nrms():
|
|
3533
|
+
with redirect_stdout(writer), redirect_stderr(writer):
|
|
3534
|
+
print(">> Starting NRMS identification ...", flush=True)
|
|
3535
|
+
return run_spec_lib_matching_on_NRMS_data_shiny(**common_kwargs)
|
|
1126
3536
|
|
|
3537
|
+
# --- run in worker thread and stream output live ---
|
|
1127
3538
|
try:
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
window_size_matching=input.window_size_matching(),
|
|
1134
|
-
**common_kwargs
|
|
1135
|
-
)
|
|
1136
|
-
else:
|
|
1137
|
-
df_out = await asyncio.to_thread(run_spec_lib_matching_on_NRMS_data, **common_kwargs)
|
|
3539
|
+
if input.chromatography_platform() == "HRMS":
|
|
3540
|
+
df_out = await asyncio.to_thread(_run_hrms)
|
|
3541
|
+
else:
|
|
3542
|
+
df_out = await asyncio.to_thread(_run_nrms)
|
|
3543
|
+
|
|
1138
3544
|
match_log_rv.set(match_log_rv.get() + "\n✅ Identification finished.\n")
|
|
1139
3545
|
await reactive.flush()
|
|
3546
|
+
|
|
1140
3547
|
except Exception as e:
|
|
1141
|
-
|
|
3548
|
+
import traceback
|
|
3549
|
+
tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
3550
|
+
match_log_rv.set(match_log_rv.get() + f"\n❌ {type(e).__name__}: {e}\n{tb}\n")
|
|
1142
3551
|
await reactive.flush()
|
|
3552
|
+
# make sure to stop the drainer before re-raising
|
|
3553
|
+
await q.put(None); await drain_task
|
|
1143
3554
|
raise
|
|
1144
3555
|
|
|
1145
|
-
|
|
3556
|
+
finally:
|
|
3557
|
+
await q.put(None)
|
|
3558
|
+
await drain_task
|
|
3559
|
+
|
|
3560
|
+
yield df_out.to_csv(index=True, sep="\t")
|
|
3561
|
+
|
|
1146
3562
|
|
|
1147
3563
|
|
|
1148
3564
|
|
|
1149
|
-
@render.download(filename="plot.
|
|
3565
|
+
@render.download(filename="plot.svg")
|
|
1150
3566
|
def run_btn_plot_spectra_within_spec_lib_matching():
|
|
1151
3567
|
req(input.query_data(), input.reference_data())
|
|
1152
3568
|
|
|
1153
|
-
spectrum_ID1 = input.
|
|
1154
|
-
spectrum_ID2 = input.
|
|
3569
|
+
spectrum_ID1 = input.q_spec() or None
|
|
3570
|
+
spectrum_ID2 = input.r_spec() or None
|
|
1155
3571
|
|
|
1156
3572
|
hq = input.high_quality_reference_library()
|
|
1157
3573
|
if isinstance(hq, str):
|
|
@@ -1167,6 +3583,8 @@ def server(input, output, session):
|
|
|
1167
3583
|
reference_data=input.reference_data()[0]['datapath'],
|
|
1168
3584
|
spectrum_ID1=spectrum_ID1,
|
|
1169
3585
|
spectrum_ID2=spectrum_ID2,
|
|
3586
|
+
print_url_spectrum1=input.print_url_spectrum1(),
|
|
3587
|
+
print_url_spectrum2=input.print_url_spectrum2(),
|
|
1170
3588
|
similarity_measure=input.similarity_measure(),
|
|
1171
3589
|
weights=weights,
|
|
1172
3590
|
spectrum_preprocessing_order=input.spectrum_preprocessing_order(),
|
|
@@ -1192,7 +3610,7 @@ def server(input, output, session):
|
|
|
1192
3610
|
plt.show()
|
|
1193
3611
|
|
|
1194
3612
|
with io.BytesIO() as buf:
|
|
1195
|
-
fig.savefig(buf, format="
|
|
3613
|
+
fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
|
|
1196
3614
|
plt.close()
|
|
1197
3615
|
yield buf.getvalue()
|
|
1198
3616
|
|
|
@@ -1230,6 +3648,9 @@ def server(input, output, session):
|
|
|
1230
3648
|
|
|
1231
3649
|
try:
|
|
1232
3650
|
if input.chromatography_platform() == "HRMS":
|
|
3651
|
+
precursor_ion_mz_tolerance = float(input.precursor_ion_mz_tolerance())
|
|
3652
|
+
ionization_mode = str(input.ionization_mode())
|
|
3653
|
+
adduct = str(input.adduct())
|
|
1233
3654
|
window_size_centroiding_tmp = strip_numeric(input.window_size_centroiding())
|
|
1234
3655
|
window_size_matching_tmp = strip_numeric(input.window_size_matching())
|
|
1235
3656
|
grid = {
|
|
@@ -1249,7 +3670,7 @@ def server(input, output, session):
|
|
|
1249
3670
|
'window_size_centroiding': window_size_centroiding_tmp,
|
|
1250
3671
|
'window_size_matching': window_size_matching_tmp,
|
|
1251
3672
|
}
|
|
1252
|
-
df_out = await asyncio.to_thread(_run_with_redirects, tune_params_on_HRMS_data_grid_shiny, rw, **common_kwargs, grid=grid)
|
|
3673
|
+
df_out = await asyncio.to_thread(_run_with_redirects, tune_params_on_HRMS_data_grid_shiny, rw, **common_kwargs, grid=grid, precursor_ion_mz_tolerance=precursor_ion_mz_tolerance, ionization_mode=ionization_mode, adduct=adduct)
|
|
1253
3674
|
else:
|
|
1254
3675
|
grid = {
|
|
1255
3676
|
'similarity_measure': similarity_measure_tmp,
|
|
@@ -1277,43 +3698,147 @@ def server(input, output, session):
|
|
|
1277
3698
|
is_any_job_running.set(False)
|
|
1278
3699
|
await reactive.flush()
|
|
1279
3700
|
|
|
1280
|
-
yield df_out.to_csv(index=False).encode("utf-8"
|
|
3701
|
+
yield df_out.to_csv(index=False, sep='\t').encode("utf-8")
|
|
3702
|
+
|
|
1281
3703
|
|
|
1282
3704
|
|
|
1283
3705
|
@reactive.effect
|
|
1284
3706
|
@reactive.event(input.run_btn_parameter_tuning_DE)
|
|
1285
|
-
def
|
|
3707
|
+
async def run_btn_parameter_tuning_DE():
|
|
3708
|
+
match_log_rv.set("Tuning specified continuous parameters using differential evolution...\n")
|
|
1286
3709
|
is_any_job_running.set(True)
|
|
1287
3710
|
is_tuning_DE_running.set(True)
|
|
1288
|
-
|
|
3711
|
+
await reactive.flush()
|
|
1289
3712
|
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
3713
|
+
def _safe_float(v, default):
|
|
3714
|
+
try:
|
|
3715
|
+
if v is None:
|
|
3716
|
+
return default
|
|
3717
|
+
return float(v)
|
|
3718
|
+
except Exception:
|
|
3719
|
+
return default
|
|
3720
|
+
|
|
3721
|
+
def _iget(id, default=None):
|
|
3722
|
+
if id in input:
|
|
3723
|
+
try:
|
|
3724
|
+
return input[id]()
|
|
3725
|
+
except SilentException:
|
|
3726
|
+
return default
|
|
3727
|
+
return default
|
|
3728
|
+
|
|
3729
|
+
loop = asyncio.get_running_loop()
|
|
3730
|
+
q: asyncio.Queue[str | None] = asyncio.Queue()
|
|
3731
|
+
|
|
3732
|
+
class UIWriter(io.TextIOBase):
|
|
3733
|
+
def write(self, s: str):
|
|
3734
|
+
if s:
|
|
3735
|
+
loop.call_soon_threadsafe(q.put_nowait, s)
|
|
3736
|
+
return len(s)
|
|
3737
|
+
def flush(self): pass
|
|
3738
|
+
|
|
3739
|
+
async def _drain():
|
|
3740
|
+
while True:
|
|
3741
|
+
msg = await q.get()
|
|
3742
|
+
if msg is None:
|
|
3743
|
+
break
|
|
3744
|
+
match_log_rv.set(match_log_rv.get() + msg)
|
|
3745
|
+
await reactive.flush()
|
|
3746
|
+
|
|
3747
|
+
drain_task = asyncio.create_task(_drain())
|
|
3748
|
+
writer = UIWriter()
|
|
3749
|
+
|
|
3750
|
+
try:
|
|
3751
|
+
qfile = _iget("query_data")[0]["datapath"]
|
|
3752
|
+
rfile = _iget("reference_data")[0]["datapath"]
|
|
1316
3753
|
|
|
3754
|
+
platform = _iget("chromatography_platform", "HRMS")
|
|
3755
|
+
sim = _iget("similarity_measure", "cosine")
|
|
3756
|
+
spro = _iget("spectrum_preprocessing_order", "FCNMWL")
|
|
3757
|
+
|
|
3758
|
+
hq_raw = _iget("high_quality_reference_library", False)
|
|
3759
|
+
if isinstance(hq_raw, str):
|
|
3760
|
+
hq = hq_raw.lower() == "true"
|
|
3761
|
+
else:
|
|
3762
|
+
hq = bool(hq_raw)
|
|
3763
|
+
|
|
3764
|
+
mz_min = _safe_float(_iget("mz_min", 0.0), 0.0)
|
|
3765
|
+
mz_max = _safe_float(_iget("mz_max", 99_999_999.0), 99_999_999.0)
|
|
3766
|
+
int_min = _safe_float(_iget("int_min", 0.0), 0.0)
|
|
3767
|
+
int_max = _safe_float(_iget("int_max", 999_999_999.0), 999_999_999.0)
|
|
3768
|
+
|
|
3769
|
+
w_text = _iget("weights", "") or ""
|
|
3770
|
+
w_list = [float(w.strip()) for w in w_text.split(",") if w.strip()]
|
|
3771
|
+
w_list = (w_list + [0.0, 0.0, 0.0, 0.0])[:4]
|
|
3772
|
+
weights = {"Cosine": w_list[0], "Shannon": w_list[1], "Renyi": w_list[2], "Tsallis": w_list[3]}
|
|
3773
|
+
|
|
3774
|
+
opt_params = tuple(_iget("params", ()) or ())
|
|
3775
|
+
bounds_dict = {}
|
|
3776
|
+
param_defaults = PARAMS_HRMS if platform == "HRMS" else PARAMS_NRMS
|
|
3777
|
+
for p in opt_params:
|
|
3778
|
+
lo = _safe_float(_iget(f"min_{p}", param_defaults.get(p, (0.0, 1.0))[0]),
|
|
3779
|
+
param_defaults.get(p, (0.0, 1.0))[0])
|
|
3780
|
+
hi = _safe_float(_iget(f"max_{p}", param_defaults.get(p, (0.0, 1.0))[1]),
|
|
3781
|
+
param_defaults.get(p, (0.0, 1.0))[1])
|
|
3782
|
+
if lo > hi:
|
|
3783
|
+
lo, hi = hi, lo
|
|
3784
|
+
bounds_dict[p] = (lo, hi)
|
|
3785
|
+
|
|
3786
|
+
defaults = {
|
|
3787
|
+
"window_size_centroiding": _safe_float(_iget("window_size_centroiding", 0.5), 0.5),
|
|
3788
|
+
"window_size_matching": _safe_float(_iget("window_size_matching", 0.5), 0.5),
|
|
3789
|
+
"noise_threshold": _safe_float(_iget("noise_threshold", 0.0), 0.0),
|
|
3790
|
+
"wf_mz": _safe_float(_iget("wf_mz", 0.0), 0.0),
|
|
3791
|
+
"wf_int": _safe_float(_iget("wf_int", 1.0), 1.0),
|
|
3792
|
+
"LET_threshold": _safe_float(_iget("LET_threshold", 0.0), 0.0),
|
|
3793
|
+
"entropy_dimension": _safe_float(_iget("entropy_dimension", 1.1), 1.1),
|
|
3794
|
+
}
|
|
3795
|
+
if platform == "NRMS":
|
|
3796
|
+
defaults.pop("window_size_centroiding", None)
|
|
3797
|
+
defaults.pop("window_size_matching", None)
|
|
3798
|
+
|
|
3799
|
+
except Exception as e:
|
|
3800
|
+
import traceback
|
|
3801
|
+
tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
3802
|
+
match_log_rv.set(match_log_rv.get() + f"\n❌ Input snapshot failed:\n{tb}\n")
|
|
3803
|
+
is_tuning_DE_running.set(False); is_any_job_running.set(False)
|
|
3804
|
+
await q.put(None); await drain_task; await reactive.flush()
|
|
3805
|
+
return
|
|
3806
|
+
|
|
3807
|
+
def _run():
|
|
3808
|
+
with redirect_stdout(writer), redirect_stderr(writer):
|
|
3809
|
+
return tune_params_DE(
|
|
3810
|
+
query_data=qfile,
|
|
3811
|
+
reference_data=rfile,
|
|
3812
|
+
precursor_ion_mz_tolerance=float(input.precursor_ion_mz_tolerance()),
|
|
3813
|
+
ionization_mode=input.ionization_mode(),
|
|
3814
|
+
adduct=input.adduct(),
|
|
3815
|
+
chromatography_platform=input.chromatography_platform(),
|
|
3816
|
+
similarity_measure=sim,
|
|
3817
|
+
weights=weights,
|
|
3818
|
+
spectrum_preprocessing_order=spro,
|
|
3819
|
+
mz_min=mz_min, mz_max=mz_max,
|
|
3820
|
+
int_min=int_min, int_max=int_max,
|
|
3821
|
+
high_quality_reference_library=hq,
|
|
3822
|
+
optimize_params=list(opt_params),
|
|
3823
|
+
param_bounds=bounds_dict,
|
|
3824
|
+
default_params=defaults,
|
|
3825
|
+
de_workers=1,
|
|
3826
|
+
maxiters=input.max_iterations()
|
|
3827
|
+
)
|
|
3828
|
+
|
|
3829
|
+
try:
|
|
3830
|
+
_ = await asyncio.to_thread(_run)
|
|
3831
|
+
match_log_rv.set(match_log_rv.get() + "\n✅ Differential evolution finished.\n")
|
|
3832
|
+
except Exception as e:
|
|
3833
|
+
import traceback
|
|
3834
|
+
tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
3835
|
+
match_log_rv.set(match_log_rv.get() + f"\n❌ {type(e).__name__}: {e}\n{tb}\n")
|
|
3836
|
+
finally:
|
|
3837
|
+
await q.put(None)
|
|
3838
|
+
await drain_task
|
|
3839
|
+
is_tuning_DE_running.set(False)
|
|
3840
|
+
is_any_job_running.set(False)
|
|
3841
|
+
await reactive.flush()
|
|
1317
3842
|
|
|
1318
3843
|
|
|
1319
3844
|
@reactive.effect
|
|
@@ -1335,8 +3860,12 @@ def server(input, output, session):
|
|
|
1335
3860
|
return run_status_parameter_tuning_grid.get()
|
|
1336
3861
|
return run_status_parameter_tuning_DE.get()
|
|
1337
3862
|
|
|
3863
|
+
@output
|
|
3864
|
+
@render.text
|
|
3865
|
+
def run_log():
|
|
3866
|
+
return match_log_rv.get()
|
|
1338
3867
|
|
|
1339
|
-
app = App(app_ui, server)
|
|
1340
3868
|
|
|
3869
|
+
app = App(app_ui, server)
|
|
1341
3870
|
|
|
1342
3871
|
|