pycompound 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +2589 -237
- pycompound/build_library.py +77 -20
- pycompound/plot_spectra.py +1 -1
- pycompound/processing.py +5 -5
- pycompound/spec_lib_matching.py +245 -471
- pycompound/spec_lib_matching_CLI.py +48 -2
- pycompound/tuning_CLI_DE.py +22 -22
- pycompound/tuning_CLI_grid.py +22 -6
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/METADATA +1 -1
- pycompound-0.1.7.dist-info/RECORD +15 -0
- pycompound-0.1.6.dist-info/RECORD +0 -15
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/WHEEL +0 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/top_level.txt +0 -0
app.py
CHANGED
|
@@ -1,15 +1,6 @@
|
|
|
1
1
|
|
|
2
2
|
from shiny import App, ui, reactive, render, req
|
|
3
3
|
from shiny.types import SilentException
|
|
4
|
-
from pycompound.spec_lib_matching import run_spec_lib_matching_on_HRMS_data
|
|
5
|
-
from pycompound.spec_lib_matching import run_spec_lib_matching_on_NRMS_data
|
|
6
|
-
from pycompound.spec_lib_matching import tune_params_on_HRMS_data_grid
|
|
7
|
-
from pycompound.spec_lib_matching import tune_params_on_NRMS_data_grid
|
|
8
|
-
from pycompound.spec_lib_matching import tune_params_on_HRMS_data_grid_shiny
|
|
9
|
-
from pycompound.spec_lib_matching import tune_params_on_NRMS_data_grid_shiny
|
|
10
|
-
from pycompound.spec_lib_matching import tune_params_DE
|
|
11
|
-
from pycompound.plot_spectra import generate_plots_on_HRMS_data
|
|
12
|
-
from pycompound.plot_spectra import generate_plots_on_NRMS_data
|
|
13
4
|
from pathlib import Path
|
|
14
5
|
from contextlib import redirect_stdout, redirect_stderr
|
|
15
6
|
import contextlib
|
|
@@ -28,10 +19,2296 @@ import ast
|
|
|
28
19
|
from numbers import Real
|
|
29
20
|
import logging
|
|
30
21
|
from scipy.optimize import differential_evolution
|
|
22
|
+
import scipy
|
|
23
|
+
import scipy.stats
|
|
24
|
+
from itertools import product
|
|
25
|
+
import json
|
|
26
|
+
import re
|
|
27
|
+
import urllib.parse
|
|
28
|
+
import urllib.request
|
|
29
|
+
import matplotlib
|
|
31
30
|
|
|
31
|
+
matplotlib.rcParams['svg.fonttype'] = 'none'
|
|
32
32
|
|
|
33
33
|
_LOG_QUEUE: asyncio.Queue[str] = asyncio.Queue()
|
|
34
34
|
|
|
35
|
+
_ADDUCT_PAT = re.compile(r"\s*(?:\[(M[^\]]+)\]|(M[+-][A-Za-z0-9]+)\+?)\s*$", re.IGNORECASE)
|
|
36
|
+
|
|
37
|
+
def start_log_consumer():
|
|
38
|
+
if getattr(start_log_consumer, "_started", False):
|
|
39
|
+
return
|
|
40
|
+
start_log_consumer._started = True
|
|
41
|
+
|
|
42
|
+
async def _consume():
|
|
43
|
+
while True:
|
|
44
|
+
s = await _LOG_QUEUE.get()
|
|
45
|
+
match_log_rv.set(match_log_rv.get() + s)
|
|
46
|
+
await reactive.flush()
|
|
47
|
+
|
|
48
|
+
asyncio.create_task(_consume())
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def start_log_consumer():
|
|
52
|
+
if getattr(start_log_consumer, "_started", False):
|
|
53
|
+
return
|
|
54
|
+
start_log_consumer._started = True
|
|
55
|
+
|
|
56
|
+
async def _consume():
|
|
57
|
+
while True:
|
|
58
|
+
s = await _LOG_QUEUE.get()
|
|
59
|
+
match_log_rv.set(match_log_rv.get() + s)
|
|
60
|
+
await reactive.flush()
|
|
61
|
+
|
|
62
|
+
asyncio.create_task(_consume())
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _strip_adduct(name: str) -> str:
|
|
67
|
+
return _ADDUCT_PAT.sub("", name).strip()
|
|
68
|
+
|
|
69
|
+
def get_pubchem_url(query: str) -> str:
|
|
70
|
+
base_name = _strip_adduct(query)
|
|
71
|
+
endpoint = ("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/" + urllib.parse.quote(base_name) + "/cids/TXT")
|
|
72
|
+
try:
|
|
73
|
+
with urllib.request.urlopen(endpoint, timeout=10) as r:
|
|
74
|
+
txt = r.read().decode("utf-8").strip()
|
|
75
|
+
cid = txt.splitlines()[0].strip()
|
|
76
|
+
if cid.isdigit():
|
|
77
|
+
return f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
|
|
78
|
+
except Exception:
|
|
79
|
+
pass
|
|
80
|
+
q = urllib.parse.quote(base_name)
|
|
81
|
+
return f"https://pubchem.ncbi.nlm.nih.gov/#query={q}"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def build_library_from_raw_data(input_path=None, output_path=None, is_reference=False):
|
|
86
|
+
if input_path is None:
|
|
87
|
+
print('Error: please specify input_path (i.e. the path to the input mgf, mzML, cdf, json, or msp file). Mandatory argument.')
|
|
88
|
+
sys.exit()
|
|
89
|
+
|
|
90
|
+
if output_path is None:
|
|
91
|
+
tmp = input_path.split('/')
|
|
92
|
+
tmp = tmp[(len(tmp)-1)]
|
|
93
|
+
basename = tmp.split('.')[0]
|
|
94
|
+
output_path = f'{Path.cwd()}/{basename}.csv'
|
|
95
|
+
print(f'Warning: no output_path specified, so library is written to {output_path}')
|
|
96
|
+
|
|
97
|
+
if is_reference not in [True,False]:
|
|
98
|
+
print('Error: is_reference must be either \'True\' or \'False\'.')
|
|
99
|
+
sys.exit()
|
|
100
|
+
|
|
101
|
+
last_three_chars = input_path[(len(input_path)-3):len(input_path)]
|
|
102
|
+
last_four_chars = input_path[(len(input_path)-4):len(input_path)]
|
|
103
|
+
if last_three_chars == 'mgf' or last_three_chars == 'MGF':
|
|
104
|
+
input_file_type = 'mgf'
|
|
105
|
+
elif last_four_chars == 'mzML' or last_four_chars == 'mzml' or last_four_chars == 'MZML':
|
|
106
|
+
input_file_type = 'mzML'
|
|
107
|
+
elif last_four_chars == 'json' or last_four_chars == 'JSON':
|
|
108
|
+
input_file_type = 'json'
|
|
109
|
+
elif last_three_chars == 'cdf' or last_three_chars == 'CDF':
|
|
110
|
+
input_file_type = 'cdf'
|
|
111
|
+
elif last_three_chars == 'msp' or last_three_chars == 'MSP':
|
|
112
|
+
input_file_type = 'msp'
|
|
113
|
+
else:
|
|
114
|
+
print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'json\', or \'msp\' file must be passed to --input_path')
|
|
115
|
+
sys.exit()
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def generate_plots_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz=None, precursor_ion_mz_tolerance=None, ionization_mode=None, collision_energy=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
|
|
120
|
+
|
|
121
|
+
if query_data is None:
|
|
122
|
+
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
|
|
123
|
+
sys.exit()
|
|
124
|
+
else:
|
|
125
|
+
extension = query_data.rsplit('.',1)
|
|
126
|
+
extension = extension[(len(extension)-1)]
|
|
127
|
+
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
|
|
128
|
+
output_path_tmp = query_data[:-3] + 'txt'
|
|
129
|
+
#build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=True)
|
|
130
|
+
build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
|
|
131
|
+
df_query = pd.read_csv(output_path_tmp, sep='\t')
|
|
132
|
+
if extension == 'txt' or extension == 'TXT':
|
|
133
|
+
df_query = pd.read_csv(query_data, sep='\t')
|
|
134
|
+
unique_query_ids = df_query['id'].unique().tolist()
|
|
135
|
+
unique_query_ids = [str(tmp) for tmp in unique_query_ids]
|
|
136
|
+
|
|
137
|
+
if reference_data is None:
|
|
138
|
+
print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
|
|
139
|
+
sys.exit()
|
|
140
|
+
else:
|
|
141
|
+
extension = reference_data.rsplit('.',1)
|
|
142
|
+
extension = extension[(len(extension)-1)]
|
|
143
|
+
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
|
|
144
|
+
output_path_tmp = reference_data[:-3] + 'txt'
|
|
145
|
+
build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
|
|
146
|
+
df_reference = pd.read_csv(output_path_tmp, sep='\t')
|
|
147
|
+
if extension == 'txt' or extension == 'TXT':
|
|
148
|
+
df_reference = pd.read_csv(reference_data, sep='\t')
|
|
149
|
+
cols_tmp = df_reference.columns.tolist()
|
|
150
|
+
if 'precursor_ion_mz' in cols_tmp and 'ionization_mode' in cols_tmp and 'collision_energy' in cols_tmp:
|
|
151
|
+
if precursor_ion_mz is not None and precursor_ion_mz_tolerance is not None:
|
|
152
|
+
df_reference = df_reference.loc[(df_reference['precursor_ion_mz'] > (precursor_ion_mz-precursor_ion_mz_tolerance) & df_reference['precursor_ion_mz'] < (precursor_ion_mz+precursor_ion_mz_tolerance))]
|
|
153
|
+
if ionization_mode is not None:
|
|
154
|
+
df_reference = df_reference.loc[df_reference['ionization_mode'==ionization_mode]]
|
|
155
|
+
if collision_energy is not None:
|
|
156
|
+
df_reference = df_reference.loc[df_reference['collision_energy'==collision_energy]]
|
|
157
|
+
df_reference = df_reference.drop(columns=['precursor_ion_mz','ionization_mode','collision_energy'])
|
|
158
|
+
unique_reference_ids = df_reference['id'].unique().tolist()
|
|
159
|
+
unique_reference_ids = [str(tmp) for tmp in unique_reference_ids]
|
|
160
|
+
|
|
161
|
+
if spectrum_ID1 is not None:
|
|
162
|
+
spectrum_ID1 = str(spectrum_ID1)
|
|
163
|
+
else:
|
|
164
|
+
spectrum_ID1 = str(df_query['id'].iloc[0])
|
|
165
|
+
print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')
|
|
166
|
+
|
|
167
|
+
if spectrum_ID2 is not None:
|
|
168
|
+
spectrum_ID2 = str(spectrum_ID2)
|
|
169
|
+
else:
|
|
170
|
+
spectrum_ID2 = str(df_reference['id'].iloc[0])
|
|
171
|
+
print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')
|
|
172
|
+
|
|
173
|
+
if spectrum_preprocessing_order is not None:
|
|
174
|
+
spectrum_preprocessing_order = list(spectrum_preprocessing_order)
|
|
175
|
+
else:
|
|
176
|
+
spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
|
|
177
|
+
if 'M' not in spectrum_preprocessing_order:
|
|
178
|
+
print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
|
|
179
|
+
sys.exit()
|
|
180
|
+
if 'C' in spectrum_preprocessing_order:
|
|
181
|
+
if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
|
|
182
|
+
print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
|
|
183
|
+
sys.exit()
|
|
184
|
+
if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
|
|
185
|
+
print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
|
|
186
|
+
sys.exit()
|
|
187
|
+
|
|
188
|
+
if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
|
|
189
|
+
print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
|
|
190
|
+
sys.exit()
|
|
191
|
+
|
|
192
|
+
if isinstance(int_min,int) is True:
|
|
193
|
+
int_min = float(int_min)
|
|
194
|
+
if isinstance(int_max,int) is True:
|
|
195
|
+
int_max = float(int_max)
|
|
196
|
+
if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
|
|
197
|
+
print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
|
|
198
|
+
sys.exit()
|
|
199
|
+
if mz_min < 0:
|
|
200
|
+
print('\nError: mz_min should be a non-negative integer')
|
|
201
|
+
sys.exit()
|
|
202
|
+
if mz_max <= 0:
|
|
203
|
+
print('\nError: mz_max should be a positive integer')
|
|
204
|
+
sys.exit()
|
|
205
|
+
if int_min < 0:
|
|
206
|
+
print('\nError: int_min should be a non-negative float')
|
|
207
|
+
sys.exit()
|
|
208
|
+
if int_max <= 0:
|
|
209
|
+
print('\nError: int_max should be a positive float')
|
|
210
|
+
sys.exit()
|
|
211
|
+
|
|
212
|
+
if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
|
|
213
|
+
print('Error: window_size_centroiding must be a positive float.')
|
|
214
|
+
sys.exit()
|
|
215
|
+
if isinstance(window_size_matching,float) is False or window_size_matching<= 0.0:
|
|
216
|
+
print('Error: window_size_matching must be a positive float.')
|
|
217
|
+
sys.exit()
|
|
218
|
+
|
|
219
|
+
if isinstance(noise_threshold,int) is True:
|
|
220
|
+
noise_threshold = float(noise_threshold)
|
|
221
|
+
if isinstance(noise_threshold,float) is False or noise_threshold < 0:
|
|
222
|
+
print('Error: noise_threshold must be a positive float.')
|
|
223
|
+
sys.exit()
|
|
224
|
+
|
|
225
|
+
if isinstance(wf_intensity,int) is True:
|
|
226
|
+
wf_intensity = float(wf_intensity)
|
|
227
|
+
if isinstance(wf_mz,int) is True:
|
|
228
|
+
wf_mz = float(wf_mz)
|
|
229
|
+
if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
|
|
230
|
+
print('Error: wf_mz and wf_intensity must be integers or floats')
|
|
231
|
+
sys.exit()
|
|
232
|
+
|
|
233
|
+
if entropy_dimension <= 0:
|
|
234
|
+
print('\nError: entropy_dimension should be a positive float')
|
|
235
|
+
sys.exit()
|
|
236
|
+
else:
|
|
237
|
+
q = entropy_dimension
|
|
238
|
+
|
|
239
|
+
normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
|
|
240
|
+
|
|
241
|
+
if y_axis_transformation not in ['normalized','none','log10','sqrt']:
|
|
242
|
+
print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
|
|
243
|
+
sys.exit()
|
|
244
|
+
|
|
245
|
+
if output_path is None:
|
|
246
|
+
print(f'Warning: plots will be saved to the svg ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
|
|
247
|
+
output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
|
|
251
|
+
query_idx = unique_query_ids.index(spectrum_ID1)
|
|
252
|
+
reference_idx = unique_query_ids.index(spectrum_ID2)
|
|
253
|
+
q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[query_idx])[0]
|
|
254
|
+
r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[reference_idx])[0]
|
|
255
|
+
q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
256
|
+
r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
257
|
+
elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
|
|
258
|
+
query_idx = unique_reference_ids.index(spectrum_ID1)
|
|
259
|
+
reference_idx = unique_reference_ids.index(spectrum_ID2)
|
|
260
|
+
q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[query_idx])[0]
|
|
261
|
+
r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[reference_idx])[0]
|
|
262
|
+
q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
263
|
+
r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
264
|
+
else:
|
|
265
|
+
if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
|
|
266
|
+
spec_tmp = spectrum_ID1
|
|
267
|
+
spectrum_ID1 = spectrum_ID2
|
|
268
|
+
spectrum_ID2 = spec_tmp
|
|
269
|
+
query_idx = unique_query_ids.index(spectrum_ID1)
|
|
270
|
+
reference_idx = unique_reference_ids.index(spectrum_ID2)
|
|
271
|
+
q_idxs_tmp = np.where(df_query['id'].astype(str) == unique_query_ids[query_idx])[0]
|
|
272
|
+
r_idxs_tmp = np.where(df_reference['id'].astype(str) == unique_reference_ids[reference_idx])[0]
|
|
273
|
+
q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
|
|
274
|
+
r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
q_spec_pre_trans = q_spec.copy()
|
|
278
|
+
r_spec_pre_trans = r_spec.copy()
|
|
279
|
+
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
|
|
280
|
+
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
|
|
281
|
+
|
|
282
|
+
if y_axis_transformation == 'normalized':
|
|
283
|
+
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
|
|
284
|
+
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
|
|
285
|
+
ylab = 'Normalized Intensity'
|
|
286
|
+
elif y_axis_transformation == 'log10':
|
|
287
|
+
q_spec_pre_trans[:,1] = np.log10(np.array(q_spec_pre_trans[:,1]+1,dtype=float))
|
|
288
|
+
r_spec_pre_trans[:,1] = np.log10(np.array(r_spec_pre_trans[:,1]+1,dtype=float))
|
|
289
|
+
ylab = 'log10(Intensity)'
|
|
290
|
+
elif y_axis_transformation == 'sqrt':
|
|
291
|
+
q_spec_pre_trans[:,1] = np.sqrt(np.array(q_spec_pre_trans[:,1],dtype=float))
|
|
292
|
+
r_spec_pre_trans[:,1] = np.sqrt(np.array(r_spec_pre_trans[:,1],dtype=float))
|
|
293
|
+
ylab = 'sqrt(Intensity)'
|
|
294
|
+
else:
|
|
295
|
+
ylab = 'Raw Intensity'
|
|
296
|
+
|
|
297
|
+
fig, axes = plt.subplots(nrows=2, ncols=1)
|
|
298
|
+
|
|
299
|
+
plt.subplot(2,1,1)
|
|
300
|
+
plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*q_spec_pre_trans.shape[0], ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID 1: {spectrum_ID1}')
|
|
301
|
+
plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*r_spec_pre_trans.shape[0], ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID 2: {spectrum_ID2}')
|
|
302
|
+
plt.xlabel('m/z',fontsize=7)
|
|
303
|
+
plt.ylabel(ylab, fontsize=7)
|
|
304
|
+
plt.xticks(fontsize=7)
|
|
305
|
+
plt.yticks(fontsize=7)
|
|
306
|
+
plt.title('Untransformed Spectra', fontsize=10)
|
|
307
|
+
|
|
308
|
+
mz_min_tmp_q = round(q_spec[:,0].min(),1)
|
|
309
|
+
mz_min_tmp_r = round(r_spec[:,0].min(),1)
|
|
310
|
+
int_min_tmp_q = round(q_spec[:,1].min(),1)
|
|
311
|
+
int_min_tmp_r = round(r_spec[:,1].min(),1)
|
|
312
|
+
mz_max_tmp_q = round(q_spec[:,0].max(),1)
|
|
313
|
+
mz_max_tmp_r = round(r_spec[:,0].max(),1)
|
|
314
|
+
int_max_tmp_q = round(q_spec[:,1].max(),1)
|
|
315
|
+
int_max_tmp_r = round(r_spec[:,1].max(),1)
|
|
316
|
+
mz_min_tmp = min([mz_min_tmp_q,mz_min_tmp_r])
|
|
317
|
+
mz_max_tmp = min([mz_max_tmp_q,mz_max_tmp_r])
|
|
318
|
+
int_min_tmp = min([int_min_tmp_q,int_min_tmp_r])
|
|
319
|
+
int_max_tmp = max([int_max_tmp_q,int_max_tmp_r])
|
|
320
|
+
|
|
321
|
+
is_matched = False
|
|
322
|
+
for transformation in spectrum_preprocessing_order:
|
|
323
|
+
if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
324
|
+
q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
|
|
325
|
+
r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
|
|
326
|
+
if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
327
|
+
m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
|
|
328
|
+
q_spec = m_spec[:,0:2]
|
|
329
|
+
r_spec = m_spec[:,[0,2]]
|
|
330
|
+
is_matched = True
|
|
331
|
+
if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
332
|
+
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
333
|
+
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
334
|
+
if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
335
|
+
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
336
|
+
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
337
|
+
if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
338
|
+
q_spec = remove_noise(q_spec, nr = noise_threshold)
|
|
339
|
+
if high_quality_reference_library == False or high_quality_reference_library == 'False':
|
|
340
|
+
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
341
|
+
if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
342
|
+
q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
|
|
343
|
+
if high_quality_reference_library == False or high_quality_reference_library == 'False':
|
|
344
|
+
r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
|
|
345
|
+
|
|
346
|
+
q_ints = q_spec[:,1]
|
|
347
|
+
r_ints = r_spec[:,1]
|
|
348
|
+
|
|
349
|
+
if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
350
|
+
similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
|
|
351
|
+
else:
|
|
352
|
+
similarity_score = 0
|
|
353
|
+
|
|
354
|
+
plt.subplot(2,1,2)
|
|
355
|
+
|
|
356
|
+
if q_spec.shape[0] > 1:
|
|
357
|
+
if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
|
|
358
|
+
plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
359
|
+
plt.xticks([])
|
|
360
|
+
plt.yticks([])
|
|
361
|
+
else:
|
|
362
|
+
if y_axis_transformation == 'normalized':
|
|
363
|
+
q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
|
|
364
|
+
r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
|
|
365
|
+
ylab='Normalized Intensity'
|
|
366
|
+
elif y_axis_transformation == 'log10':
|
|
367
|
+
q_spec[:,1] = np.log10(q_spec[:,1]+1)
|
|
368
|
+
r_spec[:,1] = np.log10(r_spec[:,1]+1)
|
|
369
|
+
ylab='log10(Intensity)'
|
|
370
|
+
elif y_axis_transformation == 'sqrt':
|
|
371
|
+
q_spec[:,1] = np.sqrt(q_spec[:,1])
|
|
372
|
+
r_spec[:,1] = np.sqrt(r_spec[:,1])
|
|
373
|
+
ylab='sqrt(Intensity)'
|
|
374
|
+
else:
|
|
375
|
+
ylab = 'Raw Intensity'
|
|
376
|
+
plt.vlines(x=q_spec[:,0], ymin=[0]*q_spec.shape[0], ymax=q_spec[:,1], linewidth=3, color='blue')
|
|
377
|
+
plt.vlines(x=r_spec[:,0], ymin=[0]*r_spec.shape[0], ymax=-r_spec[:,1], linewidth=3, color='red')
|
|
378
|
+
plt.xlabel('m/z', fontsize=7)
|
|
379
|
+
plt.ylabel(ylab, fontsize=7)
|
|
380
|
+
plt.xticks(fontsize=7)
|
|
381
|
+
plt.yticks(fontsize=7)
|
|
382
|
+
plt.title(f'Transformed Spectra', fontsize=10)
|
|
383
|
+
else:
|
|
384
|
+
plt.text(0.5, 0.5, 'All points in the spectra were removed during preprocessing. \nChange the spectrum_preprocesing_order and/or change other spectrum-preprocessing parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
385
|
+
plt.xticks([])
|
|
386
|
+
plt.yticks([])
|
|
387
|
+
|
|
388
|
+
plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
|
|
389
|
+
plt.figlegend(loc='upper center')
|
|
390
|
+
|
|
391
|
+
fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
|
|
392
|
+
fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
|
|
393
|
+
fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
|
|
394
|
+
fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
|
|
395
|
+
fig.text(0.05, 0.08, f'Window Size (Centroiding): {window_size_centroiding}', fontsize=7)
|
|
396
|
+
fig.text(0.05, 0.05, f'Window Size (Matching): {window_size_matching}', fontsize=7)
|
|
397
|
+
if similarity_measure == 'mixture':
|
|
398
|
+
fig.text(0.05, 0.02, f'Weights for mixture similarity: {weights}', fontsize=7)
|
|
399
|
+
|
|
400
|
+
fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{mz_min_tmp},{mz_max_tmp}]', fontsize=7)
|
|
401
|
+
fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
|
|
402
|
+
fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
|
|
403
|
+
fig.text(0.40, 0.11, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
|
|
404
|
+
fig.text(0.40, 0.08, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
|
|
405
|
+
|
|
406
|
+
if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
|
|
407
|
+
url_tmp1 = get_pubchem_url(query=spectrum_ID1)
|
|
408
|
+
url_tmp2 = get_pubchem_url(query=spectrum_ID2)
|
|
409
|
+
t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
|
|
410
|
+
t2 = fig.text(0.40, 0.02, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
|
|
411
|
+
t1.set_url(url_tmp1)
|
|
412
|
+
t2.set_url(url_tmp2)
|
|
413
|
+
|
|
414
|
+
if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
|
|
415
|
+
url_tmp1 = get_pubchem_url(query=spectrum_ID1)
|
|
416
|
+
t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
|
|
417
|
+
t1.set_url(url_tmp1)
|
|
418
|
+
|
|
419
|
+
if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
|
|
420
|
+
url_tmp2 = get_pubchem_url(query=spectrum_ID2)
|
|
421
|
+
t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
|
|
422
|
+
t2.set_url(url_tmp2)
|
|
423
|
+
|
|
424
|
+
fig.savefig(output_path, format='svg')
|
|
425
|
+
|
|
426
|
+
if return_plot == True:
|
|
427
|
+
return fig
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FNLW', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
    """Plot a head-to-tail comparison of two NRMS (nominal-resolution) spectra.

    Loads the query and reference libraries (converting raw MS formats to
    TXT first), selects one spectrum from each side (defaulting to the first
    spectrum of each library), draws the untransformed pair in the top panel,
    applies the requested preprocessing steps, computes the similarity score,
    draws the transformed pair in the bottom panel, annotates the figure with
    the run parameters, and writes everything to an SVG at ``output_path``.

    NOTE(review): this body was reconstructed from a whitespace-mangled
    source; branch nesting in the plotting sections follows the structure of
    the second subplot — confirm against the upstream pycompound source.
    NOTE(review): ``weights`` uses a mutable dict default; it is not mutated
    here, but confirm callers do not rely on modifying it.

    Returns the matplotlib Figure when ``return_plot`` is True, else None.
    """
    # --- load the query library; raw formats are converted to TXT first ---
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
            # assumes a 3-character extension; 'mzML' gives an odd name — TODO confirm
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_query = pd.read_csv(query_data, sep='\t')
        unique_query_ids = df_query['id'].unique()

    # --- load the reference library the same way ---
    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        extension = reference_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
            output_path_tmp = reference_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
            df_reference = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_reference = pd.read_csv(reference_data, sep='\t')
        unique_reference_ids = df_reference['id'].unique()

    # --- pick the two spectra; default to the first row of each library ---
    if spectrum_ID1 is not None:
        spectrum_ID1 = str(spectrum_ID1)
    else:
        spectrum_ID1 = str(df_query.iloc[0,0])
        print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')

    if spectrum_ID2 is not None:
        spectrum_ID2 = str(spectrum_ID2)
    else:
        spectrum_ID2 = str(df_reference.iloc[0,0])
        print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')

    # --- validate the preprocessing order string (F=filter, N=noise, W=weight, L=low-entropy) ---
    if spectrum_preprocessing_order is not None:
        spectrum_preprocessing_order = list(spectrum_preprocessing_order)
    else:
        spectrum_preprocessing_order = ['F','N','W','L']
    if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
        print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
        sys.exit()

    if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
        print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
        sys.exit()

    # --- coerce/validate the numeric window parameters ---
    if isinstance(int_min,int) is True:
        int_min = float(int_min)
    if isinstance(int_max,int) is True:
        int_max = float(int_max)
    if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
        print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
        sys.exit()
    if mz_min < 0:
        print('\nError: mz_min should be a non-negative integer')
        sys.exit()
    if mz_max <= 0:
        print('\nError: mz_max should be a positive integer')
        sys.exit()
    if int_min < 0:
        print('\nError: int_min should be a non-negative float')
        sys.exit()
    if int_max <= 0:
        print('\nError: int_max should be a positive float')
        sys.exit()

    if isinstance(noise_threshold,int) is True:
        noise_threshold = float(noise_threshold)
    if isinstance(noise_threshold,float) is False or noise_threshold < 0:
        print('Error: noise_threshold must be a positive float.')
        sys.exit()

    if isinstance(wf_intensity,int) is True:
        wf_intensity = float(wf_intensity)
    if isinstance(wf_mz,int) is True:
        wf_mz = float(wf_mz)
    if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
        print('Error: wf_mz and wf_intensity must be integers or floats')
        sys.exit()

    if entropy_dimension <= 0:
        print('\nError: entropy_dimension should be a positive float')
        sys.exit()
    else:
        q = entropy_dimension

    normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings

    if y_axis_transformation not in ['normalized','none','log10','sqrt']:
        print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
        sys.exit()

    if output_path is None:
        print(f'Warning: plots will be saved to the svg ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
        output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'

    # --- build the common integer m/z grid spanning both libraries ---
    min_mz = np.min([df_query['mz_ratio'].min(), df_reference['mz_ratio'].min()])
    max_mz = np.max([df_query['mz_ratio'].max(), df_reference['mz_ratio'].max()])
    mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

    # IDs are compared as strings throughout
    unique_query_ids = df_query['id'].unique().tolist()
    unique_reference_ids = df_reference['id'].unique().tolist()
    unique_query_ids = [str(ID) for ID in unique_query_ids]
    unique_reference_ids = [str(ID) for ID in unique_reference_ids]
    common_IDs = np.intersect1d([str(ID) for ID in unique_query_ids], [str(ID) for ID in unique_reference_ids])
    if len(common_IDs) > 0:
        print(f'Warning: the query and reference library have overlapping IDs: {common_IDs}')

    # --- extract the two spectra; both IDs may live in the same library ---
    if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
        q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID1)[0]
        r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID2)[0]
        q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
    elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
        q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID1)[0]
        r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID2)[0]
        q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
    else:
        # cross-library case: swap so that ID1 always names the query spectrum
        if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
            spec_tmp = spectrum_ID1
            spectrum_ID1 = spectrum_ID2
            spectrum_ID2 = spec_tmp
        q_idxs_tmp = np.where(df_query['id'].astype(str) == spectrum_ID1)[0]
        r_idxs_tmp = np.where(df_reference['id'].astype(str) == spectrum_ID2)[0]
        q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))

    # project both spectra onto the common m/z grid
    q_spec = convert_spec(q_spec,mzs)
    r_spec = convert_spec(r_spec,mzs)

    # raw-scale intensity range (over non-zero peaks) for the annotation box
    int_min_tmp_q = min(q_spec[q_spec[:,1].nonzero(),1][0])
    int_min_tmp_r = min(r_spec[r_spec[:,1].nonzero(),1][0])
    int_max_tmp_q = max(q_spec[q_spec[:,1].nonzero(),1][0])
    int_max_tmp_r = max(r_spec[r_spec[:,1].nonzero(),1][0])
    int_min_tmp = int(min([int_min_tmp_q,int_min_tmp_r]))
    int_max_tmp = int(max([int_max_tmp_q,int_max_tmp_r]))

    fig, axes = plt.subplots(nrows=2, ncols=1)

    # --- top panel: untransformed spectra (query up, reference mirrored down) ---
    plt.subplot(2,1,1)

    if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
        plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
        plt.xticks([])
        plt.yticks([])
    else:
        # plot on copies so the later preprocessing sees the raw intensities
        q_spec_pre_trans = q_spec.copy()
        r_spec_pre_trans = r_spec.copy()
        q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
        r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)

        if y_axis_transformation == 'normalized':
            q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
            r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
            ylab = 'Normalized Intensity'
        elif y_axis_transformation == 'log10':
            q_spec_pre_trans[:,1] = np.log10(q_spec_pre_trans[:,1]+1)
            r_spec_pre_trans[:,1] = np.log10(r_spec_pre_trans[:,1]+1)
            ylab = 'log10(Intensity)'
        elif y_axis_transformation == 'sqrt':
            q_spec_pre_trans[:,1] = np.sqrt(q_spec_pre_trans[:,1])
            r_spec_pre_trans[:,1] = np.sqrt(r_spec_pre_trans[:,1])
            ylab = 'sqrt(Intensity)'
        else:
            ylab = 'Raw Intensity'
        plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*len(q_spec_pre_trans[:,0]), ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID1: {spectrum_ID1}')
        plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*len(r_spec_pre_trans[:,0]), ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID2: {spectrum_ID2}')
        plt.xlabel('m/z',fontsize=7)
        plt.ylabel(ylab, fontsize=7)
        plt.xticks(fontsize=7)
        plt.yticks(fontsize=7)
        plt.title('Untransformed Query and Reference Spectra', fontsize=10)

    # --- apply the preprocessing steps in the user-specified order ---
    for transformation in spectrum_preprocessing_order:
        if transformation == 'W':
            q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
            r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
        if transformation == 'L':
            q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method)
            r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method)
        if transformation == 'N':
            q_spec = remove_noise(q_spec, nr = noise_threshold)
            # high-quality reference libraries are assumed clean; skip denoising them
            if high_quality_reference_library == False or high_quality_reference_library == 'False':
                r_spec = remove_noise(r_spec, nr = noise_threshold)
        if transformation == 'F':
            q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
            if high_quality_reference_library == False or high_quality_reference_library == 'False':
                r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)

    # similarity needs at least two grid points; otherwise report 0
    if q_spec.shape[0] > 1:
        similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
    else:
        similarity_score = 0

    # --- bottom panel: transformed spectra ---
    plt.subplot(2,1,2)

    if q_spec.shape[0] == 0 or r_spec.shape[0] == 0:
        plt.text(0.5, 0.5, 'The query and/or reference spectrum has no ion fragments left after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
        plt.xticks([])
        plt.yticks([])
    elif np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
        plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
        plt.xticks([])
        plt.yticks([])
    else:
        if y_axis_transformation == 'normalized':
            q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
            r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
            ylab='Normalized Intensity'
        elif y_axis_transformation == 'log10':
            q_spec[:,1] = np.log10(q_spec[:,1]+1)
            r_spec[:,1] = np.log10(r_spec[:,1]+1)
            ylab='log10(Intensity)'
        elif y_axis_transformation == 'sqrt':
            q_spec[:,1] = np.sqrt(q_spec[:,1])
            r_spec[:,1] = np.sqrt(r_spec[:,1])
            ylab='sqrt(Intensity)'
        else:
            ylab = 'Raw Intensity'
        plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=q_spec[:,1], linewidth=3, color='blue')
        plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=-r_spec[:,1], linewidth=3, color='red')
        plt.xlabel('m/z', fontsize=7)
        plt.ylabel(ylab, fontsize=7)
        plt.xticks(fontsize=7)
        plt.yticks(fontsize=7)
        plt.title(f'Transformed Query and Reference Spectra', fontsize=10)

    plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
    plt.figlegend(loc='upper center')

    # --- annotation block: run parameters under the plots ---
    fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
    fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
    fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
    fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
    fig.text(0.05, 0.08, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
    if similarity_measure == 'mixture':
        fig.text(0.05, 0.05, f'Weights for mixture similarity: {weights}', fontsize=7)

    fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{min_mz},{max_mz}]', fontsize=7)
    fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
    fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
    fig.text(0.40, 0.11, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)

    # optional clickable PubChem links for either spectrum ID
    if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
        url_tmp1 = get_pubchem_url(query=spectrum_ID1)
        url_tmp2 = get_pubchem_url(query=spectrum_ID2)
        t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
        t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
        t1.set_url(url_tmp1)
        t2.set_url(url_tmp2)

    if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
        url_tmp1 = get_pubchem_url(query=spectrum_ID1)
        t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
        t1.set_url(url_tmp1)

    if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
        url_tmp2 = get_pubchem_url(query=spectrum_ID2)
        t2 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
        t2.set_url(url_tmp2)

    fig.savefig(output_path, format='svg')

    if return_plot == True:
        return fig
|
|
710
|
+
def wf_transform(spec_mzs, spec_ints, wf_mz, wf_int):
    """Reweight intensities as (m/z ** wf_mz) * (intensity ** wf_int)."""
    mz_weight = np.power(spec_mzs, wf_mz)
    int_weight = np.power(spec_ints, wf_int)
    return mz_weight * int_weight
|
|
715
|
+
def LE_transform(intensity, thresh, normalization_method):
    """Low-entropy transform: sharpen low-entropy spectra.

    Normalizes the intensities to a probability distribution, and if the
    Shannon entropy S is strictly between 0 and `thresh`, raises the
    normalized intensities to the power w = (1 + S) / (1 + thresh) < 1.
    Spectra whose normalized intensities sum to 0 are returned as all zeros;
    otherwise, when S is outside (0, thresh), the input is returned unchanged.

    NOTE(review): nesting reconstructed from a whitespace-mangled source —
    the zeroing branch is paired with the empty-spectrum check (not the
    entropy check); confirm against the upstream pycompound source.
    """
    intensity_tmp = normalize(intensity, method=normalization_method)
    if np.sum(intensity_tmp) > 0:
        S = scipy.stats.entropy(intensity_tmp.astype('float'))
        if S > 0 and S < thresh:
            # reweighting exponent shrinks toward 1 as S approaches thresh
            w = (1 + S) / (1 + thresh)
            intensity = np.power(intensity_tmp, w)
    else:
        intensity = np.zeros(len(intensity))
    return intensity
|
|
727
|
+
def normalize(intensities, method='standard'):
    """Scale intensities into a probability distribution.

    'standard' divides by the total (in place); 'softmax' divides by the sum
    of exponentials, falling back to standard normalization when any value is
    too large to exponentiate. All-zero (or non-positive-sum) inputs are
    returned unchanged.
    """
    if np.sum(intensities) > 0:
        if method == 'softmax':
            if np.any(intensities > 700):
                print("Warning: some intensities are too large to exponentiate. Applying standard normalization.")
                intensities /= np.sum(intensities)
            else:
                exp_ints = np.exp(intensities)
                # guard against overflow to inf in the exponentials
                if np.isinf(exp_ints).sum() == 0:
                    intensities = intensities / np.sum(exp_ints)
        elif method == 'standard':
            intensities /= np.sum(intensities)
    return intensities
|
|
742
|
+
def filter_spec_lcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999, is_matched = False):
    """Restrict an LC-MS spectrum to the given m/z and intensity windows.

    Parameters
    ----------
    spec : numpy.ndarray
        Two-column array of (m/z, intensity) rows.
    mz_min, mz_max : numeric
        Rows with m/z outside [mz_min, mz_max] are dropped.
    int_min, int_max : numeric
        Intensity window; see `is_matched` for how violations are handled.
    is_matched : bool
        False: rows outside the intensity window are removed outright.
        True: `spec` is row-aligned with a partner spectrum, so out-of-window
        rows are zeroed instead of dropped to preserve the alignment.

    Returns
    -------
    numpy.ndarray
        The filtered spectrum.
    """
    # m/z filtering drops rows in both modes
    spec = spec[spec[:,0] >= mz_min]
    spec = spec[spec[:,0] <= mz_max]
    if is_matched == False:
        spec = spec[spec[:,1] >= int_min]
        spec = spec[spec[:,1] <= int_max]
    else:
        # BUGFIX: comparisons were inverted (>= int_min / <= int_max), which
        # zeroed exactly the peaks inside the window — with the default
        # int_min=0 every matched peak was wiped out. Zero peaks OUTSIDE the
        # window, mirroring filter_spec_gcms.
        spec[spec[:,1] < int_min] = 0
        spec[spec[:,1] > int_max] = 0
    return(spec)
|
|
756
|
+
def filter_spec_gcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999):
    """Zero out intensities outside the m/z and intensity windows.

    GC-MS spectra live on a fixed m/z grid, so rows are never removed —
    only the intensity column is zeroed, keeping the grid intact.
    Operates in place on `spec` and returns it.
    """
    spec[spec[:, 0] < mz_min, 1] = 0
    spec[spec[:, 0] > mz_max, 1] = 0
    spec[spec[:, 1] < int_min, 1] = 0
    spec[spec[:, 1] > int_max, 1] = 0
    return spec
|
|
764
|
+
def remove_noise(spec, nr):
    """Zero rows whose intensity falls below nr * (max intensity).

    Spectra with at most one row, or a None noise ratio, are returned
    unchanged. Whole rows (m/z and intensity) are zeroed in place.
    """
    if spec.shape[0] > 1 and nr is not None:
        cutoff = np.max(spec[:, 1]) * nr
        spec[spec[:, 1] < cutoff] = 0
    return spec
|
|
772
|
+
def centroid_spectrum(spec, window_size):
    """Merge peaks closer than `window_size` into intensity-weighted centroids.

    Peaks are processed in descending intensity order; each peak absorbs all
    neighbors within `window_size` m/z on either side, producing one centroid
    at the intensity-weighted mean m/z with the summed intensity. Absorbed
    peaks are zeroed so they cannot seed further centroids. If no pair of
    peaks is closer than `window_size`, the (m/z-sorted) input is returned
    unchanged.

    NOTE(review): when centroiding leaves a single peak, [[0, 0]] is
    returned instead of that peak, and spec_new is sorted twice — both look
    unintentional; confirm against the upstream pycompound source.
    """
    # work on the spectrum sorted by ascending m/z
    spec = spec[np.argsort(spec[:,0])]

    mz_array = spec[:, 0]
    need_centroid = 0
    if mz_array.shape[0] > 1:
        # centroiding is needed only if some adjacent pair is within the window
        mz_delta = mz_array[1:] - mz_array[:-1]
        if np.min(mz_delta) <= window_size:
            need_centroid = 1

    if need_centroid:
        # visit peaks from most to least intense
        intensity_order = np.argsort(-spec[:, 1])
        spec_new = []
        for i in intensity_order:
            mz_delta_allowed = window_size

            if spec[i, 1] > 0:  # skip peaks already absorbed into a centroid
                # scan left for the first neighbor outside the window
                i_left = i - 1
                while i_left >= 0:
                    mz_delta_left = spec[i, 0] - spec[i_left, 0]
                    if mz_delta_left <= mz_delta_allowed:
                        i_left -= 1
                    else:
                        break
                i_left += 1

                # scan right likewise; i_right ends one past the window
                i_right = i + 1
                while i_right < spec.shape[0]:
                    mz_delta_right = spec[i_right, 0] - spec[i, 0]
                    if mz_delta_right <= mz_delta_allowed:
                        i_right += 1
                    else:
                        break

                # intensity-weighted centroid over the [i_left, i_right) window
                intensity_sum = np.sum(spec[i_left:i_right, 1])
                intensity_weighted_sum = np.sum(spec[i_left:i_right, 0] * spec[i_left:i_right, 1])

                spec_new.append([intensity_weighted_sum / intensity_sum, intensity_sum])
                # zero absorbed peaks so they are not re-centroided
                spec[i_left:i_right, 1] = 0

        spec_new = np.array(spec_new)
        spec_new = spec_new[np.argsort(spec_new[:, 0])]
        if spec_new.shape[0] > 1:
            # redundant second sort — already sorted above
            spec_new = spec_new[np.argsort(spec_new[:, 0])]
            return spec_new
        else:
            return np.array([[0,0]])
    else:
        return spec
|
|
824
|
+
def match_peaks_in_spectra(spec_a, spec_b, window_size):
    """Align two m/z-sorted spectra into a 3-column merged array.

    Two-pointer merge over spectra A and B (both assumed sorted by m/z).
    Output rows are [m/z, intensity_a, intensity_b]: B peaks within
    `window_size` of an A peak are summed into that A row; unmatched peaks
    from either side appear with 0 in the other column.

    Returns a float64 array; [[0, 0, 0]] when both inputs are empty.
    """
    a = 0
    b = 0

    spec_merged = []
    peak_b_int = 0.  # running sum of B intensities matched to the current A peak
    while a < spec_a.shape[0] and b < spec_b.shape[0]:
        mass_delta = spec_a[a, 0] - spec_b[b, 0]

        if mass_delta < -window_size:
            # A peak is below the window: emit it with whatever B matched so far
            spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
            peak_b_int = 0.
            a += 1
        elif mass_delta > window_size:
            # B peak is below the window: emit it unmatched
            spec_merged.append([spec_b[b, 0], 0., spec_b[b, 1]])
            b += 1
        else:
            # within the window: accumulate B intensity onto the current A peak
            peak_b_int += spec_b[b, 1]
            b += 1

    # flush a pending match if the loop exited with B exhausted mid-match
    # (a is still valid here: a only advances after peak_b_int is reset)
    if peak_b_int > 0.:
        spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
        peak_b_int = 0.
        a += 1

    # append whichever side still has unconsumed peaks
    if b < spec_b.shape[0]:
        spec_merged += [[x[0], 0., x[1]] for x in spec_b[b:]]

    if a < spec_a.shape[0]:
        spec_merged += [[x[0], x[1], 0.] for x in spec_a[a:]]

    if spec_merged:
        spec_merged = np.array(spec_merged, dtype=np.float64)
    else:
        spec_merged = np.array([[0., 0., 0.]], dtype=np.float64)
    return spec_merged
|
|
863
|
+
def convert_spec(spec, mzs):
    """Project a (m/z, intensity) spectrum onto the common m/z grid `mzs`.

    Grid points present in `spec` take the intensity of their FIRST matching
    row (matching the original np.where(...)[0][0] behavior); all other grid
    points get intensity 0.

    Returns a two-column array of (grid m/z, intensity).
    """
    # One pass over spec builds an m/z -> intensity lookup, replacing the
    # original O(len(mzs) * len(spec)) per-grid-point membership scan.
    peak_lookup = {}
    for mz_val, int_val in spec:
        if mz_val not in peak_lookup:  # keep the first occurrence only
            peak_lookup[mz_val] = int_val
    ints_tmp = [peak_lookup.get(mz, 0) for mz in mzs]
    out = np.transpose(np.array([mzs, ints_tmp]))
    return out
|
|
875
|
+
def get_reference_df(reference_data, likely_reference_IDs=None):
    """Load a reference library into a DataFrame.

    Raw MS formats (mgf/mzML/cdf/msp/json, any case) are first converted to a
    tab-separated TXT next to the input; TXT files are read directly. When
    `likely_reference_IDs` is a path to a header-less CSV of IDs, the library
    is restricted to rows whose first column matches one of those IDs.
    """
    extension = reference_data.rsplit('.', 1)[-1]
    if extension in ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF', 'msp', 'MSP', 'json', 'JSON'):
        converted_path = reference_data[:-3] + 'txt'
        build_library_from_raw_data(input_path=reference_data, output_path=converted_path, is_reference=True)
        df_reference = pd.read_csv(converted_path, sep='\t')
    if extension in ('txt', 'TXT'):
        df_reference = pd.read_csv(reference_data, sep='\t')
    if likely_reference_IDs is not None:
        keep_ids = pd.read_csv(likely_reference_IDs, header=None)
        df_reference = df_reference.loc[df_reference.iloc[:, 0].isin(keep_ids.iloc[:, 0].tolist())]
    return df_reference
|
|
891
|
+
def S_cos(ints_a, ints_b):
    """Cosine similarity of two intensity vectors; 0 when either sums to 0."""
    if np.sum(ints_a) == 0 or np.sum(ints_b) == 0:
        return 0
    norm_a = np.sqrt(np.sum(np.power(ints_a, 2)))
    norm_b = np.sqrt(np.sum(np.power(ints_b, 2)))
    return np.dot(ints_a, ints_b) / (norm_a * norm_b)
|
|
898
|
+
def ent_renyi(ints, q):
    """Renyi entropy of order q (q != 1) of an intensity distribution."""
    power_sum = np.sum(np.power(ints, q))
    return np.log(power_sum) / (1 - q)
|
|
902
|
+
def ent_tsallis(ints, q):
    """Tsallis entropy of order q (q != 1) of an intensity distribution."""
    power_sum = np.sum(np.power(ints, q))
    return (power_sum - 1) / (1 - q)
|
|
906
|
+
def S_shannon(ints_a, ints_b):
    """Shannon-entropy spectral similarity in [0, 1].

    1 when the two (normalized) distributions coincide, 0 when disjoint;
    scipy.stats.entropy normalizes its input internally.
    """
    ent_a = scipy.stats.entropy(ints_a)
    ent_b = scipy.stats.entropy(ints_b)
    ent_combined = scipy.stats.entropy(ints_a + ints_b)
    return 1 - (2 * ent_combined - ent_a - ent_b) / np.log(4)
|
|
913
|
+
def S_renyi(ints_a, ints_b, q):
    """Renyi-entropy spectral similarity of order q.

    Falls back to the Shannon measure (with a warning) at q == 1, where the
    Renyi entropy degenerates to Shannon entropy.
    """
    if q == 1:
        print('Warning: the Renyi Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
        return S_shannon(ints_a, ints_b)
    ent_a = ent_renyi(ints_a, q)
    ent_b = ent_renyi(ints_b, q)
    ent_merged = ent_renyi(ints_a / 2 + ints_b / 2, q)
    # normalizing constant so the score lands in [0, 1]
    half_sum_a = np.sum(np.power(ints_a / 2, q))
    half_sum_b = np.sum(np.power(ints_b / 2, q))
    N = (1 / (1 - q)) * (2 * np.log(half_sum_a + half_sum_b) - np.log(np.sum(np.power(ints_a, q))) - np.log(np.sum(np.power(ints_b, q))))
    return 1 - (2 * ent_merged - ent_a - ent_b) / N
|
|
925
|
+
def S_tsallis(ints_a, ints_b, q):
    """Tsallis-entropy spectral similarity of order q.

    Falls back to the Shannon measure (with a warning) at q == 1, where the
    Tsallis entropy degenerates to Shannon entropy.
    """
    if q == 1:
        print('Warning: the Tsallis Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
        return S_shannon(ints_a, ints_b)
    ent_a = ent_tsallis(ints_a, q)
    ent_b = ent_tsallis(ints_b, q)
    ent_merged = ent_tsallis(ints_a / 2 + ints_b / 2, q)
    # normalizing constant so the score lands in [0, 1]
    N = np.sum(2 * np.power(ints_a / 2, q) + 2 * np.power(ints_b / 2, q) - np.power(ints_a, q) - np.power(ints_b, q)) / (1 - q)
    return 1 - (2 * ent_merged - ent_a - ent_b) / N
|
|
936
|
+
def S_mixture(ints_a, ints_b, weights={'Cosine':0.25, 'Shannon':0.25, 'Renyi':0.25, 'Tsallis':0.25}, q=1.1):
    """Weighted blend of the cosine, Shannon, Renyi, and Tsallis similarities.

    Only the measures named in `weights` are computed; exits with an error
    message if `weights` contains any other key.
    """
    if not set(weights.keys()).issubset({'Cosine', 'Shannon', 'Renyi', 'Tsallis'}):
        print('Error: the keys to the weight parameter dict of the function S_mixture must be one of the four: Cosine, Shannon, Renyi, Tsallis')
        sys.exit()

    # lazy dispatch: each component is evaluated only when its key is present
    component = {
        'Cosine': lambda: S_cos(ints_a, ints_b),
        'Shannon': lambda: S_shannon(ints_a, ints_b),
        'Renyi': lambda: S_renyi(ints_a, ints_b, q),
        'Tsallis': lambda: S_tsallis(ints_a, ints_b, q),
    }
    similarity = 0
    for name, weight in weights.items():
        similarity += weight * component[name]()
    return similarity
|
954
|
+
def get_contingency_entries(ints_a, ints_b):
    """Count peak presence/absence agreement between two aligned spectra.

    Returns [a, b, c] where a = positions with a peak only in ints_a,
    b = positions with a peak only in ints_b, c = positions with a peak in
    both ("peak" meaning a non-zero intensity).
    """
    only_a = 0
    only_b = 0
    shared = 0

    for x, y in zip(ints_a, ints_b):
        if x != 0 and y != 0:
            shared += 1
        elif x != 0:
            only_a += 1
        elif y != 0:
            only_b += 1
    return [only_a, only_b, shared]
|
969
|
+
def S_jaccard(ints_a, ints_b):
    """Jaccard similarity over shared/unique peak counts; 0 when no peaks."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = only_a + only_b + shared
    return 0 if denom == 0 else shared / denom
|
|
982
|
+
def S_dice(ints_a, ints_b):
    """Dice similarity over shared/unique peak counts; 0 when no peaks."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = only_a + only_b + 2 * shared
    return 0 if denom == 0 else 2 * shared / denom
|
|
995
|
+
def S_3w_jaccard(ints_a, ints_b):
    """3W-Jaccard similarity (shared peaks triple-weighted); 0 when no peaks."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = only_a + only_b + 3 * shared
    return 0 if denom == 0 else 3 * shared / denom
|
|
1008
|
+
def S_sokal_sneath(ints_a, ints_b):
    """Sokal-Sneath similarity (mismatches double-weighted); 0 when no peaks."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = 2 * only_a + 2 * only_b + shared
    return 0 if denom == 0 else shared / denom
|
|
1021
|
+
def S_binary_cosine(ints_a, ints_b):
    """Binary (Otsuka-Ochiai) cosine similarity on peak presence; 0 when undefined."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = np.sqrt((only_a + shared) * (only_b + shared))
    return 0 if denom == 0 else shared / denom
|
|
1034
|
+
def S_mountford(ints_a, ints_b):
    """Mountford similarity on peak presence; defined as 1 when the denominator is 0."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = shared * (only_a + only_b) + 2 * only_a * only_b
    return 1 if denom == 0 else 2 * shared / denom
|
|
1047
|
+
def S_mcconnaughey(ints_a, ints_b):
    """McConnaughey similarity on peak presence; 0 when the denominator is 0."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = (only_a + shared) * (only_b + shared)
    return 0 if denom == 0 else (shared**2 - only_a * only_b) / denom
|
|
1060
|
+
def S_driver_kroeber(ints_a, ints_b):
    """Driver-Kroeber similarity on peak presence; 0 when the denominator is 0."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = 2 * (only_a + shared) * (only_b + shared)
    return 0 if denom == 0 else shared * (only_a + only_b + 2 * shared) / denom
|
|
1073
|
+
def S_simpson(ints_a, ints_b):
    """Simpson (overlap) similarity on peak presence; 0 when either side is empty."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = min(only_a + shared, only_b + shared)
    return 0 if denom == 0 else shared / denom
|
|
1086
|
+
def S_braun_banquet(ints_a, ints_b):
    """Braun-Blanquet similarity on peak presence; 0 when both sides are empty."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = max(only_a + shared, only_b + shared)
    return 0 if denom == 0 else shared / denom
|
|
1099
|
+
def S_fager_mcgowan(ints_a, ints_b):
    """Fager-McGowan similarity computed from the contingency entries of two intensity vectors."""
    a, b, c = get_contingency_entries(ints_a, ints_b)[:3]
    geometric_mean_term = np.sqrt((a + c) * (b + c))
    correction_term = 2 * np.sqrt(max(a + c, b + c))
    if geometric_mean_term == 0 or correction_term == 0:
        # Degenerate table: no basis for comparison, report zero similarity.
        return 0
    return c / geometric_mean_term - 1 / correction_term
|
|
1111
|
+
|
|
1112
|
+
|
|
1113
|
+
def S_kulczynski(ints_a, ints_b):
    """Kulczynski similarity computed from the contingency entries of two intensity vectors."""
    a, b, c = get_contingency_entries(ints_a, ints_b)[:3]
    denom = a + b
    # No mismatched entries at all is treated as perfect agreement.
    return 1 if denom == 0 else c / denom
|
|
1124
|
+
|
|
1125
|
+
|
|
1126
|
+
def S_intersection(ints_a, ints_b):
    """Intersection similarity: the shared-presence count (entry c) of the contingency table."""
    return get_contingency_entries(ints_a, ints_b)[2]
|
|
1130
|
+
|
|
1131
|
+
|
|
1132
|
+
def S_hamming(ints_a, ints_b):
    """Hamming-based similarity: reciprocal of the mismatch count (a + b) of the contingency table."""
    a, b = get_contingency_entries(ints_a, ints_b)[:2]
    mismatches = a + b
    # No mismatched entries at all is treated as perfect agreement.
    return 1 if mismatches == 0 else 1 / mismatches
|
|
1142
|
+
|
|
1143
|
+
|
|
1144
|
+
def S_hellinger(ints_a, ints_b):
    """Hellinger-type similarity computed from the contingency entries of two intensity vectors.

    Fix: the denominator sqrt((a + c) * (b + c)) was previously unguarded, so a
    degenerate contingency table raised ZeroDivisionError / produced NaN. It now
    returns 0 in that case, consistent with the other contingency-based measures
    in this module (e.g. S_simpson, S_braun_banquet).
    """
    tmp = get_contingency_entries(ints_a, ints_b)
    a = tmp[0]
    b = tmp[1]
    c = tmp[2]
    denom = np.sqrt((a + c) * (b + c))
    if denom == 0:
        # Degenerate table: no basis for comparison, report zero similarity.
        similarity = 0
    else:
        # c <= min(a + c, b + c) <= denom, so the inner sqrt argument is non-negative.
        similarity = 1 - np.sqrt(1 - c / denom)
    return similarity
|
|
1151
|
+
|
|
1152
|
+
|
|
1153
|
+
def get_similarity(similarity_measure, q_ints, r_ints, weights, q):
    """Dispatch to the requested spectral similarity measure.

    Parameters:
        similarity_measure: name of the measure (e.g. 'cosine', 'shannon',
            'mixture', 'jaccard', ...).
        q_ints, r_ints: query and reference intensity vectors.
        weights: weight dict, used only by the 'mixture' measure.
        q: entropy order, used by 'renyi', 'tsallis', and 'mixture'.

    Returns:
        The similarity score produced by the selected measure.

    Raises:
        ValueError: if similarity_measure is not recognized. (Fix: an unknown
        measure previously fell through every branch and raised an opaque
        UnboundLocalError at the return statement.)
    """
    if similarity_measure == 'cosine':
        return S_cos(q_ints, r_ints)

    if similarity_measure in ['shannon', 'renyi', 'tsallis']:
        # Entropy-based measures operate on probability-like (standard-normalized) vectors.
        q_ints = normalize(q_ints, method = 'standard')
        r_ints = normalize(r_ints, method = 'standard')
        if similarity_measure == 'shannon':
            return S_shannon(q_ints, r_ints)
        if similarity_measure == 'renyi':
            return S_renyi(q_ints, r_ints, q)
        return S_tsallis(q_ints, r_ints, q)

    if similarity_measure == 'mixture':
        return S_mixture(q_ints, r_ints, weights, q)

    # All remaining measures are contingency-table based and take only the two
    # intensity vectors; dispatch through a table instead of a long elif chain.
    contingency_measures = {
        'jaccard': S_jaccard,
        'dice': S_dice,
        '3w_jaccard': S_3w_jaccard,
        'sokal_sneath': S_sokal_sneath,
        'binary_cosine': S_binary_cosine,
        'mountford': S_mountford,
        'mcconnaughey': S_mcconnaughey,
        'driver_kroeber': S_driver_kroeber,
        'simpson': S_simpson,
        'braun_banquet': S_braun_banquet,
        'fager_mcgowan': S_fager_mcgowan,
        'kulczynski': S_kulczynski,
        'intersection': S_intersection,
        'hamming': S_hamming,
        'hellinger': S_hellinger,
    }
    try:
        measure_fn = contingency_measures[similarity_measure]
    except KeyError:
        raise ValueError(f'Unrecognized similarity_measure: {similarity_measure!r}') from None
    return measure_fn(q_ints, r_ints)
|
|
1217
|
+
|
|
1218
|
+
|
|
1219
|
+
def _vector_to_full_params(X, default_params, optimize_params):
|
|
1220
|
+
params = default_params.copy()
|
|
1221
|
+
for name, val in zip(optimize_params, X):
|
|
1222
|
+
params[name] = float(val)
|
|
1223
|
+
return params
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
def objective_function_HRMS(X, ctx):
    # Objective for scipy.optimize.differential_evolution on HRMS data.
    # X: optimizer vector, mapped (in order of ctx["optimize_params"]) onto the
    #    tunable preprocessing parameters on top of ctx["default_params"].
    # ctx: dict of fixed data/settings built by tune_params_DE.
    # Returns 1 - accuracy, since differential_evolution minimizes.
    p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
    # NOTE(review): these arguments are positional and must match the
    # get_acc_HRMS signature order exactly — keep in sync if that changes.
    acc = get_acc_HRMS(
        ctx["df_query"], ctx["df_reference"],
        ctx["precursor_ion_mz_tolerance"], ctx["ionization_mode"], ctx["adduct"],
        ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
        ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
        p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
        p["wf_mz"], p["wf_int"], p["LET_threshold"],
        p["entropy_dimension"],
        ctx["high_quality_reference_library"],
        verbose=False
    )
    # Progress report for each candidate evaluated by the optimizer.
    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
    return 1.0 - acc
|
|
1241
|
+
|
|
1242
|
+
def objective_function_NRMS(X, ctx):
    # Objective for scipy.optimize.differential_evolution on NRMS data.
    # X: optimizer vector, mapped (in order of ctx["optimize_params"]) onto the
    #    tunable preprocessing parameters on top of ctx["default_params"].
    # ctx: dict of fixed data/settings built by tune_params_DE.
    # Returns 1 - accuracy, since differential_evolution minimizes.
    p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
    # NOTE(review): positional arguments must match the get_acc_NRMS signature
    # order exactly — keep in sync if that changes.
    acc = get_acc_NRMS(
        ctx["df_query"], ctx["df_reference"],
        ctx["unique_query_ids"], ctx["unique_reference_ids"],
        ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
        ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
        p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
        ctx["high_quality_reference_library"],
        verbose=False
    )
    # Progress report for each candidate evaluated by the optimizer.
    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
    return 1.0 - acc
|
|
1255
|
+
|
|
1256
|
+
|
|
1257
|
+
|
|
1258
|
+
def tune_params_DE(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, chromatography_platform='HRMS', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=None, param_bounds=None, default_params=None, maxiters=3, de_workers=1):
    """Tune spectrum-preprocessing parameters via SciPy differential evolution.

    Loads the query and reference libraries, builds the fixed evaluation
    context, and minimizes 1 - accuracy over the parameters named in
    optimize_params within param_bounds; all other parameters stay at
    default_params. Results are printed to stdout.

    Parameters:
        query_data: path to the query data file (TXT, or a raw format that is
            first converted via build_library_from_raw_data).
        reference_data: path (or list of paths) to the reference library file(s).
        precursor_ion_mz_tolerance, ionization_mode, adduct: HRMS-only filters.
        chromatography_platform: 'HRMS' or 'NRMS' — selects the objective.
        optimize_params / param_bounds / default_params: which parameters to
            optimize, their bounds, and the fixed values for the rest. Passing
            None selects the documented defaults.
        maxiters, de_workers: forwarded to scipy differential_evolution.

    Fixes:
        - The list/dict defaults for optimize_params, param_bounds, and
          default_params were mutable literals in the signature (shared across
          calls); they are now built per call from None sentinels. Callers that
          passed values explicitly are unaffected.
        - Raw-format conversion used query_data[:-3] + 'txt', which mangled
          4-character extensions ('x.mzML' -> 'x.mtxt'); the extension is now
          stripped with rsplit.
    """
    if optimize_params is None:
        optimize_params = ["window_size_centroiding", "window_size_matching", "noise_threshold", "wf_mz", "wf_int", "LET_threshold", "entropy_dimension"]
    if param_bounds is None:
        param_bounds = {"window_size_centroiding": (0.0, 0.5), "window_size_matching": (0.0, 0.5), "noise_threshold": (0.0, 0.25), "wf_mz": (0.0, 5.0), "wf_int": (0.0, 5.0), "LET_threshold": (0.0, 5.0), "entropy_dimension": (1.0, 3.0)}
    if default_params is None:
        default_params = {"window_size_centroiding": 0.5, "window_size_matching": 0.5, "noise_threshold": 0.10, "wf_mz": 0.0, "wf_int": 1.0, "LET_threshold": 0.0, "entropy_dimension": 1.1}

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
            # Convert raw formats to the tab-separated library format first.
            output_path_tmp = query_data.rsplit('.', 1)[0] + '.txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_query = pd.read_csv(query_data, sep='\t')
        unique_query_ids = df_query.iloc[:,0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data,str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:,0].unique()
        else:
            # A list of paths: load each library and concatenate.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:,0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    # Optionally restrict the reference library by ionization mode / adduct.
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct']==adduct]

    # Recompute the ID lists after filtering (overrides the earlier positional-column reads).
    unique_query_ids = df_query['id'].unique().tolist()
    unique_reference_ids = df_reference['id'].unique().tolist()

    # Fixed context shared by every objective evaluation.
    ctx = dict(
        df_query=df_query,
        df_reference=df_reference,
        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
        ionization_mode=ionization_mode,
        adduct=adduct,
        similarity_measure=similarity_measure,
        weights=weights,
        spectrum_preprocessing_order=spectrum_preprocessing_order,
        mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max,
        high_quality_reference_library=high_quality_reference_library,
        default_params=default_params,
        optimize_params=optimize_params,
        unique_query_ids=unique_query_ids,
        unique_reference_ids=unique_reference_ids,
    )

    bounds = [param_bounds[p] for p in optimize_params]

    if chromatography_platform == 'HRMS':
        result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
    else:
        result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)

    best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
    best_acc = 100.0 - (result.fun * 100.0)

    print("\n=== Differential Evolution Result ===")
    print(f"Optimized over: {optimize_params}")
    print("Best values (selected params):")
    for name in optimize_params:
        print(f"  {name}: {best_full_params[name]}")
    print("\nFull parameter set used in final evaluation:")
    for k, v in best_full_params.items():
        print(f"  {k}: {v}")
    print(f"\nBest accuracy: {best_acc:.3f}%")
    _log(f"best = {result.x}, acc={100*(1-result.fun):.3f}%")
|
|
1333
|
+
|
|
1334
|
+
|
|
1335
|
+
# Default grid-search spaces (one candidate per parameter) for the grid-tuning
# entry points below; user-supplied grids are merged on top of these. The HRMS
# grid additionally carries the centroiding/matching window sizes that do not
# apply to NRMS data.
default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
|
|
1337
|
+
|
|
1338
|
+
|
|
1339
|
+
def _eval_one_HRMS(df_query, df_reference,
                   precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
                   similarity_measure_tmp, weight,
                   spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                   int_min_tmp, int_max_tmp, noise_threshold_tmp,
                   window_size_centroiding_tmp, window_size_matching_tmp,
                   wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
                   entropy_dimension_tmp, high_quality_reference_library_tmp):
    # Evaluate one HRMS grid-search combination and return a flat result row.
    # The *_tmp parameter order must match the itertools.product order built in
    # the grid-tuning callers (noise_threshold comes before the window sizes).
    # The weight dict is JSON-serialized so the row stays picklable/writable.
    acc = get_acc_HRMS(
        df_query=df_query, df_reference=df_reference,
        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
        ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
        similarity_measure=similarity_measure_tmp, weights=weight,
        spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
        mz_min=mz_min_tmp, mz_max=mz_max_tmp,
        int_min=int_min_tmp, int_max=int_max_tmp,
        window_size_centroiding=window_size_centroiding_tmp,
        window_size_matching=window_size_matching_tmp,
        noise_threshold=noise_threshold_tmp,
        wf_mz=wf_mz_tmp, wf_int=wf_int_tmp,
        LET_threshold=LET_threshold_tmp,
        entropy_dimension=entropy_dimension_tmp,
        high_quality_reference_library=high_quality_reference_library_tmp,
        verbose=False
    )

    # Tuple order must match the output DataFrame column order in the callers.
    return (
        acc, similarity_measure_tmp, json.dumps(weight), spectrum_preprocessing_order_tmp,
        mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp,
        noise_threshold_tmp, window_size_centroiding_tmp, window_size_matching_tmp,
        wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp,
        high_quality_reference_library_tmp
    )
|
|
1373
|
+
|
|
1374
|
+
|
|
1375
|
+
def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
                   similarity_measure_tmp, weight,
                   spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                   int_min_tmp, int_max_tmp, noise_threshold_tmp,
                   wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
                   entropy_dimension_tmp, high_quality_reference_library_tmp):
    """Evaluate one NRMS grid-search combination and return a flat result row.

    The *_tmp parameter order must match the itertools.product order built in
    the grid-tuning callers. The weight dict is JSON-serialized so the row
    stays picklable/writable.

    Fix: get_acc_NRMS is now called with verbose=False, matching
    _eval_one_HRMS; previously each grid point emitted per-spectrum progress
    output, flooding the grid-search log.
    """
    acc = get_acc_NRMS(
        df_query=df_query, df_reference=df_reference,
        unique_query_ids=unique_query_ids, unique_reference_ids=unique_reference_ids,
        similarity_measure=similarity_measure_tmp, weights=weight,
        spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
        mz_min=mz_min_tmp, mz_max=mz_max_tmp,
        int_min=int_min_tmp, int_max=int_max_tmp,
        noise_threshold=noise_threshold_tmp,
        wf_mz=wf_mz_tmp, wf_int=wf_int_tmp,
        LET_threshold=LET_threshold_tmp,
        entropy_dimension=entropy_dimension_tmp,
        high_quality_reference_library=high_quality_reference_library_tmp,
        verbose=False
    )

    # Tuple order must match the output DataFrame column order in the callers.
    return (
        acc, similarity_measure_tmp, json.dumps(weight), spectrum_preprocessing_order_tmp,
        mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp, noise_threshold_tmp,
        wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp, high_quality_reference_library_tmp
    )
|
|
1401
|
+
|
|
1402
|
+
|
|
1403
|
+
|
|
1404
|
+
|
|
1405
|
+
def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
    """Grid-search HRMS tuning entry point for the Shiny app (sequential, with progress prints).

    Merges the user grid over default_HRMS_grid, evaluates every combination
    with _eval_one_HRMS, and either returns the result DataFrame
    (return_output=True) or writes it as TSV to output_path.

    Fixes:
        - Grid values were previously injected into module globals via
          globals()[key] = value, mutating shared module state on every call;
          they are now bound to plain locals.
        - Raw-format conversion used query_data[:-3] + 'txt', which mangled
          4-character extensions ('x.mzML' -> 'x.mtxt'); the extension is now
          stripped with rsplit.
    """
    local_grid = {**default_HRMS_grid, **(grid or {})}
    similarity_measure = local_grid['similarity_measure']
    weight = local_grid['weight']
    spectrum_preprocessing_order = local_grid['spectrum_preprocessing_order']
    mz_min = local_grid['mz_min']
    mz_max = local_grid['mz_max']
    int_min = local_grid['int_min']
    int_max = local_grid['int_max']
    noise_threshold = local_grid['noise_threshold']
    window_size_centroiding = local_grid['window_size_centroiding']
    window_size_matching = local_grid['window_size_matching']
    wf_mz = local_grid['wf_mz']
    wf_int = local_grid['wf_int']
    LET_threshold = local_grid['LET_threshold']
    entropy_dimension = local_grid['entropy_dimension']
    high_quality_reference_library = local_grid['high_quality_reference_library']

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
            # Convert raw formats to the tab-separated library format first.
            output_path_tmp = query_data.rsplit('.', 1)[0] + '.txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
        unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # A list of paths: load each library and concatenate.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    # Optionally restrict the reference library by ionization mode / adduct.
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct']==adduct]

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # Cartesian product over every grid dimension; the order must match the
    # *_tmp parameter order of _eval_one_HRMS.
    param_grid = product(
        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
        noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
        entropy_dimension, high_quality_reference_library
    )

    results = []
    total = (
        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
        len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
        len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
        len(entropy_dimension) * len(high_quality_reference_library)
    )
    done = 0
    for params in param_grid:
        res = _eval_one_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params)
        results.append(res)
        done += 1
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Flatten the JSON-encoded weight dicts into a compact comma-separated string.
    if 'WEIGHT' in df_out.columns:
        df_out['WEIGHT'] = (
            df_out['WEIGHT'].astype(str)
            .str.replace("\"","",regex=False)
            .str.replace("{","",regex=False)
            .str.replace("}","",regex=False)
            .str.replace(":","",regex=False)
            .str.replace("Cosine","",regex=False)
            .str.replace("Shannon","",regex=False)
            .str.replace("Renyi","",regex=False)
            .str.replace("Tsallis","",regex=False)
            .str.replace(" ","",regex=False)
        )

    if return_output:
        return df_out
    else:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
        print(f'Wrote results to {output_path}')
|
|
1500
|
+
|
|
1501
|
+
|
|
1502
|
+
|
|
1503
|
+
def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """Grid-search NRMS tuning entry point (parallel via joblib).

    Merges the user grid over default_NRMS_grid, evaluates every combination
    with _eval_one_NRMS in parallel, and either returns the result DataFrame
    (return_output=True) or writes it as TSV to output_path.

    Fixes:
        - Grid values were previously injected into module globals via
          globals()[key] = value, mutating shared module state on every call;
          they are now bound to plain locals.
        - Raw-format conversion used query_data[:-3] + 'txt', which mangled
          4-character extensions ('x.mzML' -> 'x.mtxt'); the extension is now
          stripped with rsplit.
    """
    grid = {**default_NRMS_grid, **(grid or {})}
    similarity_measure = grid['similarity_measure']
    weight = grid['weight']
    spectrum_preprocessing_order = grid['spectrum_preprocessing_order']
    mz_min = grid['mz_min']
    mz_max = grid['mz_max']
    int_min = grid['int_min']
    int_max = grid['int_max']
    noise_threshold = grid['noise_threshold']
    wf_mz = grid['wf_mz']
    wf_int = grid['wf_int']
    LET_threshold = grid['LET_threshold']
    entropy_dimension = grid['entropy_dimension']
    high_quality_reference_library = grid['high_quality_reference_library']

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
            # Convert raw formats to the tab-separated library format first.
            output_path_tmp = query_data.rsplit('.', 1)[0] + '.txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_query = pd.read_csv(query_data, sep='\t')
        unique_query_ids = df_query.iloc[:,0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data,str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:,0].unique()
        else:
            # A list of paths: load each library and concatenate.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:,0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # Cartesian product over every grid dimension; the order must match the
    # *_tmp parameter order of _eval_one_NRMS.
    param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
                         noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
        'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
    ])
    # Flatten the JSON-encoded weight dicts into a compact comma-separated string.
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(":","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Cosine","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Shannon","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
    if return_output is False:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
    else:
        return df_out
|
|
1565
|
+
|
|
1566
|
+
|
|
1567
|
+
|
|
1568
|
+
def tune_params_on_NRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """Grid-search NRMS tuning entry point for the Shiny app (sequential, with progress prints).

    Merges the user grid over default_NRMS_grid, evaluates every combination
    with _eval_one_NRMS, and either returns the result DataFrame
    (return_output=True) or writes it as TSV to output_path.

    Fixes:
        - Grid values were previously injected into module globals via
          globals()[key] = value, mutating shared module state on every call;
          they are now bound to plain locals.
        - Raw-format conversion used query_data[:-3] + 'txt', which mangled
          4-character extensions ('x.mzML' -> 'x.mtxt'); the extension is now
          stripped with rsplit.
    """
    local_grid = {**default_NRMS_grid, **(grid or {})}
    similarity_measure = local_grid['similarity_measure']
    weight = local_grid['weight']
    spectrum_preprocessing_order = local_grid['spectrum_preprocessing_order']
    mz_min = local_grid['mz_min']
    mz_max = local_grid['mz_max']
    int_min = local_grid['int_min']
    int_max = local_grid['int_max']
    noise_threshold = local_grid['noise_threshold']
    wf_mz = local_grid['wf_mz']
    wf_int = local_grid['wf_int']
    LET_threshold = local_grid['LET_threshold']
    entropy_dimension = local_grid['entropy_dimension']
    high_quality_reference_library = local_grid['high_quality_reference_library']

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
            # Convert raw formats to the tab-separated library format first.
            output_path_tmp = query_data.rsplit('.', 1)[0] + '.txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
        unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # A list of paths: load each library and concatenate.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # Cartesian product over every grid dimension; the order must match the
    # *_tmp parameter order of _eval_one_NRMS.
    param_grid = product(
        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
        noise_threshold, wf_mz, wf_int, LET_threshold,
        entropy_dimension, high_quality_reference_library
    )

    results = []
    total = (
        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
        len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
    )
    done = 0
    for params in param_grid:
        res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
        results.append(res)
        done += 1
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Flatten the JSON-encoded weight dicts into a compact comma-separated string.
    if 'WEIGHT' in df_out.columns:
        df_out['WEIGHT'] = (
            df_out['WEIGHT'].astype(str)
            .str.replace("\"","",regex=False)
            .str.replace("{","",regex=False)
            .str.replace("}","",regex=False)
            .str.replace(":","",regex=False)
            .str.replace("Cosine","",regex=False)
            .str.replace("Shannon","",regex=False)
            .str.replace("Renyi","",regex=False)
            .str.replace("Tsallis","",regex=False)
            .str.replace(" ","",regex=False)
        )

    if return_output:
        return df_out
    else:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
        print(f'Wrote results to {output_path}')
|
|
1655
|
+
|
|
1656
|
+
|
|
1657
|
+
|
|
1658
|
+
|
|
1659
|
+
def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
    """Compute top-1 identification accuracy on HRMS (LC-MS) data.

    For every unique query spectrum, the similarity against every unique
    reference spectrum is computed (optionally restricted to references whose
    precursor ion m/z lies within ``precursor_ion_mz_tolerance`` of the
    query's), the best-scoring reference is taken as the prediction, and the
    fraction of queries whose predicted reference ID equals their own ID is
    returned.

    Parameters mirror the grid-search tuning interface: preprocessing is
    applied in the order given by ``spectrum_preprocessing_order`` using the
    transformation codes 'C' (centroiding), 'M' (peak matching),
    'W' (weight-factor), 'L' (low-entropy), 'N' (noise removal) and
    'F' (filtering).

    Returns
    -------
    float
        Proportion of query spectra whose top match has the same ID.

    Notes
    -----
    ``ionization_mode`` and ``adduct`` are accepted for signature
    compatibility with the caller but are not used here — any filtering on
    them is presumed to happen upstream (TODO confirm against callers).
    """
    unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
    unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
    all_similarity_rows = []

    for query_idx, qid in enumerate(unique_query_ids):
        if verbose:
            print(f'query spectrum #{query_idx} is being identified')

        q_mask = (df_query['id'] == qid)
        q_idxs = np.where(q_mask)[0]
        if q_idxs.size == 0:
            # No peaks for this query ID: score 0 against every reference.
            all_similarity_rows.append([0.0]*len(unique_reference_ids))
            continue

        # Two-column (m/z, intensity) array for this query spectrum.
        q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))

        # Optionally pre-filter the reference library by precursor ion m/z window.
        if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
            precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
        else:
            df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()

        if df_reference_tmp.empty:
            all_similarity_rows.append([0.0]*len(unique_reference_ids))
            continue

        # Group reference peaks by spectrum ID once, outside the inner loop.
        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))

        similarity_by_ref = {}

        for ref_id, r_df in ref_groups.items():
            # Copy so in-place preprocessing never mutates the shared base array.
            q_spec = q_spec_base.copy()
            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))

            is_matched = False
            for transformation in spectrum_preprocessing_order:
                # Guard against infinities produced by earlier transforms.
                if np.isinf(q_spec[:, 1]).any():
                    q_spec[:, 1] = 0.0
                if np.isinf(r_spec[:, 1]).any():
                    r_spec[:, 1] = 0.0

                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)

                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    m_spec = match_peaks_in_spectra(
                        spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
                    )
                    if m_spec.size == 0:
                        # No shared peaks: both spectra collapse to empty.
                        q_spec = np.empty((0,2))
                        r_spec = np.empty((0,2))
                    else:
                        q_spec = m_spec[:, 0:2]
                        r_spec = m_spec[:, [0, 2]]
                    is_matched = True

                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)

                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')

                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = remove_noise(q_spec, nr=noise_threshold)
                    # A high-quality library is assumed noise-free already.
                    if not high_quality_reference_library:
                        r_spec = remove_noise(r_spec, nr=noise_threshold)

                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = filter_spec_lcms(
                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                    )
                    if not high_quality_reference_library:
                        r_spec = filter_spec_lcms(
                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                        )

            # Similarity is only meaningful with >1 surviving peak and
            # non-zero total intensity on both sides.
            if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                q_ints = q_spec[:, 1]
                r_ints = r_spec[:, 1]
                if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                    sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
                else:
                    sim = 0.0
            else:
                sim = 0.0

            similarity_by_ref[str(ref_id)] = float(sim)

        # References excluded by the precursor window default to 0.0.
        row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
        all_similarity_rows.append(row)

    df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
    df_scores.index.name = 'QUERY.SPECTRUM.ID'

    # Top-1 prediction per query row.
    top_idx = df_scores.values.argmax(axis=1)
    top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
    top_ids = [df_scores.columns[i] for i in top_idx]

    df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
    if verbose:
        print(df_tmp)

    # Accuracy: query IDs double as ground-truth compound IDs.
    acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
    return acc
|
|
1768
|
+
|
|
1769
|
+
|
|
1770
|
+
|
|
1771
|
+
def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
    """Compute top-1 identification accuracy on NRMS (GC-MS) data.

    Every query spectrum is binned onto a shared integer m/z grid, scored
    against every reference spectrum with the requested similarity measure,
    and the best-scoring reference taken as the prediction. Preprocessing is
    applied in the order given by ``spectrum_preprocessing_order`` using the
    codes 'W' (weight-factor), 'L' (low-entropy), 'N' (noise removal) and
    'F' (filtering).

    Returns
    -------
    float
        Proportion of query spectra whose top match has the same ID.
    """
    n_top_matches_to_save = 1

    # Common integer m/z grid spanning both datasets (column 1 holds m/z).
    min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
    max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
    mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

    all_similarity_scores = []
    for query_idx in range(0,len(unique_query_ids)):
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        q_spec_tmp = convert_spec(q_spec_tmp,mzs)

        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            # BUG FIX: copy rather than alias. The 'W' and 'L' transforms
            # assign into q_spec[:,1] in place, so without a copy the first
            # reference iteration permanently mutated q_spec_tmp and every
            # later reference was scored against an already-transformed
            # query spectrum (the HRMS counterpart already copies).
            q_spec = q_spec_tmp.copy()
            if verbose is True and ref_idx % 1000 == 0:
                print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)

            for transformation in spectrum_preprocessing_order:
                # Reset any infinities produced by earlier transforms.
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
                if transformation == 'W':
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
                if transformation == 'L':
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
                if transformation == 'N':
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    # A high-quality library is assumed noise-free already.
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                if transformation == 'F':
                    q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)

            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]

            # Zero total intensity on either side makes similarity undefined.
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['QUERY.SPECTRUM.ID']

    # Extract the top n predictions per query by repeatedly removing the
    # current per-row maximum column. Ties are ';'-joined into one string.
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[unique_query_ids,preds,scores]
    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
    # Accuracy: query IDs double as ground-truth compound IDs.
    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
    return acc
|
|
1854
|
+
|
|
1855
|
+
|
|
1856
|
+
|
|
1857
|
+
def run_spec_lib_matching_on_HRMS_data_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
    """Run spectral-library matching on HRMS (LC-MS) data for the Shiny app.

    Loads the query spectra (converting raw mgf/mzML/cdf/json input to the
    tab-separated library format first if necessary), loads the reference
    library, validates all matching parameters, scores every query spectrum
    against every (optionally precursor-m/z-windowed) reference spectrum, and
    either writes the top-``n_top_matches_to_save`` identifications plus the
    full similarity-score matrix to disk, or returns the identification table
    when ``return_ID_output`` is True.

    Invalid arguments are reported with ``print`` and terminate the process
    via ``sys.exit()`` (this function backs an interactive app, not a
    library API).

    NOTE: the mutable ``weights`` default dict is read-only here; it is
    passed through to get_similarity unmodified.
    """
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension in ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF', 'json', 'JSON'):
            # BUG FIX: strip the real extension when deriving the .txt path.
            # The previous slice query_data[:-3] + 'txt' only worked for
            # 3-character extensions and mangled 'mzML'/'json' inputs into
            # names like 'sample.mtxt'.
            output_path_tmp = query_data.rsplit('.', 1)[0] + '.txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_query = pd.read_csv(query_data, sep='\t')
        unique_query_ids = df_query['id'].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
        sys.exit()
    else:
        # Accept either a single path or an iterable of paths.
        if isinstance(reference_data,str):
            df_reference = get_reference_df(reference_data,likely_reference_ids)
        else:
            dfs = []
            for f in reference_data:
                tmp = get_reference_df(f,likely_reference_ids)
                dfs.append(tmp)
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    # Optional metadata filters ('N/A' means "do not filter").
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct']==adduct]

    if spectrum_preprocessing_order is not None:
        spectrum_preprocessing_order = list(spectrum_preprocessing_order)
    else:
        spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
    # Peak matching ('M') is mandatory for HRMS, and centroiding ('C') must
    # precede it because matching assumes centroided peaks.
    if 'M' not in spectrum_preprocessing_order:
        print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
        sys.exit()
    if 'C' in spectrum_preprocessing_order:
        if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
            print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
            sys.exit()
    if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
        print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
        sys.exit()

    if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kuldzynski','intersection','hamming','hellinger']:
        print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
        sys.exit()

    # ---- parameter validation: ranges and types ----
    if isinstance(int_min,int) is True:
        int_min = float(int_min)
    if isinstance(int_max,int) is True:
        int_max = float(int_max)
    if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
        print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
        sys.exit()
    if mz_min < 0:
        print('\nError: mz_min should be a non-negative integer')
        sys.exit()
    if mz_max <= 0:
        print('\nError: mz_max should be a positive integer')
        sys.exit()
    if int_min < 0:
        print('\nError: int_min should be a non-negative float')
        sys.exit()
    if int_max <= 0:
        print('\nError: int_max should be a positive float')
        sys.exit()

    if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
        print('Error: window_size_centroiding must be a positive float.')
        sys.exit()
    if isinstance(window_size_matching,float) is False or window_size_matching<= 0.0:
        print('Error: window_size_matching must be a positive float.')
        sys.exit()

    if isinstance(noise_threshold,int) is True:
        noise_threshold = float(noise_threshold)
    if isinstance(noise_threshold,float) is False or noise_threshold < 0:
        print('Error: noise_threshold must be a positive float.')
        sys.exit()

    if isinstance(wf_intensity,int) is True:
        wf_intensity = float(wf_intensity)
    if isinstance(wf_mz,int) is True:
        wf_mz = float(wf_mz)
    if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
        print('Error: wf_mz and wf_intensity must be integers or floats')
        sys.exit()

    if entropy_dimension <= 0:
        print('\nError: entropy_dimension should be a positive float')
        sys.exit()

    normalization_method = 'standard'

    if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
        print('\nError: n_top_matches_to_save should be a positive integer')
        sys.exit()

    if isinstance(print_id_results,bool)==False:
        print('\nError: print_id_results must be either True or False')
        sys.exit()

    # Default output locations fall back to the current working directory.
    if output_identification is None:
        output_identification = f'{Path.cwd()}/output_identification.txt'
        print(f'Warning: writing identification output to {output_identification}')

    if output_similarity_scores is None:
        output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
        print(f'Warning: writing similarity scores to {output_similarity_scores}')

    # ---- score every query spectrum against the reference library ----
    unique_reference_ids = df_reference['id'].unique().tolist()
    all_similarity_scores = []

    for query_idx in range(len(unique_query_ids)):
        if verbose:
            print(f'query spectrum #{query_idx} is being identified')

        q_mask = (df_query['id'] == unique_query_ids[query_idx])
        q_idxs_tmp = np.where(q_mask)[0]
        q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))

        # Optionally restrict candidate references to a precursor-m/z window.
        if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance is not None:
            precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
        else:
            df_reference_tmp = df_reference.copy()

        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
        unique_reference_ids_tmp = list(ref_groups.keys())

        similarity_by_ref = {}
        for ref_id in unique_reference_ids_tmp:
            # Copy so in-place preprocessing never mutates the shared query array.
            q_spec = q_spec_tmp.copy()
            r_df = ref_groups[ref_id]
            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))

            is_matched = False

            for transformation in spectrum_preprocessing_order:
                # Reset any infinities produced by earlier transforms.
                if np.isinf(q_spec[:, 1]).sum() > 0:
                    q_spec[:, 1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:, 1]).sum() > 0:
                    r_spec[:, 1] = np.zeros(r_spec.shape[0])

                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)

                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                    q_spec = m_spec[:, 0:2]
                    r_spec = m_spec[:, [0, 2]]
                    is_matched = True

                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)

                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)

                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = remove_noise(q_spec, nr=noise_threshold)
                    # A high-quality library is assumed noise-free already.
                    if not high_quality_reference_library:
                        r_spec = remove_noise(r_spec, nr=noise_threshold)

                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = filter_spec_lcms(
                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                    )
                    if not high_quality_reference_library:
                        r_spec = filter_spec_lcms(
                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                        )

            q_ints = q_spec[:, 1]
            r_ints = r_spec[:, 1]

            # Similarity needs >1 surviving peak and non-zero total intensity.
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
            else:
                sim = 0.0

            similarity_by_ref[ref_id] = sim

        # References excluded by the precursor window default to 0.0.
        row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
        all_similarity_scores.append(row_scores)

    df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
    df_scores.index.names = ['QUERY.SPECTRUM.ID']

    # Extract the top n predictions per query by repeatedly removing the
    # current per-row maximum column. Ties are ';'-joined into one string.
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[preds,scores]

    cnames_preds = []
    cnames_scores = []
    for i in range(0,n_top_matches_to_save):
        cnames_preds.append(f'RANK.{i+1}.PRED')
        cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')

    df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
    df_top_ref_specs.index = unique_query_ids
    df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']

    # Label the full score matrix's columns for the exported TSV.
    df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]

    if print_id_results == True:
        print(df_top_ref_specs.to_string())

    if return_ID_output is False:
        df_top_ref_specs.to_csv(output_identification, sep='\t')
        df_scores.to_csv(output_similarity_scores, sep='\t')
    else:
        return df_top_ref_specs
|
|
2104
|
+
|
|
2105
|
+
|
|
2106
|
+
|
|
2107
|
+
|
|
2108
|
+
def run_spec_lib_matching_on_NRMS_data_shiny(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
|
|
2109
|
+
if query_data is None:
|
|
2110
|
+
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
|
|
2111
|
+
sys.exit()
|
|
2112
|
+
else:
|
|
2113
|
+
extension = query_data.rsplit('.',1)
|
|
2114
|
+
extension = extension[(len(extension)-1)]
|
|
2115
|
+
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
|
|
2116
|
+
output_path_tmp = query_data[:-3] + 'txt'
|
|
2117
|
+
build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
|
|
2118
|
+
df_query = pd.read_csv(output_path_tmp, sep='\t')
|
|
2119
|
+
if extension == 'txt' or extension == 'TXT':
|
|
2120
|
+
df_query = pd.read_csv(query_data, sep='\t')
|
|
2121
|
+
unique_query_ids = df_query.iloc[:,0].unique()
|
|
2122
|
+
|
|
2123
|
+
if reference_data is None:
|
|
2124
|
+
print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
|
|
2125
|
+
sys.exit()
|
|
2126
|
+
else:
|
|
2127
|
+
if isinstance(reference_data,str):
|
|
2128
|
+
df_reference = get_reference_df(reference_data,likely_reference_ids)
|
|
2129
|
+
unique_reference_ids = df_reference.iloc[:,0].unique()
|
|
2130
|
+
else:
|
|
2131
|
+
dfs = []
|
|
2132
|
+
unique_reference_ids = []
|
|
2133
|
+
for f in reference_data:
|
|
2134
|
+
tmp = get_reference_df(f,likely_reference_ids)
|
|
2135
|
+
dfs.append(tmp)
|
|
2136
|
+
unique_reference_ids.extend(tmp.iloc[:,0].unique())
|
|
2137
|
+
df_reference = pd.concat(dfs, axis=0, ignore_index=True)
|
|
2138
|
+
|
|
2139
|
+
|
|
2140
|
+
if spectrum_preprocessing_order is not None:
|
|
2141
|
+
spectrum_preprocessing_order = list(spectrum_preprocessing_order)
|
|
2142
|
+
else:
|
|
2143
|
+
spectrum_preprocessing_order = ['F','N','W','L']
|
|
2144
|
+
if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
|
|
2145
|
+
print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
|
|
2146
|
+
sys.exit()
|
|
2147
|
+
|
|
2148
|
+
if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kuldzynski','intersection','hamming','hellinger']:
|
|
2149
|
+
print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
|
|
2150
|
+
sys.exit()
|
|
2151
|
+
|
|
2152
|
+
if isinstance(int_min,int) is True:
|
|
2153
|
+
int_min = float(int_min)
|
|
2154
|
+
if isinstance(int_max,int) is True:
|
|
2155
|
+
int_max = float(int_max)
|
|
2156
|
+
if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
|
|
2157
|
+
print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
|
|
2158
|
+
sys.exit()
|
|
2159
|
+
if mz_min < 0:
|
|
2160
|
+
print('\nError: mz_min should be a non-negative integer')
|
|
2161
|
+
sys.exit()
|
|
2162
|
+
if mz_max <= 0:
|
|
2163
|
+
print('\nError: mz_max should be a positive integer')
|
|
2164
|
+
sys.exit()
|
|
2165
|
+
if int_min < 0:
|
|
2166
|
+
print('\nError: int_min should be a non-negative float')
|
|
2167
|
+
sys.exit()
|
|
2168
|
+
if int_max <= 0:
|
|
2169
|
+
print('\nError: int_max should be a positive float')
|
|
2170
|
+
sys.exit()
|
|
2171
|
+
|
|
2172
|
+
if isinstance(noise_threshold,int) is True:
|
|
2173
|
+
noise_threshold = float(noise_threshold)
|
|
2174
|
+
if isinstance(noise_threshold,float) is False or noise_threshold < 0:
|
|
2175
|
+
print('Error: noise_threshold must be a positive float.')
|
|
2176
|
+
sys.exit()
|
|
2177
|
+
|
|
2178
|
+
if isinstance(wf_intensity,int) is True:
|
|
2179
|
+
wf_intensity = float(wf_intensity)
|
|
2180
|
+
if isinstance(wf_mz,int) is True:
|
|
2181
|
+
wf_mz = float(wf_mz)
|
|
2182
|
+
if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
|
|
2183
|
+
print('Error: wf_mz and wf_intensity must be integers or floats')
|
|
2184
|
+
sys.exit()
|
|
2185
|
+
|
|
2186
|
+
if entropy_dimension <= 0:
|
|
2187
|
+
print('\nError: entropy_dimension should be a positive float')
|
|
2188
|
+
sys.exit()
|
|
2189
|
+
else:
|
|
2190
|
+
q = entropy_dimension
|
|
2191
|
+
|
|
2192
|
+
normalization_method = 'standard'
|
|
2193
|
+
|
|
2194
|
+
if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
|
|
2195
|
+
print('\nError: n_top_matches_to_save should be a positive integer')
|
|
2196
|
+
sys.exit()
|
|
2197
|
+
|
|
2198
|
+
if isinstance(print_id_results,bool)==False:
|
|
2199
|
+
print('\nError: print_id_results must be either True or False')
|
|
2200
|
+
sys.exit()
|
|
2201
|
+
|
|
2202
|
+
if output_identification is None:
|
|
2203
|
+
output_identification = f'{Path.cwd()}/output_identification.txt'
|
|
2204
|
+
print(f'Warning: writing identification output to {output_identification}')
|
|
2205
|
+
|
|
2206
|
+
if output_similarity_scores is None:
|
|
2207
|
+
output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
|
|
2208
|
+
print(f'Warning: writing similarity scores to {output_similarity_scores}')
|
|
2209
|
+
|
|
2210
|
+
|
|
2211
|
+
|
|
2212
|
+
min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
|
|
2213
|
+
max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
|
|
2214
|
+
mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
|
|
2215
|
+
|
|
2216
|
+
all_similarity_scores = []
|
|
2217
|
+
for query_idx in range(0,len(unique_query_ids)):
|
|
2218
|
+
q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
|
|
2219
|
+
q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
2220
|
+
q_spec_tmp = convert_spec(q_spec_tmp,mzs)
|
|
2221
|
+
|
|
2222
|
+
similarity_scores = []
|
|
2223
|
+
for ref_idx in range(0,len(unique_reference_ids)):
|
|
2224
|
+
if verbose is True and ref_idx % 1000 == 0:
|
|
2225
|
+
print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
|
|
2226
|
+
q_spec = q_spec_tmp
|
|
2227
|
+
r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
|
|
2228
|
+
r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
2229
|
+
r_spec = convert_spec(r_spec_tmp,mzs)
|
|
2230
|
+
|
|
2231
|
+
for transformation in spectrum_preprocessing_order:
|
|
2232
|
+
if np.isinf(q_spec[:,1]).sum() > 0:
|
|
2233
|
+
q_spec[:,1] = np.zeros(q_spec.shape[0])
|
|
2234
|
+
if np.isinf(r_spec[:,1]).sum() > 0:
|
|
2235
|
+
r_spec[:,1] = np.zeros(r_spec.shape[0])
|
|
2236
|
+
if transformation == 'W':
|
|
2237
|
+
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
2238
|
+
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
2239
|
+
if transformation == 'L':
|
|
2240
|
+
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
2241
|
+
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
2242
|
+
if transformation == 'N':
|
|
2243
|
+
q_spec = remove_noise(q_spec, nr = noise_threshold)
|
|
2244
|
+
if high_quality_reference_library == False:
|
|
2245
|
+
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
2246
|
+
if transformation == 'F':
|
|
2247
|
+
q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
2248
|
+
if high_quality_reference_library == False:
|
|
2249
|
+
r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
2250
|
+
|
|
2251
|
+
q_ints = q_spec[:,1]
|
|
2252
|
+
r_ints = r_spec[:,1]
|
|
2253
|
+
|
|
2254
|
+
if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
|
|
2255
|
+
similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
|
|
2256
|
+
else:
|
|
2257
|
+
similarity_score = 0
|
|
2258
|
+
|
|
2259
|
+
similarity_scores.append(similarity_score)
|
|
2260
|
+
all_similarity_scores.append(similarity_scores)
|
|
2261
|
+
|
|
2262
|
+
df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
|
|
2263
|
+
df_scores.index = unique_query_ids
|
|
2264
|
+
df_scores.index.names = ['QUERY.SPECTRUM.ID']
|
|
2265
|
+
|
|
2266
|
+
preds = []
|
|
2267
|
+
scores = []
|
|
2268
|
+
for i in range(0, df_scores.shape[0]):
|
|
2269
|
+
df_scores_tmp = df_scores
|
|
2270
|
+
preds_tmp = []
|
|
2271
|
+
scores_tmp = []
|
|
2272
|
+
for j in range(0, n_top_matches_to_save):
|
|
2273
|
+
top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
|
|
2274
|
+
cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
|
|
2275
|
+
df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
|
|
2276
|
+
|
|
2277
|
+
preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
|
|
2278
|
+
if len(top_ref_specs_tmp.values) == 0:
|
|
2279
|
+
scores_tmp.append(0)
|
|
2280
|
+
else:
|
|
2281
|
+
scores_tmp.append(top_ref_specs_tmp.values[0])
|
|
2282
|
+
preds.append(preds_tmp)
|
|
2283
|
+
scores.append(scores_tmp)
|
|
2284
|
+
|
|
2285
|
+
preds = np.array(preds)
|
|
2286
|
+
scores = np.array(scores)
|
|
2287
|
+
out = np.c_[preds,scores]
|
|
2288
|
+
|
|
2289
|
+
cnames_preds = []
|
|
2290
|
+
cnames_scores = []
|
|
2291
|
+
for i in range(0,n_top_matches_to_save):
|
|
2292
|
+
cnames_preds.append(f'RANK.{i+1}.PRED')
|
|
2293
|
+
cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
|
|
2294
|
+
|
|
2295
|
+
df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
|
|
2296
|
+
df_top_ref_specs.index = unique_query_ids
|
|
2297
|
+
df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
|
|
2298
|
+
|
|
2299
|
+
if print_id_results == True:
|
|
2300
|
+
print(df_top_ref_specs.to_string())
|
|
2301
|
+
|
|
2302
|
+
df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
|
|
2303
|
+
|
|
2304
|
+
if return_ID_output is False:
|
|
2305
|
+
df_top_ref_specs.to_csv(output_identification, sep='\t')
|
|
2306
|
+
df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
|
|
2307
|
+
df_scores.to_csv(output_similarity_scores, sep='\t')
|
|
2308
|
+
else:
|
|
2309
|
+
return df_top_ref_specs
|
|
2310
|
+
|
|
2311
|
+
|
|
35
2312
|
class _UIWriter:
|
|
36
2313
|
def __init__(self, loop, q: asyncio.Queue[str]):
|
|
37
2314
|
self._loop = loop
|
|
@@ -90,19 +2367,21 @@ def strip_weights(s):
|
|
|
90
2367
|
def build_library(input_path=None, output_path=None):
|
|
91
2368
|
last_three_chars = input_path[(len(input_path)-3):len(input_path)]
|
|
92
2369
|
last_four_chars = input_path[(len(input_path)-4):len(input_path)]
|
|
93
|
-
if last_three_chars == '
|
|
94
|
-
return pd.read_csv(input_path)
|
|
2370
|
+
if last_three_chars == 'txt' or last_three_chars == 'TXT':
|
|
2371
|
+
return pd.read_csv(input_path, sep='\t')
|
|
95
2372
|
else:
|
|
96
2373
|
if last_three_chars == 'mgf' or last_three_chars == 'MGF':
|
|
97
2374
|
input_file_type = 'mgf'
|
|
98
2375
|
elif last_four_chars == 'mzML' or last_four_chars == 'mzml' or last_four_chars == 'MZML':
|
|
99
2376
|
input_file_type = 'mzML'
|
|
2377
|
+
elif last_four_chars == 'json' or last_four_chars == 'JSON':
|
|
2378
|
+
input_file_type = 'json'
|
|
100
2379
|
elif last_three_chars == 'cdf' or last_three_chars == 'CDF':
|
|
101
2380
|
input_file_type = 'cdf'
|
|
102
2381
|
elif last_three_chars == 'msp' or last_three_chars == 'MSP':
|
|
103
2382
|
input_file_type = 'msp'
|
|
104
2383
|
else:
|
|
105
|
-
print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', or \'
|
|
2384
|
+
print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'msp\', \'json\', or \'txt\' file must be passed to --input_path')
|
|
106
2385
|
sys.exit()
|
|
107
2386
|
|
|
108
2387
|
spectra = []
|
|
@@ -172,6 +2451,23 @@ def build_library(input_path=None, output_path=None):
|
|
|
172
2451
|
except ValueError:
|
|
173
2452
|
continue
|
|
174
2453
|
|
|
2454
|
+
if input_file_type == 'json':
|
|
2455
|
+
data = json.load(open(input_path))
|
|
2456
|
+
ids = []
|
|
2457
|
+
mzs = []
|
|
2458
|
+
ints = []
|
|
2459
|
+
for i in range(0,len(data)):
|
|
2460
|
+
spec_ID_tmp = data[i]['spectrum_id']
|
|
2461
|
+
tmp = data[i]['peaks_json']
|
|
2462
|
+
tmp = tmp[1:-1].split(",")
|
|
2463
|
+
tmp = [a.replace("[","") for a in tmp]
|
|
2464
|
+
tmp = [a.replace("]","") for a in tmp]
|
|
2465
|
+
mzs_tmp = tmp[0::2]
|
|
2466
|
+
ints_tmp = tmp[1::2]
|
|
2467
|
+
ids.extend([spec_ID_tmp] * len(mzs_tmp))
|
|
2468
|
+
mzs.extend(mzs_tmp)
|
|
2469
|
+
ints.extend(ints_tmp)
|
|
2470
|
+
|
|
175
2471
|
df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
|
|
176
2472
|
return df
|
|
177
2473
|
|
|
@@ -180,9 +2476,12 @@ def build_library(input_path=None, output_path=None):
|
|
|
180
2476
|
def extract_first_column_ids(file_path: str, max_ids: int = 20000):
|
|
181
2477
|
suffix = Path(file_path).suffix.lower()
|
|
182
2478
|
|
|
183
|
-
if suffix == ".
|
|
184
|
-
df = pd.read_csv(file_path,
|
|
185
|
-
|
|
2479
|
+
if suffix == ".txt":
|
|
2480
|
+
df = pd.read_csv(file_path, sep='\t')
|
|
2481
|
+
if 'id' in df.columns.tolist():
|
|
2482
|
+
ids = df['id'].astype(str).dropna()
|
|
2483
|
+
else:
|
|
2484
|
+
ids = df.iloc[:, 0].astype(str).dropna()
|
|
186
2485
|
ids = [x for x in ids if x.strip() != ""]
|
|
187
2486
|
seen = set()
|
|
188
2487
|
uniq = []
|
|
@@ -217,17 +2516,17 @@ def extract_first_column_ids(file_path: str, max_ids: int = 20000):
|
|
|
217
2516
|
return []
|
|
218
2517
|
|
|
219
2518
|
|
|
220
|
-
def _open_plot_window(session,
|
|
221
|
-
"""Send
|
|
222
|
-
b64 = base64.b64encode(
|
|
223
|
-
data_url = f"data:image/
|
|
224
|
-
session.send_custom_message("open-plot-window", {"
|
|
2519
|
+
def _open_plot_window(session, svg_bytes: bytes, title: str = "plot.svg"):
|
|
2520
|
+
"""Send SVG bytes to browser and open in a new window as a data URL."""
|
|
2521
|
+
b64 = base64.b64encode(svg_bytes).decode("ascii")
|
|
2522
|
+
data_url = f"data:image/svg;base64,{b64}"
|
|
2523
|
+
session.send_custom_message("open-plot-window", {"svg": data_url, "title": title})
|
|
225
2524
|
|
|
226
2525
|
|
|
227
2526
|
def plot_spectra_ui(platform: str):
|
|
228
2527
|
base_inputs = [
|
|
229
|
-
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or
|
|
230
|
-
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or
|
|
2528
|
+
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
2529
|
+
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
231
2530
|
ui.input_selectize(
|
|
232
2531
|
"spectrum_ID1",
|
|
233
2532
|
"Select spectrum ID 1 (default is the first spectrum in the library):",
|
|
@@ -242,6 +2541,8 @@ def plot_spectra_ui(platform: str):
|
|
|
242
2541
|
multiple=False,
|
|
243
2542
|
options={"placeholder": "Upload a library..."},
|
|
244
2543
|
),
|
|
2544
|
+
ui.input_select('print_url_spectrum1', 'Print PubChem URL for spectrum 1:', ['No', 'Yes']),
|
|
2545
|
+
ui.input_select('print_url_spectrum2', 'Print PubChem URL for spectrum 2:', ['No', 'Yes']),
|
|
245
2546
|
ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
|
|
246
2547
|
ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
|
|
247
2548
|
ui.input_select(
|
|
@@ -253,21 +2554,13 @@ def plot_spectra_ui(platform: str):
|
|
|
253
2554
|
|
|
254
2555
|
if platform == "HRMS":
|
|
255
2556
|
extra_inputs = [
|
|
256
|
-
ui.input_text(
|
|
257
|
-
"spectrum_preprocessing_order",
|
|
258
|
-
"Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.",
|
|
259
|
-
"FCNMWL",
|
|
260
|
-
),
|
|
2557
|
+
ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL",),
|
|
261
2558
|
ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
|
|
262
2559
|
ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
|
|
263
2560
|
]
|
|
264
2561
|
else:
|
|
265
2562
|
extra_inputs = [
|
|
266
|
-
ui.input_text(
|
|
267
|
-
"spectrum_preprocessing_order",
|
|
268
|
-
"Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
|
|
269
|
-
"FNLW",
|
|
270
|
-
)
|
|
2563
|
+
ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW",)
|
|
271
2564
|
]
|
|
272
2565
|
|
|
273
2566
|
numeric_inputs = [
|
|
@@ -282,11 +2575,7 @@ def plot_spectra_ui(platform: str):
|
|
|
282
2575
|
ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
|
|
283
2576
|
]
|
|
284
2577
|
|
|
285
|
-
select_input = ui.input_select(
|
|
286
|
-
"y_axis_transformation",
|
|
287
|
-
"Transformation to apply to intensity axis:",
|
|
288
|
-
["normalized", "none", "log10", "sqrt"],
|
|
289
|
-
)
|
|
2578
|
+
select_input = ui.input_select("y_axis_transformation", "Transformation to apply to intensity axis:", ["normalized", "none", "log10", "sqrt"])
|
|
290
2579
|
|
|
291
2580
|
run_button_plot_spectra = ui.download_button("run_btn_plot_spectra", "Run", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
|
|
292
2581
|
back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
|
|
@@ -294,15 +2583,15 @@ def plot_spectra_ui(platform: str):
|
|
|
294
2583
|
if platform == "HRMS":
|
|
295
2584
|
inputs_columns = ui.layout_columns(
|
|
296
2585
|
ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
|
|
297
|
-
ui.div([base_inputs[6:
|
|
298
|
-
ui.div(numeric_inputs[0:
|
|
299
|
-
ui.div([numeric_inputs[
|
|
2586
|
+
ui.div([base_inputs[6:9], extra_inputs[0]], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2587
|
+
ui.div(extra_inputs[1:3], numeric_inputs[0:3], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2588
|
+
ui.div([numeric_inputs[3:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
|
|
300
2589
|
col_widths=(3,3,3,3),
|
|
301
2590
|
)
|
|
302
2591
|
elif platform == "NRMS":
|
|
303
2592
|
inputs_columns = ui.layout_columns(
|
|
304
2593
|
ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
|
|
305
|
-
ui.div([base_inputs[6:
|
|
2594
|
+
ui.div([base_inputs[6:9], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
|
|
306
2595
|
ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
|
|
307
2596
|
ui.div([numeric_inputs[5:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
|
|
308
2597
|
col_widths=(3,3,3,3),
|
|
@@ -323,49 +2612,29 @@ def plot_spectra_ui(platform: str):
|
|
|
323
2612
|
|
|
324
2613
|
def run_spec_lib_matching_ui(platform: str):
|
|
325
2614
|
base_inputs = [
|
|
326
|
-
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or
|
|
327
|
-
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or
|
|
2615
|
+
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
2616
|
+
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
328
2617
|
ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
|
|
329
2618
|
ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
|
|
330
|
-
ui.
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
),
|
|
337
|
-
ui.input_selectize(
|
|
338
|
-
"spectrum_ID2",
|
|
339
|
-
"Select spectrum ID 2 (only applicable for plotting; default is the first spectrum in the reference library):",
|
|
340
|
-
choices=[],
|
|
341
|
-
multiple=False,
|
|
342
|
-
options={"placeholder": "Upload a library..."},
|
|
343
|
-
),
|
|
344
|
-
ui.input_select(
|
|
345
|
-
"high_quality_reference_library",
|
|
346
|
-
"Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.",
|
|
347
|
-
[False, True],
|
|
348
|
-
)
|
|
2619
|
+
ui.input_file('compound_ID_output_file', 'Upload output from spectral library matching to plot top matches (optional)'),
|
|
2620
|
+
ui.input_selectize("q_spec", "Select query spectrum (only applicable for plotting; default is the first spectrum in the compound ID output):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
|
|
2621
|
+
ui.input_selectize("r_spec", "Select reference spectrum (only applicable for plotting; default is the rank 1 reference spectrum):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
|
|
2622
|
+
ui.input_select('print_url_spectrum1', 'Print PubChem URL for query spectrum (only applicable for plotting):', ['No', 'Yes']),
|
|
2623
|
+
ui.input_select('print_url_spectrum2', 'Print PubChem URL for reference spectrum (only applicable for plotting):', ['No', 'Yes']),
|
|
2624
|
+
ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])
|
|
349
2625
|
]
|
|
350
2626
|
|
|
351
2627
|
if platform == "HRMS":
|
|
352
2628
|
extra_inputs = [
|
|
353
|
-
ui.
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
),
|
|
2629
|
+
ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
|
|
2630
|
+
ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
|
|
2631
|
+
ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
|
|
2632
|
+
ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.","FCNMWL"),
|
|
358
2633
|
ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
|
|
359
2634
|
ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
|
|
360
2635
|
]
|
|
361
2636
|
else:
|
|
362
|
-
extra_inputs = [
|
|
363
|
-
ui.input_text(
|
|
364
|
-
"spectrum_preprocessing_order",
|
|
365
|
-
"Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
|
|
366
|
-
"FNLW",
|
|
367
|
-
)
|
|
368
|
-
]
|
|
2637
|
+
extra_inputs = [ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).","FNLW")]
|
|
369
2638
|
|
|
370
2639
|
numeric_inputs = [
|
|
371
2640
|
ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
|
|
@@ -387,16 +2656,16 @@ def run_spec_lib_matching_ui(platform: str):
|
|
|
387
2656
|
|
|
388
2657
|
if platform == "HRMS":
|
|
389
2658
|
inputs_columns = ui.layout_columns(
|
|
390
|
-
ui.div(base_inputs[0:
|
|
391
|
-
ui.div([base_inputs[
|
|
392
|
-
ui.div(numeric_inputs[0:
|
|
393
|
-
ui.div(numeric_inputs[
|
|
2659
|
+
ui.div([base_inputs[0:2], extra_inputs[0:3], base_inputs[2:4]], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2660
|
+
ui.div([base_inputs[4:10]], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2661
|
+
ui.div([extra_inputs[3:6], numeric_inputs[0:3]], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2662
|
+
ui.div(numeric_inputs[3:10], style="display:flex; flex-direction:column; gap:10px;"),
|
|
394
2663
|
col_widths=(3,3,3,3)
|
|
395
2664
|
)
|
|
396
2665
|
elif platform == "NRMS":
|
|
397
2666
|
inputs_columns = ui.layout_columns(
|
|
398
2667
|
ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
|
|
399
|
-
ui.div([base_inputs[6:
|
|
2668
|
+
ui.div([base_inputs[6:10], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
|
|
400
2669
|
ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
|
|
401
2670
|
ui.div(numeric_inputs[5:10], style="display:flex; flex-direction:column; gap:10px;"),
|
|
402
2671
|
col_widths=(3,3,3,3)
|
|
@@ -423,8 +2692,8 @@ def run_spec_lib_matching_ui(platform: str):
|
|
|
423
2692
|
|
|
424
2693
|
def run_parameter_tuning_grid_ui(platform: str):
|
|
425
2694
|
base_inputs = [
|
|
426
|
-
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or
|
|
427
|
-
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or
|
|
2695
|
+
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
2696
|
+
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
428
2697
|
ui.input_selectize("similarity_measure", "Select similarity measure(s):", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"], multiple=True, selected='cosine'),
|
|
429
2698
|
ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '((0.25, 0.25, 0.25, 0.25))'),
|
|
430
2699
|
ui.input_text("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", '[True]')
|
|
@@ -432,11 +2701,10 @@ def run_parameter_tuning_grid_ui(platform: str):
|
|
|
432
2701
|
|
|
433
2702
|
if platform == "HRMS":
|
|
434
2703
|
extra_inputs = [
|
|
435
|
-
ui.
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
),
|
|
2704
|
+
ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
|
|
2705
|
+
ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
|
|
2706
|
+
ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
|
|
2707
|
+
ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "[FCNMWL,CWM]"),
|
|
440
2708
|
ui.input_text("window_size_centroiding", "Centroiding window-size:", "[0.5]"),
|
|
441
2709
|
ui.input_text("window_size_matching", "Matching window-size:", "[0.1,0.5]"),
|
|
442
2710
|
]
|
|
@@ -490,7 +2758,7 @@ def run_parameter_tuning_grid_ui(platform: str):
|
|
|
490
2758
|
|
|
491
2759
|
return ui.div(
|
|
492
2760
|
ui.TagList(
|
|
493
|
-
ui.h2("Tune parameters"),
|
|
2761
|
+
ui.h2("Tune parameters (grid search)"),
|
|
494
2762
|
inputs_columns,
|
|
495
2763
|
run_button_parameter_tuning_grid,
|
|
496
2764
|
back_button,
|
|
@@ -527,48 +2795,23 @@ def run_parameter_tuning_DE_ui(platform: str):
|
|
|
527
2795
|
PARAMS = PARAMS_NRMS
|
|
528
2796
|
|
|
529
2797
|
base_inputs = [
|
|
530
|
-
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or
|
|
531
|
-
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or
|
|
532
|
-
ui.input_select(
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
[
|
|
536
|
-
"cosine","shannon","renyi","tsallis","mixture","jaccard","dice",
|
|
537
|
-
"3w_jaccard","sokal_sneath","binary_cosine","mountford",
|
|
538
|
-
"mcconnaughey","driver_kroeber","simpson","braun_banquet",
|
|
539
|
-
"fager_mcgowan","kulczynski","intersection","hamming","hellinger",
|
|
540
|
-
],
|
|
541
|
-
),
|
|
542
|
-
ui.input_text(
|
|
543
|
-
"weights",
|
|
544
|
-
"Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):",
|
|
545
|
-
"0.25, 0.25, 0.25, 0.25",
|
|
546
|
-
),
|
|
547
|
-
ui.input_select(
|
|
548
|
-
"high_quality_reference_library",
|
|
549
|
-
"Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.",
|
|
550
|
-
[False, True],
|
|
551
|
-
),
|
|
552
|
-
]
|
|
2798
|
+
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
2799
|
+
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
2800
|
+
ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
|
|
2801
|
+
ui.input_text("weights", "Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):", "0.25, 0.25, 0.25, 0.25"),
|
|
2802
|
+
ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])]
|
|
553
2803
|
|
|
554
2804
|
if platform == "HRMS":
|
|
555
2805
|
extra_inputs = [
|
|
556
|
-
ui.
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
),
|
|
2806
|
+
ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
|
|
2807
|
+
ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
|
|
2808
|
+
ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
|
|
2809
|
+
ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL"),
|
|
561
2810
|
ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
|
|
562
2811
|
ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
|
|
563
2812
|
]
|
|
564
2813
|
else:
|
|
565
|
-
extra_inputs = [
|
|
566
|
-
ui.input_text(
|
|
567
|
-
"spectrum_preprocessing_order",
|
|
568
|
-
"Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
|
|
569
|
-
"FNLW",
|
|
570
|
-
)
|
|
571
|
-
]
|
|
2814
|
+
extra_inputs = [ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW")]
|
|
572
2815
|
|
|
573
2816
|
numeric_inputs = [
|
|
574
2817
|
ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
|
|
@@ -583,18 +2826,9 @@ def run_parameter_tuning_DE_ui(platform: str):
|
|
|
583
2826
|
ui.input_numeric("max_iterations", "Maximum number of iterations:", 5),
|
|
584
2827
|
]
|
|
585
2828
|
|
|
586
|
-
run_button_parameter_tuning_DE = ui.input_action_button(
|
|
587
|
-
|
|
588
|
-
"Tune parameters (differential evolution optimization)",
|
|
589
|
-
style="font-size:16px; padding:15px 30px; width:300px; height:100px",
|
|
590
|
-
)
|
|
591
|
-
back_button = ui.input_action_button(
|
|
592
|
-
"back",
|
|
593
|
-
"Back to main menu",
|
|
594
|
-
style="font-size:16px; padding:15px 30px; width:300px; height:100px",
|
|
595
|
-
)
|
|
2829
|
+
run_button_parameter_tuning_DE = ui.input_action_button("run_btn_parameter_tuning_DE", "Tune parameters (differential evolution optimization)", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
|
|
2830
|
+
back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
|
|
596
2831
|
|
|
597
|
-
# Build the 4-column inputs panel (fixed slices corrected, unpack lists properly)
|
|
598
2832
|
if platform == "HRMS":
|
|
599
2833
|
inputs_columns = ui.layout_columns(
|
|
600
2834
|
ui.div(*base_inputs, style="display:flex; flex-direction:column; gap:10px;"),
|
|
@@ -603,7 +2837,7 @@ def run_parameter_tuning_DE_ui(platform: str):
|
|
|
603
2837
|
ui.div(*numeric_inputs[5:11], style="display:flex; flex-direction:column; gap:10px;"),
|
|
604
2838
|
col_widths=(3, 3, 3, 3),
|
|
605
2839
|
)
|
|
606
|
-
else:
|
|
2840
|
+
else:
|
|
607
2841
|
inputs_columns = ui.layout_columns(
|
|
608
2842
|
ui.div(*base_inputs, style="display:flex; flex-direction:column; gap:10px;"),
|
|
609
2843
|
ui.div(*extra_inputs, style="display:flex; flex-direction:column; gap:10px;"),
|
|
@@ -612,17 +2846,11 @@ def run_parameter_tuning_DE_ui(platform: str):
|
|
|
612
2846
|
col_widths=(3, 3, 3, 3),
|
|
613
2847
|
)
|
|
614
2848
|
|
|
615
|
-
# Main page: sidebar (param selection + bounds) and body (inputs + buttons + live log)
|
|
616
2849
|
return ui.page_fillable(
|
|
617
2850
|
ui.layout_sidebar(
|
|
618
2851
|
ui.sidebar(
|
|
619
2852
|
ui.h3("Select continuous parameters to optimize"),
|
|
620
|
-
ui.input_checkbox_group(
|
|
621
|
-
"params",
|
|
622
|
-
None,
|
|
623
|
-
choices=list(PARAMS.keys()),
|
|
624
|
-
selected=["noise_threshold", "LET_threshold"],
|
|
625
|
-
),
|
|
2853
|
+
ui.input_checkbox_group("params", None, choices=list(PARAMS.keys()), selected=["noise_threshold", "LET_threshold"]),
|
|
626
2854
|
ui.hr(),
|
|
627
2855
|
ui.h4("Bounds for selected parameters"),
|
|
628
2856
|
ui.output_ui("bounds_inputs"),
|
|
@@ -631,12 +2859,11 @@ def run_parameter_tuning_DE_ui(platform: str):
|
|
|
631
2859
|
ui.div(
|
|
632
2860
|
ui.h2("Tune parameters (differential evolution optimization)"),
|
|
633
2861
|
inputs_columns,
|
|
634
|
-
run_button_parameter_tuning_DE,
|
|
635
|
-
back_button,
|
|
2862
|
+
ui.div(run_button_parameter_tuning_DE, back_button, style=("display:flex; flex-direction:row; gap:12px; align-items:center; flex-wrap:wrap;")),
|
|
636
2863
|
ui.br(),
|
|
637
2864
|
ui.card(
|
|
638
2865
|
ui.card_header("Live log"),
|
|
639
|
-
ui.output_text_verbatim("run_log"),
|
|
2866
|
+
ui.output_text_verbatim("run_log"),
|
|
640
2867
|
),
|
|
641
2868
|
style="display:flex; flex-direction:column; gap:16px;",
|
|
642
2869
|
),
|
|
@@ -645,15 +2872,16 @@ def run_parameter_tuning_DE_ui(platform: str):
|
|
|
645
2872
|
|
|
646
2873
|
|
|
647
2874
|
|
|
648
|
-
|
|
649
|
-
|
|
650
2875
|
app_ui = ui.page_fluid(
|
|
651
2876
|
ui.head_content(ui.tags.link(rel="icon", href="emblem.png")),
|
|
2877
|
+
ui.div(ui.output_image("image"), style=("display:block; margin:20px auto; max-width:320px; height:auto; text-align:center")),
|
|
652
2878
|
ui.output_ui("main_ui"),
|
|
653
|
-
ui.output_text("status_output")
|
|
2879
|
+
ui.output_text("status_output"),
|
|
654
2880
|
)
|
|
655
2881
|
|
|
656
2882
|
|
|
2883
|
+
|
|
2884
|
+
|
|
657
2885
|
def server(input, output, session):
|
|
658
2886
|
|
|
659
2887
|
current_page = reactive.Value("main_menu")
|
|
@@ -672,7 +2900,7 @@ def server(input, output, session):
|
|
|
672
2900
|
match_log_rv = reactive.Value("")
|
|
673
2901
|
is_matching_rv = reactive.Value(False)
|
|
674
2902
|
is_any_job_running = reactive.Value(False)
|
|
675
|
-
|
|
2903
|
+
latest_txt_path_rv = reactive.Value("")
|
|
676
2904
|
latest_df_rv = reactive.Value(None)
|
|
677
2905
|
is_running_rv = reactive.Value(False)
|
|
678
2906
|
|
|
@@ -688,6 +2916,106 @@ def server(input, output, session):
|
|
|
688
2916
|
converted_query_path_rv = reactive.Value(None)
|
|
689
2917
|
converted_reference_path_rv = reactive.Value(None)
|
|
690
2918
|
|
|
2919
|
+
df_rv = reactive.Value(None)
|
|
2920
|
+
|
|
2921
|
+
|
|
2922
|
+
def _discover_rank_cols(df: pd.DataFrame):
|
|
2923
|
+
pred_pat = re.compile(r"^RANK\.(\d+)\.PRED$")
|
|
2924
|
+
score_pat = re.compile(r"^RANK\.(\d+)\.SIMILARITY\.SCORE$")
|
|
2925
|
+
pred_map, score_map = {}, {}
|
|
2926
|
+
for c in df.columns:
|
|
2927
|
+
m = pred_pat.match(c)
|
|
2928
|
+
if m: pred_map[int(m.group(1))] = c
|
|
2929
|
+
m = score_pat.match(c)
|
|
2930
|
+
if m: score_map[int(m.group(1))] = c
|
|
2931
|
+
return [(k, pred_map[k], score_map.get(k)) for k in sorted(pred_map)]
|
|
2932
|
+
|
|
2933
|
+
|
|
2934
|
+
def _rank_choices_for_query(df: pd.DataFrame, qid: str):
|
|
2935
|
+
sub = df.loc[df["QUERY.SPECTRUM.ID"].astype(str) == str(qid)]
|
|
2936
|
+
if sub.empty:
|
|
2937
|
+
return {}, None
|
|
2938
|
+
row = sub.iloc[0]
|
|
2939
|
+
rank_cols = _discover_rank_cols(df)
|
|
2940
|
+
if not rank_cols:
|
|
2941
|
+
return {}, None
|
|
2942
|
+
|
|
2943
|
+
choices = {}
|
|
2944
|
+
default_value = None
|
|
2945
|
+
for (k, pred_col, score_col) in rank_cols:
|
|
2946
|
+
pred = row.get(pred_col, None)
|
|
2947
|
+
if pd.isna(pred):
|
|
2948
|
+
continue
|
|
2949
|
+
pred = str(pred)
|
|
2950
|
+
score = row.get(score_col, None) if score_col else None
|
|
2951
|
+
score_str = f"{float(score):.6f}" if (score is not None and pd.notna(score)) else "NA"
|
|
2952
|
+
label = f"Rank {k} — {score_str} — {pred}"
|
|
2953
|
+
choices[label] = pred # values are plain names
|
|
2954
|
+
if k == 1:
|
|
2955
|
+
default_value = pred # default = Rank 1 name
|
|
2956
|
+
|
|
2957
|
+
if default_value is None and choices:
|
|
2958
|
+
default_value = next(iter(choices.values()))
|
|
2959
|
+
return choices, default_value
|
|
2960
|
+
|
|
2961
|
+
|
|
2962
|
+
@reactive.effect
|
|
2963
|
+
@reactive.event(input.compound_ID_output_file)
|
|
2964
|
+
async def _populate_ids_from_compound_ID_output_upload():
|
|
2965
|
+
files = input.compound_ID_output_file()
|
|
2966
|
+
if not files:
|
|
2967
|
+
return
|
|
2968
|
+
|
|
2969
|
+
in_path = Path(files[0]["datapath"])
|
|
2970
|
+
try:
|
|
2971
|
+
query_status_rv.set(f"Reading table from: {in_path.name} …")
|
|
2972
|
+
await reactive.flush()
|
|
2973
|
+
|
|
2974
|
+
df = await asyncio.to_thread(pd.read_csv, in_path, sep="\t", header=0)
|
|
2975
|
+
|
|
2976
|
+
if "QUERY.SPECTRUM.ID" not in df.columns:
|
|
2977
|
+
raise ValueError("Missing required column: QUERY.SPECTRUM.ID")
|
|
2978
|
+
if not _discover_rank_cols(df):
|
|
2979
|
+
raise ValueError("No columns matching RANK.<k>.PRED found.")
|
|
2980
|
+
|
|
2981
|
+
df_rv.set(df)
|
|
2982
|
+
|
|
2983
|
+
ids = df["QUERY.SPECTRUM.ID"].astype(str).tolist()
|
|
2984
|
+
unique_ids_in_order = list(dict.fromkeys(ids))
|
|
2985
|
+
|
|
2986
|
+
choices_dict, default_rank_value = _rank_choices_for_query(df, ids[0])
|
|
2987
|
+
choices_values = [str(v).strip() for v in choices_dict.values()]
|
|
2988
|
+
default_rank_value = str(default_rank_value).strip() if default_rank_value is not None else None
|
|
2989
|
+
|
|
2990
|
+
ui.update_selectize("q_spec", choices=unique_ids_in_order, selected=ids[0])
|
|
2991
|
+
await reactive.flush()
|
|
2992
|
+
|
|
2993
|
+
ui.update_selectize("r_spec", choices=choices_values, selected=choices_values[0])
|
|
2994
|
+
await reactive.flush()
|
|
2995
|
+
|
|
2996
|
+
except Exception as e:
|
|
2997
|
+
query_status_rv.set(f"❌ Failed: {e}")
|
|
2998
|
+
await reactive.flush()
|
|
2999
|
+
raise
|
|
3000
|
+
|
|
3001
|
+
|
|
3002
|
+
@reactive.effect
|
|
3003
|
+
@reactive.event(input.q_spec)
|
|
3004
|
+
async def _update_rank_choices_on_compound_ID_change():
|
|
3005
|
+
df = df_rv.get()
|
|
3006
|
+
if df is None:
|
|
3007
|
+
return
|
|
3008
|
+
qid = input.q_spec()
|
|
3009
|
+
if not qid:
|
|
3010
|
+
return
|
|
3011
|
+
|
|
3012
|
+
choices, default_rank_value = _rank_choices_for_query(df, qid)
|
|
3013
|
+
choices = list(choices.values())
|
|
3014
|
+
ui.update_selectize('r_spec', choices=choices, selected=default_rank_value)
|
|
3015
|
+
await reactive.flush()
|
|
3016
|
+
|
|
3017
|
+
|
|
3018
|
+
|
|
691
3019
|
@output
|
|
692
3020
|
@render.ui
|
|
693
3021
|
def bounds_inputs():
|
|
@@ -830,6 +3158,11 @@ def server(input, output, session):
|
|
|
830
3158
|
def flush(self):
|
|
831
3159
|
pass
|
|
832
3160
|
|
|
3161
|
+
def _run_with_redirects(func, writer: ReactiveWriter, **kwargs):
|
|
3162
|
+
with contextlib.redirect_stdout(writer), contextlib.redirect_stderr(writer):
|
|
3163
|
+
return func(**kwargs)
|
|
3164
|
+
|
|
3165
|
+
|
|
833
3166
|
|
|
834
3167
|
@reactive.effect
|
|
835
3168
|
async def _pump_logs():
|
|
@@ -926,7 +3259,7 @@ def server(input, output, session):
|
|
|
926
3259
|
@render.image
|
|
927
3260
|
def image():
|
|
928
3261
|
dir = Path(__file__).resolve().parent
|
|
929
|
-
img: ImgData = {"src": str(dir / "www/emblem.png"), "width": "
|
|
3262
|
+
img: ImgData = {"src": str(dir / "www/emblem.png"), "width": "250px", "height": "250px"}
|
|
930
3263
|
return img
|
|
931
3264
|
|
|
932
3265
|
@output
|
|
@@ -935,30 +3268,10 @@ def server(input, output, session):
|
|
|
935
3268
|
if current_page() == "main_menu":
|
|
936
3269
|
return ui.page_fluid(
|
|
937
3270
|
ui.h2("Main Menu"),
|
|
938
|
-
ui.div(
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
"position:fixed; top:0; left:50%; transform:translateX(-50%); "
|
|
943
|
-
"z-index:1000; text-align:center; padding:10px; background-color:white;"
|
|
944
|
-
),
|
|
945
|
-
),
|
|
946
|
-
ui.div(
|
|
947
|
-
"Overview:",
|
|
948
|
-
style="text-align:left; font-size:24px; font-weight:bold; margin-top:350px"
|
|
949
|
-
),
|
|
950
|
-
ui.div(
|
|
951
|
-
"PyCompound is a Python-based tool designed for performing spectral library matching on either high-resolution mass spectrometry data (HRMS) or low-resolution mass spectrometry data (NRMS). PyCompound offers a range of spectrum preprocessing transformations and similarity measures. These spectrum preprocessing transformations include filtering on mass/charge and/or intensity values, weight factor transformation, low-entropy transformation, centroiding, noise removal, and matching. The available similarity measures include the canonical Cosine similarity measure, three entropy-based similarity measures, and a variety of binary similarity measures: Jaccard, Dice, 3W-Jaccard, Sokal-Sneath, Binary Cosine, Mountford, McConnaughey, Driver-Kroeber, Simpson, Braun-Banquet, Fager-McGowan, Kulczynski, Intersection, Hamming, and Hellinger.",
|
|
952
|
-
style="margin-top:10px; text-align:left; font-size:16px; font-weight:500"
|
|
953
|
-
),
|
|
954
|
-
ui.div(
|
|
955
|
-
"Select options:",
|
|
956
|
-
style="margin-top:30px; text-align:left; font-size:24px; font-weight:bold"
|
|
957
|
-
),
|
|
958
|
-
ui.div(
|
|
959
|
-
ui.input_radio_buttons("chromatography_platform", "Specify chromatography platform:", ["HRMS","NRMS"]),
|
|
960
|
-
style="font-size:18px; margin-top:10px; max-width:none"
|
|
961
|
-
),
|
|
3271
|
+
ui.div("Overview:", style="text-align:left; font-size:24px; font-weight:bold"),
|
|
3272
|
+
ui.div("PyCompound is a Python-based tool designed for performing spectral library matching on either high-resolution mass spectrometry data (HRMS) or low-resolution mass spectrometry data (NRMS). PyCompound offers a range of spectrum preprocessing transformations and similarity measures. These spectrum preprocessing transformations include filtering on mass/charge and/or intensity values, weight factor transformation, low-entropy transformation, centroiding, noise removal, and matching. The available similarity measures include the canonical Cosine similarity measure, three entropy-based similarity measures, and a variety of binary similarity measures: Jaccard, Dice, 3W-Jaccard, Sokal-Sneath, Binary Cosine, Mountford, McConnaughey, Driver-Kroeber, Simpson, Braun-Banquet, Fager-McGowan, Kulczynski, Intersection, Hamming, and Hellinger.", style="margin-top:10px; text-align:left; font-size:16px; font-weight:500"),
|
|
3273
|
+
ui.div("Select options:", style="margin-top:30px; text-align:left; font-size:24px; font-weight:bold"),
|
|
3274
|
+
ui.div(ui.input_radio_buttons("chromatography_platform", "Specify chromatography platform:", ["HRMS","NRMS"]), style="font-size:18px; margin-top:10px; max-width:none"),
|
|
962
3275
|
ui.input_action_button("plot_spectra", "Plot two spectra before and after preprocessing transformations.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
|
|
963
3276
|
ui.input_action_button("run_spec_lib_matching", "Run spectral library matching to perform compound identification on a query library of spectra.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
|
|
964
3277
|
ui.input_action_button("run_parameter_tuning_grid", "Grid search: Tune parameters to maximize accuracy of compound identification given a query library with known spectrum IDs.", style="font-size:18px; padding:20px 40px; width:450px; height:120px; margin-top:10px; margin-right:50px"),
|
|
@@ -1031,36 +3344,36 @@ def server(input, output, session):
|
|
|
1031
3344
|
suffix = in_path.suffix.lower()
|
|
1032
3345
|
|
|
1033
3346
|
try:
|
|
1034
|
-
if suffix == ".
|
|
1035
|
-
|
|
1036
|
-
converted_query_path_rv.set(str(
|
|
3347
|
+
if suffix == ".txt":
|
|
3348
|
+
txt_path = in_path
|
|
3349
|
+
converted_query_path_rv.set(str(txt_path))
|
|
1037
3350
|
else:
|
|
1038
|
-
query_status_rv.set(f"Converting {in_path.name} →
|
|
3351
|
+
query_status_rv.set(f"Converting {in_path.name} → TXT…")
|
|
1039
3352
|
await reactive.flush()
|
|
1040
3353
|
|
|
1041
|
-
|
|
3354
|
+
tmp_txt_path = in_path.with_suffix(".converted.txt")
|
|
1042
3355
|
|
|
1043
|
-
out_obj = await asyncio.to_thread(build_library, str(in_path), str(
|
|
3356
|
+
out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
|
|
1044
3357
|
|
|
1045
3358
|
if isinstance(out_obj, (str, os.PathLike, Path)):
|
|
1046
|
-
|
|
3359
|
+
txt_path = Path(out_obj)
|
|
1047
3360
|
elif isinstance(out_obj, pd.DataFrame):
|
|
1048
|
-
out_obj.to_csv(
|
|
1049
|
-
|
|
3361
|
+
out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
|
|
3362
|
+
txt_path = tmp_txt_path
|
|
1050
3363
|
else:
|
|
1051
3364
|
raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
|
|
1052
3365
|
|
|
1053
|
-
converted_query_path_rv.set(str(
|
|
3366
|
+
converted_query_path_rv.set(str(txt_path))
|
|
1054
3367
|
|
|
1055
|
-
query_status_rv.set(f"Reading IDs from: {
|
|
3368
|
+
query_status_rv.set(f"Reading IDs from: {txt_path.name} …")
|
|
1056
3369
|
await reactive.flush()
|
|
1057
3370
|
|
|
1058
|
-
ids = await asyncio.to_thread(extract_first_column_ids, str(
|
|
3371
|
+
ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
|
|
1059
3372
|
query_ids_rv.set(ids)
|
|
1060
3373
|
|
|
1061
3374
|
ui.update_selectize("spectrum_ID1", choices=ids, selected=(ids[0] if ids else None))
|
|
1062
3375
|
|
|
1063
|
-
query_status_rv.set(f"✅ Loaded {len(ids)} IDs from {
|
|
3376
|
+
query_status_rv.set(f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}")
|
|
1064
3377
|
await reactive.flush()
|
|
1065
3378
|
|
|
1066
3379
|
except Exception as e:
|
|
@@ -1080,37 +3393,37 @@ def server(input, output, session):
|
|
|
1080
3393
|
suffix = in_path.suffix.lower()
|
|
1081
3394
|
|
|
1082
3395
|
try:
|
|
1083
|
-
if suffix == ".
|
|
1084
|
-
|
|
1085
|
-
converted_reference_path_rv.set(str(
|
|
3396
|
+
if suffix == ".txt":
|
|
3397
|
+
txt_path = in_path
|
|
3398
|
+
converted_reference_path_rv.set(str(txt_path))
|
|
1086
3399
|
else:
|
|
1087
|
-
reference_status_rv.set(f"Converting {in_path.name} →
|
|
3400
|
+
reference_status_rv.set(f"Converting {in_path.name} → TXT…")
|
|
1088
3401
|
await reactive.flush()
|
|
1089
3402
|
|
|
1090
|
-
|
|
3403
|
+
tmp_txt_path = in_path.with_suffix(".converted.txt")
|
|
1091
3404
|
|
|
1092
|
-
out_obj = await asyncio.to_thread(build_library, str(in_path), str(
|
|
3405
|
+
out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
|
|
1093
3406
|
|
|
1094
3407
|
if isinstance(out_obj, (str, os.PathLike, Path)):
|
|
1095
|
-
|
|
3408
|
+
txt_path = Path(out_obj)
|
|
1096
3409
|
elif isinstance(out_obj, pd.DataFrame):
|
|
1097
|
-
out_obj.to_csv(
|
|
1098
|
-
|
|
3410
|
+
out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
|
|
3411
|
+
txt_path = tmp_txt_path
|
|
1099
3412
|
else:
|
|
1100
3413
|
raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
|
|
1101
3414
|
|
|
1102
|
-
converted_reference_path_rv.set(str(
|
|
3415
|
+
converted_reference_path_rv.set(str(txt_path))
|
|
1103
3416
|
|
|
1104
|
-
reference_status_rv.set(f"Reading IDs from: {
|
|
3417
|
+
reference_status_rv.set(f"Reading IDs from: {txt_path.name} …")
|
|
1105
3418
|
await reactive.flush()
|
|
1106
3419
|
|
|
1107
|
-
ids = await asyncio.to_thread(extract_first_column_ids, str(
|
|
3420
|
+
ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
|
|
1108
3421
|
reference_ids_rv.set(ids)
|
|
1109
3422
|
|
|
1110
3423
|
ui.update_selectize("spectrum_ID2", choices=ids, selected=(ids[0] if ids else None))
|
|
1111
3424
|
|
|
1112
3425
|
reference_status_rv.set(
|
|
1113
|
-
f"✅ Loaded {len(ids)} IDs from {
|
|
3426
|
+
f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}"
|
|
1114
3427
|
)
|
|
1115
3428
|
await reactive.flush()
|
|
1116
3429
|
|
|
@@ -1120,7 +3433,7 @@ def server(input, output, session):
|
|
|
1120
3433
|
raise
|
|
1121
3434
|
|
|
1122
3435
|
|
|
1123
|
-
@render.download(filename=lambda: f"plot.
|
|
3436
|
+
@render.download(filename=lambda: f"plot.svg")
|
|
1124
3437
|
def run_btn_plot_spectra():
|
|
1125
3438
|
spectrum_ID1 = input.spectrum_ID1() or None
|
|
1126
3439
|
spectrum_ID2 = input.spectrum_ID2() or None
|
|
@@ -1132,22 +3445,20 @@ def server(input, output, session):
|
|
|
1132
3445
|
if input.high_quality_reference_library() != 'False':
|
|
1133
3446
|
high_quality_reference_library_tmp2 = True
|
|
1134
3447
|
|
|
1135
|
-
print(input.high_quality_reference_library())
|
|
1136
|
-
print(high_quality_reference_library_tmp2)
|
|
1137
|
-
|
|
1138
3448
|
if input.chromatography_platform() == "HRMS":
|
|
1139
|
-
fig = generate_plots_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, similarity_measure=input.similarity_measure(), weights=weights, spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
|
|
3449
|
+
fig = generate_plots_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), weights=weights, spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
|
|
1140
3450
|
plt.show()
|
|
1141
3451
|
elif input.chromatography_platform() == "NRMS":
|
|
1142
|
-
fig = generate_plots_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
|
|
3452
|
+
fig = generate_plots_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
|
|
1143
3453
|
plt.show()
|
|
1144
3454
|
with io.BytesIO() as buf:
|
|
1145
|
-
fig.savefig(buf, format="
|
|
3455
|
+
fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
|
|
1146
3456
|
plt.close()
|
|
1147
3457
|
yield buf.getvalue()
|
|
1148
3458
|
|
|
1149
3459
|
|
|
1150
3460
|
|
|
3461
|
+
|
|
1151
3462
|
@render.download(filename="identification_output.txt")
|
|
1152
3463
|
async def run_btn_spec_lib_matching():
|
|
1153
3464
|
match_log_rv.set("Running identification...\n")
|
|
@@ -1160,7 +3471,7 @@ def server(input, output, session):
|
|
|
1160
3471
|
hq = bool(hq)
|
|
1161
3472
|
|
|
1162
3473
|
weights = [float(weight.strip()) for weight in input.weights().split(",") if weight.strip()]
|
|
1163
|
-
weights = {'Cosine':weights[0], 'Shannon':weights[1], 'Renyi':weights[2], 'Tsallis':weights[3]}
|
|
3474
|
+
weights = {'Cosine': weights[0], 'Shannon': weights[1], 'Renyi': weights[2], 'Tsallis': weights[3]}
|
|
1164
3475
|
|
|
1165
3476
|
common_kwargs = dict(
|
|
1166
3477
|
query_data=input.query_data()[0]["datapath"],
|
|
@@ -1182,37 +3493,81 @@ def server(input, output, session):
|
|
|
1182
3493
|
return_ID_output=True,
|
|
1183
3494
|
)
|
|
1184
3495
|
|
|
3496
|
+
# --- streaming setup (same pattern as your DE block) ---
|
|
1185
3497
|
loop = asyncio.get_running_loop()
|
|
1186
|
-
|
|
3498
|
+
q: asyncio.Queue[str | None] = asyncio.Queue()
|
|
3499
|
+
|
|
3500
|
+
class UIWriter(io.TextIOBase):
|
|
3501
|
+
def write(self, s: str):
|
|
3502
|
+
if s:
|
|
3503
|
+
loop.call_soon_threadsafe(q.put_nowait, s)
|
|
3504
|
+
return len(s)
|
|
3505
|
+
def flush(self): pass
|
|
3506
|
+
|
|
3507
|
+
async def _drain():
|
|
3508
|
+
while True:
|
|
3509
|
+
msg = await q.get()
|
|
3510
|
+
if msg is None:
|
|
3511
|
+
break
|
|
3512
|
+
match_log_rv.set(match_log_rv.get() + msg)
|
|
3513
|
+
await reactive.flush()
|
|
1187
3514
|
|
|
3515
|
+
drain_task = asyncio.create_task(_drain())
|
|
3516
|
+
writer = UIWriter()
|
|
3517
|
+
|
|
3518
|
+
# --- worker wrappers that install redirects INSIDE the thread ---
|
|
3519
|
+
def _run_hrms():
|
|
3520
|
+
with redirect_stdout(writer), redirect_stderr(writer):
|
|
3521
|
+
# optional heartbeat
|
|
3522
|
+
print(">> Starting HRMS identification ...", flush=True)
|
|
3523
|
+
return run_spec_lib_matching_on_HRMS_data_shiny(
|
|
3524
|
+
precursor_ion_mz_tolerance=input.precursor_ion_mz_tolerance(),
|
|
3525
|
+
ionization_mode=input.ionization_mode(),
|
|
3526
|
+
adduct=input.adduct(),
|
|
3527
|
+
window_size_centroiding=input.window_size_centroiding(),
|
|
3528
|
+
window_size_matching=input.window_size_matching(),
|
|
3529
|
+
**common_kwargs
|
|
3530
|
+
)
|
|
3531
|
+
|
|
3532
|
+
def _run_nrms():
|
|
3533
|
+
with redirect_stdout(writer), redirect_stderr(writer):
|
|
3534
|
+
print(">> Starting NRMS identification ...", flush=True)
|
|
3535
|
+
return run_spec_lib_matching_on_NRMS_data_shiny(**common_kwargs)
|
|
3536
|
+
|
|
3537
|
+
# --- run in worker thread and stream output live ---
|
|
1188
3538
|
try:
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
window_size_matching=input.window_size_matching(),
|
|
1195
|
-
**common_kwargs
|
|
1196
|
-
)
|
|
1197
|
-
else:
|
|
1198
|
-
df_out = await asyncio.to_thread(run_spec_lib_matching_on_NRMS_data, **common_kwargs)
|
|
3539
|
+
if input.chromatography_platform() == "HRMS":
|
|
3540
|
+
df_out = await asyncio.to_thread(_run_hrms)
|
|
3541
|
+
else:
|
|
3542
|
+
df_out = await asyncio.to_thread(_run_nrms)
|
|
3543
|
+
|
|
1199
3544
|
match_log_rv.set(match_log_rv.get() + "\n✅ Identification finished.\n")
|
|
1200
3545
|
await reactive.flush()
|
|
3546
|
+
|
|
1201
3547
|
except Exception as e:
|
|
1202
|
-
|
|
3548
|
+
import traceback
|
|
3549
|
+
tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
3550
|
+
match_log_rv.set(match_log_rv.get() + f"\n❌ {type(e).__name__}: {e}\n{tb}\n")
|
|
1203
3551
|
await reactive.flush()
|
|
3552
|
+
# make sure to stop the drainer before re-raising
|
|
3553
|
+
await q.put(None); await drain_task
|
|
1204
3554
|
raise
|
|
1205
3555
|
|
|
1206
|
-
|
|
3556
|
+
finally:
|
|
3557
|
+
await q.put(None)
|
|
3558
|
+
await drain_task
|
|
3559
|
+
|
|
3560
|
+
yield df_out.to_csv(index=True, sep="\t")
|
|
3561
|
+
|
|
1207
3562
|
|
|
1208
3563
|
|
|
1209
3564
|
|
|
1210
|
-
@render.download(filename="plot.
|
|
3565
|
+
@render.download(filename="plot.svg")
|
|
1211
3566
|
def run_btn_plot_spectra_within_spec_lib_matching():
|
|
1212
3567
|
req(input.query_data(), input.reference_data())
|
|
1213
3568
|
|
|
1214
|
-
spectrum_ID1 = input.
|
|
1215
|
-
spectrum_ID2 = input.
|
|
3569
|
+
spectrum_ID1 = input.q_spec() or None
|
|
3570
|
+
spectrum_ID2 = input.r_spec() or None
|
|
1216
3571
|
|
|
1217
3572
|
hq = input.high_quality_reference_library()
|
|
1218
3573
|
if isinstance(hq, str):
|
|
@@ -1228,6 +3583,8 @@ def server(input, output, session):
|
|
|
1228
3583
|
reference_data=input.reference_data()[0]['datapath'],
|
|
1229
3584
|
spectrum_ID1=spectrum_ID1,
|
|
1230
3585
|
spectrum_ID2=spectrum_ID2,
|
|
3586
|
+
print_url_spectrum1=input.print_url_spectrum1(),
|
|
3587
|
+
print_url_spectrum2=input.print_url_spectrum2(),
|
|
1231
3588
|
similarity_measure=input.similarity_measure(),
|
|
1232
3589
|
weights=weights,
|
|
1233
3590
|
spectrum_preprocessing_order=input.spectrum_preprocessing_order(),
|
|
@@ -1253,7 +3610,7 @@ def server(input, output, session):
|
|
|
1253
3610
|
plt.show()
|
|
1254
3611
|
|
|
1255
3612
|
with io.BytesIO() as buf:
|
|
1256
|
-
fig.savefig(buf, format="
|
|
3613
|
+
fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
|
|
1257
3614
|
plt.close()
|
|
1258
3615
|
yield buf.getvalue()
|
|
1259
3616
|
|
|
@@ -1291,6 +3648,9 @@ def server(input, output, session):
|
|
|
1291
3648
|
|
|
1292
3649
|
try:
|
|
1293
3650
|
if input.chromatography_platform() == "HRMS":
|
|
3651
|
+
precursor_ion_mz_tolerance = float(input.precursor_ion_mz_tolerance())
|
|
3652
|
+
ionization_mode = str(input.ionization_mode())
|
|
3653
|
+
adduct = str(input.adduct())
|
|
1294
3654
|
window_size_centroiding_tmp = strip_numeric(input.window_size_centroiding())
|
|
1295
3655
|
window_size_matching_tmp = strip_numeric(input.window_size_matching())
|
|
1296
3656
|
grid = {
|
|
@@ -1310,7 +3670,7 @@ def server(input, output, session):
|
|
|
1310
3670
|
'window_size_centroiding': window_size_centroiding_tmp,
|
|
1311
3671
|
'window_size_matching': window_size_matching_tmp,
|
|
1312
3672
|
}
|
|
1313
|
-
df_out = await asyncio.to_thread(_run_with_redirects, tune_params_on_HRMS_data_grid_shiny, rw, **common_kwargs, grid=grid)
|
|
3673
|
+
df_out = await asyncio.to_thread(_run_with_redirects, tune_params_on_HRMS_data_grid_shiny, rw, **common_kwargs, grid=grid, precursor_ion_mz_tolerance=precursor_ion_mz_tolerance, ionization_mode=ionization_mode, adduct=adduct)
|
|
1314
3674
|
else:
|
|
1315
3675
|
grid = {
|
|
1316
3676
|
'similarity_measure': similarity_measure_tmp,
|
|
@@ -1338,7 +3698,7 @@ def server(input, output, session):
|
|
|
1338
3698
|
is_any_job_running.set(False)
|
|
1339
3699
|
await reactive.flush()
|
|
1340
3700
|
|
|
1341
|
-
yield df_out.to_csv(index=False).encode("utf-8"
|
|
3701
|
+
yield df_out.to_csv(index=False, sep='\t').encode("utf-8")
|
|
1342
3702
|
|
|
1343
3703
|
|
|
1344
3704
|
|
|
@@ -1350,7 +3710,6 @@ def server(input, output, session):
|
|
|
1350
3710
|
is_tuning_DE_running.set(True)
|
|
1351
3711
|
await reactive.flush()
|
|
1352
3712
|
|
|
1353
|
-
# --- helpers ---
|
|
1354
3713
|
def _safe_float(v, default):
|
|
1355
3714
|
try:
|
|
1356
3715
|
if v is None:
|
|
@@ -1360,7 +3719,6 @@ def server(input, output, session):
|
|
|
1360
3719
|
return default
|
|
1361
3720
|
|
|
1362
3721
|
def _iget(id, default=None):
|
|
1363
|
-
# Safe getter for Shiny inputs (avoids SilentException)
|
|
1364
3722
|
if id in input:
|
|
1365
3723
|
try:
|
|
1366
3724
|
return input[id]()
|
|
@@ -1368,7 +3726,6 @@ def server(input, output, session):
|
|
|
1368
3726
|
return default
|
|
1369
3727
|
return default
|
|
1370
3728
|
|
|
1371
|
-
# ---- log plumbing (stdout/stderr -> UI) ----
|
|
1372
3729
|
loop = asyncio.get_running_loop()
|
|
1373
3730
|
q: asyncio.Queue[str | None] = asyncio.Queue()
|
|
1374
3731
|
|
|
@@ -1390,7 +3747,6 @@ def server(input, output, session):
|
|
|
1390
3747
|
drain_task = asyncio.create_task(_drain())
|
|
1391
3748
|
writer = UIWriter()
|
|
1392
3749
|
|
|
1393
|
-
# ---------- SNAPSHOT INPUTS SAFELY ----------
|
|
1394
3750
|
try:
|
|
1395
3751
|
qfile = _iget("query_data")[0]["datapath"]
|
|
1396
3752
|
rfile = _iget("reference_data")[0]["datapath"]
|
|
@@ -1410,17 +3766,13 @@ def server(input, output, session):
|
|
|
1410
3766
|
int_min = _safe_float(_iget("int_min", 0.0), 0.0)
|
|
1411
3767
|
int_max = _safe_float(_iget("int_max", 999_999_999.0), 999_999_999.0)
|
|
1412
3768
|
|
|
1413
|
-
# weights "a,b,c,d"
|
|
1414
3769
|
w_text = _iget("weights", "") or ""
|
|
1415
3770
|
w_list = [float(w.strip()) for w in w_text.split(",") if w.strip()]
|
|
1416
3771
|
w_list = (w_list + [0.0, 0.0, 0.0, 0.0])[:4]
|
|
1417
3772
|
weights = {"Cosine": w_list[0], "Shannon": w_list[1], "Renyi": w_list[2], "Tsallis": w_list[3]}
|
|
1418
3773
|
|
|
1419
|
-
# selected params + bounds
|
|
1420
3774
|
opt_params = tuple(_iget("params", ()) or ())
|
|
1421
3775
|
bounds_dict = {}
|
|
1422
|
-
# populate bounds using the min_/max_ inputs if present, otherwise fall back
|
|
1423
|
-
# to your default PARAMS dicts already defined in your file
|
|
1424
3776
|
param_defaults = PARAMS_HRMS if platform == "HRMS" else PARAMS_NRMS
|
|
1425
3777
|
for p in opt_params:
|
|
1426
3778
|
lo = _safe_float(_iget(f"min_{p}", param_defaults.get(p, (0.0, 1.0))[0]),
|
|
@@ -1431,7 +3783,6 @@ def server(input, output, session):
|
|
|
1431
3783
|
lo, hi = hi, lo
|
|
1432
3784
|
bounds_dict[p] = (lo, hi)
|
|
1433
3785
|
|
|
1434
|
-
# defaults (guarded!)
|
|
1435
3786
|
defaults = {
|
|
1436
3787
|
"window_size_centroiding": _safe_float(_iget("window_size_centroiding", 0.5), 0.5),
|
|
1437
3788
|
"window_size_matching": _safe_float(_iget("window_size_matching", 0.5), 0.5),
|
|
@@ -1454,11 +3805,13 @@ def server(input, output, session):
|
|
|
1454
3805
|
return
|
|
1455
3806
|
|
|
1456
3807
|
def _run():
|
|
1457
|
-
from contextlib import redirect_stdout, redirect_stderr
|
|
1458
3808
|
with redirect_stdout(writer), redirect_stderr(writer):
|
|
1459
3809
|
return tune_params_DE(
|
|
1460
3810
|
query_data=qfile,
|
|
1461
3811
|
reference_data=rfile,
|
|
3812
|
+
precursor_ion_mz_tolerance=float(input.precursor_ion_mz_tolerance()),
|
|
3813
|
+
ionization_mode=input.ionization_mode(),
|
|
3814
|
+
adduct=input.adduct(),
|
|
1462
3815
|
chromatography_platform=input.chromatography_platform(),
|
|
1463
3816
|
similarity_measure=sim,
|
|
1464
3817
|
weights=weights,
|
|
@@ -1516,4 +3869,3 @@ def server(input, output, session):
|
|
|
1516
3869
|
app = App(app_ui, server)
|
|
1517
3870
|
|
|
1518
3871
|
|
|
1519
|
-
|