pycompound 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycompound/plot_spectra.py +72 -110
- pycompound/spec_lib_matching.py +59 -54
- {pycompound-0.1.7.dist-info → pycompound-0.1.9.dist-info}/METADATA +3 -2
- {pycompound-0.1.7.dist-info → pycompound-0.1.9.dist-info}/RECORD +7 -8
- {pycompound-0.1.7.dist-info → pycompound-0.1.9.dist-info}/top_level.txt +0 -1
- app.py +0 -3871
- {pycompound-0.1.7.dist-info → pycompound-0.1.9.dist-info}/WHEEL +0 -0
- {pycompound-0.1.7.dist-info → pycompound-0.1.9.dist-info}/licenses/LICENSE +0 -0
app.py
DELETED
|
@@ -1,3871 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
from shiny import App, ui, reactive, render, req
|
|
3
|
-
from shiny.types import SilentException
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from contextlib import redirect_stdout, redirect_stderr
|
|
6
|
-
import contextlib
|
|
7
|
-
import subprocess
|
|
8
|
-
import traceback
|
|
9
|
-
import asyncio
|
|
10
|
-
import io
|
|
11
|
-
import os
|
|
12
|
-
import sys
|
|
13
|
-
import matplotlib.pyplot as plt
|
|
14
|
-
import pandas as pd
|
|
15
|
-
import numpy as np
|
|
16
|
-
import netCDF4 as nc
|
|
17
|
-
from pyteomics import mgf, mzml
|
|
18
|
-
import ast
|
|
19
|
-
from numbers import Real
|
|
20
|
-
import logging
|
|
21
|
-
from scipy.optimize import differential_evolution
|
|
22
|
-
import scipy
|
|
23
|
-
import scipy.stats
|
|
24
|
-
from itertools import product
|
|
25
|
-
import json
|
|
26
|
-
import re
|
|
27
|
-
import urllib.parse
|
|
28
|
-
import urllib.request
|
|
29
|
-
import matplotlib
|
|
30
|
-
|
|
31
|
-
# Emit SVG text as real <text> elements (not outlined paths) so plot labels
# remain selectable/editable in the saved figures.
matplotlib.rcParams['svg.fonttype'] = 'none'

# Queue of log strings produced during matching; drained by the background
# consumer started in start_log_consumer().
_LOG_QUEUE: asyncio.Queue[str] = asyncio.Queue()

# Matches a trailing adduct annotation on a compound name, e.g. " [M+H]" or
# " M+Na" (optionally followed by "+"), so it can be stripped before lookups.
_ADDUCT_PAT = re.compile(r"\s*(?:\[(M[^\]]+)\]|(M[+-][A-Za-z0-9]+)\+?)\s*$", re.IGNORECASE)
|
|
36
|
-
|
|
37
|
-
def start_log_consumer():
    """Start (at most once) the background task that drains _LOG_QUEUE into the UI log."""
    # Idempotence guard kept as an attribute on the function object itself.
    already_started = getattr(start_log_consumer, "_started", False)
    if already_started:
        return
    start_log_consumer._started = True

    async def _drain_queue():
        # Run forever: append each queued fragment to the reactive log value,
        # then flush so connected clients see the update immediately.
        while True:
            fragment = await _LOG_QUEUE.get()
            match_log_rv.set(match_log_rv.get() + fragment)
            await reactive.flush()

    asyncio.create_task(_drain_queue())
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
# NOTE(review): this is an exact duplicate of the start_log_consumer defined
# earlier in this file; at import time this later definition silently rebinds
# the name (with its own independent "_started" guard attribute). One of the
# two copies should be removed.
def start_log_consumer():
    """Start (at most once) the background task that drains _LOG_QUEUE into the UI log."""
    # Guard attribute on the function object makes repeated calls no-ops.
    if getattr(start_log_consumer, "_started", False):
        return
    start_log_consumer._started = True

    async def _consume():
        # Forever: take the next log string and append it to match_log_rv
        # (presumably a reactive value defined elsewhere in this file — verify),
        # then flush so the UI reflects the change.
        while True:
            s = await _LOG_QUEUE.get()
            match_log_rv.set(match_log_rv.get() + s)
            await reactive.flush()

    asyncio.create_task(_consume())
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def _strip_adduct(name: str) -> str:
|
|
67
|
-
return _ADDUCT_PAT.sub("", name).strip()
|
|
68
|
-
|
|
69
|
-
def get_pubchem_url(query: str) -> str:
    """Best-effort PubChem link for a compound name.

    Tries the PUG REST name->CID lookup first and returns the exact compound
    page when a numeric CID comes back; on any failure (network error, no hit,
    non-numeric response) falls back to a PubChem search URL for the name.
    """
    base_name = _strip_adduct(query)
    quoted = urllib.parse.quote(base_name)
    endpoint = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/" + quoted + "/cids/TXT"
    try:
        with urllib.request.urlopen(endpoint, timeout=10) as response:
            body = response.read().decode("utf-8").strip()
        candidate_cid = body.splitlines()[0].strip()
        if candidate_cid.isdigit():
            return f"https://pubchem.ncbi.nlm.nih.gov/compound/{candidate_cid}"
    except Exception:
        # Deliberate best-effort: any lookup problem falls through to the search URL.
        pass
    return f"https://pubchem.ncbi.nlm.nih.gov/#query={quoted}"
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def build_library_from_raw_data(input_path=None, output_path=None, is_reference=False):
    """Validate the arguments for converting a raw spectra file into a CSV library.

    Checks that input_path is given, that is_reference is a bool, and that the
    file extension is one of mgf/mzML/cdf/json/msp (exact-case variants only).
    When output_path is omitted it defaults to <basename>.csv in the current
    working directory. Prints an error and exits on any invalid argument.
    """
    if input_path is None:
        print('Error: please specify input_path (i.e. the path to the input mgf, mzML, cdf, json, or msp file). Mandatory argument.')
        sys.exit()

    if output_path is None:
        # Derive <basename>.csv in the CWD from the last path component.
        basename = input_path.split('/')[-1].split('.')[0]
        output_path = f'{Path.cwd()}/{basename}.csv'
        print(f'Warning: no output_path specified, so library is written to {output_path}')

    if is_reference not in (True, False):
        print('Error: is_reference must be either \'True\' or \'False\'.')
        sys.exit()

    # Classify the input by its trailing characters (only these exact case
    # variants are accepted, matching the original behavior).
    if input_path.endswith(('mgf', 'MGF')):
        input_file_type = 'mgf'
    elif input_path.endswith(('mzML', 'mzml', 'MZML')):
        input_file_type = 'mzML'
    elif input_path.endswith(('json', 'JSON')):
        input_file_type = 'json'
    elif input_path.endswith(('cdf', 'CDF')):
        input_file_type = 'cdf'
    elif input_path.endswith(('msp', 'MSP')):
        input_file_type = 'msp'
    else:
        print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'json\', or \'msp\' file must be passed to --input_path')
        sys.exit()
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
def generate_plots_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz=None, precursor_ion_mz_tolerance=None, ionization_mode=None, collision_energy=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
    """Plot a head-to-tail comparison of two HRMS spectra before and after preprocessing.

    Loads the two spectra (by ID) from query_data/reference_data — TXT libraries,
    or raw mgf/mzML/cdf/msp/json files which are first converted via
    build_library_from_raw_data — applies the requested preprocessing steps,
    computes a similarity score, and writes a two-panel SVG (untransformed on
    top, transformed below) to output_path.

    spectrum_preprocessing_order is a string over {F,C,N,M,W,L}: Filter,
    Centroid, Noise removal, Match, Weight-factor transform, Low-entropy
    transform. 'M' is mandatory and 'C' must precede it. weights defaults to
    equal quarters for the 'mixture' similarity (the default is materialized
    inside the function to avoid the shared-mutable-default pitfall).

    Returns the matplotlib Figure when return_plot is True, otherwise None.
    Exits via sys.exit() after printing a message on invalid arguments.
    """
    if weights is None:
        weights = {'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF','msp','MSP','json','JSON'):
            # Raw data is first converted into the tab-separated library format.
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_query = pd.read_csv(query_data, sep='\t')
        unique_query_ids = df_query['id'].unique().tolist()
        unique_query_ids = [str(tmp) for tmp in unique_query_ids]

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        extension = reference_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF','msp','MSP','json','JSON'):
            output_path_tmp = reference_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
            df_reference = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_reference = pd.read_csv(reference_data, sep='\t')
        cols_tmp = df_reference.columns.tolist()
        if 'precursor_ion_mz' in cols_tmp and 'ionization_mode' in cols_tmp and 'collision_energy' in cols_tmp:
            if precursor_ion_mz is not None and precursor_ion_mz_tolerance is not None:
                # FIX: each comparison must be parenthesized — '&' binds tighter
                # than '>'/'<', so the original unparenthesized expression was
                # evaluated as a > (lo & a) < hi and did not filter correctly.
                df_reference = df_reference.loc[(df_reference['precursor_ion_mz'] > (precursor_ion_mz-precursor_ion_mz_tolerance)) & (df_reference['precursor_ion_mz'] < (precursor_ion_mz+precursor_ion_mz_tolerance))]
            if ionization_mode is not None:
                # FIX: the original indexed with the constant expression
                # 'ionization_mode'==ionization_mode (a string comparison, not a
                # column comparison); compare the column instead.
                df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
            if collision_energy is not None:
                # FIX: same column-vs-string comparison bug as above.
                df_reference = df_reference.loc[df_reference['collision_energy']==collision_energy]
            df_reference = df_reference.drop(columns=['precursor_ion_mz','ionization_mode','collision_energy'])
        unique_reference_ids = df_reference['id'].unique().tolist()
        unique_reference_ids = [str(tmp) for tmp in unique_reference_ids]

    if spectrum_ID1 is not None:
        spectrum_ID1 = str(spectrum_ID1)
    else:
        spectrum_ID1 = str(df_query['id'].iloc[0])
        print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')

    if spectrum_ID2 is not None:
        spectrum_ID2 = str(spectrum_ID2)
    else:
        spectrum_ID2 = str(df_reference['id'].iloc[0])
        print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')

    if spectrum_preprocessing_order is not None:
        spectrum_preprocessing_order = list(spectrum_preprocessing_order)
    else:
        spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
    if 'M' not in spectrum_preprocessing_order:
        print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
        sys.exit()
    if 'C' in spectrum_preprocessing_order:
        # Centroiding must precede matching: matching assumes centroided peaks.
        if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
            print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
            sys.exit()
    if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
        print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
        sys.exit()

    if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
        print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
        sys.exit()

    # Numeric-argument validation; ints are promoted to floats where floats are expected.
    if isinstance(int_min,int) is True:
        int_min = float(int_min)
    if isinstance(int_max,int) is True:
        int_max = float(int_max)
    if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
        print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
        sys.exit()
    if mz_min < 0:
        print('\nError: mz_min should be a non-negative integer')
        sys.exit()
    if mz_max <= 0:
        print('\nError: mz_max should be a positive integer')
        sys.exit()
    if int_min < 0:
        print('\nError: int_min should be a non-negative float')
        sys.exit()
    if int_max <= 0:
        print('\nError: int_max should be a positive float')
        sys.exit()

    if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
        print('Error: window_size_centroiding must be a positive float.')
        sys.exit()
    if isinstance(window_size_matching,float) is False or window_size_matching <= 0.0:
        print('Error: window_size_matching must be a positive float.')
        sys.exit()

    if isinstance(noise_threshold,int) is True:
        noise_threshold = float(noise_threshold)
    if isinstance(noise_threshold,float) is False or noise_threshold < 0:
        print('Error: noise_threshold must be a positive float.')
        sys.exit()

    if isinstance(wf_intensity,int) is True:
        wf_intensity = float(wf_intensity)
    if isinstance(wf_mz,int) is True:
        wf_mz = float(wf_mz)
    if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
        print('Error: wf_mz and wf_intensity must be integers or floats')
        sys.exit()

    if entropy_dimension <= 0:
        print('\nError: entropy_dimension should be a positive float')
        sys.exit()

    # Only 'standard' normalization is supported; softmax was tried and caused
    # numerical errors/warnings when turning intensities into probabilities.
    normalization_method = 'standard'

    if y_axis_transformation not in ['normalized','none','log10','sqrt']:
        print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
        sys.exit()

    if output_path is None:
        print(f'Warning: plots will be saved to the svg ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
        output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'

    # Locate the two spectra: both IDs in the query library, both in the
    # reference library, or one in each (swapped so ID1 refers to the query).
    if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
        query_idx = unique_query_ids.index(spectrum_ID1)
        reference_idx = unique_query_ids.index(spectrum_ID2)
        q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[query_idx])[0]
        r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[reference_idx])[0]
        q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
    elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
        query_idx = unique_reference_ids.index(spectrum_ID1)
        reference_idx = unique_reference_ids.index(spectrum_ID2)
        q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[query_idx])[0]
        r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[reference_idx])[0]
        q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
    else:
        if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
            spectrum_ID1, spectrum_ID2 = spectrum_ID2, spectrum_ID1
        query_idx = unique_query_ids.index(spectrum_ID1)
        reference_idx = unique_reference_ids.index(spectrum_ID2)
        q_idxs_tmp = np.where(df_query['id'].astype(str) == unique_query_ids[query_idx])[0]
        r_idxs_tmp = np.where(df_reference['id'].astype(str) == unique_reference_ids[reference_idx])[0]
        q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))

    # Copies for the "untransformed" top panel; only the display y-axis
    # transformation is applied to these, never the preprocessing steps.
    q_spec_pre_trans = q_spec.copy()
    r_spec_pre_trans = r_spec.copy()
    q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
    r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)

    if y_axis_transformation == 'normalized':
        q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
        r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
        ylab = 'Normalized Intensity'
    elif y_axis_transformation == 'log10':
        q_spec_pre_trans[:,1] = np.log10(np.array(q_spec_pre_trans[:,1]+1,dtype=float))
        r_spec_pre_trans[:,1] = np.log10(np.array(r_spec_pre_trans[:,1]+1,dtype=float))
        ylab = 'log10(Intensity)'
    elif y_axis_transformation == 'sqrt':
        q_spec_pre_trans[:,1] = np.sqrt(np.array(q_spec_pre_trans[:,1],dtype=float))
        r_spec_pre_trans[:,1] = np.sqrt(np.array(r_spec_pre_trans[:,1],dtype=float))
        ylab = 'sqrt(Intensity)'
    else:
        ylab = 'Raw Intensity'

    fig, axes = plt.subplots(nrows=2, ncols=1)

    # Top panel: head-to-tail plot of the raw spectra (query up, reference down).
    plt.subplot(2,1,1)
    plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*q_spec_pre_trans.shape[0], ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID 1: {spectrum_ID1}')
    plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*r_spec_pre_trans.shape[0], ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID 2: {spectrum_ID2}')
    plt.xlabel('m/z',fontsize=7)
    plt.ylabel(ylab, fontsize=7)
    plt.xticks(fontsize=7)
    plt.yticks(fontsize=7)
    plt.title('Untransformed Spectra', fontsize=10)

    # Raw-scale ranges for the annotation text, captured before preprocessing
    # mutates q_spec/r_spec.
    mz_min_tmp_q = round(q_spec[:,0].min(),1)
    mz_min_tmp_r = round(r_spec[:,0].min(),1)
    int_min_tmp_q = round(q_spec[:,1].min(),1)
    int_min_tmp_r = round(r_spec[:,1].min(),1)
    mz_max_tmp_q = round(q_spec[:,0].max(),1)
    mz_max_tmp_r = round(r_spec[:,0].max(),1)
    int_max_tmp_q = round(q_spec[:,1].max(),1)
    int_max_tmp_r = round(r_spec[:,1].max(),1)
    mz_min_tmp = min([mz_min_tmp_q,mz_min_tmp_r])
    # FIX: the upper end of the displayed m/z range should be the max across
    # both spectra (the original used min, inconsistent with the intensity range).
    mz_max_tmp = max([mz_max_tmp_q,mz_max_tmp_r])
    int_min_tmp = min([int_min_tmp_q,int_min_tmp_r])
    int_max_tmp = max([int_max_tmp_q,int_max_tmp_r])

    # Apply the requested preprocessing steps in order. Each step is skipped
    # once either spectrum has fewer than two peaks. FIX: peak counts live on
    # axis 0 of the N x 2 arrays, so the guards test r_spec.shape[0]; the
    # original tested shape[1], which is always 2 and hence always true.
    is_matched = False
    for transformation in spectrum_preprocessing_order:
        if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
            q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
            r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
        if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
            m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
            q_spec = m_spec[:,0:2]
            r_spec = m_spec[:,[0,2]]
            is_matched = True
        if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
            q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
            r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
        if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
            q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
            r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
        if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
            q_spec = remove_noise(q_spec, nr = noise_threshold)
            # A high-quality reference library is trusted as-is (no noise removal).
            if high_quality_reference_library == False or high_quality_reference_library == 'False':
                r_spec = remove_noise(r_spec, nr = noise_threshold)
        if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
            q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
            if high_quality_reference_library == False or high_quality_reference_library == 'False':
                r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)

    q_ints = q_spec[:,1]
    r_ints = r_spec[:,1]

    # Similarity is only meaningful when both spectra retain signal after preprocessing.
    if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
        similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
    else:
        similarity_score = 0

    # Bottom panel: the spectra after preprocessing, or an explanatory message
    # when preprocessing removed everything / zeroed all intensities.
    plt.subplot(2,1,2)

    if q_spec.shape[0] > 1:
        if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
            plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
            plt.xticks([])
            plt.yticks([])
        else:
            if y_axis_transformation == 'normalized':
                q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
                r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
                ylab='Normalized Intensity'
            elif y_axis_transformation == 'log10':
                q_spec[:,1] = np.log10(q_spec[:,1]+1)
                r_spec[:,1] = np.log10(r_spec[:,1]+1)
                ylab='log10(Intensity)'
            elif y_axis_transformation == 'sqrt':
                q_spec[:,1] = np.sqrt(q_spec[:,1])
                r_spec[:,1] = np.sqrt(r_spec[:,1])
                ylab='sqrt(Intensity)'
            else:
                ylab = 'Raw Intensity'
            plt.vlines(x=q_spec[:,0], ymin=[0]*q_spec.shape[0], ymax=q_spec[:,1], linewidth=3, color='blue')
            plt.vlines(x=r_spec[:,0], ymin=[0]*r_spec.shape[0], ymax=-r_spec[:,1], linewidth=3, color='red')
            plt.xlabel('m/z', fontsize=7)
            plt.ylabel(ylab, fontsize=7)
            plt.xticks(fontsize=7)
            plt.yticks(fontsize=7)
            plt.title(f'Transformed Spectra', fontsize=10)
    else:
        plt.text(0.5, 0.5, 'All points in the spectra were removed during preprocessing. \nChange the spectrum_preprocesing_order and/or change other spectrum-preprocessing parameters.', ha='center', va='center', fontsize=7, color='black')
        plt.xticks([])
        plt.yticks([])

    plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
    plt.figlegend(loc='upper center')

    # Left column of run-parameter annotations.
    fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
    fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
    fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
    fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
    fig.text(0.05, 0.08, f'Window Size (Centroiding): {window_size_centroiding}', fontsize=7)
    fig.text(0.05, 0.05, f'Window Size (Matching): {window_size_matching}', fontsize=7)
    if similarity_measure == 'mixture':
        fig.text(0.05, 0.02, f'Weights for mixture similarity: {weights}', fontsize=7)

    # Right column of run-parameter annotations.
    fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{mz_min_tmp},{mz_max_tmp}]', fontsize=7)
    fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
    fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
    fig.text(0.40, 0.11, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
    fig.text(0.40, 0.08, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)

    # Clickable PubChem links, rendered only when requested.
    if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
        url_tmp1 = get_pubchem_url(query=spectrum_ID1)
        url_tmp2 = get_pubchem_url(query=spectrum_ID2)
        t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
        t2 = fig.text(0.40, 0.02, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
        t1.set_url(url_tmp1)
        t2.set_url(url_tmp2)

    if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
        url_tmp1 = get_pubchem_url(query=spectrum_ID1)
        t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
        t1.set_url(url_tmp1)

    if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
        url_tmp2 = get_pubchem_url(query=spectrum_ID2)
        t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
        t2.set_url(url_tmp2)

    fig.savefig(output_path, format='svg')

    if return_plot == True:
        return fig
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FNLW', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
|
|
433
|
-
|
|
434
|
-
if query_data is None:
|
|
435
|
-
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
|
|
436
|
-
sys.exit()
|
|
437
|
-
else:
|
|
438
|
-
extension = query_data.rsplit('.',1)
|
|
439
|
-
extension = extension[(len(extension)-1)]
|
|
440
|
-
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
|
|
441
|
-
output_path_tmp = query_data[:-3] + 'txt'
|
|
442
|
-
build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
|
|
443
|
-
df_query = pd.read_csv(output_path_tmp, sep='\t')
|
|
444
|
-
if extension == 'txt' or extension == 'TXT':
|
|
445
|
-
df_query = pd.read_csv(query_data, sep='\t')
|
|
446
|
-
unique_query_ids = df_query['id'].unique()
|
|
447
|
-
|
|
448
|
-
if reference_data is None:
|
|
449
|
-
print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
|
|
450
|
-
sys.exit()
|
|
451
|
-
else:
|
|
452
|
-
extension = reference_data.rsplit('.',1)
|
|
453
|
-
extension = extension[(len(extension)-1)]
|
|
454
|
-
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
|
|
455
|
-
output_path_tmp = reference_data[:-3] + 'txt'
|
|
456
|
-
build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
|
|
457
|
-
df_reference = pd.read_csv(output_path_tmp, sep='\t')
|
|
458
|
-
if extension == 'txt' or extension == 'TXT':
|
|
459
|
-
df_reference = pd.read_csv(reference_data, sep='\t')
|
|
460
|
-
unique_reference_ids = df_reference['id'].unique()
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
if spectrum_ID1 is not None:
|
|
464
|
-
spectrum_ID1 = str(spectrum_ID1)
|
|
465
|
-
else:
|
|
466
|
-
spectrum_ID1 = str(df_query.iloc[0,0])
|
|
467
|
-
print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')
|
|
468
|
-
|
|
469
|
-
if spectrum_ID2 is not None:
|
|
470
|
-
spectrum_ID2 = str(spectrum_ID2)
|
|
471
|
-
else:
|
|
472
|
-
spectrum_ID2 = str(df_reference.iloc[0,0])
|
|
473
|
-
print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')
|
|
474
|
-
|
|
475
|
-
if spectrum_preprocessing_order is not None:
|
|
476
|
-
spectrum_preprocessing_order = list(spectrum_preprocessing_order)
|
|
477
|
-
else:
|
|
478
|
-
spectrum_preprocessing_order = ['F','N','W','L']
|
|
479
|
-
if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
|
|
480
|
-
print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
|
|
481
|
-
sys.exit()
|
|
482
|
-
|
|
483
|
-
if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
|
|
484
|
-
print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
|
|
485
|
-
sys.exit()
|
|
486
|
-
|
|
487
|
-
if isinstance(int_min,int) is True:
|
|
488
|
-
int_min = float(int_min)
|
|
489
|
-
if isinstance(int_max,int) is True:
|
|
490
|
-
int_max = float(int_max)
|
|
491
|
-
if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
|
|
492
|
-
print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
|
|
493
|
-
sys.exit()
|
|
494
|
-
if mz_min < 0:
|
|
495
|
-
print('\nError: mz_min should be a non-negative integer')
|
|
496
|
-
sys.exit()
|
|
497
|
-
if mz_max <= 0:
|
|
498
|
-
print('\nError: mz_max should be a positive integer')
|
|
499
|
-
sys.exit()
|
|
500
|
-
if int_min < 0:
|
|
501
|
-
print('\nError: int_min should be a non-negative float')
|
|
502
|
-
sys.exit()
|
|
503
|
-
if int_max <= 0:
|
|
504
|
-
print('\nError: int_max should be a positive float')
|
|
505
|
-
sys.exit()
|
|
506
|
-
|
|
507
|
-
if isinstance(noise_threshold,int) is True:
|
|
508
|
-
noise_threshold = float(noise_threshold)
|
|
509
|
-
if isinstance(noise_threshold,float) is False or noise_threshold < 0:
|
|
510
|
-
print('Error: noise_threshold must be a positive float.')
|
|
511
|
-
sys.exit()
|
|
512
|
-
|
|
513
|
-
if isinstance(wf_intensity,int) is True:
|
|
514
|
-
wf_intensity = float(wf_intensity)
|
|
515
|
-
if isinstance(wf_mz,int) is True:
|
|
516
|
-
wf_mz = float(wf_mz)
|
|
517
|
-
if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
|
|
518
|
-
print('Error: wf_mz and wf_intensity must be integers or floats')
|
|
519
|
-
sys.exit()
|
|
520
|
-
|
|
521
|
-
if entropy_dimension <= 0:
|
|
522
|
-
print('\nError: entropy_dimension should be a positive float')
|
|
523
|
-
sys.exit()
|
|
524
|
-
else:
|
|
525
|
-
q = entropy_dimension
|
|
526
|
-
|
|
527
|
-
normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
|
|
528
|
-
|
|
529
|
-
if y_axis_transformation not in ['normalized','none','log10','sqrt']:
|
|
530
|
-
print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
|
|
531
|
-
sys.exit()
|
|
532
|
-
|
|
533
|
-
if output_path is None:
|
|
534
|
-
print(f'Warning: plots will be saved to the svg ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
|
|
535
|
-
output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'
|
|
536
|
-
|
|
537
|
-
min_mz = np.min([df_query['mz_ratio'].min(), df_reference['mz_ratio'].min()])
|
|
538
|
-
max_mz = np.max([df_query['mz_ratio'].max(), df_reference['mz_ratio'].max()])
|
|
539
|
-
mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
|
|
540
|
-
|
|
541
|
-
unique_query_ids = df_query['id'].unique().tolist()
|
|
542
|
-
unique_reference_ids = df_reference['id'].unique().tolist()
|
|
543
|
-
unique_query_ids = [str(ID) for ID in unique_query_ids]
|
|
544
|
-
unique_reference_ids = [str(ID) for ID in unique_reference_ids]
|
|
545
|
-
common_IDs = np.intersect1d([str(ID) for ID in unique_query_ids], [str(ID) for ID in unique_reference_ids])
|
|
546
|
-
if len(common_IDs) > 0:
|
|
547
|
-
print(f'Warning: the query and reference library have overlapping IDs: {common_IDs}')
|
|
548
|
-
|
|
549
|
-
if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
|
|
550
|
-
q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID1)[0]
|
|
551
|
-
r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID2)[0]
|
|
552
|
-
q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
553
|
-
r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
554
|
-
elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
|
|
555
|
-
q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID1)[0]
|
|
556
|
-
r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID2)[0]
|
|
557
|
-
q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
558
|
-
r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
559
|
-
else:
|
|
560
|
-
if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
|
|
561
|
-
spec_tmp = spectrum_ID1
|
|
562
|
-
spectrum_ID1 = spectrum_ID2
|
|
563
|
-
spectrum_ID2 = spec_tmp
|
|
564
|
-
q_idxs_tmp = np.where(df_query['id'].astype(str) == spectrum_ID1)[0]
|
|
565
|
-
r_idxs_tmp = np.where(df_reference['id'].astype(str) == spectrum_ID2)[0]
|
|
566
|
-
q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
|
|
567
|
-
r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
|
|
568
|
-
|
|
569
|
-
q_spec = convert_spec(q_spec,mzs)
|
|
570
|
-
r_spec = convert_spec(r_spec,mzs)
|
|
571
|
-
|
|
572
|
-
int_min_tmp_q = min(q_spec[q_spec[:,1].nonzero(),1][0])
|
|
573
|
-
int_min_tmp_r = min(r_spec[r_spec[:,1].nonzero(),1][0])
|
|
574
|
-
int_max_tmp_q = max(q_spec[q_spec[:,1].nonzero(),1][0])
|
|
575
|
-
int_max_tmp_r = max(r_spec[r_spec[:,1].nonzero(),1][0])
|
|
576
|
-
int_min_tmp = int(min([int_min_tmp_q,int_min_tmp_r]))
|
|
577
|
-
int_max_tmp = int(max([int_max_tmp_q,int_max_tmp_r]))
|
|
578
|
-
|
|
579
|
-
fig, axes = plt.subplots(nrows=2, ncols=1)
|
|
580
|
-
|
|
581
|
-
plt.subplot(2,1,1)
|
|
582
|
-
|
|
583
|
-
if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
|
|
584
|
-
plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
585
|
-
plt.xticks([])
|
|
586
|
-
plt.yticks([])
|
|
587
|
-
else:
|
|
588
|
-
q_spec_pre_trans = q_spec.copy()
|
|
589
|
-
r_spec_pre_trans = r_spec.copy()
|
|
590
|
-
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
|
|
591
|
-
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
|
|
592
|
-
|
|
593
|
-
if y_axis_transformation == 'normalized':
|
|
594
|
-
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
|
|
595
|
-
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
|
|
596
|
-
ylab = 'Normalized Intensity'
|
|
597
|
-
elif y_axis_transformation == 'log10':
|
|
598
|
-
q_spec_pre_trans[:,1] = np.log10(q_spec_pre_trans[:,1]+1)
|
|
599
|
-
r_spec_pre_trans[:,1] = np.log10(r_spec_pre_trans[:,1]+1)
|
|
600
|
-
ylab = 'log10(Intensity)'
|
|
601
|
-
elif y_axis_transformation == 'sqrt':
|
|
602
|
-
q_spec_pre_trans[:,1] = np.sqrt(q_spec_pre_trans[:,1])
|
|
603
|
-
r_spec_pre_trans[:,1] = np.sqrt(r_spec_pre_trans[:,1])
|
|
604
|
-
ylab = 'sqrt(Intensity)'
|
|
605
|
-
else:
|
|
606
|
-
ylab = 'Raw Intensity'
|
|
607
|
-
plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*len(q_spec_pre_trans[:,0]), ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID1: {spectrum_ID1}')
|
|
608
|
-
plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*len(r_spec_pre_trans[:,0]), ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID2: {spectrum_ID2}')
|
|
609
|
-
plt.xlabel('m/z',fontsize=7)
|
|
610
|
-
plt.ylabel(ylab, fontsize=7)
|
|
611
|
-
plt.xticks(fontsize=7)
|
|
612
|
-
plt.yticks(fontsize=7)
|
|
613
|
-
plt.title('Untransformed Query and Reference Spectra', fontsize=10)
|
|
614
|
-
|
|
615
|
-
for transformation in spectrum_preprocessing_order:
|
|
616
|
-
if transformation == 'W':
|
|
617
|
-
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
618
|
-
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
619
|
-
if transformation == 'L':
|
|
620
|
-
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method)
|
|
621
|
-
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method)
|
|
622
|
-
if transformation == 'N':
|
|
623
|
-
q_spec = remove_noise(q_spec, nr = noise_threshold)
|
|
624
|
-
if high_quality_reference_library == False or high_quality_reference_library == 'False':
|
|
625
|
-
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
626
|
-
if transformation == 'F':
|
|
627
|
-
q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
628
|
-
if high_quality_reference_library == False or high_quality_reference_library == 'False':
|
|
629
|
-
r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
630
|
-
|
|
631
|
-
if q_spec.shape[0] > 1:
|
|
632
|
-
similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
|
|
633
|
-
else:
|
|
634
|
-
similarity_score = 0
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
plt.subplot(2,1,2)
|
|
638
|
-
|
|
639
|
-
if q_spec.shape[0] == 0 or r_spec.shape[0] == 0:
|
|
640
|
-
plt.text(0.5, 0.5, 'The query and/or reference spectrum has no ion fragments left after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
641
|
-
plt.xticks([])
|
|
642
|
-
plt.yticks([])
|
|
643
|
-
elif np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
|
|
644
|
-
plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
645
|
-
plt.xticks([])
|
|
646
|
-
plt.yticks([])
|
|
647
|
-
else:
|
|
648
|
-
if y_axis_transformation == 'normalized':
|
|
649
|
-
q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
|
|
650
|
-
r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
|
|
651
|
-
ylab='Normalized Intensity'
|
|
652
|
-
elif y_axis_transformation == 'log10':
|
|
653
|
-
q_spec[:,1] = np.log10(q_spec[:,1]+1)
|
|
654
|
-
r_spec[:,1] = np.log10(r_spec[:,1]+1)
|
|
655
|
-
ylab='log10(Intensity)'
|
|
656
|
-
elif y_axis_transformation == 'sqrt':
|
|
657
|
-
q_spec[:,1] = np.sqrt(q_spec[:,1])
|
|
658
|
-
r_spec[:,1] = np.sqrt(r_spec[:,1])
|
|
659
|
-
ylab='sqrt(Intensity)'
|
|
660
|
-
else:
|
|
661
|
-
ylab = 'Raw Intensity'
|
|
662
|
-
plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=q_spec[:,1], linewidth=3, color='blue')
|
|
663
|
-
plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=-r_spec[:,1], linewidth=3, color='red')
|
|
664
|
-
plt.xlabel('m/z', fontsize=7)
|
|
665
|
-
plt.ylabel(ylab, fontsize=7)
|
|
666
|
-
plt.xticks(fontsize=7)
|
|
667
|
-
plt.yticks(fontsize=7)
|
|
668
|
-
plt.title(f'Transformed Query and Reference Spectra', fontsize=10)
|
|
669
|
-
|
|
670
|
-
plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
|
|
671
|
-
plt.figlegend(loc='upper center')
|
|
672
|
-
|
|
673
|
-
fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
|
|
674
|
-
fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
|
|
675
|
-
fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
|
|
676
|
-
fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
|
|
677
|
-
fig.text(0.05, 0.08, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
|
|
678
|
-
if similarity_measure == 'mixture':
|
|
679
|
-
fig.text(0.05, 0.05, f'Weights for mixture similarity: {weights}', fontsize=7)
|
|
680
|
-
|
|
681
|
-
fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{min_mz},{max_mz}]', fontsize=7)
|
|
682
|
-
fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
|
|
683
|
-
fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
|
|
684
|
-
fig.text(0.40, 0.11, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
|
|
685
|
-
|
|
686
|
-
if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
|
|
687
|
-
url_tmp1 = get_pubchem_url(query=spectrum_ID1)
|
|
688
|
-
url_tmp2 = get_pubchem_url(query=spectrum_ID2)
|
|
689
|
-
t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
|
|
690
|
-
t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
|
|
691
|
-
t1.set_url(url_tmp1)
|
|
692
|
-
t2.set_url(url_tmp2)
|
|
693
|
-
|
|
694
|
-
if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
|
|
695
|
-
url_tmp1 = get_pubchem_url(query=spectrum_ID1)
|
|
696
|
-
t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
|
|
697
|
-
t1.set_url(url_tmp1)
|
|
698
|
-
|
|
699
|
-
if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
|
|
700
|
-
url_tmp2 = get_pubchem_url(query=spectrum_ID2)
|
|
701
|
-
t2 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
|
|
702
|
-
t2.set_url(url_tmp2)
|
|
703
|
-
|
|
704
|
-
fig.savefig(output_path, format='svg')
|
|
705
|
-
|
|
706
|
-
if return_plot == True:
|
|
707
|
-
return fig
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
def wf_transform(spec_mzs, spec_ints, wf_mz, wf_int):
|
|
711
|
-
spec_ints = np.power(spec_mzs, wf_mz) * np.power(spec_ints, wf_int)
|
|
712
|
-
return(spec_ints)
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
def LE_transform(intensity, thresh, normalization_method):
|
|
716
|
-
intensity_tmp = normalize(intensity, method=normalization_method)
|
|
717
|
-
if np.sum(intensity_tmp) > 0:
|
|
718
|
-
S = scipy.stats.entropy(intensity_tmp.astype('float'))
|
|
719
|
-
if S > 0 and S < thresh:
|
|
720
|
-
w = (1 + S) / (1 + thresh)
|
|
721
|
-
intensity = np.power(intensity_tmp, w)
|
|
722
|
-
else:
|
|
723
|
-
intensity = np.zeros(len(intensity))
|
|
724
|
-
return intensity
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
def normalize(intensities,method='standard'):
|
|
728
|
-
if np.sum(intensities) > 0:
|
|
729
|
-
if method == 'softmax':
|
|
730
|
-
if np.any(intensities > 700):
|
|
731
|
-
print("Warning: some intensities are too large to exponentiate. Applying standard normalization.")
|
|
732
|
-
intensities /= np.sum(intensities)
|
|
733
|
-
else:
|
|
734
|
-
intensities2 = np.exp(intensities)
|
|
735
|
-
if np.isinf(intensities2).sum() == 0:
|
|
736
|
-
intensities = intensities / np.sum(intensities2)
|
|
737
|
-
elif method == 'standard':
|
|
738
|
-
intensities /= np.sum(intensities)
|
|
739
|
-
return(intensities)
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
def filter_spec_lcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999, is_matched = False):
|
|
743
|
-
if is_matched == False:
|
|
744
|
-
spec = spec[spec[:,0] >= mz_min]
|
|
745
|
-
spec = spec[spec[:,0] <= mz_max]
|
|
746
|
-
spec = spec[spec[:,1] >= int_min]
|
|
747
|
-
spec = spec[spec[:,1] <= int_max]
|
|
748
|
-
else:
|
|
749
|
-
spec = spec[spec[:,0] >= mz_min]
|
|
750
|
-
spec = spec[spec[:,0] <= mz_max]
|
|
751
|
-
spec[spec[:,1] >= int_min] = 0
|
|
752
|
-
spec[spec[:,1] <= int_max] = 0
|
|
753
|
-
return(spec)
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
def filter_spec_gcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999):
|
|
757
|
-
spec[np.where(spec[:,0] < mz_min)[0],1] = 0
|
|
758
|
-
spec[np.where(spec[:,0] > mz_max)[0],1] = 0
|
|
759
|
-
spec[np.where(spec[:,1] < int_min)[0],1] = 0
|
|
760
|
-
spec[np.where(spec[:,1] > int_max)[0],1] = 0
|
|
761
|
-
return(spec)
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
def remove_noise(spec, nr):
|
|
765
|
-
if spec.shape[0] > 1:
|
|
766
|
-
if nr is not None:
|
|
767
|
-
spec[np.where(spec[:,1] < np.max(spec[:,1]) * nr)[0]] = 0
|
|
768
|
-
|
|
769
|
-
return(spec)
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
def centroid_spectrum(spec, window_size):
|
|
773
|
-
spec = spec[np.argsort(spec[:,0])]
|
|
774
|
-
|
|
775
|
-
mz_array = spec[:, 0]
|
|
776
|
-
need_centroid = 0
|
|
777
|
-
if mz_array.shape[0] > 1:
|
|
778
|
-
mz_delta = mz_array[1:] - mz_array[:-1]
|
|
779
|
-
if np.min(mz_delta) <= window_size:
|
|
780
|
-
need_centroid = 1
|
|
781
|
-
|
|
782
|
-
if need_centroid:
|
|
783
|
-
intensity_order = np.argsort(-spec[:, 1])
|
|
784
|
-
spec_new = []
|
|
785
|
-
for i in intensity_order:
|
|
786
|
-
mz_delta_allowed = window_size
|
|
787
|
-
|
|
788
|
-
if spec[i, 1] > 0:
|
|
789
|
-
i_left = i - 1
|
|
790
|
-
while i_left >= 0:
|
|
791
|
-
mz_delta_left = spec[i, 0] - spec[i_left, 0]
|
|
792
|
-
if mz_delta_left <= mz_delta_allowed:
|
|
793
|
-
i_left -= 1
|
|
794
|
-
else:
|
|
795
|
-
break
|
|
796
|
-
i_left += 1
|
|
797
|
-
|
|
798
|
-
i_right = i + 1
|
|
799
|
-
while i_right < spec.shape[0]:
|
|
800
|
-
mz_delta_right = spec[i_right, 0] - spec[i, 0]
|
|
801
|
-
if mz_delta_right <= mz_delta_allowed:
|
|
802
|
-
i_right += 1
|
|
803
|
-
else:
|
|
804
|
-
break
|
|
805
|
-
|
|
806
|
-
intensity_sum = np.sum(spec[i_left:i_right, 1])
|
|
807
|
-
intensity_weighted_sum = np.sum(spec[i_left:i_right, 0] * spec[i_left:i_right, 1])
|
|
808
|
-
|
|
809
|
-
spec_new.append([intensity_weighted_sum / intensity_sum, intensity_sum])
|
|
810
|
-
spec[i_left:i_right, 1] = 0
|
|
811
|
-
|
|
812
|
-
spec_new = np.array(spec_new)
|
|
813
|
-
spec_new = spec_new[np.argsort(spec_new[:, 0])]
|
|
814
|
-
if spec_new.shape[0] > 1:
|
|
815
|
-
spec_new = spec_new[np.argsort(spec_new[:, 0])]
|
|
816
|
-
return spec_new
|
|
817
|
-
else:
|
|
818
|
-
return np.array([[0,0]])
|
|
819
|
-
else:
|
|
820
|
-
return spec
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
def match_peaks_in_spectra(spec_a, spec_b, window_size):
|
|
825
|
-
a = 0
|
|
826
|
-
b = 0
|
|
827
|
-
|
|
828
|
-
spec_merged = []
|
|
829
|
-
peak_b_int = 0.
|
|
830
|
-
while a < spec_a.shape[0] and b < spec_b.shape[0]:
|
|
831
|
-
mass_delta = spec_a[a, 0] - spec_b[b, 0]
|
|
832
|
-
|
|
833
|
-
if mass_delta < -window_size:
|
|
834
|
-
spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
|
|
835
|
-
peak_b_int = 0.
|
|
836
|
-
a += 1
|
|
837
|
-
elif mass_delta > window_size:
|
|
838
|
-
spec_merged.append([spec_b[b, 0], 0., spec_b[b, 1]])
|
|
839
|
-
b += 1
|
|
840
|
-
else:
|
|
841
|
-
peak_b_int += spec_b[b, 1]
|
|
842
|
-
b += 1
|
|
843
|
-
|
|
844
|
-
if peak_b_int > 0.:
|
|
845
|
-
spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
|
|
846
|
-
peak_b_int = 0.
|
|
847
|
-
a += 1
|
|
848
|
-
|
|
849
|
-
if b < spec_b.shape[0]:
|
|
850
|
-
spec_merged += [[x[0], 0., x[1]] for x in spec_b[b:]]
|
|
851
|
-
|
|
852
|
-
if a < spec_a.shape[0]:
|
|
853
|
-
spec_merged += [[x[0], x[1], 0.] for x in spec_a[a:]]
|
|
854
|
-
|
|
855
|
-
if spec_merged:
|
|
856
|
-
spec_merged = np.array(spec_merged, dtype=np.float64)
|
|
857
|
-
else:
|
|
858
|
-
spec_merged = np.array([[0., 0., 0.]], dtype=np.float64)
|
|
859
|
-
return spec_merged
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
def convert_spec(spec, mzs):
|
|
864
|
-
ints_tmp = []
|
|
865
|
-
for i in range(0,len(mzs)):
|
|
866
|
-
if mzs[i] in spec[:,0]:
|
|
867
|
-
int_tmp = spec[np.where(spec[:,0] == mzs[i])[0][0],1]
|
|
868
|
-
else:
|
|
869
|
-
int_tmp = 0
|
|
870
|
-
ints_tmp.append(int_tmp)
|
|
871
|
-
out = np.transpose(np.array([mzs,ints_tmp]))
|
|
872
|
-
return out
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
def get_reference_df(reference_data, likely_reference_IDs=None):
|
|
876
|
-
extension = reference_data.rsplit('.',1)
|
|
877
|
-
extension = extension[(len(extension)-1)]
|
|
878
|
-
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
|
|
879
|
-
output_path_tmp = reference_data[:-3] + 'txt'
|
|
880
|
-
build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
|
|
881
|
-
df_reference = pd.read_csv(output_path_tmp, sep='\t')
|
|
882
|
-
if extension == 'txt' or extension == 'TXT':
|
|
883
|
-
df_reference = pd.read_csv(reference_data, sep='\t')
|
|
884
|
-
if likely_reference_IDs is not None:
|
|
885
|
-
likely_reference_IDs = pd.read_csv(likely_reference_IDs, header=None)
|
|
886
|
-
df_reference = df_reference.loc[df_reference.iloc[:,0].isin(likely_reference_IDs.iloc[:,0].tolist())]
|
|
887
|
-
return df_reference
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
def S_cos(ints_a, ints_b):
|
|
892
|
-
if np.sum(ints_a) == 0 or np.sum(ints_b) == 0:
|
|
893
|
-
return(0)
|
|
894
|
-
else:
|
|
895
|
-
return np.dot(ints_a,ints_b) / (np.sqrt(sum(np.power(ints_a,2))) * np.sqrt(sum(np.power(ints_b,2))))
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
def ent_renyi(ints, q):
|
|
899
|
-
return np.log(sum(np.power(ints,q))) / (1-q)
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
def ent_tsallis(ints, q):
|
|
903
|
-
return (sum(np.power(ints,q))-1) / (1-q)
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
def S_shannon(ints_a, ints_b):
|
|
907
|
-
ent_a = scipy.stats.entropy(ints_a)
|
|
908
|
-
ent_b = scipy.stats.entropy(ints_b)
|
|
909
|
-
ent_ab = scipy.stats.entropy(ints_a + ints_b)
|
|
910
|
-
return(1 - (2 * ent_ab - ent_a - ent_b)/np.log(4))
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
def S_renyi(ints_a, ints_b, q):
|
|
914
|
-
if q == 1:
|
|
915
|
-
print('Warning: the Renyi Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
|
|
916
|
-
return S_shannon(ints_a, ints_b)
|
|
917
|
-
else:
|
|
918
|
-
ent_a = ent_renyi(ints_a, q)
|
|
919
|
-
ent_b = ent_renyi(ints_b, q)
|
|
920
|
-
ent_merg = ent_renyi(ints_a/2 + ints_b/2, q)
|
|
921
|
-
N = (1/(1-q)) * (2*np.log(np.sum(np.power(ints_a/2,q))+np.sum(np.power(ints_b/2,q))) - np.log(np.sum(np.power(ints_a,q))) - np.log(np.sum(np.power(ints_b,q))))
|
|
922
|
-
return 1 - (2 * ent_merg - ent_a - ent_b) / N
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
def S_tsallis(ints_a, ints_b, q):
|
|
926
|
-
if q == 1:
|
|
927
|
-
print('Warning: the Tsallis Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
|
|
928
|
-
return S_shannon(ints_a, ints_b)
|
|
929
|
-
else:
|
|
930
|
-
ent_a = ent_tsallis(ints_a, q)
|
|
931
|
-
ent_b = ent_tsallis(ints_b, q)
|
|
932
|
-
ent_merg = ent_tsallis(ints_a/2 + ints_b/2, q)
|
|
933
|
-
N = np.sum(2*np.power(ints_a/2,q)+2*np.power(ints_b/2,q)-np.power(ints_a,q)-np.power(ints_b,q)) / (1-q)
|
|
934
|
-
return 1 - (2 * ent_merg - ent_a - ent_b) / N
|
|
935
|
-
|
|
936
|
-
def S_mixture(ints_a, ints_b, weights={'Cosine':0.25, 'Shannon':0.25, 'Renyi':0.25, 'Tsallis':0.25}, q=1.1):
|
|
937
|
-
if set(weights.keys()).issubset(set(['Cosine','Shannon','Renyi','Tsallis'])) is False:
|
|
938
|
-
print('Error: the keys to the weight parameter dict of the function S_mixture must be one of the four: Cosine, Shannon, Renyi, Tsallis')
|
|
939
|
-
sys.exit()
|
|
940
|
-
|
|
941
|
-
similarity = 0
|
|
942
|
-
for key, value in weights.items():
|
|
943
|
-
if key == 'Cosine':
|
|
944
|
-
similarity += value * S_cos(ints_a,ints_b)
|
|
945
|
-
if key == 'Shannon':
|
|
946
|
-
similarity += value * S_shannon(ints_a,ints_b)
|
|
947
|
-
if key == 'Renyi':
|
|
948
|
-
similarity += value * S_renyi(ints_a,ints_b,q)
|
|
949
|
-
if key == 'Tsallis':
|
|
950
|
-
similarity += value * S_tsallis(ints_a,ints_b,q)
|
|
951
|
-
return similarity
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
def get_contingency_entries(ints_a, ints_b):
|
|
955
|
-
a = 0
|
|
956
|
-
b = 0
|
|
957
|
-
c = 0
|
|
958
|
-
|
|
959
|
-
for x, y in zip(ints_a, ints_b):
|
|
960
|
-
if x != 0 and y != 0:
|
|
961
|
-
c += 1
|
|
962
|
-
elif x != 0 and y == 0:
|
|
963
|
-
a += 1
|
|
964
|
-
elif x == 0 and y != 0:
|
|
965
|
-
b += 1
|
|
966
|
-
return [a,b,c]
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
def S_jaccard(ints_a, ints_b):
|
|
970
|
-
tmp = get_contingency_entries(ints_a, ints_b)
|
|
971
|
-
a = tmp[0]
|
|
972
|
-
b = tmp[1]
|
|
973
|
-
c = tmp[2]
|
|
974
|
-
denom = a + b + c
|
|
975
|
-
if denom == 0:
|
|
976
|
-
similarity = 0
|
|
977
|
-
else:
|
|
978
|
-
similarity = c / (a + b + c)
|
|
979
|
-
return similarity
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
def S_dice(ints_a, ints_b):
|
|
983
|
-
tmp = get_contingency_entries(ints_a, ints_b)
|
|
984
|
-
a = tmp[0]
|
|
985
|
-
b = tmp[1]
|
|
986
|
-
c = tmp[2]
|
|
987
|
-
denom = a + b + 2 * c
|
|
988
|
-
if denom == 0:
|
|
989
|
-
similarity = 0
|
|
990
|
-
else:
|
|
991
|
-
similarity = 2 * c / denom
|
|
992
|
-
return similarity
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
def S_3w_jaccard(ints_a, ints_b):
|
|
996
|
-
tmp = get_contingency_entries(ints_a, ints_b)
|
|
997
|
-
a = tmp[0]
|
|
998
|
-
b = tmp[1]
|
|
999
|
-
c = tmp[2]
|
|
1000
|
-
denom = a + b + 3 * c
|
|
1001
|
-
if denom == 0:
|
|
1002
|
-
similarity = 0
|
|
1003
|
-
else:
|
|
1004
|
-
similarity = 3 * c / denom
|
|
1005
|
-
return similarity
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
def S_sokal_sneath(ints_a, ints_b):
|
|
1009
|
-
tmp = get_contingency_entries(ints_a, ints_b)
|
|
1010
|
-
a = tmp[0]
|
|
1011
|
-
b = tmp[1]
|
|
1012
|
-
c = tmp[2]
|
|
1013
|
-
denom = 2 * a + 2 * b + c
|
|
1014
|
-
if denom == 0:
|
|
1015
|
-
similarity = 0
|
|
1016
|
-
else:
|
|
1017
|
-
similarity = c / denom
|
|
1018
|
-
return similarity
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
def S_binary_cosine(ints_a, ints_b):
|
|
1022
|
-
tmp = get_contingency_entries(ints_a, ints_b)
|
|
1023
|
-
a = tmp[0]
|
|
1024
|
-
b = tmp[1]
|
|
1025
|
-
c = tmp[2]
|
|
1026
|
-
denom = np.sqrt((a + c) * (b + c))
|
|
1027
|
-
if denom == 0:
|
|
1028
|
-
similarity = 0
|
|
1029
|
-
else:
|
|
1030
|
-
similarity = c / denom
|
|
1031
|
-
return similarity
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
def S_mountford(ints_a, ints_b):
|
|
1035
|
-
tmp = get_contingency_entries(ints_a, ints_b)
|
|
1036
|
-
a = tmp[0]
|
|
1037
|
-
b = tmp[1]
|
|
1038
|
-
c = tmp[2]
|
|
1039
|
-
denom = c * (a + b) + 2 * a * b
|
|
1040
|
-
if denom == 0:
|
|
1041
|
-
similarity = 1
|
|
1042
|
-
else:
|
|
1043
|
-
similarity = 2 * c / denom
|
|
1044
|
-
return similarity
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
def S_mcconnaughey(ints_a, ints_b):
|
|
1048
|
-
tmp = get_contingency_entries(ints_a, ints_b)
|
|
1049
|
-
a = tmp[0]
|
|
1050
|
-
b = tmp[1]
|
|
1051
|
-
c = tmp[2]
|
|
1052
|
-
denom = (a + c) * (b + c)
|
|
1053
|
-
if denom == 0:
|
|
1054
|
-
similarity = 0
|
|
1055
|
-
else:
|
|
1056
|
-
similarity = (c**2 - a * b) / denom
|
|
1057
|
-
return similarity
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
def S_driver_kroeber(ints_a, ints_b):
|
|
1061
|
-
tmp = get_contingency_entries(ints_a, ints_b)
|
|
1062
|
-
a = tmp[0]
|
|
1063
|
-
b = tmp[1]
|
|
1064
|
-
c = tmp[2]
|
|
1065
|
-
denom = 2 * (a + c) * (b + c)
|
|
1066
|
-
if denom == 0:
|
|
1067
|
-
similarity = 0
|
|
1068
|
-
else:
|
|
1069
|
-
similarity = c * (a + b + 2 * c) / denom
|
|
1070
|
-
return similarity
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
def S_simpson(ints_a, ints_b):
|
|
1074
|
-
tmp = get_contingency_entries(ints_a, ints_b)
|
|
1075
|
-
a = tmp[0]
|
|
1076
|
-
b = tmp[1]
|
|
1077
|
-
c = tmp[2]
|
|
1078
|
-
denom = min(a + c, b + c)
|
|
1079
|
-
if denom == 0:
|
|
1080
|
-
similarity = 0
|
|
1081
|
-
else:
|
|
1082
|
-
similarity = c / denom
|
|
1083
|
-
return similarity
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
def S_braun_banquet(ints_a, ints_b):
|
|
1087
|
-
tmp = get_contingency_entries(ints_a, ints_b)
|
|
1088
|
-
a = tmp[0]
|
|
1089
|
-
b = tmp[1]
|
|
1090
|
-
c = tmp[2]
|
|
1091
|
-
denom = max(a + c, b + c)
|
|
1092
|
-
if denom == 0:
|
|
1093
|
-
similarity = 0
|
|
1094
|
-
else:
|
|
1095
|
-
similarity = c / denom
|
|
1096
|
-
return similarity
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
def S_fager_mcgowan(ints_a, ints_b):
    """Fager-McGowan similarity of two intensity vectors.

    Uses the first three contingency-table entries (a, b, c) from
    get_contingency_entries. Returns 0 when either denominator vanishes,
    otherwise c / sqrt((a+c)*(b+c)) - 1 / (2*sqrt(max(a+c, b+c))).
    """
    entries = get_contingency_entries(ints_a, ints_b)
    a, b, c = entries[0], entries[1], entries[2]
    geom_mean_term = np.sqrt((a + c) * (b + c))
    correction_term = 2 * np.sqrt(max(a + c, b + c))
    if geom_mean_term == 0 or correction_term == 0:
        return 0
    return c / geom_mean_term - 1 / correction_term
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
def S_kulczynski(ints_a, ints_b):
    """Kulczynski similarity of two intensity vectors.

    Uses the first three contingency-table entries (a, b, c) from
    get_contingency_entries. Returns 1 when a+b is zero (no disagreeing
    entries), otherwise c / (a + b).
    """
    entries = get_contingency_entries(ints_a, ints_b)
    a, b, c = entries[0], entries[1], entries[2]
    denom = a + b
    return 1 if denom == 0 else c / denom
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
def S_intersection(ints_a, ints_b):
    """Intersection similarity: the third contingency-table entry (c) from
    get_contingency_entries, returned unscaled."""
    return get_contingency_entries(ints_a, ints_b)[2]
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
def S_hamming(ints_a, ints_b):
    """Hamming-style similarity of two intensity vectors.

    Uses the first two contingency-table entries (a, b) from
    get_contingency_entries. Returns 1 when a+b is zero, otherwise 1 / (a+b)
    (the reciprocal of the disagreement count).
    """
    entries = get_contingency_entries(ints_a, ints_b)
    a, b = entries[0], entries[1]
    denom = a + b
    return 1 if denom == 0 else 1 / denom
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
def S_hellinger(ints_a, ints_b):
    """Hellinger-based similarity of two intensity vectors.

    Uses the first three contingency-table entries (a, b, c) from
    get_contingency_entries and computes 1 - sqrt(1 - c / sqrt((a+c)*(b+c))).

    Fix: every sibling measure in this module guards its denominator, but the
    original here divided by sqrt((a+c)*(b+c)) unconditionally, raising
    ZeroDivisionError (or producing nan under numpy) when both marginals are
    zero. Return 0 in that degenerate case, consistent with e.g.
    S_fager_mcgowan and S_mcconnaughey.
    """
    entries = get_contingency_entries(ints_a, ints_b)
    a, b, c = entries[0], entries[1], entries[2]
    denom = np.sqrt((a + c) * (b + c))
    if denom == 0:
        return 0
    return 1 - np.sqrt(1 - c / denom)
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
def get_similarity(similarity_measure, q_ints, r_ints, weights, q):
    """Dispatch to the similarity function named by `similarity_measure`.

    Parameters
    ----------
    similarity_measure : str
        Name of the measure (e.g. 'cosine', 'shannon', 'jaccard', ...).
    q_ints, r_ints : array-like
        Query and reference intensity vectors.
    weights : dict or None
        Component weights; used only by the 'mixture' measure.
    q : float
        Entropy order; used only by the 'renyi', 'tsallis' and 'mixture'
        measures.

    Returns
    -------
    The scalar similarity.

    Fix: in the original elif ladder an unrecognized measure name fell
    through every branch and `return similarity` raised UnboundLocalError;
    now an explicit ValueError names the bad input.
    """
    if similarity_measure == 'cosine':
        return S_cos(q_ints, r_ints)

    if similarity_measure in ('shannon', 'renyi', 'tsallis'):
        # Entropy-based measures operate on probability-normalized spectra.
        q_ints = normalize(q_ints, method = 'standard')
        r_ints = normalize(r_ints, method = 'standard')
        if similarity_measure == 'shannon':
            return S_shannon(q_ints, r_ints)
        if similarity_measure == 'renyi':
            return S_renyi(q_ints, r_ints, q)
        return S_tsallis(q_ints, r_ints, q)

    if similarity_measure == 'mixture':
        return S_mixture(q_ints, r_ints, weights, q)

    # Remaining measures all share the signature f(q_ints, r_ints).
    binary_dispatch = {
        'jaccard': S_jaccard,
        'dice': S_dice,
        '3w_jaccard': S_3w_jaccard,
        'sokal_sneath': S_sokal_sneath,
        'binary_cosine': S_binary_cosine,
        'mountford': S_mountford,
        'mcconnaughey': S_mcconnaughey,
        'driver_kroeber': S_driver_kroeber,
        'simpson': S_simpson,
        'braun_banquet': S_braun_banquet,
        'fager_mcgowan': S_fager_mcgowan,
        'kulczynski': S_kulczynski,
        'intersection': S_intersection,
        'hamming': S_hamming,
        'hellinger': S_hellinger,
    }
    try:
        measure_fn = binary_dispatch[similarity_measure]
    except KeyError:
        raise ValueError(f'unknown similarity_measure: {similarity_measure!r}')
    return measure_fn(q_ints, r_ints)
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
def _vector_to_full_params(X, default_params, optimize_params):
|
|
1220
|
-
params = default_params.copy()
|
|
1221
|
-
for name, val in zip(optimize_params, X):
|
|
1222
|
-
params[name] = float(val)
|
|
1223
|
-
return params
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
def objective_function_HRMS(X, ctx):
    """Objective for scipy.optimize.differential_evolution on HRMS data.

    Expands the candidate vector X into a full parameter dict (fixed values
    come from ctx["default_params"]), evaluates identification accuracy via
    get_acc_HRMS, prints progress, and returns 1 - accuracy so that the
    minimizer maximizes accuracy.
    """
    full = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
    acc = get_acc_HRMS(
        ctx["df_query"], ctx["df_reference"],
        ctx["precursor_ion_mz_tolerance"], ctx["ionization_mode"], ctx["adduct"],
        ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
        ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
        full["window_size_centroiding"], full["window_size_matching"], full["noise_threshold"],
        full["wf_mz"], full["wf_int"], full["LET_threshold"],
        full["entropy_dimension"],
        ctx["high_quality_reference_library"],
        verbose=False,
    )
    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
    return 1.0 - acc
|
|
1241
|
-
|
|
1242
|
-
def objective_function_NRMS(X, ctx):
    """Objective for scipy.optimize.differential_evolution on NRMS data.

    Expands the candidate vector X into a full parameter dict (fixed values
    come from ctx["default_params"]), evaluates identification accuracy via
    get_acc_NRMS, prints progress, and returns 1 - accuracy so that the
    minimizer maximizes accuracy.
    """
    full = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
    acc = get_acc_NRMS(
        ctx["df_query"], ctx["df_reference"],
        ctx["unique_query_ids"], ctx["unique_reference_ids"],
        ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
        ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
        full["noise_threshold"], full["wf_mz"], full["wf_int"],
        full["LET_threshold"], full["entropy_dimension"],
        ctx["high_quality_reference_library"],
        verbose=False,
    )
    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
    return 1.0 - acc
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
def tune_params_DE(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, chromatography_platform='HRMS', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1):
    """Tune spectrum-matching parameters with SciPy differential evolution.

    Loads query and reference libraries (converting raw formats to TXT via
    build_library_from_raw_data when needed), optionally filters the
    reference library by ionization_mode/adduct, then minimizes
    1 - accuracy over `optimize_params` within `param_bounds`, holding all
    other parameters at `default_params`. Prints and logs the best result.

    NOTE: list/dict defaults (optimize_params, param_bounds, default_params)
    are the original mutable-default signature, kept for compatibility; they
    are only read, never mutated.

    Fixes relative to the original:
    - ctx now carries unique_query_ids / unique_reference_ids, which
      objective_function_NRMS reads via ctx["unique_query_ids"] /
      ctx["unique_reference_ids"]; the original omitted them, so every
      non-HRMS run raised KeyError on the first objective evaluation.
    - An unsupported query_data extension now exits with an explicit error
      (consistent with the *_shiny variants) instead of leaving df_query
      unbound and crashing later with NameError.
    """
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF','msp','MSP','json','JSON'):
            # Raw data: convert to the tabular TXT library format first.
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
        unique_query_ids = df_query.iloc[:,0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:,0].unique()
        else:
            # Multiple reference files: concatenate into one library.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:,0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    # Optionally restrict the reference library to a single ionization mode / adduct.
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct']==adduct]

    # Recompute IDs from the 'id' column (after any reference filtering above).
    unique_query_ids = df_query['id'].unique().tolist()
    unique_reference_ids = df_reference['id'].unique().tolist()

    ctx = dict(
        df_query=df_query,
        df_reference=df_reference,
        # Required by objective_function_NRMS; harmless extras for HRMS.
        unique_query_ids=unique_query_ids,
        unique_reference_ids=unique_reference_ids,
        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
        ionization_mode=ionization_mode,
        adduct=adduct,
        similarity_measure=similarity_measure,
        weights=weights,
        spectrum_preprocessing_order=spectrum_preprocessing_order,
        mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max,
        high_quality_reference_library=high_quality_reference_library,
        default_params=default_params,
        optimize_params=optimize_params,
    )

    bounds = [param_bounds[p] for p in optimize_params]

    # seed=1 makes runs reproducible; tol=0.0 forces the full maxiter budget.
    if chromatography_platform == 'HRMS':
        result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
    else:
        result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)

    best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
    best_acc = 100.0 - (result.fun * 100.0)

    print("\n=== Differential Evolution Result ===")
    print(f"Optimized over: {optimize_params}")
    print("Best values (selected params):")
    for name in optimize_params:
        print(f"  {name}: {best_full_params[name]}")
    print("\nFull parameter set used in final evaluation:")
    for k, v in best_full_params.items():
        print(f"  {k}: {v}")
    print(f"\nBest accuracy: {best_acc:.3f}%")
    _log(f"best = {result.x}, acc={100*(1-result.fun):.3f}%")
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
# Default grid-search spaces for the tuning functions below. Each key maps to a
# list of candidate values; the grid search evaluates the Cartesian product of
# all lists, so a single-element list pins that parameter.
default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
# Same as the HRMS grid but without the centroiding/matching window sizes,
# which only the HRMS pipeline consumes.
default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
def _eval_one_HRMS(df_query, df_reference,
                   precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
                   similarity_measure_tmp, weight,
                   spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                   int_min_tmp, int_max_tmp, noise_threshold_tmp,
                   window_size_centroiding_tmp, window_size_matching_tmp,
                   wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
                   entropy_dimension_tmp, high_quality_reference_library_tmp):
    """Evaluate one HRMS grid point.

    Runs get_acc_HRMS (non-verbose) with the given settings and returns a
    flat tuple of (accuracy, <settings...>) in the column order expected by
    the grid-search result DataFrame; the weight dict is JSON-encoded so the
    row is tabular-friendly.
    """
    settings = dict(
        df_query=df_query, df_reference=df_reference,
        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
        ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
        similarity_measure=similarity_measure_tmp, weights=weight,
        spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
        mz_min=mz_min_tmp, mz_max=mz_max_tmp,
        int_min=int_min_tmp, int_max=int_max_tmp,
        window_size_centroiding=window_size_centroiding_tmp,
        window_size_matching=window_size_matching_tmp,
        noise_threshold=noise_threshold_tmp,
        wf_mz=wf_mz_tmp, wf_int=wf_int_tmp,
        LET_threshold=LET_threshold_tmp,
        entropy_dimension=entropy_dimension_tmp,
        high_quality_reference_library=high_quality_reference_library_tmp,
        verbose=False,
    )
    acc = get_acc_HRMS(**settings)

    return (
        acc, similarity_measure_tmp, json.dumps(weight), spectrum_preprocessing_order_tmp,
        mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp,
        noise_threshold_tmp, window_size_centroiding_tmp, window_size_matching_tmp,
        wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp,
        high_quality_reference_library_tmp
    )
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
                   similarity_measure_tmp, weight,
                   spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                   int_min_tmp, int_max_tmp, noise_threshold_tmp,
                   wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
                   entropy_dimension_tmp, high_quality_reference_library_tmp):
    """Evaluate one NRMS grid point.

    Runs get_acc_NRMS with the given settings and returns a flat tuple of
    (accuracy, <settings...>) in the column order expected by the grid-search
    result DataFrame; the weight dict is JSON-encoded so the row is
    tabular-friendly.

    Fix: pass verbose=False to get_acc_NRMS, matching _eval_one_HRMS and
    objective_function_NRMS; the original omitted it, so every grid point
    ran at the accuracy function's default verbosity and flooded the
    (parallel) grid-search output with per-spectrum progress lines.
    """
    acc = get_acc_NRMS(
        df_query=df_query, df_reference=df_reference,
        unique_query_ids=unique_query_ids, unique_reference_ids=unique_reference_ids,
        similarity_measure=similarity_measure_tmp, weights=weight,
        spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
        mz_min=mz_min_tmp, mz_max=mz_max_tmp,
        int_min=int_min_tmp, int_max=int_max_tmp,
        noise_threshold=noise_threshold_tmp,
        wf_mz=wf_mz_tmp, wf_int=wf_int_tmp,
        LET_threshold=LET_threshold_tmp,
        entropy_dimension=entropy_dimension_tmp,
        high_quality_reference_library=high_quality_reference_library_tmp,
        verbose=False,
    )

    return (
        acc, similarity_measure_tmp, json.dumps(weight), spectrum_preprocessing_order_tmp,
        mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp, noise_threshold_tmp,
        wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp, high_quality_reference_library_tmp
    )
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
    """Exhaustive grid search over HRMS matching parameters (sequential,
    Shiny-friendly variant with per-combination progress prints).

    Merges `grid` over `default_HRMS_grid`, loads the query/reference
    libraries (converting raw formats via build_library_from_raw_data),
    evaluates every parameter combination with _eval_one_HRMS, and either
    returns the result DataFrame (return_output=True) or writes it as TSV
    to `output_path`. Calls sys.exit() on missing/unsupported inputs.
    """
    # Merge user-supplied candidate lists over the defaults.
    local_grid = {**default_HRMS_grid, **(grid or {})}
    # NOTE(review): injects every grid key into module globals; the bare names
    # used below (similarity_measure, weight, ...) resolve through globals().
    # This mutates module state as a side effect — confirm nothing concurrent
    # relies on those globals.
    for key, value in local_grid.items():
        globals()[key] = value

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
            # Raw data: convert to the tabular TXT library format first.
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
        unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # Multiple reference files: concatenate into one library.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    # Optionally restrict the reference library to one ionization mode / adduct.
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct']==adduct]

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # Cartesian product of all candidate lists (names resolved via globals(), see above).
    param_grid = product(
        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
        noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
        entropy_dimension, high_quality_reference_library
    )

    results = []
    # Total combination count, for the progress messages below.
    total = (
        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
        len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
        len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
        len(entropy_dimension) * len(high_quality_reference_library)
    )
    done = 0
    for params in param_grid:
        res = _eval_one_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params)
        results.append(res)
        done += 1
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Flatten the JSON-encoded weight dict into a compact comma-separated
    # numeric string so the TSV stays easy to parse.
    if 'WEIGHT' in df_out.columns:
        df_out['WEIGHT'] = (
            df_out['WEIGHT'].astype(str)
            .str.replace("\"","",regex=False)
            .str.replace("{","",regex=False)
            .str.replace("}","",regex=False)
            .str.replace(":","",regex=False)
            .str.replace("Cosine","",regex=False)
            .str.replace("Shannon","",regex=False)
            .str.replace("Renyi","",regex=False)
            .str.replace("Tsallis","",regex=False)
            .str.replace(" ","",regex=False)
        )

    if return_output:
        return df_out
    else:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
        print(f'Wrote results to {output_path}')
|
|
1500
|
-
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """Exhaustive grid search over NRMS matching parameters, evaluated in
    parallel with joblib (Parallel/delayed, all cores).

    Merges `grid` over `default_NRMS_grid`, loads the query/reference
    libraries (converting raw formats via build_library_from_raw_data),
    evaluates every parameter combination with _eval_one_NRMS, and either
    returns the result DataFrame (return_output=True) or writes it as TSV
    to `output_path`. Calls sys.exit() on missing inputs.
    """
    # Merge user-supplied candidate lists over the defaults.
    grid = {**default_NRMS_grid, **(grid or {})}
    # NOTE(review): injects every grid key into module globals; the bare names
    # used below (similarity_measure, weight, ...) resolve through globals().
    for key, value in grid.items():
        globals()[key] = value

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
            # Raw data: convert to the tabular TXT library format first.
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension == 'txt' or extension == 'TXT':
            df_query = pd.read_csv(query_data, sep='\t')
        # NOTE(review): an unrecognized extension leaves df_query unbound here
        # and the next line raises NameError.
        unique_query_ids = df_query.iloc[:,0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data,str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:,0].unique()
        else:
            # Multiple reference files: concatenate into one library.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:,0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # Cartesian product of all candidate lists, fanned out across all cores.
    param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
                         noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
        'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
    ])
    # Flatten the JSON-encoded weight dict into a compact comma-separated
    # numeric string so the TSV stays easy to parse.
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(":","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Cosine","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Shannon","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
    df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
    if return_output is False:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
    else:
        return df_out
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
def tune_params_on_NRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """Exhaustive grid search over NRMS matching parameters (sequential,
    Shiny-friendly variant with per-combination progress prints).

    Merges `grid` over `default_NRMS_grid`, loads the query/reference
    libraries (converting raw formats via build_library_from_raw_data),
    evaluates every parameter combination with _eval_one_NRMS, and either
    returns the result DataFrame (return_output=True) or writes it as TSV
    to `output_path`. Calls sys.exit() on missing/unsupported inputs.
    """
    # Merge user-supplied candidate lists over the defaults.
    local_grid = {**default_NRMS_grid, **(grid or {})}
    # NOTE(review): injects every grid key into module globals; the bare names
    # used below (similarity_measure, weight, ...) resolve through globals().
    for key, value in local_grid.items():
        globals()[key] = value

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
            # Raw data: convert to the tabular TXT library format first.
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
        unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # Multiple reference files: concatenate into one library.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # Cartesian product of all candidate lists (names resolved via globals(), see above).
    param_grid = product(
        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
        noise_threshold, wf_mz, wf_int, LET_threshold,
        entropy_dimension, high_quality_reference_library
    )

    results = []
    # Total combination count, for the progress messages below.
    total = (
        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
        len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
    )
    done = 0
    for params in param_grid:
        res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
        results.append(res)
        done += 1
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Flatten the JSON-encoded weight dict into a compact comma-separated
    # numeric string so the TSV stays easy to parse.
    if 'WEIGHT' in df_out.columns:
        df_out['WEIGHT'] = (
            df_out['WEIGHT'].astype(str)
            .str.replace("\"","",regex=False)
            .str.replace("{","",regex=False)
            .str.replace("}","",regex=False)
            .str.replace(":","",regex=False)
            .str.replace("Cosine","",regex=False)
            .str.replace("Shannon","",regex=False)
            .str.replace("Renyi","",regex=False)
            .str.replace("Tsallis","",regex=False)
            .str.replace(" ","",regex=False)
        )

    if return_output:
        return df_out
    else:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
        print(f'Wrote results to {output_path}')
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
    """Return the top-1 identification accuracy for HRMS (LC-MS) spectra.

    For every query spectrum a similarity score against each reference
    spectrum is computed after applying, in order, the transformations in
    ``spectrum_preprocessing_order``:
      'C' centroiding, 'M' peak matching, 'W' weight-factor transform,
      'L' low-entropy transform, 'N' noise removal, 'F' m/z & intensity filter.
    The prediction for a query is the reference id with the highest score;
    the returned accuracy is the fraction of queries whose own id equals
    the predicted reference id.

    NOTE(review): ionization_mode and adduct are accepted for signature
    parity with the CLI wrappers but are not used inside this function.

    Returns
    -------
    float
        Mean of (query id == top-scoring reference id) over all queries.
    """
    unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
    unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
    all_similarity_rows = []

    for query_idx, qid in enumerate(unique_query_ids):
        if verbose:
            print(f'query spectrum #{query_idx} is being identified')

        q_mask = (df_query['id'] == qid)
        q_idxs = np.where(q_mask)[0]
        if q_idxs.size == 0:
            # no peaks for this query id: score 0 against every reference
            all_similarity_rows.append([0.0]*len(unique_reference_ids))
            continue

        q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))

        # Optionally restrict the reference library to spectra whose
        # precursor-ion m/z lies within the tolerance of the query's.
        if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
            precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
        else:
            df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()

        if df_reference_tmp.empty:
            all_similarity_rows.append([0.0]*len(unique_reference_ids))
            continue

        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))

        similarity_by_ref = {}

        for ref_id, r_df in ref_groups.items():
            # fresh copy so transformations never leak into the next comparison
            q_spec = q_spec_base.copy()
            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))

            is_matched = False
            for transformation in spectrum_preprocessing_order:
                # a spectrum containing inf intensities is unusable; zero it out
                if np.isinf(q_spec[:, 1]).any():
                    q_spec[:, 1] = 0.0
                if np.isinf(r_spec[:, 1]).any():
                    r_spec[:, 1] = 0.0

                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)

                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    m_spec = match_peaks_in_spectra(
                        spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
                    )
                    if m_spec.size == 0:
                        # no peaks matched: both spectra become empty so the
                        # similarity below defaults to 0
                        q_spec = np.empty((0,2))
                        r_spec = np.empty((0,2))
                    else:
                        q_spec = m_spec[:, 0:2]
                        r_spec = m_spec[:, [0, 2]]
                        is_matched = True

                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)

                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')

                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = remove_noise(q_spec, nr=noise_threshold)
                    # a curated ("high quality") reference library is assumed
                    # noise-free, so only the query is denoised
                    if not high_quality_reference_library:
                        r_spec = remove_noise(r_spec, nr=noise_threshold)

                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = filter_spec_lcms(
                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                    )
                    if not high_quality_reference_library:
                        r_spec = filter_spec_lcms(
                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                        )

            if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                q_ints = q_spec[:, 1]
                r_ints = r_spec[:, 1]
                if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                    sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
                else:
                    sim = 0.0
            else:
                sim = 0.0

            similarity_by_ref[str(ref_id)] = float(sim)

        # references excluded by the precursor window default to score 0
        row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
        all_similarity_rows.append(row)

    df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
    df_scores.index.name = 'QUERY.SPECTRUM.ID'

    # top-1 prediction per query
    top_idx = df_scores.values.argmax(axis=1)
    top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
    top_ids = [df_scores.columns[i] for i in top_idx]

    df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
    if verbose:
        print(df_tmp)

    acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
    return acc
-
def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
    """Return the top-1 identification accuracy for NRMS (GC-MS) spectra.

    Query and reference spectra are first rasterized onto a common integer
    m/z grid with ``convert_spec``, then the transformations in
    ``spectrum_preprocessing_order`` are applied ('W' weight-factor,
    'L' low-entropy, 'N' noise removal, 'F' m/z & intensity filter) before
    a similarity score is computed.  The prediction for a query is the
    reference id with the highest score; the returned accuracy is the
    fraction of queries whose own id equals the predicted id.

    Bug fix: the rasterized query spectrum is now copied before each
    reference comparison.  Previously ``q_spec`` aliased ``q_spec_tmp``, so
    in-place transforms ('W'/'L' assign to ``q_spec[:,1]``) mutated the
    shared array and leaked into every subsequent comparison.
    """
    n_top_matches_to_save = 1

    # common integer m/z grid spanning both libraries
    min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
    max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
    mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

    all_similarity_scores = []
    for query_idx in range(0,len(unique_query_ids)):
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        q_spec_tmp = convert_spec(q_spec_tmp,mzs)

        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            # copy so in-place transforms cannot corrupt q_spec_tmp
            q_spec = q_spec_tmp.copy()
            if verbose is True and ref_idx % 1000 == 0:
                print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)

            for transformation in spectrum_preprocessing_order:
                # spectra containing inf intensities are unusable; zero them
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
                if transformation == 'W':
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
                if transformation == 'L':
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
                if transformation == 'N':
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    # a curated reference library is assumed noise-free
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                if transformation == 'F':
                    q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)

            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]

            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['QUERY.SPECTRUM.ID']

    # extract the top n_top_matches_to_save predictions per query (n = 1 here)
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            # ties are reported as a ';'-joined list of reference ids
            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[unique_query_ids,preds,scores]
    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
    return acc
-
def run_spec_lib_matching_on_HRMS_data_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
    """Run spectral-library matching on HRMS (LC-MS) data for the Shiny UI.

    Loads query spectra (raw mgf/mzML/cdf/json are first converted to a
    tab-separated library; txt is read directly), loads and filters the
    reference library, validates all tuning parameters, scores every
    query against every (precursor-filtered) reference spectrum, and
    either writes the identification results and the full similarity
    matrix to disk, or returns the identification DataFrame when
    ``return_ID_output`` is True.

    NOTE(review): ``weights`` uses a mutable default dict; it is only read
    here, never mutated, so the shared default is harmless — but do not
    mutate it in future edits.
    """
    # --- load query data ---
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension in ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF', 'json', 'JSON'):
            # NOTE(review): [:-3] assumes a 3-character extension; a
            # '.mzML' input yields a '*.mtxt' temp file — confirm intent.
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt', 'TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            # previously an unsupported extension fell through and crashed
            # later with a NameError on df_query
            print('\nError: query_data must have extension mgf, mzML, cdf, json, or txt.')
            sys.exit()
        unique_query_ids = df_query['id'].unique()

    # --- load reference data (single path or list of paths) ---
    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data,str):
            df_reference = get_reference_df(reference_data,likely_reference_ids)
        else:
            dfs = []
            for f in reference_data:
                tmp = get_reference_df(f,likely_reference_ids)
                dfs.append(tmp)
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    # restrict the reference library to the requested ionization mode/adduct
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct']==adduct]

    # --- validate the preprocessing order ---
    if spectrum_preprocessing_order is not None:
        spectrum_preprocessing_order = list(spectrum_preprocessing_order)
    else:
        spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
    if 'M' not in spectrum_preprocessing_order:
        print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
        sys.exit()
    if 'C' in spectrum_preprocessing_order:
        # centroiding must precede peak matching
        if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
            print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
            sys.exit()
    if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
        print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
        sys.exit()

    # NOTE(review): the accepted list spells 'kuldzynski' while the error
    # message says 'kulczynski' — kept as-is because callers pass the
    # accepted spelling; confirm which is intended.
    if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kuldzynski','intersection','hamming','hellinger']:
        print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
        sys.exit()

    # --- validate numeric parameters ---
    if isinstance(int_min,int) is True:
        int_min = float(int_min)
    if isinstance(int_max,int) is True:
        int_max = float(int_max)
    if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
        print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
        sys.exit()
    if mz_min < 0:
        print('\nError: mz_min should be a non-negative integer')
        sys.exit()
    if mz_max <= 0:
        print('\nError: mz_max should be a positive integer')
        sys.exit()
    if int_min < 0:
        print('\nError: int_min should be a non-negative float')
        sys.exit()
    if int_max <= 0:
        print('\nError: int_max should be a positive float')
        sys.exit()

    if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
        print('Error: window_size_centroiding must be a positive float.')
        sys.exit()
    if isinstance(window_size_matching,float) is False or window_size_matching <= 0.0:
        print('Error: window_size_matching must be a positive float.')
        sys.exit()

    if isinstance(noise_threshold,int) is True:
        noise_threshold = float(noise_threshold)
    if isinstance(noise_threshold,float) is False or noise_threshold < 0:
        print('Error: noise_threshold must be a positive float.')
        sys.exit()

    if isinstance(wf_intensity,int) is True:
        wf_intensity = float(wf_intensity)
    if isinstance(wf_mz,int) is True:
        wf_mz = float(wf_mz)
    if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
        print('Error: wf_mz and wf_intensity must be integers or floats')
        sys.exit()

    if entropy_dimension <= 0:
        print('\nError: entropy_dimension should be a positive float')
        sys.exit()

    normalization_method = 'standard'

    if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
        print('\nError: n_top_matches_to_save should be a positive integer')
        sys.exit()

    if isinstance(print_id_results,bool)==False:
        print('\nError: print_id_results must be either True or False')
        sys.exit()

    # default output locations
    if output_identification is None:
        output_identification = f'{Path.cwd()}/output_identification.txt'
        print(f'Warning: writing identification output to {output_identification}')

    if output_similarity_scores is None:
        output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
        print(f'Warning: writing similarity scores to {output_similarity_scores}')

    # --- score every query against every (precursor-filtered) reference ---
    unique_reference_ids = df_reference['id'].unique().tolist()
    all_similarity_scores = []

    for query_idx in range(len(unique_query_ids)):
        if verbose:
            print(f'query spectrum #{query_idx} is being identified')

        q_mask = (df_query['id'] == unique_query_ids[query_idx])
        q_idxs_tmp = np.where(q_mask)[0]
        q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))

        # restrict references to the precursor-ion m/z window when possible
        if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
            precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
        else:
            df_reference_tmp = df_reference.copy()

        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
        unique_reference_ids_tmp = list(ref_groups.keys())

        similarity_by_ref = {}
        for ref_id in unique_reference_ids_tmp:
            # fresh copy so transformations never leak between comparisons
            q_spec = q_spec_tmp.copy()
            r_df = ref_groups[ref_id]
            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))

            is_matched = False

            for transformation in spectrum_preprocessing_order:
                # spectra containing inf intensities are unusable; zero them
                if np.isinf(q_spec[:, 1]).sum() > 0:
                    q_spec[:, 1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:, 1]).sum() > 0:
                    r_spec[:, 1] = np.zeros(r_spec.shape[0])

                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)

                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                    if m_spec.size == 0:
                        # guard added for consistency with get_acc_HRMS:
                        # no matched peaks means similarity 0, not a crash
                        q_spec = np.empty((0, 2))
                        r_spec = np.empty((0, 2))
                    else:
                        q_spec = m_spec[:, 0:2]
                        r_spec = m_spec[:, [0, 2]]
                        is_matched = True

                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)

                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)

                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = remove_noise(q_spec, nr=noise_threshold)
                    # a curated reference library is assumed noise-free
                    if not high_quality_reference_library:
                        r_spec = remove_noise(r_spec, nr=noise_threshold)

                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = filter_spec_lcms(
                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                    )
                    if not high_quality_reference_library:
                        r_spec = filter_spec_lcms(
                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                        )

            q_ints = q_spec[:, 1]
            r_ints = r_spec[:, 1]

            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
            else:
                sim = 0.0

            similarity_by_ref[ref_id] = sim

        # references excluded by the precursor window default to score 0
        row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
        all_similarity_scores.append(row_scores)

    df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
    df_scores.index.names = ['QUERY.SPECTRUM.ID']

    # --- extract the top n matches per query (ties joined with ';') ---
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[preds,scores]

    cnames_preds = []
    cnames_scores = []
    for i in range(0,n_top_matches_to_save):
        cnames_preds.append(f'RANK.{i+1}.PRED')
        cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')

    df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
    df_top_ref_specs.index = unique_query_ids
    df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']

    df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]

    if print_id_results == True:
        print(df_top_ref_specs.to_string())

    if return_ID_output is False:
        df_top_ref_specs.to_csv(output_identification, sep='\t')
        df_scores.to_csv(output_similarity_scores, sep='\t')
    else:
        return df_top_ref_specs
-
def run_spec_lib_matching_on_NRMS_data_shiny(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
|
|
2109
|
-
if query_data is None:
|
|
2110
|
-
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
|
|
2111
|
-
sys.exit()
|
|
2112
|
-
else:
|
|
2113
|
-
extension = query_data.rsplit('.',1)
|
|
2114
|
-
extension = extension[(len(extension)-1)]
|
|
2115
|
-
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
|
|
2116
|
-
output_path_tmp = query_data[:-3] + 'txt'
|
|
2117
|
-
build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
|
|
2118
|
-
df_query = pd.read_csv(output_path_tmp, sep='\t')
|
|
2119
|
-
if extension == 'txt' or extension == 'TXT':
|
|
2120
|
-
df_query = pd.read_csv(query_data, sep='\t')
|
|
2121
|
-
unique_query_ids = df_query.iloc[:,0].unique()
|
|
2122
|
-
|
|
2123
|
-
if reference_data is None:
|
|
2124
|
-
print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
|
|
2125
|
-
sys.exit()
|
|
2126
|
-
else:
|
|
2127
|
-
if isinstance(reference_data,str):
|
|
2128
|
-
df_reference = get_reference_df(reference_data,likely_reference_ids)
|
|
2129
|
-
unique_reference_ids = df_reference.iloc[:,0].unique()
|
|
2130
|
-
else:
|
|
2131
|
-
dfs = []
|
|
2132
|
-
unique_reference_ids = []
|
|
2133
|
-
for f in reference_data:
|
|
2134
|
-
tmp = get_reference_df(f,likely_reference_ids)
|
|
2135
|
-
dfs.append(tmp)
|
|
2136
|
-
unique_reference_ids.extend(tmp.iloc[:,0].unique())
|
|
2137
|
-
df_reference = pd.concat(dfs, axis=0, ignore_index=True)
|
|
2138
|
-
|
|
2139
|
-
|
|
2140
|
-
if spectrum_preprocessing_order is not None:
|
|
2141
|
-
spectrum_preprocessing_order = list(spectrum_preprocessing_order)
|
|
2142
|
-
else:
|
|
2143
|
-
spectrum_preprocessing_order = ['F','N','W','L']
|
|
2144
|
-
if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
|
|
2145
|
-
print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
|
|
2146
|
-
sys.exit()
|
|
2147
|
-
|
|
2148
|
-
if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kuldzynski','intersection','hamming','hellinger']:
|
|
2149
|
-
print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
|
|
2150
|
-
sys.exit()
|
|
2151
|
-
|
|
2152
|
-
if isinstance(int_min,int) is True:
|
|
2153
|
-
int_min = float(int_min)
|
|
2154
|
-
if isinstance(int_max,int) is True:
|
|
2155
|
-
int_max = float(int_max)
|
|
2156
|
-
if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
|
|
2157
|
-
print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
|
|
2158
|
-
sys.exit()
|
|
2159
|
-
if mz_min < 0:
|
|
2160
|
-
print('\nError: mz_min should be a non-negative integer')
|
|
2161
|
-
sys.exit()
|
|
2162
|
-
if mz_max <= 0:
|
|
2163
|
-
print('\nError: mz_max should be a positive integer')
|
|
2164
|
-
sys.exit()
|
|
2165
|
-
if int_min < 0:
|
|
2166
|
-
print('\nError: int_min should be a non-negative float')
|
|
2167
|
-
sys.exit()
|
|
2168
|
-
if int_max <= 0:
|
|
2169
|
-
print('\nError: int_max should be a positive float')
|
|
2170
|
-
sys.exit()
|
|
2171
|
-
|
|
2172
|
-
if isinstance(noise_threshold,int) is True:
|
|
2173
|
-
noise_threshold = float(noise_threshold)
|
|
2174
|
-
if isinstance(noise_threshold,float) is False or noise_threshold < 0:
|
|
2175
|
-
print('Error: noise_threshold must be a positive float.')
|
|
2176
|
-
sys.exit()
|
|
2177
|
-
|
|
2178
|
-
if isinstance(wf_intensity,int) is True:
|
|
2179
|
-
wf_intensity = float(wf_intensity)
|
|
2180
|
-
if isinstance(wf_mz,int) is True:
|
|
2181
|
-
wf_mz = float(wf_mz)
|
|
2182
|
-
if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
|
|
2183
|
-
print('Error: wf_mz and wf_intensity must be integers or floats')
|
|
2184
|
-
sys.exit()
|
|
2185
|
-
|
|
2186
|
-
if entropy_dimension <= 0:
|
|
2187
|
-
print('\nError: entropy_dimension should be a positive float')
|
|
2188
|
-
sys.exit()
|
|
2189
|
-
else:
|
|
2190
|
-
q = entropy_dimension
|
|
2191
|
-
|
|
2192
|
-
normalization_method = 'standard'
|
|
2193
|
-
|
|
2194
|
-
if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
|
|
2195
|
-
print('\nError: n_top_matches_to_save should be a positive integer')
|
|
2196
|
-
sys.exit()
|
|
2197
|
-
|
|
2198
|
-
if isinstance(print_id_results,bool)==False:
|
|
2199
|
-
print('\nError: print_id_results must be either True or False')
|
|
2200
|
-
sys.exit()
|
|
2201
|
-
|
|
2202
|
-
if output_identification is None:
|
|
2203
|
-
output_identification = f'{Path.cwd()}/output_identification.txt'
|
|
2204
|
-
print(f'Warning: writing identification output to {output_identification}')
|
|
2205
|
-
|
|
2206
|
-
if output_similarity_scores is None:
|
|
2207
|
-
output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
|
|
2208
|
-
print(f'Warning: writing similarity scores to {output_similarity_scores}')
|
|
2209
|
-
|
|
2210
|
-
|
|
2211
|
-
|
|
2212
|
-
min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
|
|
2213
|
-
max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
|
|
2214
|
-
mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
|
|
2215
|
-
|
|
2216
|
-
all_similarity_scores = []
|
|
2217
|
-
for query_idx in range(0,len(unique_query_ids)):
|
|
2218
|
-
q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
|
|
2219
|
-
q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
2220
|
-
q_spec_tmp = convert_spec(q_spec_tmp,mzs)
|
|
2221
|
-
|
|
2222
|
-
similarity_scores = []
|
|
2223
|
-
for ref_idx in range(0,len(unique_reference_ids)):
|
|
2224
|
-
if verbose is True and ref_idx % 1000 == 0:
|
|
2225
|
-
print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
|
|
2226
|
-
q_spec = q_spec_tmp
|
|
2227
|
-
r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
|
|
2228
|
-
r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
|
|
2229
|
-
r_spec = convert_spec(r_spec_tmp,mzs)
|
|
2230
|
-
|
|
2231
|
-
for transformation in spectrum_preprocessing_order:
|
|
2232
|
-
if np.isinf(q_spec[:,1]).sum() > 0:
|
|
2233
|
-
q_spec[:,1] = np.zeros(q_spec.shape[0])
|
|
2234
|
-
if np.isinf(r_spec[:,1]).sum() > 0:
|
|
2235
|
-
r_spec[:,1] = np.zeros(r_spec.shape[0])
|
|
2236
|
-
if transformation == 'W':
|
|
2237
|
-
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
2238
|
-
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
2239
|
-
if transformation == 'L':
|
|
2240
|
-
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
2241
|
-
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
2242
|
-
if transformation == 'N':
|
|
2243
|
-
q_spec = remove_noise(q_spec, nr = noise_threshold)
|
|
2244
|
-
if high_quality_reference_library == False:
|
|
2245
|
-
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
2246
|
-
if transformation == 'F':
|
|
2247
|
-
q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
2248
|
-
if high_quality_reference_library == False:
|
|
2249
|
-
r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
2250
|
-
|
|
2251
|
-
q_ints = q_spec[:,1]
|
|
2252
|
-
r_ints = r_spec[:,1]
|
|
2253
|
-
|
|
2254
|
-
if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
|
|
2255
|
-
similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
|
|
2256
|
-
else:
|
|
2257
|
-
similarity_score = 0
|
|
2258
|
-
|
|
2259
|
-
similarity_scores.append(similarity_score)
|
|
2260
|
-
all_similarity_scores.append(similarity_scores)
|
|
2261
|
-
|
|
2262
|
-
df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
|
|
2263
|
-
df_scores.index = unique_query_ids
|
|
2264
|
-
df_scores.index.names = ['QUERY.SPECTRUM.ID']
|
|
2265
|
-
|
|
2266
|
-
preds = []
|
|
2267
|
-
scores = []
|
|
2268
|
-
for i in range(0, df_scores.shape[0]):
|
|
2269
|
-
df_scores_tmp = df_scores
|
|
2270
|
-
preds_tmp = []
|
|
2271
|
-
scores_tmp = []
|
|
2272
|
-
for j in range(0, n_top_matches_to_save):
|
|
2273
|
-
top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
|
|
2274
|
-
cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
|
|
2275
|
-
df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
|
|
2276
|
-
|
|
2277
|
-
preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
|
|
2278
|
-
if len(top_ref_specs_tmp.values) == 0:
|
|
2279
|
-
scores_tmp.append(0)
|
|
2280
|
-
else:
|
|
2281
|
-
scores_tmp.append(top_ref_specs_tmp.values[0])
|
|
2282
|
-
preds.append(preds_tmp)
|
|
2283
|
-
scores.append(scores_tmp)
|
|
2284
|
-
|
|
2285
|
-
preds = np.array(preds)
|
|
2286
|
-
scores = np.array(scores)
|
|
2287
|
-
out = np.c_[preds,scores]
|
|
2288
|
-
|
|
2289
|
-
cnames_preds = []
|
|
2290
|
-
cnames_scores = []
|
|
2291
|
-
for i in range(0,n_top_matches_to_save):
|
|
2292
|
-
cnames_preds.append(f'RANK.{i+1}.PRED')
|
|
2293
|
-
cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
|
|
2294
|
-
|
|
2295
|
-
df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
|
|
2296
|
-
df_top_ref_specs.index = unique_query_ids
|
|
2297
|
-
df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
|
|
2298
|
-
|
|
2299
|
-
if print_id_results == True:
|
|
2300
|
-
print(df_top_ref_specs.to_string())
|
|
2301
|
-
|
|
2302
|
-
df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
|
|
2303
|
-
|
|
2304
|
-
if return_ID_output is False:
|
|
2305
|
-
df_top_ref_specs.to_csv(output_identification, sep='\t')
|
|
2306
|
-
df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
|
|
2307
|
-
df_scores.to_csv(output_similarity_scores, sep='\t')
|
|
2308
|
-
else:
|
|
2309
|
-
return df_top_ref_specs
|
|
2310
|
-
|
|
2311
|
-
|
|
2312
|
-
class _UIWriter:
|
|
2313
|
-
def __init__(self, loop, q: asyncio.Queue[str]):
|
|
2314
|
-
self._loop = loop
|
|
2315
|
-
self._q = q
|
|
2316
|
-
def write(self, s: str):
|
|
2317
|
-
if s:
|
|
2318
|
-
self._loop.call_soon_threadsafe(self._q.put_nowait, s)
|
|
2319
|
-
return len(s)
|
|
2320
|
-
def flush(self):
|
|
2321
|
-
pass
|
|
2322
|
-
|
|
2323
|
-
|
|
2324
|
-
def attach_logging_to_writer(writer):
    """Route root-logger records (INFO and above) into *writer*.

    Installs a ``logging.StreamHandler`` wrapping *writer* on the root
    logger and sets both the handler and the root logger to ``INFO``.

    Returns:
        tuple: ``(handler, root_logger)`` so the caller can later detach
        the handler with ``root_logger.removeHandler(handler)``.
    """
    stream_handler = logging.StreamHandler(writer)
    stream_handler.setLevel(logging.INFO)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(stream_handler)
    return stream_handler, root_logger
|
|
2331
|
-
|
|
2332
|
-
|
|
2333
|
-
|
|
2334
|
-
def _run_with_redirects(fn, writer, *args, **kwargs):
|
|
2335
|
-
with redirect_stdout(writer), redirect_stderr(writer):
|
|
2336
|
-
return fn(*args, **kwargs)
|
|
2337
|
-
|
|
2338
|
-
|
|
2339
|
-
def strip_text(s):
    """Parse a bracketed, comma-separated string into a list of string tokens.

    Example: ``"[a, b, c]"`` -> ``["a", "b", "c"]``.  Surrounding whitespace
    is removed from each entry and empty tokens are dropped.
    """
    result = []
    for token in s.strip('[]').split(','):
        cleaned = token.strip()
        if cleaned:
            result.append(cleaned)
    return result
|
|
2341
|
-
|
|
2342
|
-
|
|
2343
|
-
def strip_numeric(s):
    """Parse a bracketed, comma-separated string into a list of floats.

    Example: ``"[0.1, 0.5]"`` -> ``[0.1, 0.5]``.  Empty tokens are dropped.
    """
    return [float(part) for part in map(str.strip, s.strip('[]').split(',')) if part]
|
|
2345
|
-
|
|
2346
|
-
|
|
2347
|
-
def strip_weights(s):
    """Normalize a weights specification into a list of named-weight dicts.

    Accepts either a literal string (parsed via ``ast.literal_eval``) or an
    already-parsed list/tuple.  The value may be a single 4-tuple of numbers
    or a sequence of 4-tuples; each 4-tuple is mapped onto the keys
    ``Cosine``, ``Shannon``, ``Renyi`` and ``Tsallis`` in that order.

    Raises:
        ValueError: if the input is not a 4-tuple or a sequence of 4-tuples.
    """
    parsed = ast.literal_eval(s) if isinstance(s, (str, bytes)) else s
    measure_names = ['Cosine', 'Shannon', 'Renyi', 'Tsallis']

    if not isinstance(parsed, (list, tuple)):
        raise ValueError(f"Expected a 4-tuple or a sequence of 4-tuples, got {type(parsed).__name__}")

    # A bare 4-tuple of numbers is treated as a single weight set;
    # otherwise the input is taken as a sequence of weight sets.
    if len(parsed) == 4 and all(isinstance(v, Real) for v in parsed):
        weight_sets = [parsed]
    else:
        weight_sets = list(parsed)

    result = []
    for ws in weight_sets:
        if not (isinstance(ws, (list, tuple)) and len(ws) == 4):
            raise ValueError(f"Each item must be a 4-tuple, got: {ws!r}")
        result.append(dict(zip(measure_names, ws)))
    return result
|
|
2365
|
-
|
|
2366
|
-
|
|
2367
|
-
def build_library(input_path=None, output_path=None):
    """Load a spectral library from disk into a long-format DataFrame.

    The format is detected case-insensitively from the file extension:
    mgf, mzML, cdf, msp, json, or tab-separated txt.  A txt file is assumed
    to already be in long format and is returned as read; all other formats
    are converted to a DataFrame with one row per peak and the columns
    ``id``, ``mz_ratio`` and ``intensity``.

    Args:
        input_path: Path to the library file (extension determines format).
        output_path: Unused; kept for backward compatibility with callers.

    Returns:
        pandas.DataFrame in long format (one row per peak).

    Exits the process (``sys.exit``) on an unsupported extension, matching
    the original CLI-style behavior.
    """
    # Case-insensitive extension detection (fixes rejection of e.g. '.Txt').
    suffix = input_path.rsplit('.', 1)[-1].lower() if '.' in input_path else ''

    if suffix == 'txt':
        return pd.read_csv(input_path, sep='\t')

    if suffix == 'mgf':
        input_file_type = 'mgf'
    elif suffix == 'mzml':
        input_file_type = 'mzML'
    elif suffix == 'json':
        input_file_type = 'json'
    elif suffix == 'cdf':
        input_file_type = 'cdf'
    elif suffix == 'msp':
        input_file_type = 'msp'
    else:
        print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'msp\', \'json\', or \'txt\' file must be passed to --input_path')
        sys.exit()

    if input_file_type in ('mgf', 'mzML'):
        ids, mzs, ints = _read_peaks_pyteomics(input_path, input_file_type)
    elif input_file_type == 'cdf':
        ids, mzs, ints = _read_peaks_cdf(input_path)
    elif input_file_type == 'msp':
        ids, mzs, ints = _read_peaks_msp(input_path)
    else:  # 'json'
        ids, mzs, ints = _read_peaks_json(input_path)

    return pd.DataFrame({'id': ids, 'mz_ratio': mzs, 'intensity': ints})


def _read_peaks_pyteomics(input_path, input_file_type):
    """Read mgf/mzML spectra via pyteomics into parallel id/mz/intensity lists."""
    spectra = []
    if input_file_type == 'mgf':
        with mgf.read(input_path, index_by_scans=True) as reader:
            spectra.extend(reader)
    else:
        with mzml.read(input_path) as reader:
            spectra.extend(reader)

    ids, mzs, ints = [], [], []
    for i, spec in enumerate(spectra):
        # mzML spectra carry no NAME parameter, so synthesize sequential IDs.
        spec_id = f'ID_{i+1}' if input_file_type == 'mzML' else spec['params']['name']
        for j in range(len(spec['m/z array'])):
            ids.append(spec_id)
            mzs.append(spec['m/z array'][j])
            ints.append(spec['intensity array'][j])
    return ids, mzs, ints


def _read_peaks_cdf(input_path):
    """Read a netCDF (ANDI-MS) file into parallel id/mz/intensity lists."""
    dataset = nc.Dataset(input_path, 'r')
    all_mzs = dataset.variables['mass_values'][:]
    all_ints = dataset.variables['intensity_values'][:]
    scan_idxs = dataset.variables['scan_index'][:]
    dataset.close()

    ids, mzs, ints = [], [], []
    for i in range(len(scan_idxs) - 1):
        if i % 1000 == 0:
            print(f'analyzed {i} out of {len(scan_idxs)} scans')
        # scan_index holds the start offset of each scan in the flat arrays.
        s_idx = scan_idxs[i]
        e_idx = scan_idxs[i + 1]
        for mz, intensity in zip(all_mzs[s_idx:e_idx], all_ints[s_idx:e_idx]):
            ids.append(f'ID_{i+1}')
            mzs.append(mz)
            ints.append(intensity)
    return ids, mzs, ints


def _read_peaks_msp(input_path):
    """Read an msp text file into parallel id/mz/intensity lists."""
    ids, mzs, ints = [], [], []
    spectrum_id = None
    with open(input_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('Name:'):
                spectrum_id = line.replace('Name: ', '')
            elif line and line[0].isdigit() and spectrum_id is not None:
                # Guard on spectrum_id: a peak line appearing before any
                # 'Name:' header previously raised NameError; now it is skipped.
                try:
                    mz, intensity = map(float, line.split()[:2])
                except ValueError:
                    # Non-numeric "peak" line (e.g. metadata starting with a digit).
                    continue
                ids.append(spectrum_id)
                mzs.append(mz)
                ints.append(intensity)
    return ids, mzs, ints


def _read_peaks_json(input_path):
    """Read a GNPS-style json export into parallel id/mz/intensity lists."""
    with open(input_path) as fh:
        data = json.load(fh)

    ids, mzs, ints = [], [], []
    for entry in data:
        spec_id = entry['spectrum_id']
        # peaks_json is a stringified list of [mz, intensity] pairs.
        raw = entry['peaks_json'][1:-1].split(',')
        tokens = [t.replace('[', '').replace(']', '').strip() for t in raw]
        tokens = [t for t in tokens if t]
        # Even positions are m/z values, odd positions are intensities.
        # Convert to float so the resulting columns are numeric like the
        # other input formats (previously left as strings).
        mz_vals = [float(t) for t in tokens[0::2]]
        int_vals = [float(t) for t in tokens[1::2]]
        ids.extend([spec_id] * len(mz_vals))
        mzs.extend(mz_vals)
        ints.extend(int_vals)
    return ids, mzs, ints
|
|
2473
|
-
|
|
2474
|
-
|
|
2475
|
-
|
|
2476
|
-
def extract_first_column_ids(file_path: str, max_ids: int = 20000):
    """Collect unique spectrum identifiers from a library file.

    For tab-separated ``.txt`` files the ``id`` column (or, failing that,
    the first column) is used.  For any other extension the file is scanned
    line by line for ``TITLE=`` (mgf) and ``Name:`` (msp) headers.  Order of
    first appearance is preserved; at most *max_ids* identifiers are read.
    Read errors on non-txt files are silently ignored (an empty list is
    returned).
    """
    if Path(file_path).suffix.lower() == ".txt":
        table = pd.read_csv(file_path, sep='\t')
        column = table['id'] if 'id' in table.columns.tolist() else table.iloc[:, 0]
        values = [v for v in column.astype(str).dropna() if v.strip() != ""]
        # dict.fromkeys preserves first-seen order while deduplicating.
        return list(dict.fromkeys(values))[:max_ids]

    found = []
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            for raw in handle:
                stripped = raw.strip()
                if stripped.startswith("TITLE="):
                    found.append(stripped.split("=", 1)[1].strip())
                elif stripped.lower().startswith("name:"):
                    found.append(stripped.split(":", 1)[1].strip())
                if len(found) >= max_ids:
                    break
    except Exception:
        # Best-effort scan: unreadable/missing files yield an empty result.
        pass

    return list(dict.fromkeys(found))
|
|
2517
|
-
|
|
2518
|
-
|
|
2519
|
-
def _open_plot_window(session, svg_bytes: bytes, title: str = "plot.svg"):
|
|
2520
|
-
"""Send SVG bytes to browser and open in a new window as a data URL."""
|
|
2521
|
-
b64 = base64.b64encode(svg_bytes).decode("ascii")
|
|
2522
|
-
data_url = f"data:image/svg;base64,{b64}"
|
|
2523
|
-
session.send_custom_message("open-plot-window", {"svg": data_url, "title": title})
|
|
2524
|
-
|
|
2525
|
-
|
|
2526
|
-
def plot_spectra_ui(platform: str):
    """Build the Shiny UI for the "Plot Spectra" screen.

    Args:
        platform: "HRMS" or "NRMS"; selects which preprocessing inputs are
            shown and how the four input columns are laid out.

    Returns:
        A ``ui.div`` containing the page header, the four-column input
        layout, the Run/Back buttons, and two status output slots.

    NOTE(review): ``inputs_columns`` is only assigned inside the
    "HRMS"/"NRMS" branches below; any other *platform* value raises
    NameError at the final ``return``.
    """
    # Inputs shared by both platforms (datasets, spectrum pickers, similarity config).
    base_inputs = [
        ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
        ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
        ui.input_selectize(
            "spectrum_ID1",
            "Select spectrum ID 1 (default is the first spectrum in the library):",
            choices=[],
            multiple=False,
            options={"placeholder": "Upload a library..."},
        ),
        ui.input_selectize(
            "spectrum_ID2",
            "Select spectrum ID 2 (default is the first spectrum in the library):",
            choices=[],
            multiple=False,
            options={"placeholder": "Upload a library..."},
        ),
        ui.input_select('print_url_spectrum1', 'Print PubChem URL for spectrum 1:', ['No', 'Yes']),
        ui.input_select('print_url_spectrum2', 'Print PubChem URL for spectrum 2:', ['No', 'Yes']),
        ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
        ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
        ui.input_select(
            "high_quality_reference_library",
            "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.",
            [False, True],
        ),
    ]

    # Platform-specific preprocessing controls: HRMS adds centroiding/matching
    # window sizes; NRMS only exposes the preprocessing-order string.
    if platform == "HRMS":
        extra_inputs = [
            ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL",),
            ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
            ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
        ]
    else:
        extra_inputs = [
            ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW",)
        ]

    # Numeric preprocessing parameters common to both platforms.
    numeric_inputs = [
        ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
        ui.input_numeric("mz_max", "Maximum m/z for filtering:", 99999999),
        ui.input_numeric("int_min", "Minimum intensity for filtering:", 0),
        ui.input_numeric("int_max", "Maximum intensity for filtering:", 999999999),
        ui.input_numeric("noise_threshold", "Noise removal threshold:", 0.0),
        ui.input_numeric("wf_mz", "Mass/charge weight factor:", 0.0),
        ui.input_numeric("wf_int", "Intensity weight factor:", 1.0),
        ui.input_numeric("LET_threshold", "Low-entropy threshold:", 0.0),
        ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
    ]

    select_input = ui.input_select("y_axis_transformation", "Transformation to apply to intensity axis:", ["normalized", "none", "log10", "sqrt"])

    run_button_plot_spectra = ui.download_button("run_btn_plot_spectra", "Run", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
    back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")

    # NOTE(review): several ui.div calls below pass nested lists of tags
    # (e.g. [base_inputs[6:9], extra_inputs[0]]) — presumably relying on
    # htmltools flattening nested children; confirm rendering is as intended.
    if platform == "HRMS":
        inputs_columns = ui.layout_columns(
            ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div([base_inputs[6:9], extra_inputs[0]], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div(extra_inputs[1:3], numeric_inputs[0:3], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div([numeric_inputs[3:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
            col_widths=(3,3,3,3),
        )
    elif platform == "NRMS":
        inputs_columns = ui.layout_columns(
            ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div([base_inputs[6:9], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div([numeric_inputs[5:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
            col_widths=(3,3,3,3),
        )

    return ui.div(
        ui.TagList(
            ui.h2("Plot Spectra"),
            inputs_columns,
            run_button_plot_spectra,
            back_button,
            ui.div(ui.output_text("plot_query_status"), style="margin-top:8px; font-size:14px"),
            ui.div(ui.output_text("plot_reference_status"), style="margin-top:8px; font-size:14px")
        ),
    )
|
|
2610
|
-
|
|
2611
|
-
|
|
2612
|
-
|
|
2613
|
-
def run_spec_lib_matching_ui(platform: str):
    """Build the Shiny UI for the "Run Spectral Library Matching" screen.

    Args:
        platform: "HRMS" or "NRMS"; HRMS adds precursor-ion/adduct and
            centroiding/matching window controls, and the column layout
            differs between the two.

    Returns:
        A ``ui.div`` with the input columns, the two run buttons (matching
        and plotting), the back button, and the identification-log panel.

    NOTE(review): ``inputs_columns`` is only assigned for "HRMS"/"NRMS";
    any other *platform* value raises NameError at the final ``return``.
    """
    # Inputs shared by both platforms.
    base_inputs = [
        ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
        ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
        ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
        ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
        ui.input_file('compound_ID_output_file', 'Upload output from spectral library matching to plot top matches (optional)'),
        ui.input_selectize("q_spec", "Select query spectrum (only applicable for plotting; default is the first spectrum in the compound ID output):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
        ui.input_selectize("r_spec", "Select reference spectrum (only applicable for plotting; default is the rank 1 reference spectrum):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
        ui.input_select('print_url_spectrum1', 'Print PubChem URL for query spectrum (only applicable for plotting):', ['No', 'Yes']),
        ui.input_select('print_url_spectrum2', 'Print PubChem URL for reference spectrum (only applicable for plotting):', ['No', 'Yes']),
        ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])
    ]

    # Platform-specific controls: HRMS gets precursor/adduct and window sizes.
    if platform == "HRMS":
        extra_inputs = [
            ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
            ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
            ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
            ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.","FCNMWL"),
            ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
            ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
        ]
    else:
        extra_inputs = [ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).","FNLW")]

    # Numeric preprocessing parameters plus the number of matches to keep.
    numeric_inputs = [
        ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
        ui.input_numeric("mz_max", "Maximum m/z for filtering:", 99999999),
        ui.input_numeric("int_min", "Minimum intensity for filtering:", 0),
        ui.input_numeric("int_max", "Maximum intensity for filtering:", 999999999),
        ui.input_numeric("noise_threshold", "Noise removal threshold:", 0.0),
        ui.input_numeric("wf_mz", "Mass/charge weight factor:", 0.0),
        ui.input_numeric("wf_int", "Intensity weight factor:", 1.0),
        ui.input_numeric("LET_threshold", "Low-entropy threshold:", 0.0),
        ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
        ui.input_numeric("n_top_matches_to_save", "Number of top matches to save:", 3),
    ]


    run_button_spec_lib_matching = ui.download_button("run_btn_spec_lib_matching", "Run Spectral Library Matching", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
    run_button_plot_spectra_within_spec_lib_matching = ui.download_button("run_btn_plot_spectra_within_spec_lib_matching", "Plot Spectra", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
    back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")

    # NOTE(review): several ui.div calls pass nested lists of tags —
    # presumably relying on htmltools flattening; verify rendering.
    if platform == "HRMS":
        inputs_columns = ui.layout_columns(
            ui.div([base_inputs[0:2], extra_inputs[0:3], base_inputs[2:4]], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div([base_inputs[4:10]], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div([extra_inputs[3:6], numeric_inputs[0:3]], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div(numeric_inputs[3:10], style="display:flex; flex-direction:column; gap:10px;"),
            col_widths=(3,3,3,3)
        )
    elif platform == "NRMS":
        inputs_columns = ui.layout_columns(
            ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div([base_inputs[6:10], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div(numeric_inputs[5:10], style="display:flex; flex-direction:column; gap:10px;"),
            col_widths=(3,3,3,3)
        )

    # Scrollable panel that mirrors matching progress written to "match_log".
    log_panel = ui.card(
        ui.card_header("Identification log"),
        ui.output_text_verbatim("match_log"),
        style="max-height:300px; overflow:auto"
    )

    return ui.div(
        ui.TagList(
            ui.h2("Run Spectral Library Matching"),
            inputs_columns,
            run_button_spec_lib_matching,
            run_button_plot_spectra_within_spec_lib_matching,
            back_button,
            log_panel
        ),
    )
|
|
2690
|
-
|
|
2691
|
-
|
|
2692
|
-
|
|
2693
|
-
def run_parameter_tuning_grid_ui(platform: str):
    """Build the Shiny UI for the grid-search parameter-tuning screen.

    Unlike the other screens, most parameters here are free-text fields
    holding bracketed lists (e.g. ``'[0.1,0.5]'``) so the user can supply
    multiple candidate values per parameter for the grid search.

    Args:
        platform: "HRMS" or "NRMS"; HRMS adds precursor/adduct and
            centroiding/matching window-size list inputs.

    Returns:
        A ``ui.div`` with the input columns, the run/back buttons, and the
        identification-log panel.

    NOTE(review): ``inputs_columns`` is only assigned for "HRMS"/"NRMS"
    (NameError otherwise), and the two layout branches below are currently
    identical — possibly intentional, possibly a leftover.
    """
    # Inputs shared by both platforms; list-valued fields are text inputs.
    base_inputs = [
        ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
        ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
        ui.input_selectize("similarity_measure", "Select similarity measure(s):", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"], multiple=True, selected='cosine'),
        ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '((0.25, 0.25, 0.25, 0.25))'),
        ui.input_text("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", '[True]')
    ]

    # Platform-specific controls; HRMS exposes window sizes as candidate lists.
    if platform == "HRMS":
        extra_inputs = [
            ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
            ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
            ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
            ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "[FCNMWL,CWM]"),
            ui.input_text("window_size_centroiding", "Centroiding window-size:", "[0.5]"),
            ui.input_text("window_size_matching", "Matching window-size:", "[0.1,0.5]"),
        ]
    else:
        extra_inputs = [
            ui.input_text(
                "spectrum_preprocessing_order",
                "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
                "[FNLW,WNL]",
            )
        ]

    # Candidate-value lists for the numeric preprocessing parameters.
    numeric_inputs = [
        ui.input_text("mz_min", "Minimum m/z for filtering:", '[0]'),
        ui.input_text("mz_max", "Maximum m/z for filtering:", '[99999999]'),
        ui.input_text("int_min", "Minimum intensity for filtering:", '[0]'),
        ui.input_text("int_max", "Maximum intensity for filtering:", '[999999999]'),
        ui.input_text("noise_threshold", "Noise removal threshold:", '[0.0]'),
        ui.input_text("wf_mz", "Mass/charge weight factor:", '[0.0]'),
        ui.input_text("wf_int", "Intensity weight factor:", '[1.0]'),
        ui.input_text("LET_threshold", "Low-entropy threshold:", '[0.0]'),
        ui.input_text("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", '[1.1]')
    ]


    run_button_parameter_tuning_grid = ui.download_button("run_btn_parameter_tuning_grid", "Tune parameters (grid search)", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
    back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")

    if platform == "HRMS":
        inputs_columns = ui.layout_columns(
            ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div(numeric_inputs[5:9], style="display:flex; flex-direction:column; gap:10px;"),
            col_widths=(3, 3, 3, 3),
        )
    elif platform == "NRMS":
        inputs_columns = ui.layout_columns(
            ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
            ui.div(numeric_inputs[5:9], style="display:flex; flex-direction:column; gap:10px;"),
            col_widths=(3, 3, 3, 3),
        )

    # Scrollable panel mirroring tuning progress written to "match_log".
    log_panel = ui.card(
        ui.card_header("Identification log"),
        ui.output_text_verbatim("match_log"),
        style="max-height:300px; overflow:auto"
    )

    return ui.div(
        ui.TagList(
            ui.h2("Tune parameters (grid search)"),
            inputs_columns,
            run_button_parameter_tuning_grid,
            back_button,
            log_panel
        ),
    )
|
|
2768
|
-
|
|
2769
|
-
|
|
2770
|
-
|
|
2771
|
-
# Per-parameter search bounds (min, max) for the tunable preprocessing
# parameters on high-resolution (HRMS) data.  Selected by
# run_parameter_tuning_DE_ui below based on the platform; presumably passed
# to scipy's differential_evolution as bounds — TODO confirm at the call site.
PARAMS_HRMS = {
    "window_size_centroiding": (0.0, 0.5),
    "window_size_matching": (0.0, 0.5),
    "noise_threshold": (0.0, 0.25),
    "wf_mz": (0.0, 5.0),
    "wf_int": (0.0, 5.0),
    "LET_threshold": (0.0, 5.0),
    "entropy_dimension": (1.0, 3.0)
}

# Same bounds for nominal-resolution (NRMS) data; NRMS has no centroiding or
# matching step, so those two window-size parameters are omitted.
PARAMS_NRMS = {
    "noise_threshold": (0.0, 0.25),
    "wf_mz": (0.0, 5.0),
    "wf_int": (0.0, 5.0),
    "LET_threshold": (0.0, 5.0),
    "entropy_dimension": (1.0, 3.0)
}
|
|
2788
|
-
|
|
2789
|
-
|
|
2790
|
-
def run_parameter_tuning_DE_ui(platform: str):
|
|
2791
|
-
# Pick param set per platform
|
|
2792
|
-
if platform == "HRMS":
|
|
2793
|
-
PARAMS = PARAMS_HRMS
|
|
2794
|
-
else:
|
|
2795
|
-
PARAMS = PARAMS_NRMS
|
|
2796
|
-
|
|
2797
|
-
base_inputs = [
|
|
2798
|
-
ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
2799
|
-
ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
|
|
2800
|
-
ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
|
|
2801
|
-
ui.input_text("weights", "Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):", "0.25, 0.25, 0.25, 0.25"),
|
|
2802
|
-
ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])]
|
|
2803
|
-
|
|
2804
|
-
if platform == "HRMS":
|
|
2805
|
-
extra_inputs = [
|
|
2806
|
-
ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
|
|
2807
|
-
ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
|
|
2808
|
-
ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
|
|
2809
|
-
ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL"),
|
|
2810
|
-
ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
|
|
2811
|
-
ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
|
|
2812
|
-
]
|
|
2813
|
-
else:
|
|
2814
|
-
extra_inputs = [ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW")]
|
|
2815
|
-
|
|
2816
|
-
numeric_inputs = [
|
|
2817
|
-
ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
|
|
2818
|
-
ui.input_numeric("mz_max", "Maximum m/z for filtering:", 99_999_999),
|
|
2819
|
-
ui.input_numeric("int_min", "Minimum intensity for filtering:", 0),
|
|
2820
|
-
ui.input_numeric("int_max", "Maximum intensity for filtering:", 999_999_999),
|
|
2821
|
-
ui.input_numeric("noise_threshold", "Noise removal threshold:", 0.0),
|
|
2822
|
-
ui.input_numeric("wf_mz", "Mass/charge weight factor:", 0.0),
|
|
2823
|
-
ui.input_numeric("wf_int", "Intensity weight factor:", 1.0),
|
|
2824
|
-
ui.input_numeric("LET_threshold", "Low-entropy threshold:", 0.0),
|
|
2825
|
-
ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
|
|
2826
|
-
ui.input_numeric("max_iterations", "Maximum number of iterations:", 5),
|
|
2827
|
-
]
|
|
2828
|
-
|
|
2829
|
-
run_button_parameter_tuning_DE = ui.input_action_button("run_btn_parameter_tuning_DE", "Tune parameters (differential evolution optimization)", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
|
|
2830
|
-
back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
|
|
2831
|
-
|
|
2832
|
-
if platform == "HRMS":
|
|
2833
|
-
inputs_columns = ui.layout_columns(
|
|
2834
|
-
ui.div(*base_inputs, style="display:flex; flex-direction:column; gap:10px;"),
|
|
2835
|
-
ui.div(*extra_inputs, style="display:flex; flex-direction:column; gap:10px;"),
|
|
2836
|
-
ui.div(*numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2837
|
-
ui.div(*numeric_inputs[5:11], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2838
|
-
col_widths=(3, 3, 3, 3),
|
|
2839
|
-
)
|
|
2840
|
-
else:
|
|
2841
|
-
inputs_columns = ui.layout_columns(
|
|
2842
|
-
ui.div(*base_inputs, style="display:flex; flex-direction:column; gap:10px;"),
|
|
2843
|
-
ui.div(*extra_inputs, style="display:flex; flex-direction:column; gap:10px;"),
|
|
2844
|
-
ui.div(*numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2845
|
-
ui.div(*numeric_inputs[5:11], style="display:flex; flex-direction:column; gap:10px;"),
|
|
2846
|
-
col_widths=(3, 3, 3, 3),
|
|
2847
|
-
)
|
|
2848
|
-
|
|
2849
|
-
return ui.page_fillable(
|
|
2850
|
-
ui.layout_sidebar(
|
|
2851
|
-
ui.sidebar(
|
|
2852
|
-
ui.h3("Select continuous parameters to optimize"),
|
|
2853
|
-
ui.input_checkbox_group("params", None, choices=list(PARAMS.keys()), selected=["noise_threshold", "LET_threshold"]),
|
|
2854
|
-
ui.hr(),
|
|
2855
|
-
ui.h4("Bounds for selected parameters"),
|
|
2856
|
-
ui.output_ui("bounds_inputs"),
|
|
2857
|
-
width=360,
|
|
2858
|
-
),
|
|
2859
|
-
ui.div(
|
|
2860
|
-
ui.h2("Tune parameters (differential evolution optimization)"),
|
|
2861
|
-
inputs_columns,
|
|
2862
|
-
ui.div(run_button_parameter_tuning_DE, back_button, style=("display:flex; flex-direction:row; gap:12px; align-items:center; flex-wrap:wrap;")),
|
|
2863
|
-
ui.br(),
|
|
2864
|
-
ui.card(
|
|
2865
|
-
ui.card_header("Live log"),
|
|
2866
|
-
ui.output_text_verbatim("run_log"),
|
|
2867
|
-
),
|
|
2868
|
-
style="display:flex; flex-direction:column; gap:16px;",
|
|
2869
|
-
),
|
|
2870
|
-
)
|
|
2871
|
-
)
|
|
2872
|
-
|
|
2873
|
-
|
|
2874
|
-
|
|
2875
|
-
app_ui = ui.page_fluid(
|
|
2876
|
-
ui.head_content(ui.tags.link(rel="icon", href="emblem.png")),
|
|
2877
|
-
ui.div(ui.output_image("image"), style=("display:block; margin:20px auto; max-width:320px; height:auto; text-align:center")),
|
|
2878
|
-
ui.output_ui("main_ui"),
|
|
2879
|
-
ui.output_text("status_output"),
|
|
2880
|
-
)
|
|
2881
|
-
|
|
2882
|
-
|
|
2883
|
-
|
|
2884
|
-
|
|
2885
|
-
def server(input, output, session):
|
|
2886
|
-
|
|
2887
|
-
current_page = reactive.Value("main_menu")
|
|
2888
|
-
|
|
2889
|
-
plot_clicks = reactive.Value(0)
|
|
2890
|
-
match_clicks = reactive.Value(0)
|
|
2891
|
-
back_clicks = reactive.Value(0)
|
|
2892
|
-
|
|
2893
|
-
run_status_plot_spectra = reactive.Value("")
|
|
2894
|
-
run_status_spec_lib_matching = reactive.Value("")
|
|
2895
|
-
run_status_plot_spectra_within_spec_lib_matching = reactive.Value("")
|
|
2896
|
-
run_status_parameter_tuning_grid = reactive.Value("")
|
|
2897
|
-
run_status_parameter_tuning_DE = reactive.Value("")
|
|
2898
|
-
is_tuning_grid_running = reactive.Value(False)
|
|
2899
|
-
is_tuning_DE_running = reactive.Value(False)
|
|
2900
|
-
match_log_rv = reactive.Value("")
|
|
2901
|
-
is_matching_rv = reactive.Value(False)
|
|
2902
|
-
is_any_job_running = reactive.Value(False)
|
|
2903
|
-
latest_txt_path_rv = reactive.Value("")
|
|
2904
|
-
latest_df_rv = reactive.Value(None)
|
|
2905
|
-
is_running_rv = reactive.Value(False)
|
|
2906
|
-
|
|
2907
|
-
query_ids_rv = reactive.Value([])
|
|
2908
|
-
query_file_path_rv = reactive.Value(None)
|
|
2909
|
-
query_result_rv = reactive.Value(None)
|
|
2910
|
-
query_status_rv = reactive.Value("")
|
|
2911
|
-
reference_ids_rv = reactive.Value([])
|
|
2912
|
-
reference_file_path_rv = reactive.Value(None)
|
|
2913
|
-
reference_result_rv = reactive.Value(None)
|
|
2914
|
-
reference_status_rv = reactive.Value("")
|
|
2915
|
-
|
|
2916
|
-
converted_query_path_rv = reactive.Value(None)
|
|
2917
|
-
converted_reference_path_rv = reactive.Value(None)
|
|
2918
|
-
|
|
2919
|
-
df_rv = reactive.Value(None)
|
|
2920
|
-
|
|
2921
|
-
|
|
2922
|
-
def _discover_rank_cols(df: pd.DataFrame):
|
|
2923
|
-
pred_pat = re.compile(r"^RANK\.(\d+)\.PRED$")
|
|
2924
|
-
score_pat = re.compile(r"^RANK\.(\d+)\.SIMILARITY\.SCORE$")
|
|
2925
|
-
pred_map, score_map = {}, {}
|
|
2926
|
-
for c in df.columns:
|
|
2927
|
-
m = pred_pat.match(c)
|
|
2928
|
-
if m: pred_map[int(m.group(1))] = c
|
|
2929
|
-
m = score_pat.match(c)
|
|
2930
|
-
if m: score_map[int(m.group(1))] = c
|
|
2931
|
-
return [(k, pred_map[k], score_map.get(k)) for k in sorted(pred_map)]
|
|
2932
|
-
|
|
2933
|
-
|
|
2934
|
-
def _rank_choices_for_query(df: pd.DataFrame, qid: str):
|
|
2935
|
-
sub = df.loc[df["QUERY.SPECTRUM.ID"].astype(str) == str(qid)]
|
|
2936
|
-
if sub.empty:
|
|
2937
|
-
return {}, None
|
|
2938
|
-
row = sub.iloc[0]
|
|
2939
|
-
rank_cols = _discover_rank_cols(df)
|
|
2940
|
-
if not rank_cols:
|
|
2941
|
-
return {}, None
|
|
2942
|
-
|
|
2943
|
-
choices = {}
|
|
2944
|
-
default_value = None
|
|
2945
|
-
for (k, pred_col, score_col) in rank_cols:
|
|
2946
|
-
pred = row.get(pred_col, None)
|
|
2947
|
-
if pd.isna(pred):
|
|
2948
|
-
continue
|
|
2949
|
-
pred = str(pred)
|
|
2950
|
-
score = row.get(score_col, None) if score_col else None
|
|
2951
|
-
score_str = f"{float(score):.6f}" if (score is not None and pd.notna(score)) else "NA"
|
|
2952
|
-
label = f"Rank {k} — {score_str} — {pred}"
|
|
2953
|
-
choices[label] = pred # values are plain names
|
|
2954
|
-
if k == 1:
|
|
2955
|
-
default_value = pred # default = Rank 1 name
|
|
2956
|
-
|
|
2957
|
-
if default_value is None and choices:
|
|
2958
|
-
default_value = next(iter(choices.values()))
|
|
2959
|
-
return choices, default_value
|
|
2960
|
-
|
|
2961
|
-
|
|
2962
|
-
@reactive.effect
|
|
2963
|
-
@reactive.event(input.compound_ID_output_file)
|
|
2964
|
-
async def _populate_ids_from_compound_ID_output_upload():
|
|
2965
|
-
files = input.compound_ID_output_file()
|
|
2966
|
-
if not files:
|
|
2967
|
-
return
|
|
2968
|
-
|
|
2969
|
-
in_path = Path(files[0]["datapath"])
|
|
2970
|
-
try:
|
|
2971
|
-
query_status_rv.set(f"Reading table from: {in_path.name} …")
|
|
2972
|
-
await reactive.flush()
|
|
2973
|
-
|
|
2974
|
-
df = await asyncio.to_thread(pd.read_csv, in_path, sep="\t", header=0)
|
|
2975
|
-
|
|
2976
|
-
if "QUERY.SPECTRUM.ID" not in df.columns:
|
|
2977
|
-
raise ValueError("Missing required column: QUERY.SPECTRUM.ID")
|
|
2978
|
-
if not _discover_rank_cols(df):
|
|
2979
|
-
raise ValueError("No columns matching RANK.<k>.PRED found.")
|
|
2980
|
-
|
|
2981
|
-
df_rv.set(df)
|
|
2982
|
-
|
|
2983
|
-
ids = df["QUERY.SPECTRUM.ID"].astype(str).tolist()
|
|
2984
|
-
unique_ids_in_order = list(dict.fromkeys(ids))
|
|
2985
|
-
|
|
2986
|
-
choices_dict, default_rank_value = _rank_choices_for_query(df, ids[0])
|
|
2987
|
-
choices_values = [str(v).strip() for v in choices_dict.values()]
|
|
2988
|
-
default_rank_value = str(default_rank_value).strip() if default_rank_value is not None else None
|
|
2989
|
-
|
|
2990
|
-
ui.update_selectize("q_spec", choices=unique_ids_in_order, selected=ids[0])
|
|
2991
|
-
await reactive.flush()
|
|
2992
|
-
|
|
2993
|
-
ui.update_selectize("r_spec", choices=choices_values, selected=choices_values[0])
|
|
2994
|
-
await reactive.flush()
|
|
2995
|
-
|
|
2996
|
-
except Exception as e:
|
|
2997
|
-
query_status_rv.set(f"❌ Failed: {e}")
|
|
2998
|
-
await reactive.flush()
|
|
2999
|
-
raise
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
@reactive.effect
|
|
3003
|
-
@reactive.event(input.q_spec)
|
|
3004
|
-
async def _update_rank_choices_on_compound_ID_change():
|
|
3005
|
-
df = df_rv.get()
|
|
3006
|
-
if df is None:
|
|
3007
|
-
return
|
|
3008
|
-
qid = input.q_spec()
|
|
3009
|
-
if not qid:
|
|
3010
|
-
return
|
|
3011
|
-
|
|
3012
|
-
choices, default_rank_value = _rank_choices_for_query(df, qid)
|
|
3013
|
-
choices = list(choices.values())
|
|
3014
|
-
ui.update_selectize('r_spec', choices=choices, selected=default_rank_value)
|
|
3015
|
-
await reactive.flush()
|
|
3016
|
-
|
|
3017
|
-
|
|
3018
|
-
|
|
3019
|
-
@output
|
|
3020
|
-
@render.ui
|
|
3021
|
-
def bounds_inputs():
|
|
3022
|
-
selected = input.params()
|
|
3023
|
-
if not selected:
|
|
3024
|
-
return ui.div(ui.em("Select one or more parameters above."))
|
|
3025
|
-
|
|
3026
|
-
if input.chromatography_platform() == 'HRMS':
|
|
3027
|
-
PARAMS = PARAMS_HRMS
|
|
3028
|
-
else:
|
|
3029
|
-
PARAMS = PARAMS_NRMS
|
|
3030
|
-
blocks = []
|
|
3031
|
-
for name in selected:
|
|
3032
|
-
lo, hi = PARAMS.get(name, (0.0, 1.0))
|
|
3033
|
-
blocks.append(
|
|
3034
|
-
ui.card(
|
|
3035
|
-
ui.card_header(name),
|
|
3036
|
-
ui.layout_columns(
|
|
3037
|
-
ui.input_numeric(f"min_{name}", "Lower", lo, step=0.001),
|
|
3038
|
-
ui.input_numeric(f"max_{name}", "Upper", hi, step=0.001),
|
|
3039
|
-
)
|
|
3040
|
-
)
|
|
3041
|
-
)
|
|
3042
|
-
return ui.div(*blocks)
|
|
3043
|
-
|
|
3044
|
-
def _read_bounds_dict():
|
|
3045
|
-
selected = input.params()
|
|
3046
|
-
out = {}
|
|
3047
|
-
for name in selected:
|
|
3048
|
-
lo_default, hi_default = PARAMS.get(name, (0.0, 1.0))
|
|
3049
|
-
lo_id = f"min_{name}"
|
|
3050
|
-
hi_id = f"max_{name}"
|
|
3051
|
-
|
|
3052
|
-
lo_val = input[lo_id]() if lo_id in input else lo_default
|
|
3053
|
-
hi_val = input[hi_id]() if hi_id in input else hi_default
|
|
3054
|
-
|
|
3055
|
-
out[name] = (float(lo_val), float(hi_val))
|
|
3056
|
-
return out
|
|
3057
|
-
|
|
3058
|
-
def _read_bounds():
|
|
3059
|
-
opt_params = input.params()
|
|
3060
|
-
bounds_dict = {}
|
|
3061
|
-
if input.chromatography_platform() == 'HRMS':
|
|
3062
|
-
PARAMS = PARAMS_HRMS
|
|
3063
|
-
else:
|
|
3064
|
-
PARAMS = PARAMS_NRMS
|
|
3065
|
-
|
|
3066
|
-
for p in opt_params:
|
|
3067
|
-
lo_id, hi_id = f"min_{p}", f"max_{p}"
|
|
3068
|
-
lo_default, hi_default = PARAMS.get(p, (0.0, 1.0))
|
|
3069
|
-
lo = input[lo_id]() if lo_id in input else lo_default
|
|
3070
|
-
hi = input[hi_id]() if hi_id in input else hi_default
|
|
3071
|
-
if lo > hi:
|
|
3072
|
-
lo, hi = hi, lo
|
|
3073
|
-
bounds_dict[p] = (float(lo), float(hi))
|
|
3074
|
-
|
|
3075
|
-
bounds_list = [bounds_dict[p] for p in opt_params]
|
|
3076
|
-
return opt_params, bounds_dict, bounds_list
|
|
3077
|
-
|
|
3078
|
-
def _reset_plot_spectra_state():
|
|
3079
|
-
query_status_rv.set("")
|
|
3080
|
-
reference_status_rv.set("")
|
|
3081
|
-
query_ids_rv.set([])
|
|
3082
|
-
reference_ids_rv.set([])
|
|
3083
|
-
query_file_path_rv.set(None)
|
|
3084
|
-
reference_file_path_rv.set(None)
|
|
3085
|
-
query_result_rv.set(None)
|
|
3086
|
-
reference_result_rv.set(None)
|
|
3087
|
-
converted_query_path_rv.set(None)
|
|
3088
|
-
converted_reference_path_rv.set(None)
|
|
3089
|
-
try:
|
|
3090
|
-
ui.update_selectize("spectrum_ID1", choices=[], selected=None)
|
|
3091
|
-
ui.update_selectize("spectrum_ID2", choices=[], selected=None)
|
|
3092
|
-
except Exception:
|
|
3093
|
-
pass
|
|
3094
|
-
|
|
3095
|
-
|
|
3096
|
-
def _reset_spec_lib_matching_state():
|
|
3097
|
-
match_log_rv.set("")
|
|
3098
|
-
is_matching_rv.set(False)
|
|
3099
|
-
is_any_job_running.set(False)
|
|
3100
|
-
try:
|
|
3101
|
-
ui.update_selectize("spectrum_ID1", choices=[], selected=None)
|
|
3102
|
-
ui.update_selectize("spectrum_ID2", choices=[], selected=None)
|
|
3103
|
-
except Exception:
|
|
3104
|
-
pass
|
|
3105
|
-
|
|
3106
|
-
|
|
3107
|
-
def _reset_parameter_tuning_state():
|
|
3108
|
-
match_log_rv.set("")
|
|
3109
|
-
is_tuning_grid_running.set(False)
|
|
3110
|
-
is_tuning_DE_running.set(False)
|
|
3111
|
-
is_any_job_running.set(False)
|
|
3112
|
-
|
|
3113
|
-
|
|
3114
|
-
@reactive.effect
|
|
3115
|
-
@reactive.event(input.back)
|
|
3116
|
-
def _clear_on_back_from_pages():
|
|
3117
|
-
page = current_page()
|
|
3118
|
-
if page == "plot_spectra":
|
|
3119
|
-
_reset_plot_spectra_state()
|
|
3120
|
-
elif page == "run_spec_lib_matching":
|
|
3121
|
-
_reset_spec_lib_matching_state()
|
|
3122
|
-
elif page == "run_parameter_tuning_grid":
|
|
3123
|
-
_reset_parameter_tuning_state()
|
|
3124
|
-
elif page == "run_parameter_tuning_DE":
|
|
3125
|
-
_reset_parameter_tuning_state()
|
|
3126
|
-
|
|
3127
|
-
@reactive.effect
|
|
3128
|
-
def _clear_on_enter_pages():
|
|
3129
|
-
page = current_page()
|
|
3130
|
-
if page == "plot_spectra":
|
|
3131
|
-
_reset_plot_spectra_state()
|
|
3132
|
-
elif page == "run_spec_lib_matching":
|
|
3133
|
-
_reset_spec_lib_matching_state()
|
|
3134
|
-
elif page == "run_parameter_tuning_grid":
|
|
3135
|
-
_reset_parameter_tuning_state()
|
|
3136
|
-
elif page == "run_parameter_tuning_DE":
|
|
3137
|
-
_reset_parameter_tuning_state()
|
|
3138
|
-
|
|
3139
|
-
|
|
3140
|
-
def _drain_queue_nowait(q: asyncio.Queue) -> list[str]:
|
|
3141
|
-
out = []
|
|
3142
|
-
try:
|
|
3143
|
-
while True:
|
|
3144
|
-
out.append(q.get_nowait())
|
|
3145
|
-
except asyncio.QueueEmpty:
|
|
3146
|
-
pass
|
|
3147
|
-
return out
|
|
3148
|
-
|
|
3149
|
-
|
|
3150
|
-
class ReactiveWriter(io.TextIOBase):
|
|
3151
|
-
def __init__(self, loop: asyncio.AbstractEventLoop):
|
|
3152
|
-
self._loop = loop
|
|
3153
|
-
def write(self, s: str):
|
|
3154
|
-
if not s:
|
|
3155
|
-
return 0
|
|
3156
|
-
self._loop.call_soon_threadsafe(_LOG_QUEUE.put_nowait, s)
|
|
3157
|
-
return len(s)
|
|
3158
|
-
def flush(self):
|
|
3159
|
-
pass
|
|
3160
|
-
|
|
3161
|
-
def _run_with_redirects(func, writer: ReactiveWriter, **kwargs):
|
|
3162
|
-
with contextlib.redirect_stdout(writer), contextlib.redirect_stderr(writer):
|
|
3163
|
-
return func(**kwargs)
|
|
3164
|
-
|
|
3165
|
-
|
|
3166
|
-
|
|
3167
|
-
@reactive.effect
|
|
3168
|
-
async def _pump_logs():
|
|
3169
|
-
if not (is_any_job_running.get() or is_tuning_grid_running.get() or is_tuning_DE_running.get() or is_matching_rv.get()):
|
|
3170
|
-
return
|
|
3171
|
-
reactive.invalidate_later(0.05)
|
|
3172
|
-
msgs = _drain_queue_nowait(_LOG_QUEUE)
|
|
3173
|
-
if msgs:
|
|
3174
|
-
match_log_rv.set(match_log_rv.get() + "".join(msgs))
|
|
3175
|
-
await reactive.flush()
|
|
3176
|
-
|
|
3177
|
-
|
|
3178
|
-
def process_database(file_path: str):
|
|
3179
|
-
suffix = Path(file_path).suffix.lower()
|
|
3180
|
-
return {"path": file_path, "suffix": suffix}
|
|
3181
|
-
|
|
3182
|
-
@render.text
|
|
3183
|
-
def plot_query_status():
|
|
3184
|
-
return query_status_rv.get() or ""
|
|
3185
|
-
|
|
3186
|
-
@render.text
|
|
3187
|
-
def plot_reference_status():
|
|
3188
|
-
return reference_status_rv.get() or ""
|
|
3189
|
-
|
|
3190
|
-
|
|
3191
|
-
@reactive.effect
|
|
3192
|
-
@reactive.event(input.query_data)
|
|
3193
|
-
async def _on_query_upload():
|
|
3194
|
-
files = input.query_data()
|
|
3195
|
-
req(files and len(files) > 0)
|
|
3196
|
-
|
|
3197
|
-
file_path = files[0]["datapath"]
|
|
3198
|
-
query_file_path_rv.set(file_path)
|
|
3199
|
-
|
|
3200
|
-
query_status_rv.set(f"Processing query database: {Path(file_path).name} …")
|
|
3201
|
-
await reactive.flush()
|
|
3202
|
-
|
|
3203
|
-
try:
|
|
3204
|
-
result = await asyncio.to_thread(process_database, file_path)
|
|
3205
|
-
query_result_rv.set(result)
|
|
3206
|
-
query_status_rv.set("✅ Query database processed.")
|
|
3207
|
-
await reactive.flush()
|
|
3208
|
-
except Exception as e:
|
|
3209
|
-
query_status_rv.set(f"❌ Failed to process query database: {e}")
|
|
3210
|
-
await reactive.flush()
|
|
3211
|
-
|
|
3212
|
-
|
|
3213
|
-
@reactive.effect
|
|
3214
|
-
@reactive.event(input.reference_data)
|
|
3215
|
-
async def _on_reference_upload():
|
|
3216
|
-
files = input.reference_data()
|
|
3217
|
-
req(files and len(files) > 0)
|
|
3218
|
-
|
|
3219
|
-
file_path = files[0]["datapath"]
|
|
3220
|
-
reference_file_path_rv.set(file_path)
|
|
3221
|
-
|
|
3222
|
-
reference_status_rv.set(f"Processing reference database: {Path(file_path).name} …")
|
|
3223
|
-
await reactive.flush()
|
|
3224
|
-
|
|
3225
|
-
try:
|
|
3226
|
-
result = await asyncio.to_thread(process_database, file_path)
|
|
3227
|
-
reference_result_rv.set(result)
|
|
3228
|
-
reference_status_rv.set("✅ Reference database processed.")
|
|
3229
|
-
await reactive.flush()
|
|
3230
|
-
except Exception as e:
|
|
3231
|
-
reference_status_rv.set(f"❌ Failed to process reference database: {e}")
|
|
3232
|
-
await reactive.flush()
|
|
3233
|
-
|
|
3234
|
-
|
|
3235
|
-
@render.text
|
|
3236
|
-
def match_log():
|
|
3237
|
-
return match_log_rv.get()
|
|
3238
|
-
|
|
3239
|
-
|
|
3240
|
-
@reactive.Effect
|
|
3241
|
-
def _():
|
|
3242
|
-
if input.plot_spectra() > plot_clicks.get():
|
|
3243
|
-
current_page.set("plot_spectra")
|
|
3244
|
-
plot_clicks.set(input.plot_spectra())
|
|
3245
|
-
elif input.run_spec_lib_matching() > match_clicks.get():
|
|
3246
|
-
current_page.set("run_spec_lib_matching")
|
|
3247
|
-
match_clicks.set(input.run_spec_lib_matching())
|
|
3248
|
-
elif input.run_parameter_tuning_grid() > match_clicks.get():
|
|
3249
|
-
current_page.set("run_parameter_tuning_grid")
|
|
3250
|
-
match_clicks.set(input.run_parameter_tuning_grid())
|
|
3251
|
-
elif input.run_parameter_tuning_DE() > match_clicks.get():
|
|
3252
|
-
current_page.set("run_parameter_tuning_DE")
|
|
3253
|
-
match_clicks.set(input.run_parameter_tuning_DE())
|
|
3254
|
-
elif hasattr(input, "back") and input.back() > back_clicks.get():
|
|
3255
|
-
current_page.set("main_menu")
|
|
3256
|
-
back_clicks.set(input.back())
|
|
3257
|
-
|
|
3258
|
-
|
|
3259
|
-
@render.image
|
|
3260
|
-
def image():
|
|
3261
|
-
dir = Path(__file__).resolve().parent
|
|
3262
|
-
img: ImgData = {"src": str(dir / "www/emblem.png"), "width": "250px", "height": "250px"}
|
|
3263
|
-
return img
|
|
3264
|
-
|
|
3265
|
-
@output
|
|
3266
|
-
@render.ui
|
|
3267
|
-
def main_ui():
|
|
3268
|
-
if current_page() == "main_menu":
|
|
3269
|
-
return ui.page_fluid(
|
|
3270
|
-
ui.h2("Main Menu"),
|
|
3271
|
-
ui.div("Overview:", style="text-align:left; font-size:24px; font-weight:bold"),
|
|
3272
|
-
ui.div("PyCompound is a Python-based tool designed for performing spectral library matching on either high-resolution mass spectrometry data (HRMS) or low-resolution mass spectrometry data (NRMS). PyCompound offers a range of spectrum preprocessing transformations and similarity measures. These spectrum preprocessing transformations include filtering on mass/charge and/or intensity values, weight factor transformation, low-entropy transformation, centroiding, noise removal, and matching. The available similarity measures include the canonical Cosine similarity measure, three entropy-based similarity measures, and a variety of binary similarity measures: Jaccard, Dice, 3W-Jaccard, Sokal-Sneath, Binary Cosine, Mountford, McConnaughey, Driver-Kroeber, Simpson, Braun-Banquet, Fager-McGowan, Kulczynski, Intersection, Hamming, and Hellinger.", style="margin-top:10px; text-align:left; font-size:16px; font-weight:500"),
|
|
3273
|
-
ui.div("Select options:", style="margin-top:30px; text-align:left; font-size:24px; font-weight:bold"),
|
|
3274
|
-
ui.div(ui.input_radio_buttons("chromatography_platform", "Specify chromatography platform:", ["HRMS","NRMS"]), style="font-size:18px; margin-top:10px; max-width:none"),
|
|
3275
|
-
ui.input_action_button("plot_spectra", "Plot two spectra before and after preprocessing transformations.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
|
|
3276
|
-
ui.input_action_button("run_spec_lib_matching", "Run spectral library matching to perform compound identification on a query library of spectra.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
|
|
3277
|
-
ui.input_action_button("run_parameter_tuning_grid", "Grid search: Tune parameters to maximize accuracy of compound identification given a query library with known spectrum IDs.", style="font-size:18px; padding:20px 40px; width:450px; height:120px; margin-top:10px; margin-right:50px"),
|
|
3278
|
-
ui.input_action_button("run_parameter_tuning_DE", "Differential evolution optimization: Tune parameters to maximize accuracy of compound identification given a query library with known spectrum IDs.", style="font-size:18px; padding:20px 40px; width:500px; height:150px; margin-top:10px; margin-right:50px"),
|
|
3279
|
-
ui.div(
|
|
3280
|
-
"References:",
|
|
3281
|
-
style="margin-top:35px; text-align:left; font-size:24px; font-weight:bold"
|
|
3282
|
-
),
|
|
3283
|
-
ui.div(
|
|
3284
|
-
"If Shannon Entropy similarity measure, low-entropy transformation, or centroiding are used:",
|
|
3285
|
-
style="margin-top:10px; text-align:left; font-size:14px; font-weight:500"
|
|
3286
|
-
),
|
|
3287
|
-
ui.div(
|
|
3288
|
-
ui.HTML(
|
|
3289
|
-
'Li, Y., Kind, T., Folz, J. et al. (2021) Spectral entropy outperforms MS/MS dot product similarity for small-molecule compound identification. Nat Methods, 18 1524–1531. <a href="https://doi.org/10.1038/s41592-021-01331-z" target="_blank">https://doi.org/10.1038/s41592-021-01331-z</a>.'
|
|
3290
|
-
),
|
|
3291
|
-
style="text-align:left; font-size:14px; font-weight:500"
|
|
3292
|
-
),
|
|
3293
|
-
ui.div(
|
|
3294
|
-
"If Tsallis Entropy similarity measure or series of preprocessing transformations are used:",
|
|
3295
|
-
style="margin-top:10px; text-align:left; font-size:14px; font-weight:500"
|
|
3296
|
-
),
|
|
3297
|
-
ui.div(
|
|
3298
|
-
ui.HTML(
|
|
3299
|
-
'Dlugas, H., Zhang, X., Kim, S. (2025) Comparative analysis of continuous similarity measures for compound identification in mass spectrometry-based metabolomics. Chemometrics and Intelligent Laboratory Systems, 263, 105417. <a href="https://doi.org/10.1016/j.chemolab.2025.105417", target="_blank">https://doi.org/10.1016/j.chemolab.2025.105417</a>.'
|
|
3300
|
-
),
|
|
3301
|
-
style="text-align:left; font-size:14px; font-weight:500"
|
|
3302
|
-
),
|
|
3303
|
-
ui.div(
|
|
3304
|
-
"If binary similarity measures are used:",
|
|
3305
|
-
style="margin-top:10px; text-align:left; font-size:14px; font-weight:500"
|
|
3306
|
-
),
|
|
3307
|
-
ui.div(
|
|
3308
|
-
ui.HTML(
|
|
3309
|
-
'Kim, S., Kato, I., & Zhang, X. (2022). Comparative Analysis of Binary Similarity Measures for Compound Identification in Mass Spectrometry-Based Metabolomics. Metabolites, 12(8), 694. <a href="https://doi.org/10.3390/metabo12080694" target="_blank">https://doi.org/10.3390/metabo12080694</a>.'
|
|
3310
|
-
),
|
|
3311
|
-
style="text-align:left; font-size:14px; font-weight:500"
|
|
3312
|
-
),
|
|
3313
|
-
|
|
3314
|
-
ui.div(
|
|
3315
|
-
"If weight factor transformation is used:",
|
|
3316
|
-
style="margin-top:10px; text-align:left; font-size:14px; font-weight:500"
|
|
3317
|
-
),
|
|
3318
|
-
ui.div(
|
|
3319
|
-
ui.HTML(
|
|
3320
|
-
'Kim, S., Koo, I., Wei, X., & Zhang, X. (2012). A method of finding optimal weight factors for compound identification in gas chromatography-mass spectrometry. Bioinformatics, 28(8), 1158-1163. <a href="https://doi.org/10.1093/bioinformatics/bts083" target="_blank">https://doi.org/10.1093/bioinformatics/bts083</a>.'
|
|
3321
|
-
),
|
|
3322
|
-
style="margin-bottom:40px; text-align:left; font-size:14px; font-weight:500"
|
|
3323
|
-
),
|
|
3324
|
-
)
|
|
3325
|
-
elif current_page() == "plot_spectra":
|
|
3326
|
-
return plot_spectra_ui(input.chromatography_platform())
|
|
3327
|
-
elif current_page() == "run_spec_lib_matching":
|
|
3328
|
-
return run_spec_lib_matching_ui(input.chromatography_platform())
|
|
3329
|
-
elif current_page() == "run_parameter_tuning_grid":
|
|
3330
|
-
return run_parameter_tuning_grid_ui(input.chromatography_platform())
|
|
3331
|
-
elif current_page() == "run_parameter_tuning_DE":
|
|
3332
|
-
return run_parameter_tuning_DE_ui(input.chromatography_platform())
|
|
3333
|
-
|
|
3334
|
-
|
|
3335
|
-
|
|
3336
|
-
@reactive.effect
|
|
3337
|
-
@reactive.event(input.query_data)
|
|
3338
|
-
async def _populate_ids_from_query_upload():
|
|
3339
|
-
files = input.query_data()
|
|
3340
|
-
if not files:
|
|
3341
|
-
return
|
|
3342
|
-
|
|
3343
|
-
in_path = Path(files[0]["datapath"])
|
|
3344
|
-
suffix = in_path.suffix.lower()
|
|
3345
|
-
|
|
3346
|
-
try:
|
|
3347
|
-
if suffix == ".txt":
|
|
3348
|
-
txt_path = in_path
|
|
3349
|
-
converted_query_path_rv.set(str(txt_path))
|
|
3350
|
-
else:
|
|
3351
|
-
query_status_rv.set(f"Converting {in_path.name} → TXT…")
|
|
3352
|
-
await reactive.flush()
|
|
3353
|
-
|
|
3354
|
-
tmp_txt_path = in_path.with_suffix(".converted.txt")
|
|
3355
|
-
|
|
3356
|
-
out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
|
|
3357
|
-
|
|
3358
|
-
if isinstance(out_obj, (str, os.PathLike, Path)):
|
|
3359
|
-
txt_path = Path(out_obj)
|
|
3360
|
-
elif isinstance(out_obj, pd.DataFrame):
|
|
3361
|
-
out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
|
|
3362
|
-
txt_path = tmp_txt_path
|
|
3363
|
-
else:
|
|
3364
|
-
raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
|
|
3365
|
-
|
|
3366
|
-
converted_query_path_rv.set(str(txt_path))
|
|
3367
|
-
|
|
3368
|
-
query_status_rv.set(f"Reading IDs from: {txt_path.name} …")
|
|
3369
|
-
await reactive.flush()
|
|
3370
|
-
|
|
3371
|
-
ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
|
|
3372
|
-
query_ids_rv.set(ids)
|
|
3373
|
-
|
|
3374
|
-
ui.update_selectize("spectrum_ID1", choices=ids, selected=(ids[0] if ids else None))
|
|
3375
|
-
|
|
3376
|
-
query_status_rv.set(f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}")
|
|
3377
|
-
await reactive.flush()
|
|
3378
|
-
|
|
3379
|
-
except Exception as e:
|
|
3380
|
-
query_status_rv.set(f"❌ Failed: {e}")
|
|
3381
|
-
await reactive.flush()
|
|
3382
|
-
raise
|
|
3383
|
-
|
|
3384
|
-
|
|
3385
|
-
@reactive.effect
|
|
3386
|
-
@reactive.event(input.reference_data)
|
|
3387
|
-
async def _populate_ids_from_reference_upload():
|
|
3388
|
-
files = input.reference_data()
|
|
3389
|
-
if not files:
|
|
3390
|
-
return
|
|
3391
|
-
|
|
3392
|
-
in_path = Path(files[0]["datapath"])
|
|
3393
|
-
suffix = in_path.suffix.lower()
|
|
3394
|
-
|
|
3395
|
-
try:
|
|
3396
|
-
if suffix == ".txt":
|
|
3397
|
-
txt_path = in_path
|
|
3398
|
-
converted_reference_path_rv.set(str(txt_path))
|
|
3399
|
-
else:
|
|
3400
|
-
reference_status_rv.set(f"Converting {in_path.name} → TXT…")
|
|
3401
|
-
await reactive.flush()
|
|
3402
|
-
|
|
3403
|
-
tmp_txt_path = in_path.with_suffix(".converted.txt")
|
|
3404
|
-
|
|
3405
|
-
out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
|
|
3406
|
-
|
|
3407
|
-
if isinstance(out_obj, (str, os.PathLike, Path)):
|
|
3408
|
-
txt_path = Path(out_obj)
|
|
3409
|
-
elif isinstance(out_obj, pd.DataFrame):
|
|
3410
|
-
out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
|
|
3411
|
-
txt_path = tmp_txt_path
|
|
3412
|
-
else:
|
|
3413
|
-
raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
|
|
3414
|
-
|
|
3415
|
-
converted_reference_path_rv.set(str(txt_path))
|
|
3416
|
-
|
|
3417
|
-
reference_status_rv.set(f"Reading IDs from: {txt_path.name} …")
|
|
3418
|
-
await reactive.flush()
|
|
3419
|
-
|
|
3420
|
-
ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
|
|
3421
|
-
reference_ids_rv.set(ids)
|
|
3422
|
-
|
|
3423
|
-
ui.update_selectize("spectrum_ID2", choices=ids, selected=(ids[0] if ids else None))
|
|
3424
|
-
|
|
3425
|
-
reference_status_rv.set(
|
|
3426
|
-
f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}"
|
|
3427
|
-
)
|
|
3428
|
-
await reactive.flush()
|
|
3429
|
-
|
|
3430
|
-
except Exception as e:
|
|
3431
|
-
reference_status_rv.set(f"❌ Failed: {e}")
|
|
3432
|
-
await reactive.flush()
|
|
3433
|
-
raise
|
|
3434
|
-
|
|
3435
|
-
|
|
3436
|
-
@render.download(filename=lambda: f"plot.svg")
|
|
3437
|
-
def run_btn_plot_spectra():
|
|
3438
|
-
spectrum_ID1 = input.spectrum_ID1() or None
|
|
3439
|
-
spectrum_ID2 = input.spectrum_ID2() or None
|
|
3440
|
-
|
|
3441
|
-
weights = [float(weight.strip()) for weight in input.weights().split(",") if weight.strip()]
|
|
3442
|
-
weights = {'Cosine':weights[0], 'Shannon':weights[1], 'Renyi':weights[2], 'Tsallis':weights[3]}
|
|
3443
|
-
|
|
3444
|
-
high_quality_reference_library_tmp2 = False
|
|
3445
|
-
if input.high_quality_reference_library() != 'False':
|
|
3446
|
-
high_quality_reference_library_tmp2 = True
|
|
3447
|
-
|
|
3448
|
-
if input.chromatography_platform() == "HRMS":
|
|
3449
|
-
fig = generate_plots_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), weights=weights, spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
|
|
3450
|
-
plt.show()
|
|
3451
|
-
elif input.chromatography_platform() == "NRMS":
|
|
3452
|
-
fig = generate_plots_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
|
|
3453
|
-
plt.show()
|
|
3454
|
-
with io.BytesIO() as buf:
|
|
3455
|
-
fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
|
|
3456
|
-
plt.close()
|
|
3457
|
-
yield buf.getvalue()
|
|
3458
|
-
|
|
3459
|
-
|
|
3460
|
-
|
|
3461
|
-
|
|
3462
|
-
@render.download(filename="identification_output.txt")
|
|
3463
|
-
async def run_btn_spec_lib_matching():
|
|
3464
|
-
match_log_rv.set("Running identification...\n")
|
|
3465
|
-
await reactive.flush()
|
|
3466
|
-
|
|
3467
|
-
hq = input.high_quality_reference_library()
|
|
3468
|
-
if isinstance(hq, str):
|
|
3469
|
-
hq = hq.lower() == "true"
|
|
3470
|
-
elif isinstance(hq, (int, float)):
|
|
3471
|
-
hq = bool(hq)
|
|
3472
|
-
|
|
3473
|
-
weights = [float(weight.strip()) for weight in input.weights().split(",") if weight.strip()]
|
|
3474
|
-
weights = {'Cosine': weights[0], 'Shannon': weights[1], 'Renyi': weights[2], 'Tsallis': weights[3]}
|
|
3475
|
-
|
|
3476
|
-
common_kwargs = dict(
|
|
3477
|
-
query_data=input.query_data()[0]["datapath"],
|
|
3478
|
-
reference_data=input.reference_data()[0]["datapath"],
|
|
3479
|
-
likely_reference_ids=None,
|
|
3480
|
-
similarity_measure=input.similarity_measure(),
|
|
3481
|
-
weights=weights,
|
|
3482
|
-
spectrum_preprocessing_order=input.spectrum_preprocessing_order(),
|
|
3483
|
-
high_quality_reference_library=hq,
|
|
3484
|
-
mz_min=input.mz_min(), mz_max=input.mz_max(),
|
|
3485
|
-
int_min=input.int_min(), int_max=input.int_max(),
|
|
3486
|
-
noise_threshold=input.noise_threshold(),
|
|
3487
|
-
wf_mz=input.wf_mz(), wf_intensity=input.wf_int(),
|
|
3488
|
-
LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(),
|
|
3489
|
-
n_top_matches_to_save=input.n_top_matches_to_save(),
|
|
3490
|
-
print_id_results=True,
|
|
3491
|
-
output_identification=str(Path.cwd() / "identification_output.txt"),
|
|
3492
|
-
output_similarity_scores=str(Path.cwd() / "similarity_scores.txt"),
|
|
3493
|
-
return_ID_output=True,
|
|
3494
|
-
)
|
|
3495
|
-
|
|
3496
|
-
# --- streaming setup (same pattern as your DE block) ---
|
|
3497
|
-
loop = asyncio.get_running_loop()
|
|
3498
|
-
q: asyncio.Queue[str | None] = asyncio.Queue()
|
|
3499
|
-
|
|
3500
|
-
class UIWriter(io.TextIOBase):
|
|
3501
|
-
def write(self, s: str):
|
|
3502
|
-
if s:
|
|
3503
|
-
loop.call_soon_threadsafe(q.put_nowait, s)
|
|
3504
|
-
return len(s)
|
|
3505
|
-
def flush(self): pass
|
|
3506
|
-
|
|
3507
|
-
async def _drain():
|
|
3508
|
-
while True:
|
|
3509
|
-
msg = await q.get()
|
|
3510
|
-
if msg is None:
|
|
3511
|
-
break
|
|
3512
|
-
match_log_rv.set(match_log_rv.get() + msg)
|
|
3513
|
-
await reactive.flush()
|
|
3514
|
-
|
|
3515
|
-
drain_task = asyncio.create_task(_drain())
|
|
3516
|
-
writer = UIWriter()
|
|
3517
|
-
|
|
3518
|
-
# --- worker wrappers that install redirects INSIDE the thread ---
|
|
3519
|
-
def _run_hrms():
|
|
3520
|
-
with redirect_stdout(writer), redirect_stderr(writer):
|
|
3521
|
-
# optional heartbeat
|
|
3522
|
-
print(">> Starting HRMS identification ...", flush=True)
|
|
3523
|
-
return run_spec_lib_matching_on_HRMS_data_shiny(
|
|
3524
|
-
precursor_ion_mz_tolerance=input.precursor_ion_mz_tolerance(),
|
|
3525
|
-
ionization_mode=input.ionization_mode(),
|
|
3526
|
-
adduct=input.adduct(),
|
|
3527
|
-
window_size_centroiding=input.window_size_centroiding(),
|
|
3528
|
-
window_size_matching=input.window_size_matching(),
|
|
3529
|
-
**common_kwargs
|
|
3530
|
-
)
|
|
3531
|
-
|
|
3532
|
-
def _run_nrms():
|
|
3533
|
-
with redirect_stdout(writer), redirect_stderr(writer):
|
|
3534
|
-
print(">> Starting NRMS identification ...", flush=True)
|
|
3535
|
-
return run_spec_lib_matching_on_NRMS_data_shiny(**common_kwargs)
|
|
3536
|
-
|
|
3537
|
-
# --- run in worker thread and stream output live ---
|
|
3538
|
-
try:
|
|
3539
|
-
if input.chromatography_platform() == "HRMS":
|
|
3540
|
-
df_out = await asyncio.to_thread(_run_hrms)
|
|
3541
|
-
else:
|
|
3542
|
-
df_out = await asyncio.to_thread(_run_nrms)
|
|
3543
|
-
|
|
3544
|
-
match_log_rv.set(match_log_rv.get() + "\n✅ Identification finished.\n")
|
|
3545
|
-
await reactive.flush()
|
|
3546
|
-
|
|
3547
|
-
except Exception as e:
|
|
3548
|
-
import traceback
|
|
3549
|
-
tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
3550
|
-
match_log_rv.set(match_log_rv.get() + f"\n❌ {type(e).__name__}: {e}\n{tb}\n")
|
|
3551
|
-
await reactive.flush()
|
|
3552
|
-
# make sure to stop the drainer before re-raising
|
|
3553
|
-
await q.put(None); await drain_task
|
|
3554
|
-
raise
|
|
3555
|
-
|
|
3556
|
-
finally:
|
|
3557
|
-
await q.put(None)
|
|
3558
|
-
await drain_task
|
|
3559
|
-
|
|
3560
|
-
yield df_out.to_csv(index=True, sep="\t")
|
|
3561
|
-
|
|
3562
|
-
|
|
3563
|
-
|
|
3564
|
-
|
|
3565
|
-
@render.download(filename="plot.svg")
|
|
3566
|
-
def run_btn_plot_spectra_within_spec_lib_matching():
|
|
3567
|
-
req(input.query_data(), input.reference_data())
|
|
3568
|
-
|
|
3569
|
-
spectrum_ID1 = input.q_spec() or None
|
|
3570
|
-
spectrum_ID2 = input.r_spec() or None
|
|
3571
|
-
|
|
3572
|
-
hq = input.high_quality_reference_library()
|
|
3573
|
-
if isinstance(hq, str):
|
|
3574
|
-
hq = hq.lower() == "true"
|
|
3575
|
-
elif isinstance(hq, (int, float)):
|
|
3576
|
-
hq = bool(hq)
|
|
3577
|
-
|
|
3578
|
-
weights = [float(weight.strip()) for weight in input.weights().split(",") if weight.strip()]
|
|
3579
|
-
weights = {'Cosine':weights[0], 'Shannon':weights[1], 'Renyi':weights[2], 'Tsallis':weights[3]}
|
|
3580
|
-
|
|
3581
|
-
common = dict(
|
|
3582
|
-
query_data=input.query_data()[0]['datapath'],
|
|
3583
|
-
reference_data=input.reference_data()[0]['datapath'],
|
|
3584
|
-
spectrum_ID1=spectrum_ID1,
|
|
3585
|
-
spectrum_ID2=spectrum_ID2,
|
|
3586
|
-
print_url_spectrum1=input.print_url_spectrum1(),
|
|
3587
|
-
print_url_spectrum2=input.print_url_spectrum2(),
|
|
3588
|
-
similarity_measure=input.similarity_measure(),
|
|
3589
|
-
weights=weights,
|
|
3590
|
-
spectrum_preprocessing_order=input.spectrum_preprocessing_order(),
|
|
3591
|
-
high_quality_reference_library=hq,
|
|
3592
|
-
mz_min=input.mz_min(), mz_max=input.mz_max(),
|
|
3593
|
-
int_min=input.int_min(), int_max=input.int_max(),
|
|
3594
|
-
noise_threshold=input.noise_threshold(),
|
|
3595
|
-
wf_mz=input.wf_mz(), wf_intensity=input.wf_int(),
|
|
3596
|
-
LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(),
|
|
3597
|
-
y_axis_transformation="normalized",
|
|
3598
|
-
return_plot=True
|
|
3599
|
-
)
|
|
3600
|
-
|
|
3601
|
-
if input.chromatography_platform() == "HRMS":
|
|
3602
|
-
fig = generate_plots_on_HRMS_data(
|
|
3603
|
-
window_size_centroiding=input.window_size_centroiding(),
|
|
3604
|
-
window_size_matching=input.window_size_matching(),
|
|
3605
|
-
**common
|
|
3606
|
-
)
|
|
3607
|
-
plt.show()
|
|
3608
|
-
else:
|
|
3609
|
-
fig = generate_plots_on_NRMS_data(**common)
|
|
3610
|
-
plt.show()
|
|
3611
|
-
|
|
3612
|
-
with io.BytesIO() as buf:
|
|
3613
|
-
fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
|
|
3614
|
-
plt.close()
|
|
3615
|
-
yield buf.getvalue()
|
|
3616
|
-
|
|
3617
|
-
|
|
3618
|
-
@render.download(filename="parameter_tuning_grid_output.txt")
|
|
3619
|
-
async def run_btn_parameter_tuning_grid():
|
|
3620
|
-
is_any_job_running.set(True)
|
|
3621
|
-
is_tuning_grid_running.set(True)
|
|
3622
|
-
match_log_rv.set("Running grid search of all parameters specified...\n")
|
|
3623
|
-
await reactive.flush()
|
|
3624
|
-
|
|
3625
|
-
similarity_measure_tmp = list(input.similarity_measure())
|
|
3626
|
-
high_quality_reference_library_tmp = [x.strip().lower() == "true" for x in input.high_quality_reference_library().strip().strip("[]").split(",") if x.strip()]
|
|
3627
|
-
spectrum_preprocessing_order_tmp = strip_text(input.spectrum_preprocessing_order())
|
|
3628
|
-
mz_min_tmp = strip_numeric(input.mz_min())
|
|
3629
|
-
mz_max_tmp = strip_numeric(input.mz_max())
|
|
3630
|
-
int_min_tmp = strip_numeric(input.int_min())
|
|
3631
|
-
int_max_tmp = strip_numeric(input.int_max())
|
|
3632
|
-
noise_threshold_tmp = strip_numeric(input.noise_threshold())
|
|
3633
|
-
wf_mz_tmp = strip_numeric(input.wf_mz())
|
|
3634
|
-
wf_int_tmp = strip_numeric(input.wf_int())
|
|
3635
|
-
LET_threshold_tmp = strip_numeric(input.LET_threshold())
|
|
3636
|
-
entropy_dimension_tmp = strip_numeric(input.entropy_dimension())
|
|
3637
|
-
weights_tmp = strip_weights(input.weights())
|
|
3638
|
-
|
|
3639
|
-
common_kwargs = dict(
|
|
3640
|
-
query_data=input.query_data()[0]["datapath"],
|
|
3641
|
-
reference_data=input.reference_data()[0]["datapath"],
|
|
3642
|
-
output_path=str(Path.cwd() / "parameter_tuning_grid_output.txt"),
|
|
3643
|
-
return_output=True,
|
|
3644
|
-
)
|
|
3645
|
-
|
|
3646
|
-
loop = asyncio.get_running_loop()
|
|
3647
|
-
rw = ReactiveWriter(loop)
|
|
3648
|
-
|
|
3649
|
-
try:
|
|
3650
|
-
if input.chromatography_platform() == "HRMS":
|
|
3651
|
-
precursor_ion_mz_tolerance = float(input.precursor_ion_mz_tolerance())
|
|
3652
|
-
ionization_mode = str(input.ionization_mode())
|
|
3653
|
-
adduct = str(input.adduct())
|
|
3654
|
-
window_size_centroiding_tmp = strip_numeric(input.window_size_centroiding())
|
|
3655
|
-
window_size_matching_tmp = strip_numeric(input.window_size_matching())
|
|
3656
|
-
grid = {
|
|
3657
|
-
'similarity_measure': similarity_measure_tmp,
|
|
3658
|
-
'weight': weights_tmp,
|
|
3659
|
-
'spectrum_preprocessing_order': spectrum_preprocessing_order_tmp,
|
|
3660
|
-
'mz_min': mz_min_tmp,
|
|
3661
|
-
'mz_max': mz_max_tmp,
|
|
3662
|
-
'int_min': int_min_tmp,
|
|
3663
|
-
'int_max': int_max_tmp,
|
|
3664
|
-
'noise_threshold': noise_threshold_tmp,
|
|
3665
|
-
'wf_mz': wf_mz_tmp,
|
|
3666
|
-
'wf_int': wf_int_tmp,
|
|
3667
|
-
'LET_threshold': LET_threshold_tmp,
|
|
3668
|
-
'entropy_dimension': entropy_dimension_tmp,
|
|
3669
|
-
'high_quality_reference_library': high_quality_reference_library_tmp,
|
|
3670
|
-
'window_size_centroiding': window_size_centroiding_tmp,
|
|
3671
|
-
'window_size_matching': window_size_matching_tmp,
|
|
3672
|
-
}
|
|
3673
|
-
df_out = await asyncio.to_thread(_run_with_redirects, tune_params_on_HRMS_data_grid_shiny, rw, **common_kwargs, grid=grid, precursor_ion_mz_tolerance=precursor_ion_mz_tolerance, ionization_mode=ionization_mode, adduct=adduct)
|
|
3674
|
-
else:
|
|
3675
|
-
grid = {
|
|
3676
|
-
'similarity_measure': similarity_measure_tmp,
|
|
3677
|
-
'weight': weights_tmp,
|
|
3678
|
-
'spectrum_preprocessing_order': spectrum_preprocessing_order_tmp,
|
|
3679
|
-
'mz_min': mz_min_tmp,
|
|
3680
|
-
'mz_max': mz_max_tmp,
|
|
3681
|
-
'int_min': int_min_tmp,
|
|
3682
|
-
'int_max': int_max_tmp,
|
|
3683
|
-
'noise_threshold': noise_threshold_tmp,
|
|
3684
|
-
'wf_mz': wf_mz_tmp,
|
|
3685
|
-
'wf_int': wf_int_tmp,
|
|
3686
|
-
'LET_threshold': LET_threshold_tmp,
|
|
3687
|
-
'entropy_dimension': entropy_dimension_tmp,
|
|
3688
|
-
'high_quality_reference_library': high_quality_reference_library_tmp,
|
|
3689
|
-
}
|
|
3690
|
-
df_out = await asyncio.to_thread(_run_with_redirects, tune_params_on_NRMS_data_grid_shiny, rw, **common_kwargs, grid=grid)
|
|
3691
|
-
|
|
3692
|
-
match_log_rv.set(match_log_rv.get() + "\n✅ Parameter tuning finished.\n")
|
|
3693
|
-
except Exception as e:
|
|
3694
|
-
match_log_rv.set(match_log_rv.get() + f"\n❌ Error: {e}\n")
|
|
3695
|
-
raise
|
|
3696
|
-
finally:
|
|
3697
|
-
is_tuning_grid_running.set(False)
|
|
3698
|
-
is_any_job_running.set(False)
|
|
3699
|
-
await reactive.flush()
|
|
3700
|
-
|
|
3701
|
-
yield df_out.to_csv(index=False, sep='\t').encode("utf-8")
|
|
3702
|
-
|
|
3703
|
-
|
|
3704
|
-
|
|
3705
|
-
@reactive.effect
|
|
3706
|
-
@reactive.event(input.run_btn_parameter_tuning_DE)
|
|
3707
|
-
async def run_btn_parameter_tuning_DE():
|
|
3708
|
-
match_log_rv.set("Tuning specified continuous parameters using differential evolution...\n")
|
|
3709
|
-
is_any_job_running.set(True)
|
|
3710
|
-
is_tuning_DE_running.set(True)
|
|
3711
|
-
await reactive.flush()
|
|
3712
|
-
|
|
3713
|
-
def _safe_float(v, default):
|
|
3714
|
-
try:
|
|
3715
|
-
if v is None:
|
|
3716
|
-
return default
|
|
3717
|
-
return float(v)
|
|
3718
|
-
except Exception:
|
|
3719
|
-
return default
|
|
3720
|
-
|
|
3721
|
-
def _iget(id, default=None):
|
|
3722
|
-
if id in input:
|
|
3723
|
-
try:
|
|
3724
|
-
return input[id]()
|
|
3725
|
-
except SilentException:
|
|
3726
|
-
return default
|
|
3727
|
-
return default
|
|
3728
|
-
|
|
3729
|
-
loop = asyncio.get_running_loop()
|
|
3730
|
-
q: asyncio.Queue[str | None] = asyncio.Queue()
|
|
3731
|
-
|
|
3732
|
-
class UIWriter(io.TextIOBase):
|
|
3733
|
-
def write(self, s: str):
|
|
3734
|
-
if s:
|
|
3735
|
-
loop.call_soon_threadsafe(q.put_nowait, s)
|
|
3736
|
-
return len(s)
|
|
3737
|
-
def flush(self): pass
|
|
3738
|
-
|
|
3739
|
-
async def _drain():
|
|
3740
|
-
while True:
|
|
3741
|
-
msg = await q.get()
|
|
3742
|
-
if msg is None:
|
|
3743
|
-
break
|
|
3744
|
-
match_log_rv.set(match_log_rv.get() + msg)
|
|
3745
|
-
await reactive.flush()
|
|
3746
|
-
|
|
3747
|
-
drain_task = asyncio.create_task(_drain())
|
|
3748
|
-
writer = UIWriter()
|
|
3749
|
-
|
|
3750
|
-
try:
|
|
3751
|
-
qfile = _iget("query_data")[0]["datapath"]
|
|
3752
|
-
rfile = _iget("reference_data")[0]["datapath"]
|
|
3753
|
-
|
|
3754
|
-
platform = _iget("chromatography_platform", "HRMS")
|
|
3755
|
-
sim = _iget("similarity_measure", "cosine")
|
|
3756
|
-
spro = _iget("spectrum_preprocessing_order", "FCNMWL")
|
|
3757
|
-
|
|
3758
|
-
hq_raw = _iget("high_quality_reference_library", False)
|
|
3759
|
-
if isinstance(hq_raw, str):
|
|
3760
|
-
hq = hq_raw.lower() == "true"
|
|
3761
|
-
else:
|
|
3762
|
-
hq = bool(hq_raw)
|
|
3763
|
-
|
|
3764
|
-
mz_min = _safe_float(_iget("mz_min", 0.0), 0.0)
|
|
3765
|
-
mz_max = _safe_float(_iget("mz_max", 99_999_999.0), 99_999_999.0)
|
|
3766
|
-
int_min = _safe_float(_iget("int_min", 0.0), 0.0)
|
|
3767
|
-
int_max = _safe_float(_iget("int_max", 999_999_999.0), 999_999_999.0)
|
|
3768
|
-
|
|
3769
|
-
w_text = _iget("weights", "") or ""
|
|
3770
|
-
w_list = [float(w.strip()) for w in w_text.split(",") if w.strip()]
|
|
3771
|
-
w_list = (w_list + [0.0, 0.0, 0.0, 0.0])[:4]
|
|
3772
|
-
weights = {"Cosine": w_list[0], "Shannon": w_list[1], "Renyi": w_list[2], "Tsallis": w_list[3]}
|
|
3773
|
-
|
|
3774
|
-
opt_params = tuple(_iget("params", ()) or ())
|
|
3775
|
-
bounds_dict = {}
|
|
3776
|
-
param_defaults = PARAMS_HRMS if platform == "HRMS" else PARAMS_NRMS
|
|
3777
|
-
for p in opt_params:
|
|
3778
|
-
lo = _safe_float(_iget(f"min_{p}", param_defaults.get(p, (0.0, 1.0))[0]),
|
|
3779
|
-
param_defaults.get(p, (0.0, 1.0))[0])
|
|
3780
|
-
hi = _safe_float(_iget(f"max_{p}", param_defaults.get(p, (0.0, 1.0))[1]),
|
|
3781
|
-
param_defaults.get(p, (0.0, 1.0))[1])
|
|
3782
|
-
if lo > hi:
|
|
3783
|
-
lo, hi = hi, lo
|
|
3784
|
-
bounds_dict[p] = (lo, hi)
|
|
3785
|
-
|
|
3786
|
-
defaults = {
|
|
3787
|
-
"window_size_centroiding": _safe_float(_iget("window_size_centroiding", 0.5), 0.5),
|
|
3788
|
-
"window_size_matching": _safe_float(_iget("window_size_matching", 0.5), 0.5),
|
|
3789
|
-
"noise_threshold": _safe_float(_iget("noise_threshold", 0.0), 0.0),
|
|
3790
|
-
"wf_mz": _safe_float(_iget("wf_mz", 0.0), 0.0),
|
|
3791
|
-
"wf_int": _safe_float(_iget("wf_int", 1.0), 1.0),
|
|
3792
|
-
"LET_threshold": _safe_float(_iget("LET_threshold", 0.0), 0.0),
|
|
3793
|
-
"entropy_dimension": _safe_float(_iget("entropy_dimension", 1.1), 1.1),
|
|
3794
|
-
}
|
|
3795
|
-
if platform == "NRMS":
|
|
3796
|
-
defaults.pop("window_size_centroiding", None)
|
|
3797
|
-
defaults.pop("window_size_matching", None)
|
|
3798
|
-
|
|
3799
|
-
except Exception as e:
|
|
3800
|
-
import traceback
|
|
3801
|
-
tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
3802
|
-
match_log_rv.set(match_log_rv.get() + f"\n❌ Input snapshot failed:\n{tb}\n")
|
|
3803
|
-
is_tuning_DE_running.set(False); is_any_job_running.set(False)
|
|
3804
|
-
await q.put(None); await drain_task; await reactive.flush()
|
|
3805
|
-
return
|
|
3806
|
-
|
|
3807
|
-
def _run():
|
|
3808
|
-
with redirect_stdout(writer), redirect_stderr(writer):
|
|
3809
|
-
return tune_params_DE(
|
|
3810
|
-
query_data=qfile,
|
|
3811
|
-
reference_data=rfile,
|
|
3812
|
-
precursor_ion_mz_tolerance=float(input.precursor_ion_mz_tolerance()),
|
|
3813
|
-
ionization_mode=input.ionization_mode(),
|
|
3814
|
-
adduct=input.adduct(),
|
|
3815
|
-
chromatography_platform=input.chromatography_platform(),
|
|
3816
|
-
similarity_measure=sim,
|
|
3817
|
-
weights=weights,
|
|
3818
|
-
spectrum_preprocessing_order=spro,
|
|
3819
|
-
mz_min=mz_min, mz_max=mz_max,
|
|
3820
|
-
int_min=int_min, int_max=int_max,
|
|
3821
|
-
high_quality_reference_library=hq,
|
|
3822
|
-
optimize_params=list(opt_params),
|
|
3823
|
-
param_bounds=bounds_dict,
|
|
3824
|
-
default_params=defaults,
|
|
3825
|
-
de_workers=1,
|
|
3826
|
-
maxiters=input.max_iterations()
|
|
3827
|
-
)
|
|
3828
|
-
|
|
3829
|
-
try:
|
|
3830
|
-
_ = await asyncio.to_thread(_run)
|
|
3831
|
-
match_log_rv.set(match_log_rv.get() + "\n✅ Differential evolution finished.\n")
|
|
3832
|
-
except Exception as e:
|
|
3833
|
-
import traceback
|
|
3834
|
-
tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
3835
|
-
match_log_rv.set(match_log_rv.get() + f"\n❌ {type(e).__name__}: {e}\n{tb}\n")
|
|
3836
|
-
finally:
|
|
3837
|
-
await q.put(None)
|
|
3838
|
-
await drain_task
|
|
3839
|
-
is_tuning_DE_running.set(False)
|
|
3840
|
-
is_any_job_running.set(False)
|
|
3841
|
-
await reactive.flush()
|
|
3842
|
-
|
|
3843
|
-
|
|
3844
|
-
@reactive.effect
|
|
3845
|
-
async def _pump_reactive_writer_logs():
|
|
3846
|
-
if not is_tuning_grid_running.get():
|
|
3847
|
-
return
|
|
3848
|
-
|
|
3849
|
-
reactive.invalidate_later(0.1)
|
|
3850
|
-
msgs = _drain_queue_nowait(_LOG_QUEUE)
|
|
3851
|
-
if msgs:
|
|
3852
|
-
match_log_rv.set(match_log_rv.get() + "".join(msgs))
|
|
3853
|
-
await reactive.flush()
|
|
3854
|
-
|
|
3855
|
-
|
|
3856
|
-
@render.text
|
|
3857
|
-
def status_output():
|
|
3858
|
-
return run_status_plot_spectra.get()
|
|
3859
|
-
return run_status_spec_lib_matching.get()
|
|
3860
|
-
return run_status_parameter_tuning_grid.get()
|
|
3861
|
-
return run_status_parameter_tuning_DE.get()
|
|
3862
|
-
|
|
3863
|
-
@output
|
|
3864
|
-
@render.text
|
|
3865
|
-
def run_log():
|
|
3866
|
-
return match_log_rv.get()
|
|
3867
|
-
|
|
3868
|
-
|
|
3869
|
-
app = App(app_ui, server)
|
|
3870
|
-
|
|
3871
|
-
|