pycompound 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
app.py DELETED
@@ -1,3871 +0,0 @@
-
- from shiny import App, ui, reactive, render, req
- from shiny.types import SilentException
- from pathlib import Path
- from contextlib import redirect_stdout, redirect_stderr
- import contextlib
- import subprocess
- import traceback
- import asyncio
- import io
- import os
- import sys
- import matplotlib.pyplot as plt
- import pandas as pd
- import numpy as np
- import netCDF4 as nc
- from pyteomics import mgf, mzml
- import ast
- from numbers import Real
- import logging
- from scipy.optimize import differential_evolution
- import scipy
- import scipy.stats
- from itertools import product
- import json
- import re
- import urllib.parse
- import urllib.request
- import matplotlib
-
- matplotlib.rcParams['svg.fonttype'] = 'none'
-
- _LOG_QUEUE: asyncio.Queue[str] = asyncio.Queue()
-
- _ADDUCT_PAT = re.compile(r"\s*(?:\[(M[^\]]+)\]|(M[+-][A-Za-z0-9]+)\+?)\s*$", re.IGNORECASE)
-
- def start_log_consumer():
-     if getattr(start_log_consumer, "_started", False):
-         return
-     start_log_consumer._started = True
-
-     async def _consume():
-         while True:
-             s = await _LOG_QUEUE.get()
-             match_log_rv.set(match_log_rv.get() + s)
-             await reactive.flush()
-
-     asyncio.create_task(_consume())
-
-
- def _strip_adduct(name: str) -> str:
-     return _ADDUCT_PAT.sub("", name).strip()
-
-
- def get_pubchem_url(query: str) -> str:
-     base_name = _strip_adduct(query)
-     endpoint = ("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/" + urllib.parse.quote(base_name) + "/cids/TXT")
-     try:
-         with urllib.request.urlopen(endpoint, timeout=10) as r:
-             txt = r.read().decode("utf-8").strip()
-         cid = txt.splitlines()[0].strip()
-         if cid.isdigit():
-             return f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
-     except Exception:
-         pass
-     q = urllib.parse.quote(base_name)
-     return f"https://pubchem.ncbi.nlm.nih.gov/#query={q}"
-
-
-
- def build_library_from_raw_data(input_path=None, output_path=None, is_reference=False):
-     if input_path is None:
-         print('Error: please specify input_path (i.e. the path to the input mgf, mzML, cdf, json, or msp file). Mandatory argument.')
-         sys.exit()
-
-     if output_path is None:
-         basename = input_path.split('/')[-1].split('.')[0]
-         output_path = f'{Path.cwd()}/{basename}.csv'
-         print(f'Warning: no output_path specified, so library is written to {output_path}')
-
-     if is_reference not in [True, False]:
-         print('Error: is_reference must be either \'True\' or \'False\'.')
-         sys.exit()
-
-     extension = input_path.rsplit('.', 1)[-1].lower()
-     if extension == 'mgf':
-         input_file_type = 'mgf'
-     elif extension == 'mzml':
-         input_file_type = 'mzML'
-     elif extension == 'json':
-         input_file_type = 'json'
-     elif extension == 'cdf':
-         input_file_type = 'cdf'
-     elif extension == 'msp':
-         input_file_type = 'msp'
-     else:
-         print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'json\', or \'msp\' file must be passed to input_path')
-         sys.exit()
-
-
-
- def generate_plots_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz=None, precursor_ion_mz_tolerance=None, ionization_mode=None, collision_energy=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
-
-     if query_data is None:
-         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
-         sys.exit()
-     else:
-         extension = query_data.rsplit('.', 1)[-1].lower()
-         if extension in ['mgf', 'mzml', 'cdf', 'msp', 'json']:
-             output_path_tmp = query_data[:-len(extension)] + 'txt'
-             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-             df_query = pd.read_csv(output_path_tmp, sep='\t')
-         if extension == 'txt':
-             df_query = pd.read_csv(query_data, sep='\t')
-         unique_query_ids = [str(tmp) for tmp in df_query['id'].unique().tolist()]
-
-     if reference_data is None:
-         print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
-         sys.exit()
-     else:
-         extension = reference_data.rsplit('.', 1)[-1].lower()
-         if extension in ['mgf', 'mzml', 'cdf', 'msp', 'json']:
-             output_path_tmp = reference_data[:-len(extension)] + 'txt'
-             build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
-             df_reference = pd.read_csv(output_path_tmp, sep='\t')
-         if extension == 'txt':
-             df_reference = pd.read_csv(reference_data, sep='\t')
-         cols_tmp = df_reference.columns.tolist()
-         if 'precursor_ion_mz' in cols_tmp and 'ionization_mode' in cols_tmp and 'collision_energy' in cols_tmp:
-             if precursor_ion_mz is not None and precursor_ion_mz_tolerance is not None:
-                 df_reference = df_reference.loc[(df_reference['precursor_ion_mz'] > (precursor_ion_mz - precursor_ion_mz_tolerance)) & (df_reference['precursor_ion_mz'] < (precursor_ion_mz + precursor_ion_mz_tolerance))]
-             if ionization_mode is not None:
-                 df_reference = df_reference.loc[df_reference['ionization_mode'] == ionization_mode]
-             if collision_energy is not None:
-                 df_reference = df_reference.loc[df_reference['collision_energy'] == collision_energy]
-             df_reference = df_reference.drop(columns=['precursor_ion_mz','ionization_mode','collision_energy'])
-         unique_reference_ids = [str(tmp) for tmp in df_reference['id'].unique().tolist()]
-
-     if spectrum_ID1 is not None:
-         spectrum_ID1 = str(spectrum_ID1)
-     else:
-         spectrum_ID1 = str(df_query['id'].iloc[0])
-         print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')
-
-     if spectrum_ID2 is not None:
-         spectrum_ID2 = str(spectrum_ID2)
-     else:
-         spectrum_ID2 = str(df_reference['id'].iloc[0])
-         print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')
-
-     if spectrum_preprocessing_order is not None:
-         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
-     else:
-         spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
-     if 'M' not in spectrum_preprocessing_order:
-         print('Error: \'M\' must be a character in spectrum_preprocessing_order.')
-         sys.exit()
-     if 'C' in spectrum_preprocessing_order:
-         if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
-             print('Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
-             sys.exit()
-     if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
-         print('Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
-         sys.exit()
-
-     if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
-         print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
-         sys.exit()
-
-     if isinstance(int_min, int):
-         int_min = float(int_min)
-     if isinstance(int_max, int):
-         int_max = float(int_max)
-     if not isinstance(mz_min, int) or not isinstance(mz_max, int) or not isinstance(int_min, float) or not isinstance(int_max, float):
-         print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
-         sys.exit()
-     if mz_min < 0:
-         print('\nError: mz_min should be a non-negative integer')
-         sys.exit()
-     if mz_max <= 0:
-         print('\nError: mz_max should be a positive integer')
-         sys.exit()
-     if int_min < 0:
-         print('\nError: int_min should be a non-negative float')
-         sys.exit()
-     if int_max <= 0:
-         print('\nError: int_max should be a positive float')
-         sys.exit()
-
-     if not isinstance(window_size_centroiding, float) or window_size_centroiding <= 0.0:
-         print('Error: window_size_centroiding must be a positive float.')
-         sys.exit()
-     if not isinstance(window_size_matching, float) or window_size_matching <= 0.0:
-         print('Error: window_size_matching must be a positive float.')
-         sys.exit()
-
-     if isinstance(noise_threshold, int):
-         noise_threshold = float(noise_threshold)
-     if not isinstance(noise_threshold, float) or noise_threshold < 0:
-         print('Error: noise_threshold must be a non-negative float.')
-         sys.exit()
-
-     if isinstance(wf_intensity, int):
-         wf_intensity = float(wf_intensity)
-     if isinstance(wf_mz, int):
-         wf_mz = float(wf_mz)
-     if not isinstance(wf_intensity, float) or not isinstance(wf_mz, float):
-         print('Error: wf_mz and wf_intensity must be integers or floats')
-         sys.exit()
-
-     if entropy_dimension <= 0:
-         print('\nError: entropy_dimension should be a positive float')
-         sys.exit()
-     else:
-         q = entropy_dimension
-
-     normalization_method = 'standard'  # consider additional normalization methods to turn intensities into a probability distribution; softmax causes many numerical errors/warnings
-
-     if y_axis_transformation not in ['normalized','none','log10','sqrt']:
-         print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
-         sys.exit()
-
-     if output_path is None:
-         print(f'Warning: plots will be saved to ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
-         output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'
-
-
-     if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
-         query_idx = unique_query_ids.index(spectrum_ID1)
-         reference_idx = unique_query_ids.index(spectrum_ID2)
-         q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[query_idx])[0]
-         r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[reference_idx])[0]
-         q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
-         r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
-     elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
-         query_idx = unique_reference_ids.index(spectrum_ID1)
-         reference_idx = unique_reference_ids.index(spectrum_ID2)
-         q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[query_idx])[0]
-         r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[reference_idx])[0]
-         q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
-         r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
-     else:
-         if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
-             spec_tmp = spectrum_ID1
-             spectrum_ID1 = spectrum_ID2
-             spectrum_ID2 = spec_tmp
-         query_idx = unique_query_ids.index(spectrum_ID1)
-         reference_idx = unique_reference_ids.index(spectrum_ID2)
-         q_idxs_tmp = np.where(df_query['id'].astype(str) == unique_query_ids[query_idx])[0]
-         r_idxs_tmp = np.where(df_reference['id'].astype(str) == unique_reference_ids[reference_idx])[0]
-         q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
-         r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
-
-     q_spec_pre_trans = q_spec.copy()
-     r_spec_pre_trans = r_spec.copy()
-     q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
-     r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
-
-     if y_axis_transformation == 'normalized':
-         q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
-         r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
-         ylab = 'Normalized Intensity'
-     elif y_axis_transformation == 'log10':
-         q_spec_pre_trans[:,1] = np.log10(np.array(q_spec_pre_trans[:,1] + 1, dtype=float))
-         r_spec_pre_trans[:,1] = np.log10(np.array(r_spec_pre_trans[:,1] + 1, dtype=float))
-         ylab = 'log10(Intensity)'
-     elif y_axis_transformation == 'sqrt':
-         q_spec_pre_trans[:,1] = np.sqrt(np.array(q_spec_pre_trans[:,1], dtype=float))
-         r_spec_pre_trans[:,1] = np.sqrt(np.array(r_spec_pre_trans[:,1], dtype=float))
-         ylab = 'sqrt(Intensity)'
-     else:
-         ylab = 'Raw Intensity'
-
-     fig, axes = plt.subplots(nrows=2, ncols=1)
-
-     plt.subplot(2,1,1)
-     plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*q_spec_pre_trans.shape[0], ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID 1: {spectrum_ID1}')
-     plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*r_spec_pre_trans.shape[0], ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID 2: {spectrum_ID2}')
-     plt.xlabel('m/z', fontsize=7)
-     plt.ylabel(ylab, fontsize=7)
-     plt.xticks(fontsize=7)
-     plt.yticks(fontsize=7)
-     plt.title('Untransformed Spectra', fontsize=10)
-
-     mz_min_tmp_q = round(q_spec[:,0].min(), 1)
-     mz_min_tmp_r = round(r_spec[:,0].min(), 1)
-     int_min_tmp_q = round(q_spec[:,1].min(), 1)
-     int_min_tmp_r = round(r_spec[:,1].min(), 1)
-     mz_max_tmp_q = round(q_spec[:,0].max(), 1)
-     mz_max_tmp_r = round(r_spec[:,0].max(), 1)
-     int_max_tmp_q = round(q_spec[:,1].max(), 1)
-     int_max_tmp_r = round(r_spec[:,1].max(), 1)
-     mz_min_tmp = min([mz_min_tmp_q, mz_min_tmp_r])
-     mz_max_tmp = max([mz_max_tmp_q, mz_max_tmp_r])
-     int_min_tmp = min([int_min_tmp_q, int_min_tmp_r])
-     int_max_tmp = max([int_max_tmp_q, int_max_tmp_r])
-
-     is_matched = False
-     for transformation in spectrum_preprocessing_order:
-         if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
-             q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
-             r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
-         if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
-             m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
-             q_spec = m_spec[:,0:2]
-             r_spec = m_spec[:,[0,2]]
-             is_matched = True
-         if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
-             q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
-             r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-         if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
-             q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
-             r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
-         if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
-             q_spec = remove_noise(q_spec, nr=noise_threshold)
-             if high_quality_reference_library == False or high_quality_reference_library == 'False':
-                 r_spec = remove_noise(r_spec, nr=noise_threshold)
-         if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
-             q_spec = filter_spec_lcms(q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched)
-             if high_quality_reference_library == False or high_quality_reference_library == 'False':
-                 r_spec = filter_spec_lcms(r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched)
-
-     q_ints = q_spec[:,1]
-     r_ints = r_spec[:,1]
-
-     if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
-         similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
-     else:
-         similarity_score = 0
-
-     plt.subplot(2,1,2)
-
-     if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
-         if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
-             plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
-             plt.xticks([])
-             plt.yticks([])
-         else:
-             if y_axis_transformation == 'normalized':
-                 q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
-                 r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
-                 ylab = 'Normalized Intensity'
-             elif y_axis_transformation == 'log10':
-                 q_spec[:,1] = np.log10(q_spec[:,1] + 1)
-                 r_spec[:,1] = np.log10(r_spec[:,1] + 1)
-                 ylab = 'log10(Intensity)'
-             elif y_axis_transformation == 'sqrt':
-                 q_spec[:,1] = np.sqrt(q_spec[:,1])
-                 r_spec[:,1] = np.sqrt(r_spec[:,1])
-                 ylab = 'sqrt(Intensity)'
-             else:
-                 ylab = 'Raw Intensity'
-             plt.vlines(x=q_spec[:,0], ymin=[0]*q_spec.shape[0], ymax=q_spec[:,1], linewidth=3, color='blue')
-             plt.vlines(x=r_spec[:,0], ymin=[0]*r_spec.shape[0], ymax=-r_spec[:,1], linewidth=3, color='red')
-             plt.xlabel('m/z', fontsize=7)
-             plt.ylabel(ylab, fontsize=7)
-             plt.xticks(fontsize=7)
-             plt.yticks(fontsize=7)
-             plt.title('Transformed Spectra', fontsize=10)
-     else:
-         plt.text(0.5, 0.5, 'All points in the spectra were removed during preprocessing. \nChange the spectrum_preprocessing_order and/or change other spectrum-preprocessing parameters.', ha='center', va='center', fontsize=7, color='black')
-         plt.xticks([])
-         plt.yticks([])
-
-     plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
-     plt.figlegend(loc='upper center')
-
-     fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
-     fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
-     fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
-     fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
-     fig.text(0.05, 0.08, f'Window Size (Centroiding): {window_size_centroiding}', fontsize=7)
-     fig.text(0.05, 0.05, f'Window Size (Matching): {window_size_matching}', fontsize=7)
-     if similarity_measure == 'mixture':
-         fig.text(0.05, 0.02, f'Weights for mixture similarity: {weights}', fontsize=7)
-
-     fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{mz_min_tmp},{mz_max_tmp}]', fontsize=7)
-     fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
-     fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
-     fig.text(0.40, 0.11, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
-     fig.text(0.40, 0.08, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
-
-     if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
-         url_tmp1 = get_pubchem_url(query=spectrum_ID1)
-         url_tmp2 = get_pubchem_url(query=spectrum_ID2)
-         t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
-         t2 = fig.text(0.40, 0.02, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
-         t1.set_url(url_tmp1)
-         t2.set_url(url_tmp2)
-
-     if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
-         url_tmp1 = get_pubchem_url(query=spectrum_ID1)
-         t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
-         t1.set_url(url_tmp1)
-
-     if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
-         url_tmp2 = get_pubchem_url(query=spectrum_ID2)
-         t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
-         t2.set_url(url_tmp2)
-
-     fig.savefig(output_path, format='svg')
-
-     if return_plot == True:
-         return fig
-
-
- def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FNLW', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
-
-     if query_data is None:
-         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
-         sys.exit()
-     else:
-         extension = query_data.rsplit('.', 1)[-1].lower()
-         if extension in ['mgf', 'mzml', 'cdf', 'msp', 'json']:
-             output_path_tmp = query_data[:-len(extension)] + 'txt'
-             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-             df_query = pd.read_csv(output_path_tmp, sep='\t')
-         if extension == 'txt':
-             df_query = pd.read_csv(query_data, sep='\t')
-         unique_query_ids = df_query['id'].unique()
-
-     if reference_data is None:
-         print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
-         sys.exit()
-     else:
-         extension = reference_data.rsplit('.', 1)[-1].lower()
-         if extension in ['mgf', 'mzml', 'cdf', 'msp', 'json']:
-             output_path_tmp = reference_data[:-len(extension)] + 'txt'
-             build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
-             df_reference = pd.read_csv(output_path_tmp, sep='\t')
-         if extension == 'txt':
-             df_reference = pd.read_csv(reference_data, sep='\t')
-         unique_reference_ids = df_reference['id'].unique()
-
-     if spectrum_ID1 is not None:
-         spectrum_ID1 = str(spectrum_ID1)
-     else:
-         spectrum_ID1 = str(df_query.iloc[0,0])
-         print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')
-
-     if spectrum_ID2 is not None:
-         spectrum_ID2 = str(spectrum_ID2)
-     else:
-         spectrum_ID2 = str(df_reference.iloc[0,0])
-         print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')
-
-     if spectrum_preprocessing_order is not None:
-         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
-     else:
-         spectrum_preprocessing_order = ['F','N','W','L']
-     if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
-         print('Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
-         sys.exit()
-
-     if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
-         print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
-         sys.exit()
-
-     if isinstance(int_min, int):
-         int_min = float(int_min)
-     if isinstance(int_max, int):
-         int_max = float(int_max)
-     if not isinstance(mz_min, int) or not isinstance(mz_max, int) or not isinstance(int_min, float) or not isinstance(int_max, float):
-         print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
-         sys.exit()
-     if mz_min < 0:
-         print('\nError: mz_min should be a non-negative integer')
-         sys.exit()
-     if mz_max <= 0:
-         print('\nError: mz_max should be a positive integer')
-         sys.exit()
-     if int_min < 0:
-         print('\nError: int_min should be a non-negative float')
-         sys.exit()
-     if int_max <= 0:
-         print('\nError: int_max should be a positive float')
-         sys.exit()
-
-     if isinstance(noise_threshold, int):
-         noise_threshold = float(noise_threshold)
-     if not isinstance(noise_threshold, float) or noise_threshold < 0:
-         print('Error: noise_threshold must be a non-negative float.')
-         sys.exit()
-
-     if isinstance(wf_intensity, int):
-         wf_intensity = float(wf_intensity)
-     if isinstance(wf_mz, int):
-         wf_mz = float(wf_mz)
-     if not isinstance(wf_intensity, float) or not isinstance(wf_mz, float):
-         print('Error: wf_mz and wf_intensity must be integers or floats')
-         sys.exit()
-
-     if entropy_dimension <= 0:
-         print('\nError: entropy_dimension should be a positive float')
-         sys.exit()
-     else:
-         q = entropy_dimension
-
-     normalization_method = 'standard'  # consider additional normalization methods to turn intensities into a probability distribution; softmax causes many numerical errors/warnings
-
-     if y_axis_transformation not in ['normalized','none','log10','sqrt']:
-         print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
-         sys.exit()
-
-     if output_path is None:
-         print(f'Warning: plots will be saved to ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
-         output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'
-
-     min_mz = np.min([df_query['mz_ratio'].min(), df_reference['mz_ratio'].min()])
-     max_mz = np.max([df_query['mz_ratio'].max(), df_reference['mz_ratio'].max()])
-     mzs = np.linspace(min_mz, max_mz, int(max_mz - min_mz + 1))
-
-     unique_query_ids = [str(ID) for ID in df_query['id'].unique().tolist()]
-     unique_reference_ids = [str(ID) for ID in df_reference['id'].unique().tolist()]
-     common_IDs = np.intersect1d(unique_query_ids, unique_reference_ids)
-     if len(common_IDs) > 0:
-         print(f'Warning: the query and reference library have overlapping IDs: {common_IDs}')
-
-     if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
-         q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID1)[0]
-         r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID2)[0]
-         q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
-         r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
-     elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
-         q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID1)[0]
-         r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID2)[0]
-         q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
-         r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
-     else:
-         if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
-             spec_tmp = spectrum_ID1
-             spectrum_ID1 = spectrum_ID2
-             spectrum_ID2 = spec_tmp
-         q_idxs_tmp = np.where(df_query['id'].astype(str) == spectrum_ID1)[0]
-         r_idxs_tmp = np.where(df_reference['id'].astype(str) == spectrum_ID2)[0]
-         q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
-         r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
-
-     q_spec = convert_spec(q_spec, mzs)
-     r_spec = convert_spec(r_spec, mzs)
-
-     int_min_tmp_q = min(q_spec[q_spec[:,1].nonzero(), 1][0])
-     int_min_tmp_r = min(r_spec[r_spec[:,1].nonzero(), 1][0])
-     int_max_tmp_q = max(q_spec[q_spec[:,1].nonzero(), 1][0])
-     int_max_tmp_r = max(r_spec[r_spec[:,1].nonzero(), 1][0])
-     int_min_tmp = int(min([int_min_tmp_q, int_min_tmp_r]))
-     int_max_tmp = int(max([int_max_tmp_q, int_max_tmp_r]))
-
-     fig, axes = plt.subplots(nrows=2, ncols=1)
-
-     plt.subplot(2,1,1)
-
-     if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
-         plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
-         plt.xticks([])
-         plt.yticks([])
-     else:
-         q_spec_pre_trans = q_spec.copy()
-         r_spec_pre_trans = r_spec.copy()
-         q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
-         r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
-
-         if y_axis_transformation == 'normalized':
-             q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
-             r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
-             ylab = 'Normalized Intensity'
-         elif y_axis_transformation == 'log10':
-             q_spec_pre_trans[:,1] = np.log10(q_spec_pre_trans[:,1] + 1)
-             r_spec_pre_trans[:,1] = np.log10(r_spec_pre_trans[:,1] + 1)
-             ylab = 'log10(Intensity)'
-         elif y_axis_transformation == 'sqrt':
-             q_spec_pre_trans[:,1] = np.sqrt(q_spec_pre_trans[:,1])
-             r_spec_pre_trans[:,1] = np.sqrt(r_spec_pre_trans[:,1])
-             ylab = 'sqrt(Intensity)'
-         else:
-             ylab = 'Raw Intensity'
-         plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*len(q_spec_pre_trans[:,0]), ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID1: {spectrum_ID1}')
-         plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*len(r_spec_pre_trans[:,0]), ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID2: {spectrum_ID2}')
-         plt.xlabel('m/z', fontsize=7)
-         plt.ylabel(ylab, fontsize=7)
-         plt.xticks(fontsize=7)
-         plt.yticks(fontsize=7)
-         plt.title('Untransformed Query and Reference Spectra', fontsize=10)
-
-     for transformation in spectrum_preprocessing_order:
-         if transformation == 'W':
-             q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
-             r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-         if transformation == 'L':
-             q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method)
-             r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method)
-         if transformation == 'N':
-             q_spec = remove_noise(q_spec, nr=noise_threshold)
-             if high_quality_reference_library == False or high_quality_reference_library == 'False':
-                 r_spec = remove_noise(r_spec, nr=noise_threshold)
-         if transformation == 'F':
-             q_spec = filter_spec_gcms(q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max)
-             if high_quality_reference_library == False or high_quality_reference_library == 'False':
-                 r_spec = filter_spec_gcms(r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max)
-
-     if q_spec.shape[0] > 1:
-         similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
-     else:
-         similarity_score = 0
-
-
-     plt.subplot(2,1,2)
-
-     if q_spec.shape[0] == 0 or r_spec.shape[0] == 0:
-         plt.text(0.5, 0.5, 'The query and/or reference spectrum has no ion fragments left after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
-         plt.xticks([])
-         plt.yticks([])
-     elif np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
-         plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
-         plt.xticks([])
-         plt.yticks([])
-     else:
-         if y_axis_transformation == 'normalized':
-             q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
-             r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
-             ylab = 'Normalized Intensity'
-         elif y_axis_transformation == 'log10':
-             q_spec[:,1] = np.log10(q_spec[:,1] + 1)
-             r_spec[:,1] = np.log10(r_spec[:,1] + 1)
-             ylab = 'log10(Intensity)'
-         elif y_axis_transformation == 'sqrt':
-             q_spec[:,1] = np.sqrt(q_spec[:,1])
-             r_spec[:,1] = np.sqrt(r_spec[:,1])
-             ylab = 'sqrt(Intensity)'
-         else:
-             ylab = 'Raw Intensity'
-         plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=q_spec[:,1], linewidth=3, color='blue')
-         plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=-r_spec[:,1], linewidth=3, color='red')
-         plt.xlabel('m/z', fontsize=7)
-         plt.ylabel(ylab, fontsize=7)
-         plt.xticks(fontsize=7)
-         plt.yticks(fontsize=7)
-         plt.title('Transformed Query and Reference Spectra', fontsize=10)
-
-     plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
-     plt.figlegend(loc='upper center')
-
-     fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
-     fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
-     fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
-     fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
-     fig.text(0.05, 0.08, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
-     if similarity_measure == 'mixture':
-         fig.text(0.05, 0.05, f'Weights for mixture similarity: {weights}', fontsize=7)
-
-     fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{min_mz},{max_mz}]', fontsize=7)
-     fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
-     fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
-     fig.text(0.40, 0.11, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
-
-     if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
-         url_tmp1 = get_pubchem_url(query=spectrum_ID1)
-         url_tmp2 = get_pubchem_url(query=spectrum_ID2)
-         t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
-         t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
-         t1.set_url(url_tmp1)
-         t2.set_url(url_tmp2)
-
-     if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
-         url_tmp1 = get_pubchem_url(query=spectrum_ID1)
-         t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
-         t1.set_url(url_tmp1)
-
-     if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
-         url_tmp2 = get_pubchem_url(query=spectrum_ID2)
-         t2 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
-         t2.set_url(url_tmp2)
-
-     fig.savefig(output_path, format='svg')
-
-     if return_plot == True:
-         return fig
-
-
- def wf_transform(spec_mzs, spec_ints, wf_mz, wf_int):
-     spec_ints = np.power(spec_mzs, wf_mz) * np.power(spec_ints, wf_int)
-     return spec_ints
-
-
- def LE_transform(intensity, thresh, normalization_method):
-     intensity_tmp = normalize(intensity, method=normalization_method)
-     if np.sum(intensity_tmp) > 0:
-         S = scipy.stats.entropy(intensity_tmp.astype('float'))
-         if S > 0 and S < thresh:
-             w = (1 + S) / (1 + thresh)
-             intensity = np.power(intensity_tmp, w)
-     else:
-         intensity = np.zeros(len(intensity))
-     return intensity
-
-
- def normalize(intensities, method='standard'):
-     if np.sum(intensities) > 0:
-         if method == 'softmax':
-             if np.any(intensities > 700):
-                 print("Warning: some intensities are too large to exponentiate. Applying standard normalization.")
-                 intensities /= np.sum(intensities)
-             else:
-                 intensities2 = np.exp(intensities)
-                 if np.isinf(intensities2).sum() == 0:
-                     intensities = intensities2 / np.sum(intensities2)
-         elif method == 'standard':
-             intensities /= np.sum(intensities)
-     return intensities
-
-
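Editor's note: a small numeric sketch (not from the package) of normalize() and LE_transform() as defined above, assuming both definitions are in scope; the printed values are approximate:

    import numpy as np

    ints = np.array([8.0, 1.0, 1.0])
    p = normalize(ints.copy(), method='standard')     # -> [0.8, 0.1, 0.1]
    # Shannon entropy of p is ~0.639; with thresh=3.0 that counts as "low
    # entropy", so intensities are re-weighted by w = (1 + S)/(1 + thresh) ~ 0.41:
    out = LE_transform(ints.copy(), 3.0, 'standard')  # -> ~[0.913, 0.389, 0.389]
    print(p, out)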
- def filter_spec_lcms(spec, mz_min=0, mz_max=999999999999, int_min=0, int_max=999999999999, is_matched=False):
-     if is_matched == False:
-         spec = spec[spec[:,0] >= mz_min]
-         spec = spec[spec[:,0] <= mz_max]
-         spec = spec[spec[:,1] >= int_min]
-         spec = spec[spec[:,1] <= int_max]
-     else:
-         spec = spec[spec[:,0] >= mz_min]
-         spec = spec[spec[:,0] <= mz_max]
-         spec[spec[:,1] < int_min] = 0
-         spec[spec[:,1] > int_max] = 0
-     return spec
-
-
- def filter_spec_gcms(spec, mz_min=0, mz_max=999999999999, int_min=0, int_max=999999999999):
-     spec[np.where(spec[:,0] < mz_min)[0], 1] = 0
-     spec[np.where(spec[:,0] > mz_max)[0], 1] = 0
-     spec[np.where(spec[:,1] < int_min)[0], 1] = 0
-     spec[np.where(spec[:,1] > int_max)[0], 1] = 0
-     return spec
-
-
- def remove_noise(spec, nr):
-     if spec.shape[0] > 1:
-         if nr is not None:
-             spec[np.where(spec[:,1] < np.max(spec[:,1]) * nr)[0]] = 0
-     return spec
-
-
- def centroid_spectrum(spec, window_size):
-     spec = spec[np.argsort(spec[:, 0])]
-
-     mz_array = spec[:, 0]
-     need_centroid = 0
-     if mz_array.shape[0] > 1:
-         mz_delta = mz_array[1:] - mz_array[:-1]
-         if np.min(mz_delta) <= window_size:
-             need_centroid = 1
-
-     if need_centroid:
-         intensity_order = np.argsort(-spec[:, 1])
-         spec_new = []
-         for i in intensity_order:
-             mz_delta_allowed = window_size
-
-             if spec[i, 1] > 0:
-                 i_left = i - 1
-                 while i_left >= 0:
-                     mz_delta_left = spec[i, 0] - spec[i_left, 0]
-                     if mz_delta_left <= mz_delta_allowed:
-                         i_left -= 1
-                     else:
-                         break
-                 i_left += 1
-
-                 i_right = i + 1
-                 while i_right < spec.shape[0]:
-                     mz_delta_right = spec[i_right, 0] - spec[i, 0]
-                     if mz_delta_right <= mz_delta_allowed:
-                         i_right += 1
-                     else:
-                         break
-
-                 intensity_sum = np.sum(spec[i_left:i_right, 1])
-                 intensity_weighted_sum = np.sum(spec[i_left:i_right, 0] * spec[i_left:i_right, 1])
-
-                 spec_new.append([intensity_weighted_sum / intensity_sum, intensity_sum])
-                 spec[i_left:i_right, 1] = 0
-
-         spec_new = np.array(spec_new)
-         spec_new = spec_new[np.argsort(spec_new[:, 0])]
-         if spec_new.shape[0] > 1:
-             return spec_new
-         else:
-             return np.array([[0, 0]])
-     else:
-         return spec
-
-
-
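Editor's note: a quick check (not from the package) of centroid_spectrum() as reconstructed above, assuming its definition is in scope; two peaks 0.2 Da apart are merged into an intensity-weighted centroid:

    import numpy as np

    spec = np.array([[100.0, 50.0], [100.2, 100.0], [150.0, 30.0]])
    print(centroid_spectrum(spec, window_size=0.5))
    # -> [[100.1333..., 150.0], [150.0, 30.0]]
    # (100.0*50 + 100.2*100) / 150 = 100.1333...; the 150.0 peak is isolated.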
- def match_peaks_in_spectra(spec_a, spec_b, window_size):
-     a = 0
-     b = 0
-
-     spec_merged = []
-     peak_b_int = 0.
-     while a < spec_a.shape[0] and b < spec_b.shape[0]:
-         mass_delta = spec_a[a, 0] - spec_b[b, 0]
-
-         if mass_delta < -window_size:
-             spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
-             peak_b_int = 0.
-             a += 1
-         elif mass_delta > window_size:
-             spec_merged.append([spec_b[b, 0], 0., spec_b[b, 1]])
-             b += 1
-         else:
-             peak_b_int += spec_b[b, 1]
-             b += 1
-
-     if peak_b_int > 0.:
-         spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
-         peak_b_int = 0.
-         a += 1
-
-     if b < spec_b.shape[0]:
-         spec_merged += [[x[0], 0., x[1]] for x in spec_b[b:]]
-
-     if a < spec_a.shape[0]:
-         spec_merged += [[x[0], x[1], 0.] for x in spec_a[a:]]
-
-     if spec_merged:
-         spec_merged = np.array(spec_merged, dtype=np.float64)
-     else:
-         spec_merged = np.array([[0., 0., 0.]], dtype=np.float64)
-     return spec_merged
-
-
-
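Editor's note: a toy example (not from the package) of match_peaks_in_spectra() as reconstructed above, assuming its definition is in scope; rows of the result are [m/z, intensity_a, intensity_b]:

    import numpy as np

    spec_a = np.array([[100.0, 10.0], [200.0, 5.0]])
    spec_b = np.array([[100.3,  8.0], [150.0, 4.0]])
    print(match_peaks_in_spectra(spec_a, spec_b, window_size=0.5))
    # -> [[100.0, 10.0, 8.0],   peaks within 0.5 Da are paired on one row
    #     [150.0,  0.0, 4.0],   reference-only peak
    #     [200.0,  5.0, 0.0]]   query-only peak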
- def convert_spec(spec, mzs):
-     ints_tmp = []
-     for i in range(0, len(mzs)):
-         if mzs[i] in spec[:,0]:
-             int_tmp = spec[np.where(spec[:,0] == mzs[i])[0][0], 1]
-         else:
-             int_tmp = 0
-         ints_tmp.append(int_tmp)
-     out = np.transpose(np.array([mzs, ints_tmp]))
-     return out
-
-
- def get_reference_df(reference_data, likely_reference_IDs=None):
-     extension = reference_data.rsplit('.', 1)[-1].lower()
-     if extension in ['mgf', 'mzml', 'cdf', 'msp', 'json']:
-         output_path_tmp = reference_data[:-len(extension)] + 'txt'
-         build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
-         df_reference = pd.read_csv(output_path_tmp, sep='\t')
-     if extension == 'txt':
-         df_reference = pd.read_csv(reference_data, sep='\t')
-     if likely_reference_IDs is not None:
-         likely_reference_IDs = pd.read_csv(likely_reference_IDs, header=None)
-         df_reference = df_reference.loc[df_reference.iloc[:,0].isin(likely_reference_IDs.iloc[:,0].tolist())]
-     return df_reference
-
-
- def S_cos(ints_a, ints_b):
-     if np.sum(ints_a) == 0 or np.sum(ints_b) == 0:
-         return 0
-     else:
-         return np.dot(ints_a, ints_b) / (np.sqrt(sum(np.power(ints_a, 2))) * np.sqrt(sum(np.power(ints_b, 2))))
-
-
- def ent_renyi(ints, q):
-     return np.log(sum(np.power(ints, q))) / (1 - q)
-
-
- def ent_tsallis(ints, q):
-     return (sum(np.power(ints, q)) - 1) / (1 - q)
-
-
- def S_shannon(ints_a, ints_b):
-     ent_a = scipy.stats.entropy(ints_a)
-     ent_b = scipy.stats.entropy(ints_b)
-     ent_ab = scipy.stats.entropy(ints_a + ints_b)
-     return 1 - (2 * ent_ab - ent_a - ent_b) / np.log(4)
-
-
- def S_renyi(ints_a, ints_b, q):
-     if q == 1:
-         print('Warning: the Renyi Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
-         return S_shannon(ints_a, ints_b)
-     else:
-         ent_a = ent_renyi(ints_a, q)
-         ent_b = ent_renyi(ints_b, q)
-         ent_merg = ent_renyi(ints_a/2 + ints_b/2, q)
-         N = (1/(1-q)) * (2*np.log(np.sum(np.power(ints_a/2, q)) + np.sum(np.power(ints_b/2, q))) - np.log(np.sum(np.power(ints_a, q))) - np.log(np.sum(np.power(ints_b, q))))
-         return 1 - (2 * ent_merg - ent_a - ent_b) / N
-
-
- def S_tsallis(ints_a, ints_b, q):
-     if q == 1:
-         print('Warning: the Tsallis Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
-         return S_shannon(ints_a, ints_b)
-     else:
-         ent_a = ent_tsallis(ints_a, q)
-         ent_b = ent_tsallis(ints_b, q)
-         ent_merg = ent_tsallis(ints_a/2 + ints_b/2, q)
-         N = np.sum(2*np.power(ints_a/2, q) + 2*np.power(ints_b/2, q) - np.power(ints_a, q) - np.power(ints_b, q)) / (1 - q)
-         return 1 - (2 * ent_merg - ent_a - ent_b) / N
-
- def S_mixture(ints_a, ints_b, weights={'Cosine':0.25, 'Shannon':0.25, 'Renyi':0.25, 'Tsallis':0.25}, q=1.1):
-     if set(weights.keys()).issubset({'Cosine','Shannon','Renyi','Tsallis'}) is False:
-         print('Error: the keys of the weights dict passed to S_mixture must be among: Cosine, Shannon, Renyi, Tsallis')
-         sys.exit()
-
-     similarity = 0
-     for key, value in weights.items():
-         if key == 'Cosine':
-             similarity += value * S_cos(ints_a, ints_b)
-         if key == 'Shannon':
-             similarity += value * S_shannon(ints_a, ints_b)
-         if key == 'Renyi':
-             similarity += value * S_renyi(ints_a, ints_b, q)
-         if key == 'Tsallis':
-             similarity += value * S_tsallis(ints_a, ints_b, q)
-     return similarity
-
-
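Editor's note: a small numeric check (not from the package) of the similarity measures defined above, assuming their definitions are in scope, on two toy intensity vectors already normalized to sum to 1; values are approximate:

    import numpy as np

    ints_a = np.array([0.7, 0.2, 0.1, 0.0])
    ints_b = np.array([0.6, 0.2, 0.1, 0.1])
    print(S_cos(ints_a, ints_b))      # ~0.987
    print(S_shannon(ints_a, ints_b))  # ~0.947
    # S_renyi/S_tsallis take an extra entropy dimension q and reduce to
    # S_shannon as q -> 1, which is why the q == 1 branches above delegate.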
- def get_contingency_entries(ints_a, ints_b):
-     a = 0
-     b = 0
-     c = 0
-
-     for x, y in zip(ints_a, ints_b):
-         if x != 0 and y != 0:
-             c += 1
-         elif x != 0 and y == 0:
-             a += 1
-         elif x == 0 and y != 0:
-             b += 1
-     return [a, b, c]
-
-
- def S_jaccard(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom = a + b + c
-     if denom == 0:
-         similarity = 0
-     else:
-         similarity = c / denom
-     return similarity
-
-
- def S_dice(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom = a + b + 2 * c
-     if denom == 0:
-         similarity = 0
-     else:
-         similarity = 2 * c / denom
-     return similarity
-
-
- def S_3w_jaccard(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom = a + b + 3 * c
-     if denom == 0:
-         similarity = 0
-     else:
-         similarity = 3 * c / denom
-     return similarity
-
-
- def S_sokal_sneath(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom = 2 * a + 2 * b + c
-     if denom == 0:
-         similarity = 0
-     else:
-         similarity = c / denom
-     return similarity
-
-
- def S_binary_cosine(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom = np.sqrt((a + c) * (b + c))
-     if denom == 0:
-         similarity = 0
-     else:
-         similarity = c / denom
-     return similarity
-
-
- def S_mountford(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom = c * (a + b) + 2 * a * b
-     if denom == 0:
-         similarity = 1
-     else:
-         similarity = 2 * c / denom
-     return similarity
-
-
- def S_mcconnaughey(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom = (a + c) * (b + c)
-     if denom == 0:
-         similarity = 0
-     else:
-         similarity = (c**2 - a * b) / denom
-     return similarity
-
-
- def S_driver_kroeber(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom = 2 * (a + c) * (b + c)
-     if denom == 0:
-         similarity = 0
-     else:
-         similarity = c * (a + b + 2 * c) / denom
-     return similarity
-
-
- def S_simpson(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom = min(a + c, b + c)
-     if denom == 0:
-         similarity = 0
-     else:
-         similarity = c / denom
-     return similarity
-
-
- def S_braun_banquet(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom = max(a + c, b + c)
-     if denom == 0:
-         similarity = 0
-     else:
-         similarity = c / denom
-     return similarity
-
-
- def S_fager_mcgowan(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom1 = np.sqrt((a + c) * (b + c))
-     denom2 = 2 * np.sqrt(max(a + c, b + c))
-     if denom1 == 0 or denom2 == 0:
-         similarity = 0
-     else:
-         similarity = c / denom1 - 1 / denom2
-     return similarity
-
-
- def S_kulczynski(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     denom = a + b
-     if denom == 0:
-         similarity = 1
-     else:
-         similarity = c / denom
-     return similarity
-
-
- def S_intersection(ints_a, ints_b):
-     return get_contingency_entries(ints_a, ints_b)[2]
-
-
- def S_hamming(ints_a, ints_b):
-     a, b, _ = get_contingency_entries(ints_a, ints_b)
-     denom = a + b
-     if denom == 0:
-         similarity = 1
-     else:
-         similarity = 1 / denom
-     return similarity
-
-
- def S_hellinger(ints_a, ints_b):
-     a, b, c = get_contingency_entries(ints_a, ints_b)
-     similarity = 1 - np.sqrt(1 - c / np.sqrt((a + c) * (b + c)))
-     return similarity
-
-
- def get_similarity(similarity_measure, q_ints, r_ints, weights, q):
-     if similarity_measure == 'cosine':
-         similarity = S_cos(q_ints, r_ints)
-     elif similarity_measure in ['shannon', 'renyi', 'tsallis']:
-         q_ints = normalize(q_ints, method='standard')
-         r_ints = normalize(r_ints, method='standard')
-         if similarity_measure == 'shannon':
-             similarity = S_shannon(q_ints, r_ints)
-         elif similarity_measure == 'renyi':
-             similarity = S_renyi(q_ints, r_ints, q)
-         elif similarity_measure == 'tsallis':
-             similarity = S_tsallis(q_ints, r_ints, q)
-     elif similarity_measure == 'mixture':
-         similarity = S_mixture(q_ints, r_ints, weights, q)
-     elif similarity_measure == 'jaccard':
-         similarity = S_jaccard(q_ints, r_ints)
-     elif similarity_measure == 'dice':
-         similarity = S_dice(q_ints, r_ints)
-     elif similarity_measure == '3w_jaccard':
-         similarity = S_3w_jaccard(q_ints, r_ints)
-     elif similarity_measure == 'sokal_sneath':
-         similarity = S_sokal_sneath(q_ints, r_ints)
-     elif similarity_measure == 'binary_cosine':
-         similarity = S_binary_cosine(q_ints, r_ints)
-     elif similarity_measure == 'mountford':
-         similarity = S_mountford(q_ints, r_ints)
-     elif similarity_measure == 'mcconnaughey':
-         similarity = S_mcconnaughey(q_ints, r_ints)
-     elif similarity_measure == 'driver_kroeber':
-         similarity = S_driver_kroeber(q_ints, r_ints)
-     elif similarity_measure == 'simpson':
-         similarity = S_simpson(q_ints, r_ints)
-     elif similarity_measure == 'braun_banquet':
-         similarity = S_braun_banquet(q_ints, r_ints)
-     elif similarity_measure == 'fager_mcgowan':
-         similarity = S_fager_mcgowan(q_ints, r_ints)
-     elif similarity_measure == 'kulczynski':
-         similarity = S_kulczynski(q_ints, r_ints)
-     elif similarity_measure == 'intersection':
-         similarity = S_intersection(q_ints, r_ints)
-     elif similarity_measure == 'hamming':
-         similarity = S_hamming(q_ints, r_ints)
-     elif similarity_measure == 'hellinger':
-         similarity = S_hellinger(q_ints, r_ints)
-     return similarity
-
-
- def _vector_to_full_params(X, default_params, optimize_params):
-     params = default_params.copy()
-     for name, val in zip(optimize_params, X):
-         params[name] = float(val)
-     return params
-
-
- def objective_function_HRMS(X, ctx):
-     p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
-     acc = get_acc_HRMS(
-         ctx["df_query"], ctx["df_reference"],
-         ctx["precursor_ion_mz_tolerance"], ctx["ionization_mode"], ctx["adduct"],
-         ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
-         ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
-         p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
-         p["wf_mz"], p["wf_int"], p["LET_threshold"],
-         p["entropy_dimension"],
-         ctx["high_quality_reference_library"],
-         verbose=False
-     )
-     print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
-     return 1.0 - acc
-
-
- def objective_function_NRMS(X, ctx):
-     p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
-     acc = get_acc_NRMS(
-         ctx["df_query"], ctx["df_reference"],
-         ctx["unique_query_ids"], ctx["unique_reference_ids"],
-         ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
-         ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
-         p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
-         ctx["high_quality_reference_library"],
-         verbose=False
-     )
-     print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
-     return 1.0 - acc
-
-
1258
- def tune_params_DE(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, chromatography_platform='HRMS', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1):
1259
-
1260
- if query_data is None:
1261
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
1262
- sys.exit()
1263
- else:
1264
- extension = query_data.rsplit('.',1)[-1]
- if extension.lower() in ('mgf','mzml','cdf','msp','json'):
- output_path_tmp = query_data.rsplit('.',1)[0] + '.txt'
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
- df_query = pd.read_csv(output_path_tmp, sep='\t')
- elif extension.lower() == 'txt':
- df_query = pd.read_csv(query_data, sep='\t')
- else:
- print(f'\nError: Unsupported query_data extension: {extension}')
- sys.exit()
1272
- unique_query_ids = df_query.iloc[:,0].unique()
1273
-
1274
- if reference_data is None:
1275
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
1276
- sys.exit()
1277
- else:
1278
- if isinstance(reference_data,str):
1279
- df_reference = get_reference_df(reference_data=reference_data)
1280
- unique_reference_ids = df_reference.iloc[:,0].unique()
1281
- else:
1282
- dfs = []
1283
- unique_reference_ids = []
1284
- for f in reference_data:
1285
- tmp = get_reference_df(reference_data=f)
1286
- dfs.append(tmp)
1287
- unique_reference_ids.extend(tmp.iloc[:,0].unique())
1288
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
1289
-
1290
- if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
1291
- df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
1292
- if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
1293
- df_reference = df_reference.loc[df_reference['adduct']==adduct]
1294
-
1295
- unique_query_ids = df_query['id'].unique().tolist()
1296
- unique_reference_ids = df_reference['id'].unique().tolist()
1297
-
1298
- ctx = dict(
1299
- df_query=df_query,
1300
- df_reference=df_reference,
1301
- precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
1302
- ionization_mode=ionization_mode,
1303
- adduct=adduct,
1304
- similarity_measure=similarity_measure,
1305
- weights=weights,
1306
- spectrum_preprocessing_order=spectrum_preprocessing_order,
1307
- mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max,
1308
- high_quality_reference_library=high_quality_reference_library,
1309
- default_params=default_params,
1310
- optimize_params=optimize_params,
1311
- )
1312
-
1313
- bounds = [param_bounds[p] for p in optimize_params]
1314
-
1315
- if chromatography_platform == 'HRMS':
1316
- result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
1317
- else:
1318
- result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
1319
-
1320
- best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
1321
- best_acc = 100.0 - (result.fun * 100.0)
1322
-
1323
- print("\n=== Differential Evolution Result ===")
1324
- print(f"Optimized over: {optimize_params}")
1325
- print("Best values (selected params):")
1326
- for name in optimize_params:
1327
- print(f" {name}: {best_full_params[name]}")
1328
- print("\nFull parameter set used in final evaluation:")
1329
- for k, v in best_full_params.items():
1330
- print(f" {k}: {v}")
1331
- print(f"\nBest accuracy: {best_acc:.3f}%")
1332
- _log(f"best = {result.x}, acc={100*(1-result.fun):.3f}%")
1333
-
1334
-
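A hedged usage sketch of the tuner (file names hypothetical); restricting optimize_params and param_bounds keeps the DE search space small:

    tune_params_DE(
        query_data='queries.txt',               # hypothetical path
        reference_data='reference_library.txt', # hypothetical path
        chromatography_platform='NRMS',
        optimize_params=['noise_threshold', 'wf_int'],
        param_bounds={'noise_threshold': (0.0, 0.25), 'wf_int': (0.5, 2.0)},
        maxiters=2,
    )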
1335
- default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
1336
- default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
1337
-
1338
-
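The grid tuners below merge the caller's grid over these defaults via {**default_grid, **(grid or {})}, so a caller only overrides the axes being searched; every other axis keeps its single default value:

    user_grid = {'similarity_measure': ['cosine', 'shannon'], 'noise_threshold': [0.0, 0.01]}
    local_grid = {**default_NRMS_grid, **user_grid}   # 2 x 2 = 4 combinations, all else fixed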
1339
- def _eval_one_HRMS(df_query, df_reference,
1340
- precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
1341
- similarity_measure_tmp, weight,
1342
- spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
1343
- int_min_tmp, int_max_tmp, noise_threshold_tmp,
1344
- window_size_centroiding_tmp, window_size_matching_tmp,
1345
- wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
1346
- entropy_dimension_tmp, high_quality_reference_library_tmp):
1347
-
1348
- acc = get_acc_HRMS(
1349
- df_query=df_query, df_reference=df_reference,
1350
- precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
1351
- ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
1352
- similarity_measure=similarity_measure_tmp, weights=weight,
1353
- spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
1354
- mz_min=mz_min_tmp, mz_max=mz_max_tmp,
1355
- int_min=int_min_tmp, int_max=int_max_tmp,
1356
- window_size_centroiding=window_size_centroiding_tmp,
1357
- window_size_matching=window_size_matching_tmp,
1358
- noise_threshold=noise_threshold_tmp,
1359
- wf_mz=wf_mz_tmp, wf_int=wf_int_tmp,
1360
- LET_threshold=LET_threshold_tmp,
1361
- entropy_dimension=entropy_dimension_tmp,
1362
- high_quality_reference_library=high_quality_reference_library_tmp,
1363
- verbose=False
1364
- )
1365
-
1366
- return (
1367
- acc, similarity_measure_tmp, json.dumps(weight), spectrum_preprocessing_order_tmp,
1368
- mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp,
1369
- noise_threshold_tmp, window_size_centroiding_tmp, window_size_matching_tmp,
1370
- wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp,
1371
- high_quality_reference_library_tmp
1372
- )
1373
-
1374
-
1375
- def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
1376
- similarity_measure_tmp, weight,
1377
- spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
1378
- int_min_tmp, int_max_tmp, noise_threshold_tmp,
1379
- wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
1380
- entropy_dimension_tmp, high_quality_reference_library_tmp):
1381
-
1382
- acc = get_acc_NRMS(
1383
- df_query=df_query, df_reference=df_reference,
1384
- unique_query_ids=unique_query_ids, unique_reference_ids=unique_reference_ids,
1385
- similarity_measure=similarity_measure_tmp, weights=weight,
1386
- spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
1387
- mz_min=mz_min_tmp, mz_max=mz_max_tmp,
1388
- int_min=int_min_tmp, int_max=int_max_tmp,
1389
- noise_threshold=noise_threshold_tmp,
1390
- wf_mz=wf_mz_tmp, wf_int=wf_int_tmp,
1391
- LET_threshold=LET_threshold_tmp,
1392
- entropy_dimension=entropy_dimension_tmp,
1393
- high_quality_reference_library=high_quality_reference_library_tmp,
1394
- )
1395
-
1396
- return (
1397
- acc, similarity_measure_tmp, json.dumps(weight), spectrum_preprocessing_order_tmp,
1398
- mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp, noise_threshold_tmp,
1399
- wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp, high_quality_reference_library_tmp
1400
- )
1401
-
1402
-
1403
-
1404
-
1405
- def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
1406
- local_grid = {**default_HRMS_grid, **(grid or {})}
1407
- for key, value in local_grid.items():
1408
- globals()[key] = value
1409
-
1410
- if query_data is None:
1411
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
1412
- sys.exit()
1413
- else:
1414
- extension = query_data.rsplit('.', 1)[-1]
1415
- if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
1416
- output_path_tmp = query_data.rsplit('.',1)[0] + '.txt'
1417
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
1418
- df_query = pd.read_csv(output_path_tmp, sep='\t')
1419
- elif extension in ('txt','TXT'):
1420
- df_query = pd.read_csv(query_data, sep='\t')
1421
- else:
1422
- print(f'\nError: Unsupported query_data extension: {extension}')
1423
- sys.exit()
1424
- unique_query_ids = df_query.iloc[:, 0].unique()
1425
-
1426
- if reference_data is None:
1427
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
1428
- sys.exit()
1429
- else:
1430
- if isinstance(reference_data, str):
1431
- df_reference = get_reference_df(reference_data=reference_data)
1432
- unique_reference_ids = df_reference.iloc[:, 0].unique()
1433
- else:
1434
- dfs = []
1435
- unique_reference_ids = []
1436
- for f in reference_data:
1437
- tmp = get_reference_df(reference_data=f)
1438
- dfs.append(tmp)
1439
- unique_reference_ids.extend(tmp.iloc[:, 0].unique())
1440
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
1441
-
1442
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
1443
- f'{len(unique_reference_ids)} unique reference spectra, and '
1444
- f'{len(set(unique_query_ids) & set(unique_reference_ids))} spectrum IDs common to both sets.\n')
1445
-
1446
- if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
1447
- df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
1448
- if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
1449
- df_reference = df_reference.loc[df_reference['adduct']==adduct]
1450
-
1451
- if output_path is None:
1452
- output_path = f'{Path.cwd()}/tuning_param_output.txt'
1453
- print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
1454
-
1455
- param_grid = product(
1456
- similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
1457
- noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
1458
- entropy_dimension, high_quality_reference_library
1459
- )
1460
-
1461
- results = []
1462
- total = (
1463
- len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
1464
- len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
1465
- len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
1466
- len(entropy_dimension) * len(high_quality_reference_library)
1467
- )
1468
- done = 0
1469
- for params in param_grid:
1470
- res = _eval_one_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params)
1471
- results.append(res)
1472
- done += 1
1473
- print(f'Completed {done}/{total} grid combinations.\n', flush=True)
1474
-
1475
- df_out = pd.DataFrame(results, columns=[
1476
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
1477
- 'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
1478
- 'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
1479
- ])
1480
-
1481
- if 'WEIGHT' in df_out.columns:
1482
- df_out['WEIGHT'] = (
1483
- df_out['WEIGHT'].astype(str)
1484
- .str.replace("\"","",regex=False)
1485
- .str.replace("{","",regex=False)
1486
- .str.replace("}","",regex=False)
1487
- .str.replace(":","",regex=False)
1488
- .str.replace("Cosine","",regex=False)
1489
- .str.replace("Shannon","",regex=False)
1490
- .str.replace("Renyi","",regex=False)
1491
- .str.replace("Tsallis","",regex=False)
1492
- .str.replace(" ","",regex=False)
1493
- )
1494
-
1495
- if return_output:
1496
- return df_out
1497
- else:
1498
- df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)  # csv.QUOTE_NONE requires 'import csv', which is not among this module's top-level imports
1499
- print(f'Wrote results to {output_path}')
1500
-
1501
-
1502
-
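The chained str.replace calls that flatten the serialized WEIGHT dicts down to bare comma-separated numbers could equally be written as one regex pass; a behavior-equivalent sketch:

    import pandas as pd

    df_demo = pd.DataFrame({'WEIGHT': ['{"Cosine": 0.25, "Shannon": 0.25, "Renyi": 0.25, "Tsallis": 0.25}']})
    df_demo['WEIGHT'] = df_demo['WEIGHT'].str.replace(r'["{}:\s]|Cosine|Shannon|Renyi|Tsallis', '', regex=True)
    print(df_demo['WEIGHT'].iloc[0])   # 0.25,0.25,0.25,0.25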
1503
- def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
1504
- grid = {**default_NRMS_grid, **(grid or {})}
1505
- for key, value in grid.items():
1506
- globals()[key] = value
1507
-
1508
- if query_data is None:
1509
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
1510
- sys.exit()
1511
- else:
1512
- extension = query_data.rsplit('.',1)[-1]
- if extension.lower() in ('mgf','mzml','cdf','msp','json'):
- output_path_tmp = query_data.rsplit('.',1)[0] + '.txt'
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
- df_query = pd.read_csv(output_path_tmp, sep='\t')
- elif extension.lower() == 'txt':
- df_query = pd.read_csv(query_data, sep='\t')
- else:
- print(f'\nError: Unsupported query_data extension: {extension}')
- sys.exit()
1520
- unique_query_ids = df_query.iloc[:,0].unique()
1521
-
1522
- if reference_data is None:
1523
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
1524
- sys.exit()
1525
- else:
1526
- if isinstance(reference_data,str):
1527
- df_reference = get_reference_df(reference_data=reference_data)
1528
- unique_reference_ids = df_reference.iloc[:,0].unique()
1529
- else:
1530
- dfs = []
1531
- unique_reference_ids = []
1532
- for f in reference_data:
1533
- tmp = get_reference_df(reference_data=f)
1534
- dfs.append(tmp)
1535
- unique_reference_ids.extend(tmp.iloc[:,0].unique())
1536
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
1537
-
1538
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} spectrum IDs common to both sets.\n')
1539
-
1540
- if output_path is None:
1541
- output_path = f'{Path.cwd()}/tuning_param_output.txt'
1542
- print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
1543
-
1544
- param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
1545
- noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
1546
- results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
1547
-
1548
- df_out = pd.DataFrame(results, columns=[
1549
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
1550
- 'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
1551
- ])
1552
- df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
1553
- df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
1554
- df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
1555
- df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(":","",regex=False)
1556
- df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Cosine","",regex=False)
1557
- df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Shannon","",regex=False)
1558
- df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
1559
- df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
1560
- df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
1561
- if return_output is False:
1562
- df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)  # requires 'import csv' (see note above)
1563
- else:
1564
- return df_out
1565
-
1566
-
1567
-
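Unlike its _shiny counterpart, tune_params_on_NRMS_data_grid fans the grid out over all CPU cores with joblib (note that the Parallel/delayed import does not appear among the imports at the top of this module). The pattern in isolation:

    from joblib import Parallel, delayed

    def evaluate(x):
        return x * x   # stand-in for one _eval_one_NRMS call

    results = Parallel(n_jobs=-1, verbose=10)(delayed(evaluate)(x) for x in range(8))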
1568
- def tune_params_on_NRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
1569
- local_grid = {**default_NRMS_grid, **(grid or {})}
1570
- for key, value in local_grid.items():
1571
- globals()[key] = value
1572
-
1573
- if query_data is None:
1574
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
1575
- sys.exit()
1576
- else:
1577
- extension = query_data.rsplit('.', 1)[-1]
1578
- if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
1579
- output_path_tmp = query_data.rsplit('.',1)[0] + '.txt'
1580
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
1581
- df_query = pd.read_csv(output_path_tmp, sep='\t')
1582
- elif extension in ('txt','TXT'):
1583
- df_query = pd.read_csv(query_data, sep='\t')
1584
- else:
1585
- print(f'\nError: Unsupported query_data extension: {extension}')
1586
- sys.exit()
1587
- unique_query_ids = df_query.iloc[:, 0].unique()
1588
-
1589
- if reference_data is None:
1590
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
1591
- sys.exit()
1592
- else:
1593
- if isinstance(reference_data, str):
1594
- df_reference = get_reference_df(reference_data=reference_data)
1595
- unique_reference_ids = df_reference.iloc[:, 0].unique()
1596
- else:
1597
- dfs = []
1598
- unique_reference_ids = []
1599
- for f in reference_data:
1600
- tmp = get_reference_df(reference_data=f)
1601
- dfs.append(tmp)
1602
- unique_reference_ids.extend(tmp.iloc[:, 0].unique())
1603
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
1604
-
1605
- print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
1606
- f'{len(unique_reference_ids)} unique reference spectra, and '
1607
- f'{len(set(unique_query_ids) & set(unique_reference_ids))} spectrum IDs common to both sets.\n')
1608
-
1609
- if output_path is None:
1610
- output_path = f'{Path.cwd()}/tuning_param_output.txt'
1611
- print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
1612
-
1613
- param_grid = product(
1614
- similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
1615
- noise_threshold, wf_mz, wf_int, LET_threshold,
1616
- entropy_dimension, high_quality_reference_library
1617
- )
1618
-
1619
- results = []
1620
- total = (
1621
- len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
1622
- len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
1623
- )
1624
- done = 0
1625
- for params in param_grid:
1626
- res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
1627
- results.append(res)
1628
- done += 1
1629
- print(f'Completed {done}/{total} grid combinations.\n', flush=True)
1630
-
1631
- df_out = pd.DataFrame(results, columns=[
1632
- 'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
1633
- 'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
1634
- ])
1635
-
1636
- if 'WEIGHT' in df_out.columns:
1637
- df_out['WEIGHT'] = (
1638
- df_out['WEIGHT'].astype(str)
1639
- .str.replace("\"","",regex=False)
1640
- .str.replace("{","",regex=False)
1641
- .str.replace("}","",regex=False)
1642
- .str.replace(":","",regex=False)
1643
- .str.replace("Cosine","",regex=False)
1644
- .str.replace("Shannon","",regex=False)
1645
- .str.replace("Renyi","",regex=False)
1646
- .str.replace("Tsallis","",regex=False)
1647
- .str.replace(" ","",regex=False)
1648
- )
1649
-
1650
- if return_output:
1651
- return df_out
1652
- else:
1653
- df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
1654
- print(f'Wrote results to {output_path}')
1655
-
1656
-
1657
-
1658
-
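itertools.product yields the Cartesian product lazily, which is why the tuners above count the total separately by multiplying the axis lengths. The same bookkeeping in miniature:

    from itertools import product
    from math import prod

    grid = {'noise_threshold': [0.0, 0.01], 'wf_int': [0.5, 1.0, 2.0]}
    total = prod(len(v) for v in grid.values())            # 6 combinations
    for done, params in enumerate(product(*grid.values()), start=1):
        print(f'Completed {done}/{total} grid combinations: {params}')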
1659
- def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
1660
- n_top_matches_to_save = 1
1661
- unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
1662
- unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
1663
- all_similarity_rows = []
1664
-
1665
- for query_idx, qid in enumerate(unique_query_ids):
1666
- if verbose:
1667
- print(f'query spectrum #{query_idx} is being identified')
1668
-
1669
- q_mask = (df_query['id'] == qid)
1670
- q_idxs = np.where(q_mask)[0]
1671
- if q_idxs.size == 0:
1672
- all_similarity_rows.append([0.0]*len(unique_reference_ids))
1673
- continue
1674
-
1675
- q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))
1676
-
1677
- if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
1678
- precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
1679
- df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
1680
- else:
1681
- df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()
1682
-
1683
- if df_reference_tmp.empty:
1684
- all_similarity_rows.append([0.0]*len(unique_reference_ids))
1685
- continue
1686
-
1687
- ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
1688
-
1689
- similarity_by_ref = {}
1690
-
1691
- for ref_id, r_df in ref_groups.items():
1692
- q_spec = q_spec_base.copy()
1693
- r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
1694
-
1695
- is_matched = False
1696
- for transformation in spectrum_preprocessing_order:
1697
- if np.isinf(q_spec[:, 1]).any():
1698
- q_spec[:, 1] = 0.0
1699
- if np.isinf(r_spec[:, 1]).any():
1700
- r_spec[:, 1] = 0.0
1701
-
1702
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
1703
- q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
1704
- r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
1705
-
1706
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
1707
- m_spec = match_peaks_in_spectra(
1708
- spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
1709
- )
1710
- if m_spec.size == 0:
1711
- q_spec = np.empty((0,2))
1712
- r_spec = np.empty((0,2))
1713
- else:
1714
- q_spec = m_spec[:, 0:2]
1715
- r_spec = m_spec[:, [0, 2]]
1716
- is_matched = True
1717
-
1718
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
1719
- q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
1720
- r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)
1721
-
1722
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
1723
- q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
1724
- r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')
1725
-
1726
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
1727
- q_spec = remove_noise(q_spec, nr=noise_threshold)
1728
- if not high_quality_reference_library:
1729
- r_spec = remove_noise(r_spec, nr=noise_threshold)
1730
-
1731
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
1732
- q_spec = filter_spec_lcms(
1733
- q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
1734
- )
1735
- if not high_quality_reference_library:
1736
- r_spec = filter_spec_lcms(
1737
- r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
1738
- )
1739
-
1740
- if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
1741
- q_ints = q_spec[:, 1]
1742
- r_ints = r_spec[:, 1]
1743
- if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
1744
- sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
1745
- else:
1746
- sim = 0.0
1747
- else:
1748
- sim = 0.0
1749
-
1750
- similarity_by_ref[str(ref_id)] = float(sim)
1751
-
1752
- row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
1753
- all_similarity_rows.append(row)
1754
-
1755
- df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
1756
- df_scores.index.name = 'QUERY.SPECTRUM.ID'
1757
-
1758
- top_idx = df_scores.values.argmax(axis=1)
1759
- top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
1760
- top_ids = [df_scores.columns[i] for i in top_idx]
1761
-
1762
- df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
1763
- if verbose:
1764
- print(df_tmp)
1765
-
1766
- acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
1767
- return acc
1768
-
1769
-
1770
-
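The accuracy step at the end of get_acc_HRMS is a row-wise argmax over the query-by-reference score matrix, followed by comparing each predicted ID with the query's own ID. In miniature:

    import numpy as np
    import pandas as pd

    df_scores = pd.DataFrame([[0.9, 0.2], [0.1, 0.7]], index=['caffeine', 'glucose'], columns=['caffeine', 'glucose'])
    top_idx = df_scores.values.argmax(axis=1)
    predicted = [df_scores.columns[i] for i in top_idx]
    acc = (np.array(predicted) == df_scores.index.to_numpy()).mean()   # 1.0: both queries matched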
1771
- def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
1772
-
1773
- n_top_matches_to_save = 1
1774
-
1775
- min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
1776
- max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
1777
- mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
1778
-
1779
- all_similarity_scores = []
1780
- for query_idx in range(0,len(unique_query_ids)):
1781
- q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
1782
- q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
1783
- q_spec_tmp = convert_spec(q_spec_tmp,mzs)
1784
-
1785
- similarity_scores = []
1786
- for ref_idx in range(0,len(unique_reference_ids)):
1787
- q_spec = q_spec_tmp
1788
- if verbose is True and ref_idx % 1000 == 0:
1789
- print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
1790
- r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
1791
- r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
1792
- r_spec = convert_spec(r_spec_tmp,mzs)
1793
-
1794
- for transformation in spectrum_preprocessing_order:
1795
- if np.isinf(q_spec[:,1]).sum() > 0:
1796
- q_spec[:,1] = np.zeros(q_spec.shape[0])
1797
- if np.isinf(r_spec[:,1]).sum() > 0:
1798
- r_spec[:,1] = np.zeros(r_spec.shape[0])
1799
- if transformation == 'W':
1800
- q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
1801
- r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
1802
- if transformation == 'L':
1803
- q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
1804
- r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
1805
- if transformation == 'N':
1806
- q_spec = remove_noise(q_spec, nr = noise_threshold)
1807
- if high_quality_reference_library == False:
1808
- r_spec = remove_noise(r_spec, nr = noise_threshold)
1809
- if transformation == 'F':
1810
- q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
1811
- if high_quality_reference_library == False:
1812
- r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
1813
-
1814
- q_ints = q_spec[:,1]
1815
- r_ints = r_spec[:,1]
1816
-
1817
- if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
1818
- similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
1819
- else:
1820
- similarity_score = 0
1821
-
1822
- similarity_scores.append(similarity_score)
1823
- all_similarity_scores.append(similarity_scores)
1824
-
1825
- df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
1826
- df_scores.index = unique_query_ids
1827
- df_scores.index.names = ['QUERY.SPECTRUM.ID']
1828
-
1829
- preds = []
1830
- scores = []
1831
- for i in range(0, df_scores.shape[0]):
1832
- df_scores_tmp = df_scores
1833
- preds_tmp = []
1834
- scores_tmp = []
1835
- for j in range(0, n_top_matches_to_save):
1836
- top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
1837
- cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
1838
- df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
1839
-
1840
- preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
1841
- if len(top_ref_specs_tmp.values) == 0:
1842
- scores_tmp.append(0)
1843
- else:
1844
- scores_tmp.append(top_ref_specs_tmp.values[0])
1845
- preds.append(preds_tmp)
1846
- scores.append(scores_tmp)
1847
-
1848
- preds = np.array(preds)
1849
- scores = np.array(scores)
1850
- out = np.c_[unique_query_ids,preds,scores]
1851
- df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
1852
- acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
1853
- return acc
1854
-
1855
-
1856
-
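convert_spec itself is defined elsewhere in app.py; from its use here it projects each (m/z, intensity) stick spectrum onto the shared integer m/z axis built from min_mz..max_mz, so query and reference intensity vectors line up index-for-index. A rough reimplementation, offered only as an assumed illustration:

    import numpy as np

    def convert_spec_sketch(spec, mzs):
        # spec: (n, 2) array of (m/z, intensity); mzs: shared integer m/z axis
        out = np.zeros((len(mzs), 2))
        out[:, 0] = mzs
        for mz, intensity in spec:
            j = int(round(mz)) - int(mzs[0])
            if 0 <= j < len(mzs):
                out[j, 1] += intensity   # bin the peak at the nearest unit m/z
        return out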
1857
- def run_spec_lib_matching_on_HRMS_data_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
1858
- if query_data is None:
1859
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
1860
- sys.exit()
1861
- else:
1862
- extension = query_data.rsplit('.',1)[-1]
- if extension.lower() in ('mgf','mzml','cdf','json'):
- output_path_tmp = query_data.rsplit('.',1)[0] + '.txt'
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
- df_query = pd.read_csv(output_path_tmp, sep='\t')
- elif extension.lower() == 'txt':
- df_query = pd.read_csv(query_data, sep='\t')
- else:
- print(f'\nError: Unsupported query_data extension: {extension}')
- sys.exit()
1871
- unique_query_ids = df_query['id'].unique()
1872
-
1873
- if reference_data is None:
1874
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
1875
- sys.exit()
1876
- else:
1877
- if isinstance(reference_data,str):
1878
- df_reference = get_reference_df(reference_data,likely_reference_ids)
1879
- else:
1880
- dfs = []
1881
- for f in reference_data:
1882
- tmp = get_reference_df(f,likely_reference_ids)
1883
- dfs.append(tmp)
1884
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
1885
-
1886
- if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A':
1887
- df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
1888
- if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A':
1889
- df_reference = df_reference.loc[df_reference['adduct']==adduct]
1890
-
1891
- if spectrum_preprocessing_order is not None:
1892
- spectrum_preprocessing_order = list(spectrum_preprocessing_order)
1893
- else:
1894
- spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
1895
- if 'M' not in spectrum_preprocessing_order:
1896
- print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
1897
- sys.exit()
1898
- if 'C' in spectrum_preprocessing_order:
1899
- if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
1900
- print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
1901
- sys.exit()
1902
- if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
1903
- print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
1904
- sys.exit()
1905
-
1906
-
1907
- if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
1908
- print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
1909
- sys.exit()
1910
-
1911
- if isinstance(int_min,int) is True:
1912
- int_min = float(int_min)
1913
- if isinstance(int_max,int) is True:
1914
- int_max = float(int_max)
1915
- if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
1916
- print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
1917
- sys.exit()
1918
- if mz_min < 0:
1919
- print('\nError: mz_min should be a non-negative integer')
1920
- sys.exit()
1921
- if mz_max <= 0:
1922
- print('\nError: mz_max should be a positive integer')
1923
- sys.exit()
1924
- if int_min < 0:
1925
- print('\nError: int_min should be a non-negative float')
1926
- sys.exit()
1927
- if int_max <= 0:
1928
- print('\nError: int_max should be a positive float')
1929
- sys.exit()
1930
-
1931
- if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
1932
- print('Error: window_size_centroiding must be a positive float.')
1933
- sys.exit()
1934
- if isinstance(window_size_matching, float) is False or window_size_matching <= 0.0:
1935
- print('Error: window_size_matching must be a positive float.')
1936
- sys.exit()
1937
-
1938
- if isinstance(noise_threshold,int) is True:
1939
- noise_threshold = float(noise_threshold)
1940
- if isinstance(noise_threshold,float) is False or noise_threshold < 0:
1941
- print('Error: noise_threshold must be a non-negative float.')
1942
- sys.exit()
1943
-
1944
- if isinstance(wf_intensity,int) is True:
1945
- wf_intensity = float(wf_intensity)
1946
- if isinstance(wf_mz,int) is True:
1947
- wf_mz = float(wf_mz)
1948
- if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
1949
- print('Error: wf_mz and wf_intensity must be integers or floats')
1950
- sys.exit()
1951
-
1952
- if entropy_dimension <= 0:
1953
- print('\nError: entropy_dimension should be a positive float')
1954
- sys.exit()
1955
- else:
1956
- q = entropy_dimension
1957
-
1958
- normalization_method = 'standard'
1959
-
1960
- if isinstance(n_top_matches_to_save,int) is False or n_top_matches_to_save <= 0:
1961
- print('\nError: n_top_matches_to_save should be a positive integer')
1962
- sys.exit()
1963
-
1964
- if isinstance(print_id_results,bool)==False:
1965
- print('\nError: print_id_results must be either True or False')
1966
- sys.exit()
1967
-
1968
- if output_identification is None:
1969
- output_identification = f'{Path.cwd()}/output_identification.txt'
1970
- print(f'Warning: writing identification output to {output_identification}')
1971
-
1972
- if output_similarity_scores is None:
1973
- output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
1974
- print(f'Warning: writing similarity scores to {output_similarity_scores}')
1975
-
1976
-
1977
- unique_reference_ids = df_reference['id'].unique().tolist()
1978
- all_similarity_scores = []
1979
-
1980
- for query_idx in range(len(unique_query_ids)):
1981
- if verbose:
1982
- print(f'query spectrum #{query_idx} is being identified')
1983
-
1984
- q_mask = (df_query['id'] == unique_query_ids[query_idx])
1985
- q_idxs_tmp = np.where(q_mask)[0]
1986
- q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
1987
-
1988
- if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
1989
- precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
1990
- df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
1991
- else:
1992
- df_reference_tmp = df_reference.copy()
1993
-
1994
- ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
1995
- unique_reference_ids_tmp = list(ref_groups.keys())
1996
-
1997
- similarity_by_ref = {}
1998
- for ref_id in unique_reference_ids_tmp:
1999
- q_spec = q_spec_tmp.copy()
2000
- r_df = ref_groups[ref_id]
2001
- r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
2002
-
2003
- is_matched = False
2004
-
2005
- for transformation in spectrum_preprocessing_order:
2006
- if np.isinf(q_spec[:, 1]).sum() > 0:
2007
- q_spec[:, 1] = np.zeros(q_spec.shape[0])
2008
- if np.isinf(r_spec[:, 1]).sum() > 0:
2009
- r_spec[:, 1] = np.zeros(r_spec.shape[0])
2010
-
2011
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2012
- q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
2013
- r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
2014
-
2015
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
- m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
- if m_spec.size == 0:
- q_spec = np.empty((0,2))
- r_spec = np.empty((0,2))
- else:
- q_spec = m_spec[:, 0:2]
- r_spec = m_spec[:, [0, 2]]
- is_matched = True
2020
-
2021
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2022
- q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
2023
- r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)
2024
-
2025
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2026
- q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
2027
- r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)
2028
-
2029
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2030
- q_spec = remove_noise(q_spec, nr=noise_threshold)
2031
- if not high_quality_reference_library:
2032
- r_spec = remove_noise(r_spec, nr=noise_threshold)
2033
-
2034
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2035
- q_spec = filter_spec_lcms(
2036
- q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
2037
- )
2038
- if not high_quality_reference_library:
2039
- r_spec = filter_spec_lcms(
2040
- r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
2041
- )
2042
-
2043
- q_ints = q_spec[:, 1]
2044
- r_ints = r_spec[:, 1]
2045
-
2046
- if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2047
- sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
2048
- else:
2049
- sim = 0.0
2050
-
2051
- similarity_by_ref[ref_id] = sim
2052
-
2053
- row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
2054
- all_similarity_scores.append(row_scores)
2055
-
2056
- df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
2058
- df_scores.index.names = ['QUERY.SPECTRUM.ID']
2059
-
2060
-
2061
- preds = []
2062
- scores = []
2063
- for i in range(0, df_scores.shape[0]):
2064
- df_scores_tmp = df_scores
2065
- preds_tmp = []
2066
- scores_tmp = []
2067
- for j in range(0, n_top_matches_to_save):
2068
- top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
2069
- cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
2070
- df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
2071
-
2072
- preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
2073
- if len(top_ref_specs_tmp.values) == 0:
2074
- scores_tmp.append(0)
2075
- else:
2076
- scores_tmp.append(top_ref_specs_tmp.values[0])
2077
- preds.append(preds_tmp)
2078
- scores.append(scores_tmp)
2079
-
2080
- preds = np.array(preds)
2081
- scores = np.array(scores)
2082
- out = np.c_[preds,scores]
2083
-
2084
- cnames_preds = []
2085
- cnames_scores = []
2086
- for i in range(0,n_top_matches_to_save):
2087
- cnames_preds.append(f'RANK.{i+1}.PRED')
2088
- cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
2089
-
2090
- df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
2091
- df_top_ref_specs.index = unique_query_ids
2092
- df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
2093
-
2094
- df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
2095
-
2096
- if print_id_results == True:
2097
- print(df_top_ref_specs.to_string())
2098
-
2099
- if return_ID_output is False:
2100
- df_top_ref_specs.to_csv(output_identification, sep='\t')
2101
- df_scores.to_csv(output_similarity_scores, sep='\t')
2102
- else:
2103
- return df_top_ref_specs
2104
-
2105
-
2106
-
2107
-
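A hedged end-to-end call of the HRMS matcher above (paths hypothetical); with return_ID_output=True nothing is written to disk and the top-match table comes back as a DataFrame:

    df_ids = run_spec_lib_matching_on_HRMS_data_shiny(
        query_data='query_spectra.mgf',          # hypothetical input
        reference_data='reference_library.txt',  # hypothetical library
        precursor_ion_mz_tolerance=0.01,
        similarity_measure='cosine',
        n_top_matches_to_save=3,
        return_ID_output=True,
    )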
2108
- def run_spec_lib_matching_on_NRMS_data_shiny(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
2109
- if query_data is None:
2110
- print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
2111
- sys.exit()
2112
- else:
2113
- extension = query_data.rsplit('.',1)[-1]
- if extension.lower() in ('mgf','mzml','cdf','msp','json'):
- output_path_tmp = query_data.rsplit('.',1)[0] + '.txt'
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
- df_query = pd.read_csv(output_path_tmp, sep='\t')
- elif extension.lower() == 'txt':
- df_query = pd.read_csv(query_data, sep='\t')
- else:
- print(f'\nError: Unsupported query_data extension: {extension}')
- sys.exit()
2121
- unique_query_ids = df_query.iloc[:,0].unique()
2122
-
2123
- if reference_data is None:
2124
- print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
2125
- sys.exit()
2126
- else:
2127
- if isinstance(reference_data,str):
2128
- df_reference = get_reference_df(reference_data,likely_reference_ids)
2129
- unique_reference_ids = df_reference.iloc[:,0].unique()
2130
- else:
2131
- dfs = []
2132
- unique_reference_ids = []
2133
- for f in reference_data:
2134
- tmp = get_reference_df(f,likely_reference_ids)
2135
- dfs.append(tmp)
2136
- unique_reference_ids.extend(tmp.iloc[:,0].unique())
2137
- df_reference = pd.concat(dfs, axis=0, ignore_index=True)
2138
-
2139
-
2140
- if spectrum_preprocessing_order is not None:
2141
- spectrum_preprocessing_order = list(spectrum_preprocessing_order)
2142
- else:
2143
- spectrum_preprocessing_order = ['F','N','W','L']
2144
- if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
2145
- print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
2146
- sys.exit()
2147
-
2148
- if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
2149
- print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
2150
- sys.exit()
2151
-
2152
- if isinstance(int_min,int) is True:
2153
- int_min = float(int_min)
2154
- if isinstance(int_max,int) is True:
2155
- int_max = float(int_max)
2156
- if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
2157
- print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
2158
- sys.exit()
2159
- if mz_min < 0:
2160
- print('\nError: mz_min should be a non-negative integer')
2161
- sys.exit()
2162
- if mz_max <= 0:
2163
- print('\nError: mz_max should be a positive integer')
2164
- sys.exit()
2165
- if int_min < 0:
2166
- print('\nError: int_min should be a non-negative float')
2167
- sys.exit()
2168
- if int_max <= 0:
2169
- print('\nError: int_max should be a positive float')
2170
- sys.exit()
2171
-
2172
- if isinstance(noise_threshold,int) is True:
2173
- noise_threshold = float(noise_threshold)
2174
- if isinstance(noise_threshold,float) is False or noise_threshold < 0:
2175
- print('Error: noise_threshold must be a non-negative float.')
2176
- sys.exit()
2177
-
2178
- if isinstance(wf_intensity,int) is True:
2179
- wf_intensity = float(wf_intensity)
2180
- if isinstance(wf_mz,int) is True:
2181
- wf_mz = float(wf_mz)
2182
- if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
2183
- print('Error: wf_mz and wf_intensity must be integers or floats')
2184
- sys.exit()
2185
-
2186
- if entropy_dimension <= 0:
2187
- print('\nError: entropy_dimension should be a positive float')
2188
- sys.exit()
2189
- else:
2190
- q = entropy_dimension
2191
-
2192
- normalization_method = 'standard'
2193
-
2194
- if isinstance(n_top_matches_to_save,int) is False or n_top_matches_to_save <= 0:
2195
- print('\nError: n_top_matches_to_save should be a positive integer')
2196
- sys.exit()
2197
-
2198
- if isinstance(print_id_results,bool)==False:
2199
- print('\nError: print_id_results must be either True or False')
2200
- sys.exit()
2201
-
2202
- if output_identification is None:
2203
- output_identification = f'{Path.cwd()}/output_identification.txt'
2204
- print(f'Warning: writing identification output to {output_identification}')
2205
-
2206
- if output_similarity_scores is None:
2207
- output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
2208
- print(f'Warning: writing similarity scores to {output_similarity_scores}')
2209
-
2210
-
2211
-
2212
- min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
2213
- max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
2214
- mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
2215
-
2216
- all_similarity_scores = []
2217
- for query_idx in range(0,len(unique_query_ids)):
2218
- q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
2219
- q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
2220
- q_spec_tmp = convert_spec(q_spec_tmp,mzs)
2221
-
2222
- similarity_scores = []
2223
- for ref_idx in range(0,len(unique_reference_ids)):
2224
- if verbose is True and ref_idx % 1000 == 0:
2225
- print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
2226
- q_spec = q_spec_tmp
2227
- r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
2228
- r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
2229
- r_spec = convert_spec(r_spec_tmp,mzs)
2230
-
2231
- for transformation in spectrum_preprocessing_order:
2232
- if np.isinf(q_spec[:,1]).sum() > 0:
2233
- q_spec[:,1] = np.zeros(q_spec.shape[0])
2234
- if np.isinf(r_spec[:,1]).sum() > 0:
2235
- r_spec[:,1] = np.zeros(r_spec.shape[0])
2236
- if transformation == 'W':
2237
- q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
2238
- r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
2239
- if transformation == 'L':
2240
- q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
2241
- r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
2242
- if transformation == 'N':
2243
- q_spec = remove_noise(q_spec, nr = noise_threshold)
2244
- if high_quality_reference_library == False:
2245
- r_spec = remove_noise(r_spec, nr = noise_threshold)
2246
- if transformation == 'F':
2247
- q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
2248
- if high_quality_reference_library == False:
2249
- r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
2250
-
2251
- q_ints = q_spec[:,1]
2252
- r_ints = r_spec[:,1]
2253
-
2254
- if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
2255
- similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
2256
- else:
2257
- similarity_score = 0
2258
-
2259
- similarity_scores.append(similarity_score)
2260
- all_similarity_scores.append(similarity_scores)
2261
-
2262
- df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
2263
- df_scores.index = unique_query_ids
2264
- df_scores.index.names = ['QUERY.SPECTRUM.ID']
2265
-
2266
- preds = []
2267
- scores = []
2268
- for i in range(0, df_scores.shape[0]):
2269
- df_scores_tmp = df_scores
2270
- preds_tmp = []
2271
- scores_tmp = []
2272
- for j in range(0, n_top_matches_to_save):
2273
- top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
2274
- cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
2275
- df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
2276
-
2277
- preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
2278
- if len(top_ref_specs_tmp.values) == 0:
2279
- scores_tmp.append(0)
2280
- else:
2281
- scores_tmp.append(top_ref_specs_tmp.values[0])
2282
- preds.append(preds_tmp)
2283
- scores.append(scores_tmp)
2284
-
2285
- preds = np.array(preds)
2286
- scores = np.array(scores)
2287
- out = np.c_[preds,scores]
2288
-
2289
- cnames_preds = []
2290
- cnames_scores = []
2291
- for i in range(0,n_top_matches_to_save):
2292
- cnames_preds.append(f'RANK.{i+1}.PRED')
2293
- cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
2294
-
2295
- df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
2296
- df_top_ref_specs.index = unique_query_ids
2297
- df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
2298
-
2299
- if print_id_results == True:
2300
- print(df_top_ref_specs.to_string())
2301
-
2302
- df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
2303
-
2304
- if return_ID_output is False:
2305
- df_top_ref_specs.to_csv(output_identification, sep='\t')
2307
- df_scores.to_csv(output_similarity_scores, sep='\t')
2308
- else:
2309
- return df_top_ref_specs
2310
-
2311
-
2312
- class _UIWriter:
2313
- def __init__(self, loop, q: asyncio.Queue[str]):
2314
- self._loop = loop
2315
- self._q = q
2316
- def write(self, s: str):
2317
- if s:
2318
- self._loop.call_soon_threadsafe(self._q.put_nowait, s)
2319
- return len(s)
2320
- def flush(self):
2321
- pass
2322
-
2323
-
2324
- def attach_logging_to_writer(writer):
2325
- handler = logging.StreamHandler(writer)
2326
- handler.setLevel(logging.INFO)
2327
- root = logging.getLogger()
2328
- root.addHandler(handler)
2329
- root.setLevel(logging.INFO)
2330
- return handler, root
2331
-
2332
-
2333
-
2334
- def _run_with_redirects(fn, writer, *args, **kwargs):
2335
- with redirect_stdout(writer), redirect_stderr(writer):
2336
- return fn(*args, **kwargs)
2337
-
2338
-
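_UIWriter and _run_with_redirects are how worker output reaches the browser: stdout/stderr inside the worker are redirected to a writer whose write() posts each chunk onto _LOG_QUEUE via call_soon_threadsafe, where the log consumer picks it up. A minimal wiring sketch, assuming it runs inside the app's event loop:

    import asyncio

    async def demo():
        loop = asyncio.get_running_loop()
        writer = _UIWriter(loop, _LOG_QUEUE)
        # any print() inside the worker function now lands on _LOG_QUEUE
        await asyncio.to_thread(_run_with_redirects, print, writer, 'hello from a worker thread')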
2339
- def strip_text(s):
2340
- return [x.strip() for x in s.strip('[]').split(',') if x.strip()]
2341
-
2342
-
2343
- def strip_numeric(s):
2344
- return [float(x.strip()) for x in s.strip('[]').split(',') if x.strip()]
2345
-
2346
-
2347
- def strip_weights(s):
2348
- obj = ast.literal_eval(s) if isinstance(s, (str, bytes)) else s
2349
- keys = ['Cosine', 'Shannon', 'Renyi', 'Tsallis']
2350
-
2351
- if isinstance(obj, (list, tuple)):
2352
- if len(obj) == 4 and all(isinstance(x, Real) for x in obj):
2353
- tuples = [obj]
2354
- else:
2355
- tuples = list(obj)
2356
- else:
2357
- raise ValueError(f"Expected a 4-tuple or a sequence of 4-tuples, got {type(obj).__name__}")
2358
-
2359
- out = []
2360
- for t in tuples:
2361
- if not (isinstance(t, (list, tuple)) and len(t) == 4):
2362
- raise ValueError(f"Each item must be a 4-tuple, got: {t!r}")
2363
- out.append(dict(zip(keys, t)))
2364
- return out
2365
-
2366
-
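strip_weights accepts either one 4-tuple or a sequence of them, as a literal string or an actual sequence, and maps each onto the Cosine/Shannon/Renyi/Tsallis keys:

    strip_weights('[(0.25, 0.25, 0.25, 0.25), (1.0, 0.0, 0.0, 0.0)]')
    # [{'Cosine': 0.25, 'Shannon': 0.25, 'Renyi': 0.25, 'Tsallis': 0.25},
    #  {'Cosine': 1.0, 'Shannon': 0.0, 'Renyi': 0.0, 'Tsallis': 0.0}]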
2367
- def build_library(input_path=None, output_path=None):
2368
- last_three_chars = input_path[(len(input_path)-3):len(input_path)]
2369
- last_four_chars = input_path[(len(input_path)-4):len(input_path)]
2370
- if last_three_chars == 'txt' or last_three_chars == 'TXT':
2371
- return pd.read_csv(input_path, sep='\t')
2372
- else:
2373
- if last_three_chars == 'mgf' or last_three_chars == 'MGF':
2374
- input_file_type = 'mgf'
2375
- elif last_four_chars == 'mzML' or last_four_chars == 'mzml' or last_four_chars == 'MZML':
2376
- input_file_type = 'mzML'
2377
- elif last_four_chars == 'json' or last_four_chars == 'JSON':
2378
- input_file_type = 'json'
2379
- elif last_three_chars == 'cdf' or last_three_chars == 'CDF':
2380
- input_file_type = 'cdf'
2381
- elif last_three_chars == 'msp' or last_three_chars == 'MSP':
2382
- input_file_type = 'msp'
2383
- else:
2384
- print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'msp\', \'json\', or \'txt\' file must be passed to --input_path')
2385
- sys.exit()
2386
-
2387
- spectra = []
2388
- if input_file_type == 'mgf':
2389
- with mgf.read(input_path, index_by_scans = True) as reader:
2390
- for spec in reader:
2391
- spectra.append(spec)
2392
- if input_file_type == 'mzML':
2393
- with mzml.read(input_path) as reader:
2394
- for spec in reader:
2395
- spectra.append(spec)
2396
-
2397
- if input_file_type == 'mgf' or input_file_type == 'mzML':
2398
- ids = []
2399
- mzs = []
2400
- ints = []
2401
- for i in range(0,len(spectra)):
2402
- for j in range(0,len(spectra[i]['m/z array'])):
2403
- if input_file_type == 'mzML':
2404
- ids.append(f'ID_{i+1}')
2405
- else:
2406
- ids.append(spectra[i]['params']['name'])
2407
- mzs.append(spectra[i]['m/z array'][j])
2408
- ints.append(spectra[i]['intensity array'][j])
2409
-
2410
- if input_file_type == 'cdf':
2411
- dataset = nc.Dataset(input_path, 'r')
2412
- all_mzs = dataset.variables['mass_values'][:]
2413
- all_ints = dataset.variables['intensity_values'][:]
2414
- scan_idxs = dataset.variables['scan_index'][:]
2415
- dataset.close()
2416
-
2417
- ids = []
2418
- mzs = []
2419
- ints = []
2420
- for i in range(0,(len(scan_idxs)-1)):
2421
- if i % 1000 == 0:
2422
- print(f'analyzed {i} out of {len(scan_idxs)} scans')
2423
- s_idx = scan_idxs[i]
2424
- e_idx = scan_idxs[i+1]
2425
-
2426
- mzs_tmp = all_mzs[s_idx:e_idx]
2427
- ints_tmp = all_ints[s_idx:e_idx]
2428
-
2429
- for j in range(0,len(mzs_tmp)):
2430
- ids.append(f'ID_{i+1}')
2431
- mzs.append(mzs_tmp[j])
2432
- ints.append(ints_tmp[j])
2433
-
2434
- if input_file_type == 'msp':
2435
- ids = []
2436
- mzs = []
2437
- ints = []
2438
- with open(input_path, 'r') as f:
2439
- i = 0
2440
- spectrum_id = None  # guard: some msp files may contain peak lines before any 'Name:' header
- for line in f:
2441
- line = line.strip()
2442
- if line.startswith('Name:'):
2443
- i += 1
2444
- spectrum_id = line.replace('Name: ','')
2445
- elif spectrum_id is not None and line and line[0].isdigit():
2446
- try:
2447
- mz, intensity = map(float, line.split()[:2])
2448
- ids.append(spectrum_id)
2449
- mzs.append(mz)
2450
- ints.append(intensity)
2451
- except ValueError:
2452
- continue
2453
-
2454
- if input_file_type == 'json':
2455
- data = json.load(open(input_path))
2456
- ids = []
2457
- mzs = []
2458
- ints = []
2459
- for i in range(0,len(data)):
2460
- spec_ID_tmp = data[i]['spectrum_id']
2461
- tmp = data[i]['peaks_json']
2462
- tmp = tmp[1:-1].split(",")
2463
- tmp = [a.replace("[","") for a in tmp]
2464
- tmp = [a.replace("]","") for a in tmp]
2465
- mzs_tmp = [float(a) for a in tmp[0::2]]  # convert from the JSON string payload to numeric values
2466
- ints_tmp = [float(a) for a in tmp[1::2]]
2467
- ids.extend([spec_ID_tmp] * len(mzs_tmp))
2468
- mzs.extend(mzs_tmp)
2469
- ints.extend(ints_tmp)
2470
-
2471
- df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
2472
- return df
2473
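A minimal usage sketch, assuming a hypothetical input file 'spectra.msp': build_library returns a long-format DataFrame with one row per peak.

    lib = build_library('spectra.msp')
    lib.columns.tolist()    # ['id', 'mz_ratio', 'intensity']
    lib['id'].nunique()     # number of spectra parsed from the file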
-
2474
-
2475
-
2476
- def extract_first_column_ids(file_path: str, max_ids: int = 20000):
2477
- suffix = Path(file_path).suffix.lower()
2478
-
2479
- if suffix == ".txt":
2480
- df = pd.read_csv(file_path, sep='\t')
2481
- if 'id' in df.columns.tolist():
2482
- ids = df['id'].astype(str).dropna()
2483
- else:
2484
- ids = df.iloc[:, 0].astype(str).dropna()
2485
- ids = [x for x in ids if x.strip() != ""]
2486
- uniq = list(dict.fromkeys(ids))  # de-duplicate while preserving order
2487
- return uniq[:max_ids]
2493
-
2494
- ids = []
2495
- try:
2496
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
2497
- for line in f:
2498
- ls = line.strip()
2499
- if ls.startswith("TITLE="):
2500
- ids.append(ls.split("=", 1)[1].strip())
2501
- elif ls.lower().startswith("name:"):
2502
- ids.append(ls.split(":", 1)[1].strip())
2503
- if len(ids) >= max_ids:
2504
- break
2505
- except Exception:
2506
- pass
2507
-
2508
- if ids:
2509
- return list(dict.fromkeys(ids))  # de-duplicate while preserving order
2516
- return []
2517
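For reference, a hedged sketch of what extract_first_column_ids returns per format (file names hypothetical): the 'id' column (or first column) of a tab-separated txt library, TITLE= lines from an mgf, or Name: lines from an msp, de-duplicated in order and capped at max_ids.

    ids = extract_first_column_ids('query.msp', max_ids=100)
    # e.g. ['Caffeine', 'Glucose', ...] taken from the 'Name:' headers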
-
2518
-
2519
- def _open_plot_window(session, svg_bytes: bytes, title: str = "plot.svg"):
2520
- """Send SVG bytes to browser and open in a new window as a data URL."""
2521
- import base64  # local import: base64 is not among the module-level imports
- b64 = base64.b64encode(svg_bytes).decode("ascii")
2522
- data_url = f"data:image/svg+xml;base64,{b64}"  # SVG data URLs require the image/svg+xml MIME type
2523
- session.send_custom_message("open-plot-window", {"svg": data_url, "title": title})
2524
-
2525
-
2526
- def plot_spectra_ui(platform: str):
2527
- base_inputs = [
2528
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2529
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
2530
- ui.input_selectize(
2531
- "spectrum_ID1",
2532
- "Select spectrum ID 1 (default is the first spectrum in the library):",
2533
- choices=[],
2534
- multiple=False,
2535
- options={"placeholder": "Upload a library..."},
2536
- ),
2537
- ui.input_selectize(
2538
- "spectrum_ID2",
2539
- "Select spectrum ID 2 (default is the first spectrum in the library):",
2540
- choices=[],
2541
- multiple=False,
2542
- options={"placeholder": "Upload a library..."},
2543
- ),
2544
- ui.input_select('print_url_spectrum1', 'Print PubChem URL for spectrum 1:', ['No', 'Yes']),
2545
- ui.input_select('print_url_spectrum2', 'Print PubChem URL for spectrum 2:', ['No', 'Yes']),
2546
- ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
2547
- ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
2548
- ui.input_select(
2549
- "high_quality_reference_library",
2550
- "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.",
2551
- [False, True],
2552
- ),
2553
- ]
2554
-
2555
- if platform == "HRMS":
2556
- extra_inputs = [
2557
- ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL",),
2558
- ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
2559
- ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
2560
- ]
2561
- else:
2562
- extra_inputs = [
2563
- ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW",)
2564
- ]
2565
-
2566
- numeric_inputs = [
2567
- ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
2568
- ui.input_numeric("mz_max", "Maximum m/z for filtering:", 99999999),
2569
- ui.input_numeric("int_min", "Minimum intensity for filtering:", 0),
2570
- ui.input_numeric("int_max", "Maximum intensity for filtering:", 999999999),
2571
- ui.input_numeric("noise_threshold", "Noise removal threshold:", 0.0),
2572
- ui.input_numeric("wf_mz", "Mass/charge weight factor:", 0.0),
2573
- ui.input_numeric("wf_int", "Intensity weight factor:", 1.0),
2574
- ui.input_numeric("LET_threshold", "Low-entropy threshold:", 0.0),
2575
- ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
2576
- ]
2577
-
2578
- select_input = ui.input_select("y_axis_transformation", "Transformation to apply to intensity axis:", ["normalized", "none", "log10", "sqrt"])
2579
-
2580
- run_button_plot_spectra = ui.download_button("run_btn_plot_spectra", "Run", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
2581
- back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
2582
-
2583
- if platform == "HRMS":
2584
- inputs_columns = ui.layout_columns(
2585
- ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
2586
- ui.div([base_inputs[6:9], extra_inputs[0]], style="display:flex; flex-direction:column; gap:10px;"),
2587
- ui.div(extra_inputs[1:3], numeric_inputs[0:3], style="display:flex; flex-direction:column; gap:10px;"),
2588
- ui.div([numeric_inputs[3:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
2589
- col_widths=(3,3,3,3),
2590
- )
2591
- elif platform == "NRMS":
2592
- inputs_columns = ui.layout_columns(
2593
- ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
2594
- ui.div([base_inputs[6:9], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
2595
- ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
2596
- ui.div([numeric_inputs[5:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
2597
- col_widths=(3,3,3,3),
2598
- )
2599
-
2600
- return ui.div(
2601
- ui.TagList(
2602
- ui.h2("Plot Spectra"),
2603
- inputs_columns,
2604
- run_button_plot_spectra,
2605
- back_button,
2606
- ui.div(ui.output_text("plot_query_status"), style="margin-top:8px; font-size:14px"),
2607
- ui.div(ui.output_text("plot_reference_status"), style="margin-top:8px; font-size:14px")
2608
- ),
2609
- )
2610
-
2611
-
2612
-
2613
- def run_spec_lib_matching_ui(platform: str):
2614
- base_inputs = [
2615
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2616
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
2617
- ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
2618
- ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
2619
- ui.input_file('compound_ID_output_file', 'Upload output from spectral library matching to plot top matches (optional)'),
2620
- ui.input_selectize("q_spec", "Select query spectrum (only applicable for plotting; default is the first spectrum in the compound ID output):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
2621
- ui.input_selectize("r_spec", "Select reference spectrum (only applicable for plotting; default is the rank 1 reference spectrum):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
2622
- ui.input_select('print_url_spectrum1', 'Print PubChem URL for query spectrum (only applicable for plotting):', ['No', 'Yes']),
2623
- ui.input_select('print_url_spectrum2', 'Print PubChem URL for reference spectrum (only applicable for plotting):', ['No', 'Yes']),
2624
- ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])
2625
- ]
2626
-
2627
- if platform == "HRMS":
2628
- extra_inputs = [
2629
- ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
2630
- ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
2631
- ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
2632
- ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.","FCNMWL"),
2633
- ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
2634
- ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
2635
- ]
2636
- else:
2637
- extra_inputs = [ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).","FNLW")]
2638
-
2639
- numeric_inputs = [
2640
- ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
2641
- ui.input_numeric("mz_max", "Maximum m/z for filtering:", 99999999),
2642
- ui.input_numeric("int_min", "Minimum intensity for filtering:", 0),
2643
- ui.input_numeric("int_max", "Maximum intensity for filtering:", 999999999),
2644
- ui.input_numeric("noise_threshold", "Noise removal threshold:", 0.0),
2645
- ui.input_numeric("wf_mz", "Mass/charge weight factor:", 0.0),
2646
- ui.input_numeric("wf_int", "Intensity weight factor:", 1.0),
2647
- ui.input_numeric("LET_threshold", "Low-entropy threshold:", 0.0),
2648
- ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
2649
- ui.input_numeric("n_top_matches_to_save", "Number of top matches to save:", 3),
2650
- ]
2651
-
2652
-
2653
- run_button_spec_lib_matching = ui.download_button("run_btn_spec_lib_matching", "Run Spectral Library Matching", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
2654
- run_button_plot_spectra_within_spec_lib_matching = ui.download_button("run_btn_plot_spectra_within_spec_lib_matching", "Plot Spectra", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
2655
- back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
2656
-
2657
- if platform == "HRMS":
2658
- inputs_columns = ui.layout_columns(
2659
- ui.div([base_inputs[0:2], extra_inputs[0:3], base_inputs[2:4]], style="display:flex; flex-direction:column; gap:10px;"),
2660
- ui.div([base_inputs[4:10]], style="display:flex; flex-direction:column; gap:10px;"),
2661
- ui.div([extra_inputs[3:6], numeric_inputs[0:3]], style="display:flex; flex-direction:column; gap:10px;"),
2662
- ui.div(numeric_inputs[3:10], style="display:flex; flex-direction:column; gap:10px;"),
2663
- col_widths=(3,3,3,3)
2664
- )
2665
- elif platform == "NRMS":
2666
- inputs_columns = ui.layout_columns(
2667
- ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
2668
- ui.div([base_inputs[6:10], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
2669
- ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
2670
- ui.div(numeric_inputs[5:10], style="display:flex; flex-direction:column; gap:10px;"),
2671
- col_widths=(3,3,3,3)
2672
- )
2673
-
2674
- log_panel = ui.card(
2675
- ui.card_header("Identification log"),
2676
- ui.output_text_verbatim("match_log"),
2677
- style="max-height:300px; overflow:auto"
2678
- )
2679
-
2680
- return ui.div(
2681
- ui.TagList(
2682
- ui.h2("Run Spectral Library Matching"),
2683
- inputs_columns,
2684
- run_button_spec_lib_matching,
2685
- run_button_plot_spectra_within_spec_lib_matching,
2686
- back_button,
2687
- log_panel
2688
- ),
2689
- )
2690
-
2691
-
2692
-
2693
- def run_parameter_tuning_grid_ui(platform: str):
2694
- base_inputs = [
2695
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2696
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
2697
- ui.input_selectize("similarity_measure", "Select similarity measure(s):", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"], multiple=True, selected='cosine'),
2698
- ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '((0.25, 0.25, 0.25, 0.25))'),
2699
- ui.input_text("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", '[True]')
2700
- ]
2701
-
2702
- if platform == "HRMS":
2703
- extra_inputs = [
2704
- ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
2705
- ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
2706
- ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
2707
- ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "[FCNMWL,CWM]"),
2708
- ui.input_text("window_size_centroiding", "Centroiding window-size:", "[0.5]"),
2709
- ui.input_text("window_size_matching", "Matching window-size:", "[0.1,0.5]"),
2710
- ]
2711
- else:
2712
- extra_inputs = [
2713
- ui.input_text(
2714
- "spectrum_preprocessing_order",
2715
- "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
2716
- "[FNLW,WNL]",
2717
- )
2718
- ]
2719
-
2720
- numeric_inputs = [
2721
- ui.input_text("mz_min", "Minimum m/z for filtering:", '[0]'),
2722
- ui.input_text("mz_max", "Maximum m/z for filtering:", '[99999999]'),
2723
- ui.input_text("int_min", "Minimum intensity for filtering:", '[0]'),
2724
- ui.input_text("int_max", "Maximum intensity for filtering:", '[999999999]'),
2725
- ui.input_text("noise_threshold", "Noise removal threshold:", '[0.0]'),
2726
- ui.input_text("wf_mz", "Mass/charge weight factor:", '[0.0]'),
2727
- ui.input_text("wf_int", "Intensity weight factor:", '[1.0]'),
2728
- ui.input_text("LET_threshold", "Low-entropy threshold:", '[0.0]'),
2729
- ui.input_text("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", '[1.1]')
2730
- ]
2731
-
2732
-
2733
- run_button_parameter_tuning_grid = ui.download_button("run_btn_parameter_tuning_grid", "Tune parameters (grid search)", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
2734
- back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
2735
-
2736
- # both platforms share this four-column layout (note: base_inputs holds five items, so base_inputs[6:7] is an empty slice)
- inputs_columns = ui.layout_columns(
- ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
- ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
- ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
- ui.div(numeric_inputs[5:9], style="display:flex; flex-direction:column; gap:10px;"),
- col_widths=(3, 3, 3, 3),
- )
2752
-
2753
- log_panel = ui.card(
2754
- ui.card_header("Identification log"),
2755
- ui.output_text_verbatim("match_log"),
2756
- style="max-height:300px; overflow:auto"
2757
- )
2758
-
2759
- return ui.div(
2760
- ui.TagList(
2761
- ui.h2("Tune parameters (grid search)"),
2762
- inputs_columns,
2763
- run_button_parameter_tuning_grid,
2764
- back_button,
2765
- log_panel
2766
- ),
2767
- )
2768
-
2769
-
2770
-
2771
- PARAMS_HRMS = {
2772
- "window_size_centroiding": (0.0, 0.5),
2773
- "window_size_matching": (0.0, 0.5),
2774
- "noise_threshold": (0.0, 0.25),
2775
- "wf_mz": (0.0, 5.0),
2776
- "wf_int": (0.0, 5.0),
2777
- "LET_threshold": (0.0, 5.0),
2778
- "entropy_dimension": (1.0, 3.0)
2779
- }
2780
-
2781
- PARAMS_NRMS = {
2782
- "noise_threshold": (0.0, 0.25),
2783
- "wf_mz": (0.0, 5.0),
2784
- "wf_int": (0.0, 5.0),
2785
- "LET_threshold": (0.0, 5.0),
2786
- "entropy_dimension": (1.0, 3.0)
2787
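The two dicts above hold default (lower, upper) bounds for each tunable parameter. A minimal sketch, with a toy objective function, of how such a dict feeds scipy.optimize.differential_evolution (which the DE handler further down drives with the real identification-accuracy objective); everything named here is illustrative:

    import numpy as np
    from scipy.optimize import differential_evolution

    opt_params = ["noise_threshold", "LET_threshold"]   # user-selected subset
    bounds = [PARAMS_NRMS[p] for p in opt_params]       # [(0.0, 0.25), (0.0, 5.0)]

    def objective(x):
        # stand-in for the (negated) identification accuracy at these settings
        return float(np.sum((x - 0.1) ** 2))

    result = differential_evolution(objective, bounds, maxiter=5)
    best = dict(zip(opt_params, result.x))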
- }
2788
-
2789
-
2790
- def run_parameter_tuning_DE_ui(platform: str):
2791
- # Pick param set per platform
2792
- if platform == "HRMS":
2793
- PARAMS = PARAMS_HRMS
2794
- else:
2795
- PARAMS = PARAMS_NRMS
2796
-
2797
- base_inputs = [
2798
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2799
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
2800
- ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
2801
- ui.input_text("weights", "Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):", "0.25, 0.25, 0.25, 0.25"),
2802
- ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])]
2803
-
2804
- if platform == "HRMS":
2805
- extra_inputs = [
2806
- ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
2807
- ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
2808
- ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
2809
- ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL"),
2810
- ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
2811
- ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
2812
- ]
2813
- else:
2814
- extra_inputs = [ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW")]
2815
-
2816
- numeric_inputs = [
2817
- ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
2818
- ui.input_numeric("mz_max", "Maximum m/z for filtering:", 99_999_999),
2819
- ui.input_numeric("int_min", "Minimum intensity for filtering:", 0),
2820
- ui.input_numeric("int_max", "Maximum intensity for filtering:", 999_999_999),
2821
- ui.input_numeric("noise_threshold", "Noise removal threshold:", 0.0),
2822
- ui.input_numeric("wf_mz", "Mass/charge weight factor:", 0.0),
2823
- ui.input_numeric("wf_int", "Intensity weight factor:", 1.0),
2824
- ui.input_numeric("LET_threshold", "Low-entropy threshold:", 0.0),
2825
- ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
2826
- ui.input_numeric("max_iterations", "Maximum number of iterations:", 5),
2827
- ]
2828
-
2829
- run_button_parameter_tuning_DE = ui.input_action_button("run_btn_parameter_tuning_DE", "Tune parameters (differential evolution optimization)", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
2830
- back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
2831
-
2832
- # the HRMS and NRMS layouts are identical here, so one assignment covers both platforms
- inputs_columns = ui.layout_columns(
- ui.div(*base_inputs, style="display:flex; flex-direction:column; gap:10px;"),
- ui.div(*extra_inputs, style="display:flex; flex-direction:column; gap:10px;"),
- ui.div(*numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
- ui.div(*numeric_inputs[5:11], style="display:flex; flex-direction:column; gap:10px;"),
- col_widths=(3, 3, 3, 3),
- )
2848
-
2849
- return ui.page_fillable(
2850
- ui.layout_sidebar(
2851
- ui.sidebar(
2852
- ui.h3("Select continuous parameters to optimize"),
2853
- ui.input_checkbox_group("params", None, choices=list(PARAMS.keys()), selected=["noise_threshold", "LET_threshold"]),
2854
- ui.hr(),
2855
- ui.h4("Bounds for selected parameters"),
2856
- ui.output_ui("bounds_inputs"),
2857
- width=360,
2858
- ),
2859
- ui.div(
2860
- ui.h2("Tune parameters (differential evolution optimization)"),
2861
- inputs_columns,
2862
- ui.div(run_button_parameter_tuning_DE, back_button, style=("display:flex; flex-direction:row; gap:12px; align-items:center; flex-wrap:wrap;")),
2863
- ui.br(),
2864
- ui.card(
2865
- ui.card_header("Live log"),
2866
- ui.output_text_verbatim("run_log"),
2867
- ),
2868
- style="display:flex; flex-direction:column; gap:16px;",
2869
- ),
2870
- )
2871
- )
2872
-
2873
-
2874
-
2875
- app_ui = ui.page_fluid(
2876
- ui.head_content(ui.tags.link(rel="icon", href="emblem.png")),
2877
- ui.div(ui.output_image("image"), style=("display:block; margin:20px auto; max-width:320px; height:auto; text-align:center")),
2878
- ui.output_ui("main_ui"),
2879
- ui.output_text("status_output"),
2880
- )
2881
-
2882
-
2883
-
2884
-
2885
- def server(input, output, session):
2886
-
2887
- current_page = reactive.Value("main_menu")
2888
-
2889
- plot_clicks = reactive.Value(0)
2890
- match_clicks = reactive.Value(0)
2891
- back_clicks = reactive.Value(0)
2892
-
2893
- run_status_plot_spectra = reactive.Value("")
2894
- run_status_spec_lib_matching = reactive.Value("")
2895
- run_status_plot_spectra_within_spec_lib_matching = reactive.Value("")
2896
- run_status_parameter_tuning_grid = reactive.Value("")
2897
- run_status_parameter_tuning_DE = reactive.Value("")
2898
- is_tuning_grid_running = reactive.Value(False)
2899
- is_tuning_DE_running = reactive.Value(False)
2900
- match_log_rv = reactive.Value("")
2901
- is_matching_rv = reactive.Value(False)
2902
- is_any_job_running = reactive.Value(False)
2903
- latest_txt_path_rv = reactive.Value("")
2904
- latest_df_rv = reactive.Value(None)
2905
- is_running_rv = reactive.Value(False)
2906
-
2907
- query_ids_rv = reactive.Value([])
2908
- query_file_path_rv = reactive.Value(None)
2909
- query_result_rv = reactive.Value(None)
2910
- query_status_rv = reactive.Value("")
2911
- reference_ids_rv = reactive.Value([])
2912
- reference_file_path_rv = reactive.Value(None)
2913
- reference_result_rv = reactive.Value(None)
2914
- reference_status_rv = reactive.Value("")
2915
-
2916
- converted_query_path_rv = reactive.Value(None)
2917
- converted_reference_path_rv = reactive.Value(None)
2918
-
2919
- df_rv = reactive.Value(None)
2920
-
2921
-
2922
- def _discover_rank_cols(df: pd.DataFrame):
2923
- pred_pat = re.compile(r"^RANK\.(\d+)\.PRED$")
2924
- score_pat = re.compile(r"^RANK\.(\d+)\.SIMILARITY\.SCORE$")
2925
- pred_map, score_map = {}, {}
2926
- for c in df.columns:
2927
- m = pred_pat.match(c)
2928
- if m: pred_map[int(m.group(1))] = c
2929
- m = score_pat.match(c)
2930
- if m: score_map[int(m.group(1))] = c
2931
- return [(k, pred_map[k], score_map.get(k)) for k in sorted(pred_map)]
2932
-
2933
-
2934
- def _rank_choices_for_query(df: pd.DataFrame, qid: str):
2935
- sub = df.loc[df["QUERY.SPECTRUM.ID"].astype(str) == str(qid)]
2936
- if sub.empty:
2937
- return {}, None
2938
- row = sub.iloc[0]
2939
- rank_cols = _discover_rank_cols(df)
2940
- if not rank_cols:
2941
- return {}, None
2942
-
2943
- choices = {}
2944
- default_value = None
2945
- for (k, pred_col, score_col) in rank_cols:
2946
- pred = row.get(pred_col, None)
2947
- if pd.isna(pred):
2948
- continue
2949
- pred = str(pred)
2950
- score = row.get(score_col, None) if score_col else None
2951
- score_str = f"{float(score):.6f}" if (score is not None and pd.notna(score)) else "NA"
2952
- label = f"Rank {k} — {score_str} — {pred}"
2953
- choices[label] = pred # values are plain names
2954
- if k == 1:
2955
- default_value = pred # default = Rank 1 name
2956
-
2957
- if default_value is None and choices:
2958
- default_value = next(iter(choices.values()))
2959
- return choices, default_value
2960
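A small sketch (fabricated frame, real helpers) of the identification-output shape these two functions expect: one row per query spectrum, with paired RANK.<k>.PRED / RANK.<k>.SIMILARITY.SCORE columns.

    df = pd.DataFrame({
        "QUERY.SPECTRUM.ID": ["Q1"],
        "RANK.1.PRED": ["Caffeine"], "RANK.1.SIMILARITY.SCORE": [0.93],
        "RANK.2.PRED": ["Theobromine"], "RANK.2.SIMILARITY.SCORE": [0.81],
    })
    choices, default = _rank_choices_for_query(df, "Q1")
    # default == "Caffeine"; labels read "Rank 1 — 0.930000 — Caffeine"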
-
2961
-
2962
- @reactive.effect
2963
- @reactive.event(input.compound_ID_output_file)
2964
- async def _populate_ids_from_compound_ID_output_upload():
2965
- files = input.compound_ID_output_file()
2966
- if not files:
2967
- return
2968
-
2969
- in_path = Path(files[0]["datapath"])
2970
- try:
2971
- query_status_rv.set(f"Reading table from: {in_path.name} …")
2972
- await reactive.flush()
2973
-
2974
- df = await asyncio.to_thread(pd.read_csv, in_path, sep="\t", header=0)
2975
-
2976
- if "QUERY.SPECTRUM.ID" not in df.columns:
2977
- raise ValueError("Missing required column: QUERY.SPECTRUM.ID")
2978
- if not _discover_rank_cols(df):
2979
- raise ValueError("No columns matching RANK.<k>.PRED found.")
2980
-
2981
- df_rv.set(df)
2982
-
2983
- ids = df["QUERY.SPECTRUM.ID"].astype(str).tolist()
2984
- unique_ids_in_order = list(dict.fromkeys(ids))
2985
-
2986
- choices_dict, default_rank_value = _rank_choices_for_query(df, ids[0])
2987
- choices_values = [str(v).strip() for v in choices_dict.values()]
2988
- default_rank_value = str(default_rank_value).strip() if default_rank_value is not None else None
2989
-
2990
- ui.update_selectize("q_spec", choices=unique_ids_in_order, selected=ids[0])
2991
- await reactive.flush()
2992
-
2993
- ui.update_selectize("r_spec", choices=choices_values, selected=choices_values[0])
2994
- await reactive.flush()
2995
-
2996
- except Exception as e:
2997
- query_status_rv.set(f"❌ Failed: {e}")
2998
- await reactive.flush()
2999
- raise
3000
-
3001
-
3002
- @reactive.effect
3003
- @reactive.event(input.q_spec)
3004
- async def _update_rank_choices_on_compound_ID_change():
3005
- df = df_rv.get()
3006
- if df is None:
3007
- return
3008
- qid = input.q_spec()
3009
- if not qid:
3010
- return
3011
-
3012
- choices, default_rank_value = _rank_choices_for_query(df, qid)
3013
- choices = list(choices.values())
3014
- ui.update_selectize('r_spec', choices=choices, selected=default_rank_value)
3015
- await reactive.flush()
3016
-
3017
-
3018
-
3019
- @output
3020
- @render.ui
3021
- def bounds_inputs():
3022
- selected = input.params()
3023
- if not selected:
3024
- return ui.div(ui.em("Select one or more parameters above."))
3025
-
3026
- if input.chromatography_platform() == 'HRMS':
3027
- PARAMS = PARAMS_HRMS
3028
- else:
3029
- PARAMS = PARAMS_NRMS
3030
- blocks = []
3031
- for name in selected:
3032
- lo, hi = PARAMS.get(name, (0.0, 1.0))
3033
- blocks.append(
3034
- ui.card(
3035
- ui.card_header(name),
3036
- ui.layout_columns(
3037
- ui.input_numeric(f"min_{name}", "Lower", lo, step=0.001),
3038
- ui.input_numeric(f"max_{name}", "Upper", hi, step=0.001),
3039
- )
3040
- )
3041
- )
3042
- return ui.div(*blocks)
3043
-
3044
- def _read_bounds_dict():
3045
- selected = input.params()
3046
- # select the platform-specific bounds; a bare PARAMS is undefined in this scope
- PARAMS = PARAMS_HRMS if input.chromatography_platform() == 'HRMS' else PARAMS_NRMS
- out = {}
3047
- for name in selected:
3048
- lo_default, hi_default = PARAMS.get(name, (0.0, 1.0))
3049
- lo_id = f"min_{name}"
3050
- hi_id = f"max_{name}"
3051
-
3052
- lo_val = input[lo_id]() if lo_id in input else lo_default
3053
- hi_val = input[hi_id]() if hi_id in input else hi_default
3054
-
3055
- out[name] = (float(lo_val), float(hi_val))
3056
- return out
3057
-
3058
- def _read_bounds():
3059
- opt_params = input.params()
3060
- bounds_dict = {}
3061
- if input.chromatography_platform() == 'HRMS':
3062
- PARAMS = PARAMS_HRMS
3063
- else:
3064
- PARAMS = PARAMS_NRMS
3065
-
3066
- for p in opt_params:
3067
- lo_id, hi_id = f"min_{p}", f"max_{p}"
3068
- lo_default, hi_default = PARAMS.get(p, (0.0, 1.0))
3069
- lo = input[lo_id]() if lo_id in input else lo_default
3070
- hi = input[hi_id]() if hi_id in input else hi_default
3071
- if lo > hi:
3072
- lo, hi = hi, lo
3073
- bounds_dict[p] = (float(lo), float(hi))
3074
-
3075
- bounds_list = [bounds_dict[p] for p in opt_params]
3076
- return opt_params, bounds_dict, bounds_list
3077
-
3078
- def _reset_plot_spectra_state():
3079
- query_status_rv.set("")
3080
- reference_status_rv.set("")
3081
- query_ids_rv.set([])
3082
- reference_ids_rv.set([])
3083
- query_file_path_rv.set(None)
3084
- reference_file_path_rv.set(None)
3085
- query_result_rv.set(None)
3086
- reference_result_rv.set(None)
3087
- converted_query_path_rv.set(None)
3088
- converted_reference_path_rv.set(None)
3089
- try:
3090
- ui.update_selectize("spectrum_ID1", choices=[], selected=None)
3091
- ui.update_selectize("spectrum_ID2", choices=[], selected=None)
3092
- except Exception:
3093
- pass
3094
-
3095
-
3096
- def _reset_spec_lib_matching_state():
3097
- match_log_rv.set("")
3098
- is_matching_rv.set(False)
3099
- is_any_job_running.set(False)
3100
- try:
3101
- ui.update_selectize("spectrum_ID1", choices=[], selected=None)
3102
- ui.update_selectize("spectrum_ID2", choices=[], selected=None)
3103
- except Exception:
3104
- pass
3105
-
3106
-
3107
- def _reset_parameter_tuning_state():
3108
- match_log_rv.set("")
3109
- is_tuning_grid_running.set(False)
3110
- is_tuning_DE_running.set(False)
3111
- is_any_job_running.set(False)
3112
-
3113
-
3114
- @reactive.effect
3115
- @reactive.event(input.back)
3116
- def _clear_on_back_from_pages():
3117
- page = current_page()
3118
- if page == "plot_spectra":
3119
- _reset_plot_spectra_state()
3120
- elif page == "run_spec_lib_matching":
3121
- _reset_spec_lib_matching_state()
3122
- elif page == "run_parameter_tuning_grid":
3123
- _reset_parameter_tuning_state()
3124
- elif page == "run_parameter_tuning_DE":
3125
- _reset_parameter_tuning_state()
3126
-
3127
- @reactive.effect
3128
- def _clear_on_enter_pages():
3129
- page = current_page()
3130
- if page == "plot_spectra":
3131
- _reset_plot_spectra_state()
3132
- elif page == "run_spec_lib_matching":
3133
- _reset_spec_lib_matching_state()
3134
- elif page == "run_parameter_tuning_grid":
3135
- _reset_parameter_tuning_state()
3136
- elif page == "run_parameter_tuning_DE":
3137
- _reset_parameter_tuning_state()
3138
-
3139
-
3140
- def _drain_queue_nowait(q: asyncio.Queue) -> list[str]:
3141
- out = []
3142
- try:
3143
- while True:
3144
- out.append(q.get_nowait())
3145
- except asyncio.QueueEmpty:
3146
- pass
3147
- return out
3148
-
3149
-
3150
- class ReactiveWriter(io.TextIOBase):
3151
- def __init__(self, loop: asyncio.AbstractEventLoop):
3152
- self._loop = loop
3153
- def write(self, s: str):
3154
- if not s:
3155
- return 0
3156
- self._loop.call_soon_threadsafe(_LOG_QUEUE.put_nowait, s)
3157
- return len(s)
3158
- def flush(self):
3159
- pass
3160
-
3161
- def _run_with_redirects(func, writer: ReactiveWriter, **kwargs):
3162
- with contextlib.redirect_stdout(writer), contextlib.redirect_stderr(writer):
3163
- return func(**kwargs)
3164
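ReactiveWriter plus _run_with_redirects is the bridge that lets a worker thread stream print() output back into the Shiny log panel: writes are marshalled onto the event loop with call_soon_threadsafe and drained by _pump_logs. A stripped-down sketch of the same pattern outside Shiny (all names illustrative):

    import asyncio, contextlib, io

    async def main():
        loop = asyncio.get_running_loop()
        q = asyncio.Queue()

        class Writer(io.TextIOBase):
            def write(self, s: str):
                if s:
                    # hop from the worker thread back onto the event loop
                    loop.call_soon_threadsafe(q.put_nowait, s)
                return len(s)

        def job():
            # redirect inside the thread so print() lands in the queue
            with contextlib.redirect_stdout(Writer()):
                print("working...")

        await asyncio.to_thread(job)
        print("streamed:", q.get_nowait(), end="")

    asyncio.run(main())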
-
3165
-
3166
-
3167
- @reactive.effect
3168
- async def _pump_logs():
3169
- if not (is_any_job_running.get() or is_tuning_grid_running.get() or is_tuning_DE_running.get() or is_matching_rv.get()):
3170
- return
3171
- reactive.invalidate_later(0.05)
3172
- msgs = _drain_queue_nowait(_LOG_QUEUE)
3173
- if msgs:
3174
- match_log_rv.set(match_log_rv.get() + "".join(msgs))
3175
- await reactive.flush()
3176
-
3177
-
3178
- def process_database(file_path: str):
3179
- suffix = Path(file_path).suffix.lower()
3180
- return {"path": file_path, "suffix": suffix}
3181
-
3182
- @render.text
3183
- def plot_query_status():
3184
- return query_status_rv.get() or ""
3185
-
3186
- @render.text
3187
- def plot_reference_status():
3188
- return reference_status_rv.get() or ""
3189
-
3190
-
3191
- @reactive.effect
3192
- @reactive.event(input.query_data)
3193
- async def _on_query_upload():
3194
- files = input.query_data()
3195
- req(files and len(files) > 0)
3196
-
3197
- file_path = files[0]["datapath"]
3198
- query_file_path_rv.set(file_path)
3199
-
3200
- query_status_rv.set(f"Processing query database: {Path(file_path).name} …")
3201
- await reactive.flush()
3202
-
3203
- try:
3204
- result = await asyncio.to_thread(process_database, file_path)
3205
- query_result_rv.set(result)
3206
- query_status_rv.set("✅ Query database processed.")
3207
- await reactive.flush()
3208
- except Exception as e:
3209
- query_status_rv.set(f"❌ Failed to process query database: {e}")
3210
- await reactive.flush()
3211
-
3212
-
3213
- @reactive.effect
3214
- @reactive.event(input.reference_data)
3215
- async def _on_reference_upload():
3216
- files = input.reference_data()
3217
- req(files and len(files) > 0)
3218
-
3219
- file_path = files[0]["datapath"]
3220
- reference_file_path_rv.set(file_path)
3221
-
3222
- reference_status_rv.set(f"Processing reference database: {Path(file_path).name} …")
3223
- await reactive.flush()
3224
-
3225
- try:
3226
- result = await asyncio.to_thread(process_database, file_path)
3227
- reference_result_rv.set(result)
3228
- reference_status_rv.set("✅ Reference database processed.")
3229
- await reactive.flush()
3230
- except Exception as e:
3231
- reference_status_rv.set(f"❌ Failed to process reference database: {e}")
3232
- await reactive.flush()
3233
-
3234
-
3235
- @render.text
3236
- def match_log():
3237
- return match_log_rv.get()
3238
-
3239
-
3240
- @reactive.Effect
3241
- def _():
3242
- if input.plot_spectra() > plot_clicks.get():
3243
- current_page.set("plot_spectra")
3244
- plot_clicks.set(input.plot_spectra())
3245
- elif input.run_spec_lib_matching() > match_clicks.get():
3246
- current_page.set("run_spec_lib_matching")
3247
- match_clicks.set(input.run_spec_lib_matching())
3248
- elif input.run_parameter_tuning_grid() > match_clicks.get():
3249
- current_page.set("run_parameter_tuning_grid")
3250
- match_clicks.set(input.run_parameter_tuning_grid())
3251
- elif input.run_parameter_tuning_DE() > match_clicks.get():
3252
- current_page.set("run_parameter_tuning_DE")
3253
- match_clicks.set(input.run_parameter_tuning_DE())
3254
- elif hasattr(input, "back") and input.back() > back_clicks.get():
3255
- current_page.set("main_menu")
3256
- back_clicks.set(input.back())
3257
-
3258
-
3259
- @render.image
3260
- def image():
3261
- here = Path(__file__).resolve().parent  # renamed from 'dir' to avoid shadowing the builtin
3262
- img = {"src": str(here / "www/emblem.png"), "width": "250px", "height": "250px"}  # annotation dropped: ImgData is not among the module imports
3263
- return img
3264
-
3265
- @output
3266
- @render.ui
3267
- def main_ui():
3268
- if current_page() == "main_menu":
3269
- return ui.page_fluid(
3270
- ui.h2("Main Menu"),
3271
- ui.div("Overview:", style="text-align:left; font-size:24px; font-weight:bold"),
3272
- ui.div("PyCompound is a Python-based tool designed for performing spectral library matching on either high-resolution mass spectrometry data (HRMS) or low-resolution mass spectrometry data (NRMS). PyCompound offers a range of spectrum preprocessing transformations and similarity measures. These spectrum preprocessing transformations include filtering on mass/charge and/or intensity values, weight factor transformation, low-entropy transformation, centroiding, noise removal, and matching. The available similarity measures include the canonical Cosine similarity measure, three entropy-based similarity measures, and a variety of binary similarity measures: Jaccard, Dice, 3W-Jaccard, Sokal-Sneath, Binary Cosine, Mountford, McConnaughey, Driver-Kroeber, Simpson, Braun-Banquet, Fager-McGowan, Kulczynski, Intersection, Hamming, and Hellinger.", style="margin-top:10px; text-align:left; font-size:16px; font-weight:500"),
3273
- ui.div("Select options:", style="margin-top:30px; text-align:left; font-size:24px; font-weight:bold"),
3274
- ui.div(ui.input_radio_buttons("chromatography_platform", "Specify chromatography platform:", ["HRMS","NRMS"]), style="font-size:18px; margin-top:10px; max-width:none"),
3275
- ui.input_action_button("plot_spectra", "Plot two spectra before and after preprocessing transformations.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
3276
- ui.input_action_button("run_spec_lib_matching", "Run spectral library matching to perform compound identification on a query library of spectra.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
3277
- ui.input_action_button("run_parameter_tuning_grid", "Grid search: Tune parameters to maximize accuracy of compound identification given a query library with known spectrum IDs.", style="font-size:18px; padding:20px 40px; width:450px; height:120px; margin-top:10px; margin-right:50px"),
3278
- ui.input_action_button("run_parameter_tuning_DE", "Differential evolution optimization: Tune parameters to maximize accuracy of compound identification given a query library with known spectrum IDs.", style="font-size:18px; padding:20px 40px; width:500px; height:150px; margin-top:10px; margin-right:50px"),
3279
- ui.div(
3280
- "References:",
3281
- style="margin-top:35px; text-align:left; font-size:24px; font-weight:bold"
3282
- ),
3283
- ui.div(
3284
- "If Shannon Entropy similarity measure, low-entropy transformation, or centroiding are used:",
3285
- style="margin-top:10px; text-align:left; font-size:14px; font-weight:500"
3286
- ),
3287
- ui.div(
3288
- ui.HTML(
3289
- 'Li, Y., Kind, T., Folz, J. et al. (2021) Spectral entropy outperforms MS/MS dot product similarity for small-molecule compound identification. Nat Methods, 18 1524–1531. <a href="https://doi.org/10.1038/s41592-021-01331-z" target="_blank">https://doi.org/10.1038/s41592-021-01331-z</a>.'
3290
- ),
3291
- style="text-align:left; font-size:14px; font-weight:500"
3292
- ),
3293
- ui.div(
3294
- "If Tsallis Entropy similarity measure or series of preprocessing transformations are used:",
3295
- style="margin-top:10px; text-align:left; font-size:14px; font-weight:500"
3296
- ),
3297
- ui.div(
3298
- ui.HTML(
3299
- 'Dlugas, H., Zhang, X., Kim, S. (2025) Comparative analysis of continuous similarity measures for compound identification in mass spectrometry-based metabolomics. Chemometrics and Intelligent Laboratory Systems, 263, 105417. <a href="https://doi.org/10.1016/j.chemolab.2025.105417", target="_blank">https://doi.org/10.1016/j.chemolab.2025.105417</a>.'
3300
- ),
3301
- style="text-align:left; font-size:14px; font-weight:500"
3302
- ),
3303
- ui.div(
3304
- "If binary similarity measures are used:",
3305
- style="margin-top:10px; text-align:left; font-size:14px; font-weight:500"
3306
- ),
3307
- ui.div(
3308
- ui.HTML(
3309
- 'Kim, S., Kato, I., & Zhang, X. (2022). Comparative Analysis of Binary Similarity Measures for Compound Identification in Mass Spectrometry-Based Metabolomics. Metabolites, 12(8), 694. <a href="https://doi.org/10.3390/metabo12080694" target="_blank">https://doi.org/10.3390/metabo12080694</a>.'
3310
- ),
3311
- style="text-align:left; font-size:14px; font-weight:500"
3312
- ),
3313
-
3314
- ui.div(
3315
- "If weight factor transformation is used:",
3316
- style="margin-top:10px; text-align:left; font-size:14px; font-weight:500"
3317
- ),
3318
- ui.div(
3319
- ui.HTML(
3320
- 'Kim, S., Koo, I., Wei, X., & Zhang, X. (2012). A method of finding optimal weight factors for compound identification in gas chromatography-mass spectrometry. Bioinformatics, 28(8), 1158-1163. <a href="https://doi.org/10.1093/bioinformatics/bts083" target="_blank">https://doi.org/10.1093/bioinformatics/bts083</a>.'
3321
- ),
3322
- style="margin-bottom:40px; text-align:left; font-size:14px; font-weight:500"
3323
- ),
3324
- )
3325
- elif current_page() == "plot_spectra":
3326
- return plot_spectra_ui(input.chromatography_platform())
3327
- elif current_page() == "run_spec_lib_matching":
3328
- return run_spec_lib_matching_ui(input.chromatography_platform())
3329
- elif current_page() == "run_parameter_tuning_grid":
3330
- return run_parameter_tuning_grid_ui(input.chromatography_platform())
3331
- elif current_page() == "run_parameter_tuning_DE":
3332
- return run_parameter_tuning_DE_ui(input.chromatography_platform())
3333
-
3334
-
3335
-
3336
- @reactive.effect
3337
- @reactive.event(input.query_data)
3338
- async def _populate_ids_from_query_upload():
3339
- files = input.query_data()
3340
- if not files:
3341
- return
3342
-
3343
- in_path = Path(files[0]["datapath"])
3344
- suffix = in_path.suffix.lower()
3345
-
3346
- try:
3347
- if suffix == ".txt":
3348
- txt_path = in_path
3349
- converted_query_path_rv.set(str(txt_path))
3350
- else:
3351
- query_status_rv.set(f"Converting {in_path.name} → TXT…")
3352
- await reactive.flush()
3353
-
3354
- tmp_txt_path = in_path.with_suffix(".converted.txt")
3355
-
3356
- out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
3357
-
3358
- if isinstance(out_obj, (str, os.PathLike, Path)):
3359
- txt_path = Path(out_obj)
3360
- elif isinstance(out_obj, pd.DataFrame):
3361
- out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
3362
- txt_path = tmp_txt_path
3363
- else:
3364
- raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
3365
-
3366
- converted_query_path_rv.set(str(txt_path))
3367
-
3368
- query_status_rv.set(f"Reading IDs from: {txt_path.name} …")
3369
- await reactive.flush()
3370
-
3371
- ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
3372
- query_ids_rv.set(ids)
3373
-
3374
- ui.update_selectize("spectrum_ID1", choices=ids, selected=(ids[0] if ids else None))
3375
-
3376
- query_status_rv.set(f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}")
3377
- await reactive.flush()
3378
-
3379
- except Exception as e:
3380
- query_status_rv.set(f"❌ Failed: {e}")
3381
- await reactive.flush()
3382
- raise
3383
-
3384
-
3385
- @reactive.effect
3386
- @reactive.event(input.reference_data)
3387
- async def _populate_ids_from_reference_upload():
3388
- files = input.reference_data()
3389
- if not files:
3390
- return
3391
-
3392
- in_path = Path(files[0]["datapath"])
3393
- suffix = in_path.suffix.lower()
3394
-
3395
- try:
3396
- if suffix == ".txt":
3397
- txt_path = in_path
3398
- converted_reference_path_rv.set(str(txt_path))
3399
- else:
3400
- reference_status_rv.set(f"Converting {in_path.name} → TXT…")
3401
- await reactive.flush()
3402
-
3403
- tmp_txt_path = in_path.with_suffix(".converted.txt")
3404
-
3405
- out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
3406
-
3407
- if isinstance(out_obj, (str, os.PathLike, Path)):
3408
- txt_path = Path(out_obj)
3409
- elif isinstance(out_obj, pd.DataFrame):
3410
- out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
3411
- txt_path = tmp_txt_path
3412
- else:
3413
- raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
3414
-
3415
- converted_reference_path_rv.set(str(txt_path))
3416
-
3417
- reference_status_rv.set(f"Reading IDs from: {txt_path.name} …")
3418
- await reactive.flush()
3419
-
3420
- ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
3421
- reference_ids_rv.set(ids)
3422
-
3423
- ui.update_selectize("spectrum_ID2", choices=ids, selected=(ids[0] if ids else None))
3424
-
3425
- reference_status_rv.set(
3426
- f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}"
3427
- )
3428
- await reactive.flush()
3429
-
3430
- except Exception as e:
3431
- reference_status_rv.set(f"❌ Failed: {e}")
3432
- await reactive.flush()
3433
- raise
3434
-
3435
-
3436
- @render.download(filename="plot.svg")
3437
- def run_btn_plot_spectra():
3438
- spectrum_ID1 = input.spectrum_ID1() or None
3439
- spectrum_ID2 = input.spectrum_ID2() or None
3440
-
3441
- weights = [float(weight.strip()) for weight in input.weights().split(",") if weight.strip()]
3442
- weights = {'Cosine':weights[0], 'Shannon':weights[1], 'Renyi':weights[2], 'Tsallis':weights[3]}
3443
-
3444
- high_quality_reference_library_tmp2 = False
3445
- if input.high_quality_reference_library() != 'False':
3446
- high_quality_reference_library_tmp2 = True
3447
-
3448
- if input.chromatography_platform() == "HRMS":
3449
- fig = generate_plots_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), weights=weights, spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
3450
- plt.show()
3451
- elif input.chromatography_platform() == "NRMS":
3452
- fig = generate_plots_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
3453
- plt.show()
3454
- with io.BytesIO() as buf:
3455
- fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
3456
- plt.close()
3457
- yield buf.getvalue()
3458
-
3459
-
3460
-
3461
-
3462
- @render.download(filename="identification_output.txt")
3463
- async def run_btn_spec_lib_matching():
3464
- match_log_rv.set("Running identification...\n")
3465
- await reactive.flush()
3466
-
3467
- hq = input.high_quality_reference_library()
3468
- if isinstance(hq, str):
3469
- hq = hq.lower() == "true"
3470
- elif isinstance(hq, (int, float)):
3471
- hq = bool(hq)
3472
-
3473
- weights = [float(weight.strip()) for weight in input.weights().split(",") if weight.strip()]
3474
- weights = {'Cosine': weights[0], 'Shannon': weights[1], 'Renyi': weights[2], 'Tsallis': weights[3]}
3475
-
3476
- common_kwargs = dict(
3477
- query_data=input.query_data()[0]["datapath"],
3478
- reference_data=input.reference_data()[0]["datapath"],
3479
- likely_reference_ids=None,
3480
- similarity_measure=input.similarity_measure(),
3481
- weights=weights,
3482
- spectrum_preprocessing_order=input.spectrum_preprocessing_order(),
3483
- high_quality_reference_library=hq,
3484
- mz_min=input.mz_min(), mz_max=input.mz_max(),
3485
- int_min=input.int_min(), int_max=input.int_max(),
3486
- noise_threshold=input.noise_threshold(),
3487
- wf_mz=input.wf_mz(), wf_intensity=input.wf_int(),
3488
- LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(),
3489
- n_top_matches_to_save=input.n_top_matches_to_save(),
3490
- print_id_results=True,
3491
- output_identification=str(Path.cwd() / "identification_output.txt"),
3492
- output_similarity_scores=str(Path.cwd() / "similarity_scores.txt"),
3493
- return_ID_output=True,
3494
- )
3495
-
3496
- # --- streaming setup (same pattern as the differential-evolution handler below) ---
3497
- loop = asyncio.get_running_loop()
3498
- q: asyncio.Queue[str | None] = asyncio.Queue()
3499
-
3500
- class UIWriter(io.TextIOBase):
3501
- def write(self, s: str):
3502
- if s:
3503
- loop.call_soon_threadsafe(q.put_nowait, s)
3504
- return len(s)
3505
- def flush(self): pass
3506
-
3507
- async def _drain():
3508
- while True:
3509
- msg = await q.get()
3510
- if msg is None:
3511
- break
3512
- match_log_rv.set(match_log_rv.get() + msg)
3513
- await reactive.flush()
3514
-
3515
- drain_task = asyncio.create_task(_drain())
3516
- writer = UIWriter()
3517
-
3518
- # --- worker wrappers that install redirects INSIDE the thread ---
3519
- def _run_hrms():
3520
- with redirect_stdout(writer), redirect_stderr(writer):
3521
- # optional heartbeat
3522
- print(">> Starting HRMS identification ...", flush=True)
3523
- return run_spec_lib_matching_on_HRMS_data_shiny(
3524
- precursor_ion_mz_tolerance=input.precursor_ion_mz_tolerance(),
3525
- ionization_mode=input.ionization_mode(),
3526
- adduct=input.adduct(),
3527
- window_size_centroiding=input.window_size_centroiding(),
3528
- window_size_matching=input.window_size_matching(),
3529
- **common_kwargs
3530
- )
3531
-
3532
- def _run_nrms():
3533
- with redirect_stdout(writer), redirect_stderr(writer):
3534
- print(">> Starting NRMS identification ...", flush=True)
3535
- return run_spec_lib_matching_on_NRMS_data_shiny(**common_kwargs)
3536
-
3537
- # --- run in worker thread and stream output live ---
3538
- try:
3539
- if input.chromatography_platform() == "HRMS":
3540
- df_out = await asyncio.to_thread(_run_hrms)
3541
- else:
3542
- df_out = await asyncio.to_thread(_run_nrms)
3543
-
3544
- match_log_rv.set(match_log_rv.get() + "\n✅ Identification finished.\n")
3545
- await reactive.flush()
3546
-
3547
- except Exception as e:
3548
- tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
3550
- match_log_rv.set(match_log_rv.get() + f"\n❌ {type(e).__name__}: {e}\n{tb}\n")
3551
- await reactive.flush()
3552
- # the finally block below stops the drainer, so the re-raise needs no extra cleanup here
3554
- raise
3555
-
3556
- finally:
3557
- await q.put(None)
3558
- await drain_task
3559
-
3560
- yield df_out.to_csv(index=True, sep="\t")
3561
-
3562
-
3563
-
3564
-
3565
- @render.download(filename="plot.svg")
3566
- def run_btn_plot_spectra_within_spec_lib_matching():
3567
- req(input.query_data(), input.reference_data())
3568
-
3569
- spectrum_ID1 = input.q_spec() or None
3570
- spectrum_ID2 = input.r_spec() or None
3571
-
3572
- hq = input.high_quality_reference_library()
3573
- if isinstance(hq, str):
3574
- hq = hq.lower() == "true"
3575
- elif isinstance(hq, (int, float)):
3576
- hq = bool(hq)
3577
-
3578
- weights = [float(weight.strip()) for weight in input.weights().split(",") if weight.strip()]
3579
- weights = {'Cosine':weights[0], 'Shannon':weights[1], 'Renyi':weights[2], 'Tsallis':weights[3]}
3580
-
3581
- common = dict(
3582
- query_data=input.query_data()[0]['datapath'],
3583
- reference_data=input.reference_data()[0]['datapath'],
3584
- spectrum_ID1=spectrum_ID1,
3585
- spectrum_ID2=spectrum_ID2,
3586
- print_url_spectrum1=input.print_url_spectrum1(),
3587
- print_url_spectrum2=input.print_url_spectrum2(),
3588
- similarity_measure=input.similarity_measure(),
3589
- weights=weights,
3590
- spectrum_preprocessing_order=input.spectrum_preprocessing_order(),
3591
- high_quality_reference_library=hq,
3592
- mz_min=input.mz_min(), mz_max=input.mz_max(),
3593
- int_min=input.int_min(), int_max=input.int_max(),
3594
- noise_threshold=input.noise_threshold(),
3595
- wf_mz=input.wf_mz(), wf_intensity=input.wf_int(),
3596
- LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(),
3597
- y_axis_transformation="normalized",
3598
- return_plot=True
3599
- )
3600
-
3601
- if input.chromatography_platform() == "HRMS":
3602
- fig = generate_plots_on_HRMS_data(
3603
- window_size_centroiding=input.window_size_centroiding(),
3604
- window_size_matching=input.window_size_matching(),
3605
- **common
3606
- )
3607
- plt.show()
3608
- else:
3609
- fig = generate_plots_on_NRMS_data(**common)
3610
- plt.show()
3611
-
3612
- with io.BytesIO() as buf:
3613
- fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
3614
- plt.close()
3615
- yield buf.getvalue()
3616
-
3617
-
3618
- @render.download(filename="parameter_tuning_grid_output.txt")
3619
- async def run_btn_parameter_tuning_grid():
3620
- is_any_job_running.set(True)
3621
- is_tuning_grid_running.set(True)
3622
- match_log_rv.set("Running grid search of all parameters specified...\n")
3623
- await reactive.flush()
3624
-
3625
- similarity_measure_tmp = list(input.similarity_measure())
3626
- high_quality_reference_library_tmp = [x.strip().lower() == "true" for x in input.high_quality_reference_library().strip().strip("[]").split(",") if x.strip()]
3627
- spectrum_preprocessing_order_tmp = strip_text(input.spectrum_preprocessing_order())
3628
- mz_min_tmp = strip_numeric(input.mz_min())
3629
- mz_max_tmp = strip_numeric(input.mz_max())
3630
- int_min_tmp = strip_numeric(input.int_min())
3631
- int_max_tmp = strip_numeric(input.int_max())
3632
- noise_threshold_tmp = strip_numeric(input.noise_threshold())
3633
- wf_mz_tmp = strip_numeric(input.wf_mz())
3634
- wf_int_tmp = strip_numeric(input.wf_int())
3635
- LET_threshold_tmp = strip_numeric(input.LET_threshold())
3636
- entropy_dimension_tmp = strip_numeric(input.entropy_dimension())
3637
- weights_tmp = strip_weights(input.weights())
3638
-
3639
- common_kwargs = dict(
3640
- query_data=input.query_data()[0]["datapath"],
3641
- reference_data=input.reference_data()[0]["datapath"],
3642
- output_path=str(Path.cwd() / "parameter_tuning_grid_output.txt"),
3643
- return_output=True,
3644
- )
3645
-
3646
-         loop = asyncio.get_running_loop()
-         rw = ReactiveWriter(loop)
-
-         try:
-             # Both platforms share the same base grid; HRMS adds two
-             # window-size parameters on top of it.
-             grid = {
-                 'similarity_measure': similarity_measure_tmp,
-                 'weight': weights_tmp,
-                 'spectrum_preprocessing_order': spectrum_preprocessing_order_tmp,
-                 'mz_min': mz_min_tmp,
-                 'mz_max': mz_max_tmp,
-                 'int_min': int_min_tmp,
-                 'int_max': int_max_tmp,
-                 'noise_threshold': noise_threshold_tmp,
-                 'wf_mz': wf_mz_tmp,
-                 'wf_int': wf_int_tmp,
-                 'LET_threshold': LET_threshold_tmp,
-                 'entropy_dimension': entropy_dimension_tmp,
-                 'high_quality_reference_library': high_quality_reference_library_tmp,
-             }
-             if input.chromatography_platform() == "HRMS":
-                 grid['window_size_centroiding'] = strip_numeric(input.window_size_centroiding())
-                 grid['window_size_matching'] = strip_numeric(input.window_size_matching())
-                 df_out = await asyncio.to_thread(
-                     _run_with_redirects, tune_params_on_HRMS_data_grid_shiny, rw,
-                     **common_kwargs, grid=grid,
-                     precursor_ion_mz_tolerance=float(input.precursor_ion_mz_tolerance()),
-                     ionization_mode=str(input.ionization_mode()),
-                     adduct=str(input.adduct()),
-                 )
-             else:
-                 df_out = await asyncio.to_thread(_run_with_redirects, tune_params_on_NRMS_data_grid_shiny, rw, **common_kwargs, grid=grid)
-
-             match_log_rv.set(match_log_rv.get() + "\n✅ Parameter tuning finished.\n")
-         except Exception as e:
-             match_log_rv.set(match_log_rv.get() + f"\n❌ Error: {e}\n")
-             raise
-         finally:
-             is_tuning_grid_running.set(False)
-             is_any_job_running.set(False)
-             await reactive.flush()
-
-         yield df_out.to_csv(index=False, sep='\t').encode("utf-8")
-
-
-     @reactive.effect
-     @reactive.event(input.run_btn_parameter_tuning_DE)
-     async def run_btn_parameter_tuning_DE():
-         match_log_rv.set("Tuning specified continuous parameters using differential evolution...\n")
-         is_any_job_running.set(True)
-         is_tuning_DE_running.set(True)
-         await reactive.flush()
-
-         def _safe_float(v, default):
-             try:
-                 if v is None:
-                     return default
-                 return float(v)
-             except Exception:
-                 return default
-
-         def _iget(input_id, default=None):
-             if input_id in input:
-                 try:
-                     return input[input_id]()
-                 except SilentException:
-                     return default
-             return default
-
-         loop = asyncio.get_running_loop()
-         q: asyncio.Queue[str | None] = asyncio.Queue()
-
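-         # Producer/consumer pattern for streaming worker output to the UI: the
-         # worker thread's stdout/stderr writes are handed to the event loop via
-         # call_soon_threadsafe (the thread-safe way to touch an asyncio queue
-         # from another thread), and the _drain task appends them to the reactive
-         # log. A None sentinel shuts the drain task down.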
-         class UIWriter(io.TextIOBase):
-             def write(self, s: str):
-                 if s:
-                     loop.call_soon_threadsafe(q.put_nowait, s)
-                 return len(s)
-             def flush(self):
-                 pass
-
-         async def _drain():
-             while True:
-                 msg = await q.get()
-                 if msg is None:
-                     break
-                 match_log_rv.set(match_log_rv.get() + msg)
-                 await reactive.flush()
-
-         drain_task = asyncio.create_task(_drain())
-         writer = UIWriter()
-
-         try:
-             qfile = _iget("query_data")[0]["datapath"]
-             rfile = _iget("reference_data")[0]["datapath"]
-
-             platform = _iget("chromatography_platform", "HRMS")
-             sim = _iget("similarity_measure", "cosine")
-             spro = _iget("spectrum_preprocessing_order", "FCNMWL")
-
-             # Snapshot the remaining inputs here, inside the reactive context;
-             # reading input.* later from the worker thread would fail. The 0.0
-             # fallback is only a guard; the UI normally supplies a value.
-             pim_tol = _safe_float(_iget("precursor_ion_mz_tolerance", 0.0), 0.0)
-             ion_mode = _iget("ionization_mode")
-             adduct_sel = _iget("adduct")
-             max_iters = _iget("max_iterations")
-
-             hq_raw = _iget("high_quality_reference_library", False)
-             if isinstance(hq_raw, str):
-                 hq = hq_raw.lower() == "true"
-             else:
-                 hq = bool(hq_raw)
-
-             mz_min = _safe_float(_iget("mz_min", 0.0), 0.0)
-             mz_max = _safe_float(_iget("mz_max", 99_999_999.0), 99_999_999.0)
-             int_min = _safe_float(_iget("int_min", 0.0), 0.0)
-             int_max = _safe_float(_iget("int_max", 999_999_999.0), 999_999_999.0)
-
-             w_text = _iget("weights", "") or ""
-             w_list = [float(w.strip()) for w in w_text.split(",") if w.strip()]
-             w_list = (w_list + [0.0, 0.0, 0.0, 0.0])[:4]
-             weights = {"Cosine": w_list[0], "Shannon": w_list[1], "Renyi": w_list[2], "Tsallis": w_list[3]}
-
-             opt_params = tuple(_iget("params", ()) or ())
-             bounds_dict = {}
-             param_defaults = PARAMS_HRMS if platform == "HRMS" else PARAMS_NRMS
-             for p in opt_params:
-                 lo = _safe_float(_iget(f"min_{p}", param_defaults.get(p, (0.0, 1.0))[0]),
-                                  param_defaults.get(p, (0.0, 1.0))[0])
-                 hi = _safe_float(_iget(f"max_{p}", param_defaults.get(p, (0.0, 1.0))[1]),
-                                  param_defaults.get(p, (0.0, 1.0))[1])
-                 if lo > hi:
-                     lo, hi = hi, lo
-                 bounds_dict[p] = (lo, hi)
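-             # Each selected parameter ends up with a (low, high) tuple, e.g.
-             # (illustrative values only):
-             #     bounds_dict == {'noise_threshold': (0.0, 0.1), 'wf_mz': (0.0, 2.0)}
-             # scipy's differential_evolution then searches inside these box
-             # constraints.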
-
-             defaults = {
-                 "window_size_centroiding": _safe_float(_iget("window_size_centroiding", 0.5), 0.5),
-                 "window_size_matching": _safe_float(_iget("window_size_matching", 0.5), 0.5),
-                 "noise_threshold": _safe_float(_iget("noise_threshold", 0.0), 0.0),
-                 "wf_mz": _safe_float(_iget("wf_mz", 0.0), 0.0),
-                 "wf_int": _safe_float(_iget("wf_int", 1.0), 1.0),
-                 "LET_threshold": _safe_float(_iget("LET_threshold", 0.0), 0.0),
-                 "entropy_dimension": _safe_float(_iget("entropy_dimension", 1.1), 1.1),
-             }
-             if platform == "NRMS":
-                 # The two window-size parameters only apply to HRMS data.
-                 defaults.pop("window_size_centroiding", None)
-                 defaults.pop("window_size_matching", None)
-
-         except Exception as e:
-             tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
-             match_log_rv.set(match_log_rv.get() + f"\n❌ Input snapshot failed:\n{tb}\n")
-             is_tuning_DE_running.set(False)
-             is_any_job_running.set(False)
-             await q.put(None)
-             await drain_task
-             await reactive.flush()
-             return
-
-         def _run():
-             # Runs on a worker thread; uses only the values snapshotted above.
-             with redirect_stdout(writer), redirect_stderr(writer):
-                 return tune_params_DE(
-                     query_data=qfile,
-                     reference_data=rfile,
-                     precursor_ion_mz_tolerance=pim_tol,
-                     ionization_mode=ion_mode,
-                     adduct=adduct_sel,
-                     chromatography_platform=platform,
-                     similarity_measure=sim,
-                     weights=weights,
-                     spectrum_preprocessing_order=spro,
-                     mz_min=mz_min, mz_max=mz_max,
-                     int_min=int_min, int_max=int_max,
-                     high_quality_reference_library=hq,
-                     optimize_params=list(opt_params),
-                     param_bounds=bounds_dict,
-                     default_params=defaults,
-                     de_workers=1,
-                     maxiters=max_iters
-                 )
-
-         try:
-             _ = await asyncio.to_thread(_run)
-             match_log_rv.set(match_log_rv.get() + "\n✅ Differential evolution finished.\n")
-         except Exception as e:
-             tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
-             match_log_rv.set(match_log_rv.get() + f"\n❌ {type(e).__name__}: {e}\n{tb}\n")
-         finally:
-             await q.put(None)
-             await drain_task
-             is_tuning_DE_running.set(False)
-             is_any_job_running.set(False)
-             await reactive.flush()
-
-
-     @reactive.effect
-     async def _pump_reactive_writer_logs():
-         # Poll the shared log queue every 100 ms while the grid search runs.
-         # Reading is_tuning_grid_running registers a reactive dependency, so
-         # the effect re-arms itself as soon as the flag flips back to True.
-         if not is_tuning_grid_running.get():
-             return
-
-         reactive.invalidate_later(0.1)
-         msgs = _drain_queue_nowait(_LOG_QUEUE)
-         if msgs:
-             match_log_rv.set(match_log_rv.get() + "".join(msgs))
-             await reactive.flush()
-
-
-     @render.text
-     def status_output():
-         # Only one job runs at a time, so report the first non-empty status.
-         # (The original body stacked four return statements, of which only the
-         # first could ever execute.)
-         for status_rv in (run_status_plot_spectra, run_status_spec_lib_matching,
-                           run_status_parameter_tuning_grid, run_status_parameter_tuning_DE):
-             status = status_rv.get()
-             if status:
-                 return status
-         return ""
-
-     @output
-     @render.text
-     def run_log():
-         return match_log_rv.get()
-
-
- app = App(app_ui, server)
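- # To serve the app locally (assuming the shiny package is installed):
- #     shiny run app.py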
-
-