pycompound 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
app.py CHANGED
@@ -1,16 +1,9 @@
1
1
 
2
2
  from shiny import App, ui, reactive, render, req
3
- from pycompound.spec_lib_matching import run_spec_lib_matching_on_HRMS_data
4
- from pycompound.spec_lib_matching import run_spec_lib_matching_on_NRMS_data
5
- from pycompound.spec_lib_matching import tune_params_on_HRMS_data_grid
6
- from pycompound.spec_lib_matching import tune_params_on_NRMS_data_grid
7
- from pycompound.spec_lib_matching import tune_params_on_HRMS_data_grid_shiny
8
- from pycompound.spec_lib_matching import tune_params_on_NRMS_data_grid_shiny
9
- from pycompound.spec_lib_matching import tune_params_DE
10
- from pycompound.plot_spectra import generate_plots_on_HRMS_data
11
- from pycompound.plot_spectra import generate_plots_on_NRMS_data
3
+ from shiny.types import SilentException
12
4
  from pathlib import Path
13
5
  from contextlib import redirect_stdout, redirect_stderr
6
+ import contextlib
14
7
  import subprocess
15
8
  import traceback
16
9
  import asyncio
@@ -24,10 +17,2320 @@ import netCDF4 as nc
24
17
  from pyteomics import mgf, mzml
25
18
  import ast
26
19
  from numbers import Real
27
-
20
+ import logging
21
+ from scipy.optimize import differential_evolution
22
+ import scipy
23
+ import scipy.stats
24
+ from itertools import product
25
+ import json
26
+ import re
27
+ import urllib.parse
28
+ import urllib.request
29
+ import matplotlib
30
+
31
+ matplotlib.rcParams['svg.fonttype'] = 'none'
28
32
 
29
33
  _LOG_QUEUE: asyncio.Queue[str] = asyncio.Queue()
30
34
 
35
+ _ADDUCT_PAT = re.compile(r"\s*(?:\[(M[^\]]+)\]|(M[+-][A-Za-z0-9]+)\+?)\s*$", re.IGNORECASE)
36
+
37
+ def start_log_consumer():
38
+ if getattr(start_log_consumer, "_started", False):
39
+ return
40
+ start_log_consumer._started = True
41
+
42
+ async def _consume():
43
+ while True:
44
+ s = await _LOG_QUEUE.get()
45
+ match_log_rv.set(match_log_rv.get() + s)
46
+ await reactive.flush()
47
+
48
+ asyncio.create_task(_consume())
49
+
50
+
51
+ def start_log_consumer():
52
+ if getattr(start_log_consumer, "_started", False):
53
+ return
54
+ start_log_consumer._started = True
55
+
56
+ async def _consume():
57
+ while True:
58
+ s = await _LOG_QUEUE.get()
59
+ match_log_rv.set(match_log_rv.get() + s)
60
+ await reactive.flush()
61
+
62
+ asyncio.create_task(_consume())
63
+
64
+
65
+
66
+ def _strip_adduct(name: str) -> str:
67
+ return _ADDUCT_PAT.sub("", name).strip()
68
+
69
+ def get_pubchem_url(query: str) -> str:
70
+ base_name = _strip_adduct(query)
71
+ endpoint = ("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/" + urllib.parse.quote(base_name) + "/cids/TXT")
72
+ try:
73
+ with urllib.request.urlopen(endpoint, timeout=10) as r:
74
+ txt = r.read().decode("utf-8").strip()
75
+ cid = txt.splitlines()[0].strip()
76
+ if cid.isdigit():
77
+ return f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
78
+ except Exception:
79
+ pass
80
+ q = urllib.parse.quote(base_name)
81
+ return f"https://pubchem.ncbi.nlm.nih.gov/#query={q}"
82
+
83
+
84
+
85
+ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=False):
86
+ if input_path is None:
87
+ print('Error: please specify input_path (i.e. the path to the input mgf, mzML, cdf, json, or msp file). Mandatory argument.')
88
+ sys.exit()
89
+
90
+ if output_path is None:
91
+ tmp = input_path.split('/')
92
+ tmp = tmp[(len(tmp)-1)]
93
+ basename = tmp.split('.')[0]
94
+ output_path = f'{Path.cwd()}/{basename}.csv'
95
+ print(f'Warning: no output_path specified, so library is written to {output_path}')
96
+
97
+ if is_reference not in [True,False]:
98
+ print('Error: is_reference must be either \'True\' or \'False\'.')
99
+ sys.exit()
100
+
101
+ last_three_chars = input_path[(len(input_path)-3):len(input_path)]
102
+ last_four_chars = input_path[(len(input_path)-4):len(input_path)]
103
+ if last_three_chars == 'mgf' or last_three_chars == 'MGF':
104
+ input_file_type = 'mgf'
105
+ elif last_four_chars == 'mzML' or last_four_chars == 'mzml' or last_four_chars == 'MZML':
106
+ input_file_type = 'mzML'
107
+ elif last_four_chars == 'json' or last_four_chars == 'JSON':
108
+ input_file_type = 'json'
109
+ elif last_three_chars == 'cdf' or last_three_chars == 'CDF':
110
+ input_file_type = 'cdf'
111
+ elif last_three_chars == 'msp' or last_three_chars == 'MSP':
112
+ input_file_type = 'msp'
113
+ else:
114
+ print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'json\', or \'msp\' file must be passed to --input_path')
115
+ sys.exit()
116
+
117
+
118
+
119
+ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz=None, precursor_ion_mz_tolerance=None, ionization_mode=None, collision_energy=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
120
+
121
+ if query_data is None:
122
+ print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
123
+ sys.exit()
124
+ else:
125
+ extension = query_data.rsplit('.',1)
126
+ extension = extension[(len(extension)-1)]
127
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
128
+ output_path_tmp = query_data[:-3] + 'txt'
129
+ #build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=True)
130
+ build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
131
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
132
+ if extension == 'txt' or extension == 'TXT':
133
+ df_query = pd.read_csv(query_data, sep='\t')
134
+ unique_query_ids = df_query['id'].unique().tolist()
135
+ unique_query_ids = [str(tmp) for tmp in unique_query_ids]
136
+
137
+ if reference_data is None:
138
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
139
+ sys.exit()
140
+ else:
141
+ extension = reference_data.rsplit('.',1)
142
+ extension = extension[(len(extension)-1)]
143
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
144
+ output_path_tmp = reference_data[:-3] + 'txt'
145
+ build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
146
+ df_reference = pd.read_csv(output_path_tmp, sep='\t')
147
+ if extension == 'txt' or extension == 'TXT':
148
+ df_reference = pd.read_csv(reference_data, sep='\t')
149
+ cols_tmp = df_reference.columns.tolist()
150
+ if 'precursor_ion_mz' in cols_tmp and 'ionization_mode' in cols_tmp and 'collision_energy' in cols_tmp:
151
+ if precursor_ion_mz is not None and precursor_ion_mz_tolerance is not None:
152
+ df_reference = df_reference.loc[(df_reference['precursor_ion_mz'] > (precursor_ion_mz-precursor_ion_mz_tolerance) & df_reference['precursor_ion_mz'] < (precursor_ion_mz+precursor_ion_mz_tolerance))]
153
+ if ionization_mode is not None:
154
+ df_reference = df_reference.loc[df_reference['ionization_mode'==ionization_mode]]
155
+ if collision_energy is not None:
156
+ df_reference = df_reference.loc[df_reference['collision_energy'==collision_energy]]
157
+ df_reference = df_reference.drop(columns=['precursor_ion_mz','ionization_mode','collision_energy'])
158
+ unique_reference_ids = df_reference['id'].unique().tolist()
159
+ unique_reference_ids = [str(tmp) for tmp in unique_reference_ids]
160
+
161
+ if spectrum_ID1 is not None:
162
+ spectrum_ID1 = str(spectrum_ID1)
163
+ else:
164
+ spectrum_ID1 = str(df_query['id'].iloc[0])
165
+ print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')
166
+
167
+ if spectrum_ID2 is not None:
168
+ spectrum_ID2 = str(spectrum_ID2)
169
+ else:
170
+ spectrum_ID2 = str(df_reference['id'].iloc[0])
171
+ print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')
172
+
173
+ if spectrum_preprocessing_order is not None:
174
+ spectrum_preprocessing_order = list(spectrum_preprocessing_order)
175
+ else:
176
+ spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
177
+ if 'M' not in spectrum_preprocessing_order:
178
+ print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
179
+ sys.exit()
180
+ if 'C' in spectrum_preprocessing_order:
181
+ if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
182
+ print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
183
+ sys.exit()
184
+ if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
185
+ print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
186
+ sys.exit()
187
+
188
+ if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
189
+ print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
190
+ sys.exit()
191
+
192
+ if isinstance(int_min,int) is True:
193
+ int_min = float(int_min)
194
+ if isinstance(int_max,int) is True:
195
+ int_max = float(int_max)
196
+ if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
197
+ print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
198
+ sys.exit()
199
+ if mz_min < 0:
200
+ print('\nError: mz_min should be a non-negative integer')
201
+ sys.exit()
202
+ if mz_max <= 0:
203
+ print('\nError: mz_max should be a positive integer')
204
+ sys.exit()
205
+ if int_min < 0:
206
+ print('\nError: int_min should be a non-negative float')
207
+ sys.exit()
208
+ if int_max <= 0:
209
+ print('\nError: int_max should be a positive float')
210
+ sys.exit()
211
+
212
+ if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
213
+ print('Error: window_size_centroiding must be a positive float.')
214
+ sys.exit()
215
+ if isinstance(window_size_matching,float) is False or window_size_matching<= 0.0:
216
+ print('Error: window_size_matching must be a positive float.')
217
+ sys.exit()
218
+
219
+ if isinstance(noise_threshold,int) is True:
220
+ noise_threshold = float(noise_threshold)
221
+ if isinstance(noise_threshold,float) is False or noise_threshold < 0:
222
+ print('Error: noise_threshold must be a positive float.')
223
+ sys.exit()
224
+
225
+ if isinstance(wf_intensity,int) is True:
226
+ wf_intensity = float(wf_intensity)
227
+ if isinstance(wf_mz,int) is True:
228
+ wf_mz = float(wf_mz)
229
+ if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
230
+ print('Error: wf_mz and wf_intensity must be integers or floats')
231
+ sys.exit()
232
+
233
+ if entropy_dimension <= 0:
234
+ print('\nError: entropy_dimension should be a positive float')
235
+ sys.exit()
236
+ else:
237
+ q = entropy_dimension
238
+
239
+ normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
240
+
241
+ if y_axis_transformation not in ['normalized','none','log10','sqrt']:
242
+ print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
243
+ sys.exit()
244
+
245
+ if output_path is None:
246
+ print(f'Warning: plots will be saved to the svg ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
247
+ output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'
248
+
249
+
250
+ if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
251
+ query_idx = unique_query_ids.index(spectrum_ID1)
252
+ reference_idx = unique_query_ids.index(spectrum_ID2)
253
+ q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[query_idx])[0]
254
+ r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[reference_idx])[0]
255
+ q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
256
+ r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
257
+ elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
258
+ query_idx = unique_reference_ids.index(spectrum_ID1)
259
+ reference_idx = unique_reference_ids.index(spectrum_ID2)
260
+ q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[query_idx])[0]
261
+ r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[reference_idx])[0]
262
+ q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
263
+ r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
264
+ else:
265
+ if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
266
+ spec_tmp = spectrum_ID1
267
+ spectrum_ID1 = spectrum_ID2
268
+ spectrum_ID2 = spec_tmp
269
+ query_idx = unique_query_ids.index(spectrum_ID1)
270
+ reference_idx = unique_reference_ids.index(spectrum_ID2)
271
+ q_idxs_tmp = np.where(df_query['id'].astype(str) == unique_query_ids[query_idx])[0]
272
+ r_idxs_tmp = np.where(df_reference['id'].astype(str) == unique_reference_ids[reference_idx])[0]
273
+ q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
274
+ r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
275
+
276
+
277
+ q_spec_pre_trans = q_spec.copy()
278
+ r_spec_pre_trans = r_spec.copy()
279
+ q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
280
+ r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
281
+
282
+ if y_axis_transformation == 'normalized':
283
+ q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
284
+ r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
285
+ ylab = 'Normalized Intensity'
286
+ elif y_axis_transformation == 'log10':
287
+ q_spec_pre_trans[:,1] = np.log10(np.array(q_spec_pre_trans[:,1]+1,dtype=float))
288
+ r_spec_pre_trans[:,1] = np.log10(np.array(r_spec_pre_trans[:,1]+1,dtype=float))
289
+ ylab = 'log10(Intensity)'
290
+ elif y_axis_transformation == 'sqrt':
291
+ q_spec_pre_trans[:,1] = np.sqrt(np.array(q_spec_pre_trans[:,1],dtype=float))
292
+ r_spec_pre_trans[:,1] = np.sqrt(np.array(r_spec_pre_trans[:,1],dtype=float))
293
+ ylab = 'sqrt(Intensity)'
294
+ else:
295
+ ylab = 'Raw Intensity'
296
+
297
+ fig, axes = plt.subplots(nrows=2, ncols=1)
298
+
299
+ plt.subplot(2,1,1)
300
+ plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*q_spec_pre_trans.shape[0], ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID 1: {spectrum_ID1}')
301
+ plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*r_spec_pre_trans.shape[0], ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID 2: {spectrum_ID2}')
302
+ plt.xlabel('m/z',fontsize=7)
303
+ plt.ylabel(ylab, fontsize=7)
304
+ plt.xticks(fontsize=7)
305
+ plt.yticks(fontsize=7)
306
+ plt.title('Untransformed Spectra', fontsize=10)
307
+
308
+ mz_min_tmp_q = round(q_spec[:,0].min(),1)
309
+ mz_min_tmp_r = round(r_spec[:,0].min(),1)
310
+ int_min_tmp_q = round(q_spec[:,1].min(),1)
311
+ int_min_tmp_r = round(r_spec[:,1].min(),1)
312
+ mz_max_tmp_q = round(q_spec[:,0].max(),1)
313
+ mz_max_tmp_r = round(r_spec[:,0].max(),1)
314
+ int_max_tmp_q = round(q_spec[:,1].max(),1)
315
+ int_max_tmp_r = round(r_spec[:,1].max(),1)
316
+ mz_min_tmp = min([mz_min_tmp_q,mz_min_tmp_r])
317
+ mz_max_tmp = min([mz_max_tmp_q,mz_max_tmp_r])
318
+ int_min_tmp = min([int_min_tmp_q,int_min_tmp_r])
319
+ int_max_tmp = max([int_max_tmp_q,int_max_tmp_r])
320
+
321
+ is_matched = False
322
+ for transformation in spectrum_preprocessing_order:
323
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
324
+ q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
325
+ r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
326
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
327
+ m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
328
+ q_spec = m_spec[:,0:2]
329
+ r_spec = m_spec[:,[0,2]]
330
+ is_matched = True
331
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
332
+ q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
333
+ r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
334
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
335
+ q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
336
+ r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
337
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
338
+ q_spec = remove_noise(q_spec, nr = noise_threshold)
339
+ if high_quality_reference_library == False or high_quality_reference_library == 'False':
340
+ r_spec = remove_noise(r_spec, nr = noise_threshold)
341
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
342
+ q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
343
+ if high_quality_reference_library == False or high_quality_reference_library == 'False':
344
+ r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
345
+
346
+ q_ints = q_spec[:,1]
347
+ r_ints = r_spec[:,1]
348
+
349
+ if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
350
+ similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
351
+ else:
352
+ similarity_score = 0
353
+
354
+ plt.subplot(2,1,2)
355
+
356
+ if q_spec.shape[0] > 1:
357
+ if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
358
+ plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
359
+ plt.xticks([])
360
+ plt.yticks([])
361
+ else:
362
+ if y_axis_transformation == 'normalized':
363
+ q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
364
+ r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
365
+ ylab='Normalized Intensity'
366
+ elif y_axis_transformation == 'log10':
367
+ q_spec[:,1] = np.log10(q_spec[:,1]+1)
368
+ r_spec[:,1] = np.log10(r_spec[:,1]+1)
369
+ ylab='log10(Intensity)'
370
+ elif y_axis_transformation == 'sqrt':
371
+ q_spec[:,1] = np.sqrt(q_spec[:,1])
372
+ r_spec[:,1] = np.sqrt(r_spec[:,1])
373
+ ylab='sqrt(Intensity)'
374
+ else:
375
+ ylab = 'Raw Intensity'
376
+ plt.vlines(x=q_spec[:,0], ymin=[0]*q_spec.shape[0], ymax=q_spec[:,1], linewidth=3, color='blue')
377
+ plt.vlines(x=r_spec[:,0], ymin=[0]*r_spec.shape[0], ymax=-r_spec[:,1], linewidth=3, color='red')
378
+ plt.xlabel('m/z', fontsize=7)
379
+ plt.ylabel(ylab, fontsize=7)
380
+ plt.xticks(fontsize=7)
381
+ plt.yticks(fontsize=7)
382
+ plt.title(f'Transformed Spectra', fontsize=10)
383
+ else:
384
+ plt.text(0.5, 0.5, 'All points in the spectra were removed during preprocessing. \nChange the spectrum_preprocesing_order and/or change other spectrum-preprocessing parameters.', ha='center', va='center', fontsize=7, color='black')
385
+ plt.xticks([])
386
+ plt.yticks([])
387
+
388
+ plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
389
+ plt.figlegend(loc='upper center')
390
+
391
+ fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
392
+ fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
393
+ fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
394
+ fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
395
+ fig.text(0.05, 0.08, f'Window Size (Centroiding): {window_size_centroiding}', fontsize=7)
396
+ fig.text(0.05, 0.05, f'Window Size (Matching): {window_size_matching}', fontsize=7)
397
+ if similarity_measure == 'mixture':
398
+ fig.text(0.05, 0.02, f'Weights for mixture similarity: {weights}', fontsize=7)
399
+
400
+ fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{mz_min_tmp},{mz_max_tmp}]', fontsize=7)
401
+ fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
402
+ fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
403
+ fig.text(0.40, 0.11, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
404
+ fig.text(0.40, 0.08, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
405
+
406
+ if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
407
+ url_tmp1 = get_pubchem_url(query=spectrum_ID1)
408
+ url_tmp2 = get_pubchem_url(query=spectrum_ID2)
409
+ t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
410
+ t2 = fig.text(0.40, 0.02, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
411
+ t1.set_url(url_tmp1)
412
+ t2.set_url(url_tmp2)
413
+
414
+ if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
415
+ url_tmp1 = get_pubchem_url(query=spectrum_ID1)
416
+ t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
417
+ t1.set_url(url_tmp1)
418
+
419
+ if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
420
+ url_tmp2 = get_pubchem_url(query=spectrum_ID2)
421
+ t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
422
+ t2.set_url(url_tmp2)
423
+
424
+ fig.savefig(output_path, format='svg')
425
+
426
+ if return_plot == True:
427
+ return fig
428
+
429
+
430
+
431
+
432
+ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FNLW', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
433
+
434
+ if query_data is None:
435
+ print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
436
+ sys.exit()
437
+ else:
438
+ extension = query_data.rsplit('.',1)
439
+ extension = extension[(len(extension)-1)]
440
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
441
+ output_path_tmp = query_data[:-3] + 'txt'
442
+ build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
443
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
444
+ if extension == 'txt' or extension == 'TXT':
445
+ df_query = pd.read_csv(query_data, sep='\t')
446
+ unique_query_ids = df_query['id'].unique()
447
+
448
+ if reference_data is None:
449
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
450
+ sys.exit()
451
+ else:
452
+ extension = reference_data.rsplit('.',1)
453
+ extension = extension[(len(extension)-1)]
454
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
455
+ output_path_tmp = reference_data[:-3] + 'txt'
456
+ build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
457
+ df_reference = pd.read_csv(output_path_tmp, sep='\t')
458
+ if extension == 'txt' or extension == 'TXT':
459
+ df_reference = pd.read_csv(reference_data, sep='\t')
460
+ unique_reference_ids = df_reference['id'].unique()
461
+
462
+
463
+ if spectrum_ID1 is not None:
464
+ spectrum_ID1 = str(spectrum_ID1)
465
+ else:
466
+ spectrum_ID1 = str(df_query.iloc[0,0])
467
+ print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')
468
+
469
+ if spectrum_ID2 is not None:
470
+ spectrum_ID2 = str(spectrum_ID2)
471
+ else:
472
+ spectrum_ID2 = str(df_reference.iloc[0,0])
473
+ print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')
474
+
475
+ if spectrum_preprocessing_order is not None:
476
+ spectrum_preprocessing_order = list(spectrum_preprocessing_order)
477
+ else:
478
+ spectrum_preprocessing_order = ['F','N','W','L']
479
+ if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
480
+ print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
481
+ sys.exit()
482
+
483
+ if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
484
+ print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
485
+ sys.exit()
486
+
487
+ if isinstance(int_min,int) is True:
488
+ int_min = float(int_min)
489
+ if isinstance(int_max,int) is True:
490
+ int_max = float(int_max)
491
+ if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
492
+ print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
493
+ sys.exit()
494
+ if mz_min < 0:
495
+ print('\nError: mz_min should be a non-negative integer')
496
+ sys.exit()
497
+ if mz_max <= 0:
498
+ print('\nError: mz_max should be a positive integer')
499
+ sys.exit()
500
+ if int_min < 0:
501
+ print('\nError: int_min should be a non-negative float')
502
+ sys.exit()
503
+ if int_max <= 0:
504
+ print('\nError: int_max should be a positive float')
505
+ sys.exit()
506
+
507
+ if isinstance(noise_threshold,int) is True:
508
+ noise_threshold = float(noise_threshold)
509
+ if isinstance(noise_threshold,float) is False or noise_threshold < 0:
510
+ print('Error: noise_threshold must be a positive float.')
511
+ sys.exit()
512
+
513
+ if isinstance(wf_intensity,int) is True:
514
+ wf_intensity = float(wf_intensity)
515
+ if isinstance(wf_mz,int) is True:
516
+ wf_mz = float(wf_mz)
517
+ if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
518
+ print('Error: wf_mz and wf_intensity must be integers or floats')
519
+ sys.exit()
520
+
521
+ if entropy_dimension <= 0:
522
+ print('\nError: entropy_dimension should be a positive float')
523
+ sys.exit()
524
+ else:
525
+ q = entropy_dimension
526
+
527
+ normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
528
+
529
+ if y_axis_transformation not in ['normalized','none','log10','sqrt']:
530
+ print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
531
+ sys.exit()
532
+
533
+ if output_path is None:
534
+ print(f'Warning: plots will be saved to the svg ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
535
+ output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'
536
+
537
+ min_mz = np.min([df_query['mz_ratio'].min(), df_reference['mz_ratio'].min()])
538
+ max_mz = np.max([df_query['mz_ratio'].max(), df_reference['mz_ratio'].max()])
539
+ mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
540
+
541
+ unique_query_ids = df_query['id'].unique().tolist()
542
+ unique_reference_ids = df_reference['id'].unique().tolist()
543
+ unique_query_ids = [str(ID) for ID in unique_query_ids]
544
+ unique_reference_ids = [str(ID) for ID in unique_reference_ids]
545
+ common_IDs = np.intersect1d([str(ID) for ID in unique_query_ids], [str(ID) for ID in unique_reference_ids])
546
+ if len(common_IDs) > 0:
547
+ print(f'Warning: the query and reference library have overlapping IDs: {common_IDs}')
548
+
549
+ if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
550
+ q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID1)[0]
551
+ r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID2)[0]
552
+ q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
553
+ r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
554
+ elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
555
+ q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID1)[0]
556
+ r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID2)[0]
557
+ q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
558
+ r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
559
+ else:
560
+ if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
561
+ spec_tmp = spectrum_ID1
562
+ spectrum_ID1 = spectrum_ID2
563
+ spectrum_ID2 = spec_tmp
564
+ q_idxs_tmp = np.where(df_query['id'].astype(str) == spectrum_ID1)[0]
565
+ r_idxs_tmp = np.where(df_reference['id'].astype(str) == spectrum_ID2)[0]
566
+ q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
567
+ r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
568
+
569
+ q_spec = convert_spec(q_spec,mzs)
570
+ r_spec = convert_spec(r_spec,mzs)
571
+
572
+ int_min_tmp_q = min(q_spec[q_spec[:,1].nonzero(),1][0])
573
+ int_min_tmp_r = min(r_spec[r_spec[:,1].nonzero(),1][0])
574
+ int_max_tmp_q = max(q_spec[q_spec[:,1].nonzero(),1][0])
575
+ int_max_tmp_r = max(r_spec[r_spec[:,1].nonzero(),1][0])
576
+ int_min_tmp = int(min([int_min_tmp_q,int_min_tmp_r]))
577
+ int_max_tmp = int(max([int_max_tmp_q,int_max_tmp_r]))
578
+
579
+ fig, axes = plt.subplots(nrows=2, ncols=1)
580
+
581
+ plt.subplot(2,1,1)
582
+
583
+ if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
584
+ plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
585
+ plt.xticks([])
586
+ plt.yticks([])
587
+ else:
588
+ q_spec_pre_trans = q_spec.copy()
589
+ r_spec_pre_trans = r_spec.copy()
590
+ q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
591
+ r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
592
+
593
+ if y_axis_transformation == 'normalized':
594
+ q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
595
+ r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
596
+ ylab = 'Normalized Intensity'
597
+ elif y_axis_transformation == 'log10':
598
+ q_spec_pre_trans[:,1] = np.log10(q_spec_pre_trans[:,1]+1)
599
+ r_spec_pre_trans[:,1] = np.log10(r_spec_pre_trans[:,1]+1)
600
+ ylab = 'log10(Intensity)'
601
+ elif y_axis_transformation == 'sqrt':
602
+ q_spec_pre_trans[:,1] = np.sqrt(q_spec_pre_trans[:,1])
603
+ r_spec_pre_trans[:,1] = np.sqrt(r_spec_pre_trans[:,1])
604
+ ylab = 'sqrt(Intensity)'
605
+ else:
606
+ ylab = 'Raw Intensity'
607
+ plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*len(q_spec_pre_trans[:,0]), ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID1: {spectrum_ID1}')
608
+ plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*len(r_spec_pre_trans[:,0]), ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID2: {spectrum_ID2}')
609
+ plt.xlabel('m/z',fontsize=7)
610
+ plt.ylabel(ylab, fontsize=7)
611
+ plt.xticks(fontsize=7)
612
+ plt.yticks(fontsize=7)
613
+ plt.title('Untransformed Query and Reference Spectra', fontsize=10)
614
+
615
+ for transformation in spectrum_preprocessing_order:
616
+ if transformation == 'W':
617
+ q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
618
+ r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
619
+ if transformation == 'L':
620
+ q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method)
621
+ r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method)
622
+ if transformation == 'N':
623
+ q_spec = remove_noise(q_spec, nr = noise_threshold)
624
+ if high_quality_reference_library == False or high_quality_reference_library == 'False':
625
+ r_spec = remove_noise(r_spec, nr = noise_threshold)
626
+ if transformation == 'F':
627
+ q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
628
+ if high_quality_reference_library == False or high_quality_reference_library == 'False':
629
+ r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
630
+
631
+ if q_spec.shape[0] > 1:
632
+ similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
633
+ else:
634
+ similarity_score = 0
635
+
636
+
637
+ plt.subplot(2,1,2)
638
+
639
+ if q_spec.shape[0] == 0 or r_spec.shape[0] == 0:
640
+ plt.text(0.5, 0.5, 'The query and/or reference spectrum has no ion fragments left after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
641
+ plt.xticks([])
642
+ plt.yticks([])
643
+ elif np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
644
+ plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
645
+ plt.xticks([])
646
+ plt.yticks([])
647
+ else:
648
+ if y_axis_transformation == 'normalized':
649
+ q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
650
+ r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
651
+ ylab='Normalized Intensity'
652
+ elif y_axis_transformation == 'log10':
653
+ q_spec[:,1] = np.log10(q_spec[:,1]+1)
654
+ r_spec[:,1] = np.log10(r_spec[:,1]+1)
655
+ ylab='log10(Intensity)'
656
+ elif y_axis_transformation == 'sqrt':
657
+ q_spec[:,1] = np.sqrt(q_spec[:,1])
658
+ r_spec[:,1] = np.sqrt(r_spec[:,1])
659
+ ylab='sqrt(Intensity)'
660
+ else:
661
+ ylab = 'Raw Intensity'
662
+ plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=q_spec[:,1], linewidth=3, color='blue')
663
+ plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=-r_spec[:,1], linewidth=3, color='red')
664
+ plt.xlabel('m/z', fontsize=7)
665
+ plt.ylabel(ylab, fontsize=7)
666
+ plt.xticks(fontsize=7)
667
+ plt.yticks(fontsize=7)
668
+ plt.title(f'Transformed Query and Reference Spectra', fontsize=10)
669
+
670
+ plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
671
+ plt.figlegend(loc='upper center')
672
+
673
+ fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
674
+ fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
675
+ fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
676
+ fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
677
+ fig.text(0.05, 0.08, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
678
+ if similarity_measure == 'mixture':
679
+ fig.text(0.05, 0.05, f'Weights for mixture similarity: {weights}', fontsize=7)
680
+
681
+ fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{min_mz},{max_mz}]', fontsize=7)
682
+ fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
683
+ fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
684
+ fig.text(0.40, 0.11, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
685
+
686
+ if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
687
+ url_tmp1 = get_pubchem_url(query=spectrum_ID1)
688
+ url_tmp2 = get_pubchem_url(query=spectrum_ID2)
689
+ t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
690
+ t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
691
+ t1.set_url(url_tmp1)
692
+ t2.set_url(url_tmp2)
693
+
694
+ if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
695
+ url_tmp1 = get_pubchem_url(query=spectrum_ID1)
696
+ t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
697
+ t1.set_url(url_tmp1)
698
+
699
+ if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
700
+ url_tmp2 = get_pubchem_url(query=spectrum_ID2)
701
+ t2 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
702
+ t2.set_url(url_tmp2)
703
+
704
+ fig.savefig(output_path, format='svg')
705
+
706
+ if return_plot == True:
707
+ return fig
708
+
709
+
710
def wf_transform(spec_mzs, spec_ints, wf_mz, wf_int):
    """Weight-factor transform: intensity' = mz**wf_mz * intensity**wf_int."""
    return np.power(spec_mzs, wf_mz) * np.power(spec_ints, wf_int)
713
+
714
+
715
def LE_transform(intensity, thresh, normalization_method):
    """Low-entropy transform of an intensity vector.

    Normalizes the intensities, computes their Shannon entropy S, and when
    0 < S < thresh sharpens the (normalized) spectrum by raising it to the
    power w = (1 + S) / (1 + thresh). A zero-sum spectrum is replaced by an
    explicit zero vector; otherwise the input is returned as-is.

    NOTE(review): normalize() can divide `intensity` in place, so the
    caller's array may be modified even on the pass-through path — confirm
    this aliasing is intended.
    """
    intensity_tmp = normalize(intensity, method=normalization_method)
    if np.sum(intensity_tmp) > 0:
        S = scipy.stats.entropy(intensity_tmp.astype('float'))
        if S > 0 and S < thresh:
            # Exponent < 1 boosts small peaks of low-entropy (peaky) spectra.
            w = (1 + S) / (1 + thresh)
            intensity = np.power(intensity_tmp, w)
    else:
        # Degenerate spectrum: return all zeros of the same length.
        intensity = np.zeros(len(intensity))
    return intensity
725
+
726
+
727
def normalize(intensities, method='standard'):
    """Normalize an intensity vector to sum to 1.

    method='standard': divide by the total (in place when the array dtype
        allows it, mutating the caller's array).
    method='softmax': exp-normalize, exp(x) / sum(exp(x)); falls back to
        standard normalization when any intensity exceeds 700 because
        exp() would overflow float64.

    A zero-sum input is returned unchanged.
    """
    if np.sum(intensities) > 0:
        if method == 'softmax':
            if np.any(intensities > 700):
                print("Warning: some intensities are too large to exponentiate. Applying standard normalization.")
                intensities /= np.sum(intensities)
            else:
                intensities2 = np.exp(intensities)
                if np.isinf(intensities2).sum() == 0:
                    # Bug fix: a true softmax divides the exponentiated values;
                    # previously the *raw* intensities were divided by
                    # sum(exp(x)), which does not yield a distribution.
                    intensities = intensities2 / np.sum(intensities2)
        elif method == 'standard':
            intensities /= np.sum(intensities)
    return(intensities)
740
+
741
+
742
def filter_spec_lcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999, is_matched = False):
    """Filter an LC-MS spectrum (n x 2 array of [m/z, intensity]).

    is_matched=False: rows outside the m/z window or the intensity window
        are dropped from the array.
    is_matched=True: rows outside the m/z window are dropped, then rows are
        zeroed based on intensity.

    NOTE(review): in the matched branch the comparisons zero rows whose
    intensity is >= int_min and <= int_max, i.e. rows *inside* the window,
    which looks inverted relative to the unmatched branch — confirm intent.
    Also note the assignment zeroes whole rows (m/z included), not only the
    intensity column.
    """
    if is_matched == False:
        spec = spec[spec[:,0] >= mz_min]
        spec = spec[spec[:,0] <= mz_max]
        spec = spec[spec[:,1] >= int_min]
        spec = spec[spec[:,1] <= int_max]
    else:
        spec = spec[spec[:,0] >= mz_min]
        spec = spec[spec[:,0] <= mz_max]
        spec[spec[:,1] >= int_min] = 0
        spec[spec[:,1] <= int_max] = 0
    return(spec)
754
+
755
+
756
def filter_spec_gcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999):
    """Zero (not drop) the intensity of peaks whose m/z or intensity falls
    outside the given windows, keeping the m/z grid intact for GC-MS data."""
    spec[spec[:, 0] < mz_min, 1] = 0
    spec[spec[:, 0] > mz_max, 1] = 0
    spec[spec[:, 1] < int_min, 1] = 0
    spec[spec[:, 1] > int_max, 1] = 0
    return spec
762
+
763
+
764
def remove_noise(spec, nr):
    """Zero every peak whose intensity is below nr * max(intensity).

    Note: the whole row is zeroed (m/z included). Spectra with fewer than
    two peaks, or nr=None, pass through unchanged.
    """
    if spec.shape[0] > 1 and nr is not None:
        cutoff = np.max(spec[:, 1]) * nr
        spec[spec[:, 1] < cutoff] = 0
    return spec
770
+
771
+
772
def centroid_spectrum(spec, window_size):
    """Merge peaks closer than window_size into intensity-weighted centroids.

    spec: n x 2 array [m/z, intensity]. window_size: maximum m/z gap (same
    units as the m/z column) for peaks to be merged. Returns a new array
    sorted by m/z. If centroiding collapses the spectrum to fewer than two
    peaks, [[0, 0]] is returned — NOTE(review): a single surviving centroid
    is discarded; confirm that is intended.
    """
    spec = spec[np.argsort(spec[:,0])]

    mz_array = spec[:, 0]
    need_centroid = 0
    # Centroid only if some pair of adjacent peaks lies within the window.
    if mz_array.shape[0] > 1:
        mz_delta = mz_array[1:] - mz_array[:-1]
        if np.min(mz_delta) <= window_size:
            need_centroid = 1

    if need_centroid:
        # Visit peaks from most to least intense so each centroid is anchored
        # on the locally strongest peak; merged peaks are zeroed so they are
        # not reused by a later anchor.
        intensity_order = np.argsort(-spec[:, 1])
        spec_new = []
        for i in intensity_order:
            mz_delta_allowed = window_size

            if spec[i, 1] > 0:
                # Expand left to the start of the contiguous run within the window.
                i_left = i - 1
                while i_left >= 0:
                    mz_delta_left = spec[i, 0] - spec[i_left, 0]
                    if mz_delta_left <= mz_delta_allowed:
                        i_left -= 1
                    else:
                        break
                i_left += 1

                # Expand right (i_right ends one past the run, suiting slicing).
                i_right = i + 1
                while i_right < spec.shape[0]:
                    mz_delta_right = spec[i_right, 0] - spec[i, 0]
                    if mz_delta_right <= mz_delta_allowed:
                        i_right += 1
                    else:
                        break

                # Intensity-weighted mean m/z and summed intensity of the run.
                intensity_sum = np.sum(spec[i_left:i_right, 1])
                intensity_weighted_sum = np.sum(spec[i_left:i_right, 0] * spec[i_left:i_right, 1])

                spec_new.append([intensity_weighted_sum / intensity_sum, intensity_sum])
                spec[i_left:i_right, 1] = 0

        spec_new = np.array(spec_new)
        spec_new = spec_new[np.argsort(spec_new[:, 0])]
        if spec_new.shape[0] > 1:
            # NOTE(review): this second sort is redundant — the array was
            # already sorted just above.
            spec_new = spec_new[np.argsort(spec_new[:, 0])]
            return spec_new
        else:
            return np.array([[0,0]])
    else:
        return spec
821
+
822
+
823
+
824
def match_peaks_in_spectra(spec_a, spec_b, window_size):
    """Align two spectra on m/z within +/- window_size.

    spec_a, spec_b: n x 2 arrays [m/z, intensity], assumed sorted by m/z
    ascending. Returns an m x 3 float64 array [m/z, intensity_a, intensity_b]
    where spec_b peaks within window_size of a spec_a peak are summed into
    that row (the reported m/z is spec_a's). Unmatched peaks appear with 0
    in the other column. Returns [[0, 0, 0]] when nothing was merged.
    """
    a = 0
    b = 0

    spec_merged = []
    peak_b_int = 0.  # running spec_b intensity matched to the current spec_a peak
    while a < spec_a.shape[0] and b < spec_b.shape[0]:
        mass_delta = spec_a[a, 0] - spec_b[b, 0]

        if mass_delta < -window_size:
            # spec_a's peak is below the window: emit it with what matched so far.
            spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
            peak_b_int = 0.
            a += 1
        elif mass_delta > window_size:
            # spec_b's peak is below the window: emit it unmatched.
            spec_merged.append([spec_b[b, 0], 0., spec_b[b, 1]])
            b += 1
        else:
            # Within the window: accumulate spec_b intensity onto spec_a[a].
            peak_b_int += spec_b[b, 1]
            b += 1

    # Flush a pending partially-matched spec_a peak.
    if peak_b_int > 0.:
        spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
        peak_b_int = 0.
        a += 1

    # Flush whatever remains of either spectrum, unmatched.
    if b < spec_b.shape[0]:
        spec_merged += [[x[0], 0., x[1]] for x in spec_b[b:]]

    if a < spec_a.shape[0]:
        spec_merged += [[x[0], x[1], 0.] for x in spec_a[a:]]

    if spec_merged:
        spec_merged = np.array(spec_merged, dtype=np.float64)
    else:
        spec_merged = np.array([[0., 0., 0.]], dtype=np.float64)
    return spec_merged
860
+
861
+
862
+
863
def convert_spec(spec, mzs):
    """Project a spectrum onto a fixed m/z grid.

    spec: n x 2 array [m/z, intensity]. mzs: sequence of grid m/z values.
    Returns a len(mzs) x 2 array whose intensity is 0 wherever the spectrum
    has no peak at exactly that m/z; on duplicate m/z rows the first
    occurrence wins (as before).
    """
    # Build the lookup once instead of scanning `spec` for every grid point:
    # O(n + m) instead of the previous O(n * m).
    first_intensity = {}
    for row in spec:
        first_intensity.setdefault(row[0], row[1])
    ints_tmp = [first_intensity.get(mz, 0) for mz in mzs]
    return np.transpose(np.array([mzs, ints_tmp]))
873
+
874
+
875
def get_reference_df(reference_data, likely_reference_IDs=None):
    """Load a reference spectral library into a DataFrame.

    reference_data: path to a raw-data file (mgf/mzML/cdf/msp/json — first
        converted to the tab-separated library format on disk) or a ready
        tab-separated .txt library.
    likely_reference_IDs: optional path to a headerless CSV of IDs; when
        given, the library is restricted to rows whose first column matches.

    Raises ValueError for an unsupported extension (previously this fell
    through and later raised a confusing NameError).
    """
    extension = reference_data.rsplit('.', 1)[-1].lower()
    if extension in ('mgf', 'mzml', 'cdf', 'msp', 'json'):
        # Keep the historical temp-file naming (last 3 chars replaced).
        output_path_tmp = reference_data[:-3] + 'txt'
        build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
        df_reference = pd.read_csv(output_path_tmp, sep='\t')
    elif extension == 'txt':
        df_reference = pd.read_csv(reference_data, sep='\t')
    else:
        raise ValueError(f'Unsupported reference_data file extension: {extension}')
    if likely_reference_IDs is not None:
        likely_reference_IDs = pd.read_csv(likely_reference_IDs, header=None)
        df_reference = df_reference.loc[df_reference.iloc[:,0].isin(likely_reference_IDs.iloc[:,0].tolist())]
    return df_reference
888
+
889
+
890
+
891
def S_cos(ints_a, ints_b):
    """Cosine similarity of two intensity vectors; 0 if either sums to 0."""
    if np.sum(ints_a) == 0 or np.sum(ints_b) == 0:
        return(0)
    norm_a = np.sqrt(np.sum(np.power(ints_a, 2)))
    norm_b = np.sqrt(np.sum(np.power(ints_b, 2)))
    return np.dot(ints_a, ints_b) / (norm_a * norm_b)
896
+
897
+
898
def ent_renyi(ints, q):
    """Renyi entropy of order q of an intensity distribution."""
    return np.log(np.sum(np.power(ints, q))) / (1 - q)
900
+
901
+
902
def ent_tsallis(ints, q):
    """Tsallis entropy of order q of an intensity distribution."""
    return (np.sum(np.power(ints, q)) - 1) / (1 - q)
904
+
905
+
906
def S_shannon(ints_a, ints_b):
    """Shannon entropy similarity: 1 for identical distributions, lower as
    the merged spectrum's entropy exceeds the individual entropies."""
    entropy_a = scipy.stats.entropy(ints_a)
    entropy_b = scipy.stats.entropy(ints_b)
    entropy_merged = scipy.stats.entropy(ints_a + ints_b)
    excess = 2 * entropy_merged - entropy_a - entropy_b
    return(1 - excess / np.log(4))
911
+
912
+
913
def S_renyi(ints_a, ints_b, q):
    """Renyi entropy similarity of order q (Shannon fallback at q == 1)."""
    if q == 1:
        print('Warning: the Renyi Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
        return S_shannon(ints_a, ints_b)
    ent_a = ent_renyi(ints_a, q)
    ent_b = ent_renyi(ints_b, q)
    ent_merged = ent_renyi(ints_a / 2 + ints_b / 2, q)
    # Normalization constant bounding the entropy excess.
    pow_a = np.sum(np.power(ints_a, q))
    pow_b = np.sum(np.power(ints_b, q))
    pow_halves = np.sum(np.power(ints_a / 2, q)) + np.sum(np.power(ints_b / 2, q))
    N = (1 / (1 - q)) * (2 * np.log(pow_halves) - np.log(pow_a) - np.log(pow_b))
    return 1 - (2 * ent_merged - ent_a - ent_b) / N
923
+
924
+
925
def S_tsallis(ints_a, ints_b, q):
    """Tsallis entropy similarity of order q (Shannon fallback at q == 1)."""
    if q == 1:
        print('Warning: the Tsallis Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
        return S_shannon(ints_a, ints_b)
    ent_a = ent_tsallis(ints_a, q)
    ent_b = ent_tsallis(ints_b, q)
    ent_merged = ent_tsallis(ints_a / 2 + ints_b / 2, q)
    # Normalization constant bounding the entropy excess.
    N = np.sum(2 * np.power(ints_a / 2, q) + 2 * np.power(ints_b / 2, q) - np.power(ints_a, q) - np.power(ints_b, q)) / (1 - q)
    return 1 - (2 * ent_merged - ent_a - ent_b) / N
935
+
936
def S_mixture(ints_a, ints_b, weights={'Cosine':0.25, 'Shannon':0.25, 'Renyi':0.25, 'Tsallis':0.25}, q=1.1):
    """Weighted combination of the cosine and entropy similarity measures.

    weights maps measure names (Cosine/Shannon/Renyi/Tsallis) to weights;
    q is the entropy dimension passed to the Renyi/Tsallis measures.
    """
    if not set(weights.keys()).issubset({'Cosine', 'Shannon', 'Renyi', 'Tsallis'}):
        print('Error: the keys to the weight parameter dict of the function S_mixture must be one of the four: Cosine, Shannon, Renyi, Tsallis')
        sys.exit()

    measures = {
        'Cosine': lambda: S_cos(ints_a, ints_b),
        'Shannon': lambda: S_shannon(ints_a, ints_b),
        'Renyi': lambda: S_renyi(ints_a, ints_b, q),
        'Tsallis': lambda: S_tsallis(ints_a, ints_b, q),
    }
    similarity = 0
    for key, value in weights.items():
        similarity += value * measures[key]()
    return similarity
952
+
953
+
954
def get_contingency_entries(ints_a, ints_b):
    """Presence/absence contingency counts [a, b, c] for two aligned
    intensity vectors: a = peaks only in a, b = only in b, c = shared."""
    only_a = 0
    only_b = 0
    shared = 0
    for x, y in zip(ints_a, ints_b):
        if x != 0 and y != 0:
            shared += 1
        elif x != 0:
            only_a += 1
        elif y != 0:
            only_b += 1
    return [only_a, only_b, shared]
967
+
968
+
969
def S_jaccard(ints_a, ints_b):
    """Jaccard similarity on peak presence/absence: c / (a + b + c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = only_a + only_b + shared
    return shared / denom if denom != 0 else 0
980
+
981
+
982
def S_dice(ints_a, ints_b):
    """Dice similarity on peak presence/absence: 2c / (a + b + 2c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = only_a + only_b + 2 * shared
    return 2 * shared / denom if denom != 0 else 0
993
+
994
+
995
def S_3w_jaccard(ints_a, ints_b):
    """3W-Jaccard similarity on peak presence/absence: 3c / (a + b + 3c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = only_a + only_b + 3 * shared
    return 3 * shared / denom if denom != 0 else 0
1006
+
1007
+
1008
def S_sokal_sneath(ints_a, ints_b):
    """Sokal-Sneath similarity on peak presence/absence: c / (2a + 2b + c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = 2 * only_a + 2 * only_b + shared
    return shared / denom if denom != 0 else 0
1019
+
1020
+
1021
def S_binary_cosine(ints_a, ints_b):
    """Binary (Ochiai) cosine similarity: c / sqrt((a + c) * (b + c))."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = np.sqrt((only_a + shared) * (only_b + shared))
    return shared / denom if denom != 0 else 0
1032
+
1033
+
1034
def S_mountford(ints_a, ints_b):
    """Mountford similarity: 2c / (c(a + b) + 2ab); 1 when the denominator
    is 0 (no unshared peaks)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = shared * (only_a + only_b) + 2 * only_a * only_b
    return 2 * shared / denom if denom != 0 else 1
1045
+
1046
+
1047
def S_mcconnaughey(ints_a, ints_b):
    """McConnaughey similarity: (c^2 - ab) / ((a + c)(b + c))."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = (only_a + shared) * (only_b + shared)
    return (shared ** 2 - only_a * only_b) / denom if denom != 0 else 0
1058
+
1059
+
1060
def S_driver_kroeber(ints_a, ints_b):
    """Driver-Kroeber similarity: c(a + b + 2c) / (2(a + c)(b + c))."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = 2 * (only_a + shared) * (only_b + shared)
    return shared * (only_a + only_b + 2 * shared) / denom if denom != 0 else 0
1071
+
1072
+
1073
def S_simpson(ints_a, ints_b):
    """Simpson similarity: c / min(a + c, b + c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = min(only_a + shared, only_b + shared)
    return shared / denom if denom != 0 else 0
1084
+
1085
+
1086
def S_braun_banquet(ints_a, ints_b):
    """Braun-Blanquet similarity: c / max(a + c, b + c)."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = max(only_a + shared, only_b + shared)
    return shared / denom if denom != 0 else 0
1097
+
1098
+
1099
def S_fager_mcgowan(ints_a, ints_b):
    """Fager-McGowan similarity: c / sqrt((a+c)(b+c)) - 1 / (2 sqrt(max(a+c, b+c)))."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    geom = np.sqrt((only_a + shared) * (only_b + shared))
    correction = 2 * np.sqrt(max(only_a + shared, only_b + shared))
    if geom == 0 or correction == 0:
        return 0
    return shared / geom - 1 / correction
1111
+
1112
+
1113
def S_kulczynski(ints_a, ints_b):
    """Kulczynski similarity: c / (a + b); 1 when there are no unshared peaks."""
    only_a, only_b, shared = get_contingency_entries(ints_a, ints_b)
    denom = only_a + only_b
    return shared / denom if denom != 0 else 1
1124
+
1125
+
1126
def S_intersection(ints_a, ints_b):
    """Count of peaks present (nonzero) in both spectra."""
    return get_contingency_entries(ints_a, ints_b)[2]
1130
+
1131
+
1132
def S_hamming(ints_a, ints_b):
    """Hamming-style similarity: 1 / (a + b); 1 when there are no unshared peaks."""
    only_a, only_b, _ = get_contingency_entries(ints_a, ints_b)
    denom = only_a + only_b
    return 1 / denom if denom != 0 else 1
1142
+
1143
+
1144
def S_hellinger(ints_a, ints_b):
    """Hellinger-style similarity on peak presence/absence:
    1 - sqrt(1 - c / sqrt((a + c) * (b + c))).

    Returns 0 when neither spectrum has a nonzero peak — previously the
    zero denominator produced a divide-by-zero NaN, unlike every other
    binary measure in this file, which guards its denominator.
    """
    tmp = get_contingency_entries(ints_a, ints_b)
    a = tmp[0]
    b = tmp[1]
    c = tmp[2]
    denom = np.sqrt((a + c) * (b + c))
    if denom == 0:
        similarity = 0
    else:
        similarity = 1 - np.sqrt(1 - c / denom)
    return similarity
1151
+
1152
+
1153
def get_similarity(similarity_measure, q_ints, r_ints, weights, q):
    """Dispatch to the requested similarity measure.

    similarity_measure: measure name (e.g. 'cosine', 'shannon', 'jaccard').
    q_ints, r_ints: aligned query/reference intensity vectors.
    weights: mixture weights (used only by 'mixture').
    q: entropy dimension (used by renyi/tsallis/mixture).
    """
    # The entropy-based measures operate on probability distributions.
    if similarity_measure in ('shannon', 'renyi', 'tsallis'):
        q_ints = normalize(q_ints, method = 'standard')
        r_ints = normalize(r_ints, method = 'standard')

    binary_measures = {
        'jaccard': S_jaccard,
        'dice': S_dice,
        '3w_jaccard': S_3w_jaccard,
        'sokal_sneath': S_sokal_sneath,
        'binary_cosine': S_binary_cosine,
        'mountford': S_mountford,
        'mcconnaughey': S_mcconnaughey,
        'driver_kroeber': S_driver_kroeber,
        'simpson': S_simpson,
        'braun_banquet': S_braun_banquet,
        'fager_mcgowan': S_fager_mcgowan,
        'kulczynski': S_kulczynski,
        'intersection': S_intersection,
        'hamming': S_hamming,
        'hellinger': S_hellinger,
    }

    if similarity_measure == 'cosine':
        similarity = S_cos(q_ints, r_ints)
    elif similarity_measure == 'shannon':
        similarity = S_shannon(q_ints, r_ints)
    elif similarity_measure == 'renyi':
        similarity = S_renyi(q_ints, r_ints, q)
    elif similarity_measure == 'tsallis':
        similarity = S_tsallis(q_ints, r_ints, q)
    elif similarity_measure == 'mixture':
        similarity = S_mixture(q_ints, r_ints, weights, q)
    elif similarity_measure in binary_measures:
        similarity = binary_measures[similarity_measure](q_ints, r_ints)

    # An unknown measure raises UnboundLocalError here, as before.
    return similarity
1217
+
1218
+
1219
+ def _vector_to_full_params(X, default_params, optimize_params):
1220
+ params = default_params.copy()
1221
+ for name, val in zip(optimize_params, X):
1222
+ params[name] = float(val)
1223
+ return params
1224
+
1225
+
1226
def objective_function_HRMS(X, ctx):
    """Differential-evolution objective for HRMS data: returns 1 - accuracy.

    X: optimizer vector, aligned with ctx['optimize_params'].
    ctx: dict of fixed matching settings plus 'default_params' holding the
        values of parameters not being optimized.
    """
    # Expand the optimizer vector into a full parameter dict.
    p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
    acc = get_acc_HRMS(
        ctx["df_query"], ctx["df_reference"],
        ctx["precursor_ion_mz_tolerance"], ctx["ionization_mode"], ctx["adduct"],
        ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
        ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
        p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
        p["wf_mz"], p["wf_int"], p["LET_threshold"],
        p["entropy_dimension"],
        ctx["high_quality_reference_library"],
        verbose=False
    )
    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
    # differential_evolution minimizes, so return the misclassification rate.
    return 1.0 - acc
1241
+
1242
def objective_function_NRMS(X, ctx):
    """Differential-evolution objective for NRMS data: returns 1 - accuracy.

    X: optimizer vector, aligned with ctx['optimize_params'].
    ctx: dict of fixed matching settings plus 'default_params'; must also
        provide 'unique_query_ids' and 'unique_reference_ids'.
    """
    # Expand the optimizer vector into a full parameter dict.
    p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
    acc = get_acc_NRMS(
        ctx["df_query"], ctx["df_reference"],
        ctx["unique_query_ids"], ctx["unique_reference_ids"],
        ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
        ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
        p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
        ctx["high_quality_reference_library"],
        verbose=False
    )
    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
    # differential_evolution minimizes, so return the misclassification rate.
    return 1.0 - acc
1255
+
1256
+
1257
+
1258
def tune_params_DE(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, chromatography_platform='HRMS', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=None, param_bounds=None, default_params=None, maxiters=3, de_workers=1):
    """Tune spectral-matching parameters with SciPy differential evolution.

    Minimizes 1 - accuracy over the parameters named in optimize_params
    within param_bounds; parameters not optimized keep their values from
    default_params. query_data/reference_data are file paths (reference_data
    may also be a list of paths). chromatography_platform selects the HRMS
    or NRMS objective. Results are printed and logged; nothing is returned.
    """
    # Fill mutable defaults here rather than in the signature so the default
    # containers are not shared between calls (mutable-default pitfall).
    if optimize_params is None:
        optimize_params = ["window_size_centroiding", "window_size_matching", "noise_threshold", "wf_mz", "wf_int", "LET_threshold", "entropy_dimension"]
    if param_bounds is None:
        param_bounds = {"window_size_centroiding": (0.0, 0.5), "window_size_matching": (0.0, 0.5), "noise_threshold": (0.0, 0.25), "wf_mz": (0.0, 5.0), "wf_int": (0.0, 5.0), "LET_threshold": (0.0, 5.0), "entropy_dimension": (1.0, 3.0)}
    if default_params is None:
        default_params = {"window_size_centroiding": 0.5, "window_size_matching": 0.5, "noise_threshold": 0.10, "wf_mz": 0.0, "wf_int": 1.0, "LET_threshold": 0.0, "entropy_dimension": 1.1}

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    extension = query_data.rsplit('.', 1)[-1].lower()
    if extension in ('mgf', 'mzml', 'cdf', 'msp', 'json'):
        # Convert raw data to the tab-separated library format on disk first.
        output_path_tmp = query_data[:-3] + 'txt'
        build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
        df_query = pd.read_csv(output_path_tmp, sep='\t')
    elif extension == 'txt':
        df_query = pd.read_csv(query_data, sep='\t')
    else:
        # Previously an unknown extension left df_query unbound (NameError).
        print(f'\nError: unsupported query_data file extension: {extension}')
        sys.exit()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    if isinstance(reference_data, str):
        df_reference = get_reference_df(reference_data=reference_data)
    else:
        # A list of reference libraries: load each and stack them.
        df_reference = pd.concat([get_reference_df(reference_data=f) for f in reference_data], axis=0, ignore_index=True)

    # Restrict the reference library to the requested ionization mode /
    # adduct when those columns are present.
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode is not None and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode'] == ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct is not None and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct'] == adduct]

    unique_query_ids = df_query['id'].unique().tolist()
    unique_reference_ids = df_reference['id'].unique().tolist()

    ctx = dict(
        df_query=df_query,
        df_reference=df_reference,
        # Bug fix: objective_function_NRMS reads these two keys; omitting
        # them raised KeyError on the NRMS path.
        unique_query_ids=unique_query_ids,
        unique_reference_ids=unique_reference_ids,
        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
        ionization_mode=ionization_mode,
        adduct=adduct,
        similarity_measure=similarity_measure,
        weights=weights,
        spectrum_preprocessing_order=spectrum_preprocessing_order,
        mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max,
        high_quality_reference_library=high_quality_reference_library,
        default_params=default_params,
        optimize_params=optimize_params,
    )

    bounds = [param_bounds[p] for p in optimize_params]

    objective = objective_function_HRMS if chromatography_platform == 'HRMS' else objective_function_NRMS
    result = differential_evolution(objective, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)

    best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
    best_acc = 100.0 - (result.fun * 100.0)

    print("\n=== Differential Evolution Result ===")
    print(f"Optimized over: {optimize_params}")
    print("Best values (selected params):")
    for name in optimize_params:
        print(f"  {name}: {best_full_params[name]}")
    print("\nFull parameter set used in final evaluation:")
    for k, v in best_full_params.items():
        print(f"  {k}: {v}")
    print(f"\nBest accuracy: {best_acc:.3f}%")
    _log(f"best = {result.x}, acc={100*(1-result.fun):.3f}%")
1333
+
1334
+
1335
# Default hyperparameter grids for the grid-search tuners: every key maps to a
# list of candidate values. The HRMS grid additionally carries the
# centroiding/matching window sizes used by the HRMS pipeline.
default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
1337
+
1338
+
1339
def _eval_one_HRMS(df_query, df_reference,
                   precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
                   similarity_measure_tmp, weight,
                   spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                   int_min_tmp, int_max_tmp, noise_threshold_tmp,
                   window_size_centroiding_tmp, window_size_matching_tmp,
                   wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
                   entropy_dimension_tmp, high_quality_reference_library_tmp):
    """Evaluate one HRMS grid point: run the matcher with the given parameter
    combination and return (accuracy, *parameter values).

    The weights dict is JSON-encoded in the returned tuple so the row is
    serializable/comparable when results are tabulated.
    """
    acc = get_acc_HRMS(
        df_query=df_query, df_reference=df_reference,
        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
        ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
        similarity_measure=similarity_measure_tmp, weights=weight,
        spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
        mz_min=mz_min_tmp, mz_max=mz_max_tmp,
        int_min=int_min_tmp, int_max=int_max_tmp,
        window_size_centroiding=window_size_centroiding_tmp,
        window_size_matching=window_size_matching_tmp,
        noise_threshold=noise_threshold_tmp,
        wf_mz=wf_mz_tmp, wf_int=wf_int_tmp,
        LET_threshold=LET_threshold_tmp,
        entropy_dimension=entropy_dimension_tmp,
        high_quality_reference_library=high_quality_reference_library_tmp,
        verbose=False
    )

    return (
        acc, similarity_measure_tmp, json.dumps(weight), spectrum_preprocessing_order_tmp,
        mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp,
        noise_threshold_tmp, window_size_centroiding_tmp, window_size_matching_tmp,
        wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp,
        high_quality_reference_library_tmp
    )
1373
+
1374
+
1375
def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
                   similarity_measure_tmp, weight,
                   spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                   int_min_tmp, int_max_tmp, noise_threshold_tmp,
                   wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
                   entropy_dimension_tmp, high_quality_reference_library_tmp):
    """Score a single NRMS hyper-parameter combination.

    Delegates to get_acc_NRMS and returns a flat tuple of
    (accuracy, parameter values...).  The weight dict is JSON-encoded so the
    tuple can be written directly as one row of the tuning-output table.
    """
    matcher_kwargs = {
        'df_query': df_query,
        'df_reference': df_reference,
        'unique_query_ids': unique_query_ids,
        'unique_reference_ids': unique_reference_ids,
        'similarity_measure': similarity_measure_tmp,
        'weights': weight,
        'spectrum_preprocessing_order': spectrum_preprocessing_order_tmp,
        'mz_min': mz_min_tmp,
        'mz_max': mz_max_tmp,
        'int_min': int_min_tmp,
        'int_max': int_max_tmp,
        'noise_threshold': noise_threshold_tmp,
        'wf_mz': wf_mz_tmp,
        'wf_int': wf_int_tmp,
        'LET_threshold': LET_threshold_tmp,
        'entropy_dimension': entropy_dimension_tmp,
        'high_quality_reference_library': high_quality_reference_library_tmp,
    }
    accuracy = get_acc_NRMS(**matcher_kwargs)

    return (
        accuracy,
        similarity_measure_tmp,
        json.dumps(weight),
        spectrum_preprocessing_order_tmp,
        mz_min_tmp,
        mz_max_tmp,
        int_min_tmp,
        int_max_tmp,
        noise_threshold_tmp,
        wf_mz_tmp,
        wf_int_tmp,
        LET_threshold_tmp,
        entropy_dimension_tmp,
        high_quality_reference_library_tmp,
    )
1405
def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
    """Exhaustively score hyper-parameter combinations on HRMS data.

    Sequential (single-process) variant used by the Shiny app so that a
    progress line can be printed after every grid combination.

    Parameters
    ----------
    query_data : str
        Path to the query spectra: a raw file (.mgf/.mzML/.cdf) that is first
        converted to a tab-separated library, or an existing .txt library.
        Mandatory.
    reference_data : str or list of str
        Path(s) to the reference library file(s).  Mandatory.
    precursor_ion_mz_tolerance : float or None
        Passed to get_acc_HRMS for precursor-m/z pre-filtering.
    ionization_mode, adduct : str or None
        When the reference table has the corresponding column and the value
        is neither None nor 'N/A', the reference library is filtered to it.
    grid : dict or None
        Partial grid; unspecified axes fall back to default_HRMS_grid.
    output_path : str or None
        Result file path; defaults to tuning_param_output.txt in the CWD.
    return_output : bool
        If True, return the result DataFrame instead of writing it.

    Returns
    -------
    pandas.DataFrame or None
        The accuracy table when return_output is True, otherwise None.
    """
    local_grid = {**default_HRMS_grid, **(grid or {})}
    # Bind each grid axis to a local name.  The previous implementation wrote
    # these into globals(), which leaked state across calls and between the
    # HRMS and NRMS tuning functions.
    similarity_measure = local_grid['similarity_measure']
    weight = local_grid['weight']
    spectrum_preprocessing_order = local_grid['spectrum_preprocessing_order']
    mz_min = local_grid['mz_min']
    mz_max = local_grid['mz_max']
    int_min = local_grid['int_min']
    int_max = local_grid['int_max']
    window_size_centroiding = local_grid['window_size_centroiding']
    window_size_matching = local_grid['window_size_matching']
    noise_threshold = local_grid['noise_threshold']
    wf_mz = local_grid['wf_mz']
    wf_int = local_grid['wf_int']
    LET_threshold = local_grid['LET_threshold']
    entropy_dimension = local_grid['entropy_dimension']
    high_quality_reference_library = local_grid['high_quality_reference_library']

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF'):
            # Replace the whole extension.  The previous query_data[:-3]
            # slicing mangled four-character extensions ('.mzML' -> '.mtxt').
            output_path_tmp = query_data.rsplit('.', 1)[0] + '.txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt', 'TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
        unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # Multiple reference files: load each and stack them.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    # Optional metadata filters on the reference library.
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode is not None and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode'] == ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct is not None and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct'] == adduct]

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # The axis order here must match _eval_one_HRMS's parameter order.
    axes = [similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
            noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
            entropy_dimension, high_quality_reference_library]
    total = 1
    for axis in axes:
        total *= len(axis)

    results = []
    done = 0
    for params in product(*axes):
        res = _eval_one_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params)
        results.append(res)
        done += 1
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC', 'SIMILARITY.MEASURE', 'WEIGHT', 'SPECTRUM.PROCESSING.ORDER', 'MZ.MIN', 'MZ.MAX',
        'INT.MIN', 'INT.MAX', 'NOISE.THRESHOLD', 'WINDOW.SIZE.CENTROIDING', 'WINDOW.SIZE.MATCHING',
        'WF.MZ', 'WF.INT', 'LET.THRESHOLD', 'ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Strip the JSON punctuation and key names from the serialized weight
    # dict, leaving just the comma-separated weight values.
    if 'WEIGHT' in df_out.columns:
        weight_col = df_out['WEIGHT'].astype(str)
        for token in ('"', '{', '}', ':', 'Cosine', 'Shannon', 'Renyi', 'Tsallis', ' '):
            weight_col = weight_col.str.replace(token, '', regex=False)
        df_out['WEIGHT'] = weight_col

    if return_output:
        return df_out
    else:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
        print(f'Wrote results to {output_path}')
1503
def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """Grid-search spectral-matching hyper-parameters on NRMS data in parallel.

    Evaluates every combination of the grid axes with joblib Parallel and
    writes (or returns) a table of identification accuracies.

    Parameters
    ----------
    query_data : str
        Path to the query spectra: a raw/library file (.mgf/.mzML/.cdf/.msp/
        .json) that is first converted to a tab-separated library, or an
        existing .txt library.  Mandatory.
    reference_data : str or list of str
        Path(s) to the reference library file(s).  Mandatory.
    grid : dict or None
        Partial grid; unspecified axes fall back to default_NRMS_grid.
    output_path : str or None
        Result file path; defaults to tuning_param_output.txt in the CWD.
    return_output : bool
        If True, return the result DataFrame instead of writing it.

    Returns
    -------
    pandas.DataFrame or None
        The accuracy table when return_output is True, otherwise None.
    """
    local_grid = {**default_NRMS_grid, **(grid or {})}
    # Bind each grid axis to a local name.  The previous implementation wrote
    # these into globals(), which leaked state across calls and between the
    # tuning functions (and is unsafe with concurrent use).
    similarity_measure = local_grid['similarity_measure']
    weight = local_grid['weight']
    spectrum_preprocessing_order = local_grid['spectrum_preprocessing_order']
    mz_min = local_grid['mz_min']
    mz_max = local_grid['mz_max']
    int_min = local_grid['int_min']
    int_max = local_grid['int_max']
    noise_threshold = local_grid['noise_threshold']
    wf_mz = local_grid['wf_mz']
    wf_int = local_grid['wf_int']
    LET_threshold = local_grid['LET_threshold']
    entropy_dimension = local_grid['entropy_dimension']
    high_quality_reference_library = local_grid['high_quality_reference_library']

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension.lower() in ('mgf', 'mzml', 'cdf', 'msp', 'json'):
            # Replace the whole extension.  The previous query_data[:-3]
            # slicing mangled four-character extensions such as '.mzML'.
            output_path_tmp = query_data.rsplit('.', 1)[0] + '.txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension.lower() == 'txt':
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            # Previously an unsupported extension fell through both branches
            # and crashed later with a NameError on df_query.
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
        unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # Multiple reference files: load each and stack them.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # The axis order here must match _eval_one_NRMS's parameter order.
    param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
                         noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
    # Fan the combinations out across all cores.
    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)

    df_out = pd.DataFrame(results, columns=[
        'ACC', 'SIMILARITY.MEASURE', 'WEIGHT', 'SPECTRUM.PROCESSING.ORDER', 'MZ.MIN', 'MZ.MAX', 'INT.MIN', 'INT.MAX',
        'NOISE.THRESHOLD', 'WF.MZ', 'WF.INT', 'LET.THRESHOLD', 'ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
    ])
    # Strip the JSON punctuation and key names from the serialized weight
    # dict, leaving just the comma-separated weight values.
    weight_col = df_out['WEIGHT']
    for token in ('"', '{', '}', ':', 'Cosine', 'Shannon', 'Renyi', 'Tsallis', ' '):
        weight_col = weight_col.str.replace(token, '', regex=False)
    df_out['WEIGHT'] = weight_col

    if return_output is False:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
    else:
        return df_out
1568
def tune_params_on_NRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """Exhaustively score hyper-parameter combinations on NRMS data.

    Sequential (single-process) variant used by the Shiny app so that a
    progress line can be printed after every grid combination.

    Parameters
    ----------
    query_data : str
        Path to the query spectra: a raw file (.mgf/.mzML/.cdf) that is first
        converted to a tab-separated library, or an existing .txt library.
        Mandatory.
    reference_data : str or list of str
        Path(s) to the reference library file(s).  Mandatory.
    grid : dict or None
        Partial grid; unspecified axes fall back to default_NRMS_grid.
    output_path : str or None
        Result file path; defaults to tuning_param_output.txt in the CWD.
    return_output : bool
        If True, return the result DataFrame instead of writing it.

    Returns
    -------
    pandas.DataFrame or None
        The accuracy table when return_output is True, otherwise None.
    """
    local_grid = {**default_NRMS_grid, **(grid or {})}
    # Bind each grid axis to a local name.  The previous implementation wrote
    # these into globals(), which leaked state across calls and between the
    # HRMS and NRMS tuning functions.
    similarity_measure = local_grid['similarity_measure']
    weight = local_grid['weight']
    spectrum_preprocessing_order = local_grid['spectrum_preprocessing_order']
    mz_min = local_grid['mz_min']
    mz_max = local_grid['mz_max']
    int_min = local_grid['int_min']
    int_max = local_grid['int_max']
    noise_threshold = local_grid['noise_threshold']
    wf_mz = local_grid['wf_mz']
    wf_int = local_grid['wf_int']
    LET_threshold = local_grid['LET_threshold']
    entropy_dimension = local_grid['entropy_dimension']
    high_quality_reference_library = local_grid['high_quality_reference_library']

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF'):
            # Replace the whole extension.  The previous query_data[:-3]
            # slicing mangled four-character extensions ('.mzML' -> '.mtxt').
            output_path_tmp = query_data.rsplit('.', 1)[0] + '.txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt', 'TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
        unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # Multiple reference files: load each and stack them.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # The axis order here must match _eval_one_NRMS's parameter order.
    axes = [similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
            noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library]
    total = 1
    for axis in axes:
        total *= len(axis)

    results = []
    done = 0
    for params in product(*axes):
        res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
        results.append(res)
        done += 1
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC', 'SIMILARITY.MEASURE', 'WEIGHT', 'SPECTRUM.PROCESSING.ORDER', 'MZ.MIN', 'MZ.MAX',
        'INT.MIN', 'INT.MAX', 'NOISE.THRESHOLD', 'WF.MZ', 'WF.INT', 'LET.THRESHOLD', 'ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Strip the JSON punctuation and key names from the serialized weight
    # dict, leaving just the comma-separated weight values.
    if 'WEIGHT' in df_out.columns:
        weight_col = df_out['WEIGHT'].astype(str)
        for token in ('"', '{', '}', ':', 'Cosine', 'Shannon', 'Renyi', 'Tsallis', ' '):
            weight_col = weight_col.str.replace(token, '', regex=False)
        df_out['WEIGHT'] = weight_col

    if return_output:
        return df_out
    else:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
        print(f'Wrote results to {output_path}')
1659
def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
    """Identification accuracy of library matching on HRMS spectra.

    For every query spectrum, scores it against each (optionally
    precursor-m/z-filtered) reference spectrum after applying the requested
    preprocessing transformations, predicts the best-scoring reference ID,
    and returns the fraction of queries whose predicted ID equals their own
    ID (i.e. query IDs are assumed to name the true compound).

    Note: ionization_mode and adduct are accepted for signature parity with
    the caller but are not used inside this function (filtering on them
    happens upstream).
    """
    n_top_matches_to_save = 1  # accuracy only needs the single best match
    # IDs are stringified so dict keys and column labels compare consistently.
    unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
    unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
    all_similarity_rows = []

    for query_idx, qid in enumerate(unique_query_ids):
        if verbose:
            print(f'query spectrum #{query_idx} is being identified')

        q_mask = (df_query['id'] == qid)
        q_idxs = np.where(q_mask)[0]
        if q_idxs.size == 0:
            # No peaks for this ID (shouldn't normally happen): all-zero row.
            all_similarity_rows.append([0.0]*len(unique_reference_ids))
            continue

        # Peak list as an (n_peaks, 2) array of [m/z, intensity].
        q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))

        # Restrict candidate references to those whose precursor m/z lies
        # within the tolerance window of this query's precursor, when possible.
        if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
            precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
        else:
            df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()

        if df_reference_tmp.empty:
            # Precursor filter removed every candidate: all-zero row.
            all_similarity_rows.append([0.0]*len(unique_reference_ids))
            continue

        # Map reference ID -> its peak-list sub-DataFrame, preserving order.
        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))

        similarity_by_ref = {}

        for ref_id, r_df in ref_groups.items():
            # Fresh copy: each reference pairing preprocesses the query anew.
            q_spec = q_spec_base.copy()
            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))

            is_matched = False
            # Apply the transformations in the caller-specified order; each
            # step is skipped once either spectrum has <= 1 peak left.
            for transformation in spectrum_preprocessing_order:
                # Guard against infinities produced by earlier transforms.
                if np.isinf(q_spec[:, 1]).any():
                    q_spec[:, 1] = 0.0
                if np.isinf(r_spec[:, 1]).any():
                    r_spec[:, 1] = 0.0

                # 'C': merge peaks closer than the centroiding window.
                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)

                # 'M': align query/reference peaks onto a common m/z axis.
                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    m_spec = match_peaks_in_spectra(
                        spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
                    )
                    if m_spec.size == 0:
                        # No peaks matched at all: empty spectra => score 0.
                        q_spec = np.empty((0,2))
                        r_spec = np.empty((0,2))
                    else:
                        q_spec = m_spec[:, 0:2]
                        r_spec = m_spec[:, [0, 2]]
                    is_matched = True

                # 'W': m/z- and intensity-weighting of the intensities.
                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)

                # 'L': low-entropy transform of the intensities.
                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')

                # 'N': drop low-intensity (noise) peaks; a high-quality
                # reference library is assumed already noise-free.
                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = remove_noise(q_spec, nr=noise_threshold)
                    if not high_quality_reference_library:
                        r_spec = remove_noise(r_spec, nr=noise_threshold)

                # 'F': restrict peaks to the configured m/z/intensity ranges.
                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = filter_spec_lcms(
                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                    )
                    if not high_quality_reference_library:
                        r_spec = filter_spec_lcms(
                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                        )

            # Score only if both spectra still have >1 peak and nonzero mass.
            if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                q_ints = q_spec[:, 1]
                r_ints = r_spec[:, 1]
                if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                    sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
                else:
                    sim = 0.0
            else:
                sim = 0.0

            similarity_by_ref[str(ref_id)] = float(sim)

        # Expand to a full row over ALL reference IDs; candidates excluded by
        # the precursor filter implicitly score 0.
        row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
        all_similarity_rows.append(row)

    df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
    df_scores.index.name = 'QUERY.SPECTRUM.ID'

    # Best-scoring reference per query row (ties resolved by argmax order).
    top_idx = df_scores.values.argmax(axis=1)
    top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
    top_ids = [df_scores.columns[i] for i in top_idx]

    df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
    if verbose:
        print(df_tmp)

    # Accuracy = fraction of queries whose top match is their own ID.
    acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
    return acc
1771
def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
    """Identification accuracy of library matching on NRMS spectra.

    Bins every query/reference spectrum onto a shared integer m/z grid,
    applies the requested preprocessing transformations, scores each
    query-reference pair, predicts the best-scoring reference ID per query,
    and returns the fraction of queries whose predicted ID equals their own
    ID.  Columns of df_query/df_reference are used positionally:
    0 = spectrum ID, 1 = m/z, 2 = intensity.
    """

    n_top_matches_to_save = 1  # accuracy only needs the single best match

    # Common integer m/z axis spanning both datasets (one bin per unit m/z).
    min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
    max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
    mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

    all_similarity_scores = []
    for query_idx in range(0,len(unique_query_ids)):
        # Gather this query's peaks and bin them onto the shared m/z axis.
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        q_spec_tmp = convert_spec(q_spec_tmp,mzs)

        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            # NOTE(review): q_spec is NOT copied here, so in-place transforms
            # ('W'/'L') may carry over between reference iterations — confirm
            # whether this aliasing is intentional.
            q_spec = q_spec_tmp
            if verbose is True and ref_idx % 1000 == 0:
                print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)

            # Apply the transformations in the caller-specified order.
            for transformation in spectrum_preprocessing_order:
                # Guard against infinities produced by earlier transforms.
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
                # 'W': m/z- and intensity-weighting of the intensities.
                if transformation == 'W':
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
                # 'L': low-entropy transform of the intensities.
                if transformation == 'L':
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
                # 'N': drop low-intensity (noise) peaks; a high-quality
                # reference library is assumed already noise-free.
                if transformation == 'N':
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                # 'F': restrict peaks to the configured m/z/intensity ranges.
                if transformation == 'F':
                    q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)

            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]

            # All-zero spectra cannot be meaningfully compared; score 0.
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['QUERY.SPECTRUM.ID']

    # Extract the top n_top_matches_to_save reference IDs per query by
    # repeatedly taking the max and dropping its column(s); ties are joined
    # with ';' into a single prediction string.
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[unique_query_ids,preds,scores]
    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
    # Accuracy = fraction of queries whose top match is their own ID.
    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
    return acc
1857
+ def run_spec_lib_matching_on_HRMS_data_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
1858
+ if query_data is None:
1859
+ print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
1860
+ sys.exit()
1861
+ else:
1862
+ extension = query_data.rsplit('.',1)
1863
+ extension = extension[(len(extension)-1)]
1864
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'json' or extension == 'JSON':
1865
+ output_path_tmp = query_data[:-3] + 'txt'
1866
+ build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
1867
+ #build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=True)
1868
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
1869
+ if extension == 'txt' or extension == 'TXT':
1870
+ df_query = pd.read_csv(query_data, sep='\t')
1871
+ unique_query_ids = df_query['id'].unique()
1872
+
1873
+ if reference_data is None:
1874
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
1875
+ sys.exit()
1876
+ else:
1877
+ if isinstance(reference_data,str):
1878
+ df_reference = get_reference_df(reference_data,likely_reference_ids)
1879
+ else:
1880
+ dfs = []
1881
+ for f in reference_data:
1882
+ tmp = get_reference_df(f,likely_reference_ids)
1883
+ dfs.append(tmp)
1884
+ df_reference = pd.concat(dfs, axis=0, ignore_index=True)
1885
+
1886
+ if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A':
1887
+ df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
1888
+ if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A':
1889
+ df_reference = df_reference.loc[df_reference['adduct']==adduct]
1890
+
1891
+ if spectrum_preprocessing_order is not None:
1892
+ spectrum_preprocessing_order = list(spectrum_preprocessing_order)
1893
+ else:
1894
+ spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
1895
+ if 'M' not in spectrum_preprocessing_order:
1896
+ print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
1897
+ sys.exit()
1898
+ if 'C' in spectrum_preprocessing_order:
1899
+ if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
1900
+ print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
1901
+ sys.exit()
1902
+ if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
1903
+ print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
1904
+ sys.exit()
1905
+
1906
+
1907
+ if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kuldzynski','intersection','hamming','hellinger']:
1908
+ print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
1909
+ sys.exit()
1910
+
1911
+ if isinstance(int_min,int) is True:
1912
+ int_min = float(int_min)
1913
+ if isinstance(int_max,int) is True:
1914
+ int_max = float(int_max)
1915
+ if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
1916
+ print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
1917
+ sys.exit()
1918
+ if mz_min < 0:
1919
+ print('\nError: mz_min should be a non-negative integer')
1920
+ sys.exit()
1921
+ if mz_max <= 0:
1922
+ print('\nError: mz_max should be a positive integer')
1923
+ sys.exit()
1924
+ if int_min < 0:
1925
+ print('\nError: int_min should be a non-negative float')
1926
+ sys.exit()
1927
+ if int_max <= 0:
1928
+ print('\nError: int_max should be a positive float')
1929
+ sys.exit()
1930
+
1931
+ if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
1932
+ print('Error: window_size_centroiding must be a positive float.')
1933
+ sys.exit()
1934
+ if isinstance(window_size_matching,float) is False or window_size_matching<= 0.0:
1935
+ print('Error: window_size_matching must be a positive float.')
1936
+ sys.exit()
1937
+
1938
+ if isinstance(noise_threshold,int) is True:
1939
+ noise_threshold = float(noise_threshold)
1940
+ if isinstance(noise_threshold,float) is False or noise_threshold < 0:
1941
+ print('Error: noise_threshold must be a positive float.')
1942
+ sys.exit()
1943
+
1944
+ if isinstance(wf_intensity,int) is True:
1945
+ wf_intensity = float(wf_intensity)
1946
+ if isinstance(wf_mz,int) is True:
1947
+ wf_mz = float(wf_mz)
1948
+ if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
1949
+ print('Error: wf_mz and wf_intensity must be integers or floats')
1950
+ sys.exit()
1951
+
1952
+ if entropy_dimension <= 0:
1953
+ print('\nError: entropy_dimension should be a positive float')
1954
+ sys.exit()
1955
+ else:
1956
+ q = entropy_dimension
1957
+
1958
+ normalization_method = 'standard'
1959
+
1960
+ if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
1961
+ print('\nError: n_top_matches_to_save should be a positive integer')
1962
+ sys.exit()
1963
+
1964
+ if isinstance(print_id_results,bool)==False:
1965
+ print('\nError: print_id_results must be either True or False')
1966
+ sys.exit()
1967
+
1968
+ if output_identification is None:
1969
+ output_identification = f'{Path.cwd()}/output_identification.txt'
1970
+ print(f'Warning: writing identification output to {output_identification}')
1971
+
1972
+ if output_similarity_scores is None:
1973
+ output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
1974
+ print(f'Warning: writing similarity scores to {output_similarity_scores}')
1975
+
1976
+
1977
+ unique_reference_ids = df_reference['id'].unique().tolist()
1978
+ all_similarity_scores = []
1979
+
1980
+ for query_idx in range(len(unique_query_ids)):
1981
+ if verbose:
1982
+ print(f'query spectrum #{query_idx} is being identified')
1983
+
1984
+ q_mask = (df_query['id'] == unique_query_ids[query_idx])
1985
+ q_idxs_tmp = np.where(q_mask)[0]
1986
+ q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
1987
+
1988
+ if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
1989
+ precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
1990
+ df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
1991
+ else:
1992
+ df_reference_tmp = df_reference.copy()
1993
+
1994
+ ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
1995
+ unique_reference_ids_tmp = list(ref_groups.keys())
1996
+
1997
+ similarity_by_ref = {}
1998
+ for ref_id in unique_reference_ids_tmp:
1999
+ q_spec = q_spec_tmp.copy()
2000
+ r_df = ref_groups[ref_id]
2001
+ r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
2002
+
2003
+ is_matched = False
2004
+
2005
+ for transformation in spectrum_preprocessing_order:
2006
+ if np.isinf(q_spec[:, 1]).sum() > 0:
2007
+ q_spec[:, 1] = np.zeros(q_spec.shape[0])
2008
+ if np.isinf(r_spec[:, 1]).sum() > 0:
2009
+ r_spec[:, 1] = np.zeros(r_spec.shape[0])
2010
+
2011
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2012
+ q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
2013
+ r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
2014
+
2015
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2016
+ m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
2017
+ q_spec = m_spec[:, 0:2]
2018
+ r_spec = m_spec[:, [0, 2]]
2019
+ is_matched = True
2020
+
2021
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2022
+ q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
2023
+ r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)
2024
+
2025
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2026
+ q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
2027
+ r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)
2028
+
2029
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2030
+ q_spec = remove_noise(q_spec, nr=noise_threshold)
2031
+ if not high_quality_reference_library:
2032
+ r_spec = remove_noise(r_spec, nr=noise_threshold)
2033
+
2034
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2035
+ q_spec = filter_spec_lcms(
2036
+ q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
2037
+ )
2038
+ if not high_quality_reference_library:
2039
+ r_spec = filter_spec_lcms(
2040
+ r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
2041
+ )
2042
+
2043
+ q_ints = q_spec[:, 1]
2044
+ r_ints = r_spec[:, 1]
2045
+
2046
+ if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2047
+ sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
2048
+ else:
2049
+ sim = 0.0
2050
+
2051
+ similarity_by_ref[ref_id] = sim
2052
+
2053
+ row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
2054
+ all_similarity_scores.append(row_scores)
2055
+
2056
+ df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
2057
+ df_scores.index = unique_query_ids
2058
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
2059
+
2060
+
2061
+ preds = []
2062
+ scores = []
2063
+ for i in range(0, df_scores.shape[0]):
2064
+ df_scores_tmp = df_scores
2065
+ preds_tmp = []
2066
+ scores_tmp = []
2067
+ for j in range(0, n_top_matches_to_save):
2068
+ top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
2069
+ cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
2070
+ df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
2071
+
2072
+ preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
2073
+ if len(top_ref_specs_tmp.values) == 0:
2074
+ scores_tmp.append(0)
2075
+ else:
2076
+ scores_tmp.append(top_ref_specs_tmp.values[0])
2077
+ preds.append(preds_tmp)
2078
+ scores.append(scores_tmp)
2079
+
2080
+ preds = np.array(preds)
2081
+ scores = np.array(scores)
2082
+ out = np.c_[preds,scores]
2083
+
2084
+ cnames_preds = []
2085
+ cnames_scores = []
2086
+ for i in range(0,n_top_matches_to_save):
2087
+ cnames_preds.append(f'RANK.{i+1}.PRED')
2088
+ cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
2089
+
2090
+ df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
2091
+ df_top_ref_specs.index = unique_query_ids
2092
+ df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
2093
+
2094
+ df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
2095
+
2096
+ if print_id_results == True:
2097
+ print(df_top_ref_specs.to_string())
2098
+
2099
+ if return_ID_output is False:
2100
+ df_top_ref_specs.to_csv(output_identification, sep='\t')
2101
+ df_scores.to_csv(output_similarity_scores, sep='\t')
2102
+ else:
2103
+ return df_top_ref_specs
2104
+
2105
+
2106
+
2107
+
2108
def run_spec_lib_matching_on_NRMS_data_shiny(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
    """Run spectral library matching on NRMS (unit-resolution, e.g. GC-MS) data for the Shiny UI.

    Every query spectrum is binned onto a common integer m/z grid, preprocessed
    according to ``spectrum_preprocessing_order`` (F=filtering, N=noise removal,
    W=weight-factor transform, L=low-entropy transform) and scored against each
    reference spectrum with the chosen similarity measure.

    Parameters
    ----------
    query_data : str
        Path to the query library. Raw formats (mgf/mzML/cdf/msp/json) are first
        converted to a tab-separated txt file next to the input.
    reference_data : str or sequence of str
        Path(s) to the reference library file(s).
    likely_reference_ids : optional
        Passed through to ``get_reference_df`` to pre-filter the reference library.
    n_top_matches_to_save : int
        Number of top-ranked reference matches reported per query spectrum.
    return_ID_output : bool
        If True, return the top-matches DataFrame instead of writing the two
        output files.

    Returns
    -------
    pandas.DataFrame or None
        Top-matches table when ``return_ID_output`` is True; otherwise results
        are written to ``output_identification`` / ``output_similarity_scores``.
    """
    # ---- load query spectra (raw formats are converted to tab-separated txt first)
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)
        extension = extension[(len(extension)-1)]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF','msp','MSP','json','JSON'):
            output_path_tmp = query_data[:-3] + 'txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        unique_query_ids = df_query.iloc[:,0].unique()

    # ---- load reference spectra (single path or a list of paths)
    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data,str):
            df_reference = get_reference_df(reference_data,likely_reference_ids)
            unique_reference_ids = df_reference.iloc[:,0].unique()
        else:
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(f,likely_reference_ids)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:,0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    # ---- validate the preprocessing order string
    if spectrum_preprocessing_order is not None:
        spectrum_preprocessing_order = list(spectrum_preprocessing_order)
    else:
        spectrum_preprocessing_order = ['F','N','W','L']
    if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
        print('Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
        sys.exit()

    # NOTE: 'kulczynski' is the spelling offered by the UI and used in the error
    # message; the misspelled 'kuldzynski' is also accepted for backward
    # compatibility with earlier releases.
    if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','kuldzynski','intersection','hamming','hellinger']:
        print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
        sys.exit()

    # ---- validate filtering bounds (m/z bounds must be ints, intensity bounds floats)
    if isinstance(int_min,int) is True:
        int_min = float(int_min)
    if isinstance(int_max,int) is True:
        int_max = float(int_max)
    if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
        print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
        sys.exit()
    if mz_min < 0:
        print('\nError: mz_min should be a non-negative integer')
        sys.exit()
    if mz_max <= 0:
        print('\nError: mz_max should be a positive integer')
        sys.exit()
    if int_min < 0:
        print('\nError: int_min should be a non-negative float')
        sys.exit()
    if int_max <= 0:
        print('\nError: int_max should be a positive float')
        sys.exit()

    if isinstance(noise_threshold,int) is True:
        noise_threshold = float(noise_threshold)
    if isinstance(noise_threshold,float) is False or noise_threshold < 0:
        print('Error: noise_threshold must be a positive float.')
        sys.exit()

    if isinstance(wf_intensity,int) is True:
        wf_intensity = float(wf_intensity)
    if isinstance(wf_mz,int) is True:
        wf_mz = float(wf_mz)
    if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
        print('Error: wf_mz and wf_intensity must be integers or floats')
        sys.exit()

    if entropy_dimension <= 0:
        print('\nError: entropy_dimension should be a positive float')
        sys.exit()

    normalization_method = 'standard'

    if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
        print('\nError: n_top_matches_to_save should be a positive integer')
        sys.exit()

    if isinstance(print_id_results,bool)==False:
        print('\nError: print_id_results must be either True or False')
        sys.exit()

    if output_identification is None:
        output_identification = f'{Path.cwd()}/output_identification.txt'
        print(f'Warning: writing identification output to {output_identification}')

    if output_similarity_scores is None:
        output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
        print(f'Warning: writing similarity scores to {output_similarity_scores}')

    # ---- common integer m/z grid spanning both libraries
    min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
    max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
    mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

    all_similarity_scores = []
    for query_idx in range(0,len(unique_query_ids)):
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        q_spec_tmp = convert_spec(q_spec_tmp,mzs)

        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            if verbose is True and ref_idx % 1000 == 0:
                print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
            # Work on a copy: the W/L transforms below assign into q_spec[:,1]
            # in place, and without a copy they would contaminate q_spec_tmp
            # for every subsequent reference spectrum (the HRMS variant of this
            # routine copies here for the same reason).
            q_spec = q_spec_tmp.copy()
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)

            for transformation in spectrum_preprocessing_order:
                # Zero out a spectrum whose transforms produced infinities so it
                # scores 0 rather than propagating inf/NaN into the measure.
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
                if transformation == 'W':
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
                if transformation == 'L':
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
                if transformation == 'N':
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    # A high-quality reference library is assumed clean, so noise
                    # removal / filtering are applied to the query side only.
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                if transformation == 'F':
                    q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)

            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]

            # A spectrum that was entirely filtered/zeroed out cannot be scored.
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['QUERY.SPECTRUM.ID']

    # ---- extract the n_top_matches_to_save best references per query spectrum;
    # ties at a rank are joined with ';' and the remaining columns are dropped
    # before looking for the next rank.
    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[preds,scores]

    cnames_preds = []
    cnames_scores = []
    for i in range(0,n_top_matches_to_save):
        cnames_preds.append(f'RANK.{i+1}.PRED')
        cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')

    df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
    df_top_ref_specs.index = unique_query_ids
    df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']

    if print_id_results == True:
        print(df_top_ref_specs.to_string())

    # Label the score columns exactly once. (Previously the prefix was applied a
    # second time inside the write-out branch, producing doubled
    # 'Reference Spectrum ID: Reference Spectrum ID: ...' headers in the file.)
    df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]

    if return_ID_output is False:
        df_top_ref_specs.to_csv(output_identification, sep='\t')
        df_scores.to_csv(output_similarity_scores, sep='\t')
    else:
        return df_top_ref_specs
2310
+
2311
+
2312
+ class _UIWriter:
2313
+ def __init__(self, loop, q: asyncio.Queue[str]):
2314
+ self._loop = loop
2315
+ self._q = q
2316
+ def write(self, s: str):
2317
+ if s:
2318
+ self._loop.call_soon_threadsafe(self._q.put_nowait, s)
2319
+ return len(s)
2320
+ def flush(self):
2321
+ pass
2322
+
2323
+
2324
def attach_logging_to_writer(writer):
    """Route root-logger records at INFO level and above to *writer*.

    Returns the ``(handler, root_logger)`` pair so the caller can later detach
    the stream handler with ``root_logger.removeHandler(handler)``.
    """
    root_logger = logging.getLogger()
    stream_handler = logging.StreamHandler(writer)
    stream_handler.setLevel(logging.INFO)
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(stream_handler)
    return stream_handler, root_logger
2331
+
2332
+
2333
+
31
2334
  def _run_with_redirects(fn, writer, *args, **kwargs):
32
2335
  with redirect_stdout(writer), redirect_stderr(writer):
33
2336
  return fn(*args, **kwargs)
@@ -64,19 +2367,21 @@ def strip_weights(s):
64
2367
  def build_library(input_path=None, output_path=None):
65
2368
  last_three_chars = input_path[(len(input_path)-3):len(input_path)]
66
2369
  last_four_chars = input_path[(len(input_path)-4):len(input_path)]
67
- if last_three_chars == 'csv' or last_three_chars == 'CSV':
68
- return pd.read_csv(input_path)
2370
+ if last_three_chars == 'txt' or last_three_chars == 'TXT':
2371
+ return pd.read_csv(input_path, sep='\t')
69
2372
  else:
70
2373
  if last_three_chars == 'mgf' or last_three_chars == 'MGF':
71
2374
  input_file_type = 'mgf'
72
2375
  elif last_four_chars == 'mzML' or last_four_chars == 'mzml' or last_four_chars == 'MZML':
73
2376
  input_file_type = 'mzML'
2377
+ elif last_four_chars == 'json' or last_four_chars == 'JSON':
2378
+ input_file_type = 'json'
74
2379
  elif last_three_chars == 'cdf' or last_three_chars == 'CDF':
75
2380
  input_file_type = 'cdf'
76
2381
  elif last_three_chars == 'msp' or last_three_chars == 'MSP':
77
2382
  input_file_type = 'msp'
78
2383
  else:
79
- print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', or \'msp\' file must be passed to --input_path')
2384
+ print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'msp\', \'json\', or \'txt\' file must be passed to --input_path')
80
2385
  sys.exit()
81
2386
 
82
2387
  spectra = []
@@ -146,6 +2451,23 @@ def build_library(input_path=None, output_path=None):
146
2451
  except ValueError:
147
2452
  continue
148
2453
 
2454
+ if input_file_type == 'json':
2455
+ data = json.load(open(input_path))
2456
+ ids = []
2457
+ mzs = []
2458
+ ints = []
2459
+ for i in range(0,len(data)):
2460
+ spec_ID_tmp = data[i]['spectrum_id']
2461
+ tmp = data[i]['peaks_json']
2462
+ tmp = tmp[1:-1].split(",")
2463
+ tmp = [a.replace("[","") for a in tmp]
2464
+ tmp = [a.replace("]","") for a in tmp]
2465
+ mzs_tmp = tmp[0::2]
2466
+ ints_tmp = tmp[1::2]
2467
+ ids.extend([spec_ID_tmp] * len(mzs_tmp))
2468
+ mzs.extend(mzs_tmp)
2469
+ ints.extend(ints_tmp)
2470
+
149
2471
  df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
150
2472
  return df
151
2473
 
@@ -154,9 +2476,12 @@ def build_library(input_path=None, output_path=None):
154
2476
  def extract_first_column_ids(file_path: str, max_ids: int = 20000):
155
2477
  suffix = Path(file_path).suffix.lower()
156
2478
 
157
- if suffix == ".csv":
158
- df = pd.read_csv(file_path, usecols=[0])
159
- ids = df.iloc[:, 0].astype(str).dropna()
2479
+ if suffix == ".txt":
2480
+ df = pd.read_csv(file_path, sep='\t')
2481
+ if 'id' in df.columns.tolist():
2482
+ ids = df['id'].astype(str).dropna()
2483
+ else:
2484
+ ids = df.iloc[:, 0].astype(str).dropna()
160
2485
  ids = [x for x in ids if x.strip() != ""]
161
2486
  seen = set()
162
2487
  uniq = []
@@ -191,17 +2516,17 @@ def extract_first_column_ids(file_path: str, max_ids: int = 20000):
191
2516
  return []
192
2517
 
193
2518
 
194
- def _open_plot_window(session, png_bytes: bytes, title: str = "plot.png"):
195
- """Send PNG bytes to browser and open in a new window as a data URL."""
196
- b64 = base64.b64encode(png_bytes).decode("ascii")
197
- data_url = f"data:image/png;base64,{b64}"
198
- session.send_custom_message("open-plot-window", {"png": data_url, "title": title})
2519
+ def _open_plot_window(session, svg_bytes: bytes, title: str = "plot.svg"):
2520
+ """Send SVG bytes to browser and open in a new window as a data URL."""
2521
+ b64 = base64.b64encode(svg_bytes).decode("ascii")
2522
+ data_url = f"data:image/svg;base64,{b64}"
2523
+ session.send_custom_message("open-plot-window", {"svg": data_url, "title": title})
199
2524
 
200
2525
 
201
2526
  def plot_spectra_ui(platform: str):
202
2527
  base_inputs = [
203
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or csv):"),
204
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or csv):"),
2528
+ ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2529
+ ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
205
2530
  ui.input_selectize(
206
2531
  "spectrum_ID1",
207
2532
  "Select spectrum ID 1 (default is the first spectrum in the library):",
@@ -216,6 +2541,8 @@ def plot_spectra_ui(platform: str):
216
2541
  multiple=False,
217
2542
  options={"placeholder": "Upload a library..."},
218
2543
  ),
2544
+ ui.input_select('print_url_spectrum1', 'Print PubChem URL for spectrum 1:', ['No', 'Yes']),
2545
+ ui.input_select('print_url_spectrum2', 'Print PubChem URL for spectrum 2:', ['No', 'Yes']),
219
2546
  ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
220
2547
  ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
221
2548
  ui.input_select(
@@ -227,21 +2554,13 @@ def plot_spectra_ui(platform: str):
227
2554
 
228
2555
  if platform == "HRMS":
229
2556
  extra_inputs = [
230
- ui.input_text(
231
- "spectrum_preprocessing_order",
232
- "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.",
233
- "FCNMWL",
234
- ),
2557
+ ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL",),
235
2558
  ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
236
2559
  ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
237
2560
  ]
238
2561
  else:
239
2562
  extra_inputs = [
240
- ui.input_text(
241
- "spectrum_preprocessing_order",
242
- "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
243
- "FNLW",
244
- )
2563
+ ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW",)
245
2564
  ]
246
2565
 
247
2566
  numeric_inputs = [
@@ -256,11 +2575,7 @@ def plot_spectra_ui(platform: str):
256
2575
  ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
257
2576
  ]
258
2577
 
259
- select_input = ui.input_select(
260
- "y_axis_transformation",
261
- "Transformation to apply to intensity axis:",
262
- ["normalized", "none", "log10", "sqrt"],
263
- )
2578
+ select_input = ui.input_select("y_axis_transformation", "Transformation to apply to intensity axis:", ["normalized", "none", "log10", "sqrt"])
264
2579
 
265
2580
  run_button_plot_spectra = ui.download_button("run_btn_plot_spectra", "Run", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
266
2581
  back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
@@ -268,15 +2583,15 @@ def plot_spectra_ui(platform: str):
268
2583
  if platform == "HRMS":
269
2584
  inputs_columns = ui.layout_columns(
270
2585
  ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
271
- ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
272
- ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
273
- ui.div([numeric_inputs[5:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
2586
+ ui.div([base_inputs[6:9], extra_inputs[0]], style="display:flex; flex-direction:column; gap:10px;"),
2587
+ ui.div(extra_inputs[1:3], numeric_inputs[0:3], style="display:flex; flex-direction:column; gap:10px;"),
2588
+ ui.div([numeric_inputs[3:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
274
2589
  col_widths=(3,3,3,3),
275
2590
  )
276
2591
  elif platform == "NRMS":
277
2592
  inputs_columns = ui.layout_columns(
278
2593
  ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
279
- ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
2594
+ ui.div([base_inputs[6:9], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
280
2595
  ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
281
2596
  ui.div([numeric_inputs[5:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
282
2597
  col_widths=(3,3,3,3),
@@ -297,49 +2612,29 @@ def plot_spectra_ui(platform: str):
297
2612
 
298
2613
  def run_spec_lib_matching_ui(platform: str):
299
2614
  base_inputs = [
300
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or csv):"),
301
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or csv):"),
2615
+ ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2616
+ ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
302
2617
  ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
303
2618
  ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
304
- ui.input_selectize(
305
- "spectrum_ID1",
306
- "Select spectrum ID 1 (only applicable for plotting; default is the first spectrum in the query library):",
307
- choices=[],
308
- multiple=False,
309
- options={"placeholder": "Upload a library..."},
310
- ),
311
- ui.input_selectize(
312
- "spectrum_ID2",
313
- "Select spectrum ID 2 (only applicable for plotting; default is the first spectrum in the reference library):",
314
- choices=[],
315
- multiple=False,
316
- options={"placeholder": "Upload a library..."},
317
- ),
318
- ui.input_select(
319
- "high_quality_reference_library",
320
- "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.",
321
- [False, True],
322
- )
2619
+ ui.input_file('compound_ID_output_file', 'Upload output from spectral library matching to plot top matches (optional)'),
2620
+ ui.input_selectize("q_spec", "Select query spectrum (only applicable for plotting; default is the first spectrum in the compound ID output):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
2621
+ ui.input_selectize("r_spec", "Select reference spectrum (only applicable for plotting; default is the rank 1 reference spectrum):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
2622
+ ui.input_select('print_url_spectrum1', 'Print PubChem URL for query spectrum (only applicable for plotting):', ['No', 'Yes']),
2623
+ ui.input_select('print_url_spectrum2', 'Print PubChem URL for reference spectrum (only applicable for plotting):', ['No', 'Yes']),
2624
+ ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])
323
2625
  ]
324
2626
 
325
2627
  if platform == "HRMS":
326
2628
  extra_inputs = [
327
- ui.input_text(
328
- "spectrum_preprocessing_order",
329
- "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.",
330
- "FCNMWL",
331
- ),
2629
+ ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
2630
+ ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
2631
+ ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
2632
+ ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.","FCNMWL"),
332
2633
  ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
333
2634
  ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
334
2635
  ]
335
2636
  else:
336
- extra_inputs = [
337
- ui.input_text(
338
- "spectrum_preprocessing_order",
339
- "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
340
- "FNLW",
341
- )
342
- ]
2637
+ extra_inputs = [ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).","FNLW")]
343
2638
 
344
2639
  numeric_inputs = [
345
2640
  ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
@@ -361,16 +2656,16 @@ def run_spec_lib_matching_ui(platform: str):
361
2656
 
362
2657
  if platform == "HRMS":
363
2658
  inputs_columns = ui.layout_columns(
364
- ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
365
- ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
366
- ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
367
- ui.div(numeric_inputs[5:10], style="display:flex; flex-direction:column; gap:10px;"),
2659
+ ui.div([base_inputs[0:2], extra_inputs[0:3], base_inputs[2:4]], style="display:flex; flex-direction:column; gap:10px;"),
2660
+ ui.div([base_inputs[4:10]], style="display:flex; flex-direction:column; gap:10px;"),
2661
+ ui.div([extra_inputs[3:6], numeric_inputs[0:3]], style="display:flex; flex-direction:column; gap:10px;"),
2662
+ ui.div(numeric_inputs[3:10], style="display:flex; flex-direction:column; gap:10px;"),
368
2663
  col_widths=(3,3,3,3)
369
2664
  )
370
2665
  elif platform == "NRMS":
371
2666
  inputs_columns = ui.layout_columns(
372
2667
  ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
373
- ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
2668
+ ui.div([base_inputs[6:10], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
374
2669
  ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
375
2670
  ui.div(numeric_inputs[5:10], style="display:flex; flex-direction:column; gap:10px;"),
376
2671
  col_widths=(3,3,3,3)
@@ -397,8 +2692,8 @@ def run_spec_lib_matching_ui(platform: str):
397
2692
 
398
2693
  def run_parameter_tuning_grid_ui(platform: str):
399
2694
  base_inputs = [
400
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or csv):"),
401
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or csv):"),
2695
+ ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2696
+ ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
402
2697
  ui.input_selectize("similarity_measure", "Select similarity measure(s):", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"], multiple=True, selected='cosine'),
403
2698
  ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '((0.25, 0.25, 0.25, 0.25))'),
404
2699
  ui.input_text("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", '[True]')
@@ -406,11 +2701,10 @@ def run_parameter_tuning_grid_ui(platform: str):
406
2701
 
407
2702
  if platform == "HRMS":
408
2703
  extra_inputs = [
409
- ui.input_text(
410
- "spectrum_preprocessing_order",
411
- "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.",
412
- "[FCNMWL,CWM]",
413
- ),
2704
+ ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
2705
+ ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
2706
+ ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
2707
+ ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "[FCNMWL,CWM]"),
414
2708
  ui.input_text("window_size_centroiding", "Centroiding window-size:", "[0.5]"),
415
2709
  ui.input_text("window_size_matching", "Matching window-size:", "[0.1,0.5]"),
416
2710
  ]
@@ -464,7 +2758,7 @@ def run_parameter_tuning_grid_ui(platform: str):
464
2758
 
465
2759
  return ui.div(
466
2760
  ui.TagList(
467
- ui.h2("Tune parameters"),
2761
+ ui.h2("Tune parameters (grid search)"),
468
2762
  inputs_columns,
469
2763
  run_button_parameter_tuning_grid,
470
2764
  back_button,
@@ -492,83 +2786,71 @@ PARAMS_NRMS = {
492
2786
  "entropy_dimension": (1.0, 3.0)
493
2787
  }
494
2788
 
2789
+
495
2790
  def run_parameter_tuning_DE_ui(platform: str):
496
- if platform == 'HRMS':
497
- PARAMS=PARAMS_HRMS
2791
+ # Pick param set per platform
2792
+ if platform == "HRMS":
2793
+ PARAMS = PARAMS_HRMS
498
2794
  else:
499
- PARAMS=PARAMS_NRMS
2795
+ PARAMS = PARAMS_NRMS
500
2796
 
501
2797
  base_inputs = [
502
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or csv):"),
503
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or csv):"),
2798
+ ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2799
+ ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
504
2800
  ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
505
- ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
506
- ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True],),
507
- ]
2801
+ ui.input_text("weights", "Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):", "0.25, 0.25, 0.25, 0.25"),
2802
+ ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])]
508
2803
 
509
2804
  if platform == "HRMS":
510
2805
  extra_inputs = [
511
- ui.input_text(
512
- "spectrum_preprocessing_order",
513
- "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.",
514
- "FCNMWL",
515
- ),
2806
+ ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
2807
+ ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
2808
+ ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
2809
+ ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL"),
516
2810
  ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
517
2811
  ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
518
2812
  ]
519
2813
  else:
520
- extra_inputs = [
521
- ui.input_text(
522
- "spectrum_preprocessing_order",
523
- "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
524
- "FNLW",
525
- )
526
- ]
2814
+ extra_inputs = [ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW")]
527
2815
 
528
2816
  numeric_inputs = [
529
2817
  ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
530
- ui.input_numeric("mz_max", "Maximum m/z for filtering:", 99999999),
2818
+ ui.input_numeric("mz_max", "Maximum m/z for filtering:", 99_999_999),
531
2819
  ui.input_numeric("int_min", "Minimum intensity for filtering:", 0),
532
- ui.input_numeric("int_max", "Maximum intensity for filtering:", 999999999),
2820
+ ui.input_numeric("int_max", "Maximum intensity for filtering:", 999_999_999),
533
2821
  ui.input_numeric("noise_threshold", "Noise removal threshold:", 0.0),
534
2822
  ui.input_numeric("wf_mz", "Mass/charge weight factor:", 0.0),
535
2823
  ui.input_numeric("wf_int", "Intensity weight factor:", 1.0),
536
2824
  ui.input_numeric("LET_threshold", "Low-entropy threshold:", 0.0),
537
2825
  ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
2826
+ ui.input_numeric("max_iterations", "Maximum number of iterations:", 5),
538
2827
  ]
539
2828
 
540
-
541
- #run_button_parameter_tuning_DE = ui.download_button("run_btn_parameter_tuning_DE", "Tune parameters (differential evolution optimization)", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
542
2829
  run_button_parameter_tuning_DE = ui.input_action_button("run_btn_parameter_tuning_DE", "Tune parameters (differential evolution optimization)", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
543
2830
  back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
544
2831
 
545
2832
  if platform == "HRMS":
546
2833
  inputs_columns = ui.layout_columns(
547
- ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
548
- ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
549
- ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
550
- ui.div([numeric_inputs[5:10]], style="display:flex; flex-direction:column; gap:10px;"),
551
- col_widths=(3,3,3,3),
2834
+ ui.div(*base_inputs, style="display:flex; flex-direction:column; gap:10px;"),
2835
+ ui.div(*extra_inputs, style="display:flex; flex-direction:column; gap:10px;"),
2836
+ ui.div(*numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
2837
+ ui.div(*numeric_inputs[5:11], style="display:flex; flex-direction:column; gap:10px;"),
2838
+ col_widths=(3, 3, 3, 3),
552
2839
  )
553
- elif platform == "NRMS":
2840
+ else:
554
2841
  inputs_columns = ui.layout_columns(
555
- ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
556
- ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
557
- ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
558
- ui.div([numeric_inputs[5:10]], style="display:flex; flex-direction:column; gap:10px;"),
559
- col_widths=(3,3,3,3),
2842
+ ui.div(*base_inputs, style="display:flex; flex-direction:column; gap:10px;"),
2843
+ ui.div(*extra_inputs, style="display:flex; flex-direction:column; gap:10px;"),
2844
+ ui.div(*numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
2845
+ ui.div(*numeric_inputs[5:11], style="display:flex; flex-direction:column; gap:10px;"),
2846
+ col_widths=(3, 3, 3, 3),
560
2847
  )
561
2848
 
562
2849
  return ui.page_fillable(
563
2850
  ui.layout_sidebar(
564
2851
  ui.sidebar(
565
- ui.h3("Select parameters"),
566
- ui.input_checkbox_group(
567
- "params",
568
- None,
569
- choices=list(PARAMS.keys()),
570
- selected=["noise_threshold","LET_threshold"],
571
- ),
2852
+ ui.h3("Select continuous parameters to optimize"),
2853
+ ui.input_checkbox_group("params", None, choices=list(PARAMS.keys()), selected=["noise_threshold", "LET_threshold"]),
572
2854
  ui.hr(),
573
2855
  ui.h4("Bounds for selected parameters"),
574
2856
  ui.output_ui("bounds_inputs"),
@@ -576,23 +2858,30 @@ def run_parameter_tuning_DE_ui(platform: str):
576
2858
  ),
577
2859
  ui.div(
578
2860
  ui.h2("Tune parameters (differential evolution optimization)"),
579
- *(inputs_columns if isinstance(inputs_columns, (list, tuple)) else [inputs_columns]),
580
- run_button_parameter_tuning_DE,
581
- back_button,
2861
+ inputs_columns,
2862
+ ui.div(run_button_parameter_tuning_DE, back_button, style=("display:flex; flex-direction:row; gap:12px; align-items:center; flex-wrap:wrap;")),
2863
+ ui.br(),
2864
+ ui.card(
2865
+ ui.card_header("Live log"),
2866
+ ui.output_text_verbatim("run_log"),
2867
+ ),
2868
+ style="display:flex; flex-direction:column; gap:16px;",
582
2869
  ),
583
2870
  )
584
2871
  )
585
2872
 
586
2873
 
587
2874
 
588
-
589
2875
  app_ui = ui.page_fluid(
590
2876
  ui.head_content(ui.tags.link(rel="icon", href="emblem.png")),
2877
+ ui.div(ui.output_image("image"), style=("display:block; margin:20px auto; max-width:320px; height:auto; text-align:center")),
591
2878
  ui.output_ui("main_ui"),
592
- ui.output_text("status_output")
2879
+ ui.output_text("status_output"),
593
2880
  )
594
2881
 
595
2882
 
2883
+
2884
+
596
2885
  def server(input, output, session):
597
2886
 
598
2887
  current_page = reactive.Value("main_menu")
@@ -611,7 +2900,7 @@ def server(input, output, session):
611
2900
  match_log_rv = reactive.Value("")
612
2901
  is_matching_rv = reactive.Value(False)
613
2902
  is_any_job_running = reactive.Value(False)
614
- latest_csv_path_rv = reactive.Value("")
2903
+ latest_txt_path_rv = reactive.Value("")
615
2904
  latest_df_rv = reactive.Value(None)
616
2905
  is_running_rv = reactive.Value(False)
617
2906
 
@@ -627,6 +2916,106 @@ def server(input, output, session):
627
2916
  converted_query_path_rv = reactive.Value(None)
628
2917
  converted_reference_path_rv = reactive.Value(None)
629
2918
 
2919
+ df_rv = reactive.Value(None)
2920
+
2921
+
2922
+ def _discover_rank_cols(df: pd.DataFrame):
2923
+ pred_pat = re.compile(r"^RANK\.(\d+)\.PRED$")
2924
+ score_pat = re.compile(r"^RANK\.(\d+)\.SIMILARITY\.SCORE$")
2925
+ pred_map, score_map = {}, {}
2926
+ for c in df.columns:
2927
+ m = pred_pat.match(c)
2928
+ if m: pred_map[int(m.group(1))] = c
2929
+ m = score_pat.match(c)
2930
+ if m: score_map[int(m.group(1))] = c
2931
+ return [(k, pred_map[k], score_map.get(k)) for k in sorted(pred_map)]
2932
+
2933
+
2934
+ def _rank_choices_for_query(df: pd.DataFrame, qid: str):
2935
+ sub = df.loc[df["QUERY.SPECTRUM.ID"].astype(str) == str(qid)]
2936
+ if sub.empty:
2937
+ return {}, None
2938
+ row = sub.iloc[0]
2939
+ rank_cols = _discover_rank_cols(df)
2940
+ if not rank_cols:
2941
+ return {}, None
2942
+
2943
+ choices = {}
2944
+ default_value = None
2945
+ for (k, pred_col, score_col) in rank_cols:
2946
+ pred = row.get(pred_col, None)
2947
+ if pd.isna(pred):
2948
+ continue
2949
+ pred = str(pred)
2950
+ score = row.get(score_col, None) if score_col else None
2951
+ score_str = f"{float(score):.6f}" if (score is not None and pd.notna(score)) else "NA"
2952
+ label = f"Rank {k} — {score_str} — {pred}"
2953
+ choices[label] = pred # values are plain names
2954
+ if k == 1:
2955
+ default_value = pred # default = Rank 1 name
2956
+
2957
+ if default_value is None and choices:
2958
+ default_value = next(iter(choices.values()))
2959
+ return choices, default_value
2960
+
2961
+
2962
+ @reactive.effect
2963
+ @reactive.event(input.compound_ID_output_file)
2964
+ async def _populate_ids_from_compound_ID_output_upload():
2965
+ files = input.compound_ID_output_file()
2966
+ if not files:
2967
+ return
2968
+
2969
+ in_path = Path(files[0]["datapath"])
2970
+ try:
2971
+ query_status_rv.set(f"Reading table from: {in_path.name} …")
2972
+ await reactive.flush()
2973
+
2974
+ df = await asyncio.to_thread(pd.read_csv, in_path, sep="\t", header=0)
2975
+
2976
+ if "QUERY.SPECTRUM.ID" not in df.columns:
2977
+ raise ValueError("Missing required column: QUERY.SPECTRUM.ID")
2978
+ if not _discover_rank_cols(df):
2979
+ raise ValueError("No columns matching RANK.<k>.PRED found.")
2980
+
2981
+ df_rv.set(df)
2982
+
2983
+ ids = df["QUERY.SPECTRUM.ID"].astype(str).tolist()
2984
+ unique_ids_in_order = list(dict.fromkeys(ids))
2985
+
2986
+ choices_dict, default_rank_value = _rank_choices_for_query(df, ids[0])
2987
+ choices_values = [str(v).strip() for v in choices_dict.values()]
2988
+ default_rank_value = str(default_rank_value).strip() if default_rank_value is not None else None
2989
+
2990
+ ui.update_selectize("q_spec", choices=unique_ids_in_order, selected=ids[0])
2991
+ await reactive.flush()
2992
+
2993
+ ui.update_selectize("r_spec", choices=choices_values, selected=choices_values[0])
2994
+ await reactive.flush()
2995
+
2996
+ except Exception as e:
2997
+ query_status_rv.set(f"❌ Failed: {e}")
2998
+ await reactive.flush()
2999
+ raise
3000
+
3001
+
3002
+ @reactive.effect
3003
+ @reactive.event(input.q_spec)
3004
+ async def _update_rank_choices_on_compound_ID_change():
3005
+ df = df_rv.get()
3006
+ if df is None:
3007
+ return
3008
+ qid = input.q_spec()
3009
+ if not qid:
3010
+ return
3011
+
3012
+ choices, default_rank_value = _rank_choices_for_query(df, qid)
3013
+ choices = list(choices.values())
3014
+ ui.update_selectize('r_spec', choices=choices, selected=default_rank_value)
3015
+ await reactive.flush()
3016
+
3017
+
3018
+
630
3019
  @output
631
3020
  @render.ui
632
3021
  def bounds_inputs():
@@ -769,6 +3158,11 @@ def server(input, output, session):
769
3158
  def flush(self):
770
3159
  pass
771
3160
 
3161
+ def _run_with_redirects(func, writer: ReactiveWriter, **kwargs):
3162
+ with contextlib.redirect_stdout(writer), contextlib.redirect_stderr(writer):
3163
+ return func(**kwargs)
3164
+
3165
+
772
3166
 
773
3167
  @reactive.effect
774
3168
  async def _pump_logs():
@@ -865,7 +3259,7 @@ def server(input, output, session):
865
3259
  @render.image
866
3260
  def image():
867
3261
  dir = Path(__file__).resolve().parent
868
- img: ImgData = {"src": str(dir / "www/emblem.png"), "width": "320px", "height": "250px"}
3262
+ img: ImgData = {"src": str(dir / "www/emblem.png"), "width": "250px", "height": "250px"}
869
3263
  return img
870
3264
 
871
3265
  @output
@@ -874,30 +3268,10 @@ def server(input, output, session):
874
3268
  if current_page() == "main_menu":
875
3269
  return ui.page_fluid(
876
3270
  ui.h2("Main Menu"),
877
- ui.div(
878
- ui.output_image("image"),
879
- #ui.img(src="emblem.png", width="320px", height="250px"),
880
- style=(
881
- "position:fixed; top:0; left:50%; transform:translateX(-50%); "
882
- "z-index:1000; text-align:center; padding:10px; background-color:white;"
883
- ),
884
- ),
885
- ui.div(
886
- "Overview:",
887
- style="text-align:left; font-size:24px; font-weight:bold; margin-top:350px"
888
- ),
889
- ui.div(
890
- "PyCompound is a Python-based tool designed for performing spectral library matching on either high-resolution mass spectrometry data (HRMS) or low-resolution mass spectrometry data (NRMS). PyCompound offers a range of spectrum preprocessing transformations and similarity measures. These spectrum preprocessing transformations include filtering on mass/charge and/or intensity values, weight factor transformation, low-entropy transformation, centroiding, noise removal, and matching. The available similarity measures include the canonical Cosine similarity measure, three entropy-based similarity measures, and a variety of binary similarity measures: Jaccard, Dice, 3W-Jaccard, Sokal-Sneath, Binary Cosine, Mountford, McConnaughey, Driver-Kroeber, Simpson, Braun-Banquet, Fager-McGowan, Kulczynski, Intersection, Hamming, and Hellinger.",
891
- style="margin-top:10px; text-align:left; font-size:16px; font-weight:500"
892
- ),
893
- ui.div(
894
- "Select options:",
895
- style="margin-top:30px; text-align:left; font-size:24px; font-weight:bold"
896
- ),
897
- ui.div(
898
- ui.input_radio_buttons("chromatography_platform", "Specify chromatography platform:", ["HRMS","NRMS"]),
899
- style="font-size:18px; margin-top:10px; max-width:none"
900
- ),
3271
+ ui.div("Overview:", style="text-align:left; font-size:24px; font-weight:bold"),
3272
+ ui.div("PyCompound is a Python-based tool designed for performing spectral library matching on either high-resolution mass spectrometry data (HRMS) or low-resolution mass spectrometry data (NRMS). PyCompound offers a range of spectrum preprocessing transformations and similarity measures. These spectrum preprocessing transformations include filtering on mass/charge and/or intensity values, weight factor transformation, low-entropy transformation, centroiding, noise removal, and matching. The available similarity measures include the canonical Cosine similarity measure, three entropy-based similarity measures, and a variety of binary similarity measures: Jaccard, Dice, 3W-Jaccard, Sokal-Sneath, Binary Cosine, Mountford, McConnaughey, Driver-Kroeber, Simpson, Braun-Banquet, Fager-McGowan, Kulczynski, Intersection, Hamming, and Hellinger.", style="margin-top:10px; text-align:left; font-size:16px; font-weight:500"),
3273
+ ui.div("Select options:", style="margin-top:30px; text-align:left; font-size:24px; font-weight:bold"),
3274
+ ui.div(ui.input_radio_buttons("chromatography_platform", "Specify chromatography platform:", ["HRMS","NRMS"]), style="font-size:18px; margin-top:10px; max-width:none"),
901
3275
  ui.input_action_button("plot_spectra", "Plot two spectra before and after preprocessing transformations.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
902
3276
  ui.input_action_button("run_spec_lib_matching", "Run spectral library matching to perform compound identification on a query library of spectra.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
903
3277
  ui.input_action_button("run_parameter_tuning_grid", "Grid search: Tune parameters to maximize accuracy of compound identification given a query library with known spectrum IDs.", style="font-size:18px; padding:20px 40px; width:450px; height:120px; margin-top:10px; margin-right:50px"),
@@ -970,36 +3344,36 @@ def server(input, output, session):
970
3344
  suffix = in_path.suffix.lower()
971
3345
 
972
3346
  try:
973
- if suffix == ".csv":
974
- csv_path = in_path
975
- converted_query_path_rv.set(str(csv_path))
3347
+ if suffix == ".txt":
3348
+ txt_path = in_path
3349
+ converted_query_path_rv.set(str(txt_path))
976
3350
  else:
977
- query_status_rv.set(f"Converting {in_path.name} → CSV …")
3351
+ query_status_rv.set(f"Converting {in_path.name} → TXT…")
978
3352
  await reactive.flush()
979
3353
 
980
- tmp_csv_path = in_path.with_suffix(".converted.csv")
3354
+ tmp_txt_path = in_path.with_suffix(".converted.txt")
981
3355
 
982
- out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_csv_path))
3356
+ out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
983
3357
 
984
3358
  if isinstance(out_obj, (str, os.PathLike, Path)):
985
- csv_path = Path(out_obj)
3359
+ txt_path = Path(out_obj)
986
3360
  elif isinstance(out_obj, pd.DataFrame):
987
- out_obj.to_csv(tmp_csv_path, index=False, sep='\t')
988
- csv_path = tmp_csv_path
3361
+ out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
3362
+ txt_path = tmp_txt_path
989
3363
  else:
990
3364
  raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
991
3365
 
992
- converted_query_path_rv.set(str(csv_path))
3366
+ converted_query_path_rv.set(str(txt_path))
993
3367
 
994
- query_status_rv.set(f"Reading IDs from: {csv_path.name} …")
3368
+ query_status_rv.set(f"Reading IDs from: {txt_path.name} …")
995
3369
  await reactive.flush()
996
3370
 
997
- ids = await asyncio.to_thread(extract_first_column_ids, str(csv_path))
3371
+ ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
998
3372
  query_ids_rv.set(ids)
999
3373
 
1000
3374
  ui.update_selectize("spectrum_ID1", choices=ids, selected=(ids[0] if ids else None))
1001
3375
 
1002
- query_status_rv.set(f"✅ Loaded {len(ids)} IDs from {csv_path.name}" if ids else f"⚠️ No IDs found in {csv_path.name}")
3376
+ query_status_rv.set(f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}")
1003
3377
  await reactive.flush()
1004
3378
 
1005
3379
  except Exception as e:
@@ -1019,37 +3393,37 @@ def server(input, output, session):
1019
3393
  suffix = in_path.suffix.lower()
1020
3394
 
1021
3395
  try:
1022
- if suffix == ".csv":
1023
- csv_path = in_path
1024
- converted_reference_path_rv.set(str(csv_path))
3396
+ if suffix == ".txt":
3397
+ txt_path = in_path
3398
+ converted_reference_path_rv.set(str(txt_path))
1025
3399
  else:
1026
- reference_status_rv.set(f"Converting {in_path.name} → CSV …")
3400
+ reference_status_rv.set(f"Converting {in_path.name} → TXT…")
1027
3401
  await reactive.flush()
1028
3402
 
1029
- tmp_csv_path = in_path.with_suffix(".converted.csv")
3403
+ tmp_txt_path = in_path.with_suffix(".converted.txt")
1030
3404
 
1031
- out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_csv_path))
3405
+ out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
1032
3406
 
1033
3407
  if isinstance(out_obj, (str, os.PathLike, Path)):
1034
- csv_path = Path(out_obj)
3408
+ txt_path = Path(out_obj)
1035
3409
  elif isinstance(out_obj, pd.DataFrame):
1036
- out_obj.to_csv(tmp_csv_path, index=False, sep='\t')
1037
- csv_path = tmp_csv_path
3410
+ out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
3411
+ txt_path = tmp_txt_path
1038
3412
  else:
1039
3413
  raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
1040
3414
 
1041
- converted_reference_path_rv.set(str(csv_path))
3415
+ converted_reference_path_rv.set(str(txt_path))
1042
3416
 
1043
- reference_status_rv.set(f"Reading IDs from: {csv_path.name} …")
3417
+ reference_status_rv.set(f"Reading IDs from: {txt_path.name} …")
1044
3418
  await reactive.flush()
1045
3419
 
1046
- ids = await asyncio.to_thread(extract_first_column_ids, str(csv_path))
3420
+ ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
1047
3421
  reference_ids_rv.set(ids)
1048
3422
 
1049
3423
  ui.update_selectize("spectrum_ID2", choices=ids, selected=(ids[0] if ids else None))
1050
3424
 
1051
3425
  reference_status_rv.set(
1052
- f"✅ Loaded {len(ids)} IDs from {csv_path.name}" if ids else f"⚠️ No IDs found in {csv_path.name}"
3426
+ f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}"
1053
3427
  )
1054
3428
  await reactive.flush()
1055
3429
 
@@ -1059,7 +3433,7 @@ def server(input, output, session):
1059
3433
  raise
1060
3434
 
1061
3435
 
1062
- @render.download(filename=lambda: f"plot.png")
3436
+ @render.download(filename=lambda: f"plot.svg")
1063
3437
  def run_btn_plot_spectra():
1064
3438
  spectrum_ID1 = input.spectrum_ID1() or None
1065
3439
  spectrum_ID2 = input.spectrum_ID2() or None
@@ -1071,22 +3445,20 @@ def server(input, output, session):
1071
3445
  if input.high_quality_reference_library() != 'False':
1072
3446
  high_quality_reference_library_tmp2 = True
1073
3447
 
1074
- print(input.high_quality_reference_library())
1075
- print(high_quality_reference_library_tmp2)
1076
-
1077
3448
  if input.chromatography_platform() == "HRMS":
1078
- fig = generate_plots_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, similarity_measure=input.similarity_measure(), weights=weights, spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
3449
+ fig = generate_plots_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), weights=weights, spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
1079
3450
  plt.show()
1080
3451
  elif input.chromatography_platform() == "NRMS":
1081
- fig = generate_plots_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
3452
+ fig = generate_plots_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
1082
3453
  plt.show()
1083
3454
  with io.BytesIO() as buf:
1084
- fig.savefig(buf, format="png", dpi=150, bbox_inches="tight")
3455
+ fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
1085
3456
  plt.close()
1086
3457
  yield buf.getvalue()
1087
3458
 
1088
3459
 
1089
3460
 
3461
+
1090
3462
  @render.download(filename="identification_output.txt")
1091
3463
  async def run_btn_spec_lib_matching():
1092
3464
  match_log_rv.set("Running identification...\n")
@@ -1099,7 +3471,7 @@ def server(input, output, session):
1099
3471
  hq = bool(hq)
1100
3472
 
1101
3473
  weights = [float(weight.strip()) for weight in input.weights().split(",") if weight.strip()]
1102
- weights = {'Cosine':weights[0], 'Shannon':weights[1], 'Renyi':weights[2], 'Tsallis':weights[3]}
3474
+ weights = {'Cosine': weights[0], 'Shannon': weights[1], 'Renyi': weights[2], 'Tsallis': weights[3]}
1103
3475
 
1104
3476
  common_kwargs = dict(
1105
3477
  query_data=input.query_data()[0]["datapath"],
@@ -1121,37 +3493,81 @@ def server(input, output, session):
1121
3493
  return_ID_output=True,
1122
3494
  )
1123
3495
 
3496
+ # --- streaming setup (same pattern as your DE block) ---
1124
3497
  loop = asyncio.get_running_loop()
1125
- rw = ReactiveWriter(loop)
3498
+ q: asyncio.Queue[str | None] = asyncio.Queue()
3499
+
3500
+ class UIWriter(io.TextIOBase):
3501
+ def write(self, s: str):
3502
+ if s:
3503
+ loop.call_soon_threadsafe(q.put_nowait, s)
3504
+ return len(s)
3505
+ def flush(self): pass
3506
+
3507
+ async def _drain():
3508
+ while True:
3509
+ msg = await q.get()
3510
+ if msg is None:
3511
+ break
3512
+ match_log_rv.set(match_log_rv.get() + msg)
3513
+ await reactive.flush()
3514
+
3515
+ drain_task = asyncio.create_task(_drain())
3516
+ writer = UIWriter()
3517
+
3518
+ # --- worker wrappers that install redirects INSIDE the thread ---
3519
+ def _run_hrms():
3520
+ with redirect_stdout(writer), redirect_stderr(writer):
3521
+ # optional heartbeat
3522
+ print(">> Starting HRMS identification ...", flush=True)
3523
+ return run_spec_lib_matching_on_HRMS_data_shiny(
3524
+ precursor_ion_mz_tolerance=input.precursor_ion_mz_tolerance(),
3525
+ ionization_mode=input.ionization_mode(),
3526
+ adduct=input.adduct(),
3527
+ window_size_centroiding=input.window_size_centroiding(),
3528
+ window_size_matching=input.window_size_matching(),
3529
+ **common_kwargs
3530
+ )
3531
+
3532
+ def _run_nrms():
3533
+ with redirect_stdout(writer), redirect_stderr(writer):
3534
+ print(">> Starting NRMS identification ...", flush=True)
3535
+ return run_spec_lib_matching_on_NRMS_data_shiny(**common_kwargs)
1126
3536
 
3537
+ # --- run in worker thread and stream output live ---
1127
3538
  try:
1128
- with redirect_stdout(rw), redirect_stderr(rw):
1129
- if input.chromatography_platform() == "HRMS":
1130
- df_out = await asyncio.to_thread(
1131
- run_spec_lib_matching_on_HRMS_data,
1132
- window_size_centroiding=input.window_size_centroiding(),
1133
- window_size_matching=input.window_size_matching(),
1134
- **common_kwargs
1135
- )
1136
- else:
1137
- df_out = await asyncio.to_thread(run_spec_lib_matching_on_NRMS_data, **common_kwargs)
3539
+ if input.chromatography_platform() == "HRMS":
3540
+ df_out = await asyncio.to_thread(_run_hrms)
3541
+ else:
3542
+ df_out = await asyncio.to_thread(_run_nrms)
3543
+
1138
3544
  match_log_rv.set(match_log_rv.get() + "\n✅ Identification finished.\n")
1139
3545
  await reactive.flush()
3546
+
1140
3547
  except Exception as e:
1141
- match_log_rv.set(match_log_rv.get() + f"\n❌ Error: {e}\n")
3548
+ import traceback
3549
+ tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
3550
+ match_log_rv.set(match_log_rv.get() + f"\n❌ {type(e).__name__}: {e}\n{tb}\n")
1142
3551
  await reactive.flush()
3552
+ # make sure to stop the drainer before re-raising
3553
+ await q.put(None); await drain_task
1143
3554
  raise
1144
3555
 
1145
- yield df_out.to_csv(index=True, sep='\t')
3556
+ finally:
3557
+ await q.put(None)
3558
+ await drain_task
3559
+
3560
+ yield df_out.to_csv(index=True, sep="\t")
3561
+
1146
3562
 
1147
3563
 
1148
3564
 
1149
- @render.download(filename="plot.png")
3565
+ @render.download(filename="plot.svg")
1150
3566
  def run_btn_plot_spectra_within_spec_lib_matching():
1151
3567
  req(input.query_data(), input.reference_data())
1152
3568
 
1153
- spectrum_ID1 = input.spectrum_ID1() or None
1154
- spectrum_ID2 = input.spectrum_ID2() or None
3569
+ spectrum_ID1 = input.q_spec() or None
3570
+ spectrum_ID2 = input.r_spec() or None
1155
3571
 
1156
3572
  hq = input.high_quality_reference_library()
1157
3573
  if isinstance(hq, str):
@@ -1167,6 +3583,8 @@ def server(input, output, session):
1167
3583
  reference_data=input.reference_data()[0]['datapath'],
1168
3584
  spectrum_ID1=spectrum_ID1,
1169
3585
  spectrum_ID2=spectrum_ID2,
3586
+ print_url_spectrum1=input.print_url_spectrum1(),
3587
+ print_url_spectrum2=input.print_url_spectrum2(),
1170
3588
  similarity_measure=input.similarity_measure(),
1171
3589
  weights=weights,
1172
3590
  spectrum_preprocessing_order=input.spectrum_preprocessing_order(),
@@ -1192,7 +3610,7 @@ def server(input, output, session):
1192
3610
  plt.show()
1193
3611
 
1194
3612
  with io.BytesIO() as buf:
1195
- fig.savefig(buf, format="png", dpi=150, bbox_inches="tight")
3613
+ fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
1196
3614
  plt.close()
1197
3615
  yield buf.getvalue()
1198
3616
 
@@ -1230,6 +3648,9 @@ def server(input, output, session):
1230
3648
 
1231
3649
  try:
1232
3650
  if input.chromatography_platform() == "HRMS":
3651
+ precursor_ion_mz_tolerance = float(input.precursor_ion_mz_tolerance())
3652
+ ionization_mode = str(input.ionization_mode())
3653
+ adduct = str(input.adduct())
1233
3654
  window_size_centroiding_tmp = strip_numeric(input.window_size_centroiding())
1234
3655
  window_size_matching_tmp = strip_numeric(input.window_size_matching())
1235
3656
  grid = {
@@ -1249,7 +3670,7 @@ def server(input, output, session):
1249
3670
  'window_size_centroiding': window_size_centroiding_tmp,
1250
3671
  'window_size_matching': window_size_matching_tmp,
1251
3672
  }
1252
- df_out = await asyncio.to_thread(_run_with_redirects, tune_params_on_HRMS_data_grid_shiny, rw, **common_kwargs, grid=grid)
3673
+ df_out = await asyncio.to_thread(_run_with_redirects, tune_params_on_HRMS_data_grid_shiny, rw, **common_kwargs, grid=grid, precursor_ion_mz_tolerance=precursor_ion_mz_tolerance, ionization_mode=ionization_mode, adduct=adduct)
1253
3674
  else:
1254
3675
  grid = {
1255
3676
  'similarity_measure': similarity_measure_tmp,
@@ -1277,43 +3698,147 @@ def server(input, output, session):
1277
3698
  is_any_job_running.set(False)
1278
3699
  await reactive.flush()
1279
3700
 
1280
- yield df_out.to_csv(index=False).encode("utf-8", sep='\t')
3701
+ yield df_out.to_csv(index=False, sep='\t').encode("utf-8")
3702
+
1281
3703
 
1282
3704
 
1283
3705
  @reactive.effect
1284
3706
  @reactive.event(input.run_btn_parameter_tuning_DE)
1285
- def _run_btn_parameter_tuning_DE():
3707
+ async def run_btn_parameter_tuning_DE():
3708
+ match_log_rv.set("Tuning specified continuous parameters using differential evolution...\n")
1286
3709
  is_any_job_running.set(True)
1287
3710
  is_tuning_DE_running.set(True)
1288
- match_log_rv.set("Tuning specified continuous parameters using differential evolution...\n")
3711
+ await reactive.flush()
1289
3712
 
1290
- #print('\nhere!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
1291
- weights = [float(weight.strip()) for weight in input.weights().split(",") if weight.strip()]
1292
- weights = {'Cosine':weights[0], 'Shannon':weights[1], 'Renyi':weights[2], 'Tsallis':weights[3]}
1293
- opt_params, bounds_dict, bounds_list = _read_bounds()
1294
- #print(input.params())
1295
- #print("Optimizing over:", opt_params)
1296
- #print("Bounds list:", bounds_list)
1297
- #print("Bounds dict:", bounds_dict)
1298
- #tmp = {"window_size_centroiding":input.window_size_centroiding(), "window_size_matching":input.window_size_matching(), "noise_threshold":input.noise_threshold(), "wf_mz":input.wf_mz(), "wf_int":input.wf_int(), "LET_threshold":input.LET_threshold(), "entropy_dimension":input.entropy_dimension()}
1299
- #print(tmp)
1300
- if input.chromatography_platform() == 'HRMS':
1301
- tune_params_DE(query_data=input.query_data()[0]["datapath"],
1302
- reference_data=input.reference_data()[0]["datapath"],
1303
- similarity_measure=input.similarity_measure(),
1304
- weights=weights,
1305
- spectrum_preprocessing_order=input.spectrum_preprocessing_order(),
1306
- mz_min=input.mz_min(),
1307
- mz_max=input.mz_max(),
1308
- int_min=input.int_min(),
1309
- int_max=input.int_max(),
1310
- high_quality_reference_library=input.high_quality_reference_library(),
1311
- optimize_params=list(input.params()),
1312
- param_bounds={"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0)},
1313
- #param_bounds=bounds_dict,
1314
- default_params={"window_size_centroiding":input.window_size_centroiding(), "window_size_matching":input.window_size_matching(), "noise_threshold":input.noise_threshold(), "wf_mz":input.wf_mz(), "wf_int":input.wf_int(), "LET_threshold":input.LET_threshold(), "entropy_dimension":input.entropy_dimension()})
1315
- #print('here!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n')
3713
+ def _safe_float(v, default):
3714
+ try:
3715
+ if v is None:
3716
+ return default
3717
+ return float(v)
3718
+ except Exception:
3719
+ return default
3720
+
3721
+ def _iget(id, default=None):
3722
+ if id in input:
3723
+ try:
3724
+ return input[id]()
3725
+ except SilentException:
3726
+ return default
3727
+ return default
3728
+
3729
+ loop = asyncio.get_running_loop()
3730
+ q: asyncio.Queue[str | None] = asyncio.Queue()
3731
+
3732
+ class UIWriter(io.TextIOBase):
3733
+ def write(self, s: str):
3734
+ if s:
3735
+ loop.call_soon_threadsafe(q.put_nowait, s)
3736
+ return len(s)
3737
+ def flush(self): pass
3738
+
3739
+ async def _drain():
3740
+ while True:
3741
+ msg = await q.get()
3742
+ if msg is None:
3743
+ break
3744
+ match_log_rv.set(match_log_rv.get() + msg)
3745
+ await reactive.flush()
3746
+
3747
+ drain_task = asyncio.create_task(_drain())
3748
+ writer = UIWriter()
3749
+
3750
+ try:
3751
+ qfile = _iget("query_data")[0]["datapath"]
3752
+ rfile = _iget("reference_data")[0]["datapath"]
1316
3753
 
3754
+ platform = _iget("chromatography_platform", "HRMS")
3755
+ sim = _iget("similarity_measure", "cosine")
3756
+ spro = _iget("spectrum_preprocessing_order", "FCNMWL")
3757
+
3758
+ hq_raw = _iget("high_quality_reference_library", False)
3759
+ if isinstance(hq_raw, str):
3760
+ hq = hq_raw.lower() == "true"
3761
+ else:
3762
+ hq = bool(hq_raw)
3763
+
3764
+ mz_min = _safe_float(_iget("mz_min", 0.0), 0.0)
3765
+ mz_max = _safe_float(_iget("mz_max", 99_999_999.0), 99_999_999.0)
3766
+ int_min = _safe_float(_iget("int_min", 0.0), 0.0)
3767
+ int_max = _safe_float(_iget("int_max", 999_999_999.0), 999_999_999.0)
3768
+
3769
+ w_text = _iget("weights", "") or ""
3770
+ w_list = [float(w.strip()) for w in w_text.split(",") if w.strip()]
3771
+ w_list = (w_list + [0.0, 0.0, 0.0, 0.0])[:4]
3772
+ weights = {"Cosine": w_list[0], "Shannon": w_list[1], "Renyi": w_list[2], "Tsallis": w_list[3]}
3773
+
3774
+ opt_params = tuple(_iget("params", ()) or ())
3775
+ bounds_dict = {}
3776
+ param_defaults = PARAMS_HRMS if platform == "HRMS" else PARAMS_NRMS
3777
+ for p in opt_params:
3778
+ lo = _safe_float(_iget(f"min_{p}", param_defaults.get(p, (0.0, 1.0))[0]),
3779
+ param_defaults.get(p, (0.0, 1.0))[0])
3780
+ hi = _safe_float(_iget(f"max_{p}", param_defaults.get(p, (0.0, 1.0))[1]),
3781
+ param_defaults.get(p, (0.0, 1.0))[1])
3782
+ if lo > hi:
3783
+ lo, hi = hi, lo
3784
+ bounds_dict[p] = (lo, hi)
3785
+
3786
+ defaults = {
3787
+ "window_size_centroiding": _safe_float(_iget("window_size_centroiding", 0.5), 0.5),
3788
+ "window_size_matching": _safe_float(_iget("window_size_matching", 0.5), 0.5),
3789
+ "noise_threshold": _safe_float(_iget("noise_threshold", 0.0), 0.0),
3790
+ "wf_mz": _safe_float(_iget("wf_mz", 0.0), 0.0),
3791
+ "wf_int": _safe_float(_iget("wf_int", 1.0), 1.0),
3792
+ "LET_threshold": _safe_float(_iget("LET_threshold", 0.0), 0.0),
3793
+ "entropy_dimension": _safe_float(_iget("entropy_dimension", 1.1), 1.1),
3794
+ }
3795
+ if platform == "NRMS":
3796
+ defaults.pop("window_size_centroiding", None)
3797
+ defaults.pop("window_size_matching", None)
3798
+
3799
+ except Exception as e:
3800
+ import traceback
3801
+ tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
3802
+ match_log_rv.set(match_log_rv.get() + f"\n❌ Input snapshot failed:\n{tb}\n")
3803
+ is_tuning_DE_running.set(False); is_any_job_running.set(False)
3804
+ await q.put(None); await drain_task; await reactive.flush()
3805
+ return
3806
+
3807
+ def _run():
3808
+ with redirect_stdout(writer), redirect_stderr(writer):
3809
+ return tune_params_DE(
3810
+ query_data=qfile,
3811
+ reference_data=rfile,
3812
+ precursor_ion_mz_tolerance=float(input.precursor_ion_mz_tolerance()),
3813
+ ionization_mode=input.ionization_mode(),
3814
+ adduct=input.adduct(),
3815
+ chromatography_platform=input.chromatography_platform(),
3816
+ similarity_measure=sim,
3817
+ weights=weights,
3818
+ spectrum_preprocessing_order=spro,
3819
+ mz_min=mz_min, mz_max=mz_max,
3820
+ int_min=int_min, int_max=int_max,
3821
+ high_quality_reference_library=hq,
3822
+ optimize_params=list(opt_params),
3823
+ param_bounds=bounds_dict,
3824
+ default_params=defaults,
3825
+ de_workers=1,
3826
+ maxiters=input.max_iterations()
3827
+ )
3828
+
3829
+ try:
3830
+ _ = await asyncio.to_thread(_run)
3831
+ match_log_rv.set(match_log_rv.get() + "\n✅ Differential evolution finished.\n")
3832
+ except Exception as e:
3833
+ import traceback
3834
+ tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
3835
+ match_log_rv.set(match_log_rv.get() + f"\n❌ {type(e).__name__}: {e}\n{tb}\n")
3836
+ finally:
3837
+ await q.put(None)
3838
+ await drain_task
3839
+ is_tuning_DE_running.set(False)
3840
+ is_any_job_running.set(False)
3841
+ await reactive.flush()
1317
3842
 
1318
3843
 
1319
3844
  @reactive.effect
@@ -1335,8 +3860,12 @@ def server(input, output, session):
1335
3860
  return run_status_parameter_tuning_grid.get()
1336
3861
  return run_status_parameter_tuning_DE.get()
1337
3862
 
3863
+ @output
3864
+ @render.text
3865
+ def run_log():
3866
+ return match_log_rv.get()
1338
3867
 
1339
- app = App(app_ui, server)
1340
3868
 
3869
+ app = App(app_ui, server)
1341
3870
 
1342
3871