pycompound 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
app.py CHANGED
@@ -1,15 +1,6 @@
1
1
 
2
2
  from shiny import App, ui, reactive, render, req
3
3
  from shiny.types import SilentException
4
- from pycompound.spec_lib_matching import run_spec_lib_matching_on_HRMS_data
5
- from pycompound.spec_lib_matching import run_spec_lib_matching_on_NRMS_data
6
- from pycompound.spec_lib_matching import tune_params_on_HRMS_data_grid
7
- from pycompound.spec_lib_matching import tune_params_on_NRMS_data_grid
8
- from pycompound.spec_lib_matching import tune_params_on_HRMS_data_grid_shiny
9
- from pycompound.spec_lib_matching import tune_params_on_NRMS_data_grid_shiny
10
- from pycompound.spec_lib_matching import tune_params_DE
11
- from pycompound.plot_spectra import generate_plots_on_HRMS_data
12
- from pycompound.plot_spectra import generate_plots_on_NRMS_data
13
4
  from pathlib import Path
14
5
  from contextlib import redirect_stdout, redirect_stderr
15
6
  import contextlib
@@ -28,10 +19,2296 @@ import ast
28
19
  from numbers import Real
29
20
  import logging
30
21
  from scipy.optimize import differential_evolution
22
+ import scipy
23
+ import scipy.stats
24
+ from itertools import product
25
+ import json
26
+ import re
27
+ import urllib.parse
28
+ import urllib.request
29
+ import matplotlib
31
30
 
31
+ matplotlib.rcParams['svg.fonttype'] = 'none'
32
32
 
33
33
  _LOG_QUEUE: asyncio.Queue[str] = asyncio.Queue()
34
34
 
35
+ _ADDUCT_PAT = re.compile(r"\s*(?:\[(M[^\]]+)\]|(M[+-][A-Za-z0-9]+)\+?)\s*$", re.IGNORECASE)
36
+
37
def start_log_consumer():
    """Start the single background task that drains _LOG_QUEUE into the UI log.

    Idempotent: a flag stored on the function object guarantees at most one
    consumer task is ever created, even when called from multiple sessions.

    BUGFIX: this function was defined twice, byte-for-byte identical; the
    second definition was dead code that silently shadowed the first.  Only
    one definition is kept.
    """
    if getattr(start_log_consumer, "_started", False):
        return
    start_log_consumer._started = True

    async def _consume():
        # Run forever: append each queued log chunk to the reactive log value
        # and flush so connected clients see it immediately.
        while True:
            s = await _LOG_QUEUE.get()
            match_log_rv.set(match_log_rv.get() + s)
            await reactive.flush()

    asyncio.create_task(_consume())
65
+
66
def _strip_adduct(name: str) -> str:
    """Return *name* with any trailing adduct annotation (e.g. '[M+H]', 'M+Na+') removed."""
    without_adduct = _ADDUCT_PAT.sub("", name)
    return without_adduct.strip()
68
+
69
def get_pubchem_url(query: str) -> str:
    """Resolve a compound name to a PubChem URL.

    First tries the PUG REST name->CID lookup and, when it returns a numeric
    CID, links directly to that compound page.  On any failure (network error,
    unknown name, non-numeric response) falls back to a PubChem search-page
    URL for the adduct-stripped name.
    """
    base_name = _strip_adduct(query)
    quoted = urllib.parse.quote(base_name)
    endpoint = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{quoted}/cids/TXT"
    try:
        with urllib.request.urlopen(endpoint, timeout=10) as response:
            body = response.read().decode("utf-8").strip()
        cid = body.splitlines()[0].strip()
        if cid.isdigit():
            return f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
    except Exception:
        # Best-effort lookup: any failure simply drops through to the search URL.
        pass
    return f"https://pubchem.ncbi.nlm.nih.gov/#query={quoted}"
82
+
83
+
84
+
85
def build_library_from_raw_data(input_path=None, output_path=None, is_reference=False):
    """Validate the arguments for converting a raw MS file to a library file.

    input_path must point to an mgf, mzML, cdf, json, or msp file; when
    output_path is omitted a default <cwd>/<basename>.csv is derived.  On any
    invalid argument an error message is printed and the process exits.
    """
    if input_path is None:
        print('Error: please specify input_path (i.e. the path to the input mgf, mzML, cdf, json, or msp file). Mandatory argument.')
        sys.exit()

    if output_path is None:
        # Default output: current working directory, input file's basename, .csv suffix.
        basename = input_path.split('/')[-1].split('.')[0]
        output_path = f'{Path.cwd()}/{basename}.csv'
        print(f'Warning: no output_path specified, so library is written to {output_path}')

    if is_reference not in [True, False]:
        print('Error: is_reference must be either \'True\' or \'False\'.')
        sys.exit()

    # Determine the input format from the last three or four characters of the path.
    suffix3 = input_path[(len(input_path)-3):len(input_path)]
    suffix4 = input_path[(len(input_path)-4):len(input_path)]
    if suffix3 in ('mgf', 'MGF'):
        input_file_type = 'mgf'
    elif suffix4 in ('mzML', 'mzml', 'MZML'):
        input_file_type = 'mzML'
    elif suffix4 in ('json', 'JSON'):
        input_file_type = 'json'
    elif suffix3 in ('cdf', 'CDF'):
        input_file_type = 'cdf'
    elif suffix3 in ('msp', 'MSP'):
        input_file_type = 'msp'
    else:
        print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'json\', or \'msp\' file must be passed to --input_path')
        sys.exit()
    # NOTE(review): input_file_type is computed but unused in this copy of the
    # function — presumably the conversion body lives in the pycompound
    # package proper; confirm before relying on this helper to write files.
116
+
117
+
118
+
119
def generate_plots_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz=None, precursor_ion_mz_tolerance=None, ionization_mode=None, collision_energy=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
    """Plot a mirrored (head-to-tail) comparison of two HRMS spectra and save it as an SVG.

    The top panel shows the untransformed spectra; the bottom panel shows them
    after the preprocessing steps named in spectrum_preprocessing_order
    (F=filter, C=centroid, N=denoise, M=match peaks, W=weight-factor transform,
    L=low-entropy transform), annotated with the similarity score.

    query_data / reference_data are paths to tab-separated txt libraries or raw
    files (mgf/mzML/cdf/json/msp — converted on the fly).  Invalid arguments
    print an error and exit the process (CLI-style).  Returns the matplotlib
    Figure when return_plot is True, otherwise None.

    NOTE: `weights` uses a mutable default argument; it is only read here,
    never mutated, so the shared default is safe.
    """
    raw_extensions = ('mgf','MGF','mzML','mzml','MZML','cdf','CDF','msp','MSP','json','JSON')

    # ---- load the query library ----
    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.',1)[-1]
        if extension in raw_extensions:
            # BUGFIX: derive the temp name from the stem; the old `[:-3] + 'txt'`
            # produced names like 'x.mztxt' for 4-character extensions.
            output_path_tmp = query_data.rsplit('.',1)[0] + '.txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        unique_query_ids = [str(tmp) for tmp in df_query['id'].unique().tolist()]

    # ---- load the reference library and apply optional metadata filters ----
    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        extension = reference_data.rsplit('.',1)[-1]
        if extension in raw_extensions:
            output_path_tmp = reference_data.rsplit('.',1)[0] + '.txt'
            build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
            df_reference = pd.read_csv(output_path_tmp, sep='\t')
        if extension in ('txt','TXT'):
            df_reference = pd.read_csv(reference_data, sep='\t')
        cols_tmp = df_reference.columns.tolist()
        if 'precursor_ion_mz' in cols_tmp and 'ionization_mode' in cols_tmp and 'collision_energy' in cols_tmp:
            if precursor_ion_mz is not None and precursor_ion_mz_tolerance is not None:
                # BUGFIX: '&' binds tighter than '>'/'<', so each comparison must be
                # parenthesized; the old unparenthesized form did not filter correctly.
                lower = precursor_ion_mz - precursor_ion_mz_tolerance
                upper = precursor_ion_mz + precursor_ion_mz_tolerance
                df_reference = df_reference.loc[(df_reference['precursor_ion_mz'] > lower) & (df_reference['precursor_ion_mz'] < upper)]
            if ionization_mode is not None:
                # BUGFIX: compare the column to the value; the old code indexed the
                # frame with the constant expression `'ionization_mode'==ionization_mode`.
                df_reference = df_reference.loc[df_reference['ionization_mode'] == ionization_mode]
            if collision_energy is not None:
                # BUGFIX: same column-vs-constant fix as ionization_mode above.
                df_reference = df_reference.loc[df_reference['collision_energy'] == collision_energy]
            df_reference = df_reference.drop(columns=['precursor_ion_mz','ionization_mode','collision_energy'])
        unique_reference_ids = [str(tmp) for tmp in df_reference['id'].unique().tolist()]

    # ---- default spectrum IDs: first row of each library ----
    if spectrum_ID1 is not None:
        spectrum_ID1 = str(spectrum_ID1)
    else:
        spectrum_ID1 = str(df_query['id'].iloc[0])
        print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')

    if spectrum_ID2 is not None:
        spectrum_ID2 = str(spectrum_ID2)
    else:
        spectrum_ID2 = str(df_reference['id'].iloc[0])
        print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')

    # ---- validate the preprocessing order ----
    if spectrum_preprocessing_order is not None:
        spectrum_preprocessing_order = list(spectrum_preprocessing_order)
    else:
        spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
    if 'M' not in spectrum_preprocessing_order:
        print('Error: \'M\' must be a character in spectrum_preprocessing_order.')
        sys.exit()
    if 'C' in spectrum_preprocessing_order:
        # Centroiding must precede peak matching: matching merges the two spectra
        # onto a common m/z grid, after which centroiding is meaningless.
        if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
            print('Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
            sys.exit()
    if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
        print('Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
        sys.exit()

    if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
        print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
        sys.exit()

    # ---- validate numeric parameters (ints are promoted to float where floats are required) ----
    if isinstance(int_min, int):
        int_min = float(int_min)
    if isinstance(int_max, int):
        int_max = float(int_max)
    if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
        print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
        sys.exit()
    if mz_min < 0:
        print('\nError: mz_min should be a non-negative integer')
        sys.exit()
    if mz_max <= 0:
        print('\nError: mz_max should be a positive integer')
        sys.exit()
    if int_min < 0:
        print('\nError: int_min should be a non-negative float')
        sys.exit()
    if int_max <= 0:
        print('\nError: int_max should be a positive float')
        sys.exit()

    if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
        print('Error: window_size_centroiding must be a positive float.')
        sys.exit()
    if isinstance(window_size_matching,float) is False or window_size_matching <= 0.0:
        print('Error: window_size_matching must be a positive float.')
        sys.exit()

    if isinstance(noise_threshold, int):
        noise_threshold = float(noise_threshold)
    if isinstance(noise_threshold,float) is False or noise_threshold < 0:
        print('Error: noise_threshold must be a positive float.')
        sys.exit()

    if isinstance(wf_intensity, int):
        wf_intensity = float(wf_intensity)
    if isinstance(wf_mz, int):
        wf_mz = float(wf_mz)
    if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
        print('Error: wf_mz and wf_intensity must be integers or floats')
        sys.exit()

    if entropy_dimension <= 0:
        print('\nError: entropy_dimension should be a positive float')
        sys.exit()

    # Only 'standard' normalization for now; softmax caused numerical errors/warnings.
    normalization_method = 'standard'

    if y_axis_transformation not in ['normalized','none','log10','sqrt']:
        print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
        sys.exit()

    if output_path is None:
        print(f'Warning: plots will be saved to the svg ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
        output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'

    # ---- extract the two spectra as (n_peaks, 2) arrays of [m/z, intensity] ----
    # Both IDs may live in the same library (query/query or reference/reference);
    # otherwise ID1 is taken from the query and ID2 from the reference, swapping
    # the two IDs first if they were given the other way round.
    if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
        query_idx = unique_query_ids.index(spectrum_ID1)
        reference_idx = unique_query_ids.index(spectrum_ID2)
        q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[query_idx])[0]
        r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[reference_idx])[0]
        q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
    elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
        query_idx = unique_reference_ids.index(spectrum_ID1)
        reference_idx = unique_reference_ids.index(spectrum_ID2)
        q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[query_idx])[0]
        r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == unique_reference_ids[reference_idx])[0]
        q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
    else:
        if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
            spectrum_ID1, spectrum_ID2 = spectrum_ID2, spectrum_ID1
        query_idx = unique_query_ids.index(spectrum_ID1)
        reference_idx = unique_reference_ids.index(spectrum_ID2)
        q_idxs_tmp = np.where(df_query['id'].astype(str) == unique_query_ids[query_idx])[0]
        r_idxs_tmp = np.where(df_reference['id'].astype(str) == unique_reference_ids[reference_idx])[0]
        q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
        r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))

    # ---- top panel: untransformed spectra (query up, reference mirrored down) ----
    q_spec_pre_trans = q_spec.copy()
    r_spec_pre_trans = r_spec.copy()
    q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
    r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)

    if y_axis_transformation == 'normalized':
        q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
        r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
        ylab = 'Normalized Intensity'
    elif y_axis_transformation == 'log10':
        # +1 so zero intensities map to 0 rather than -inf
        q_spec_pre_trans[:,1] = np.log10(np.array(q_spec_pre_trans[:,1]+1, dtype=float))
        r_spec_pre_trans[:,1] = np.log10(np.array(r_spec_pre_trans[:,1]+1, dtype=float))
        ylab = 'log10(Intensity)'
    elif y_axis_transformation == 'sqrt':
        q_spec_pre_trans[:,1] = np.sqrt(np.array(q_spec_pre_trans[:,1], dtype=float))
        r_spec_pre_trans[:,1] = np.sqrt(np.array(r_spec_pre_trans[:,1], dtype=float))
        ylab = 'sqrt(Intensity)'
    else:
        ylab = 'Raw Intensity'

    fig, axes = plt.subplots(nrows=2, ncols=1)

    plt.subplot(2,1,1)
    plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*q_spec_pre_trans.shape[0], ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID 1: {spectrum_ID1}')
    plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*r_spec_pre_trans.shape[0], ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID 2: {spectrum_ID2}')
    plt.xlabel('m/z', fontsize=7)
    plt.ylabel(ylab, fontsize=7)
    plt.xticks(fontsize=7)
    plt.yticks(fontsize=7)
    plt.title('Untransformed Spectra', fontsize=10)

    # Raw-scale ranges reported in the figure footer.
    mz_min_tmp = min(round(q_spec[:,0].min(),1), round(r_spec[:,0].min(),1))
    # BUGFIX: the upper end of the m/z range used min(); use max(), matching the
    # intensity range computation below.
    mz_max_tmp = max(round(q_spec[:,0].max(),1), round(r_spec[:,0].max(),1))
    int_min_tmp = min(round(q_spec[:,1].min(),1), round(r_spec[:,1].min(),1))
    int_max_tmp = max(round(q_spec[:,1].max(),1), round(r_spec[:,1].max(),1))

    # ---- apply the preprocessing pipeline in the user-specified order ----
    is_matched = False
    for transformation in spectrum_preprocessing_order:
        # BUGFIX: the reference-spectrum guard used r_spec.shape[1] (always 2, the
        # number of columns) instead of shape[0] (the number of peaks) throughout.
        # Only one branch runs per iteration, so this can be computed once here.
        both_have_peaks = q_spec.shape[0] > 1 and r_spec.shape[0] > 1
        if transformation == 'C' and both_have_peaks:
            q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
            r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
        if transformation == 'M' and both_have_peaks:
            # Merge onto a common m/z grid: column 0 = m/z, 1 = query, 2 = reference.
            m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
            q_spec = m_spec[:,0:2]
            r_spec = m_spec[:,[0,2]]
            is_matched = True
        if transformation == 'W' and both_have_peaks:
            q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
            r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
        if transformation == 'L' and both_have_peaks:
            q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
            r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
        if transformation == 'N' and both_have_peaks:
            q_spec = remove_noise(q_spec, nr=noise_threshold)
            # A high-quality reference library is assumed clean, so skip denoising it.
            if high_quality_reference_library == False or high_quality_reference_library == 'False':
                r_spec = remove_noise(r_spec, nr=noise_threshold)
        if transformation == 'F' and both_have_peaks:
            q_spec = filter_spec_lcms(q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched)
            if high_quality_reference_library == False or high_quality_reference_library == 'False':
                r_spec = filter_spec_lcms(r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched)

    q_ints = q_spec[:,1]
    r_ints = r_spec[:,1]

    # BUGFIX: same shape[1] -> shape[0] guard fix as in the loop above.
    if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
        similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
    else:
        similarity_score = 0

    # ---- bottom panel: transformed spectra (or an explanatory message) ----
    plt.subplot(2,1,2)

    if q_spec.shape[0] > 1:
        if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
            plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
            plt.xticks([])
            plt.yticks([])
        else:
            if y_axis_transformation == 'normalized':
                q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
                r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
                ylab = 'Normalized Intensity'
            elif y_axis_transformation == 'log10':
                q_spec[:,1] = np.log10(q_spec[:,1]+1)
                r_spec[:,1] = np.log10(r_spec[:,1]+1)
                ylab = 'log10(Intensity)'
            elif y_axis_transformation == 'sqrt':
                q_spec[:,1] = np.sqrt(q_spec[:,1])
                r_spec[:,1] = np.sqrt(r_spec[:,1])
                ylab = 'sqrt(Intensity)'
            else:
                ylab = 'Raw Intensity'
            plt.vlines(x=q_spec[:,0], ymin=[0]*q_spec.shape[0], ymax=q_spec[:,1], linewidth=3, color='blue')
            plt.vlines(x=r_spec[:,0], ymin=[0]*r_spec.shape[0], ymax=-r_spec[:,1], linewidth=3, color='red')
            plt.xlabel('m/z', fontsize=7)
            plt.ylabel(ylab, fontsize=7)
            plt.xticks(fontsize=7)
            plt.yticks(fontsize=7)
            plt.title('Transformed Spectra', fontsize=10)
    else:
        plt.text(0.5, 0.5, 'All points in the spectra were removed during preprocessing. \nChange the spectrum_preprocesing_order and/or change other spectrum-preprocessing parameters.', ha='center', va='center', fontsize=7, color='black')
        plt.xticks([])
        plt.yticks([])

    plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
    plt.figlegend(loc='upper center')

    # ---- footer annotations: parameters used for this comparison ----
    fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
    fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
    fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
    fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
    fig.text(0.05, 0.08, f'Window Size (Centroiding): {window_size_centroiding}', fontsize=7)
    fig.text(0.05, 0.05, f'Window Size (Matching): {window_size_matching}', fontsize=7)
    if similarity_measure == 'mixture':
        fig.text(0.05, 0.02, f'Weights for mixture similarity: {weights}', fontsize=7)

    fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{mz_min_tmp},{mz_max_tmp}]', fontsize=7)
    fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
    fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
    fig.text(0.40, 0.11, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
    fig.text(0.40, 0.08, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)

    # Optional clickable PubChem links (SVG output preserves URLs via set_url).
    if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
        url_tmp1 = get_pubchem_url(query=spectrum_ID1)
        url_tmp2 = get_pubchem_url(query=spectrum_ID2)
        t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
        t2 = fig.text(0.40, 0.02, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
        t1.set_url(url_tmp1)
        t2.set_url(url_tmp2)

    if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
        url_tmp1 = get_pubchem_url(query=spectrum_ID1)
        t1 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
        t1.set_url(url_tmp1)

    if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
        url_tmp2 = get_pubchem_url(query=spectrum_ID2)
        t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
        t2.set_url(url_tmp2)

    fig.savefig(output_path, format='svg')

    if return_plot == True:
        return fig
430
+
431
+
432
+ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_ID1=None, spectrum_ID2=None, print_url_spectrum1='No', print_url_spectrum2='No', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FNLW', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None, return_plot=False):
433
+
434
+ if query_data is None:
435
+ print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
436
+ sys.exit()
437
+ else:
438
+ extension = query_data.rsplit('.',1)
439
+ extension = extension[(len(extension)-1)]
440
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
441
+ output_path_tmp = query_data[:-3] + 'txt'
442
+ build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
443
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
444
+ if extension == 'txt' or extension == 'TXT':
445
+ df_query = pd.read_csv(query_data, sep='\t')
446
+ unique_query_ids = df_query['id'].unique()
447
+
448
+ if reference_data is None:
449
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
450
+ sys.exit()
451
+ else:
452
+ extension = reference_data.rsplit('.',1)
453
+ extension = extension[(len(extension)-1)]
454
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
455
+ output_path_tmp = reference_data[:-3] + 'txt'
456
+ build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
457
+ df_reference = pd.read_csv(output_path_tmp, sep='\t')
458
+ if extension == 'txt' or extension == 'TXT':
459
+ df_reference = pd.read_csv(reference_data, sep='\t')
460
+ unique_reference_ids = df_reference['id'].unique()
461
+
462
+
463
+ if spectrum_ID1 is not None:
464
+ spectrum_ID1 = str(spectrum_ID1)
465
+ else:
466
+ spectrum_ID1 = str(df_query.iloc[0,0])
467
+ print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')
468
+
469
+ if spectrum_ID2 is not None:
470
+ spectrum_ID2 = str(spectrum_ID2)
471
+ else:
472
+ spectrum_ID2 = str(df_reference.iloc[0,0])
473
+ print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')
474
+
475
+ if spectrum_preprocessing_order is not None:
476
+ spectrum_preprocessing_order = list(spectrum_preprocessing_order)
477
+ else:
478
+ spectrum_preprocessing_order = ['F','N','W','L']
479
+ if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
480
+ print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
481
+ sys.exit()
482
+
483
+ if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
484
+ print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
485
+ sys.exit()
486
+
487
+ if isinstance(int_min,int) is True:
488
+ int_min = float(int_min)
489
+ if isinstance(int_max,int) is True:
490
+ int_max = float(int_max)
491
+ if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
492
+ print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
493
+ sys.exit()
494
+ if mz_min < 0:
495
+ print('\nError: mz_min should be a non-negative integer')
496
+ sys.exit()
497
+ if mz_max <= 0:
498
+ print('\nError: mz_max should be a positive integer')
499
+ sys.exit()
500
+ if int_min < 0:
501
+ print('\nError: int_min should be a non-negative float')
502
+ sys.exit()
503
+ if int_max <= 0:
504
+ print('\nError: int_max should be a positive float')
505
+ sys.exit()
506
+
507
+ if isinstance(noise_threshold,int) is True:
508
+ noise_threshold = float(noise_threshold)
509
+ if isinstance(noise_threshold,float) is False or noise_threshold < 0:
510
+ print('Error: noise_threshold must be a positive float.')
511
+ sys.exit()
512
+
513
+ if isinstance(wf_intensity,int) is True:
514
+ wf_intensity = float(wf_intensity)
515
+ if isinstance(wf_mz,int) is True:
516
+ wf_mz = float(wf_mz)
517
+ if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
518
+ print('Error: wf_mz and wf_intensity must be integers or floats')
519
+ sys.exit()
520
+
521
+ if entropy_dimension <= 0:
522
+ print('\nError: entropy_dimension should be a positive float')
523
+ sys.exit()
524
+ else:
525
+ q = entropy_dimension
526
+
527
+ normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
528
+
529
+ if y_axis_transformation not in ['normalized','none','log10','sqrt']:
530
+ print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
531
+ sys.exit()
532
+
533
+ if output_path is None:
534
+ print(f'Warning: plots will be saved to the svg ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg in the current working directory.')
535
+ output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.svg'
536
+
537
+ min_mz = np.min([df_query['mz_ratio'].min(), df_reference['mz_ratio'].min()])
538
+ max_mz = np.max([df_query['mz_ratio'].max(), df_reference['mz_ratio'].max()])
539
+ mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
540
+
541
+ unique_query_ids = df_query['id'].unique().tolist()
542
+ unique_reference_ids = df_reference['id'].unique().tolist()
543
+ unique_query_ids = [str(ID) for ID in unique_query_ids]
544
+ unique_reference_ids = [str(ID) for ID in unique_reference_ids]
545
+ common_IDs = np.intersect1d([str(ID) for ID in unique_query_ids], [str(ID) for ID in unique_reference_ids])
546
+ if len(common_IDs) > 0:
547
+ print(f'Warning: the query and reference library have overlapping IDs: {common_IDs}')
548
+
549
+ if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
550
+ q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID1)[0]
551
+ r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID2)[0]
552
+ q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
553
+ r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
554
+ elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
555
+ q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID1)[0]
556
+ r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID2)[0]
557
+ q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
558
+ r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
559
+ else:
560
+ if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
561
+ spec_tmp = spectrum_ID1
562
+ spectrum_ID1 = spectrum_ID2
563
+ spectrum_ID2 = spec_tmp
564
+ q_idxs_tmp = np.where(df_query['id'].astype(str) == spectrum_ID1)[0]
565
+ r_idxs_tmp = np.where(df_reference['id'].astype(str) == spectrum_ID2)[0]
566
+ q_spec = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
567
+ r_spec = np.asarray(pd.concat([df_reference['mz_ratio'].iloc[r_idxs_tmp], df_reference['intensity'].iloc[r_idxs_tmp]], axis=1).reset_index(drop=True))
568
+
569
+ q_spec = convert_spec(q_spec,mzs)
570
+ r_spec = convert_spec(r_spec,mzs)
571
+
572
+ int_min_tmp_q = min(q_spec[q_spec[:,1].nonzero(),1][0])
573
+ int_min_tmp_r = min(r_spec[r_spec[:,1].nonzero(),1][0])
574
+ int_max_tmp_q = max(q_spec[q_spec[:,1].nonzero(),1][0])
575
+ int_max_tmp_r = max(r_spec[r_spec[:,1].nonzero(),1][0])
576
+ int_min_tmp = int(min([int_min_tmp_q,int_min_tmp_r]))
577
+ int_max_tmp = int(max([int_max_tmp_q,int_max_tmp_r]))
578
+
579
+ fig, axes = plt.subplots(nrows=2, ncols=1)
580
+
581
+ plt.subplot(2,1,1)
582
+
583
+ if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
584
+ plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
585
+ plt.xticks([])
586
+ plt.yticks([])
587
+ else:
588
+ q_spec_pre_trans = q_spec.copy()
589
+ r_spec_pre_trans = r_spec.copy()
590
+ q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
591
+ r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
592
+
593
+ if y_axis_transformation == 'normalized':
594
+ q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
595
+ r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
596
+ ylab = 'Normalized Intensity'
597
+ elif y_axis_transformation == 'log10':
598
+ q_spec_pre_trans[:,1] = np.log10(q_spec_pre_trans[:,1]+1)
599
+ r_spec_pre_trans[:,1] = np.log10(r_spec_pre_trans[:,1]+1)
600
+ ylab = 'log10(Intensity)'
601
+ elif y_axis_transformation == 'sqrt':
602
+ q_spec_pre_trans[:,1] = np.sqrt(q_spec_pre_trans[:,1])
603
+ r_spec_pre_trans[:,1] = np.sqrt(r_spec_pre_trans[:,1])
604
+ ylab = 'sqrt(Intensity)'
605
+ else:
606
+ ylab = 'Raw Intensity'
607
+ plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*len(q_spec_pre_trans[:,0]), ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID1: {spectrum_ID1}')
608
+ plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*len(r_spec_pre_trans[:,0]), ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID2: {spectrum_ID2}')
609
+ plt.xlabel('m/z',fontsize=7)
610
+ plt.ylabel(ylab, fontsize=7)
611
+ plt.xticks(fontsize=7)
612
+ plt.yticks(fontsize=7)
613
+ plt.title('Untransformed Query and Reference Spectra', fontsize=10)
614
+
615
+ for transformation in spectrum_preprocessing_order:
616
+ if transformation == 'W':
617
+ q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
618
+ r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
619
+ if transformation == 'L':
620
+ q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method)
621
+ r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method)
622
+ if transformation == 'N':
623
+ q_spec = remove_noise(q_spec, nr = noise_threshold)
624
+ if high_quality_reference_library == False or high_quality_reference_library == 'False':
625
+ r_spec = remove_noise(r_spec, nr = noise_threshold)
626
+ if transformation == 'F':
627
+ q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
628
+ if high_quality_reference_library == False or high_quality_reference_library == 'False':
629
+ r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
630
+
631
+ if q_spec.shape[0] > 1:
632
+ similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
633
+ else:
634
+ similarity_score = 0
635
+
636
+
637
+ plt.subplot(2,1,2)
638
+
639
+ if q_spec.shape[0] == 0 or r_spec.shape[0] == 0:
640
+ plt.text(0.5, 0.5, 'The query and/or reference spectrum has no ion fragments left after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
641
+ plt.xticks([])
642
+ plt.yticks([])
643
+ elif np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
644
+ plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
645
+ plt.xticks([])
646
+ plt.yticks([])
647
+ else:
648
+ if y_axis_transformation == 'normalized':
649
+ q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
650
+ r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
651
+ ylab='Normalized Intensity'
652
+ elif y_axis_transformation == 'log10':
653
+ q_spec[:,1] = np.log10(q_spec[:,1]+1)
654
+ r_spec[:,1] = np.log10(r_spec[:,1]+1)
655
+ ylab='log10(Intensity)'
656
+ elif y_axis_transformation == 'sqrt':
657
+ q_spec[:,1] = np.sqrt(q_spec[:,1])
658
+ r_spec[:,1] = np.sqrt(r_spec[:,1])
659
+ ylab='sqrt(Intensity)'
660
+ else:
661
+ ylab = 'Raw Intensity'
662
+ plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=q_spec[:,1], linewidth=3, color='blue')
663
+ plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=-r_spec[:,1], linewidth=3, color='red')
664
+ plt.xlabel('m/z', fontsize=7)
665
+ plt.ylabel(ylab, fontsize=7)
666
+ plt.xticks(fontsize=7)
667
+ plt.yticks(fontsize=7)
668
+ plt.title(f'Transformed Query and Reference Spectra', fontsize=10)
669
+
670
+ plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
671
+ plt.figlegend(loc='upper center')
672
+
673
+ fig.text(0.05, 0.20, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
674
+ fig.text(0.05, 0.17, f'Similarity Score: {round(similarity_score, 4)}', fontsize=7)
675
+ fig.text(0.05, 0.14, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
676
+ fig.text(0.05, 0.11, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
677
+ fig.text(0.05, 0.08, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
678
+ if similarity_measure == 'mixture':
679
+ fig.text(0.05, 0.05, f'Weights for mixture similarity: {weights}', fontsize=7)
680
+
681
+ fig.text(0.40, 0.20, f'Raw-Scale M/Z Range: [{min_mz},{max_mz}]', fontsize=7)
682
+ fig.text(0.40, 0.17, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
683
+ fig.text(0.40, 0.14, f'Noise Threshold: {noise_threshold}', fontsize=7)
684
+ fig.text(0.40, 0.11, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
685
+
686
+ if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'Yes':
687
+ url_tmp1 = get_pubchem_url(query=spectrum_ID1)
688
+ url_tmp2 = get_pubchem_url(query=spectrum_ID2)
689
+ t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
690
+ t2 = fig.text(0.40, 0.05, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
691
+ t1.set_url(url_tmp1)
692
+ t2.set_url(url_tmp2)
693
+
694
+ if print_url_spectrum1 == 'Yes' and print_url_spectrum2 == 'No':
695
+ url_tmp1 = get_pubchem_url(query=spectrum_ID1)
696
+ t1 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID1}: {url_tmp1}', fontsize=7)
697
+ t1.set_url(url_tmp1)
698
+
699
+ if print_url_spectrum1 == 'No' and print_url_spectrum2 == 'Yes':
700
+ url_tmp2 = get_pubchem_url(query=spectrum_ID2)
701
+ t2 = fig.text(0.40, 0.08, f'PubChem URL for {spectrum_ID2}: {url_tmp2}', fontsize=7)
702
+ t2.set_url(url_tmp2)
703
+
704
+ fig.savefig(output_path, format='svg')
705
+
706
+ if return_plot == True:
707
+ return fig
708
+
709
+
710
def wf_transform(spec_mzs, spec_ints, wf_mz, wf_int):
    """Weight-factor transform: intensity_i <- mz_i**wf_mz * intensity_i**wf_int."""
    return np.power(spec_mzs, wf_mz) * np.power(spec_ints, wf_int)
713
+
714
+
715
def LE_transform(intensity, thresh, normalization_method):
    """Low-entropy transform of an intensity vector.

    Normalizes the vector, and if its Shannon entropy S lies in (0, thresh)
    raises the normalized intensities to the power (1+S)/(1+thresh) to
    sharpen low-entropy spectra. Vectors normalizing to a zero sum are
    replaced with zeros; otherwise out-of-window spectra pass through.
    """
    normalized = normalize(intensity, method=normalization_method)
    if np.sum(normalized) > 0:
        entropy_val = scipy.stats.entropy(normalized.astype('float'))
        if 0 < entropy_val < thresh:
            # exponent in (0, 1]: boosts small peaks of low-entropy spectra
            intensity = np.power(normalized, (1 + entropy_val) / (1 + thresh))
    else:
        intensity = np.zeros(len(intensity))
    return intensity
725
+
726
+
727
def normalize(intensities, method='standard'):
    """Normalize an intensity vector to unit sum.

    method='standard': divide by the total (in place via /=, so the caller's
    float array is mutated — existing behavior kept).
    method='softmax': exp(x) / sum(exp(x)); falls back to standard
    normalization with a warning when any intensity exceeds 700 (exp would
    overflow). Vectors with a non-positive sum are returned unchanged.
    """
    if np.sum(intensities) > 0:
        if method == 'softmax':
            if np.any(intensities > 700):
                print("Warning: some intensities are too large to exponentiate. Applying standard normalization.")
                intensities /= np.sum(intensities)
            else:
                intensities2 = np.exp(intensities)
                if np.isinf(intensities2).sum() == 0:
                    # BUG FIX: previously divided the *raw* intensities by
                    # sum(exp(intensities)); softmax is exp(x)/sum(exp(x)).
                    intensities = intensities2 / np.sum(intensities2)
        elif method == 'standard':
            intensities /= np.sum(intensities)
    return intensities
740
+
741
+
742
def filter_spec_lcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999, is_matched = False):
    """Restrict a (n, 2) [m/z, intensity] spectrum to the given ranges.

    is_matched=False: rows outside any range are dropped.
    is_matched=True: m/z filters still drop rows, but out-of-range
    intensities zero the whole row instead (keeps matched rows aligned).
    """
    spec = spec[spec[:, 0] >= mz_min]
    spec = spec[spec[:, 0] <= mz_max]
    if not is_matched:
        spec = spec[spec[:, 1] >= int_min]
        spec = spec[spec[:, 1] <= int_max]
    else:
        # BUG FIX: the comparisons were inverted (>= int_min / <= int_max),
        # which zeroed every *in-range* row; mirror filter_spec_gcms instead.
        spec[spec[:, 1] < int_min] = 0
        spec[spec[:, 1] > int_max] = 0
    return spec
754
+
755
+
756
def filter_spec_gcms(spec, mz_min = 0, mz_max = 999999999999, int_min = 0, int_max = 999999999999):
    """Zero the intensity of peaks outside the m/z and intensity windows.

    Rows are kept (GC-MS spectra stay on a fixed m/z grid); only the
    intensity column is cleared for out-of-range entries.
    """
    spec[spec[:, 0] < mz_min, 1] = 0
    spec[spec[:, 0] > mz_max, 1] = 0
    spec[spec[:, 1] < int_min, 1] = 0
    spec[spec[:, 1] > int_max, 1] = 0
    return spec
762
+
763
+
764
def remove_noise(spec, nr):
    """Zero out rows whose intensity falls below nr * (max intensity).

    Single-row spectra and nr=None are passed through unchanged.
    """
    if spec.shape[0] > 1 and nr is not None:
        cutoff = np.max(spec[:, 1]) * nr
        spec[spec[:, 1] < cutoff] = 0
    return spec
770
+
771
+
772
def centroid_spectrum(spec, window_size):
    """Merge peaks closer than window_size into intensity-weighted centroids.

    The spectrum is sorted by m/z; if no adjacent pair is within
    window_size it is returned as-is. Otherwise peaks are consumed in
    descending-intensity order: each peak absorbs its neighbors within
    window_size into one centroid at the intensity-weighted mean m/z.
    Returns the (n, 2) centroided spectrum, or [[0, 0]] when it collapses
    to a single peak (existing behavior, kept — presumably a quality
    filter; TODO confirm).
    """
    spec = spec[np.argsort(spec[:, 0])]

    mz_array = spec[:, 0]
    need_centroid = False
    if mz_array.shape[0] > 1:
        need_centroid = np.min(mz_array[1:] - mz_array[:-1]) <= window_size
    if not need_centroid:
        return spec

    spec_new = []
    for i in np.argsort(-spec[:, 1]):
        if spec[i, 1] > 0:
            # Expand the merge window left and right around peak i.
            i_left = i - 1
            while i_left >= 0 and spec[i, 0] - spec[i_left, 0] <= window_size:
                i_left -= 1
            i_left += 1

            i_right = i + 1
            while i_right < spec.shape[0] and spec[i_right, 0] - spec[i, 0] <= window_size:
                i_right += 1

            intensity_sum = np.sum(spec[i_left:i_right, 1])
            intensity_weighted_sum = np.sum(spec[i_left:i_right, 0] * spec[i_left:i_right, 1])

            spec_new.append([intensity_weighted_sum / intensity_sum, intensity_sum])
            # Consume the merged peaks so they are not re-used.
            spec[i_left:i_right, 1] = 0

    spec_new = np.array(spec_new)
    # BUG FIX: the m/z sort was performed twice back to back; once suffices.
    spec_new = spec_new[np.argsort(spec_new[:, 0])]
    if spec_new.shape[0] > 1:
        return spec_new
    return np.array([[0, 0]])
821
+
822
+
823
+
824
def match_peaks_in_spectra(spec_a, spec_b, window_size):
    """Align two spectra on m/z within +/- window_size Da.

    Walks both peak lists in lockstep and emits rows
    [m/z, intensity_a, intensity_b]; spec_b peaks within window_size of the
    current spec_a peak are summed into that row. Returns an (n, 3) float64
    array, or [[0, 0, 0]] when nothing is emitted. Assumes both inputs are
    sorted by ascending m/z — TODO confirm at call sites.
    """
    a = 0
    b = 0

    spec_merged = []
    peak_b_int = 0.  # running spec_b intensity accumulated onto the current a peak
    while a < spec_a.shape[0] and b < spec_b.shape[0]:
        mass_delta = spec_a[a, 0] - spec_b[b, 0]

        if mass_delta < -window_size:
            # a peak is well below b: emit it with whatever b intensity matched.
            spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
            peak_b_int = 0.
            a += 1
        elif mass_delta > window_size:
            # b peak is well below a: emit it unmatched.
            spec_merged.append([spec_b[b, 0], 0., spec_b[b, 1]])
            b += 1
        else:
            # Within the window: accumulate b intensity onto the current a peak.
            peak_b_int += spec_b[b, 1]
            b += 1

    # Flush a pending match left over when spec_b ran out mid-accumulation.
    if peak_b_int > 0.:
        spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
        peak_b_int = 0.
        a += 1

    # Append the unmatched tails of either spectrum.
    if b < spec_b.shape[0]:
        spec_merged += [[x[0], 0., x[1]] for x in spec_b[b:]]

    if a < spec_a.shape[0]:
        spec_merged += [[x[0], x[1], 0.] for x in spec_a[a:]]

    if spec_merged:
        spec_merged = np.array(spec_merged, dtype=np.float64)
    else:
        spec_merged = np.array([[0., 0., 0.]], dtype=np.float64)
    return spec_merged
860
+
861
+
862
+
863
def convert_spec(spec, mzs):
    """Project a (n, 2) [m/z, intensity] spectrum onto the common grid `mzs`.

    Returns a (len(mzs), 2) array; grid m/z values absent from the spectrum
    get intensity 0, and duplicated spectrum m/z values keep their *first*
    intensity (matching the original np.where(...)[0][0] lookup). A dict
    lookup makes this O(n + m) instead of the previous O(n * m) scan.
    """
    first_intensity = {}
    for mz, intensity in spec:
        if mz not in first_intensity:
            first_intensity[mz] = intensity
    ints_tmp = [first_intensity.get(mz, 0) for mz in mzs]
    return np.transpose(np.array([mzs, ints_tmp]))
873
+
874
+
875
def get_reference_df(reference_data, likely_reference_IDs=None):
    """Load a reference library file into a DataFrame.

    Raw formats (mgf/mzML/cdf/msp/json, upper or lower case) are converted
    to a tab-separated TXT next to the input and then read; TXT files are
    read directly. likely_reference_IDs: optional path to a headerless CSV
    of IDs used to subset the library by its first column. Unsupported
    extensions leave df_reference unbound and raise, as before.
    """
    extension = reference_data.rsplit('.', 1)[-1]
    if extension in ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF', 'msp', 'MSP', 'json', 'JSON'):
        # BUG FIX: reference_data[:-3] + 'txt' mangled 4-character
        # extensions ('x.mzML' -> 'x.mtxt'); strip the extension properly.
        output_path_tmp = reference_data.rsplit('.', 1)[0] + '.txt'
        build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
        df_reference = pd.read_csv(output_path_tmp, sep='\t')
    if extension in ('txt', 'TXT'):
        df_reference = pd.read_csv(reference_data, sep='\t')
    if likely_reference_IDs is not None:
        likely_reference_IDs = pd.read_csv(likely_reference_IDs, header=None)
        df_reference = df_reference.loc[df_reference.iloc[:, 0].isin(likely_reference_IDs.iloc[:, 0].tolist())]
    return df_reference
888
+
889
+
890
+
891
def S_cos(ints_a, ints_b):
    """Cosine similarity of two intensity vectors; 0 when either sums to zero."""
    if np.sum(ints_a) == 0 or np.sum(ints_b) == 0:
        return 0
    norm_a = np.sqrt(np.sum(np.power(ints_a, 2)))
    norm_b = np.sqrt(np.sum(np.power(ints_b, 2)))
    return np.dot(ints_a, ints_b) / (norm_a * norm_b)
896
+
897
+
898
def ent_renyi(ints, q):
    """Renyi entropy of order q (q != 1) of a probability/intensity vector."""
    return np.log(np.sum(np.power(ints, q))) / (1 - q)
900
+
901
+
902
def ent_tsallis(ints, q):
    """Tsallis entropy of order q (q != 1) of a probability/intensity vector."""
    return (np.sum(np.power(ints, q)) - 1) / (1 - q)
904
+
905
+
906
def S_shannon(ints_a, ints_b):
    """Shannon-entropy similarity: 1 for identical distributions, 0 for disjoint."""
    merged_entropy = scipy.stats.entropy(ints_a + ints_b)
    excess = 2 * merged_entropy - scipy.stats.entropy(ints_a) - scipy.stats.entropy(ints_b)
    return 1 - excess / np.log(4)
911
+
912
+
913
def S_renyi(ints_a, ints_b, q):
    """Renyi-entropy similarity of two normalized intensity vectors (order q)."""
    if q == 1:
        # Renyi entropy degenerates to Shannon entropy at q = 1.
        print('Warning: the Renyi Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
        return S_shannon(ints_a, ints_b)
    half_a = ints_a / 2
    half_b = ints_b / 2
    ent_merged = ent_renyi(half_a + half_b, q)
    # Normalization constant so the score lies on the usual similarity scale.
    norm_const = (1 / (1 - q)) * (2 * np.log(np.sum(np.power(half_a, q)) + np.sum(np.power(half_b, q))) - np.log(np.sum(np.power(ints_a, q))) - np.log(np.sum(np.power(ints_b, q))))
    return 1 - (2 * ent_merged - ent_renyi(ints_a, q) - ent_renyi(ints_b, q)) / norm_const
923
+
924
+
925
def S_tsallis(ints_a, ints_b, q):
    """Tsallis-entropy similarity of two normalized intensity vectors (order q)."""
    if q == 1:
        # Tsallis entropy degenerates to Shannon entropy at q = 1.
        print('Warning: the Tsallis Entropy Similarity Measure is equivalent to the Shannon Entropy Similarity Measure when the entropy dimension is 1')
        return S_shannon(ints_a, ints_b)
    half_a = ints_a / 2
    half_b = ints_b / 2
    ent_merged = ent_tsallis(half_a + half_b, q)
    # Normalization constant so the score lies on the usual similarity scale.
    norm_const = np.sum(2 * np.power(half_a, q) + 2 * np.power(half_b, q) - np.power(ints_a, q) - np.power(ints_b, q)) / (1 - q)
    return 1 - (2 * ent_merged - ent_tsallis(ints_a, q) - ent_tsallis(ints_b, q)) / norm_const
935
+
936
def S_mixture(ints_a, ints_b, weights={'Cosine':0.25, 'Shannon':0.25, 'Renyi':0.25, 'Tsallis':0.25}, q=1.1):
    """Weighted mixture of cosine, Shannon, Renyi, and Tsallis similarities.

    `weights` maps measure names to their weights; keys must come from the
    four supported measures or the process exits with an error message.
    The default weights are never mutated.
    """
    allowed = {'Cosine', 'Shannon', 'Renyi', 'Tsallis'}
    if set(weights.keys()).issubset(allowed) is False:
        print('Error: the keys to the weight parameter dict of the function S_mixture must be one of the four: Cosine, Shannon, Renyi, Tsallis')
        sys.exit()

    component = {
        'Cosine': lambda: S_cos(ints_a, ints_b),
        'Shannon': lambda: S_shannon(ints_a, ints_b),
        'Renyi': lambda: S_renyi(ints_a, ints_b, q),
        'Tsallis': lambda: S_tsallis(ints_a, ints_b, q),
    }
    return sum(value * component[key]() for key, value in weights.items())
952
+
953
+
954
def get_contingency_entries(ints_a, ints_b):
    """Return [a, b, c]: peaks only in ints_a, only in ints_b, and shared.

    Positions where both intensities are zero are ignored.
    """
    a = b = c = 0
    for x, y in zip(ints_a, ints_b):
        if x != 0:
            if y != 0:
                c += 1
            else:
                a += 1
        elif y != 0:
            b += 1
    return [a, b, c]
967
+
968
+
969
def S_jaccard(ints_a, ints_b):
    """Jaccard similarity on peak presence/absence: c / (a + b + c); 0 if empty."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    denom = a + b + c
    return 0 if denom == 0 else c / denom
980
+
981
+
982
def S_dice(ints_a, ints_b):
    """Dice similarity on peak presence/absence: 2c / (a + b + 2c); 0 if empty."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    denom = a + b + 2 * c
    return 0 if denom == 0 else 2 * c / denom
993
+
994
+
995
def S_3w_jaccard(ints_a, ints_b):
    """3W-Jaccard similarity: 3c / (a + b + 3c); 0 if empty."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    denom = a + b + 3 * c
    return 0 if denom == 0 else 3 * c / denom
1006
+
1007
+
1008
def S_sokal_sneath(ints_a, ints_b):
    """Sokal-Sneath similarity: c / (2a + 2b + c); 0 if empty."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    denom = 2 * a + 2 * b + c
    return 0 if denom == 0 else c / denom
1019
+
1020
+
1021
def S_binary_cosine(ints_a, ints_b):
    """Binary (Ochiai) cosine similarity: c / sqrt((a+c)(b+c)); 0 if empty."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    denom = np.sqrt((a + c) * (b + c))
    return 0 if denom == 0 else c / denom
1032
+
1033
+
1034
def S_mountford(ints_a, ints_b):
    """Mountford similarity: 2c / (c(a+b) + 2ab); 1 when the denominator is 0."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    denom = c * (a + b) + 2 * a * b
    return 1 if denom == 0 else 2 * c / denom
1045
+
1046
+
1047
def S_mcconnaughey(ints_a, ints_b):
    """McConnaughey similarity: (c^2 - ab) / ((a+c)(b+c)); 0 if empty."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    denom = (a + c) * (b + c)
    return 0 if denom == 0 else (c**2 - a * b) / denom
1058
+
1059
+
1060
def S_driver_kroeber(ints_a, ints_b):
    """Driver-Kroeber similarity: c(a + b + 2c) / (2(a+c)(b+c)); 0 if empty."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    denom = 2 * (a + c) * (b + c)
    return 0 if denom == 0 else c * (a + b + 2 * c) / denom
1071
+
1072
+
1073
def S_simpson(ints_a, ints_b):
    """Simpson (overlap) similarity: c / min(a+c, b+c); 0 if empty."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    denom = min(a + c, b + c)
    return 0 if denom == 0 else c / denom
1084
+
1085
+
1086
def S_braun_banquet(ints_a, ints_b):
    """Braun-Blanquet similarity: c / max(a+c, b+c); 0 if empty."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    denom = max(a + c, b + c)
    return 0 if denom == 0 else c / denom
1097
+
1098
+
1099
def S_fager_mcgowan(ints_a, ints_b):
    """Fager-McGowan similarity: c/sqrt((a+c)(b+c)) - 1/(2 sqrt(max(a+c, b+c)))."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    denom1 = np.sqrt((a + c) * (b + c))
    denom2 = 2 * np.sqrt(max(a + c, b + c))
    return 0 if denom1 == 0 or denom2 == 0 else c / denom1 - 1 / denom2
1111
+
1112
+
1113
def S_kulczynski(ints_a, ints_b):
    """Kulczynski similarity: c / (a + b); 1 when there are no mismatches."""
    a, b, c = get_contingency_entries(ints_a, ints_b)
    mismatches = a + b
    return 1 if mismatches == 0 else c / mismatches
1124
+
1125
+
1126
def S_intersection(ints_a, ints_b):
    """Count of peaks present in both spectra (unnormalized)."""
    return get_contingency_entries(ints_a, ints_b)[2]
1130
+
1131
+
1132
def S_hamming(ints_a, ints_b):
    """Hamming-based similarity: 1 / (a + b); 1 when there are no mismatches."""
    a, b, _c = get_contingency_entries(ints_a, ints_b)
    mismatches = a + b
    return 1 if mismatches == 0 else 1 / mismatches
1142
+
1143
+
1144
def S_hellinger(ints_a, ints_b):
    """Hellinger-style similarity: 1 - sqrt(1 - c / sqrt((a+c)(b+c))).

    NOTE(review): no zero guard — (a+c)(b+c) == 0 produces a divide
    warning / NaN, exactly as the original did.
    """
    a, b, c = get_contingency_entries(ints_a, ints_b)
    return 1 - np.sqrt(1 - c / np.sqrt((a + c) * (b + c)))
1151
+
1152
+
1153
def get_similarity(similarity_measure, q_ints, r_ints, weights, q):
    """Dispatch to the similarity measure named by `similarity_measure`.

    Entropy-based measures (shannon/renyi/tsallis) first standard-normalize
    both intensity vectors; `weights` is only used by 'mixture' and `q` is
    the entropy dimension for renyi/tsallis/mixture. An unrecognized name
    raises (KeyError here; the original raised UnboundLocalError).
    """
    if similarity_measure in ('shannon', 'renyi', 'tsallis'):
        q_ints = normalize(q_ints, method = 'standard')
        r_ints = normalize(r_ints, method = 'standard')

    if similarity_measure == 'cosine':
        return S_cos(q_ints, r_ints)
    if similarity_measure == 'shannon':
        return S_shannon(q_ints, r_ints)
    if similarity_measure == 'renyi':
        return S_renyi(q_ints, r_ints, q)
    if similarity_measure == 'tsallis':
        return S_tsallis(q_ints, r_ints, q)
    if similarity_measure == 'mixture':
        return S_mixture(q_ints, r_ints, weights, q)

    # Binary (presence/absence) measures share the two-argument signature.
    binary_measures = {
        'jaccard': S_jaccard,
        'dice': S_dice,
        '3w_jaccard': S_3w_jaccard,
        'sokal_sneath': S_sokal_sneath,
        'binary_cosine': S_binary_cosine,
        'mountford': S_mountford,
        'mcconnaughey': S_mcconnaughey,
        'driver_kroeber': S_driver_kroeber,
        'simpson': S_simpson,
        'braun_banquet': S_braun_banquet,
        'fager_mcgowan': S_fager_mcgowan,
        'kulczynski': S_kulczynski,
        'intersection': S_intersection,
        'hamming': S_hamming,
        'hellinger': S_hellinger,
    }
    return binary_measures[similarity_measure](q_ints, r_ints)
1217
+
1218
+
1219
+ def _vector_to_full_params(X, default_params, optimize_params):
1220
+ params = default_params.copy()
1221
+ for name, val in zip(optimize_params, X):
1222
+ params[name] = float(val)
1223
+ return params
1224
+
1225
+
1226
def objective_function_HRMS(X, ctx):
    """DE objective for HRMS data: returns 1 - accuracy for parameter vector X.

    X holds candidate values for ctx['optimize_params']; everything else is
    taken from ctx (data frames, measure settings, fixed defaults).
    """
    params = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
    acc = get_acc_HRMS(
        ctx["df_query"],
        ctx["df_reference"],
        ctx["precursor_ion_mz_tolerance"],
        ctx["ionization_mode"],
        ctx["adduct"],
        ctx["similarity_measure"],
        ctx["weights"],
        ctx["spectrum_preprocessing_order"],
        ctx["mz_min"],
        ctx["mz_max"],
        ctx["int_min"],
        ctx["int_max"],
        params["window_size_centroiding"],
        params["window_size_matching"],
        params["noise_threshold"],
        params["wf_mz"],
        params["wf_int"],
        params["LET_threshold"],
        params["entropy_dimension"],
        ctx["high_quality_reference_library"],
        verbose=False,
    )
    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
    return 1.0 - acc
1241
+
1242
def objective_function_NRMS(X, ctx):
    """DE objective for NRMS data: returns 1 - accuracy for parameter vector X.

    X holds candidate values for ctx['optimize_params']; everything else is
    taken from ctx (data frames, spectrum IDs, measure settings, defaults).
    """
    params = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
    acc = get_acc_NRMS(
        ctx["df_query"],
        ctx["df_reference"],
        ctx["unique_query_ids"],
        ctx["unique_reference_ids"],
        ctx["similarity_measure"],
        ctx["weights"],
        ctx["spectrum_preprocessing_order"],
        ctx["mz_min"],
        ctx["mz_max"],
        ctx["int_min"],
        ctx["int_max"],
        params["noise_threshold"],
        params["wf_mz"],
        params["wf_int"],
        params["LET_threshold"],
        params["entropy_dimension"],
        ctx["high_quality_reference_library"],
        verbose=False,
    )
    print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
    return 1.0 - acc
1255
+
1256
+
1257
+
1258
def tune_params_DE(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, chromatography_platform='HRMS', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1):
    """Tune spectral-matching hyperparameters with differential evolution.

    Loads the query and reference libraries (converting raw formats to TXT
    first), optionally restricts the reference by ionization mode / adduct,
    then minimizes (1 - identification accuracy) over `optimize_params`
    within `param_bounds`, holding the remaining knobs at `default_params`.
    Results are printed and logged; nothing is returned. The mutable
    defaults are never mutated here, so the shared-default pitfall does not
    bite; the signature is kept for backward compatibility.
    """

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf', 'MGF', 'mzML', 'mzml', 'MZML', 'cdf', 'CDF', 'msp', 'MSP', 'json', 'JSON'):
            # BUG FIX: query_data[:-3] + 'txt' mangled 4-character
            # extensions ('x.mzML' -> 'x.mtxt'); strip the extension properly.
            output_path_tmp = query_data.rsplit('.', 1)[0] + '.txt'
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        if extension in ('txt', 'TXT'):
            df_query = pd.read_csv(query_data, sep='\t')

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
        else:
            # Several reference libraries: load each and concatenate.
            dfs = [get_reference_df(reference_data=f) for f in reference_data]
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    # Optionally restrict the reference library to one ionization mode / adduct.
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode'] == ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct'] == adduct]

    # Assumes both libraries expose an 'id' column — TODO confirm for TXT inputs.
    unique_query_ids = df_query['id'].unique().tolist()
    unique_reference_ids = df_reference['id'].unique().tolist()

    ctx = dict(
        df_query=df_query,
        df_reference=df_reference,
        # BUG FIX: objective_function_NRMS reads these two keys; they were
        # missing from ctx, so the NRMS branch raised KeyError.
        unique_query_ids=unique_query_ids,
        unique_reference_ids=unique_reference_ids,
        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
        ionization_mode=ionization_mode,
        adduct=adduct,
        similarity_measure=similarity_measure,
        weights=weights,
        spectrum_preprocessing_order=spectrum_preprocessing_order,
        mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max,
        high_quality_reference_library=high_quality_reference_library,
        default_params=default_params,
        optimize_params=optimize_params,
    )

    bounds = [param_bounds[p] for p in optimize_params]

    objective = objective_function_HRMS if chromatography_platform == 'HRMS' else objective_function_NRMS
    result = differential_evolution(objective, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)

    best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
    best_acc = 100.0 - (result.fun * 100.0)

    print("\n=== Differential Evolution Result ===")
    print(f"Optimized over: {optimize_params}")
    print("Best values (selected params):")
    for name in optimize_params:
        print(f"  {name}: {best_full_params[name]}")
    print("\nFull parameter set used in final evaluation:")
    for k, v in best_full_params.items():
        print(f"  {k}: {v}")
    print(f"\nBest accuracy: {best_acc:.3f}%")
    _log(f"best = {result.x}, acc={100*(1-result.fun):.3f}%")
1333
+
1334
+
1335
# Baseline one-point grids for the grid-search tuners: every key maps to a
# list of candidate values, and callers merge their own lists over these to
# widen the search. The NRMS grid omits the centroiding/matching window
# sizes, which are used only on the HRMS path.
default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
1337
+
1338
+
1339
def _eval_one_HRMS(df_query, df_reference,
                   precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
                   similarity_measure_tmp, weight,
                   spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                   int_min_tmp, int_max_tmp, noise_threshold_tmp,
                   window_size_centroiding_tmp, window_size_matching_tmp,
                   wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
                   entropy_dimension_tmp, high_quality_reference_library_tmp):
    """Evaluate one HRMS grid point.

    Returns (accuracy, *settings), with the weight dict JSON-encoded so the
    tuple is hashable/serializable for results tables.
    """
    settings = (
        similarity_measure_tmp, json.dumps(weight), spectrum_preprocessing_order_tmp,
        mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp,
        noise_threshold_tmp, window_size_centroiding_tmp, window_size_matching_tmp,
        wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp,
        high_quality_reference_library_tmp,
    )

    acc = get_acc_HRMS(
        df_query=df_query, df_reference=df_reference,
        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
        ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
        similarity_measure=similarity_measure_tmp, weights=weight,
        spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
        mz_min=mz_min_tmp, mz_max=mz_max_tmp,
        int_min=int_min_tmp, int_max=int_max_tmp,
        window_size_centroiding=window_size_centroiding_tmp,
        window_size_matching=window_size_matching_tmp,
        noise_threshold=noise_threshold_tmp,
        wf_mz=wf_mz_tmp, wf_int=wf_int_tmp,
        LET_threshold=LET_threshold_tmp,
        entropy_dimension=entropy_dimension_tmp,
        high_quality_reference_library=high_quality_reference_library_tmp,
        verbose=False
    )

    return (acc,) + settings
1373
+
1374
+
1375
def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
                   similarity_measure_tmp, weight,
                   spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                   int_min_tmp, int_max_tmp, noise_threshold_tmp,
                   wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
                   entropy_dimension_tmp, high_quality_reference_library_tmp):
    """Score one NRMS hyperparameter combination.

    Runs get_acc_NRMS with the supplied settings and returns a flat result
    row: the accuracy followed by every hyperparameter value.  The weight
    dict is JSON-encoded so the row stays tabular.
    """
    accuracy = get_acc_NRMS(
        df_query=df_query,
        df_reference=df_reference,
        unique_query_ids=unique_query_ids,
        unique_reference_ids=unique_reference_ids,
        similarity_measure=similarity_measure_tmp,
        weights=weight,
        spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
        mz_min=mz_min_tmp,
        mz_max=mz_max_tmp,
        int_min=int_min_tmp,
        int_max=int_max_tmp,
        noise_threshold=noise_threshold_tmp,
        wf_mz=wf_mz_tmp,
        wf_int=wf_int_tmp,
        LET_threshold=LET_threshold_tmp,
        entropy_dimension=entropy_dimension_tmp,
        high_quality_reference_library=high_quality_reference_library_tmp,
    )
    result_row = (
        accuracy,
        similarity_measure_tmp,
        json.dumps(weight),
        spectrum_preprocessing_order_tmp,
        mz_min_tmp,
        mz_max_tmp,
        int_min_tmp,
        int_max_tmp,
        noise_threshold_tmp,
        wf_mz_tmp,
        wf_int_tmp,
        LET_threshold_tmp,
        entropy_dimension_tmp,
        high_quality_reference_library_tmp,
    )
    return result_row
def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
    """Grid-search spectral-matching hyperparameters on HRMS data (Shiny variant).

    Evaluates every combination of the hyperparameter grid sequentially,
    printing progress after each combination so the Shiny log console can
    track it, and either writes the accuracy table to output_path or
    returns it as a DataFrame.

    Parameters
    ----------
    query_data : str
        Path to the query spectra file; raw formats (.mgf/.mzML/.cdf) are
        first converted to a tab-separated .txt via
        build_library_from_raw_data, .txt files are read directly.
    reference_data : str or iterable of str
        Path(s) to the reference library file(s).
    precursor_ion_mz_tolerance : float or None
        Precursor m/z window used to pre-filter candidate reference spectra.
    ionization_mode, adduct : str or None
        Optional reference-library filters; None or 'N/A' disables each one.
    grid : dict or None
        Per-parameter candidate lists overriding default_HRMS_grid entries.
    output_path : str or None
        Result destination; defaults to ./tuning_param_output.txt.
    return_output : bool
        If True, return the DataFrame instead of writing it to disk.

    Returns
    -------
    pandas.DataFrame or None
        The accuracy table when return_output is True, otherwise None.
    """
    # Merge user overrides onto the defaults and unpack into locals.
    # (Replaces the old globals()-injection pattern, which polluted the
    # module namespace and made the bare names below call-order dependent.)
    cfg = {**default_HRMS_grid, **(grid or {})}
    similarity_measure = cfg['similarity_measure']
    weight = cfg['weight']
    spectrum_preprocessing_order = cfg['spectrum_preprocessing_order']
    mz_min = cfg['mz_min']
    mz_max = cfg['mz_max']
    int_min = cfg['int_min']
    int_max = cfg['int_max']
    noise_threshold = cfg['noise_threshold']
    window_size_centroiding = cfg['window_size_centroiding']
    window_size_matching = cfg['window_size_matching']
    wf_mz = cfg['wf_mz']
    wf_int = cfg['wf_int']
    LET_threshold = cfg['LET_threshold']
    entropy_dimension = cfg['entropy_dimension']
    high_quality_reference_library = cfg['high_quality_reference_library']

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
            # Bug fix: the old code did query_data[:-3] + 'txt', which mangles
            # 4-character extensions (e.g. 'x.mzML' -> 'x.mtxt').
            output_path_tmp = str(Path(query_data).with_suffix('.txt'))
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
    unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # Multiple libraries: load each and concatenate into one frame.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    # Optional metadata filters; 'N/A' means "do not filter".
    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode is not None and ionization_mode != 'N/A':
        df_reference = df_reference.loc[df_reference['ionization_mode'] == ionization_mode]
    if 'adduct' in df_reference.columns.tolist() and adduct is not None and adduct != 'N/A':
        df_reference = df_reference.loc[df_reference['adduct'] == adduct]

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # One axis per hyperparameter; the search space is their full product.
    grid_axes = [similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
                 noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
                 entropy_dimension, high_quality_reference_library]
    total = 1
    for axis in grid_axes:
        total *= len(axis)

    results = []
    for done, params in enumerate(product(*grid_axes), start=1):
        results.append(_eval_one_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params))
        # Flush each progress line immediately so the Shiny log stays live.
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Strip JSON punctuation and key names so WEIGHT reads as e.g.
    # "0.25,0.25,0.25,0.25".
    if 'WEIGHT' in df_out.columns:
        df_out['WEIGHT'] = df_out['WEIGHT'].astype(str)
        for token in ('"', '{', '}', ':', 'Cosine', 'Shannon', 'Renyi', 'Tsallis', ' '):
            df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(token, '', regex=False)

    if return_output:
        return df_out
    else:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
        print(f'Wrote results to {output_path}')
def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """Grid-search spectral-matching hyperparameters on NRMS data in parallel.

    Every combination of the hyperparameter grid is scored with
    _eval_one_NRMS (top-1 identification accuracy) using joblib across all
    cores, and the resulting table is written to output_path or returned.

    Parameters
    ----------
    query_data : str
        Path to the query spectra; raw formats (.mgf/.mzML/.cdf/.msp/.json)
        are first converted to a tab-separated .txt via
        build_library_from_raw_data, .txt files are read directly.
    reference_data : str or iterable of str
        Path(s) to the reference library file(s).
    grid : dict or None
        Per-parameter candidate lists overriding default_NRMS_grid entries.
    output_path : str or None
        Result destination; defaults to ./tuning_param_output.txt.
    return_output : bool
        If True, return the DataFrame instead of writing it to disk.

    Returns
    -------
    pandas.DataFrame or None
        The accuracy table when return_output is True, otherwise None.
    """
    # Merge overrides onto the defaults and unpack into locals rather than
    # injecting into globals() (the old pattern leaked module-level state).
    cfg = {**default_NRMS_grid, **(grid or {})}
    similarity_measure = cfg['similarity_measure']
    weight = cfg['weight']
    spectrum_preprocessing_order = cfg['spectrum_preprocessing_order']
    mz_min = cfg['mz_min']
    mz_max = cfg['mz_max']
    int_min = cfg['int_min']
    int_max = cfg['int_max']
    noise_threshold = cfg['noise_threshold']
    wf_mz = cfg['wf_mz']
    wf_int = cfg['wf_int']
    LET_threshold = cfg['LET_threshold']
    entropy_dimension = cfg['entropy_dimension']
    high_quality_reference_library = cfg['high_quality_reference_library']

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF','msp','MSP','json','JSON'):
            # Bug fix: query_data[:-3] + 'txt' mangled 4-character extensions
            # (e.g. 'x.mzML' -> 'x.mtxt'); with_suffix handles any length.
            output_path_tmp = str(Path(query_data).with_suffix('.txt'))
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            # Bug fix: unsupported extensions previously fell through and
            # crashed later with a NameError on df_query.
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
    unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # Multiple libraries: load each and concatenate into one frame.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
                         noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
    # Fan the grid out over all cores; each worker returns one result row.
    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
        'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
    ])
    # Strip JSON punctuation and key names so WEIGHT reads as e.g.
    # "0.25,0.25,0.25,0.25".
    for token in ('"', '{', '}', ':', 'Cosine', 'Shannon', 'Renyi', 'Tsallis', ' '):
        df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(token, '', regex=False)
    if return_output is False:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
    else:
        return df_out
def tune_params_on_NRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
    """Grid-search spectral-matching hyperparameters on NRMS data (Shiny variant).

    Evaluates every combination of the hyperparameter grid sequentially,
    printing progress after each combination so the Shiny log console can
    track it, and either writes the accuracy table to output_path or
    returns it as a DataFrame.

    Parameters
    ----------
    query_data : str
        Path to the query spectra file; raw formats (.mgf/.mzML/.cdf) are
        first converted to a tab-separated .txt via
        build_library_from_raw_data, .txt files are read directly.
    reference_data : str or iterable of str
        Path(s) to the reference library file(s).
    grid : dict or None
        Per-parameter candidate lists overriding default_NRMS_grid entries.
    output_path : str or None
        Result destination; defaults to ./tuning_param_output.txt.
    return_output : bool
        If True, return the DataFrame instead of writing it to disk.

    Returns
    -------
    pandas.DataFrame or None
        The accuracy table when return_output is True, otherwise None.
    """
    # Merge user overrides onto the defaults and unpack into locals.
    # (Replaces the old globals()-injection pattern, which polluted the
    # module namespace and made the bare names below call-order dependent.)
    cfg = {**default_NRMS_grid, **(grid or {})}
    similarity_measure = cfg['similarity_measure']
    weight = cfg['weight']
    spectrum_preprocessing_order = cfg['spectrum_preprocessing_order']
    mz_min = cfg['mz_min']
    mz_max = cfg['mz_max']
    int_min = cfg['int_min']
    int_max = cfg['int_max']
    noise_threshold = cfg['noise_threshold']
    wf_mz = cfg['wf_mz']
    wf_int = cfg['wf_int']
    LET_threshold = cfg['LET_threshold']
    entropy_dimension = cfg['entropy_dimension']
    high_quality_reference_library = cfg['high_quality_reference_library']

    if query_data is None:
        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
        sys.exit()
    else:
        extension = query_data.rsplit('.', 1)[-1]
        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
            # Bug fix: the old code did query_data[:-3] + 'txt', which mangles
            # 4-character extensions (e.g. 'x.mzML' -> 'x.mtxt').
            output_path_tmp = str(Path(query_data).with_suffix('.txt'))
            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
            df_query = pd.read_csv(output_path_tmp, sep='\t')
        elif extension in ('txt','TXT'):
            df_query = pd.read_csv(query_data, sep='\t')
        else:
            print(f'\nError: Unsupported query_data extension: {extension}')
            sys.exit()
    unique_query_ids = df_query.iloc[:, 0].unique()

    if reference_data is None:
        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
        sys.exit()
    else:
        if isinstance(reference_data, str):
            df_reference = get_reference_df(reference_data=reference_data)
            unique_reference_ids = df_reference.iloc[:, 0].unique()
        else:
            # Multiple libraries: load each and concatenate into one frame.
            dfs = []
            unique_reference_ids = []
            for f in reference_data:
                tmp = get_reference_df(reference_data=f)
                dfs.append(tmp)
                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
            df_reference = pd.concat(dfs, axis=0, ignore_index=True)

    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
          f'{len(unique_reference_ids)} unique reference spectra, and '
          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')

    if output_path is None:
        output_path = f'{Path.cwd()}/tuning_param_output.txt'
        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')

    # One axis per hyperparameter; the search space is their full product.
    grid_axes = [similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
                 noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library]
    total = 1
    for axis in grid_axes:
        total *= len(axis)

    results = []
    for done, params in enumerate(product(*grid_axes), start=1):
        results.append(_eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params))
        # Flush each progress line immediately so the Shiny log stays live.
        print(f'Completed {done}/{total} grid combinations.\n', flush=True)

    df_out = pd.DataFrame(results, columns=[
        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
    ])

    # Strip JSON punctuation and key names so WEIGHT reads as e.g.
    # "0.25,0.25,0.25,0.25".
    if 'WEIGHT' in df_out.columns:
        df_out['WEIGHT'] = df_out['WEIGHT'].astype(str)
        for token in ('"', '{', '}', ':', 'Cosine', 'Shannon', 'Renyi', 'Tsallis', ' '):
            df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(token, '', regex=False)

    if return_output:
        return df_out
    else:
        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
        print(f'Wrote results to {output_path}')
def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
    """Compute top-1 identification accuracy of HRMS spectral library matching.

    For every unique query spectrum, each candidate reference spectrum is run
    through the preprocessing pipeline encoded by spectrum_preprocessing_order
    (C=centroiding, M=peak matching, W=weight-factor transform, L=low-entropy
    transform, N=noise removal, F=m/z-and-intensity filtering), a similarity
    score is computed, and the best-scoring reference ID becomes the
    prediction.  Accuracy is the fraction of queries whose top-scoring
    reference ID equals their own ID (query IDs are treated as the
    ground-truth compound IDs -- TODO confirm against the callers).

    Returns
    -------
    float
        Mean of (TRUE.ID == PREDICTED.ID) over all query spectra.
    """
    # Only the single best match feeds the accuracy computation.
    n_top_matches_to_save = 1
    # IDs are normalised to str so score rows can be aligned by dict lookup below.
    unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
    unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
    all_similarity_rows = []

    for query_idx, qid in enumerate(unique_query_ids):
        if verbose:
            print(f'query spectrum #{query_idx} is being identified')

        # NOTE(review): qid is a str while df_query['id'] keeps its original
        # dtype; if that column is numeric this mask matches nothing -- confirm.
        q_mask = (df_query['id'] == qid)
        q_idxs = np.where(q_mask)[0]
        if q_idxs.size == 0:
            # No peaks found for this query: record an all-zero score row.
            all_similarity_rows.append([0.0]*len(unique_reference_ids))
            continue

        # Two-column (m/z, intensity) array for this query spectrum.
        q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))

        # Pre-filter the reference library to spectra whose precursor m/z lies
        # within the tolerance window around this query's precursor, when both
        # sides carry that column and a tolerance was supplied.
        if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
            precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
        else:
            df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()

        if df_reference_tmp.empty:
            # No candidates survive the precursor filter: all-zero score row.
            all_similarity_rows.append([0.0]*len(unique_reference_ids))
            continue

        # Candidate reference peaks grouped by spectrum ID (insertion order kept).
        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))

        similarity_by_ref = {}

        for ref_id, r_df in ref_groups.items():
            # Fresh copy per reference: the transforms below mutate the arrays.
            q_spec = q_spec_base.copy()
            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))

            is_matched = False
            for transformation in spectrum_preprocessing_order:
                # Zero out intensities if an earlier transform produced infinities.
                if np.isinf(q_spec[:, 1]).any():
                    q_spec[:, 1] = 0.0
                if np.isinf(r_spec[:, 1]).any():
                    r_spec[:, 1] = 0.0

                # Every step is skipped once either spectrum has <= 1 peak left.
                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)

                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    m_spec = match_peaks_in_spectra(
                        spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
                    )
                    if m_spec.size == 0:
                        # No common peaks: empty both spectra so the score is 0.
                        q_spec = np.empty((0,2))
                        r_spec = np.empty((0,2))
                    else:
                        # Matched output: col 0 = m/z, col 1 = query int, col 2 = ref int.
                        q_spec = m_spec[:, 0:2]
                        r_spec = m_spec[:, [0, 2]]
                    is_matched = True

                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)

                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')

                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = remove_noise(q_spec, nr=noise_threshold)
                    # A curated ("high quality") library is assumed already clean.
                    if not high_quality_reference_library:
                        r_spec = remove_noise(r_spec, nr=noise_threshold)

                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                    q_spec = filter_spec_lcms(
                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                    )
                    if not high_quality_reference_library:
                        r_spec = filter_spec_lcms(
                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
                        )

            # Score only when both spectra still have >1 peak and nonzero mass.
            if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                q_ints = q_spec[:, 1]
                r_ints = r_spec[:, 1]
                if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                    sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
                else:
                    sim = 0.0
            else:
                sim = 0.0

            similarity_by_ref[str(ref_id)] = float(sim)

        # Align scores to the full reference-ID list; unscored refs get 0.0.
        row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
        all_similarity_rows.append(row)

    df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
    df_scores.index.name = 'QUERY.SPECTRUM.ID'

    # Top-1 prediction per query: column index of the per-row maximum score.
    top_idx = df_scores.values.argmax(axis=1)
    top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
    top_ids = [df_scores.columns[i] for i in top_idx]

    df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
    if verbose:
        print(df_tmp)

    # Accuracy = fraction of queries whose best match is their own ID.
    acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
    return acc
def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
    """Compute top-1 identification accuracy of NRMS spectral library matching.

    Each query spectrum is projected onto a shared integer m/z axis, run
    through the preprocessing steps listed in spectrum_preprocessing_order
    (W=weight-factor transform, L=low-entropy transform, N=noise removal,
    F=range filtering; other letters are ignored here), and scored against
    every reference spectrum.  The reference with the highest score is the
    prediction; accuracy is the fraction of queries whose predicted ID equals
    their own ID (query IDs are treated as ground truth).

    Returns
    -------
    float
        Mean of (TRUE.ID == PREDICTED.ID) over all query spectra.
    """

    # Only the single best match feeds the accuracy computation.
    n_top_matches_to_save = 1

    # Shared integer m/z axis covering both datasets (one bin per unit m/z).
    min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
    max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
    mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))

    all_similarity_scores = []
    for query_idx in range(0,len(unique_query_ids)):
        # Rows belonging to this query spectrum -> (m/z, intensity) array on
        # the common axis.
        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
        q_spec_tmp = convert_spec(q_spec_tmp,mzs)

        similarity_scores = []
        for ref_idx in range(0,len(unique_reference_ids)):
            # Bug fix: previously q_spec aliased q_spec_tmp, so the in-place
            # W/L assignments below mutated the shared array and the
            # transforms compounded across reference iterations.  The HRMS
            # counterpart copies the base spectrum; do the same here.
            q_spec = q_spec_tmp.copy()
            if verbose is True and ref_idx % 1000 == 0:
                print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
            r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
            r_spec = convert_spec(r_spec_tmp,mzs)

            for transformation in spectrum_preprocessing_order:
                # Zero out intensities if an earlier transform produced infinities.
                if np.isinf(q_spec[:,1]).sum() > 0:
                    q_spec[:,1] = np.zeros(q_spec.shape[0])
                if np.isinf(r_spec[:,1]).sum() > 0:
                    r_spec[:,1] = np.zeros(r_spec.shape[0])
                if transformation == 'W':
                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
                if transformation == 'L':
                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
                if transformation == 'N':
                    q_spec = remove_noise(q_spec, nr = noise_threshold)
                    # A curated ("high quality") library is assumed already clean.
                    if high_quality_reference_library == False:
                        r_spec = remove_noise(r_spec, nr = noise_threshold)
                if transformation == 'F':
                    q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                    if high_quality_reference_library == False:
                        r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)

            q_ints = q_spec[:,1]
            r_ints = r_spec[:,1]

            # Score only when both spectra retain nonzero total intensity.
            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
            else:
                similarity_score = 0

            similarity_scores.append(similarity_score)
        all_similarity_scores.append(similarity_scores)

    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
    df_scores.index = unique_query_ids
    df_scores.index.names = ['QUERY.SPECTRUM.ID']

    preds = []
    scores = []
    for i in range(0, df_scores.shape[0]):
        df_scores_tmp = df_scores
        preds_tmp = []
        scores_tmp = []
        for j in range(0, n_top_matches_to_save):
            # All columns tied at the row maximum are the j-th best match;
            # they are then dropped so the next pass finds the next best.
            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]

            # NOTE(review): ties are joined with ';', so a tied top match never
            # string-equals TRUE.ID and counts as wrong -- confirm intended.
            preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
            if len(top_ref_specs_tmp.values) == 0:
                scores_tmp.append(0)
            else:
                scores_tmp.append(top_ref_specs_tmp.values[0])
        preds.append(preds_tmp)
        scores.append(scores_tmp)

    preds = np.array(preds)
    scores = np.array(scores)
    out = np.c_[unique_query_ids,preds,scores]
    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
    # Accuracy = fraction of queries whose best match is their own ID.
    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
    return acc
+ def run_spec_lib_matching_on_HRMS_data_shiny(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
1858
+ if query_data is None:
1859
+ print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
1860
+ sys.exit()
1861
+ else:
1862
+ extension = query_data.rsplit('.',1)
1863
+ extension = extension[(len(extension)-1)]
1864
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'json' or extension == 'JSON':
1865
+ output_path_tmp = query_data[:-3] + 'txt'
1866
+ build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
1867
+ #build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=True)
1868
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
1869
+ if extension == 'txt' or extension == 'TXT':
1870
+ df_query = pd.read_csv(query_data, sep='\t')
1871
+ unique_query_ids = df_query['id'].unique()
1872
+
1873
+ if reference_data is None:
1874
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
1875
+ sys.exit()
1876
+ else:
1877
+ if isinstance(reference_data,str):
1878
+ df_reference = get_reference_df(reference_data,likely_reference_ids)
1879
+ else:
1880
+ dfs = []
1881
+ for f in reference_data:
1882
+ tmp = get_reference_df(f,likely_reference_ids)
1883
+ dfs.append(tmp)
1884
+ df_reference = pd.concat(dfs, axis=0, ignore_index=True)
1885
+
1886
+ if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A':
1887
+ df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
1888
+ if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A':
1889
+ df_reference = df_reference.loc[df_reference['adduct']==adduct]
1890
+
1891
+ if spectrum_preprocessing_order is not None:
1892
+ spectrum_preprocessing_order = list(spectrum_preprocessing_order)
1893
+ else:
1894
+ spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
1895
+ if 'M' not in spectrum_preprocessing_order:
1896
+ print(f'Error: \'M\' must be a character in spectrum_preprocessing_order.')
1897
+ sys.exit()
1898
+ if 'C' in spectrum_preprocessing_order:
1899
+ if spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
1900
+ print(f'Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
1901
+ sys.exit()
1902
+ if set(spectrum_preprocessing_order) - {'F','C','N','M','W','L'}:
1903
+ print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
1904
+ sys.exit()
1905
+
1906
+
1907
+ if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kuldzynski','intersection','hamming','hellinger']:
1908
+ print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
1909
+ sys.exit()
1910
+
1911
+ if isinstance(int_min,int) is True:
1912
+ int_min = float(int_min)
1913
+ if isinstance(int_max,int) is True:
1914
+ int_max = float(int_max)
1915
+ if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
1916
+ print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
1917
+ sys.exit()
1918
+ if mz_min < 0:
1919
+ print('\nError: mz_min should be a non-negative integer')
1920
+ sys.exit()
1921
+ if mz_max <= 0:
1922
+ print('\nError: mz_max should be a positive integer')
1923
+ sys.exit()
1924
+ if int_min < 0:
1925
+ print('\nError: int_min should be a non-negative float')
1926
+ sys.exit()
1927
+ if int_max <= 0:
1928
+ print('\nError: int_max should be a positive float')
1929
+ sys.exit()
1930
+
1931
+ if isinstance(window_size_centroiding,float) is False or window_size_centroiding <= 0.0:
1932
+ print('Error: window_size_centroiding must be a positive float.')
1933
+ sys.exit()
1934
+ if isinstance(window_size_matching,float) is False or window_size_matching<= 0.0:
1935
+ print('Error: window_size_matching must be a positive float.')
1936
+ sys.exit()
1937
+
1938
+ if isinstance(noise_threshold,int) is True:
1939
+ noise_threshold = float(noise_threshold)
1940
+ if isinstance(noise_threshold,float) is False or noise_threshold < 0:
1941
+ print('Error: noise_threshold must be a positive float.')
1942
+ sys.exit()
1943
+
1944
+ if isinstance(wf_intensity,int) is True:
1945
+ wf_intensity = float(wf_intensity)
1946
+ if isinstance(wf_mz,int) is True:
1947
+ wf_mz = float(wf_mz)
1948
+ if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
1949
+ print('Error: wf_mz and wf_intensity must be integers or floats')
1950
+ sys.exit()
1951
+
1952
+ if entropy_dimension <= 0:
1953
+ print('\nError: entropy_dimension should be a positive float')
1954
+ sys.exit()
1955
+ else:
1956
+ q = entropy_dimension
1957
+
1958
+ normalization_method = 'standard'
1959
+
1960
+ if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
1961
+ print('\nError: n_top_matches_to_save should be a positive integer')
1962
+ sys.exit()
1963
+
1964
+ if isinstance(print_id_results,bool)==False:
1965
+ print('\nError: print_id_results must be either True or False')
1966
+ sys.exit()
1967
+
1968
+ if output_identification is None:
1969
+ output_identification = f'{Path.cwd()}/output_identification.txt'
1970
+ print(f'Warning: writing identification output to {output_identification}')
1971
+
1972
+ if output_similarity_scores is None:
1973
+ output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
1974
+ print(f'Warning: writing similarity scores to {output_similarity_scores}')
1975
+
1976
+
1977
+ unique_reference_ids = df_reference['id'].unique().tolist()
1978
+ all_similarity_scores = []
1979
+
1980
+ for query_idx in range(len(unique_query_ids)):
1981
+ if verbose:
1982
+ print(f'query spectrum #{query_idx} is being identified')
1983
+
1984
+ q_mask = (df_query['id'] == unique_query_ids[query_idx])
1985
+ q_idxs_tmp = np.where(q_mask)[0]
1986
+ q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
1987
+
1988
+ if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
1989
+ precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
1990
+ df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
1991
+ else:
1992
+ df_reference_tmp = df_reference.copy()
1993
+
1994
+ ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
1995
+ unique_reference_ids_tmp = list(ref_groups.keys())
1996
+
1997
+ similarity_by_ref = {}
1998
+ for ref_id in unique_reference_ids_tmp:
1999
+ q_spec = q_spec_tmp.copy()
2000
+ r_df = ref_groups[ref_id]
2001
+ r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
2002
+
2003
+ is_matched = False
2004
+
2005
+ for transformation in spectrum_preprocessing_order:
2006
+ if np.isinf(q_spec[:, 1]).sum() > 0:
2007
+ q_spec[:, 1] = np.zeros(q_spec.shape[0])
2008
+ if np.isinf(r_spec[:, 1]).sum() > 0:
2009
+ r_spec[:, 1] = np.zeros(r_spec.shape[0])
2010
+
2011
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2012
+ q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
2013
+ r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
2014
+
2015
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2016
+ m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
2017
+ q_spec = m_spec[:, 0:2]
2018
+ r_spec = m_spec[:, [0, 2]]
2019
+ is_matched = True
2020
+
2021
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2022
+ q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
2023
+ r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)
2024
+
2025
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2026
+ q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
2027
+ r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)
2028
+
2029
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2030
+ q_spec = remove_noise(q_spec, nr=noise_threshold)
2031
+ if not high_quality_reference_library:
2032
+ r_spec = remove_noise(r_spec, nr=noise_threshold)
2033
+
2034
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2035
+ q_spec = filter_spec_lcms(
2036
+ q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
2037
+ )
2038
+ if not high_quality_reference_library:
2039
+ r_spec = filter_spec_lcms(
2040
+ r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
2041
+ )
2042
+
2043
+ q_ints = q_spec[:, 1]
2044
+ r_ints = r_spec[:, 1]
2045
+
2046
+ if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
2047
+ sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
2048
+ else:
2049
+ sim = 0.0
2050
+
2051
+ similarity_by_ref[ref_id] = sim
2052
+
2053
+ row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
2054
+ all_similarity_scores.append(row_scores)
2055
+
2056
+ df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
2057
+ df_scores.index = unique_query_ids
2058
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
2059
+
2060
+
2061
+ preds = []
2062
+ scores = []
2063
+ for i in range(0, df_scores.shape[0]):
2064
+ df_scores_tmp = df_scores
2065
+ preds_tmp = []
2066
+ scores_tmp = []
2067
+ for j in range(0, n_top_matches_to_save):
2068
+ top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
2069
+ cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
2070
+ df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
2071
+
2072
+ preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
2073
+ if len(top_ref_specs_tmp.values) == 0:
2074
+ scores_tmp.append(0)
2075
+ else:
2076
+ scores_tmp.append(top_ref_specs_tmp.values[0])
2077
+ preds.append(preds_tmp)
2078
+ scores.append(scores_tmp)
2079
+
2080
+ preds = np.array(preds)
2081
+ scores = np.array(scores)
2082
+ out = np.c_[preds,scores]
2083
+
2084
+ cnames_preds = []
2085
+ cnames_scores = []
2086
+ for i in range(0,n_top_matches_to_save):
2087
+ cnames_preds.append(f'RANK.{i+1}.PRED')
2088
+ cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
2089
+
2090
+ df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
2091
+ df_top_ref_specs.index = unique_query_ids
2092
+ df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
2093
+
2094
+ df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
2095
+
2096
+ if print_id_results == True:
2097
+ print(df_top_ref_specs.to_string())
2098
+
2099
+ if return_ID_output is False:
2100
+ df_top_ref_specs.to_csv(output_identification, sep='\t')
2101
+ df_scores.to_csv(output_similarity_scores, sep='\t')
2102
+ else:
2103
+ return df_top_ref_specs
2104
+
2105
+
2106
+
2107
+
2108
+ def run_spec_lib_matching_on_NRMS_data_shiny(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
2109
+ if query_data is None:
2110
+ print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
2111
+ sys.exit()
2112
+ else:
2113
+ extension = query_data.rsplit('.',1)
2114
+ extension = extension[(len(extension)-1)]
2115
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
2116
+ output_path_tmp = query_data[:-3] + 'txt'
2117
+ build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
2118
+ df_query = pd.read_csv(output_path_tmp, sep='\t')
2119
+ if extension == 'txt' or extension == 'TXT':
2120
+ df_query = pd.read_csv(query_data, sep='\t')
2121
+ unique_query_ids = df_query.iloc[:,0].unique()
2122
+
2123
+ if reference_data is None:
2124
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
2125
+ sys.exit()
2126
+ else:
2127
+ if isinstance(reference_data,str):
2128
+ df_reference = get_reference_df(reference_data,likely_reference_ids)
2129
+ unique_reference_ids = df_reference.iloc[:,0].unique()
2130
+ else:
2131
+ dfs = []
2132
+ unique_reference_ids = []
2133
+ for f in reference_data:
2134
+ tmp = get_reference_df(f,likely_reference_ids)
2135
+ dfs.append(tmp)
2136
+ unique_reference_ids.extend(tmp.iloc[:,0].unique())
2137
+ df_reference = pd.concat(dfs, axis=0, ignore_index=True)
2138
+
2139
+
2140
+ if spectrum_preprocessing_order is not None:
2141
+ spectrum_preprocessing_order = list(spectrum_preprocessing_order)
2142
+ else:
2143
+ spectrum_preprocessing_order = ['F','N','W','L']
2144
+ if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
2145
+ print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
2146
+ sys.exit()
2147
+
2148
+ if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kuldzynski','intersection','hamming','hellinger']:
2149
+ print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger')
2150
+ sys.exit()
2151
+
2152
+ if isinstance(int_min,int) is True:
2153
+ int_min = float(int_min)
2154
+ if isinstance(int_max,int) is True:
2155
+ int_max = float(int_max)
2156
+ if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
2157
+ print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
2158
+ sys.exit()
2159
+ if mz_min < 0:
2160
+ print('\nError: mz_min should be a non-negative integer')
2161
+ sys.exit()
2162
+ if mz_max <= 0:
2163
+ print('\nError: mz_max should be a positive integer')
2164
+ sys.exit()
2165
+ if int_min < 0:
2166
+ print('\nError: int_min should be a non-negative float')
2167
+ sys.exit()
2168
+ if int_max <= 0:
2169
+ print('\nError: int_max should be a positive float')
2170
+ sys.exit()
2171
+
2172
+ if isinstance(noise_threshold,int) is True:
2173
+ noise_threshold = float(noise_threshold)
2174
+ if isinstance(noise_threshold,float) is False or noise_threshold < 0:
2175
+ print('Error: noise_threshold must be a positive float.')
2176
+ sys.exit()
2177
+
2178
+ if isinstance(wf_intensity,int) is True:
2179
+ wf_intensity = float(wf_intensity)
2180
+ if isinstance(wf_mz,int) is True:
2181
+ wf_mz = float(wf_mz)
2182
+ if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
2183
+ print('Error: wf_mz and wf_intensity must be integers or floats')
2184
+ sys.exit()
2185
+
2186
+ if entropy_dimension <= 0:
2187
+ print('\nError: entropy_dimension should be a positive float')
2188
+ sys.exit()
2189
+ else:
2190
+ q = entropy_dimension
2191
+
2192
+ normalization_method = 'standard'
2193
+
2194
+ if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
2195
+ print('\nError: n_top_matches_to_save should be a positive integer')
2196
+ sys.exit()
2197
+
2198
+ if isinstance(print_id_results,bool)==False:
2199
+ print('\nError: print_id_results must be either True or False')
2200
+ sys.exit()
2201
+
2202
+ if output_identification is None:
2203
+ output_identification = f'{Path.cwd()}/output_identification.txt'
2204
+ print(f'Warning: writing identification output to {output_identification}')
2205
+
2206
+ if output_similarity_scores is None:
2207
+ output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
2208
+ print(f'Warning: writing similarity scores to {output_similarity_scores}')
2209
+
2210
+
2211
+
2212
+ min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
2213
+ max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
2214
+ mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
2215
+
2216
+ all_similarity_scores = []
2217
+ for query_idx in range(0,len(unique_query_ids)):
2218
+ q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
2219
+ q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
2220
+ q_spec_tmp = convert_spec(q_spec_tmp,mzs)
2221
+
2222
+ similarity_scores = []
2223
+ for ref_idx in range(0,len(unique_reference_ids)):
2224
+ if verbose is True and ref_idx % 1000 == 0:
2225
+ print(f'Query spectrum #{query_idx} has had its similarity with {ref_idx} reference library spectra computed')
2226
+ q_spec = q_spec_tmp
2227
+ r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
2228
+ r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
2229
+ r_spec = convert_spec(r_spec_tmp,mzs)
2230
+
2231
+ for transformation in spectrum_preprocessing_order:
2232
+ if np.isinf(q_spec[:,1]).sum() > 0:
2233
+ q_spec[:,1] = np.zeros(q_spec.shape[0])
2234
+ if np.isinf(r_spec[:,1]).sum() > 0:
2235
+ r_spec[:,1] = np.zeros(r_spec.shape[0])
2236
+ if transformation == 'W':
2237
+ q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
2238
+ r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
2239
+ if transformation == 'L':
2240
+ q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
2241
+ r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
2242
+ if transformation == 'N':
2243
+ q_spec = remove_noise(q_spec, nr = noise_threshold)
2244
+ if high_quality_reference_library == False:
2245
+ r_spec = remove_noise(r_spec, nr = noise_threshold)
2246
+ if transformation == 'F':
2247
+ q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
2248
+ if high_quality_reference_library == False:
2249
+ r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
2250
+
2251
+ q_ints = q_spec[:,1]
2252
+ r_ints = r_spec[:,1]
2253
+
2254
+ if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
2255
+ similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
2256
+ else:
2257
+ similarity_score = 0
2258
+
2259
+ similarity_scores.append(similarity_score)
2260
+ all_similarity_scores.append(similarity_scores)
2261
+
2262
+ df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
2263
+ df_scores.index = unique_query_ids
2264
+ df_scores.index.names = ['QUERY.SPECTRUM.ID']
2265
+
2266
+ preds = []
2267
+ scores = []
2268
+ for i in range(0, df_scores.shape[0]):
2269
+ df_scores_tmp = df_scores
2270
+ preds_tmp = []
2271
+ scores_tmp = []
2272
+ for j in range(0, n_top_matches_to_save):
2273
+ top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
2274
+ cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
2275
+ df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
2276
+
2277
+ preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
2278
+ if len(top_ref_specs_tmp.values) == 0:
2279
+ scores_tmp.append(0)
2280
+ else:
2281
+ scores_tmp.append(top_ref_specs_tmp.values[0])
2282
+ preds.append(preds_tmp)
2283
+ scores.append(scores_tmp)
2284
+
2285
+ preds = np.array(preds)
2286
+ scores = np.array(scores)
2287
+ out = np.c_[preds,scores]
2288
+
2289
+ cnames_preds = []
2290
+ cnames_scores = []
2291
+ for i in range(0,n_top_matches_to_save):
2292
+ cnames_preds.append(f'RANK.{i+1}.PRED')
2293
+ cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
2294
+
2295
+ df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
2296
+ df_top_ref_specs.index = unique_query_ids
2297
+ df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
2298
+
2299
+ if print_id_results == True:
2300
+ print(df_top_ref_specs.to_string())
2301
+
2302
+ df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
2303
+
2304
+ if return_ID_output is False:
2305
+ df_top_ref_specs.to_csv(output_identification, sep='\t')
2306
+ df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
2307
+ df_scores.to_csv(output_similarity_scores, sep='\t')
2308
+ else:
2309
+ return df_top_ref_specs
2310
+
2311
+
35
2312
  class _UIWriter:
36
2313
  def __init__(self, loop, q: asyncio.Queue[str]):
37
2314
  self._loop = loop
@@ -90,19 +2367,21 @@ def strip_weights(s):
90
2367
  def build_library(input_path=None, output_path=None):
91
2368
  last_three_chars = input_path[(len(input_path)-3):len(input_path)]
92
2369
  last_four_chars = input_path[(len(input_path)-4):len(input_path)]
93
- if last_three_chars == 'csv' or last_three_chars == 'CSV':
94
- return pd.read_csv(input_path)
2370
+ if last_three_chars == 'txt' or last_three_chars == 'TXT':
2371
+ return pd.read_csv(input_path, sep='\t')
95
2372
  else:
96
2373
  if last_three_chars == 'mgf' or last_three_chars == 'MGF':
97
2374
  input_file_type = 'mgf'
98
2375
  elif last_four_chars == 'mzML' or last_four_chars == 'mzml' or last_four_chars == 'MZML':
99
2376
  input_file_type = 'mzML'
2377
+ elif last_four_chars == 'json' or last_four_chars == 'JSON':
2378
+ input_file_type = 'json'
100
2379
  elif last_three_chars == 'cdf' or last_three_chars == 'CDF':
101
2380
  input_file_type = 'cdf'
102
2381
  elif last_three_chars == 'msp' or last_three_chars == 'MSP':
103
2382
  input_file_type = 'msp'
104
2383
  else:
105
- print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', or \'msp\' file must be passed to --input_path')
2384
+ print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'msp\', \'json\', or \'txt\' file must be passed to --input_path')
106
2385
  sys.exit()
107
2386
 
108
2387
  spectra = []
@@ -172,6 +2451,23 @@ def build_library(input_path=None, output_path=None):
172
2451
  except ValueError:
173
2452
  continue
174
2453
 
2454
+ if input_file_type == 'json':
2455
+ data = json.load(open(input_path))
2456
+ ids = []
2457
+ mzs = []
2458
+ ints = []
2459
+ for i in range(0,len(data)):
2460
+ spec_ID_tmp = data[i]['spectrum_id']
2461
+ tmp = data[i]['peaks_json']
2462
+ tmp = tmp[1:-1].split(",")
2463
+ tmp = [a.replace("[","") for a in tmp]
2464
+ tmp = [a.replace("]","") for a in tmp]
2465
+ mzs_tmp = tmp[0::2]
2466
+ ints_tmp = tmp[1::2]
2467
+ ids.extend([spec_ID_tmp] * len(mzs_tmp))
2468
+ mzs.extend(mzs_tmp)
2469
+ ints.extend(ints_tmp)
2470
+
175
2471
  df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
176
2472
  return df
177
2473
 
@@ -180,9 +2476,12 @@ def build_library(input_path=None, output_path=None):
180
2476
  def extract_first_column_ids(file_path: str, max_ids: int = 20000):
181
2477
  suffix = Path(file_path).suffix.lower()
182
2478
 
183
- if suffix == ".csv":
184
- df = pd.read_csv(file_path, usecols=[0])
185
- ids = df.iloc[:, 0].astype(str).dropna()
2479
+ if suffix == ".txt":
2480
+ df = pd.read_csv(file_path, sep='\t')
2481
+ if 'id' in df.columns.tolist():
2482
+ ids = df['id'].astype(str).dropna()
2483
+ else:
2484
+ ids = df.iloc[:, 0].astype(str).dropna()
186
2485
  ids = [x for x in ids if x.strip() != ""]
187
2486
  seen = set()
188
2487
  uniq = []
@@ -217,17 +2516,17 @@ def extract_first_column_ids(file_path: str, max_ids: int = 20000):
217
2516
  return []
218
2517
 
219
2518
 
220
- def _open_plot_window(session, png_bytes: bytes, title: str = "plot.png"):
221
- """Send PNG bytes to browser and open in a new window as a data URL."""
222
- b64 = base64.b64encode(png_bytes).decode("ascii")
223
- data_url = f"data:image/png;base64,{b64}"
224
- session.send_custom_message("open-plot-window", {"png": data_url, "title": title})
2519
+ def _open_plot_window(session, svg_bytes: bytes, title: str = "plot.svg"):
2520
+ """Send SVG bytes to browser and open in a new window as a data URL."""
2521
+ b64 = base64.b64encode(svg_bytes).decode("ascii")
2522
+ data_url = f"data:image/svg;base64,{b64}"
2523
+ session.send_custom_message("open-plot-window", {"svg": data_url, "title": title})
225
2524
 
226
2525
 
227
2526
  def plot_spectra_ui(platform: str):
228
2527
  base_inputs = [
229
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or csv):"),
230
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or csv):"),
2528
+ ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2529
+ ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
231
2530
  ui.input_selectize(
232
2531
  "spectrum_ID1",
233
2532
  "Select spectrum ID 1 (default is the first spectrum in the library):",
@@ -242,6 +2541,8 @@ def plot_spectra_ui(platform: str):
242
2541
  multiple=False,
243
2542
  options={"placeholder": "Upload a library..."},
244
2543
  ),
2544
+ ui.input_select('print_url_spectrum1', 'Print PubChem URL for spectrum 1:', ['No', 'Yes']),
2545
+ ui.input_select('print_url_spectrum2', 'Print PubChem URL for spectrum 2:', ['No', 'Yes']),
245
2546
  ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
246
2547
  ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
247
2548
  ui.input_select(
@@ -253,21 +2554,13 @@ def plot_spectra_ui(platform: str):
253
2554
 
254
2555
  if platform == "HRMS":
255
2556
  extra_inputs = [
256
- ui.input_text(
257
- "spectrum_preprocessing_order",
258
- "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.",
259
- "FCNMWL",
260
- ),
2557
+ ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL",),
261
2558
  ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
262
2559
  ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
263
2560
  ]
264
2561
  else:
265
2562
  extra_inputs = [
266
- ui.input_text(
267
- "spectrum_preprocessing_order",
268
- "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
269
- "FNLW",
270
- )
2563
+ ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW",)
271
2564
  ]
272
2565
 
273
2566
  numeric_inputs = [
@@ -282,11 +2575,7 @@ def plot_spectra_ui(platform: str):
282
2575
  ui.input_numeric("entropy_dimension", "Entropy dimension (Renyi/Tsallis only):", 1.1),
283
2576
  ]
284
2577
 
285
- select_input = ui.input_select(
286
- "y_axis_transformation",
287
- "Transformation to apply to intensity axis:",
288
- ["normalized", "none", "log10", "sqrt"],
289
- )
2578
+ select_input = ui.input_select("y_axis_transformation", "Transformation to apply to intensity axis:", ["normalized", "none", "log10", "sqrt"])
290
2579
 
291
2580
  run_button_plot_spectra = ui.download_button("run_btn_plot_spectra", "Run", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
292
2581
  back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
@@ -294,15 +2583,15 @@ def plot_spectra_ui(platform: str):
294
2583
  if platform == "HRMS":
295
2584
  inputs_columns = ui.layout_columns(
296
2585
  ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
297
- ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
298
- ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
299
- ui.div([numeric_inputs[5:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
2586
+ ui.div([base_inputs[6:9], extra_inputs[0]], style="display:flex; flex-direction:column; gap:10px;"),
2587
+ ui.div(extra_inputs[1:3], numeric_inputs[0:3], style="display:flex; flex-direction:column; gap:10px;"),
2588
+ ui.div([numeric_inputs[3:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
300
2589
  col_widths=(3,3,3,3),
301
2590
  )
302
2591
  elif platform == "NRMS":
303
2592
  inputs_columns = ui.layout_columns(
304
2593
  ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
305
- ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
2594
+ ui.div([base_inputs[6:9], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
306
2595
  ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
307
2596
  ui.div([numeric_inputs[5:10], select_input], style="display:flex; flex-direction:column; gap:10px;"),
308
2597
  col_widths=(3,3,3,3),
@@ -323,49 +2612,29 @@ def plot_spectra_ui(platform: str):
323
2612
 
324
2613
  def run_spec_lib_matching_ui(platform: str):
325
2614
  base_inputs = [
326
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or csv):"),
327
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or csv):"),
2615
+ ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2616
+ ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
328
2617
  ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
329
2618
  ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '0.25, 0.25, 0.25, 0.25'),
330
- ui.input_selectize(
331
- "spectrum_ID1",
332
- "Select spectrum ID 1 (only applicable for plotting; default is the first spectrum in the query library):",
333
- choices=[],
334
- multiple=False,
335
- options={"placeholder": "Upload a library..."},
336
- ),
337
- ui.input_selectize(
338
- "spectrum_ID2",
339
- "Select spectrum ID 2 (only applicable for plotting; default is the first spectrum in the reference library):",
340
- choices=[],
341
- multiple=False,
342
- options={"placeholder": "Upload a library..."},
343
- ),
344
- ui.input_select(
345
- "high_quality_reference_library",
346
- "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.",
347
- [False, True],
348
- )
2619
+ ui.input_file('compound_ID_output_file', 'Upload output from spectral library matching to plot top matches (optional)'),
2620
+ ui.input_selectize("q_spec", "Select query spectrum (only applicable for plotting; default is the first spectrum in the compound ID output):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
2621
+ ui.input_selectize("r_spec", "Select reference spectrum (only applicable for plotting; default is the rank 1 reference spectrum):", choices=[], multiple=False, options={"placeholder": "Upload compound ID output..."}),
2622
+ ui.input_select('print_url_spectrum1', 'Print PubChem URL for query spectrum (only applicable for plotting):', ['No', 'Yes']),
2623
+ ui.input_select('print_url_spectrum2', 'Print PubChem URL for reference spectrum (only applicable for plotting):', ['No', 'Yes']),
2624
+ ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])
349
2625
  ]
350
2626
 
351
2627
  if platform == "HRMS":
352
2628
  extra_inputs = [
353
- ui.input_text(
354
- "spectrum_preprocessing_order",
355
- "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.",
356
- "FCNMWL",
357
- ),
2629
+ ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
2630
+ ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
2631
+ ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
2632
+ ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.","FCNMWL"),
358
2633
  ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
359
2634
  ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
360
2635
  ]
361
2636
  else:
362
- extra_inputs = [
363
- ui.input_text(
364
- "spectrum_preprocessing_order",
365
- "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
366
- "FNLW",
367
- )
368
- ]
2637
+ extra_inputs = [ui.input_text("spectrum_preprocessing_order","Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).","FNLW")]
369
2638
 
370
2639
  numeric_inputs = [
371
2640
  ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
@@ -387,16 +2656,16 @@ def run_spec_lib_matching_ui(platform: str):
387
2656
 
388
2657
  if platform == "HRMS":
389
2658
  inputs_columns = ui.layout_columns(
390
- ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
391
- ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
392
- ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
393
- ui.div(numeric_inputs[5:10], style="display:flex; flex-direction:column; gap:10px;"),
2659
+ ui.div([base_inputs[0:2], extra_inputs[0:3], base_inputs[2:4]], style="display:flex; flex-direction:column; gap:10px;"),
2660
+ ui.div([base_inputs[4:10]], style="display:flex; flex-direction:column; gap:10px;"),
2661
+ ui.div([extra_inputs[3:6], numeric_inputs[0:3]], style="display:flex; flex-direction:column; gap:10px;"),
2662
+ ui.div(numeric_inputs[3:10], style="display:flex; flex-direction:column; gap:10px;"),
394
2663
  col_widths=(3,3,3,3)
395
2664
  )
396
2665
  elif platform == "NRMS":
397
2666
  inputs_columns = ui.layout_columns(
398
2667
  ui.div(base_inputs[0:6], style="display:flex; flex-direction:column; gap:10px;"),
399
- ui.div([base_inputs[6:7], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
2668
+ ui.div([base_inputs[6:10], *extra_inputs], style="display:flex; flex-direction:column; gap:10px;"),
400
2669
  ui.div(numeric_inputs[0:5], style="display:flex; flex-direction:column; gap:10px;"),
401
2670
  ui.div(numeric_inputs[5:10], style="display:flex; flex-direction:column; gap:10px;"),
402
2671
  col_widths=(3,3,3,3)
@@ -423,8 +2692,8 @@ def run_spec_lib_matching_ui(platform: str):
423
2692
 
424
2693
  def run_parameter_tuning_grid_ui(platform: str):
425
2694
  base_inputs = [
426
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or csv):"),
427
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or csv):"),
2695
+ ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2696
+ ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
428
2697
  ui.input_selectize("similarity_measure", "Select similarity measure(s):", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"], multiple=True, selected='cosine'),
429
2698
  ui.input_text('weights', 'Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):', '((0.25, 0.25, 0.25, 0.25))'),
430
2699
  ui.input_text("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", '[True]')
@@ -432,11 +2701,10 @@ def run_parameter_tuning_grid_ui(platform: str):
432
2701
 
433
2702
  if platform == "HRMS":
434
2703
  extra_inputs = [
435
- ui.input_text(
436
- "spectrum_preprocessing_order",
437
- "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.",
438
- "[FCNMWL,CWM]",
439
- ),
2704
+ ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
2705
+ ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
2706
+ ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
2707
+ ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "[FCNMWL,CWM]"),
440
2708
  ui.input_text("window_size_centroiding", "Centroiding window-size:", "[0.5]"),
441
2709
  ui.input_text("window_size_matching", "Matching window-size:", "[0.1,0.5]"),
442
2710
  ]
@@ -490,7 +2758,7 @@ def run_parameter_tuning_grid_ui(platform: str):
490
2758
 
491
2759
  return ui.div(
492
2760
  ui.TagList(
493
- ui.h2("Tune parameters"),
2761
+ ui.h2("Tune parameters (grid search)"),
494
2762
  inputs_columns,
495
2763
  run_button_parameter_tuning_grid,
496
2764
  back_button,
@@ -527,48 +2795,23 @@ def run_parameter_tuning_DE_ui(platform: str):
527
2795
  PARAMS = PARAMS_NRMS
528
2796
 
529
2797
  base_inputs = [
530
- ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or csv):"),
531
- ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or csv):"),
532
- ui.input_select(
533
- "similarity_measure",
534
- "Select similarity measure:",
535
- [
536
- "cosine","shannon","renyi","tsallis","mixture","jaccard","dice",
537
- "3w_jaccard","sokal_sneath","binary_cosine","mountford",
538
- "mcconnaughey","driver_kroeber","simpson","braun_banquet",
539
- "fager_mcgowan","kulczynski","intersection","hamming","hellinger",
540
- ],
541
- ),
542
- ui.input_text(
543
- "weights",
544
- "Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):",
545
- "0.25, 0.25, 0.25, 0.25",
546
- ),
547
- ui.input_select(
548
- "high_quality_reference_library",
549
- "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.",
550
- [False, True],
551
- ),
552
- ]
2798
+ ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or txt):"),
2799
+ ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or txt):"),
2800
+ ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
2801
+ ui.input_text("weights", "Weights for mixture similarity measure (cosine, shannon, renyi, tsallis):", "0.25, 0.25, 0.25, 0.25"),
2802
+ ui.input_select("high_quality_reference_library", "Indicate whether the reference library is considered high quality. If True, filtering and noise removal are only applied to the query spectra.", [False, True])]
553
2803
 
554
2804
  if platform == "HRMS":
555
2805
  extra_inputs = [
556
- ui.input_text(
557
- "spectrum_preprocessing_order",
558
- "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.",
559
- "FCNMWL",
560
- ),
2806
+ ui.input_numeric("precursor_ion_mz_tolerance", "Precursor ion mass tolerance (leave blank if not applicable):", None),
2807
+ ui.input_select("ionization_mode", "Ionization mode:", ['Positive','Negative','N/A'], selected='N/A'),
2808
+ ui.input_select("adduct", "Adduct:", ['H','NH3','NH4','Na','K','N/A'], selected='N/A'),
2809
+ ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (C (centroiding), F (filtering), M (matching), N (noise removal), L (low-entropy transformation), W (weight factor transformation)). M must be included, C before M if used.", "FCNMWL"),
561
2810
  ui.input_numeric("window_size_centroiding", "Centroiding window-size:", 0.5),
562
2811
  ui.input_numeric("window_size_matching", "Matching window-size:", 0.5),
563
2812
  ]
564
2813
  else:
565
- extra_inputs = [
566
- ui.input_text(
567
- "spectrum_preprocessing_order",
568
- "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).",
569
- "FNLW",
570
- )
571
- ]
2814
+ extra_inputs = [ui.input_text("spectrum_preprocessing_order", "Sequence of characters for preprocessing order (F (filtering), N (noise removal), L (low-entropy transformation), W (weight factor transformation)).", "FNLW")]
572
2815
 
573
2816
  numeric_inputs = [
574
2817
  ui.input_numeric("mz_min", "Minimum m/z for filtering:", 0),
@@ -583,18 +2826,9 @@ def run_parameter_tuning_DE_ui(platform: str):
583
2826
  ui.input_numeric("max_iterations", "Maximum number of iterations:", 5),
584
2827
  ]
585
2828
 
586
- run_button_parameter_tuning_DE = ui.input_action_button(
587
- "run_btn_parameter_tuning_DE",
588
- "Tune parameters (differential evolution optimization)",
589
- style="font-size:16px; padding:15px 30px; width:300px; height:100px",
590
- )
591
- back_button = ui.input_action_button(
592
- "back",
593
- "Back to main menu",
594
- style="font-size:16px; padding:15px 30px; width:300px; height:100px",
595
- )
2829
+ run_button_parameter_tuning_DE = ui.input_action_button("run_btn_parameter_tuning_DE", "Tune parameters (differential evolution optimization)", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
2830
+ back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:300px; height:100px")
596
2831
 
597
- # Build the 4-column inputs panel (fixed slices corrected, unpack lists properly)
598
2832
  if platform == "HRMS":
599
2833
  inputs_columns = ui.layout_columns(
600
2834
  ui.div(*base_inputs, style="display:flex; flex-direction:column; gap:10px;"),
@@ -603,7 +2837,7 @@ def run_parameter_tuning_DE_ui(platform: str):
603
2837
  ui.div(*numeric_inputs[5:11], style="display:flex; flex-direction:column; gap:10px;"),
604
2838
  col_widths=(3, 3, 3, 3),
605
2839
  )
606
- else: # NRMS
2840
+ else:
607
2841
  inputs_columns = ui.layout_columns(
608
2842
  ui.div(*base_inputs, style="display:flex; flex-direction:column; gap:10px;"),
609
2843
  ui.div(*extra_inputs, style="display:flex; flex-direction:column; gap:10px;"),
@@ -612,17 +2846,11 @@ def run_parameter_tuning_DE_ui(platform: str):
612
2846
  col_widths=(3, 3, 3, 3),
613
2847
  )
614
2848
 
615
- # Main page: sidebar (param selection + bounds) and body (inputs + buttons + live log)
616
2849
  return ui.page_fillable(
617
2850
  ui.layout_sidebar(
618
2851
  ui.sidebar(
619
2852
  ui.h3("Select continuous parameters to optimize"),
620
- ui.input_checkbox_group(
621
- "params",
622
- None,
623
- choices=list(PARAMS.keys()),
624
- selected=["noise_threshold", "LET_threshold"],
625
- ),
2853
+ ui.input_checkbox_group("params", None, choices=list(PARAMS.keys()), selected=["noise_threshold", "LET_threshold"]),
626
2854
  ui.hr(),
627
2855
  ui.h4("Bounds for selected parameters"),
628
2856
  ui.output_ui("bounds_inputs"),
@@ -631,12 +2859,11 @@ def run_parameter_tuning_DE_ui(platform: str):
631
2859
  ui.div(
632
2860
  ui.h2("Tune parameters (differential evolution optimization)"),
633
2861
  inputs_columns,
634
- run_button_parameter_tuning_DE,
635
- back_button,
2862
+ ui.div(run_button_parameter_tuning_DE, back_button, style=("display:flex; flex-direction:row; gap:12px; align-items:center; flex-wrap:wrap;")),
636
2863
  ui.br(),
637
2864
  ui.card(
638
2865
  ui.card_header("Live log"),
639
- ui.output_text_verbatim("run_log"), # <-- make sure server defines this
2866
+ ui.output_text_verbatim("run_log"),
640
2867
  ),
641
2868
  style="display:flex; flex-direction:column; gap:16px;",
642
2869
  ),
@@ -645,15 +2872,16 @@ def run_parameter_tuning_DE_ui(platform: str):
645
2872
 
646
2873
 
647
2874
 
648
-
649
-
650
2875
  app_ui = ui.page_fluid(
651
2876
  ui.head_content(ui.tags.link(rel="icon", href="emblem.png")),
2877
+ ui.div(ui.output_image("image"), style=("display:block; margin:20px auto; max-width:320px; height:auto; text-align:center")),
652
2878
  ui.output_ui("main_ui"),
653
- ui.output_text("status_output")
2879
+ ui.output_text("status_output"),
654
2880
  )
655
2881
 
656
2882
 
2883
+
2884
+
657
2885
  def server(input, output, session):
658
2886
 
659
2887
  current_page = reactive.Value("main_menu")
@@ -672,7 +2900,7 @@ def server(input, output, session):
672
2900
  match_log_rv = reactive.Value("")
673
2901
  is_matching_rv = reactive.Value(False)
674
2902
  is_any_job_running = reactive.Value(False)
675
- latest_csv_path_rv = reactive.Value("")
2903
+ latest_txt_path_rv = reactive.Value("")
676
2904
  latest_df_rv = reactive.Value(None)
677
2905
  is_running_rv = reactive.Value(False)
678
2906
 
@@ -688,6 +2916,106 @@ def server(input, output, session):
688
2916
  converted_query_path_rv = reactive.Value(None)
689
2917
  converted_reference_path_rv = reactive.Value(None)
690
2918
 
2919
+ df_rv = reactive.Value(None)
2920
+
2921
+
2922
+ def _discover_rank_cols(df: pd.DataFrame):
2923
+ pred_pat = re.compile(r"^RANK\.(\d+)\.PRED$")
2924
+ score_pat = re.compile(r"^RANK\.(\d+)\.SIMILARITY\.SCORE$")
2925
+ pred_map, score_map = {}, {}
2926
+ for c in df.columns:
2927
+ m = pred_pat.match(c)
2928
+ if m: pred_map[int(m.group(1))] = c
2929
+ m = score_pat.match(c)
2930
+ if m: score_map[int(m.group(1))] = c
2931
+ return [(k, pred_map[k], score_map.get(k)) for k in sorted(pred_map)]
2932
+
2933
+
2934
+ def _rank_choices_for_query(df: pd.DataFrame, qid: str):
2935
+ sub = df.loc[df["QUERY.SPECTRUM.ID"].astype(str) == str(qid)]
2936
+ if sub.empty:
2937
+ return {}, None
2938
+ row = sub.iloc[0]
2939
+ rank_cols = _discover_rank_cols(df)
2940
+ if not rank_cols:
2941
+ return {}, None
2942
+
2943
+ choices = {}
2944
+ default_value = None
2945
+ for (k, pred_col, score_col) in rank_cols:
2946
+ pred = row.get(pred_col, None)
2947
+ if pd.isna(pred):
2948
+ continue
2949
+ pred = str(pred)
2950
+ score = row.get(score_col, None) if score_col else None
2951
+ score_str = f"{float(score):.6f}" if (score is not None and pd.notna(score)) else "NA"
2952
+ label = f"Rank {k} — {score_str} — {pred}"
2953
+ choices[label] = pred # values are plain names
2954
+ if k == 1:
2955
+ default_value = pred # default = Rank 1 name
2956
+
2957
+ if default_value is None and choices:
2958
+ default_value = next(iter(choices.values()))
2959
+ return choices, default_value
2960
+
2961
+
2962
+ @reactive.effect
2963
+ @reactive.event(input.compound_ID_output_file)
2964
+ async def _populate_ids_from_compound_ID_output_upload():
2965
+ files = input.compound_ID_output_file()
2966
+ if not files:
2967
+ return
2968
+
2969
+ in_path = Path(files[0]["datapath"])
2970
+ try:
2971
+ query_status_rv.set(f"Reading table from: {in_path.name} …")
2972
+ await reactive.flush()
2973
+
2974
+ df = await asyncio.to_thread(pd.read_csv, in_path, sep="\t", header=0)
2975
+
2976
+ if "QUERY.SPECTRUM.ID" not in df.columns:
2977
+ raise ValueError("Missing required column: QUERY.SPECTRUM.ID")
2978
+ if not _discover_rank_cols(df):
2979
+ raise ValueError("No columns matching RANK.<k>.PRED found.")
2980
+
2981
+ df_rv.set(df)
2982
+
2983
+ ids = df["QUERY.SPECTRUM.ID"].astype(str).tolist()
2984
+ unique_ids_in_order = list(dict.fromkeys(ids))
2985
+
2986
+ choices_dict, default_rank_value = _rank_choices_for_query(df, ids[0])
2987
+ choices_values = [str(v).strip() for v in choices_dict.values()]
2988
+ default_rank_value = str(default_rank_value).strip() if default_rank_value is not None else None
2989
+
2990
+ ui.update_selectize("q_spec", choices=unique_ids_in_order, selected=ids[0])
2991
+ await reactive.flush()
2992
+
2993
+ ui.update_selectize("r_spec", choices=choices_values, selected=choices_values[0])
2994
+ await reactive.flush()
2995
+
2996
+ except Exception as e:
2997
+ query_status_rv.set(f"❌ Failed: {e}")
2998
+ await reactive.flush()
2999
+ raise
3000
+
3001
+
3002
+ @reactive.effect
3003
+ @reactive.event(input.q_spec)
3004
+ async def _update_rank_choices_on_compound_ID_change():
3005
+ df = df_rv.get()
3006
+ if df is None:
3007
+ return
3008
+ qid = input.q_spec()
3009
+ if not qid:
3010
+ return
3011
+
3012
+ choices, default_rank_value = _rank_choices_for_query(df, qid)
3013
+ choices = list(choices.values())
3014
+ ui.update_selectize('r_spec', choices=choices, selected=default_rank_value)
3015
+ await reactive.flush()
3016
+
3017
+
3018
+
691
3019
  @output
692
3020
  @render.ui
693
3021
  def bounds_inputs():
@@ -830,6 +3158,11 @@ def server(input, output, session):
830
3158
  def flush(self):
831
3159
  pass
832
3160
 
3161
+ def _run_with_redirects(func, writer: ReactiveWriter, **kwargs):
3162
+ with contextlib.redirect_stdout(writer), contextlib.redirect_stderr(writer):
3163
+ return func(**kwargs)
3164
+
3165
+
833
3166
 
834
3167
  @reactive.effect
835
3168
  async def _pump_logs():
@@ -926,7 +3259,7 @@ def server(input, output, session):
926
3259
  @render.image
927
3260
  def image():
928
3261
  dir = Path(__file__).resolve().parent
929
- img: ImgData = {"src": str(dir / "www/emblem.png"), "width": "320px", "height": "250px"}
3262
+ img: ImgData = {"src": str(dir / "www/emblem.png"), "width": "250px", "height": "250px"}
930
3263
  return img
931
3264
 
932
3265
  @output
@@ -935,30 +3268,10 @@ def server(input, output, session):
935
3268
  if current_page() == "main_menu":
936
3269
  return ui.page_fluid(
937
3270
  ui.h2("Main Menu"),
938
- ui.div(
939
- ui.output_image("image"),
940
- #ui.img(src="emblem.png", width="320px", height="250px"),
941
- style=(
942
- "position:fixed; top:0; left:50%; transform:translateX(-50%); "
943
- "z-index:1000; text-align:center; padding:10px; background-color:white;"
944
- ),
945
- ),
946
- ui.div(
947
- "Overview:",
948
- style="text-align:left; font-size:24px; font-weight:bold; margin-top:350px"
949
- ),
950
- ui.div(
951
- "PyCompound is a Python-based tool designed for performing spectral library matching on either high-resolution mass spectrometry data (HRMS) or low-resolution mass spectrometry data (NRMS). PyCompound offers a range of spectrum preprocessing transformations and similarity measures. These spectrum preprocessing transformations include filtering on mass/charge and/or intensity values, weight factor transformation, low-entropy transformation, centroiding, noise removal, and matching. The available similarity measures include the canonical Cosine similarity measure, three entropy-based similarity measures, and a variety of binary similarity measures: Jaccard, Dice, 3W-Jaccard, Sokal-Sneath, Binary Cosine, Mountford, McConnaughey, Driver-Kroeber, Simpson, Braun-Banquet, Fager-McGowan, Kulczynski, Intersection, Hamming, and Hellinger.",
952
- style="margin-top:10px; text-align:left; font-size:16px; font-weight:500"
953
- ),
954
- ui.div(
955
- "Select options:",
956
- style="margin-top:30px; text-align:left; font-size:24px; font-weight:bold"
957
- ),
958
- ui.div(
959
- ui.input_radio_buttons("chromatography_platform", "Specify chromatography platform:", ["HRMS","NRMS"]),
960
- style="font-size:18px; margin-top:10px; max-width:none"
961
- ),
3271
+ ui.div("Overview:", style="text-align:left; font-size:24px; font-weight:bold"),
3272
+ ui.div("PyCompound is a Python-based tool designed for performing spectral library matching on either high-resolution mass spectrometry data (HRMS) or low-resolution mass spectrometry data (NRMS). PyCompound offers a range of spectrum preprocessing transformations and similarity measures. These spectrum preprocessing transformations include filtering on mass/charge and/or intensity values, weight factor transformation, low-entropy transformation, centroiding, noise removal, and matching. The available similarity measures include the canonical Cosine similarity measure, three entropy-based similarity measures, and a variety of binary similarity measures: Jaccard, Dice, 3W-Jaccard, Sokal-Sneath, Binary Cosine, Mountford, McConnaughey, Driver-Kroeber, Simpson, Braun-Banquet, Fager-McGowan, Kulczynski, Intersection, Hamming, and Hellinger.", style="margin-top:10px; text-align:left; font-size:16px; font-weight:500"),
3273
+ ui.div("Select options:", style="margin-top:30px; text-align:left; font-size:24px; font-weight:bold"),
3274
+ ui.div(ui.input_radio_buttons("chromatography_platform", "Specify chromatography platform:", ["HRMS","NRMS"]), style="font-size:18px; margin-top:10px; max-width:none"),
962
3275
  ui.input_action_button("plot_spectra", "Plot two spectra before and after preprocessing transformations.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
963
3276
  ui.input_action_button("run_spec_lib_matching", "Run spectral library matching to perform compound identification on a query library of spectra.", style="font-size:18px; padding:20px 40px; width:550px; height:100px; margin-top:10px; margin-right:50px"),
964
3277
  ui.input_action_button("run_parameter_tuning_grid", "Grid search: Tune parameters to maximize accuracy of compound identification given a query library with known spectrum IDs.", style="font-size:18px; padding:20px 40px; width:450px; height:120px; margin-top:10px; margin-right:50px"),
@@ -1031,36 +3344,36 @@ def server(input, output, session):
1031
3344
  suffix = in_path.suffix.lower()
1032
3345
 
1033
3346
  try:
1034
- if suffix == ".csv":
1035
- csv_path = in_path
1036
- converted_query_path_rv.set(str(csv_path))
3347
+ if suffix == ".txt":
3348
+ txt_path = in_path
3349
+ converted_query_path_rv.set(str(txt_path))
1037
3350
  else:
1038
- query_status_rv.set(f"Converting {in_path.name} → CSV …")
3351
+ query_status_rv.set(f"Converting {in_path.name} → TXT…")
1039
3352
  await reactive.flush()
1040
3353
 
1041
- tmp_csv_path = in_path.with_suffix(".converted.csv")
3354
+ tmp_txt_path = in_path.with_suffix(".converted.txt")
1042
3355
 
1043
- out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_csv_path))
3356
+ out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
1044
3357
 
1045
3358
  if isinstance(out_obj, (str, os.PathLike, Path)):
1046
- csv_path = Path(out_obj)
3359
+ txt_path = Path(out_obj)
1047
3360
  elif isinstance(out_obj, pd.DataFrame):
1048
- out_obj.to_csv(tmp_csv_path, index=False, sep='\t')
1049
- csv_path = tmp_csv_path
3361
+ out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
3362
+ txt_path = tmp_txt_path
1050
3363
  else:
1051
3364
  raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
1052
3365
 
1053
- converted_query_path_rv.set(str(csv_path))
3366
+ converted_query_path_rv.set(str(txt_path))
1054
3367
 
1055
- query_status_rv.set(f"Reading IDs from: {csv_path.name} …")
3368
+ query_status_rv.set(f"Reading IDs from: {txt_path.name} …")
1056
3369
  await reactive.flush()
1057
3370
 
1058
- ids = await asyncio.to_thread(extract_first_column_ids, str(csv_path))
3371
+ ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
1059
3372
  query_ids_rv.set(ids)
1060
3373
 
1061
3374
  ui.update_selectize("spectrum_ID1", choices=ids, selected=(ids[0] if ids else None))
1062
3375
 
1063
- query_status_rv.set(f"✅ Loaded {len(ids)} IDs from {csv_path.name}" if ids else f"⚠️ No IDs found in {csv_path.name}")
3376
+ query_status_rv.set(f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}")
1064
3377
  await reactive.flush()
1065
3378
 
1066
3379
  except Exception as e:
@@ -1080,37 +3393,37 @@ def server(input, output, session):
1080
3393
  suffix = in_path.suffix.lower()
1081
3394
 
1082
3395
  try:
1083
- if suffix == ".csv":
1084
- csv_path = in_path
1085
- converted_reference_path_rv.set(str(csv_path))
3396
+ if suffix == ".txt":
3397
+ txt_path = in_path
3398
+ converted_reference_path_rv.set(str(txt_path))
1086
3399
  else:
1087
- reference_status_rv.set(f"Converting {in_path.name} → CSV …")
3400
+ reference_status_rv.set(f"Converting {in_path.name} → TXT…")
1088
3401
  await reactive.flush()
1089
3402
 
1090
- tmp_csv_path = in_path.with_suffix(".converted.csv")
3403
+ tmp_txt_path = in_path.with_suffix(".converted.txt")
1091
3404
 
1092
- out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_csv_path))
3405
+ out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_txt_path))
1093
3406
 
1094
3407
  if isinstance(out_obj, (str, os.PathLike, Path)):
1095
- csv_path = Path(out_obj)
3408
+ txt_path = Path(out_obj)
1096
3409
  elif isinstance(out_obj, pd.DataFrame):
1097
- out_obj.to_csv(tmp_csv_path, index=False, sep='\t')
1098
- csv_path = tmp_csv_path
3410
+ out_obj.to_csv(tmp_txt_path, index=False, sep='\t')
3411
+ txt_path = tmp_txt_path
1099
3412
  else:
1100
3413
  raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
1101
3414
 
1102
- converted_reference_path_rv.set(str(csv_path))
3415
+ converted_reference_path_rv.set(str(txt_path))
1103
3416
 
1104
- reference_status_rv.set(f"Reading IDs from: {csv_path.name} …")
3417
+ reference_status_rv.set(f"Reading IDs from: {txt_path.name} …")
1105
3418
  await reactive.flush()
1106
3419
 
1107
- ids = await asyncio.to_thread(extract_first_column_ids, str(csv_path))
3420
+ ids = await asyncio.to_thread(extract_first_column_ids, str(txt_path))
1108
3421
  reference_ids_rv.set(ids)
1109
3422
 
1110
3423
  ui.update_selectize("spectrum_ID2", choices=ids, selected=(ids[0] if ids else None))
1111
3424
 
1112
3425
  reference_status_rv.set(
1113
- f"✅ Loaded {len(ids)} IDs from {csv_path.name}" if ids else f"⚠️ No IDs found in {csv_path.name}"
3426
+ f"✅ Loaded {len(ids)} IDs from {txt_path.name}" if ids else f"⚠️ No IDs found in {txt_path.name}"
1114
3427
  )
1115
3428
  await reactive.flush()
1116
3429
 
@@ -1120,7 +3433,7 @@ def server(input, output, session):
1120
3433
  raise
1121
3434
 
1122
3435
 
1123
- @render.download(filename=lambda: f"plot.png")
3436
+ @render.download(filename=lambda: f"plot.svg")
1124
3437
  def run_btn_plot_spectra():
1125
3438
  spectrum_ID1 = input.spectrum_ID1() or None
1126
3439
  spectrum_ID2 = input.spectrum_ID2() or None
@@ -1132,22 +3445,20 @@ def server(input, output, session):
1132
3445
  if input.high_quality_reference_library() != 'False':
1133
3446
  high_quality_reference_library_tmp2 = True
1134
3447
 
1135
- print(input.high_quality_reference_library())
1136
- print(high_quality_reference_library_tmp2)
1137
-
1138
3448
  if input.chromatography_platform() == "HRMS":
1139
- fig = generate_plots_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, similarity_measure=input.similarity_measure(), weights=weights, spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
3449
+ fig = generate_plots_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), weights=weights, spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
1140
3450
  plt.show()
1141
3451
  elif input.chromatography_platform() == "NRMS":
1142
- fig = generate_plots_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
3452
+ fig = generate_plots_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, print_url_spectrum1=input.print_url_spectrum1(), print_url_spectrum2=input.print_url_spectrum2(), similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=high_quality_reference_library_tmp2, mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
1143
3453
  plt.show()
1144
3454
  with io.BytesIO() as buf:
1145
- fig.savefig(buf, format="png", dpi=150, bbox_inches="tight")
3455
+ fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
1146
3456
  plt.close()
1147
3457
  yield buf.getvalue()
1148
3458
 
1149
3459
 
1150
3460
 
3461
+
1151
3462
  @render.download(filename="identification_output.txt")
1152
3463
  async def run_btn_spec_lib_matching():
1153
3464
  match_log_rv.set("Running identification...\n")
@@ -1160,7 +3471,7 @@ def server(input, output, session):
1160
3471
  hq = bool(hq)
1161
3472
 
1162
3473
  weights = [float(weight.strip()) for weight in input.weights().split(",") if weight.strip()]
1163
- weights = {'Cosine':weights[0], 'Shannon':weights[1], 'Renyi':weights[2], 'Tsallis':weights[3]}
3474
+ weights = {'Cosine': weights[0], 'Shannon': weights[1], 'Renyi': weights[2], 'Tsallis': weights[3]}
1164
3475
 
1165
3476
  common_kwargs = dict(
1166
3477
  query_data=input.query_data()[0]["datapath"],
@@ -1182,37 +3493,81 @@ def server(input, output, session):
1182
3493
  return_ID_output=True,
1183
3494
  )
1184
3495
 
3496
+ # --- streaming setup (same pattern as your DE block) ---
1185
3497
  loop = asyncio.get_running_loop()
1186
- rw = ReactiveWriter(loop)
3498
+ q: asyncio.Queue[str | None] = asyncio.Queue()
3499
+
3500
+ class UIWriter(io.TextIOBase):
3501
+ def write(self, s: str):
3502
+ if s:
3503
+ loop.call_soon_threadsafe(q.put_nowait, s)
3504
+ return len(s)
3505
+ def flush(self): pass
3506
+
3507
+ async def _drain():
3508
+ while True:
3509
+ msg = await q.get()
3510
+ if msg is None:
3511
+ break
3512
+ match_log_rv.set(match_log_rv.get() + msg)
3513
+ await reactive.flush()
1187
3514
 
3515
+ drain_task = asyncio.create_task(_drain())
3516
+ writer = UIWriter()
3517
+
3518
+ # --- worker wrappers that install redirects INSIDE the thread ---
3519
+ def _run_hrms():
3520
+ with redirect_stdout(writer), redirect_stderr(writer):
3521
+ # optional heartbeat
3522
+ print(">> Starting HRMS identification ...", flush=True)
3523
+ return run_spec_lib_matching_on_HRMS_data_shiny(
3524
+ precursor_ion_mz_tolerance=input.precursor_ion_mz_tolerance(),
3525
+ ionization_mode=input.ionization_mode(),
3526
+ adduct=input.adduct(),
3527
+ window_size_centroiding=input.window_size_centroiding(),
3528
+ window_size_matching=input.window_size_matching(),
3529
+ **common_kwargs
3530
+ )
3531
+
3532
+ def _run_nrms():
3533
+ with redirect_stdout(writer), redirect_stderr(writer):
3534
+ print(">> Starting NRMS identification ...", flush=True)
3535
+ return run_spec_lib_matching_on_NRMS_data_shiny(**common_kwargs)
3536
+
3537
+ # --- run in worker thread and stream output live ---
1188
3538
  try:
1189
- with redirect_stdout(rw), redirect_stderr(rw):
1190
- if input.chromatography_platform() == "HRMS":
1191
- df_out = await asyncio.to_thread(
1192
- run_spec_lib_matching_on_HRMS_data,
1193
- window_size_centroiding=input.window_size_centroiding(),
1194
- window_size_matching=input.window_size_matching(),
1195
- **common_kwargs
1196
- )
1197
- else:
1198
- df_out = await asyncio.to_thread(run_spec_lib_matching_on_NRMS_data, **common_kwargs)
3539
+ if input.chromatography_platform() == "HRMS":
3540
+ df_out = await asyncio.to_thread(_run_hrms)
3541
+ else:
3542
+ df_out = await asyncio.to_thread(_run_nrms)
3543
+
1199
3544
  match_log_rv.set(match_log_rv.get() + "\n✅ Identification finished.\n")
1200
3545
  await reactive.flush()
3546
+
1201
3547
  except Exception as e:
1202
- match_log_rv.set(match_log_rv.get() + f"\n❌ Error: {e}\n")
3548
+ import traceback
3549
+ tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))
3550
+ match_log_rv.set(match_log_rv.get() + f"\n❌ {type(e).__name__}: {e}\n{tb}\n")
1203
3551
  await reactive.flush()
3552
+ # make sure to stop the drainer before re-raising
3553
+ await q.put(None); await drain_task
1204
3554
  raise
1205
3555
 
1206
- yield df_out.to_csv(index=True, sep='\t')
3556
+ finally:
3557
+ await q.put(None)
3558
+ await drain_task
3559
+
3560
+ yield df_out.to_csv(index=True, sep="\t")
3561
+
1207
3562
 
1208
3563
 
1209
3564
 
1210
- @render.download(filename="plot.png")
3565
+ @render.download(filename="plot.svg")
1211
3566
  def run_btn_plot_spectra_within_spec_lib_matching():
1212
3567
  req(input.query_data(), input.reference_data())
1213
3568
 
1214
- spectrum_ID1 = input.spectrum_ID1() or None
1215
- spectrum_ID2 = input.spectrum_ID2() or None
3569
+ spectrum_ID1 = input.q_spec() or None
3570
+ spectrum_ID2 = input.r_spec() or None
1216
3571
 
1217
3572
  hq = input.high_quality_reference_library()
1218
3573
  if isinstance(hq, str):
@@ -1228,6 +3583,8 @@ def server(input, output, session):
1228
3583
  reference_data=input.reference_data()[0]['datapath'],
1229
3584
  spectrum_ID1=spectrum_ID1,
1230
3585
  spectrum_ID2=spectrum_ID2,
3586
+ print_url_spectrum1=input.print_url_spectrum1(),
3587
+ print_url_spectrum2=input.print_url_spectrum2(),
1231
3588
  similarity_measure=input.similarity_measure(),
1232
3589
  weights=weights,
1233
3590
  spectrum_preprocessing_order=input.spectrum_preprocessing_order(),
@@ -1253,7 +3610,7 @@ def server(input, output, session):
1253
3610
  plt.show()
1254
3611
 
1255
3612
  with io.BytesIO() as buf:
1256
- fig.savefig(buf, format="png", dpi=150, bbox_inches="tight")
3613
+ fig.savefig(buf, format="svg", dpi=150, bbox_inches="tight")
1257
3614
  plt.close()
1258
3615
  yield buf.getvalue()
1259
3616
 
@@ -1291,6 +3648,9 @@ def server(input, output, session):
1291
3648
 
1292
3649
  try:
1293
3650
  if input.chromatography_platform() == "HRMS":
3651
+ precursor_ion_mz_tolerance = float(input.precursor_ion_mz_tolerance())
3652
+ ionization_mode = str(input.ionization_mode())
3653
+ adduct = str(input.adduct())
1294
3654
  window_size_centroiding_tmp = strip_numeric(input.window_size_centroiding())
1295
3655
  window_size_matching_tmp = strip_numeric(input.window_size_matching())
1296
3656
  grid = {
@@ -1310,7 +3670,7 @@ def server(input, output, session):
1310
3670
  'window_size_centroiding': window_size_centroiding_tmp,
1311
3671
  'window_size_matching': window_size_matching_tmp,
1312
3672
  }
1313
- df_out = await asyncio.to_thread(_run_with_redirects, tune_params_on_HRMS_data_grid_shiny, rw, **common_kwargs, grid=grid)
3673
+ df_out = await asyncio.to_thread(_run_with_redirects, tune_params_on_HRMS_data_grid_shiny, rw, **common_kwargs, grid=grid, precursor_ion_mz_tolerance=precursor_ion_mz_tolerance, ionization_mode=ionization_mode, adduct=adduct)
1314
3674
  else:
1315
3675
  grid = {
1316
3676
  'similarity_measure': similarity_measure_tmp,
@@ -1338,7 +3698,7 @@ def server(input, output, session):
1338
3698
  is_any_job_running.set(False)
1339
3699
  await reactive.flush()
1340
3700
 
1341
- yield df_out.to_csv(index=False).encode("utf-8", sep='\t')
3701
+ yield df_out.to_csv(index=False, sep='\t').encode("utf-8")
1342
3702
 
1343
3703
 
1344
3704
 
@@ -1350,7 +3710,6 @@ def server(input, output, session):
1350
3710
  is_tuning_DE_running.set(True)
1351
3711
  await reactive.flush()
1352
3712
 
1353
- # --- helpers ---
1354
3713
  def _safe_float(v, default):
1355
3714
  try:
1356
3715
  if v is None:
@@ -1360,7 +3719,6 @@ def server(input, output, session):
1360
3719
  return default
1361
3720
 
1362
3721
  def _iget(id, default=None):
1363
- # Safe getter for Shiny inputs (avoids SilentException)
1364
3722
  if id in input:
1365
3723
  try:
1366
3724
  return input[id]()
@@ -1368,7 +3726,6 @@ def server(input, output, session):
1368
3726
  return default
1369
3727
  return default
1370
3728
 
1371
- # ---- log plumbing (stdout/stderr -> UI) ----
1372
3729
  loop = asyncio.get_running_loop()
1373
3730
  q: asyncio.Queue[str | None] = asyncio.Queue()
1374
3731
 
@@ -1390,7 +3747,6 @@ def server(input, output, session):
1390
3747
  drain_task = asyncio.create_task(_drain())
1391
3748
  writer = UIWriter()
1392
3749
 
1393
- # ---------- SNAPSHOT INPUTS SAFELY ----------
1394
3750
  try:
1395
3751
  qfile = _iget("query_data")[0]["datapath"]
1396
3752
  rfile = _iget("reference_data")[0]["datapath"]
@@ -1410,17 +3766,13 @@ def server(input, output, session):
1410
3766
  int_min = _safe_float(_iget("int_min", 0.0), 0.0)
1411
3767
  int_max = _safe_float(_iget("int_max", 999_999_999.0), 999_999_999.0)
1412
3768
 
1413
- # weights "a,b,c,d"
1414
3769
  w_text = _iget("weights", "") or ""
1415
3770
  w_list = [float(w.strip()) for w in w_text.split(",") if w.strip()]
1416
3771
  w_list = (w_list + [0.0, 0.0, 0.0, 0.0])[:4]
1417
3772
  weights = {"Cosine": w_list[0], "Shannon": w_list[1], "Renyi": w_list[2], "Tsallis": w_list[3]}
1418
3773
 
1419
- # selected params + bounds
1420
3774
  opt_params = tuple(_iget("params", ()) or ())
1421
3775
  bounds_dict = {}
1422
- # populate bounds using the min_/max_ inputs if present, otherwise fall back
1423
- # to your default PARAMS dicts already defined in your file
1424
3776
  param_defaults = PARAMS_HRMS if platform == "HRMS" else PARAMS_NRMS
1425
3777
  for p in opt_params:
1426
3778
  lo = _safe_float(_iget(f"min_{p}", param_defaults.get(p, (0.0, 1.0))[0]),
@@ -1431,7 +3783,6 @@ def server(input, output, session):
1431
3783
  lo, hi = hi, lo
1432
3784
  bounds_dict[p] = (lo, hi)
1433
3785
 
1434
- # defaults (guarded!)
1435
3786
  defaults = {
1436
3787
  "window_size_centroiding": _safe_float(_iget("window_size_centroiding", 0.5), 0.5),
1437
3788
  "window_size_matching": _safe_float(_iget("window_size_matching", 0.5), 0.5),
@@ -1454,11 +3805,13 @@ def server(input, output, session):
1454
3805
  return
1455
3806
 
1456
3807
  def _run():
1457
- from contextlib import redirect_stdout, redirect_stderr
1458
3808
  with redirect_stdout(writer), redirect_stderr(writer):
1459
3809
  return tune_params_DE(
1460
3810
  query_data=qfile,
1461
3811
  reference_data=rfile,
3812
+ precursor_ion_mz_tolerance=float(input.precursor_ion_mz_tolerance()),
3813
+ ionization_mode=input.ionization_mode(),
3814
+ adduct=input.adduct(),
1462
3815
  chromatography_platform=input.chromatography_platform(),
1463
3816
  similarity_measure=sim,
1464
3817
  weights=weights,
@@ -1516,4 +3869,3 @@ def server(input, output, session):
1516
3869
  app = App(app_ui, server)
1517
3870
 
1518
3871
 
1519
-