pycompound 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,636 @@
1
+
2
+ # this script's functions plot a given query spectrum against a given reference spectrum before and after spectrum preprocessing transformations
3
+
4
+ from .processing import *
5
+ from .similarity_measures import *
6
+ import pandas as pd
7
+ from pathlib import Path
8
+ import sys
9
+ import matplotlib.pyplot as plt
10
+
11
+
12
def _exit_with_error(message):
    '''print an error message and terminate via sys.exit() (raises SystemExit)'''
    print(message)
    sys.exit()


def _load_spectral_library(data_path, label, is_reference):
    '''
    load a query or reference library into a pandas DataFrame

    --data_path: path (string) to an mgf, mzML, cdf, or csv file, or None.
    --label: 'query' or 'reference'; only used to word the error messages.
    --is_reference: forwarded to build_library_from_raw_data when a raw file has to be converted to csv first.

    Prints an error and exits (SystemExit) if data_path is None or has an unsupported extension.
    '''
    if data_path is None:
        _exit_with_error(f'\nError: No argument passed to the mandatory {label}_data. Please pass the path to the CSV file of the {label} data.')

    stem, dot, extension = data_path.rpartition('.')
    extension = extension.lower() if dot else ''
    if extension in ('mgf', 'mzml', 'cdf'):
        # replace the whole extension; chopping a fixed 3 characters turned 'x.mzML' into 'x.mcsv'
        csv_path = f'{stem}.csv'
        build_library_from_raw_data(input_path=data_path, output_path=csv_path, is_reference=is_reference)
        return pd.read_csv(csv_path)
    if extension == 'csv':
        return pd.read_csv(data_path)

    # without this branch the caller would fail later with an unbound-variable error
    _exit_with_error(f'\nError: {label}_data must be an mgf, mzML, cdf, or csv file.')


def _extract_spectrum(df, spectrum_id):
    '''return the rows of df whose first column (as a string) equals spectrum_id, as a float array built from the second and third columns'''
    row_idxs = np.where(df.iloc[:, 0].astype(str) == spectrum_id)[0]
    # force a float dtype: with integer columns the array would be integer and any
    # scaled intensity written back into it would be silently truncated
    return np.asarray(df.iloc[row_idxs, [1, 2]], dtype=float)


def _apply_y_axis_transformation(spec, y_axis_transformation):
    '''
    return (copy of spec with its second column rescaled for plotting, y-axis label)

    --spec: 2-column array; the second column is rescaled, the first is left untouched.
    --y_axis_transformation: 'normalized', 'log10', 'sqrt', or anything else for no rescaling.
    '''
    scaled = np.array(spec, dtype=float)
    if y_axis_transformation == 'normalized':
        # skip the division for empty or all-zero spectra so no NaNs are plotted
        if scaled.shape[0] > 0 and np.max(scaled[:, 1]) > 0:
            scaled[:, 1] = scaled[:, 1] / np.max(scaled[:, 1])
        return scaled, 'Normalized Intensity'
    if y_axis_transformation == 'log10':
        # the +1 offset keeps zero intensities finite
        scaled[:, 1] = np.log10(scaled[:, 1] + 1)
        return scaled, 'log10(Intensity)'
    if y_axis_transformation == 'sqrt':
        scaled[:, 1] = np.sqrt(scaled[:, 1])
        return scaled, 'sqrt(Intensity)'
    return scaled, 'Raw Intensity'


def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_ID1=None, spectrum_ID2=None, similarity_measure='cosine', spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None):
    '''
    plots two spectra against each other before and after preprocessing transformations for high-resolution mass spectrometry data

    --query_data: path to an mgf, mzML, cdf, or csv file of query mass spectra. Raw files are first converted to a csv next to the input via build_library_from_raw_data. The csv is read with the first column as the spectrum identifier, the second column as mass/charge, and the third column as intensity; all rows sharing an identifier form one spectrum. Mandatory argument.
    --reference_data: path to an mgf, mzML, cdf, or csv file of reference mass spectra, read the same way as query_data. Mandatory argument.
    --spectrum_ID1: ID of one spectrum to be plotted. Default is the first spectrum in the query library. Optional argument.
    --spectrum_ID2: ID of another spectrum to be plotted. Default is the first spectrum in the reference library. Optional argument.
      Both IDs may come from the query library, both from the reference library, or one from each (in either order; they are swapped if needed so that spectrum 1 is the query spectrum).
    --similarity_measure: 'cosine', 'shannon', 'renyi', or 'tsallis'. Default: cosine.
    --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied before the similarity score is computed. A string of characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy transformation, and weight-factor transformation, respectively. For example, if 'WCM' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. 'M' must be present, and 'C' (if present) must come before 'M'. Default: FCNMWL
    --high_quality_reference_library: True/False flag. NOTE: in this function the flag is only reported in the annotation under the plots; every requested transformation is applied to both spectra regardless of its value. Default: False
    --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Must be a non-negative integer. Default: 0
    --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Must be a positive integer. Default: 9999999
    --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
    --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
    --window_size_centroiding: Window size parameter used in centroiding a given spectrum. Must be positive. Default: 0.5
    --window_size_matching: Window size parameter used in matching a query spectrum and a reference library spectrum. Must be positive. Default: 0.5
    --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
    --wf_mz: Mass/charge weight factor parameter. Default: 0.0
    --wf_intensity: Intensity weight factor parameter. Default: 1.0
    --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
    --entropy_dimension: Entropy dimension parameter, passed to the renyi and tsallis similarity measures and ignored for cosine and shannon. Must be positive. Default: 1.1
    --y_axis_transformation: transformation to apply to y-axis (i.e. intensity axis) of plots. Options: 'normalized', 'none', 'log10', and 'sqrt'. Default: normalized.
    --output_path: path to output PDF file containing the plots of the spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.pdf in the current working directory.

    Returns None; the figure is written to output_path as a PDF.
    Invalid arguments print an error message and terminate via sys.exit() (SystemExit).
    Raises ValueError if a requested spectrum ID is not present in the libraries.
    '''

    # load query and reference libraries
    df_query = _load_spectral_library(query_data, 'query', is_reference=False)
    df_reference = _load_spectral_library(reference_data, 'reference', is_reference=True)
    query_ids = set(df_query.iloc[:, 0].astype(str))
    reference_ids = set(df_reference.iloc[:, 0].astype(str))

    ##### process input parameters and ensure they are in a valid format #####
    if spectrum_ID1 is not None:
        spectrum_ID1 = str(spectrum_ID1)
    else:
        spectrum_ID1 = str(df_query.iloc[0, 0])
        print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')

    if spectrum_ID2 is not None:
        spectrum_ID2 = str(spectrum_ID2)
    else:
        spectrum_ID2 = str(df_reference.iloc[0, 0])
        print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')

    if spectrum_preprocessing_order is not None:
        spectrum_preprocessing_order = list(spectrum_preprocessing_order)
    else:
        spectrum_preprocessing_order = ['F', 'C', 'N', 'M', 'W', 'L']
    if 'M' not in spectrum_preprocessing_order:
        _exit_with_error('Error: \'M\' must be a character in spectrum_preprocessing_order.')
    if 'C' in spectrum_preprocessing_order and spectrum_preprocessing_order.index('C') > spectrum_preprocessing_order.index('M'):
        _exit_with_error('Error: \'C\' must come before \'M\' in spectrum_preprocessing_order.')
    if set(spectrum_preprocessing_order) - {'F', 'C', 'N', 'M', 'W', 'L'}:
        _exit_with_error('Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')

    if similarity_measure not in ['cosine', 'shannon', 'renyi', 'tsallis']:
        _exit_with_error('\nError: similarity_measure must be either \'cosine\', \'shannon\', \'renyi\', or \'tsallis\'')

    if isinstance(int_min, int):
        int_min = float(int_min)
    if isinstance(int_max, int):
        int_max = float(int_max)
    if not (isinstance(mz_min, int) and isinstance(mz_max, int) and isinstance(int_min, float) and isinstance(int_max, float)):
        _exit_with_error('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
    for is_invalid, message in (
        (mz_min < 0, '\nError: mz_min should be a non-negative integer'),
        (mz_max <= 0, '\nError: mz_max should be a positive integer'),
        (int_min < 0, '\nError: int_min should be a non-negative float'),
        (int_max <= 0, '\nError: int_max should be a positive float'),
    ):
        if is_invalid:
            _exit_with_error(message)

    # integers are accepted for the window sizes the same way they are for the other float parameters
    if isinstance(window_size_centroiding, int):
        window_size_centroiding = float(window_size_centroiding)
    if isinstance(window_size_matching, int):
        window_size_matching = float(window_size_matching)
    if not isinstance(window_size_centroiding, float) or window_size_centroiding <= 0.0:
        _exit_with_error('Error: window_size_centroiding must be a positive float.')
    if not isinstance(window_size_matching, float) or window_size_matching <= 0.0:
        _exit_with_error('Error: window_size_matching must be a positive float.')

    if isinstance(noise_threshold, int):
        noise_threshold = float(noise_threshold)
    if not isinstance(noise_threshold, float) or noise_threshold < 0:
        _exit_with_error('Error: noise_threshold must be a non-negative float.')

    if isinstance(wf_intensity, int):
        wf_intensity = float(wf_intensity)
    if isinstance(wf_mz, int):
        wf_mz = float(wf_mz)
    if not (isinstance(wf_intensity, float) and isinstance(wf_mz, float)):
        _exit_with_error('Error: wf_mz and wf_intensity must be integers or floats')

    if entropy_dimension <= 0:
        _exit_with_error('\nError: entropy_dimension should be a positive float')

    normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings

    if y_axis_transformation not in ['normalized', 'none', 'log10', 'sqrt']:
        _exit_with_error('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')

    if output_path is None:
        default_name = f'spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.pdf'
        # the message now names the file that is actually written
        print(f'Warning: plots will be saved to the PDF ./{default_name} in the current working directory.')
        output_path = f'{Path.cwd()}/{default_name}'

    # pick the two spectra: both from the query library, both from the reference library, or one from each
    if spectrum_ID1 in query_ids and spectrum_ID2 in query_ids:
        q_spec = _extract_spectrum(df_query, spectrum_ID1)
        r_spec = _extract_spectrum(df_query, spectrum_ID2)
    elif spectrum_ID1 in reference_ids and spectrum_ID2 in reference_ids:
        q_spec = _extract_spectrum(df_reference, spectrum_ID1)
        r_spec = _extract_spectrum(df_reference, spectrum_ID2)
    else:
        if spectrum_ID1 in reference_ids and spectrum_ID2 in query_ids:
            # IDs were passed in (reference, query) order; swap so that spectrum 1 is the query spectrum
            spectrum_ID1, spectrum_ID2 = spectrum_ID2, spectrum_ID1
        if spectrum_ID1 not in query_ids:
            raise ValueError(f'spectrum ID \'{spectrum_ID1}\' was not found in query_data (or, together with the other ID, in reference_data).')
        if spectrum_ID2 not in reference_ids:
            raise ValueError(f'spectrum ID \'{spectrum_ID2}\' was not found in reference_data (or, together with the other ID, in query_data).')
        q_spec = _extract_spectrum(df_query, spectrum_ID1)
        r_spec = _extract_spectrum(df_reference, spectrum_ID2)

    # create the figure
    fig, (ax_untransformed, ax_transformed) = plt.subplots(nrows=2, ncols=1)

    # plot the untransformed spectra (query pointing up, reference mirrored downwards)
    q_spec_pre_trans, ylab = _apply_y_axis_transformation(q_spec, y_axis_transformation)
    r_spec_pre_trans, _ = _apply_y_axis_transformation(r_spec, y_axis_transformation)
    ax_untransformed.vlines(x=q_spec_pre_trans[:, 0], ymin=0, ymax=q_spec_pre_trans[:, 1], linewidth=3, color='blue', label=f'Spectrum ID 1: {spectrum_ID1}')
    ax_untransformed.vlines(x=r_spec_pre_trans[:, 0], ymin=0, ymax=-r_spec_pre_trans[:, 1], linewidth=3, color='red', label=f'Spectrum ID 2: {spectrum_ID2}')
    ax_untransformed.set_xlabel('m/z', fontsize=7)
    ax_untransformed.set_ylabel(ylab, fontsize=7)
    ax_untransformed.tick_params(labelsize=7)
    ax_untransformed.set_title('Untransformed Spectra', fontsize=10)

    # get the ranges of m/z and intensity values to display at the bottom of the two plots
    mz_min_tmp = min(round(q_spec[:, 0].min(), 1), round(r_spec[:, 0].min(), 1))
    mz_max_tmp = max(round(q_spec[:, 0].max(), 1), round(r_spec[:, 0].max(), 1))  # upper bound of the range: max, not min
    int_min_tmp = min(round(q_spec[:, 1].min(), 1), round(r_spec[:, 1].min(), 1))
    int_max_tmp = max(round(q_spec[:, 1].max(), 1), round(r_spec[:, 1].max(), 1))

    # perform the spectrum preprocessing transformations in the order specified
    is_matched = False
    for transformation in spectrum_preprocessing_order:
        # transformations only run while both spectra have more than one peak (row);
        # once that fails nothing later can restore it, so stop
        if q_spec.shape[0] <= 1 or r_spec.shape[0] <= 1:
            break
        if transformation == 'C': # centroiding
            q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
            r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
        elif transformation == 'M': # matching
            m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
            q_spec = m_spec[:, 0:2]
            r_spec = m_spec[:, [0, 2]]
            is_matched = True
        elif transformation == 'W': # weight factor transformation
            q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
            r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)
        elif transformation == 'L': # low-entropy transformation
            q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
            r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)
        elif transformation == 'N': # noise removal
            q_spec = remove_noise(q_spec, nr=noise_threshold)
            r_spec = remove_noise(r_spec, nr=noise_threshold)
        elif transformation == 'F': # filtering
            q_spec = filter_spec_lcms(q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched)
            r_spec = filter_spec_lcms(r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched)

    both_have_peaks = q_spec.shape[0] > 1 and r_spec.shape[0] > 1

    # intensities of query and reference library
    q_ints = q_spec[:, 1]
    r_ints = r_spec[:, 1]

    # compute the similarity only if both spectra survived preprocessing with non-zero total intensity
    similarity_score = 0
    if both_have_peaks and np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
        if similarity_measure == 'cosine':
            similarity_score = S_cos(q_ints, r_ints)
        else:
            q_ints = normalize(q_ints, method=normalization_method)
            r_ints = normalize(r_ints, method=normalization_method)
            if similarity_measure == 'shannon':
                similarity_score = S_shannon(q_ints, r_ints)
            elif similarity_measure == 'renyi':
                similarity_score = S_renyi(q_ints, r_ints, entropy_dimension)
            elif similarity_measure == 'tsallis':
                similarity_score = S_tsallis(q_ints, r_ints, entropy_dimension)

    # plot the transformed spectra, or an explanatory message if there is nothing meaningful to draw
    if both_have_peaks:
        if np.max(q_spec[:, 1]) == 0 or np.max(r_spec[:, 1]) == 0:
            ax_transformed.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
            ax_transformed.set_xticks([])
            ax_transformed.set_yticks([])
        else:
            q_spec_post_trans, ylab = _apply_y_axis_transformation(q_spec, y_axis_transformation)
            r_spec_post_trans, _ = _apply_y_axis_transformation(r_spec, y_axis_transformation)
            ax_transformed.vlines(x=q_spec_post_trans[:, 0], ymin=0, ymax=q_spec_post_trans[:, 1], linewidth=3, color='blue')
            ax_transformed.vlines(x=r_spec_post_trans[:, 0], ymin=0, ymax=-r_spec_post_trans[:, 1], linewidth=3, color='red')
            ax_transformed.set_xlabel('m/z', fontsize=7)
            ax_transformed.set_ylabel(ylab, fontsize=7)
            ax_transformed.tick_params(labelsize=7)
            ax_transformed.set_title('Transformed Spectra', fontsize=10)
    else:
        ax_transformed.text(0.5, 0.5, 'All points in the spectra were removed during preprocessing. \nChange the spectrum_preprocessing_order and/or change other spectrum-preprocessing parameters.', ha='center', va='center', fontsize=7, color='black')
        ax_transformed.set_xticks([])
        ax_transformed.set_yticks([])

    fig.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
    fig.legend(loc='upper center')

    # parameter summary printed underneath the plots, in two columns
    # (the join is done outside the f-string: reusing the same quote inside an f-string needs Python 3.12+)
    preprocessing_order_label = ''.join(spectrum_preprocessing_order)
    annotations = (
        (0.05, 0.18, f'Similarity Measure: {similarity_measure.capitalize()}'),
        (0.05, 0.15, f'Similarity Score: {round(similarity_score,4)}'),
        (0.05, 0.12, f'Spectrum Preprocessing Order: {preprocessing_order_label}'),
        (0.05, 0.09, f'High Quality Reference Library: {high_quality_reference_library}'),
        (0.05, 0.06, f'Window Size (Centroiding): {window_size_centroiding}'),
        (0.05, 0.03, f'Window Size (Matching): {window_size_matching}'),
        (0.45, 0.18, f'Raw-Scale M/Z Range: [{mz_min_tmp},{mz_max_tmp}]'),
        (0.45, 0.15, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]'),
        (0.45, 0.12, f'Noise Threshold: {noise_threshold}'),
        (0.45, 0.09, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})'),
        (0.45, 0.06, f'Low-Entropy Threshold: {LET_threshold}'),
    )
    for x_pos, y_pos, text in annotations:
        fig.text(x_pos, y_pos, text, fontsize=7)

    fig.savefig(output_path, format='pdf')
335
+
336
+
337
+
338
+
339
+ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_ID1=None, spectrum_ID2=None, similarity_measure='cosine', spectrum_preprocessing_order='FNLW', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, y_axis_transformation='normalized', output_path=None):
340
+ '''
341
+ plots two spectra against each other before and after preprocessing transformations for high-resolution mass spectrometry data
342
+
343
+ --query_data: cdf or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
344
+ --reference_data: cdf of csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
345
+ --similarity_measure: \'cosine\', \'shannon\', \'renyi\', and \'tsallis\'. Default: cosine.
346
+ --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-4 characters chosen from F, N, L, W representing filtering based on mass/charge and intensity values, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WN\' is passed, then each spectrum will undergo a weight factor transformation and then noise removal. Default: FNLW')
347
+ --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
348
+ --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
349
+ --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
350
+ --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
351
+ --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
352
+ --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
353
+ --wf_mz: Mass/charge weight factor parameter. Default: 0.0
354
+ --wf_intensity: Intensity weight factor parameter. Default: 0.0
355
+ --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
356
+ --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
357
+ --y_axis_transformation: transformation to apply to y-axis (i.e. intensity axis) of plots. Options: \'normalized\', \'none\', \'log10\', and \'sqrt\'. Default: normalized.')
358
+ --output_path: path to output PDF file containing the plots of the spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.
359
+ '''
360
+
361
+ # load query and reference libraries
362
+ if query_data is None:
363
+ print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
364
+ sys.exit()
365
+ else:
366
+ extension = query_data.rsplit('.',1)
367
+ extension = extension[(len(extension)-1)]
368
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
369
+ output_path_tmp = query_data[:-3] + 'csv'
370
+ build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
371
+ df_query = pd.read_csv(output_path_tmp)
372
+ if extension == 'csv' or extension == 'CSV':
373
+ df_query = pd.read_csv(query_data)
374
+ unique_query_ids = df_query.iloc[:,0].unique()
375
+
376
+ if reference_data is None:
377
+ print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
378
+ sys.exit()
379
+ else:
380
+ extension = reference_data.rsplit('.',1)
381
+ extension = extension[(len(extension)-1)]
382
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
383
+ output_path_tmp = reference_data[:-3] + 'csv'
384
+ build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
385
+ df_reference = pd.read_csv(output_path_tmp)
386
+ if extension == 'csv' or extension == 'CSV':
387
+ df_reference = pd.read_csv(reference_data)
388
+ unique_reference_ids = df_reference.iloc[:,0].unique()
389
+
390
+
391
+ ##### process input parameters and ensure they are in a valid format #####
392
+ if spectrum_ID1 is not None:
393
+ spectrum_ID1 = str(spectrum_ID1)
394
+ else:
395
+ spectrum_ID1 = str(df_query.iloc[0,0])
396
+ print('No argument passed to spectrum_ID1; using the first spectrum in query_data.')
397
+
398
+ if spectrum_ID2 is not None:
399
+ spectrum_ID2 = str(spectrum_ID2)
400
+ else:
401
+ spectrum_ID2 = str(df_reference.iloc[0,0])
402
+ print('No argument passed to spectrum_ID2; using the first spectrum in reference_data.')
403
+
404
+ if spectrum_preprocessing_order is not None:
405
+ spectrum_preprocessing_order = list(spectrum_preprocessing_order)
406
+ else:
407
+ spectrum_preprocessing_order = ['F','N','W','L']
408
+ if set(spectrum_preprocessing_order) - {'F','N','W','L'}:
409
+ print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
410
+ sys.exit()
411
+
412
+ if similarity_measure not in ['cosine','shannon','renyi','tsallis']:
413
+ print('\nError: similarity_measure must be either \'cosine\', \'shannon\', \'renyi\', or \'tsallis\'')
414
+ sys.exit()
415
+
416
+ if isinstance(int_min,int) is True:
417
+ int_min = float(int_min)
418
+ if isinstance(int_max,int) is True:
419
+ int_max = float(int_max)
420
+ if isinstance(mz_min,int) is False or isinstance(mz_max,int) is False or isinstance(int_min,float) is False or isinstance(int_max,float) is False:
421
+ print('Error: mz_min must be a non-negative integer, mz_max must be a positive integer, int_min must be a non-negative float, and int_max must be a positive float')
422
+ sys.exit()
423
+ if mz_min < 0:
424
+ print('\nError: mz_min should be a non-negative integer')
425
+ sys.exit()
426
+ if mz_max <= 0:
427
+ print('\nError: mz_max should be a positive integer')
428
+ sys.exit()
429
+ if int_min < 0:
430
+ print('\nError: int_min should be a non-negative float')
431
+ sys.exit()
432
+ if int_max <= 0:
433
+ print('\nError: int_max should be a positive float')
434
+ sys.exit()
435
+
436
+ if isinstance(noise_threshold,int) is True:
437
+ noise_threshold = float(noise_threshold)
438
+ if isinstance(noise_threshold,float) is False or noise_threshold < 0:
439
+ print('Error: noise_threshold must be a positive float.')
440
+ sys.exit()
441
+
442
+ if isinstance(wf_intensity,int) is True:
443
+ wf_intensity = float(wf_intensity)
444
+ if isinstance(wf_mz,int) is True:
445
+ wf_mz = float(wf_mz)
446
+ if isinstance(wf_intensity,float) is False or isinstance(wf_mz,float) is False:
447
+ print('Error: wf_mz and wf_intensity must be integers or floats')
448
+ sys.exit()
449
+
450
+ if entropy_dimension <= 0:
451
+ print('\nError: entropy_dimension should be a positive float')
452
+ sys.exit()
453
+ else:
454
+ q = entropy_dimension
455
+
456
+ normalization_method = 'standard' #consider including additional normalization methods to transform intensities into a probability distribution; softmax results in many numerical errors/warnings
457
+
458
+ if y_axis_transformation not in ['normalized','none','log10','sqrt']:
459
+ print('Error: y_axis_transformation must be either \'normalized\', \'none\', \'log10\', or \'sqrt\'.')
460
+ sys.exit()
461
+
462
+ if output_path is None:
463
+ print(f'Warning: plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.')
464
+ output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.pdf'
465
+
466
+ # get m/z values
467
+ min_mz = np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])])
468
+ max_mz = np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])])
469
+ mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
470
+
471
+ # get unique query/reference library IDs; each query/reference ID corresponds to exactly one query/reference mass spectrum
472
+ unique_query_ids = df_query.iloc[:,0].unique().tolist()
473
+ unique_reference_ids = df_reference.iloc[:,0].unique().tolist()
474
+ unique_query_ids = [str(ID) for ID in unique_query_ids]
475
+ unique_reference_ids = [str(ID) for ID in unique_reference_ids]
476
+ common_IDs = np.intersect1d([str(ID) for ID in unique_query_ids], [str(ID) for ID in unique_reference_ids])
477
+ if len(common_IDs) > 0:
478
+ print(f'Warning: the query and reference library have overlapping IDs: {common_IDs}')
479
+
480
+ if spectrum_ID1 in unique_query_ids and spectrum_ID2 in unique_query_ids:
481
+ q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID1)[0]
482
+ r_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID2)[0]
483
+ q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
484
+ r_spec = np.asarray(pd.concat([df_query.iloc[r_idxs_tmp,1], df_query.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
485
+ elif spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_reference_ids:
486
+ q_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID1)[0]
487
+ r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID2)[0]
488
+ q_spec = np.asarray(pd.concat([df_reference.iloc[q_idxs_tmp,1], df_reference.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
489
+ r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
490
+ else:
491
+ if spectrum_ID1 in unique_reference_ids and spectrum_ID2 in unique_query_ids:
492
+ spec_tmp = spectrum_ID1
493
+ spectrum_ID1 = spectrum_ID2
494
+ spectrum_ID2 = spec_tmp
495
+ q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == spectrum_ID1)[0]
496
+ r_idxs_tmp = np.where(df_reference.iloc[:,0].astype(str) == spectrum_ID2)[0]
497
+ q_spec = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
498
+ r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
499
+
500
+ q_spec = convert_spec(q_spec,mzs)
501
+ r_spec = convert_spec(r_spec,mzs)
502
+
503
+ # get the ranges of m/z and intensity values to display at the bottom of the two plots
504
+ int_min_tmp_q = min(q_spec[q_spec[:,1].nonzero(),1][0])
505
+ int_min_tmp_r = min(r_spec[r_spec[:,1].nonzero(),1][0])
506
+ int_max_tmp_q = max(q_spec[q_spec[:,1].nonzero(),1][0])
507
+ int_max_tmp_r = max(r_spec[r_spec[:,1].nonzero(),1][0])
508
+ int_min_tmp = int(min([int_min_tmp_q,int_min_tmp_r]))
509
+ int_max_tmp = int(max([int_max_tmp_q,int_max_tmp_r]))
510
+
511
+ # create the figure
512
+ fig, axes = plt.subplots(nrows=2, ncols=1)
513
+
514
+ # plot the untransformed spectra
515
+ plt.subplot(2,1,1)
516
+
517
+ # display warning message if either spectra have no non-zero ion fragments
518
+ if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
519
+ plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
520
+ plt.xticks([])
521
+ plt.yticks([])
522
+ else:
523
+ q_spec_pre_trans = q_spec.copy()
524
+ r_spec_pre_trans = r_spec.copy()
525
+ q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
526
+ r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
527
+
528
+ # apply transformation to y-axis if relevant
529
+ if y_axis_transformation == 'normalized':
530
+ q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
531
+ r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
532
+ ylab = 'Normalized Intensity'
533
+ elif y_axis_transformation == 'log10':
534
+ q_spec_pre_trans[:,1] = np.log10(q_spec_pre_trans[:,1]+1)
535
+ r_spec_pre_trans[:,1] = np.log10(r_spec_pre_trans[:,1]+1)
536
+ ylab = 'log10(Intensity)'
537
+ elif y_axis_transformation == 'sqrt':
538
+ q_spec_pre_trans[:,1] = np.sqrt(q_spec_pre_trans[:,1])
539
+ r_spec_pre_trans[:,1] = np.sqrt(r_spec_pre_trans[:,1])
540
+ ylab = 'sqrt(Intensity)'
541
+ else:
542
+ ylab = 'Raw Intensity'
543
+ plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*len(q_spec_pre_trans[:,0]), ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID1: {spectrum_ID1}')
544
+ plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*len(r_spec_pre_trans[:,0]), ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID2: {spectrum_ID2}')
545
+ plt.xlabel('m/z',fontsize=7)
546
+ plt.ylabel(ylab, fontsize=7)
547
+ plt.xticks(fontsize=7)
548
+ plt.yticks(fontsize=7)
549
+ plt.title('Untransformed Query and Reference Spectra', fontsize=10)
550
+
551
+ for transformation in spectrum_preprocessing_order:
552
+ if transformation == 'W': # weight factor transformation
553
+ q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
554
+ r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
555
+ if transformation == 'L': # low-entropy transformation
556
+ q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method)
557
+ r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method)
558
+ if transformation == 'N': # noise removal
559
+ q_spec = remove_noise(q_spec, nr = noise_threshold)
560
+ if high_quality_reference_library == False:
561
+ r_spec = remove_noise(r_spec, nr = noise_threshold)
562
+ if transformation == 'F': # filtering with respect to mz and/or intensity
563
+ q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
564
+ if high_quality_reference_library == False:
565
+ r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
566
+
567
+ # compute similarity score; if the spectra contain one point at most, their similarity is considered to be 0
568
+ if q_spec.shape[0] > 1:
569
+ if similarity_measure == 'cosine':
570
+ similarity_score = S_cos(q_spec[:,1], r_spec[:,1])
571
+ else:
572
+ q_spec[:,1] = normalize(q_spec[:,1], method = normalization_method)
573
+ r_spec[:,1] = normalize(r_spec[:,1], method = normalization_method)
574
+
575
+ if similarity_measure == 'shannon':
576
+ similarity_score = S_shannon(q_spec[:,1].astype('float'), r_spec[:,1].astype('float'))
577
+ elif similarity_measure == 'renyi':
578
+ similarity_score = S_renyi(q_spec[:,1], r_spec[:,1], q)
579
+ elif similarity_measure == 'tsallis':
580
+ similarity_score = S_tsallis(q_spec[:,1], r_spec[:,1], q)
581
+ else:
582
+ similarity_score = 0
583
+
584
+
585
+ # plot the transformed spectra
586
+ plt.subplot(2,1,2)
587
+
588
+ # display warning message if either spectra are empty or have no non-zero intensity ion fragments
589
+ if q_spec.shape[0] == 0 or r_spec.shape[0] == 0:
590
+ plt.text(0.5, 0.5, 'The query and/or reference spectrum has no ion fragments left after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
591
+ plt.xticks([])
592
+ plt.yticks([])
593
+ elif np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
594
+ plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
595
+ plt.xticks([])
596
+ plt.yticks([])
597
+ else:
598
+ # apply transformation to y-axis if relevant
599
+ if y_axis_transformation == 'normalized':
600
+ q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
601
+ r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
602
+ ylab='Normalized Intensity'
603
+ elif y_axis_transformation == 'log10':
604
+ q_spec[:,1] = np.log10(q_spec[:,1]+1)
605
+ r_spec[:,1] = np.log10(r_spec[:,1]+1)
606
+ ylab='log10(Intensity)'
607
+ elif y_axis_transformation == 'sqrt':
608
+ q_spec[:,1] = np.sqrt(q_spec[:,1])
609
+ r_spec[:,1] = np.sqrt(r_spec[:,1])
610
+ ylab='sqrt(Intensity)'
611
+ else:
612
+ ylab = 'Raw Intensity'
613
+ plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=q_spec[:,1], linewidth=3, color='blue')
614
+ plt.vlines(x=mzs, ymin=[0]*len(mzs), ymax=-r_spec[:,1], linewidth=3, color='red')
615
+ plt.xlabel('m/z', fontsize=7)
616
+ plt.ylabel(ylab, fontsize=7)
617
+ plt.xticks(fontsize=7)
618
+ plt.yticks(fontsize=7)
619
+ plt.title(f'Transformed Query and Reference Spectra', fontsize=10)
620
+
621
+
622
+ #plt.subplots_adjust(top = 0.8, hspace = 0.7)
623
+ plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
624
+ plt.figlegend(loc = 'upper center')
625
+ fig.text(0.05, 0.15, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
626
+ fig.text(0.05, 0.12, f'Similarity Score: {round(similarity_score,4)}', fontsize=7)
627
+ fig.text(0.05, 0.09, f'Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}', fontsize=7)
628
+ fig.text(0.05, 0.06, f'High Quality Reference Library: {high_quality_reference_library}', fontsize=7)
629
+ fig.text(0.05, 0.03, f'Raw-Scale M/Z Range: [{min_mz},{max_mz}]', fontsize=7)
630
+ fig.text(0.45, 0.15, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
631
+ fig.text(0.45, 0.12, f'Noise Threshold: {noise_threshold}', fontsize=7)
632
+ fig.text(0.45, 0.09, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
633
+ fig.text(0.45, 0.06, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
634
+ plt.savefig(output_path, format='pdf')
635
+
636
+