pycompound 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +164 -196
- pycompound/build_library.py +2 -9
- pycompound/plot_spectra.py +25 -48
- pycompound/processing.py +0 -9
- pycompound/similarity_measures.py +0 -3
- pycompound/spec_lib_matching.py +246 -81
- pycompound/spec_lib_matching_CLI.py +2 -7
- pycompound/tuning_CLI.py +1 -1
- {pycompound-0.1.1.dist-info → pycompound-0.1.3.dist-info}/METADATA +1 -1
- pycompound-0.1.3.dist-info/RECORD +14 -0
- pycompound-0.1.1.dist-info/RECORD +0 -14
- {pycompound-0.1.1.dist-info → pycompound-0.1.3.dist-info}/WHEEL +0 -0
- {pycompound-0.1.1.dist-info → pycompound-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {pycompound-0.1.1.dist-info → pycompound-0.1.3.dist-info}/top_level.txt +0 -0
pycompound/build_library.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
|
|
2
|
-
# this script has a function to extract the mass spectra from an mgf, mzML, or cdf file and write them in the necessary format for use in spectral library matching
|
|
3
|
-
|
|
4
2
|
import netCDF4 as nc
|
|
5
3
|
import numpy as np
|
|
6
4
|
import pandas as pd
|
|
@@ -14,7 +12,7 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
14
12
|
Converts mgf, mzML, cdf, or msp file to the necessary format for spectral library matching.
|
|
15
13
|
|
|
16
14
|
--input_path: Path to input file (must be mgf, mzML, cdf, or msp file). Mandatory argument.
|
|
17
|
-
--output_path: Path to output
|
|
15
|
+
--output_path: Path to output TXT file. Default: current working directory.
|
|
18
16
|
--is_reference: Boolean flag indicating whether IDs of spectra should be written to output. Only pass true if building a reference library with known compound IDs. Only applicable to mgf and msp files. Options: \'True\', \'False\'. Optional argument. Default: False.
|
|
19
17
|
'''
|
|
20
18
|
|
|
@@ -23,7 +21,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
23
21
|
sys.exit()
|
|
24
22
|
|
|
25
23
|
if output_path is None:
|
|
26
|
-
#print('Warning: no output_path specified, so library is written to {Path.cwd()}/build_library.csv')
|
|
27
24
|
tmp = input_path.split('/')
|
|
28
25
|
tmp = tmp[(len(tmp)-1)]
|
|
29
26
|
basename = tmp.split('.')[0]
|
|
@@ -34,7 +31,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
34
31
|
print('Error: is_reference must be either \'True\' or \'False\'.')
|
|
35
32
|
sys.exit()
|
|
36
33
|
|
|
37
|
-
# determine whether an mgf or a mzML file was passed to --input_path
|
|
38
34
|
last_three_chars = input_path[(len(input_path)-3):len(input_path)]
|
|
39
35
|
last_four_chars = input_path[(len(input_path)-4):len(input_path)]
|
|
40
36
|
if last_three_chars == 'mgf' or last_three_chars == 'MGF':
|
|
@@ -50,7 +46,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
50
46
|
sys.exit()
|
|
51
47
|
|
|
52
48
|
|
|
53
|
-
# obtain a list of spectra from the input file
|
|
54
49
|
spectra = []
|
|
55
50
|
if input_file_type == 'mgf':
|
|
56
51
|
with mgf.read(input_path, index_by_scans = True) as reader:
|
|
@@ -62,7 +57,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
62
57
|
spectra.append(spec)
|
|
63
58
|
|
|
64
59
|
|
|
65
|
-
# extract the relevant information from each spectra (i.e m/z ratios and intensities)
|
|
66
60
|
if input_file_type == 'mgf' or input_file_type == 'mzML':
|
|
67
61
|
ids = []
|
|
68
62
|
mzs = []
|
|
@@ -128,8 +122,7 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
128
122
|
continue
|
|
129
123
|
|
|
130
124
|
|
|
131
|
-
# write CSV file of spectra for use in spectral library matching
|
|
132
125
|
df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
|
|
133
|
-
df.to_csv(output_path, index=False)
|
|
126
|
+
df.to_csv(output_path, index=False, sep='\t')
|
|
134
127
|
|
|
135
128
|
|
pycompound/plot_spectra.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
|
|
2
|
-
# this script's functions plot a given query spectrum against a given reference spectrum before and after spectrum preprocessing transformations
|
|
3
|
-
|
|
4
2
|
from .processing import *
|
|
5
3
|
from .similarity_measures import *
|
|
6
4
|
import pandas as pd
|
|
@@ -36,7 +34,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
36
34
|
--output_path: path to output PDF file containing the plots of the spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.
|
|
37
35
|
'''
|
|
38
36
|
|
|
39
|
-
# load query and reference libraries
|
|
40
37
|
if query_data is None:
|
|
41
38
|
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
|
|
42
39
|
sys.exit()
|
|
@@ -68,7 +65,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
68
65
|
unique_reference_ids = [str(tmp) for tmp in unique_reference_ids]
|
|
69
66
|
|
|
70
67
|
|
|
71
|
-
##### process input parameters and ensure they are in a valid format #####
|
|
72
68
|
if spectrum_ID1 is not None:
|
|
73
69
|
spectrum_ID1 = str(spectrum_ID1)
|
|
74
70
|
else:
|
|
@@ -96,8 +92,8 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
96
92
|
print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
|
|
97
93
|
sys.exit()
|
|
98
94
|
|
|
99
|
-
if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','
|
|
100
|
-
print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski,
|
|
95
|
+
if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
|
|
96
|
+
print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
|
|
101
97
|
sys.exit()
|
|
102
98
|
|
|
103
99
|
if isinstance(int_min,int) is True:
|
|
@@ -190,7 +186,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
190
186
|
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
|
|
191
187
|
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
|
|
192
188
|
|
|
193
|
-
# apply transformation to y-axis if relevant
|
|
194
189
|
if y_axis_transformation == 'normalized':
|
|
195
190
|
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
|
|
196
191
|
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
|
|
@@ -206,10 +201,8 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
206
201
|
else:
|
|
207
202
|
ylab = 'Raw Intensity'
|
|
208
203
|
|
|
209
|
-
# create the figure
|
|
210
204
|
fig, axes = plt.subplots(nrows=2, ncols=1)
|
|
211
205
|
|
|
212
|
-
# plot the untransformed spectra
|
|
213
206
|
plt.subplot(2,1,1)
|
|
214
207
|
plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*q_spec_pre_trans.shape[0], ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID 1: {spectrum_ID1}')
|
|
215
208
|
plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*r_spec_pre_trans.shape[0], ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID 2: {spectrum_ID2}')
|
|
@@ -219,7 +212,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
219
212
|
plt.yticks(fontsize=7)
|
|
220
213
|
plt.title('Untransformed Spectra', fontsize=10)
|
|
221
214
|
|
|
222
|
-
# get the ranges of m/z and intensity values to display at the bottom of the two plots
|
|
223
215
|
mz_min_tmp_q = round(q_spec[:,0].min(),1)
|
|
224
216
|
mz_min_tmp_r = round(r_spec[:,0].min(),1)
|
|
225
217
|
int_min_tmp_q = round(q_spec[:,1].min(),1)
|
|
@@ -233,51 +225,47 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
233
225
|
int_min_tmp = min([int_min_tmp_q,int_min_tmp_r])
|
|
234
226
|
int_max_tmp = max([int_max_tmp_q,int_max_tmp_r])
|
|
235
227
|
|
|
236
|
-
# perform the spectrum preprocessing transformations in the order specified
|
|
237
228
|
is_matched = False
|
|
238
229
|
for transformation in spectrum_preprocessing_order:
|
|
239
|
-
if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
230
|
+
if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
240
231
|
q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
|
|
241
232
|
r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
|
|
242
|
-
if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
233
|
+
if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
243
234
|
m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
|
|
244
235
|
q_spec = m_spec[:,0:2]
|
|
245
236
|
r_spec = m_spec[:,[0,2]]
|
|
246
237
|
is_matched = True
|
|
247
|
-
if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
238
|
+
if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
248
239
|
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
249
240
|
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
250
|
-
if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
241
|
+
if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
251
242
|
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
252
243
|
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
253
|
-
if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
244
|
+
if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
254
245
|
q_spec = remove_noise(q_spec, nr = noise_threshold)
|
|
255
|
-
|
|
256
|
-
|
|
246
|
+
if high_quality_reference_library == False or high_quality_reference_library == 'False':
|
|
247
|
+
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
248
|
+
if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
257
249
|
q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
|
|
258
|
-
|
|
250
|
+
if high_quality_reference_library == False or high_quality_reference_library == 'False':
|
|
251
|
+
r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
|
|
259
252
|
|
|
260
|
-
# intensities of query and reference library
|
|
261
253
|
q_ints = q_spec[:,1]
|
|
262
254
|
r_ints = r_spec[:,1]
|
|
263
255
|
|
|
264
|
-
# if there is at least one non-zero intensity ion fragment in either spectra, compute their similarity
|
|
265
256
|
if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
266
257
|
similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
|
|
267
258
|
else:
|
|
268
259
|
similarity_score = 0
|
|
269
260
|
|
|
270
|
-
# plot the transformed spectra
|
|
271
261
|
plt.subplot(2,1,2)
|
|
272
262
|
|
|
273
|
-
# display warning message if either spectra are empty or have no non-zero intensity ion fragments
|
|
274
263
|
if q_spec.shape[0] > 1:
|
|
275
264
|
if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
|
|
276
265
|
plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
277
266
|
plt.xticks([])
|
|
278
267
|
plt.yticks([])
|
|
279
268
|
else:
|
|
280
|
-
# apply transformation to y-axis if relevant
|
|
281
269
|
if y_axis_transformation == 'normalized':
|
|
282
270
|
q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
|
|
283
271
|
r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
|
|
@@ -305,12 +293,15 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
305
293
|
plt.yticks([])
|
|
306
294
|
|
|
307
295
|
|
|
296
|
+
print('\n\n\n')
|
|
297
|
+
print(high_quality_reference_library)
|
|
298
|
+
print('\n\n\n')
|
|
308
299
|
plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
|
|
309
300
|
plt.figlegend(loc = 'upper center')
|
|
310
301
|
fig.text(0.05, 0.18, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
|
|
311
302
|
fig.text(0.05, 0.15, f'Similarity Score: {round(similarity_score,4)}', fontsize=7)
|
|
312
303
|
fig.text(0.05, 0.12, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
|
|
313
|
-
fig.text(0.05, 0.09, f'High Quality Reference Library: {high_quality_reference_library}', fontsize=7)
|
|
304
|
+
fig.text(0.05, 0.09, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
|
|
314
305
|
fig.text(0.05, 0.06, f'Window Size (Centroiding): {window_size_centroiding}', fontsize=7)
|
|
315
306
|
fig.text(0.05, 0.03, f'Window Size (Matching): {window_size_matching}', fontsize=7)
|
|
316
307
|
fig.text(0.45, 0.18, f'Raw-Scale M/Z Range: [{mz_min_tmp},{mz_max_tmp}]', fontsize=7)
|
|
@@ -352,7 +343,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
352
343
|
--output_path: path to output PDF file containing the plots of the spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.
|
|
353
344
|
'''
|
|
354
345
|
|
|
355
|
-
# load query and reference libraries
|
|
356
346
|
if query_data is None:
|
|
357
347
|
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
|
|
358
348
|
sys.exit()
|
|
@@ -382,7 +372,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
382
372
|
unique_reference_ids = df_reference.iloc[:,0].unique()
|
|
383
373
|
|
|
384
374
|
|
|
385
|
-
##### process input parameters and ensure they are in a valid format #####
|
|
386
375
|
if spectrum_ID1 is not None:
|
|
387
376
|
spectrum_ID1 = str(spectrum_ID1)
|
|
388
377
|
else:
|
|
@@ -403,8 +392,8 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
403
392
|
print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
|
|
404
393
|
sys.exit()
|
|
405
394
|
|
|
406
|
-
if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','
|
|
407
|
-
print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski,
|
|
395
|
+
if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
|
|
396
|
+
print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
|
|
408
397
|
sys.exit()
|
|
409
398
|
|
|
410
399
|
if isinstance(int_min,int) is True:
|
|
@@ -457,12 +446,10 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
457
446
|
print(f'Warning: plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.')
|
|
458
447
|
output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.pdf'
|
|
459
448
|
|
|
460
|
-
# get m/z values
|
|
461
449
|
min_mz = np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])])
|
|
462
450
|
max_mz = np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])])
|
|
463
451
|
mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
|
|
464
452
|
|
|
465
|
-
# get unique query/reference library IDs; each query/reference ID corresponds to exactly one query/reference mass spectrum
|
|
466
453
|
unique_query_ids = df_query.iloc[:,0].unique().tolist()
|
|
467
454
|
unique_reference_ids = df_reference.iloc[:,0].unique().tolist()
|
|
468
455
|
unique_query_ids = [str(ID) for ID in unique_query_ids]
|
|
@@ -494,7 +481,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
494
481
|
q_spec = convert_spec(q_spec,mzs)
|
|
495
482
|
r_spec = convert_spec(r_spec,mzs)
|
|
496
483
|
|
|
497
|
-
# get the ranges of m/z and intensity values to display at the bottom of the two plots
|
|
498
484
|
int_min_tmp_q = min(q_spec[q_spec[:,1].nonzero(),1][0])
|
|
499
485
|
int_min_tmp_r = min(r_spec[r_spec[:,1].nonzero(),1][0])
|
|
500
486
|
int_max_tmp_q = max(q_spec[q_spec[:,1].nonzero(),1][0])
|
|
@@ -502,13 +488,10 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
502
488
|
int_min_tmp = int(min([int_min_tmp_q,int_min_tmp_r]))
|
|
503
489
|
int_max_tmp = int(max([int_max_tmp_q,int_max_tmp_r]))
|
|
504
490
|
|
|
505
|
-
# create the figure
|
|
506
491
|
fig, axes = plt.subplots(nrows=2, ncols=1)
|
|
507
492
|
|
|
508
|
-
# plot the untransformed spectra
|
|
509
493
|
plt.subplot(2,1,1)
|
|
510
494
|
|
|
511
|
-
# display warning message if either spectra have no non-zero ion fragments
|
|
512
495
|
if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
|
|
513
496
|
plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
514
497
|
plt.xticks([])
|
|
@@ -519,7 +502,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
519
502
|
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
|
|
520
503
|
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
|
|
521
504
|
|
|
522
|
-
# apply transformation to y-axis if relevant
|
|
523
505
|
if y_axis_transformation == 'normalized':
|
|
524
506
|
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
|
|
525
507
|
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
|
|
@@ -543,32 +525,29 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
543
525
|
plt.title('Untransformed Query and Reference Spectra', fontsize=10)
|
|
544
526
|
|
|
545
527
|
for transformation in spectrum_preprocessing_order:
|
|
546
|
-
if transformation == 'W':
|
|
528
|
+
if transformation == 'W':
|
|
547
529
|
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
548
530
|
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
549
|
-
if transformation == 'L':
|
|
531
|
+
if transformation == 'L':
|
|
550
532
|
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method)
|
|
551
533
|
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method)
|
|
552
|
-
if transformation == 'N':
|
|
534
|
+
if transformation == 'N':
|
|
553
535
|
q_spec = remove_noise(q_spec, nr = noise_threshold)
|
|
554
|
-
if high_quality_reference_library == False:
|
|
536
|
+
if high_quality_reference_library == False or high_quality_reference_library == 'False':
|
|
555
537
|
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
556
|
-
if transformation == 'F':
|
|
538
|
+
if transformation == 'F':
|
|
557
539
|
q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
558
|
-
if high_quality_reference_library == False:
|
|
540
|
+
if high_quality_reference_library == False or high_quality_reference_library == 'False':
|
|
559
541
|
r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
560
542
|
|
|
561
|
-
# compute similarity score; if the spectra contain at most one point, their similarity is considered to be 0
|
|
562
543
|
if q_spec.shape[0] > 1:
|
|
563
544
|
similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
|
|
564
545
|
else:
|
|
565
546
|
similarity_score = 0
|
|
566
547
|
|
|
567
548
|
|
|
568
|
-
# plot the transformed spectra
|
|
569
549
|
plt.subplot(2,1,2)
|
|
570
550
|
|
|
571
|
-
# display warning message if either spectra are empty or have no non-zero intensity ion fragments
|
|
572
551
|
if q_spec.shape[0] == 0 or r_spec.shape[0] == 0:
|
|
573
552
|
plt.text(0.5, 0.5, 'The query and/or reference spectrum has no ion fragments left after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
574
553
|
plt.xticks([])
|
|
@@ -578,7 +557,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
578
557
|
plt.xticks([])
|
|
579
558
|
plt.yticks([])
|
|
580
559
|
else:
|
|
581
|
-
# apply transformation to y-axis if relevant
|
|
582
560
|
if y_axis_transformation == 'normalized':
|
|
583
561
|
q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
|
|
584
562
|
r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
|
|
@@ -602,13 +580,12 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
602
580
|
plt.title(f'Transformed Query and Reference Spectra', fontsize=10)
|
|
603
581
|
|
|
604
582
|
|
|
605
|
-
#plt.subplots_adjust(top = 0.8, hspace = 0.7)
|
|
606
583
|
plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
|
|
607
584
|
plt.figlegend(loc = 'upper center')
|
|
608
585
|
fig.text(0.05, 0.15, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
|
|
609
586
|
fig.text(0.05, 0.12, f'Similarity Score: {round(similarity_score,4)}', fontsize=7)
|
|
610
587
|
fig.text(0.05, 0.09, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
|
|
611
|
-
fig.text(0.05, 0.06, f'High Quality Reference Library: {high_quality_reference_library}', fontsize=7)
|
|
588
|
+
fig.text(0.05, 0.06, f'High Quality Reference Library: {str(high_quality_reference_library)}', fontsize=7)
|
|
612
589
|
fig.text(0.05, 0.03, f'Raw-Scale M/Z Range: [{min_mz},{max_mz}]', fontsize=7)
|
|
613
590
|
fig.text(0.45, 0.15, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
|
|
614
591
|
fig.text(0.45, 0.12, f'Noise Threshold: {noise_threshold}', fontsize=7)
|
pycompound/processing.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
|
|
2
|
-
# This script contains the functions used to transform spectra prior to computing similarity scores
|
|
3
|
-
|
|
4
2
|
from pycompound.build_library import build_library_from_raw_data
|
|
5
3
|
import scipy.stats
|
|
6
4
|
import numpy as np
|
|
@@ -165,7 +163,6 @@ def centroid_spectrum(spec, window_size):
|
|
|
165
163
|
|
|
166
164
|
spec = spec[np.argsort(spec[:,0])]
|
|
167
165
|
|
|
168
|
-
#Fast check is the spectrum needs centroiding
|
|
169
166
|
mz_array = spec[:, 0]
|
|
170
167
|
need_centroid = 0
|
|
171
168
|
if mz_array.shape[0] > 1:
|
|
@@ -180,7 +177,6 @@ def centroid_spectrum(spec, window_size):
|
|
|
180
177
|
mz_delta_allowed = window_size
|
|
181
178
|
|
|
182
179
|
if spec[i, 1] > 0:
|
|
183
|
-
#Find left bound for current peak
|
|
184
180
|
i_left = i - 1
|
|
185
181
|
while i_left >= 0:
|
|
186
182
|
mz_delta_left = spec[i, 0] - spec[i_left, 0]
|
|
@@ -190,7 +186,6 @@ def centroid_spectrum(spec, window_size):
|
|
|
190
186
|
break
|
|
191
187
|
i_left += 1
|
|
192
188
|
|
|
193
|
-
#Find right bound for current peak
|
|
194
189
|
i_right = i + 1
|
|
195
190
|
while i_right < spec.shape[0]:
|
|
196
191
|
mz_delta_right = spec[i_right, 0] - spec[i, 0]
|
|
@@ -199,7 +194,6 @@ def centroid_spectrum(spec, window_size):
|
|
|
199
194
|
else:
|
|
200
195
|
break
|
|
201
196
|
|
|
202
|
-
#Merge those peaks
|
|
203
197
|
intensity_sum = np.sum(spec[i_left:i_right, 1])
|
|
204
198
|
intensity_weighted_sum = np.sum(spec[i_left:i_right, 0] * spec[i_left:i_right, 1])
|
|
205
199
|
|
|
@@ -246,16 +240,13 @@ def match_peaks_in_spectra(spec_a, spec_b, window_size):
|
|
|
246
240
|
mass_delta = spec_a[a, 0] - spec_b[b, 0]
|
|
247
241
|
|
|
248
242
|
if mass_delta < -window_size:
|
|
249
|
-
# Peak only existed in spec a.
|
|
250
243
|
spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
|
|
251
244
|
peak_b_int = 0.
|
|
252
245
|
a += 1
|
|
253
246
|
elif mass_delta > window_size:
|
|
254
|
-
# Peak only existed in spec b.
|
|
255
247
|
spec_merged.append([spec_b[b, 0], 0., spec_b[b, 1]])
|
|
256
248
|
b += 1
|
|
257
249
|
else:
|
|
258
|
-
# Peak existed in both spec.
|
|
259
250
|
peak_b_int += spec_b[b, 1]
|
|
260
251
|
b += 1
|
|
261
252
|
|
|
@@ -10,7 +10,6 @@ import sys
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
def S_cos(ints_a, ints_b):
|
|
13
|
-
# Cosine Similarity Measure
|
|
14
13
|
if np.sum(ints_a) == 0 or np.sum(ints_b) == 0:
|
|
15
14
|
return(0)
|
|
16
15
|
else:
|
|
@@ -18,12 +17,10 @@ def S_cos(ints_a, ints_b):
|
|
|
18
17
|
|
|
19
18
|
|
|
20
19
|
def ent_renyi(ints, q):
|
|
21
|
-
# Computes the Renyi entropy of a probability distribution for a given positive entropy dimension q
|
|
22
20
|
return np.log(sum(np.power(ints,q))) / (1-q)
|
|
23
21
|
|
|
24
22
|
|
|
25
23
|
def ent_tsallis(ints, q):
|
|
26
|
-
# Computes the Tsallis entropy of a probability distribution for a given positive entropy dimension q
|
|
27
24
|
return (sum(np.power(ints,q))-1) / (1-q)
|
|
28
25
|
|
|
29
26
|
|