pycompound 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +470 -144
- pycompound/build_library.py +2 -9
- pycompound/plot_spectra.py +17 -42
- pycompound/processing.py +0 -9
- pycompound/similarity_measures.py +0 -3
- pycompound/spec_lib_matching.py +295 -102
- pycompound/spec_lib_matching_CLI.py +2 -7
- pycompound/tuning_CLI.py +2 -3
- {pycompound-0.1.0.dist-info → pycompound-0.1.2.dist-info}/METADATA +1 -1
- pycompound-0.1.2.dist-info/RECORD +14 -0
- pycompound-0.1.0.dist-info/RECORD +0 -14
- {pycompound-0.1.0.dist-info → pycompound-0.1.2.dist-info}/WHEEL +0 -0
- {pycompound-0.1.0.dist-info → pycompound-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {pycompound-0.1.0.dist-info → pycompound-0.1.2.dist-info}/top_level.txt +0 -0
pycompound/build_library.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
|
|
2
|
-
# this script has a function to extract the mass spectra from an mgf, mzML, or cdf file and write them in the necessary format for use in spectral library matching
|
|
3
|
-
|
|
4
2
|
import netCDF4 as nc
|
|
5
3
|
import numpy as np
|
|
6
4
|
import pandas as pd
|
|
@@ -14,7 +12,7 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
14
12
|
Converts mgf, mzML, cdf, or msp file to the necessary format for spectral library matching.
|
|
15
13
|
|
|
16
14
|
--input_path: Path to input file (must be mgf, mzML, cdf, or msp file). Mandatory argument.
|
|
17
|
-
--output_path: Path to output
|
|
15
|
+
--output_path: Path to output TXT file. Default: current working directory.
|
|
18
16
|
--is_reference: Boolean flag indicating whether IDs of spectra should be written to output. Only pass true if building a reference library with known compound IDs. Only applicable to mgf and msp files. Options: \'True\', \'False\'. Optional argument. Default: False.
|
|
19
17
|
'''
|
|
20
18
|
|
|
@@ -23,7 +21,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
23
21
|
sys.exit()
|
|
24
22
|
|
|
25
23
|
if output_path is None:
|
|
26
|
-
#print('Warning: no output_path specified, so library is written to {Path.cwd()}/build_library.csv')
|
|
27
24
|
tmp = input_path.split('/')
|
|
28
25
|
tmp = tmp[(len(tmp)-1)]
|
|
29
26
|
basename = tmp.split('.')[0]
|
|
@@ -34,7 +31,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
34
31
|
print('Error: is_reference must be either \'True\' or \'False\'.')
|
|
35
32
|
sys.exit()
|
|
36
33
|
|
|
37
|
-
# determine whether an mgf or a mzML file was passed to --input_path
|
|
38
34
|
last_three_chars = input_path[(len(input_path)-3):len(input_path)]
|
|
39
35
|
last_four_chars = input_path[(len(input_path)-4):len(input_path)]
|
|
40
36
|
if last_three_chars == 'mgf' or last_three_chars == 'MGF':
|
|
@@ -50,7 +46,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
50
46
|
sys.exit()
|
|
51
47
|
|
|
52
48
|
|
|
53
|
-
# obtain a list of spectra from the input file
|
|
54
49
|
spectra = []
|
|
55
50
|
if input_file_type == 'mgf':
|
|
56
51
|
with mgf.read(input_path, index_by_scans = True) as reader:
|
|
@@ -62,7 +57,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
62
57
|
spectra.append(spec)
|
|
63
58
|
|
|
64
59
|
|
|
65
|
-
# extract the relevant information from each spectra (i.e m/z ratios and intensities)
|
|
66
60
|
if input_file_type == 'mgf' or input_file_type == 'mzML':
|
|
67
61
|
ids = []
|
|
68
62
|
mzs = []
|
|
@@ -128,8 +122,7 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
128
122
|
continue
|
|
129
123
|
|
|
130
124
|
|
|
131
|
-
# write CSV file of spectra for use in spectral library matching
|
|
132
125
|
df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
|
|
133
|
-
df.to_csv(output_path, index=False)
|
|
126
|
+
df.to_csv(output_path, index=False, sep='\t')
|
|
134
127
|
|
|
135
128
|
|
pycompound/plot_spectra.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
|
|
2
|
-
# this script's functions plot a given query spectrum against a given reference spectrum before and after spectrum preprocessing transformations
|
|
3
|
-
|
|
4
2
|
from .processing import *
|
|
5
3
|
from .similarity_measures import *
|
|
6
4
|
import pandas as pd
|
|
@@ -36,7 +34,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
36
34
|
--output_path: path to output PDF file containing the plots of the spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.
|
|
37
35
|
'''
|
|
38
36
|
|
|
39
|
-
# load query and reference libraries
|
|
40
37
|
if query_data is None:
|
|
41
38
|
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
|
|
42
39
|
sys.exit()
|
|
@@ -68,7 +65,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
68
65
|
unique_reference_ids = [str(tmp) for tmp in unique_reference_ids]
|
|
69
66
|
|
|
70
67
|
|
|
71
|
-
##### process input parameters and ensure they are in a valid format #####
|
|
72
68
|
if spectrum_ID1 is not None:
|
|
73
69
|
spectrum_ID1 = str(spectrum_ID1)
|
|
74
70
|
else:
|
|
@@ -177,8 +173,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
177
173
|
spec_tmp = spectrum_ID1
|
|
178
174
|
spectrum_ID1 = spectrum_ID2
|
|
179
175
|
spectrum_ID2 = spec_tmp
|
|
180
|
-
print(unique_query_ids)
|
|
181
|
-
print(spectrum_ID1)
|
|
182
176
|
query_idx = unique_query_ids.index(spectrum_ID1)
|
|
183
177
|
reference_idx = unique_reference_ids.index(spectrum_ID2)
|
|
184
178
|
q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[query_idx])[0]
|
|
@@ -192,7 +186,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
192
186
|
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
|
|
193
187
|
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
|
|
194
188
|
|
|
195
|
-
# apply transformation to y-axis if relevant
|
|
196
189
|
if y_axis_transformation == 'normalized':
|
|
197
190
|
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
|
|
198
191
|
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
|
|
@@ -208,10 +201,8 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
208
201
|
else:
|
|
209
202
|
ylab = 'Raw Intensity'
|
|
210
203
|
|
|
211
|
-
# create the figure
|
|
212
204
|
fig, axes = plt.subplots(nrows=2, ncols=1)
|
|
213
205
|
|
|
214
|
-
# plot the untransformed spectra
|
|
215
206
|
plt.subplot(2,1,1)
|
|
216
207
|
plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*q_spec_pre_trans.shape[0], ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID 1: {spectrum_ID1}')
|
|
217
208
|
plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*r_spec_pre_trans.shape[0], ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID 2: {spectrum_ID2}')
|
|
@@ -221,7 +212,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
221
212
|
plt.yticks(fontsize=7)
|
|
222
213
|
plt.title('Untransformed Spectra', fontsize=10)
|
|
223
214
|
|
|
224
|
-
# get the ranges of m/z and intensity values to display at the bottom of the two plots
|
|
225
215
|
mz_min_tmp_q = round(q_spec[:,0].min(),1)
|
|
226
216
|
mz_min_tmp_r = round(r_spec[:,0].min(),1)
|
|
227
217
|
int_min_tmp_q = round(q_spec[:,1].min(),1)
|
|
@@ -235,51 +225,45 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
235
225
|
int_min_tmp = min([int_min_tmp_q,int_min_tmp_r])
|
|
236
226
|
int_max_tmp = max([int_max_tmp_q,int_max_tmp_r])
|
|
237
227
|
|
|
238
|
-
# perform the spectrum preprocessing transformations in the order specified
|
|
239
228
|
is_matched = False
|
|
240
229
|
for transformation in spectrum_preprocessing_order:
|
|
241
|
-
if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
230
|
+
if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
242
231
|
q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
|
|
243
232
|
r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
|
|
244
|
-
if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
233
|
+
if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
245
234
|
m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
|
|
246
235
|
q_spec = m_spec[:,0:2]
|
|
247
236
|
r_spec = m_spec[:,[0,2]]
|
|
248
237
|
is_matched = True
|
|
249
|
-
if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
238
|
+
if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
250
239
|
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
251
240
|
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
252
|
-
if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
241
|
+
if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
253
242
|
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
254
243
|
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
|
|
255
|
-
if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
244
|
+
if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
256
245
|
q_spec = remove_noise(q_spec, nr = noise_threshold)
|
|
257
246
|
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
258
|
-
if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
247
|
+
if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
259
248
|
q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
|
|
260
249
|
r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
|
|
261
250
|
|
|
262
|
-
# intensities of query and reference library
|
|
263
251
|
q_ints = q_spec[:,1]
|
|
264
252
|
r_ints = r_spec[:,1]
|
|
265
253
|
|
|
266
|
-
# if there is at least one non-zero intensity ion fragment in either spectra, compute their similarity
|
|
267
254
|
if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
|
|
268
255
|
similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
|
|
269
256
|
else:
|
|
270
257
|
similarity_score = 0
|
|
271
258
|
|
|
272
|
-
# plot the transformed spectra
|
|
273
259
|
plt.subplot(2,1,2)
|
|
274
260
|
|
|
275
|
-
# display warning message if either spectra are empty or have no non-zero intensity ion fragments
|
|
276
261
|
if q_spec.shape[0] > 1:
|
|
277
262
|
if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
|
|
278
263
|
plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
279
264
|
plt.xticks([])
|
|
280
265
|
plt.yticks([])
|
|
281
266
|
else:
|
|
282
|
-
# apply transformation to y-axis if relevant
|
|
283
267
|
if y_axis_transformation == 'normalized':
|
|
284
268
|
q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
|
|
285
269
|
r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
|
|
@@ -311,7 +295,7 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
311
295
|
plt.figlegend(loc = 'upper center')
|
|
312
296
|
fig.text(0.05, 0.18, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
|
|
313
297
|
fig.text(0.05, 0.15, f'Similarity Score: {round(similarity_score,4)}', fontsize=7)
|
|
314
|
-
fig.text(0.05, 0.12, f
|
|
298
|
+
fig.text(0.05, 0.12, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
|
|
315
299
|
fig.text(0.05, 0.09, f'High Quality Reference Library: {high_quality_reference_library}', fontsize=7)
|
|
316
300
|
fig.text(0.05, 0.06, f'Window Size (Centroiding): {window_size_centroiding}', fontsize=7)
|
|
317
301
|
fig.text(0.05, 0.03, f'Window Size (Matching): {window_size_matching}', fontsize=7)
|
|
@@ -320,6 +304,9 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
320
304
|
fig.text(0.45, 0.12, f'Noise Threshold: {noise_threshold}', fontsize=7)
|
|
321
305
|
fig.text(0.45, 0.09, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
|
|
322
306
|
fig.text(0.45, 0.06, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
|
|
307
|
+
if similarity_measure == 'mixture':
|
|
308
|
+
fig.text(0.45, 0.03, f'Weights for mixture similarity: {weights}', fontsize=7)
|
|
309
|
+
|
|
323
310
|
plt.savefig(output_path, format='pdf')
|
|
324
311
|
|
|
325
312
|
if return_plot == True:
|
|
@@ -351,7 +338,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
351
338
|
--output_path: path to output PDF file containing the plots of the spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.
|
|
352
339
|
'''
|
|
353
340
|
|
|
354
|
-
# load query and reference libraries
|
|
355
341
|
if query_data is None:
|
|
356
342
|
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
|
|
357
343
|
sys.exit()
|
|
@@ -381,7 +367,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
381
367
|
unique_reference_ids = df_reference.iloc[:,0].unique()
|
|
382
368
|
|
|
383
369
|
|
|
384
|
-
##### process input parameters and ensure they are in a valid format #####
|
|
385
370
|
if spectrum_ID1 is not None:
|
|
386
371
|
spectrum_ID1 = str(spectrum_ID1)
|
|
387
372
|
else:
|
|
@@ -456,12 +441,10 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
456
441
|
print(f'Warning: plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.')
|
|
457
442
|
output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.pdf'
|
|
458
443
|
|
|
459
|
-
# get m/z values
|
|
460
444
|
min_mz = np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])])
|
|
461
445
|
max_mz = np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])])
|
|
462
446
|
mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
|
|
463
447
|
|
|
464
|
-
# get unique query/reference library IDs; each query/reference ID corresponds to exactly one query/reference mass spectrum
|
|
465
448
|
unique_query_ids = df_query.iloc[:,0].unique().tolist()
|
|
466
449
|
unique_reference_ids = df_reference.iloc[:,0].unique().tolist()
|
|
467
450
|
unique_query_ids = [str(ID) for ID in unique_query_ids]
|
|
@@ -493,7 +476,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
493
476
|
q_spec = convert_spec(q_spec,mzs)
|
|
494
477
|
r_spec = convert_spec(r_spec,mzs)
|
|
495
478
|
|
|
496
|
-
# get the ranges of m/z and intensity values to display at the bottom of the two plots
|
|
497
479
|
int_min_tmp_q = min(q_spec[q_spec[:,1].nonzero(),1][0])
|
|
498
480
|
int_min_tmp_r = min(r_spec[r_spec[:,1].nonzero(),1][0])
|
|
499
481
|
int_max_tmp_q = max(q_spec[q_spec[:,1].nonzero(),1][0])
|
|
@@ -501,13 +483,10 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
501
483
|
int_min_tmp = int(min([int_min_tmp_q,int_min_tmp_r]))
|
|
502
484
|
int_max_tmp = int(max([int_max_tmp_q,int_max_tmp_r]))
|
|
503
485
|
|
|
504
|
-
# create the figure
|
|
505
486
|
fig, axes = plt.subplots(nrows=2, ncols=1)
|
|
506
487
|
|
|
507
|
-
# plot the untransformed spectra
|
|
508
488
|
plt.subplot(2,1,1)
|
|
509
489
|
|
|
510
|
-
# display warning message if either spectra have no non-zero ion fragments
|
|
511
490
|
if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
|
|
512
491
|
plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
513
492
|
plt.xticks([])
|
|
@@ -518,7 +497,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
518
497
|
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
|
|
519
498
|
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
|
|
520
499
|
|
|
521
|
-
# apply transformation to y-axis if relevant
|
|
522
500
|
if y_axis_transformation == 'normalized':
|
|
523
501
|
q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
|
|
524
502
|
r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
|
|
@@ -542,32 +520,29 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
542
520
|
plt.title('Untransformed Query and Reference Spectra', fontsize=10)
|
|
543
521
|
|
|
544
522
|
for transformation in spectrum_preprocessing_order:
|
|
545
|
-
if transformation == 'W':
|
|
523
|
+
if transformation == 'W':
|
|
546
524
|
q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
|
|
547
525
|
r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
|
|
548
|
-
if transformation == 'L':
|
|
526
|
+
if transformation == 'L':
|
|
549
527
|
q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method)
|
|
550
528
|
r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method)
|
|
551
|
-
if transformation == 'N':
|
|
529
|
+
if transformation == 'N':
|
|
552
530
|
q_spec = remove_noise(q_spec, nr = noise_threshold)
|
|
553
531
|
if high_quality_reference_library == False:
|
|
554
532
|
r_spec = remove_noise(r_spec, nr = noise_threshold)
|
|
555
|
-
if transformation == 'F':
|
|
533
|
+
if transformation == 'F':
|
|
556
534
|
q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
557
535
|
if high_quality_reference_library == False:
|
|
558
536
|
r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
|
|
559
537
|
|
|
560
|
-
# compute similarity score; if the spectra contain at most one point, their similarity is considered to be 0
|
|
561
538
|
if q_spec.shape[0] > 1:
|
|
562
539
|
similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
|
|
563
540
|
else:
|
|
564
541
|
similarity_score = 0
|
|
565
542
|
|
|
566
543
|
|
|
567
|
-
# plot the transformed spectra
|
|
568
544
|
plt.subplot(2,1,2)
|
|
569
545
|
|
|
570
|
-
# display warning message if either spectra are empty or have no non-zero intensity ion fragments
|
|
571
546
|
if q_spec.shape[0] == 0 or r_spec.shape[0] == 0:
|
|
572
547
|
plt.text(0.5, 0.5, 'The query and/or reference spectrum has no ion fragments left after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
|
|
573
548
|
plt.xticks([])
|
|
@@ -577,7 +552,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
577
552
|
plt.xticks([])
|
|
578
553
|
plt.yticks([])
|
|
579
554
|
else:
|
|
580
|
-
# apply transformation to y-axis if relevant
|
|
581
555
|
if y_axis_transformation == 'normalized':
|
|
582
556
|
q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
|
|
583
557
|
r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
|
|
@@ -601,18 +575,19 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
601
575
|
plt.title(f'Transformed Query and Reference Spectra', fontsize=10)
|
|
602
576
|
|
|
603
577
|
|
|
604
|
-
#plt.subplots_adjust(top = 0.8, hspace = 0.7)
|
|
605
578
|
plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
|
|
606
579
|
plt.figlegend(loc = 'upper center')
|
|
607
580
|
fig.text(0.05, 0.15, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
|
|
608
581
|
fig.text(0.05, 0.12, f'Similarity Score: {round(similarity_score,4)}', fontsize=7)
|
|
609
|
-
fig.text(0.05, 0.09, f
|
|
582
|
+
fig.text(0.05, 0.09, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
|
|
610
583
|
fig.text(0.05, 0.06, f'High Quality Reference Library: {high_quality_reference_library}', fontsize=7)
|
|
611
584
|
fig.text(0.05, 0.03, f'Raw-Scale M/Z Range: [{min_mz},{max_mz}]', fontsize=7)
|
|
612
585
|
fig.text(0.45, 0.15, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
|
|
613
586
|
fig.text(0.45, 0.12, f'Noise Threshold: {noise_threshold}', fontsize=7)
|
|
614
587
|
fig.text(0.45, 0.09, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
|
|
615
588
|
fig.text(0.45, 0.06, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
|
|
589
|
+
if similarity_measure=='mixture':
|
|
590
|
+
fig.text(0.45, 0.03, f'Weights for mixture similarity: {weights}', fontsize=7)
|
|
616
591
|
plt.savefig(output_path, format='pdf')
|
|
617
592
|
|
|
618
593
|
if return_plot == True:
|
pycompound/processing.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
|
|
2
|
-
# This script contains the functions used to transform spectra prior to computing similarity scores
|
|
3
|
-
|
|
4
2
|
from pycompound.build_library import build_library_from_raw_data
|
|
5
3
|
import scipy.stats
|
|
6
4
|
import numpy as np
|
|
@@ -165,7 +163,6 @@ def centroid_spectrum(spec, window_size):
|
|
|
165
163
|
|
|
166
164
|
spec = spec[np.argsort(spec[:,0])]
|
|
167
165
|
|
|
168
|
-
#Fast check is the spectrum needs centroiding
|
|
169
166
|
mz_array = spec[:, 0]
|
|
170
167
|
need_centroid = 0
|
|
171
168
|
if mz_array.shape[0] > 1:
|
|
@@ -180,7 +177,6 @@ def centroid_spectrum(spec, window_size):
|
|
|
180
177
|
mz_delta_allowed = window_size
|
|
181
178
|
|
|
182
179
|
if spec[i, 1] > 0:
|
|
183
|
-
#Find left bound for current peak
|
|
184
180
|
i_left = i - 1
|
|
185
181
|
while i_left >= 0:
|
|
186
182
|
mz_delta_left = spec[i, 0] - spec[i_left, 0]
|
|
@@ -190,7 +186,6 @@ def centroid_spectrum(spec, window_size):
|
|
|
190
186
|
break
|
|
191
187
|
i_left += 1
|
|
192
188
|
|
|
193
|
-
#Find right bound for current peak
|
|
194
189
|
i_right = i + 1
|
|
195
190
|
while i_right < spec.shape[0]:
|
|
196
191
|
mz_delta_right = spec[i_right, 0] - spec[i, 0]
|
|
@@ -199,7 +194,6 @@ def centroid_spectrum(spec, window_size):
|
|
|
199
194
|
else:
|
|
200
195
|
break
|
|
201
196
|
|
|
202
|
-
#Merge those peaks
|
|
203
197
|
intensity_sum = np.sum(spec[i_left:i_right, 1])
|
|
204
198
|
intensity_weighted_sum = np.sum(spec[i_left:i_right, 0] * spec[i_left:i_right, 1])
|
|
205
199
|
|
|
@@ -246,16 +240,13 @@ def match_peaks_in_spectra(spec_a, spec_b, window_size):
|
|
|
246
240
|
mass_delta = spec_a[a, 0] - spec_b[b, 0]
|
|
247
241
|
|
|
248
242
|
if mass_delta < -window_size:
|
|
249
|
-
# Peak only existed in spec a.
|
|
250
243
|
spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
|
|
251
244
|
peak_b_int = 0.
|
|
252
245
|
a += 1
|
|
253
246
|
elif mass_delta > window_size:
|
|
254
|
-
# Peak only existed in spec b.
|
|
255
247
|
spec_merged.append([spec_b[b, 0], 0., spec_b[b, 1]])
|
|
256
248
|
b += 1
|
|
257
249
|
else:
|
|
258
|
-
# Peak existed in both spec.
|
|
259
250
|
peak_b_int += spec_b[b, 1]
|
|
260
251
|
b += 1
|
|
261
252
|
|
|
@@ -10,7 +10,6 @@ import sys
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
def S_cos(ints_a, ints_b):
|
|
13
|
-
# Cosine Similarity Measure
|
|
14
13
|
if np.sum(ints_a) == 0 or np.sum(ints_b) == 0:
|
|
15
14
|
return(0)
|
|
16
15
|
else:
|
|
@@ -18,12 +17,10 @@ def S_cos(ints_a, ints_b):
|
|
|
18
17
|
|
|
19
18
|
|
|
20
19
|
def ent_renyi(ints, q):
|
|
21
|
-
# Computes the Renyi entropy of a probability distribution for a given positive entropy dimension q
|
|
22
20
|
return np.log(sum(np.power(ints,q))) / (1-q)
|
|
23
21
|
|
|
24
22
|
|
|
25
23
|
def ent_tsallis(ints, q):
|
|
26
|
-
# Computes the Tsallis entropy of a probability distribution for a given positive entropy dimension q
|
|
27
24
|
return (sum(np.power(ints,q))-1) / (1-q)
|
|
28
25
|
|
|
29
26
|
|