pycompound 0.0.55__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +874 -82
- {pycompound_fy7392 → pycompound}/plot_spectra.py +8 -3
- {pycompound_fy7392 → pycompound}/plot_spectra_CLI.py +2 -2
- {pycompound_fy7392 → pycompound}/processing.py +1 -1
- {pycompound_fy7392 → pycompound}/spec_lib_matching.py +70 -34
- {pycompound_fy7392 → pycompound}/spec_lib_matching_CLI.py +2 -2
- {pycompound_fy7392 → pycompound}/tuning_CLI.py +3 -4
- {pycompound-0.0.55.dist-info → pycompound-0.1.1.dist-info}/METADATA +2 -1
- pycompound-0.1.1.dist-info/RECORD +14 -0
- pycompound-0.1.1.dist-info/top_level.txt +2 -0
- pycompound-0.0.55.dist-info/RECORD +0 -15
- pycompound-0.0.55.dist-info/top_level.txt +0 -2
- pycompound_fy7392/pycompound_shiny.py +0 -299
- {pycompound_fy7392 → pycompound}/build_library.py +0 -0
- {pycompound_fy7392 → pycompound}/similarity_measures.py +0 -0
- {pycompound-0.0.55.dist-info → pycompound-0.1.1.dist-info}/WHEEL +0 -0
- {pycompound-0.0.55.dist-info → pycompound-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -45,7 +45,7 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
45
45
|
extension = extension[(len(extension)-1)]
|
|
46
46
|
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
|
|
47
47
|
output_path_tmp = query_data[:-3] + 'csv'
|
|
48
|
-
build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=
|
|
48
|
+
build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=True)
|
|
49
49
|
df_query = pd.read_csv(output_path_tmp)
|
|
50
50
|
if extension == 'csv' or extension == 'CSV':
|
|
51
51
|
df_query = pd.read_csv(query_data)
|
|
@@ -309,7 +309,7 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
309
309
|
plt.figlegend(loc = 'upper center')
|
|
310
310
|
fig.text(0.05, 0.18, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
|
|
311
311
|
fig.text(0.05, 0.15, f'Similarity Score: {round(similarity_score,4)}', fontsize=7)
|
|
312
|
-
fig.text(0.05, 0.12, f
|
|
312
|
+
fig.text(0.05, 0.12, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
|
|
313
313
|
fig.text(0.05, 0.09, f'High Quality Reference Library: {high_quality_reference_library}', fontsize=7)
|
|
314
314
|
fig.text(0.05, 0.06, f'Window Size (Centroiding): {window_size_centroiding}', fontsize=7)
|
|
315
315
|
fig.text(0.05, 0.03, f'Window Size (Matching): {window_size_matching}', fontsize=7)
|
|
@@ -318,6 +318,9 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
318
318
|
fig.text(0.45, 0.12, f'Noise Threshold: {noise_threshold}', fontsize=7)
|
|
319
319
|
fig.text(0.45, 0.09, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
|
|
320
320
|
fig.text(0.45, 0.06, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
|
|
321
|
+
if similarity_measure == 'mixture':
|
|
322
|
+
fig.text(0.45, 0.03, f'Weights for mixture similarity: {weights}', fontsize=7)
|
|
323
|
+
|
|
321
324
|
plt.savefig(output_path, format='pdf')
|
|
322
325
|
|
|
323
326
|
if return_plot == True:
|
|
@@ -604,13 +607,15 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
|
|
|
604
607
|
plt.figlegend(loc = 'upper center')
|
|
605
608
|
fig.text(0.05, 0.15, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
|
|
606
609
|
fig.text(0.05, 0.12, f'Similarity Score: {round(similarity_score,4)}', fontsize=7)
|
|
607
|
-
fig.text(0.05, 0.09, f
|
|
610
|
+
fig.text(0.05, 0.09, f"Spectrum Preprocessing Order: {''.join(spectrum_preprocessing_order)}", fontsize=7)
|
|
608
611
|
fig.text(0.05, 0.06, f'High Quality Reference Library: {high_quality_reference_library}', fontsize=7)
|
|
609
612
|
fig.text(0.05, 0.03, f'Raw-Scale M/Z Range: [{min_mz},{max_mz}]', fontsize=7)
|
|
610
613
|
fig.text(0.45, 0.15, f'Raw-Scale Intensity Range: [{int_min_tmp},{int_max_tmp}]', fontsize=7)
|
|
611
614
|
fig.text(0.45, 0.12, f'Noise Threshold: {noise_threshold}', fontsize=7)
|
|
612
615
|
fig.text(0.45, 0.09, f'Weight Factors (m/z,intensity): ({wf_mz},{wf_intensity})', fontsize=7)
|
|
613
616
|
fig.text(0.45, 0.06, f'Low-Entropy Threshold: {LET_threshold}', fontsize=7)
|
|
617
|
+
if similarity_measure=='mixture':
|
|
618
|
+
fig.text(0.45, 0.03, f'Weights for mixture similarity: {weights}', fontsize=7)
|
|
614
619
|
plt.savefig(output_path, format='pdf')
|
|
615
620
|
|
|
616
621
|
if return_plot == True:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
|
|
2
|
-
from
|
|
3
|
-
from
|
|
2
|
+
from pycompound.plot_spectra import generate_plots_on_HRMS_data
|
|
3
|
+
from pycompound.plot_spectra import generate_plots_on_NRMS_data
|
|
4
4
|
import pandas as pd
|
|
5
5
|
import argparse
|
|
6
6
|
import json
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
|
|
2
2
|
# This script contains the functions used to transform spectra prior to computing similarity scores
|
|
3
3
|
|
|
4
|
-
from
|
|
4
|
+
from pycompound.build_library import build_library_from_raw_data
|
|
5
5
|
import scipy.stats
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
|
|
2
2
|
# this script's function runs spectral library matching to identify unknown query compound(s)
|
|
3
3
|
|
|
4
|
-
from
|
|
4
|
+
from pycompound.build_library import build_library_from_raw_data
|
|
5
5
|
from .processing import *
|
|
6
6
|
from .similarity_measures import *
|
|
7
7
|
import pandas as pd
|
|
@@ -9,6 +9,12 @@ from pathlib import Path
|
|
|
9
9
|
import json
|
|
10
10
|
from itertools import product
|
|
11
11
|
from joblib import Parallel, delayed
|
|
12
|
+
import csv
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
|
|
16
|
+
default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
|
|
17
|
+
|
|
12
18
|
|
|
13
19
|
def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
|
|
14
20
|
similarity_measure_tmp, weight,
|
|
@@ -71,7 +77,8 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
|
|
|
71
77
|
)
|
|
72
78
|
|
|
73
79
|
|
|
74
|
-
|
|
80
|
+
|
|
81
|
+
def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
|
|
75
82
|
"""
|
|
76
83
|
runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a CSV file, and prints top-performing parameters
|
|
77
84
|
|
|
@@ -81,6 +88,7 @@ def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid={'simila
|
|
|
81
88
|
--output_path: accuracy from each choice of parameter set is saved to a CSV file here.
|
|
82
89
|
"""
|
|
83
90
|
|
|
91
|
+
grid = {**default_HRMS_grid, **(grid or {})}
|
|
84
92
|
for key, value in grid.items():
|
|
85
93
|
globals()[key] = value
|
|
86
94
|
|
|
@@ -118,24 +126,35 @@ def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid={'simila
|
|
|
118
126
|
print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
|
|
119
127
|
|
|
120
128
|
if output_path is None:
|
|
121
|
-
output_path = f'{Path.cwd()}/tuning_param_output.
|
|
129
|
+
output_path = f'{Path.cwd()}/tuning_param_output.txt'
|
|
122
130
|
print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
|
|
123
131
|
|
|
124
|
-
# build parameter grid out of the lists you already set
|
|
125
132
|
param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold,
|
|
126
133
|
window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
|
|
127
|
-
# run in parallel on all CPUs
|
|
128
134
|
results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
|
|
129
135
|
|
|
130
136
|
df_out = pd.DataFrame(results, columns=[
|
|
131
137
|
'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX','NOISE.THRESHOLD',
|
|
132
138
|
'WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING', 'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
|
|
133
139
|
])
|
|
134
|
-
df_out = df_out
|
|
135
|
-
df_out.
|
|
140
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
|
|
141
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
|
|
142
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
|
|
143
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(":","",regex=False)
|
|
144
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Cosine","",regex=False)
|
|
145
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Shannon","",regex=False)
|
|
146
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
|
|
147
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
|
|
148
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
|
|
149
|
+
df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
|
|
150
|
+
|
|
151
|
+
if return_output is False:
|
|
152
|
+
df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
|
|
153
|
+
else:
|
|
154
|
+
return df_out
|
|
136
155
|
|
|
137
156
|
|
|
138
|
-
def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid=
|
|
157
|
+
def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
|
|
139
158
|
"""
|
|
140
159
|
runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a CSV file, and prints top-performing parameters
|
|
141
160
|
|
|
@@ -145,10 +164,10 @@ def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid={'simila
|
|
|
145
164
|
--output_path: accuracy from each choice of parameter set is saved to a CSV file here
|
|
146
165
|
"""
|
|
147
166
|
|
|
167
|
+
grid = {**default_NRMS_grid, **(grid or {})}
|
|
148
168
|
for key, value in grid.items():
|
|
149
169
|
globals()[key] = value
|
|
150
170
|
|
|
151
|
-
# load query and reference libraries
|
|
152
171
|
if query_data is None:
|
|
153
172
|
print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
|
|
154
173
|
sys.exit()
|
|
@@ -182,21 +201,30 @@ def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid={'simila
|
|
|
182
201
|
print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
|
|
183
202
|
|
|
184
203
|
if output_path is None:
|
|
185
|
-
output_path = f'{Path.cwd()}/tuning_param_output.
|
|
204
|
+
output_path = f'{Path.cwd()}/tuning_param_output.txt'
|
|
186
205
|
print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
|
|
187
206
|
|
|
188
|
-
# build parameter grid out of the lists you already set
|
|
189
207
|
param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
|
|
190
208
|
noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
|
|
191
|
-
# run in parallel on all CPUs
|
|
192
209
|
results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
|
|
193
210
|
|
|
194
211
|
df_out = pd.DataFrame(results, columns=[
|
|
195
212
|
'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
|
|
196
213
|
'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
|
|
197
214
|
])
|
|
198
|
-
df_out = df_out
|
|
199
|
-
df_out.
|
|
215
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
|
|
216
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
|
|
217
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
|
|
218
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(":","",regex=False)
|
|
219
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Cosine","",regex=False)
|
|
220
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Shannon","",regex=False)
|
|
221
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
|
|
222
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
|
|
223
|
+
df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
|
|
224
|
+
if return_output is False:
|
|
225
|
+
df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
|
|
226
|
+
else:
|
|
227
|
+
return df_out
|
|
200
228
|
|
|
201
229
|
|
|
202
230
|
|
|
@@ -389,7 +417,7 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
|
|
|
389
417
|
|
|
390
418
|
|
|
391
419
|
|
|
392
|
-
def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None):
|
|
420
|
+
def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False):
|
|
393
421
|
'''
|
|
394
422
|
runs spectral library matching on high-resolution mass spectrometry (HRMS) data
|
|
395
423
|
|
|
@@ -413,8 +441,8 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
|
|
|
413
441
|
--entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
|
|
414
442
|
--n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
|
|
415
443
|
--print_id_results: Flag that prints identification results if True. Default: False
|
|
416
|
-
--output_identification: Output CSV file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.
|
|
417
|
-
--output_similarity_scores: Output CSV file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this CSV file is written to the current working directory with filename \'output_all_similarity_scores\'.
|
|
444
|
+
--output_identification: Output tab-delimited text file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
|
|
445
|
+
--output_similarity_scores: Output tab-delimited text file containing similarity scores between all query spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum identifier, and the remaining columns contain the similarity scores with respect to all reference library spectra. If no argument is passed, then this file is written to the current working directory with filename \'output_all_similarity_scores.txt\'.
|
|
418
446
|
'''
|
|
419
447
|
|
|
420
448
|
# load query and reference libraries
|
|
@@ -528,11 +556,11 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
|
|
|
528
556
|
sys.exit()
|
|
529
557
|
|
|
530
558
|
if output_identification is None:
|
|
531
|
-
output_identification = f'{Path.cwd()}/output_identification.
|
|
559
|
+
output_identification = f'{Path.cwd()}/output_identification.txt'
|
|
532
560
|
print(f'Warning: writing identification output to {output_identification}')
|
|
533
561
|
|
|
534
562
|
if output_similarity_scores is None:
|
|
535
|
-
output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.
|
|
563
|
+
output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
|
|
536
564
|
print(f'Warning: writing similarity scores to {output_similarity_scores}')
|
|
537
565
|
|
|
538
566
|
|
|
@@ -636,22 +664,26 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
|
|
|
636
664
|
df_top_ref_specs.index = unique_query_ids
|
|
637
665
|
df_top_ref_specs.index.names = ['Query Spectrum ID']
|
|
638
666
|
|
|
667
|
+
df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
|
|
668
|
+
|
|
639
669
|
# print the identification results if the user desires
|
|
640
670
|
if print_id_results == True:
|
|
641
671
|
print(df_top_ref_specs.to_string())
|
|
642
672
|
|
|
643
|
-
|
|
644
|
-
|
|
673
|
+
if return_ID_output is False:
|
|
674
|
+
# write spectral library matching results to disk
|
|
675
|
+
df_top_ref_specs.to_csv(output_identification, sep='\t')
|
|
645
676
|
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
677
|
+
# write all similarity scores to disk
|
|
678
|
+
df_scores.to_csv(output_similarity_scores, sep='\t')
|
|
679
|
+
else:
|
|
680
|
+
return df_top_ref_specs
|
|
649
681
|
|
|
650
682
|
|
|
651
683
|
|
|
652
684
|
|
|
653
685
|
|
|
654
|
-
def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None):
|
|
686
|
+
def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False):
|
|
655
687
|
'''
|
|
656
688
|
runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data
|
|
657
689
|
|
|
@@ -674,8 +706,8 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
|
|
|
674
706
|
--normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since entropy quantifies the uncertainty of a probability distribution, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
|
|
675
707
|
--n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
|
|
676
708
|
--print_id_results: Flag that prints identification results if True. Default: False
|
|
677
|
-
--output_identification: Output CSV file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.
|
|
678
|
-
--output_similarity_scores: Output CSV file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this CSV file is written to the current working directory with filename \'output_all_similarity_scores\'.
|
|
709
|
+
--output_identification: Output tab-delimited text file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
|
|
710
|
+
--output_similarity_scores: Output tab-delimited text file containing similarity scores between all query spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum identifier, and the remaining columns contain the similarity scores with respect to all reference library spectra. If no argument is passed, then this file is written to the current working directory with filename \'output_all_similarity_scores.txt\'.
|
|
679
711
|
'''
|
|
680
712
|
|
|
681
713
|
# load query and reference libraries
|
|
@@ -774,11 +806,11 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
|
|
|
774
806
|
sys.exit()
|
|
775
807
|
|
|
776
808
|
if output_identification is None:
|
|
777
|
-
output_identification = f'{Path.cwd()}/output_identification.
|
|
809
|
+
output_identification = f'{Path.cwd()}/output_identification.txt'
|
|
778
810
|
print(f'Warning: writing identification output to {output_identification}')
|
|
779
811
|
|
|
780
812
|
if output_similarity_scores is None:
|
|
781
|
-
output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.
|
|
813
|
+
output_similarity_scores = f'{Path.cwd()}/output_all_similarity_scores.txt'
|
|
782
814
|
print(f'Warning: writing similarity scores to {output_similarity_scores}')
|
|
783
815
|
|
|
784
816
|
|
|
@@ -886,11 +918,15 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
|
|
|
886
918
|
if print_id_results == True:
|
|
887
919
|
print(df_top_ref_specs.to_string())
|
|
888
920
|
|
|
889
|
-
# write spectral library matching results to disk
|
|
890
|
-
df_top_ref_specs.to_csv(output_identification)
|
|
891
|
-
|
|
892
|
-
# write all similarity scores to disk
|
|
893
921
|
df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
|
|
894
|
-
df_scores.to_csv(output_similarity_scores)
|
|
895
922
|
|
|
923
|
+
if return_ID_output is False:
|
|
924
|
+
# write spectral library matching results to disk
|
|
925
|
+
df_top_ref_specs.to_csv(output_identification, sep='\t')
|
|
926
|
+
|
|
927
|
+
# write all similarity scores to disk
|
|
928
|
+
df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
|
|
929
|
+
df_scores.to_csv(output_similarity_scores, sep='\t')
|
|
930
|
+
else:
|
|
931
|
+
return df_top_ref_specs
|
|
896
932
|
|
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
# this script performs spectral library matching to identify unknown query compound(s) from GC-MS data
|
|
3
3
|
|
|
4
4
|
# load libraries
|
|
5
|
-
from
|
|
6
|
-
from
|
|
5
|
+
from pycompound.spec_lib_matching import run_spec_lib_matching_on_HRMS_data
|
|
6
|
+
from pycompound.spec_lib_matching import run_spec_lib_matching_on_NRMS_data
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
import pandas as pd
|
|
9
9
|
import argparse
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
|
|
2
|
-
from
|
|
3
|
-
from
|
|
2
|
+
from pycompound.spec_lib_matching import tune_params_on_HRMS_data
|
|
3
|
+
from pycompound.spec_lib_matching import tune_params_on_NRMS_data
|
|
4
4
|
import argparse
|
|
5
5
|
import json
|
|
6
6
|
from pathlib import Path
|
|
@@ -40,8 +40,7 @@ else:
|
|
|
40
40
|
sys.exit()
|
|
41
41
|
|
|
42
42
|
|
|
43
|
-
grid = {'similarity_measure':args.similarity_measure.split(','), 'weight':args.weights, 'spectrum_preprocessing_order':spectrum_preprocessing_order.split(','), 'mz_min':args.mz_min.split(','), 'mz_max':args.mz_max.split(','), 'int_min':args.int_min.split(','), 'int_max':args.int_max.split(','), 'window_size_centroiding':args.window_size_centroiding.split(','), 'window_size_matching':args.window_size_matching.split(','), 'noise_threshold':args.noise_threshold.split(','), 'wf_mz':args.wf_mz.split(','), 'wf_int':args.wf_intensity.split(','), 'LET_threshold':args.LET_threshold.split(','), 'entropy_dimension':args.entropy_dimension.split(','), 'high_quality_reference_library':args.high_quality_reference_library.split(',')}
|
|
44
|
-
|
|
43
|
+
grid = {'similarity_measure':args.similarity_measure.split(','), 'weight':[args.weights], 'spectrum_preprocessing_order':spectrum_preprocessing_order.split(','), 'mz_min':args.mz_min.split(','), 'mz_max':args.mz_max.split(','), 'int_min':args.int_min.split(','), 'int_max':args.int_max.split(','), 'window_size_centroiding':args.window_size_centroiding.split(','), 'window_size_matching':args.window_size_matching.split(','), 'noise_threshold':args.noise_threshold.split(','), 'wf_mz':args.wf_mz.split(','), 'wf_int':args.wf_intensity.split(','), 'LET_threshold':args.LET_threshold.split(','), 'entropy_dimension':args.entropy_dimension.split(','), 'high_quality_reference_library':args.high_quality_reference_library.split(',')}
|
|
45
44
|
|
|
46
45
|
if args.chromatography_platform == 'HRMS':
|
|
47
46
|
grid['mz_min'] = [float(x) for x in grid['mz_min']]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycompound
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Python package to perform compound identification in mass spectrometry via spectral library matching.
|
|
5
5
|
Author-email: Hunter Dlugas <fy7392@wayne.edu>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -19,6 +19,7 @@ Requires-Dist: pyteomics==4.7.2
|
|
|
19
19
|
Requires-Dist: netCDF4==1.6.5
|
|
20
20
|
Requires-Dist: lxml>=5.1.0
|
|
21
21
|
Requires-Dist: orjson==3.11.0
|
|
22
|
+
Requires-Dist: shiny==1.4.0
|
|
22
23
|
Requires-Dist: joblib==1.5.2
|
|
23
24
|
Dynamic: license-file
|
|
24
25
|
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
app.py,sha256=k5mPyctA1eWkGjtnKrJb7STuweh_aH4HmPUH07jO92Y,53841
|
|
2
|
+
pycompound/build_library.py,sha256=8ghpX8wfj6u-3V5X2IdJ-e8G_FRSla1lO0pzLj7hOtI,5373
|
|
3
|
+
pycompound/plot_spectra.py,sha256=Q7nDSW3Y5pR_Ql4JeEmyd6KRRyzvxk9j0yaUR0hfjJc,42275
|
|
4
|
+
pycompound/plot_spectra_CLI.py,sha256=ObaLad5Z5DmfQB-j0HSCg1mLORbYj2BM3hb5Yd0ZdDI,8395
|
|
5
|
+
pycompound/processing.py,sha256=vqtKaZ6vot6wlnKNTYUQFX7ccPpnCAl0L6bN289vZoM,11068
|
|
6
|
+
pycompound/similarity_measures.py,sha256=TuvtEXWwyxE6dfpmuAqRC6gOHvHg3Jf21099pVaNBAs,10702
|
|
7
|
+
pycompound/spec_lib_matching.py,sha256=AAMxWqi6LXWo-tJ-uqJ4QxfHSg8bX3G_DJVt2bLLMcM,61860
|
|
8
|
+
pycompound/spec_lib_matching_CLI.py,sha256=EdXM0dRQfwGQAK4OKxhcVytuUnX9pRyJROwC6rloZ9s,9915
|
|
9
|
+
pycompound/tuning_CLI.py,sha256=dSFLwMiI0_6G4YDZR5ubqn9-75ixOvDPZMOoGS-_B6w,8540
|
|
10
|
+
pycompound-0.1.1.dist-info/licenses/LICENSE,sha256=fPFFlkSGg60VQWyWqTSv8yoJnpLzppzdihVWY5NKom8,1064
|
|
11
|
+
pycompound-0.1.1.dist-info/METADATA,sha256=XZtkvSau_Z723iCgy_LTR1CkYryDxXBdIFtb_D_E9u0,1732
|
|
12
|
+
pycompound-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
+
pycompound-0.1.1.dist-info/top_level.txt,sha256=wFBLVrqpC07HghIU8tsEdgdvgkdOE3GN_1Gfjk-uEUc,15
|
|
14
|
+
pycompound-0.1.1.dist-info/RECORD,,
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
app.py,sha256=PKiCJe_18EJIHvs0R7pl_Yf-XakZn5J0AAfI-AnGsX0,21535
|
|
2
|
-
pycompound-0.0.55.dist-info/licenses/LICENSE,sha256=fPFFlkSGg60VQWyWqTSv8yoJnpLzppzdihVWY5NKom8,1064
|
|
3
|
-
pycompound_fy7392/build_library.py,sha256=8ghpX8wfj6u-3V5X2IdJ-e8G_FRSla1lO0pzLj7hOtI,5373
|
|
4
|
-
pycompound_fy7392/plot_spectra.py,sha256=wOnf2oOAfifj7FYkTZAcIeD7dHW1aRHzmsspPpySDcY,42023
|
|
5
|
-
pycompound_fy7392/plot_spectra_CLI.py,sha256=fo0nUmbuy2qE6d9HgVdASn2CNUG8seg2mUCPrUU-rao,8409
|
|
6
|
-
pycompound_fy7392/processing.py,sha256=7cKMX7PQ4Q-I4c8lRo5qXbOVGr8CeRdgNPURJx8DBV0,11075
|
|
7
|
-
pycompound_fy7392/pycompound_shiny.py,sha256=uYfeIuR5j1UK_KE8RbDPaQxqMIU1qykVJ2L-zgaSkY0,30154
|
|
8
|
-
pycompound_fy7392/similarity_measures.py,sha256=TuvtEXWwyxE6dfpmuAqRC6gOHvHg3Jf21099pVaNBAs,10702
|
|
9
|
-
pycompound_fy7392/spec_lib_matching.py,sha256=jtUpG5OBDtIaHIpCNc62a3y-wQ_SmIgXZ9Q_p8xKZu4,59969
|
|
10
|
-
pycompound_fy7392/spec_lib_matching_CLI.py,sha256=TAafJ3DGPorBTDzmXLQaaSH3giKn6q3GrRJPWh03yyo,9929
|
|
11
|
-
pycompound_fy7392/tuning_CLI.py,sha256=qLglxqq-y6EXCDk0P3CkWn6cTFCmWDeKz0-SZBXcwCA,8553
|
|
12
|
-
pycompound-0.0.55.dist-info/METADATA,sha256=3i67ba8TVHHSK-toc2-OI9XJYdQRkrCKGXOrqHyV5e4,1705
|
|
13
|
-
pycompound-0.0.55.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
14
|
-
pycompound-0.0.55.dist-info/top_level.txt,sha256=h_c9lBkHcABTURy4sDAmgRzZdFHYWX9MDdsaiftT-Yw,22
|
|
15
|
-
pycompound-0.0.55.dist-info/RECORD,,
|