pycompound 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +2589 -237
- pycompound/build_library.py +77 -20
- pycompound/plot_spectra.py +1 -1
- pycompound/processing.py +5 -5
- pycompound/spec_lib_matching.py +245 -471
- pycompound/spec_lib_matching_CLI.py +48 -2
- pycompound/tuning_CLI_DE.py +22 -22
- pycompound/tuning_CLI_grid.py +22 -6
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/METADATA +1 -1
- pycompound-0.1.7.dist-info/RECORD +15 -0
- pycompound-0.1.6.dist-info/RECORD +0 -15
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/WHEEL +0 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/top_level.txt +0 -0
pycompound/build_library.py
CHANGED
|
@@ -6,18 +6,19 @@ from pathlib import Path
|
|
|
6
6
|
from pyteomics import mgf
|
|
7
7
|
from pyteomics import mzml
|
|
8
8
|
import sys
|
|
9
|
+
import json
|
|
9
10
|
|
|
10
11
|
def build_library_from_raw_data(input_path=None, output_path=None, is_reference=False):
|
|
11
12
|
'''
|
|
12
|
-
Converts mgf, mzML, cdf, or msp file to the necessary format for spectral library matching.
|
|
13
|
+
Converts mgf, mzML, cdf, json, or msp file to the necessary format for spectral library matching.
|
|
13
14
|
|
|
14
|
-
--input_path: Path to input file (must be mgf, mzML, cdf, or msp file). Mandatory argument.
|
|
15
|
+
--input_path: Path to input file (must be mgf, mzML, cdf, json, or msp file). Mandatory argument.
|
|
15
16
|
--output_path: Path to output TXT file. Default: current working directory.
|
|
16
17
|
--is_reference: Boolean flag indicating whether IDs of spectra should be written to output. Only pass true if building a reference library with known compound IDs. Only applicable to mgf and msp files. Options: \'True\', \'False\'. Optional argument. Default: False.
|
|
17
18
|
'''
|
|
18
19
|
|
|
19
20
|
if input_path is None:
|
|
20
|
-
print('Error: please specify input_path (i.e. the path to the input mgf, mzML, cdf, or msp file). Mandatory argument.')
|
|
21
|
+
print('Error: please specify input_path (i.e. the path to the input mgf, mzML, cdf, json, or msp file). Mandatory argument.')
|
|
21
22
|
sys.exit()
|
|
22
23
|
|
|
23
24
|
if output_path is None:
|
|
@@ -37,18 +38,21 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
37
38
|
input_file_type = 'mgf'
|
|
38
39
|
elif last_four_chars == 'mzML' or last_four_chars == 'mzml' or last_four_chars == 'MZML':
|
|
39
40
|
input_file_type = 'mzML'
|
|
41
|
+
elif last_four_chars == 'json' or last_four_chars == 'JSON':
|
|
42
|
+
input_file_type = 'json'
|
|
40
43
|
elif last_three_chars == 'cdf' or last_three_chars == 'CDF':
|
|
41
44
|
input_file_type = 'cdf'
|
|
42
45
|
elif last_three_chars == 'msp' or last_three_chars == 'MSP':
|
|
43
46
|
input_file_type = 'msp'
|
|
44
47
|
else:
|
|
45
|
-
print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', or \'msp\' file must be passed to --input_path')
|
|
48
|
+
print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'json\', or \'msp\' file must be passed to --input_path')
|
|
46
49
|
sys.exit()
|
|
47
50
|
|
|
48
51
|
|
|
49
52
|
spectra = []
|
|
50
53
|
if input_file_type == 'mgf':
|
|
51
|
-
with mgf.read(input_path, index_by_scans = True) as reader:
|
|
54
|
+
#with mgf.read(input_path, index_by_scans = True) as reader:
|
|
55
|
+
with mgf.read(input_path, use_index=False) as reader:
|
|
52
56
|
for spec in reader:
|
|
53
57
|
spectra.append(spec)
|
|
54
58
|
if input_file_type == 'mzML':
|
|
@@ -61,18 +65,24 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
61
65
|
ids = []
|
|
62
66
|
mzs = []
|
|
63
67
|
ints = []
|
|
68
|
+
precursor_ion_mzs = []
|
|
64
69
|
for i in range(0,len(spectra)):
|
|
65
70
|
for j in range(0,len(spectra[i]['m/z array'])):
|
|
66
71
|
if input_file_type == 'mzML':
|
|
67
|
-
ids.append(f'ID_{i+1}')
|
|
68
|
-
else:
|
|
69
72
|
if is_reference == False:
|
|
70
73
|
ids.append(f'ID_{i+1}')
|
|
71
|
-
|
|
74
|
+
else:
|
|
75
|
+
ids.append(spectra[i]['id'])
|
|
76
|
+
elif input_file_type == 'mgf':
|
|
77
|
+
precursor_ion_mzs.append(spectra[i]['params']['pepmass'][0])
|
|
78
|
+
if is_reference == False:
|
|
79
|
+
ids.append(f'ID_{i+1}')
|
|
80
|
+
else:
|
|
72
81
|
ids.append(spectra[i]['params']['name'])
|
|
73
82
|
mzs.append(spectra[i]['m/z array'][j])
|
|
74
83
|
ints.append(spectra[i]['intensity array'][j])
|
|
75
84
|
|
|
85
|
+
|
|
76
86
|
if input_file_type == 'cdf':
|
|
77
87
|
dataset = nc.Dataset(input_path, 'r')
|
|
78
88
|
all_mzs = dataset.variables['mass_values'][:]
|
|
@@ -98,31 +108,78 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
|
|
|
98
108
|
ints.append(ints_tmp[j])
|
|
99
109
|
|
|
100
110
|
|
|
101
|
-
|
|
111
|
+
|
|
112
|
+
if input_file_type == "msp":
|
|
102
113
|
ids = []
|
|
103
114
|
mzs = []
|
|
104
115
|
ints = []
|
|
105
|
-
|
|
116
|
+
precursor_ion_mzs = []
|
|
117
|
+
spectrum_id = None
|
|
118
|
+
precursor_ion_mz = None
|
|
119
|
+
with open(input_path, "r", encoding="utf-8", errors="ignore") as f:
|
|
106
120
|
i = 0
|
|
107
121
|
for line in f:
|
|
108
122
|
line = line.strip()
|
|
109
|
-
if line
|
|
123
|
+
if not line:
|
|
124
|
+
continue
|
|
125
|
+
|
|
126
|
+
if line.startswith("Name:"):
|
|
110
127
|
i += 1
|
|
111
|
-
if is_reference
|
|
112
|
-
spectrum_id = f
|
|
113
|
-
|
|
114
|
-
spectrum_id = line.replace(
|
|
115
|
-
|
|
128
|
+
if not is_reference:
|
|
129
|
+
spectrum_id = f"ID_{i}"
|
|
130
|
+
else:
|
|
131
|
+
spectrum_id = line.replace("Name:", "", 1).strip()
|
|
132
|
+
|
|
133
|
+
elif line.startswith("PrecursorMZ:"):
|
|
134
|
+
try:
|
|
135
|
+
precursor_ion_mz = float(line.replace("PrecursorMZ:", "", 1).strip())
|
|
136
|
+
except ValueError:
|
|
137
|
+
precursor_ion_mz = None
|
|
138
|
+
|
|
139
|
+
elif line[0].isdigit():
|
|
116
140
|
try:
|
|
117
141
|
mz, intensity = map(float, line.split()[:2])
|
|
118
|
-
ids.append(spectrum_id)
|
|
119
|
-
mzs.append(mz)
|
|
120
|
-
ints.append(intensity)
|
|
121
142
|
except ValueError:
|
|
122
143
|
continue
|
|
123
144
|
|
|
145
|
+
if spectrum_id is None:
|
|
146
|
+
continue
|
|
147
|
+
|
|
148
|
+
ids.append(spectrum_id)
|
|
149
|
+
mzs.append(mz)
|
|
150
|
+
ints.append(intensity)
|
|
151
|
+
precursor_ion_mzs.append(precursor_ion_mz)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
if input_file_type == 'json':
|
|
156
|
+
data = json.load(open(input_path))
|
|
157
|
+
ids = []
|
|
158
|
+
mzs = []
|
|
159
|
+
ints = []
|
|
160
|
+
precursor_ion_mzs = []
|
|
161
|
+
for i in range(0,len(data)):
|
|
162
|
+
spec_ID_tmp = data[i]['spectrum_id']
|
|
163
|
+
tmp = data[i]['peaks_json']
|
|
164
|
+
tmp = tmp[1:-1].split(",")
|
|
165
|
+
tmp = [a.replace("[","") for a in tmp]
|
|
166
|
+
tmp = [a.replace("]","") for a in tmp]
|
|
167
|
+
mzs_tmp = tmp[0::2]
|
|
168
|
+
ints_tmp = tmp[1::2]
|
|
169
|
+
if is_reference == False:
|
|
170
|
+
ids.extend([f'ID_{i+1}'] * len(mzs_tmp))
|
|
171
|
+
elif is_reference == True:
|
|
172
|
+
ids.extend([spec_ID_tmp] * len(mzs_tmp))
|
|
173
|
+
mzs.extend(mzs_tmp)
|
|
174
|
+
ints.extend(ints_tmp)
|
|
175
|
+
precursor_ion_mzs.extend([data[i]['Precursor_MZ']] * len(mzs_tmp))
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
if len(precursor_ion_mzs) > 0:
|
|
179
|
+
df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints, 'precursor_ion_mz':precursor_ion_mzs})
|
|
180
|
+
else:
|
|
181
|
+
df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
|
|
124
182
|
|
|
125
|
-
df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
|
|
126
183
|
df.to_csv(output_path, index=False, sep='\t')
|
|
127
184
|
|
|
128
185
|
|
pycompound/plot_spectra.py
CHANGED
pycompound/processing.py
CHANGED
|
@@ -295,13 +295,13 @@ def get_reference_df(reference_data, likely_reference_IDs=None):
|
|
|
295
295
|
extension = reference_data.rsplit('.',1)
|
|
296
296
|
extension = extension[(len(extension)-1)]
|
|
297
297
|
if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
|
|
298
|
-
output_path_tmp = reference_data[:-3] + '
|
|
298
|
+
output_path_tmp = reference_data[:-3] + 'txt'
|
|
299
299
|
build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
|
|
300
|
-
df_reference = pd.read_csv(output_path_tmp)
|
|
301
|
-
if extension == '
|
|
302
|
-
df_reference = pd.read_csv(reference_data)
|
|
300
|
+
df_reference = pd.read_csv(output_path_tmp, sep='\t')
|
|
301
|
+
if extension == 'txt' or extension == 'TXT':
|
|
302
|
+
df_reference = pd.read_csv(reference_data, sep='\t')
|
|
303
303
|
if likely_reference_IDs is not None:
|
|
304
|
-
likely_reference_IDs = pd.read_csv(likely_reference_IDs, header=None)
|
|
304
|
+
likely_reference_IDs = pd.read_csv(likely_reference_IDs, header=None, sep='\t')
|
|
305
305
|
df_reference = df_reference.loc[df_reference.iloc[:,0].isin(likely_reference_IDs.iloc[:,0].tolist())]
|
|
306
306
|
return df_reference
|
|
307
307
|
|