pycompound 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,18 +6,19 @@ from pathlib import Path
6
6
  from pyteomics import mgf
7
7
  from pyteomics import mzml
8
8
  import sys
9
+ import json
9
10
 
10
11
  def build_library_from_raw_data(input_path=None, output_path=None, is_reference=False):
11
12
  '''
12
- Converts mgf, mzML, cdf, or msp file to the necessary format for spectral library matching.
13
+ Converts mgf, mzML, cdf, json, or msp file to the necessary format for spectral library matching.
13
14
 
14
- --input_path: Path to input file (must be mgf, mzML, cdf, or msp file). Mandatory argument.
15
+ --input_path: Path to input file (must be mgf, mzML, cdf, json, or msp file). Mandatory argument.
15
16
  --output_path: Path to output TXT file. Default: current working directory.
16
17
  --is_reference: Boolean flag indicating whether IDs of spectra should be written to output. Only pass true if building a reference library with known compound IDs. Only applicable to mgf and msp files. Options: \'True\', \'False\'. Optional argument. Default: False.
17
18
  '''
18
19
 
19
20
  if input_path is None:
20
- print('Error: please specify input_path (i.e. the path to the input mgf, mzML, cdf, or msp file). Mandatory argument.')
21
+ print('Error: please specify input_path (i.e. the path to the input mgf, mzML, cdf, json, or msp file). Mandatory argument.')
21
22
  sys.exit()
22
23
 
23
24
  if output_path is None:
@@ -37,18 +38,21 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
37
38
  input_file_type = 'mgf'
38
39
  elif last_four_chars == 'mzML' or last_four_chars == 'mzml' or last_four_chars == 'MZML':
39
40
  input_file_type = 'mzML'
41
+ elif last_four_chars == 'json' or last_four_chars == 'JSON':
42
+ input_file_type = 'json'
40
43
  elif last_three_chars == 'cdf' or last_three_chars == 'CDF':
41
44
  input_file_type = 'cdf'
42
45
  elif last_three_chars == 'msp' or last_three_chars == 'MSP':
43
46
  input_file_type = 'msp'
44
47
  else:
45
- print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', or \'msp\' file must be passed to --input_path')
48
+ print('ERROR: either an \'mgf\', \'mzML\', \'cdf\', \'json\', or \'msp\' file must be passed to --input_path')
46
49
  sys.exit()
47
50
 
48
51
 
49
52
  spectra = []
50
53
  if input_file_type == 'mgf':
51
- with mgf.read(input_path, index_by_scans = True) as reader:
54
+ #with mgf.read(input_path, index_by_scans = True) as reader:
55
+ with mgf.read(input_path, use_index=False) as reader:
52
56
  for spec in reader:
53
57
  spectra.append(spec)
54
58
  if input_file_type == 'mzML':
@@ -61,18 +65,24 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
61
65
  ids = []
62
66
  mzs = []
63
67
  ints = []
68
+ precursor_ion_mzs = []
64
69
  for i in range(0,len(spectra)):
65
70
  for j in range(0,len(spectra[i]['m/z array'])):
66
71
  if input_file_type == 'mzML':
67
- ids.append(f'ID_{i+1}')
68
- else:
69
72
  if is_reference == False:
70
73
  ids.append(f'ID_{i+1}')
71
- elif is_reference == True:
74
+ else:
75
+ ids.append(spectra[i]['id'])
76
+ elif input_file_type == 'mgf':
77
+ precursor_ion_mzs.append(spectra[i]['params']['pepmass'][0])
78
+ if is_reference == False:
79
+ ids.append(f'ID_{i+1}')
80
+ else:
72
81
  ids.append(spectra[i]['params']['name'])
73
82
  mzs.append(spectra[i]['m/z array'][j])
74
83
  ints.append(spectra[i]['intensity array'][j])
75
84
 
85
+
76
86
  if input_file_type == 'cdf':
77
87
  dataset = nc.Dataset(input_path, 'r')
78
88
  all_mzs = dataset.variables['mass_values'][:]
@@ -98,31 +108,78 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
98
108
  ints.append(ints_tmp[j])
99
109
 
100
110
 
101
- if input_file_type == 'msp':
111
+
112
+ if input_file_type == "msp":
102
113
  ids = []
103
114
  mzs = []
104
115
  ints = []
105
- with open(input_path, 'r') as f:
116
+ precursor_ion_mzs = []
117
+ spectrum_id = None
118
+ precursor_ion_mz = None
119
+ with open(input_path, "r", encoding="utf-8", errors="ignore") as f:
106
120
  i = 0
107
121
  for line in f:
108
122
  line = line.strip()
109
- if line.startswith('Name:'):
123
+ if not line:
124
+ continue
125
+
126
+ if line.startswith("Name:"):
110
127
  i += 1
111
- if is_reference == False:
112
- spectrum_id = f'ID_{i+1}'
113
- elif is_reference == True:
114
- spectrum_id = line.replace('Name: ','')
115
- elif line and line[0].isdigit():
128
+ if not is_reference:
129
+ spectrum_id = f"ID_{i}"
130
+ else:
131
+ spectrum_id = line.replace("Name:", "", 1).strip()
132
+
133
+ elif line.startswith("PrecursorMZ:"):
134
+ try:
135
+ precursor_ion_mz = float(line.replace("PrecursorMZ:", "", 1).strip())
136
+ except ValueError:
137
+ precursor_ion_mz = None
138
+
139
+ elif line[0].isdigit():
116
140
  try:
117
141
  mz, intensity = map(float, line.split()[:2])
118
- ids.append(spectrum_id)
119
- mzs.append(mz)
120
- ints.append(intensity)
121
142
  except ValueError:
122
143
  continue
123
144
 
145
+ if spectrum_id is None:
146
+ continue
147
+
148
+ ids.append(spectrum_id)
149
+ mzs.append(mz)
150
+ ints.append(intensity)
151
+ precursor_ion_mzs.append(precursor_ion_mz)
152
+
153
+
154
+
155
+ if input_file_type == 'json':
156
+ data = json.load(open(input_path))
157
+ ids = []
158
+ mzs = []
159
+ ints = []
160
+ precursor_ion_mzs = []
161
+ for i in range(0,len(data)):
162
+ spec_ID_tmp = data[i]['spectrum_id']
163
+ tmp = data[i]['peaks_json']
164
+ tmp = tmp[1:-1].split(",")
165
+ tmp = [a.replace("[","") for a in tmp]
166
+ tmp = [a.replace("]","") for a in tmp]
167
+ mzs_tmp = tmp[0::2]
168
+ ints_tmp = tmp[1::2]
169
+ if is_reference == False:
170
+ ids.extend([f'ID_{i+1}'] * len(mzs_tmp))
171
+ elif is_reference == True:
172
+ ids.extend([spec_ID_tmp] * len(mzs_tmp))
173
+ mzs.extend(mzs_tmp)
174
+ ints.extend(ints_tmp)
175
+ precursor_ion_mzs.extend([data[i]['Precursor_MZ']] * len(mzs_tmp))
176
+
177
+
178
+ if len(precursor_ion_mzs) > 0:
179
+ df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints, 'precursor_ion_mz':precursor_ion_mzs})
180
+ else:
181
+ df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
124
182
 
125
- df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
126
183
  df.to_csv(output_path, index=False, sep='\t')
127
184
 
128
185
 
@@ -315,7 +315,7 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
315
315
  plt.savefig(output_path, format='pdf')
316
316
 
317
317
  if return_plot == True:
318
- return plt
318
+ return fig
319
319
 
320
320
 
321
321
 
pycompound/processing.py CHANGED
@@ -295,13 +295,13 @@ def get_reference_df(reference_data, likely_reference_IDs=None):
295
295
  extension = reference_data.rsplit('.',1)
296
296
  extension = extension[(len(extension)-1)]
297
297
  if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
298
- output_path_tmp = reference_data[:-3] + 'csv'
298
+ output_path_tmp = reference_data[:-3] + 'txt'
299
299
  build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
300
- df_reference = pd.read_csv(output_path_tmp)
301
- if extension == 'csv' or extension == 'CSV':
302
- df_reference = pd.read_csv(reference_data)
300
+ df_reference = pd.read_csv(output_path_tmp, sep='\t')
301
+ if extension == 'txt' or extension == 'TXT':
302
+ df_reference = pd.read_csv(reference_data, sep='\t')
303
303
  if likely_reference_IDs is not None:
304
- likely_reference_IDs = pd.read_csv(likely_reference_IDs, header=None)
304
+ likely_reference_IDs = pd.read_csv(likely_reference_IDs, header=None, sep='\t')
305
305
  df_reference = df_reference.loc[df_reference.iloc[:,0].isin(likely_reference_IDs.iloc[:,0].tolist())]
306
306
  return df_reference
307
307