pycompound 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,4 @@
1
1
 
2
- # this script has a function to extract the mass spectra from an mgf, mzML, or cdf file and write them in the necessary format for use in spectral library matching
3
-
4
2
  import netCDF4 as nc
5
3
  import numpy as np
6
4
  import pandas as pd
@@ -14,7 +12,7 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
14
12
  Converts mgf, mzML, cdf, or msp file to the necessary format for spectral library matching.
15
13
 
16
14
  --input_path: Path to input file (must be mgf, mzML, cdf, or msp file). Mandatory argument.
17
- --output_path: Path to output CSV file. Default: current working directory.
15
+ --output_path: Path to output TXT file. Default: current working directory.
18
16
  --is_reference: Boolean flag indicating whether IDs of spectra should be written to output. Only pass true if building a reference library with known compound IDs. Only applicable to mgf and msp files. Options: \'True\', \'False\'. Optional argument. Default: False.
19
17
  '''
20
18
 
@@ -23,7 +21,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
23
21
  sys.exit()
24
22
 
25
23
  if output_path is None:
26
- #print('Warning: no output_path specified, so library is written to {Path.cwd()}/build_library.csv')
27
24
  tmp = input_path.split('/')
28
25
  tmp = tmp[(len(tmp)-1)]
29
26
  basename = tmp.split('.')[0]
@@ -34,7 +31,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
34
31
  print('Error: is_reference must be either \'True\' or \'False\'.')
35
32
  sys.exit()
36
33
 
37
- # determine whether an mgf or a mzML file was passed to --input_path
38
34
  last_three_chars = input_path[(len(input_path)-3):len(input_path)]
39
35
  last_four_chars = input_path[(len(input_path)-4):len(input_path)]
40
36
  if last_three_chars == 'mgf' or last_three_chars == 'MGF':
@@ -50,7 +46,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
50
46
  sys.exit()
51
47
 
52
48
 
53
- # obtain a list of spectra from the input file
54
49
  spectra = []
55
50
  if input_file_type == 'mgf':
56
51
  with mgf.read(input_path, index_by_scans = True) as reader:
@@ -62,7 +57,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
62
57
  spectra.append(spec)
63
58
 
64
59
 
65
- # extract the relevant information from each spectra (i.e m/z ratios and intensities)
66
60
  if input_file_type == 'mgf' or input_file_type == 'mzML':
67
61
  ids = []
68
62
  mzs = []
@@ -128,8 +122,7 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
128
122
  continue
129
123
 
130
124
 
131
- # write CSV file of spectra for use in spectral library matching
132
125
  df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
133
- df.to_csv(output_path, index=False)
126
+ df.to_csv(output_path, index=False, sep='\t')
134
127
 
135
128
 
@@ -1,6 +1,4 @@
1
1
 
2
- # this script's functions plot a given query spectrum against a given reference spectrum before and after spectrum preprocessing transformations
3
-
4
2
  from .processing import *
5
3
  from .similarity_measures import *
6
4
  import pandas as pd
@@ -36,7 +34,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
36
34
  --output_path: path to output PDF file containing the plots of the spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.
37
35
  '''
38
36
 
39
- # load query and reference libraries
40
37
  if query_data is None:
41
38
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
42
39
  sys.exit()
@@ -68,7 +65,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
68
65
  unique_reference_ids = [str(tmp) for tmp in unique_reference_ids]
69
66
 
70
67
 
71
- ##### process input parameters and ensure they are in a valid format #####
72
68
  if spectrum_ID1 is not None:
73
69
  spectrum_ID1 = str(spectrum_ID1)
74
70
  else:
@@ -190,7 +186,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
190
186
  q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
191
187
  r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
192
188
 
193
- # apply transformation to y-axis if relevant
194
189
  if y_axis_transformation == 'normalized':
195
190
  q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
196
191
  r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
@@ -206,10 +201,8 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
206
201
  else:
207
202
  ylab = 'Raw Intensity'
208
203
 
209
- # create the figure
210
204
  fig, axes = plt.subplots(nrows=2, ncols=1)
211
205
 
212
- # plot the untransformed spectra
213
206
  plt.subplot(2,1,1)
214
207
  plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*q_spec_pre_trans.shape[0], ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID 1: {spectrum_ID1}')
215
208
  plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*r_spec_pre_trans.shape[0], ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID 2: {spectrum_ID2}')
@@ -219,7 +212,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
219
212
  plt.yticks(fontsize=7)
220
213
  plt.title('Untransformed Spectra', fontsize=10)
221
214
 
222
- # get the ranges of m/z and intensity values to display at the bottom of the two plots
223
215
  mz_min_tmp_q = round(q_spec[:,0].min(),1)
224
216
  mz_min_tmp_r = round(r_spec[:,0].min(),1)
225
217
  int_min_tmp_q = round(q_spec[:,1].min(),1)
@@ -233,51 +225,45 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
233
225
  int_min_tmp = min([int_min_tmp_q,int_min_tmp_r])
234
226
  int_max_tmp = max([int_max_tmp_q,int_max_tmp_r])
235
227
 
236
- # perform the spectrum preprocessing transformations in the order specified
237
228
  is_matched = False
238
229
  for transformation in spectrum_preprocessing_order:
239
- if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # centroiding
230
+ if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
240
231
  q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
241
232
  r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
242
- if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # matching
233
+ if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
243
234
  m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
244
235
  q_spec = m_spec[:,0:2]
245
236
  r_spec = m_spec[:,[0,2]]
246
237
  is_matched = True
247
- if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # weight factor transformation
238
+ if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
248
239
  q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
249
240
  r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
250
- if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # low-entropy transformation
241
+ if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
251
242
  q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
252
243
  r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
253
- if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # noise removal
244
+ if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
254
245
  q_spec = remove_noise(q_spec, nr = noise_threshold)
255
246
  r_spec = remove_noise(r_spec, nr = noise_threshold)
256
- if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # filtering
247
+ if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
257
248
  q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
258
249
  r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
259
250
 
260
- # intensities of query and reference library
261
251
  q_ints = q_spec[:,1]
262
252
  r_ints = r_spec[:,1]
263
253
 
264
- # if there is at least one non-zero intensity ion fragment in either spectra, compute their similarity
265
254
  if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
266
255
  similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
267
256
  else:
268
257
  similarity_score = 0
269
258
 
270
- # plot the transformed spectra
271
259
  plt.subplot(2,1,2)
272
260
 
273
- # display warning message if either spectra are empty or have no non-zero intensity ion fragments
274
261
  if q_spec.shape[0] > 1:
275
262
  if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
276
263
  plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
277
264
  plt.xticks([])
278
265
  plt.yticks([])
279
266
  else:
280
- # apply transformation to y-axis if relevant
281
267
  if y_axis_transformation == 'normalized':
282
268
  q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
283
269
  r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
@@ -352,7 +338,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
352
338
  --output_path: path to output PDF file containing the plots of the spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.
353
339
  '''
354
340
 
355
- # load query and reference libraries
356
341
  if query_data is None:
357
342
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
358
343
  sys.exit()
@@ -382,7 +367,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
382
367
  unique_reference_ids = df_reference.iloc[:,0].unique()
383
368
 
384
369
 
385
- ##### process input parameters and ensure they are in a valid format #####
386
370
  if spectrum_ID1 is not None:
387
371
  spectrum_ID1 = str(spectrum_ID1)
388
372
  else:
@@ -457,12 +441,10 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
457
441
  print(f'Warning: plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.')
458
442
  output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.pdf'
459
443
 
460
- # get m/z values
461
444
  min_mz = np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])])
462
445
  max_mz = np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])])
463
446
  mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
464
447
 
465
- # get unique query/reference library IDs; each query/reference ID corresponds to exactly one query/reference mass spectrum
466
448
  unique_query_ids = df_query.iloc[:,0].unique().tolist()
467
449
  unique_reference_ids = df_reference.iloc[:,0].unique().tolist()
468
450
  unique_query_ids = [str(ID) for ID in unique_query_ids]
@@ -494,7 +476,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
494
476
  q_spec = convert_spec(q_spec,mzs)
495
477
  r_spec = convert_spec(r_spec,mzs)
496
478
 
497
- # get the ranges of m/z and intensity values to display at the bottom of the two plots
498
479
  int_min_tmp_q = min(q_spec[q_spec[:,1].nonzero(),1][0])
499
480
  int_min_tmp_r = min(r_spec[r_spec[:,1].nonzero(),1][0])
500
481
  int_max_tmp_q = max(q_spec[q_spec[:,1].nonzero(),1][0])
@@ -502,13 +483,10 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
502
483
  int_min_tmp = int(min([int_min_tmp_q,int_min_tmp_r]))
503
484
  int_max_tmp = int(max([int_max_tmp_q,int_max_tmp_r]))
504
485
 
505
- # create the figure
506
486
  fig, axes = plt.subplots(nrows=2, ncols=1)
507
487
 
508
- # plot the untransformed spectra
509
488
  plt.subplot(2,1,1)
510
489
 
511
- # display warning message if either spectra have no non-zero ion fragments
512
490
  if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
513
491
  plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
514
492
  plt.xticks([])
@@ -519,7 +497,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
519
497
  q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
520
498
  r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
521
499
 
522
- # apply transformation to y-axis if relevant
523
500
  if y_axis_transformation == 'normalized':
524
501
  q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
525
502
  r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
@@ -543,32 +520,29 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
543
520
  plt.title('Untransformed Query and Reference Spectra', fontsize=10)
544
521
 
545
522
  for transformation in spectrum_preprocessing_order:
546
- if transformation == 'W': # weight factor transformation
523
+ if transformation == 'W':
547
524
  q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
548
525
  r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
549
- if transformation == 'L': # low-entropy transformation
526
+ if transformation == 'L':
550
527
  q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method)
551
528
  r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method)
552
- if transformation == 'N': # noise removal
529
+ if transformation == 'N':
553
530
  q_spec = remove_noise(q_spec, nr = noise_threshold)
554
531
  if high_quality_reference_library == False:
555
532
  r_spec = remove_noise(r_spec, nr = noise_threshold)
556
- if transformation == 'F': # filtering with respect to mz and/or intensity
533
+ if transformation == 'F':
557
534
  q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
558
535
  if high_quality_reference_library == False:
559
536
  r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
560
537
 
561
- # compute similarity score; if the spectra contain at most one point, their similarity is considered to be 0
562
538
  if q_spec.shape[0] > 1:
563
539
  similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
564
540
  else:
565
541
  similarity_score = 0
566
542
 
567
543
 
568
- # plot the transformed spectra
569
544
  plt.subplot(2,1,2)
570
545
 
571
- # display warning message if either spectra are empty or have no non-zero intensity ion fragments
572
546
  if q_spec.shape[0] == 0 or r_spec.shape[0] == 0:
573
547
  plt.text(0.5, 0.5, 'The query and/or reference spectrum has no ion fragments left after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
574
548
  plt.xticks([])
@@ -578,7 +552,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
578
552
  plt.xticks([])
579
553
  plt.yticks([])
580
554
  else:
581
- # apply transformation to y-axis if relevant
582
555
  if y_axis_transformation == 'normalized':
583
556
  q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
584
557
  r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
@@ -602,7 +575,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
602
575
  plt.title(f'Transformed Query and Reference Spectra', fontsize=10)
603
576
 
604
577
 
605
- #plt.subplots_adjust(top = 0.8, hspace = 0.7)
606
578
  plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
607
579
  plt.figlegend(loc = 'upper center')
608
580
  fig.text(0.05, 0.15, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)
pycompound/processing.py CHANGED
@@ -1,6 +1,4 @@
1
1
 
2
- # This script contains the functions used to transform spectra prior to computing similarity scores
3
-
4
2
  from pycompound.build_library import build_library_from_raw_data
5
3
  import scipy.stats
6
4
  import numpy as np
@@ -165,7 +163,6 @@ def centroid_spectrum(spec, window_size):
165
163
 
166
164
  spec = spec[np.argsort(spec[:,0])]
167
165
 
168
- #Fast check is the spectrum needs centroiding
169
166
  mz_array = spec[:, 0]
170
167
  need_centroid = 0
171
168
  if mz_array.shape[0] > 1:
@@ -180,7 +177,6 @@ def centroid_spectrum(spec, window_size):
180
177
  mz_delta_allowed = window_size
181
178
 
182
179
  if spec[i, 1] > 0:
183
- #Find left bound for current peak
184
180
  i_left = i - 1
185
181
  while i_left >= 0:
186
182
  mz_delta_left = spec[i, 0] - spec[i_left, 0]
@@ -190,7 +186,6 @@ def centroid_spectrum(spec, window_size):
190
186
  break
191
187
  i_left += 1
192
188
 
193
- #Find right bound for current peak
194
189
  i_right = i + 1
195
190
  while i_right < spec.shape[0]:
196
191
  mz_delta_right = spec[i_right, 0] - spec[i, 0]
@@ -199,7 +194,6 @@ def centroid_spectrum(spec, window_size):
199
194
  else:
200
195
  break
201
196
 
202
- #Merge those peaks
203
197
  intensity_sum = np.sum(spec[i_left:i_right, 1])
204
198
  intensity_weighted_sum = np.sum(spec[i_left:i_right, 0] * spec[i_left:i_right, 1])
205
199
 
@@ -246,16 +240,13 @@ def match_peaks_in_spectra(spec_a, spec_b, window_size):
246
240
  mass_delta = spec_a[a, 0] - spec_b[b, 0]
247
241
 
248
242
  if mass_delta < -window_size:
249
- # Peak only existed in spec a.
250
243
  spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
251
244
  peak_b_int = 0.
252
245
  a += 1
253
246
  elif mass_delta > window_size:
254
- # Peak only existed in spec b.
255
247
  spec_merged.append([spec_b[b, 0], 0., spec_b[b, 1]])
256
248
  b += 1
257
249
  else:
258
- # Peak existed in both spec.
259
250
  peak_b_int += spec_b[b, 1]
260
251
  b += 1
261
252
 
@@ -10,7 +10,6 @@ import sys
10
10
 
11
11
 
12
12
  def S_cos(ints_a, ints_b):
13
- # Cosine Similarity Measure
14
13
  if np.sum(ints_a) == 0 or np.sum(ints_b) == 0:
15
14
  return(0)
16
15
  else:
@@ -18,12 +17,10 @@ def S_cos(ints_a, ints_b):
18
17
 
19
18
 
20
19
  def ent_renyi(ints, q):
21
- # Computes the Renyi entropy of a probability distribution for a given positive entropy dimension q
22
20
  return np.log(sum(np.power(ints,q))) / (1-q)
23
21
 
24
22
 
25
23
  def ent_tsallis(ints, q):
26
- # Computes the Tsallis entropy of a probability distribution for a given positive entropy dimension q
27
24
  return (sum(np.power(ints,q))-1) / (1-q)
28
25
 
29
26