PyPI - pycompound - Versions diffs - 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl - Mend

pycompound 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

app.py +155 -194
pycompound/build_library.py +2 -9
pycompound/plot_spectra.py +10 -38
pycompound/processing.py +0 -9
pycompound/similarity_measures.py +0 -3
pycompound/spec_lib_matching.py +246 -81
pycompound/spec_lib_matching_CLI.py +2 -7
pycompound/tuning_CLI.py +1 -1
{pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/METADATA +1 -1
pycompound-0.1.2.dist-info/RECORD +14 -0
pycompound-0.1.1.dist-info/RECORD +0 -14
{pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/WHEEL +0 -0
{pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/licenses/LICENSE +0 -0
{pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/top_level.txt +0 -0

pycompound/build_library.py CHANGED Viewed

@@ -1,6 +1,4 @@
-# this script has a function to extract the mass spectra from an mgf, mzML, or cdf file and write them in the necessary format for use in spectral library matching
 import netCDF4 as nc
 import numpy as np
 import pandas as pd
@@ -14,7 +12,7 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
     Converts mgf, mzML, cdf, or msp file to the necessary format for spectral library matching.
     --input_path: Path to input file (must be mgf, mzML, cdf, or msp file). Mandatory argument.
-    --output_path: Path to output CSV file. Default: current working directory.
+    --output_path: Path to output TXT file. Default: current working directory.
     --is_reference: Boolean flag indicating whether IDs of spectra should be written to output. Only pass true if building a reference library with known compound IDs. Only applicable to mgf and msp files. Options: \'True\', \'False\'. Optional argument. Default: False.
     '''
@@ -23,7 +21,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
         sys.exit()
     if output_path is None:
-        #print('Warning: no output_path specified, so library is written to {Path.cwd()}/build_library.csv')
         tmp = input_path.split('/')
         tmp = tmp[(len(tmp)-1)]
         basename = tmp.split('.')[0]
@@ -34,7 +31,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
         print('Error: is_reference must be either \'True\' or \'False\'.')
         sys.exit()
-    # determine whether an mgf or a mzML file was passed to --input_path
     last_three_chars = input_path[(len(input_path)-3):len(input_path)]
     last_four_chars = input_path[(len(input_path)-4):len(input_path)]
     if last_three_chars == 'mgf' or last_three_chars == 'MGF':
@@ -50,7 +46,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
         sys.exit()
-    # obtain a list of spectra from the input file
     spectra = []
     if input_file_type == 'mgf':
         with mgf.read(input_path, index_by_scans = True) as reader:
@@ -62,7 +57,6 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
                 spectra.append(spec)
-    # extract the relevant information from each spectra (i.e m/z ratios and intensities)
     if input_file_type == 'mgf' or input_file_type == 'mzML':
         ids = []
         mzs = []
@@ -128,8 +122,7 @@ def build_library_from_raw_data(input_path=None, output_path=None, is_reference=
                         continue
-    # write CSV file of spectra for use in spectral library matching
     df = pd.DataFrame({'id':ids, 'mz_ratio':mzs, 'intensity':ints})
-    df.to_csv(output_path, index=False)
+    df.to_csv(output_path, index=False, sep='\t')

pycompound/plot_spectra.py CHANGED Viewed

@@ -1,6 +1,4 @@
-# this script's functions plot a given query spectrum against a given reference spectrum before and after spectrum preprocessing transformations
 from .processing import *
 from .similarity_measures import *
 import pandas as pd
@@ -36,7 +34,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
     --output_path: path to output PDF file containing the plots of the spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.
     '''
-    # load query and reference libraries
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
@@ -68,7 +65,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
         unique_reference_ids = [str(tmp) for tmp in unique_reference_ids]
-    ##### process input parameters and ensure they are in a valid format #####
     if spectrum_ID1 is not None:
         spectrum_ID1 = str(spectrum_ID1)
     else:
@@ -190,7 +186,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
     q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
     r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
-    # apply transformation to y-axis if relevant
     if y_axis_transformation == 'normalized':
         q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
         r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
@@ -206,10 +201,8 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
     else:
         ylab = 'Raw Intensity'
-    # create the figure
     fig, axes = plt.subplots(nrows=2, ncols=1)
-    # plot the untransformed spectra
     plt.subplot(2,1,1)
     plt.vlines(x=q_spec_pre_trans[:,0], ymin=[0]*q_spec_pre_trans.shape[0], ymax=q_spec_pre_trans[:,1], linewidth=3, color='blue', label=f'Spectrum ID 1: {spectrum_ID1}')
     plt.vlines(x=r_spec_pre_trans[:,0], ymin=[0]*r_spec_pre_trans.shape[0], ymax=-r_spec_pre_trans[:,1], linewidth=3, color='red', label=f'Spectrum ID 2: {spectrum_ID2}')
@@ -219,7 +212,6 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
     plt.yticks(fontsize=7)
     plt.title('Untransformed Spectra', fontsize=10)
-    # get the ranges of m/z and intensity values to display at the bottom of the two plots
     mz_min_tmp_q = round(q_spec[:,0].min(),1)
     mz_min_tmp_r = round(r_spec[:,0].min(),1)
     int_min_tmp_q = round(q_spec[:,1].min(),1)
@@ -233,51 +225,45 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
     int_min_tmp = min([int_min_tmp_q,int_min_tmp_r])
     int_max_tmp = max([int_max_tmp_q,int_max_tmp_r])
-    # perform the spectrum preprocessing transformations in the order specified
     is_matched = False
     for transformation in spectrum_preprocessing_order:
-        if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # centroiding
+        if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
             q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
             r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
-        if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # matching
+        if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
             m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
             q_spec = m_spec[:,0:2]
             r_spec = m_spec[:,[0,2]]
             is_matched = True
-        if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # weight factor transformation
+        if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
             q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
             r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-        if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # low-entropy transformation
+        if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
             q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
             r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
-        if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # noise removal
+        if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
             q_spec = remove_noise(q_spec, nr = noise_threshold)
             r_spec = remove_noise(r_spec, nr = noise_threshold)
-        if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1: # filtering
+        if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
             q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
             r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
-    # intensities of query and reference library
     q_ints = q_spec[:,1]
     r_ints = r_spec[:,1]
-    # if there is at least one non-zero intensity ion fragment in either spectra, compute their similarity
     if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
         similarity_score = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
     else:
         similarity_score = 0
-    # plot the transformed spectra
     plt.subplot(2,1,2)
-    # display warning message if either spectra are empty or have no non-zero intensity ion fragments
     if q_spec.shape[0] > 1:
         if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
             plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
             plt.xticks([])
             plt.yticks([])
         else:
-            # apply transformation to y-axis if relevant
             if y_axis_transformation == 'normalized':
                 q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
                 r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
@@ -352,7 +338,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
     --output_path: path to output PDF file containing the plots of the spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.
     '''
-    # load query and reference libraries
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
@@ -382,7 +367,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
             unique_reference_ids = df_reference.iloc[:,0].unique()
-    ##### process input parameters and ensure they are in a valid format #####
     if spectrum_ID1 is not None:
         spectrum_ID1 = str(spectrum_ID1)
     else:
@@ -457,12 +441,10 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
         print(f'Warning: plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.')
         output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.pdf'
-    # get m/z values
     min_mz = np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])])
     max_mz = np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])])
     mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
-    # get unique query/reference library IDs; each query/reference ID corresponds to exactly one query/reference mass spectrum
     unique_query_ids = df_query.iloc[:,0].unique().tolist()
     unique_reference_ids = df_reference.iloc[:,0].unique().tolist()
     unique_query_ids = [str(ID) for ID in unique_query_ids]
@@ -494,7 +476,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
     q_spec = convert_spec(q_spec,mzs)
     r_spec = convert_spec(r_spec,mzs)
-    # get the ranges of m/z and intensity values to display at the bottom of the two plots
     int_min_tmp_q = min(q_spec[q_spec[:,1].nonzero(),1][0])
     int_min_tmp_r = min(r_spec[r_spec[:,1].nonzero(),1][0])
     int_max_tmp_q = max(q_spec[q_spec[:,1].nonzero(),1][0])
@@ -502,13 +483,10 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
     int_min_tmp = int(min([int_min_tmp_q,int_min_tmp_r]))
     int_max_tmp = int(max([int_max_tmp_q,int_max_tmp_r]))
-    # create the figure
     fig, axes = plt.subplots(nrows=2, ncols=1)
-    # plot the untransformed spectra
     plt.subplot(2,1,1)
-    # display warning message if either spectra have no non-zero ion fragments
     if np.max(q_spec[:,1]) == 0 or np.max(r_spec[:,1]) == 0:
         plt.text(0.5, 0.5, 'The query and/or reference spectrum has no non-zero intensities after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
         plt.xticks([])
@@ -519,7 +497,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
         q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1].astype(float)
         r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1].astype(float)
-        # apply transformation to y-axis if relevant
         if y_axis_transformation == 'normalized':
             q_spec_pre_trans[:,1] = q_spec_pre_trans[:,1] / np.max(q_spec_pre_trans[:,1])
             r_spec_pre_trans[:,1] = r_spec_pre_trans[:,1] / np.max(r_spec_pre_trans[:,1])
@@ -543,32 +520,29 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
         plt.title('Untransformed Query and Reference Spectra', fontsize=10)
     for transformation in spectrum_preprocessing_order:
-        if transformation == 'W': # weight factor transformation
+        if transformation == 'W':
             q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
             r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-        if transformation == 'L': # low-entropy transformation
+        if transformation == 'L':
             q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method)
             r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method)
-        if transformation == 'N': # noise removal
+        if transformation == 'N':
             q_spec = remove_noise(q_spec, nr = noise_threshold)
             if high_quality_reference_library == False:
                 r_spec = remove_noise(r_spec, nr = noise_threshold)
-        if transformation == 'F': # filtering with respect to mz and/or intensity
+        if transformation == 'F':
             q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
             if high_quality_reference_library == False:
                 r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
-    # compute similarity score; if the spectra contain at most one point, their similarity is considered to be 0
     if q_spec.shape[0] > 1:
         similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
     else:
         similarity_score = 0
-    # plot the transformed spectra
     plt.subplot(2,1,2)
-    # display warning message if either spectra are empty or have no non-zero intensity ion fragments
     if q_spec.shape[0] == 0 or r_spec.shape[0] == 0:
         plt.text(0.5, 0.5, 'The query and/or reference spectrum has no ion fragments left after transformations.\n Change transformation parameters.', ha='center', va='center', fontsize=7, color='black')
         plt.xticks([])
@@ -578,7 +552,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
         plt.xticks([])
         plt.yticks([])
     else:
-        # apply transformation to y-axis if relevant
         if y_axis_transformation == 'normalized':
             q_spec[:,1] = q_spec[:,1] / np.max(q_spec[:,1])
             r_spec[:,1] = r_spec[:,1] / np.max(r_spec[:,1])
@@ -602,7 +575,6 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
         plt.title(f'Transformed Query and Reference Spectra', fontsize=10)
-    #plt.subplots_adjust(top = 0.8, hspace = 0.7)
     plt.subplots_adjust(top=0.8, hspace=0.92, bottom=0.3)
     plt.figlegend(loc = 'upper center')
     fig.text(0.05, 0.15, f'Similarity Measure: {similarity_measure.capitalize()}', fontsize=7)

pycompound/processing.py CHANGED Viewed

@@ -1,6 +1,4 @@
-# This script contains the functions used to transform spectra prior to computing similarity scores
 from pycompound.build_library import build_library_from_raw_data
 import scipy.stats
 import numpy as np
@@ -165,7 +163,6 @@ def centroid_spectrum(spec, window_size):
     spec = spec[np.argsort(spec[:,0])]
-    #Fast check is the spectrum needs centroiding
     mz_array = spec[:, 0]
     need_centroid = 0
     if mz_array.shape[0] > 1:
@@ -180,7 +177,6 @@ def centroid_spectrum(spec, window_size):
             mz_delta_allowed = window_size
             if spec[i, 1] > 0:
-                #Find left bound for current peak
                 i_left = i - 1
                 while i_left >= 0:
                     mz_delta_left = spec[i, 0] - spec[i_left, 0]
@@ -190,7 +186,6 @@ def centroid_spectrum(spec, window_size):
                         break
                 i_left += 1
-                #Find right bound for current peak
                 i_right = i + 1
                 while i_right < spec.shape[0]:
                     mz_delta_right = spec[i_right, 0] - spec[i, 0]
@@ -199,7 +194,6 @@ def centroid_spectrum(spec, window_size):
                     else:
                         break
-                #Merge those peaks
                 intensity_sum = np.sum(spec[i_left:i_right, 1])
                 intensity_weighted_sum = np.sum(spec[i_left:i_right, 0] * spec[i_left:i_right, 1])
@@ -246,16 +240,13 @@ def match_peaks_in_spectra(spec_a, spec_b, window_size):
         mass_delta = spec_a[a, 0] - spec_b[b, 0]
         if mass_delta < -window_size:
-            # Peak only existed in spec a.
             spec_merged.append([spec_a[a, 0], spec_a[a, 1], peak_b_int])
             peak_b_int = 0.
             a += 1
         elif mass_delta > window_size:
-            # Peak only existed in spec b.
             spec_merged.append([spec_b[b, 0], 0., spec_b[b, 1]])
             b += 1
         else:
-            # Peak existed in both spec.
             peak_b_int += spec_b[b, 1]
             b += 1

pycompound/similarity_measures.py CHANGED Viewed

@@ -10,7 +10,6 @@ import sys
 def S_cos(ints_a, ints_b):
-    # Cosine Similarity Measure
     if np.sum(ints_a) == 0 or np.sum(ints_b) == 0:
         return(0)
     else:
@@ -18,12 +17,10 @@ def S_cos(ints_a, ints_b):
 def ent_renyi(ints, q):
-    # Computes the Renyi entropy of a probability distribution for a given positive entropy dimension q
     return np.log(sum(np.power(ints,q))) / (1-q)
 def ent_tsallis(ints, q):
-    # Computes the Tsallis entropy of a probability distribution for a given positive entropy dimension q
     return (sum(np.power(ints,q))-1) / (1-q)

pycompound 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

pycompound 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl