pycompound-0.1.1-py3-none-any.whl → pycompound-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +155 -194
- pycompound/build_library.py +2 -9
- pycompound/plot_spectra.py +10 -38
- pycompound/processing.py +0 -9
- pycompound/similarity_measures.py +0 -3
- pycompound/spec_lib_matching.py +246 -81
- pycompound/spec_lib_matching_CLI.py +2 -7
- pycompound/tuning_CLI.py +1 -1
- {pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/METADATA +1 -1
- pycompound-0.1.2.dist-info/RECORD +14 -0
- pycompound-0.1.1.dist-info/RECORD +0 -14
- {pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/WHEEL +0 -0
- {pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {pycompound-0.1.1.dist-info → pycompound-0.1.2.dist-info}/top_level.txt +0 -0
pycompound/spec_lib_matching.py
CHANGED
@@ -1,6 +1,4 @@
 
-# this script's function runs spectral library matching to identify unknown query compound(s)
-
 from pycompound.build_library import build_library_from_raw_data
 from .processing import *
 from .similarity_measures import *
@@ -10,6 +8,7 @@ import json
 from itertools import product
 from joblib import Parallel, delayed
 import csv
+import sys, csv
 
 
 default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
@@ -80,21 +79,20 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
 
 def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
     """
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a
+    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
 
     --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
     --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
     --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
     """
 
     grid = {**default_HRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
 
-    # load query and reference libraries
     if query_data is None:
-        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
         sys.exit()
     else:
         extension = query_data.rsplit('.',1)
@@ -154,14 +152,123 @@ def tune_params_on_HRMS_data(query_data=None, reference_data=None, grid=None, ou
         return df_out
 
 
+
+def tune_params_on_HRMS_data_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
+    """
+    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible
+    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
+    and prints top-performing parameters
+
+    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
+    should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
+    other columns should correspond to a single mass/charge ratio. Mandatory argument.
+    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
+    to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
+    compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
+    --grid: dict with all possible parameter values to try.
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
+    """
+
+    local_grid = {**default_HRMS_grid, **(grid or {})}
+    for key, value in local_grid.items():
+        globals()[key] = value
+
+    if query_data is None:
+        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
+        sys.exit()
+    else:
+        extension = query_data.rsplit('.', 1)[-1]
+        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
+            output_path_tmp = query_data[:-3] + 'csv'
+            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
+            df_query = pd.read_csv(output_path_tmp)
+        elif extension in ('csv','CSV'):
+            df_query = pd.read_csv(query_data)
+        else:
+            print(f'\nError: Unsupported query_data extension: {extension}')
+            sys.exit()
+        unique_query_ids = df_query.iloc[:, 0].unique()
+
+    if reference_data is None:
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
+        sys.exit()
+    else:
+        if isinstance(reference_data, str):
+            df_reference = get_reference_df(reference_data=reference_data)
+            unique_reference_ids = df_reference.iloc[:, 0].unique()
+        else:
+            dfs = []
+            unique_reference_ids = []
+            for f in reference_data:
+                tmp = get_reference_df(reference_data=f)
+                dfs.append(tmp)
+                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
+            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
+
+    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
+          f'{len(unique_reference_ids)} unique reference spectra, and '
+          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
+
+    if output_path is None:
+        output_path = f'{Path.cwd()}/tuning_param_output.txt'
+        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
+
+    param_grid = product(
+        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
+        noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
+        entropy_dimension, high_quality_reference_library
+    )
+
+    results = []
+    total = (
+        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
+        len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
+        len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
+        len(entropy_dimension) * len(high_quality_reference_library)
+    )
+    done = 0
+
+    for params in param_grid:
+        res = _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
+        results.append(res)
+        done += 1
+        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
+
+    df_out = pd.DataFrame(results, columns=[
+        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
+        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
+        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
+    ])
+
+    if 'WEIGHT' in df_out.columns:
+        df_out['WEIGHT'] = (
+            df_out['WEIGHT'].astype(str)
+            .str.replace("\"","",regex=False)
+            .str.replace("{","",regex=False)
+            .str.replace("}","",regex=False)
+            .str.replace(":","",regex=False)
+            .str.replace("Cosine","",regex=False)
+            .str.replace("Shannon","",regex=False)
+            .str.replace("Renyi","",regex=False)
+            .str.replace("Tsallis","",regex=False)
+            .str.replace(" ","",regex=False)
+        )
+
+    if return_output:
+        return df_out
+    else:
+        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
+        print(f'Wrote results to {output_path}')
+
+
 def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
     """
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a
+    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
 
     --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
     --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
     --grid: dict with all possible parameter values to try
-    --output_path: accuracy from each choice of parameter set is saved to a
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here
     """
 
     grid = {**default_NRMS_grid, **(grid or {})}
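Note: the hunk above adds tune_params_on_HRMS_data_shiny. As a quick orientation, below is a minimal, hypothetical usage sketch based only on the signature, grid keys, and output columns visible in this diff; the file paths and grid values are placeholders, not taken from the package's documentation.

from pycompound.spec_lib_matching import tune_params_on_HRMS_data_shiny

# Hypothetical input paths; per the docstring above, mgf/mzML/cdf/csv query data and csv reference data are accepted.
grid = {
    'similarity_measure': ['cosine', 'shannon'],   # keys mirror default_HRMS_grid defined in this file
    'window_size_matching': [0.25, 0.5],
    'noise_threshold': [0.0, 0.01],
}
df = tune_params_on_HRMS_data_shiny(
    query_data='query_spectra.csv',
    reference_data='reference_library.csv',
    grid=grid,
    return_output=True,   # return the results DataFrame instead of writing the TXT file
)
print(df.sort_values('ACC', ascending=False).head())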
@@ -228,57 +335,157 @@ def tune_params_on_NRMS_data(query_data=None, reference_data=None, grid=None, ou
 
 
 
+def tune_params_on_NRMS_data_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
+    """
+    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible
+    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
+    and prints top-performing parameters
+
+    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
+    should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
+    other columns should correspond to a single mass/charge ratio. Mandatory argument.
+    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
+    to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
+    compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
+    --grid: dict with all possible parameter values to try.
+    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
+    """
+
+    local_grid = {**default_NRMS_grid, **(grid or {})}
+    for key, value in local_grid.items():
+        globals()[key] = value
+
+    if query_data is None:
+        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
+        sys.exit()
+    else:
+        extension = query_data.rsplit('.', 1)[-1]
+        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
+            output_path_tmp = query_data[:-3] + 'csv'
+            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
+            df_query = pd.read_csv(output_path_tmp)
+        elif extension in ('csv','CSV'):
+            df_query = pd.read_csv(query_data)
+        else:
+            print(f'\nError: Unsupported query_data extension: {extension}')
+            sys.exit()
+        unique_query_ids = df_query.iloc[:, 0].unique()
+
+    if reference_data is None:
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
+        sys.exit()
+    else:
+        if isinstance(reference_data, str):
+            df_reference = get_reference_df(reference_data=reference_data)
+            unique_reference_ids = df_reference.iloc[:, 0].unique()
+        else:
+            dfs = []
+            unique_reference_ids = []
+            for f in reference_data:
+                tmp = get_reference_df(reference_data=f)
+                dfs.append(tmp)
+                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
+            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
+
+    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
+          f'{len(unique_reference_ids)} unique reference spectra, and '
+          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
+
+    if output_path is None:
+        output_path = f'{Path.cwd()}/tuning_param_output.txt'
+        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
+
+    param_grid = product(
+        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
+        noise_threshold, wf_mz, wf_int, LET_threshold,
+        entropy_dimension, high_quality_reference_library
+    )
+
+    results = []
+    total = (
+        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
+        len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
+    )
+    done = 0
+    for params in param_grid:
+        res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
+        results.append(res)
+        done += 1
+        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
+
+    df_out = pd.DataFrame(results, columns=[
+        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
+        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
+    ])
+
+    if 'WEIGHT' in df_out.columns:
+        df_out['WEIGHT'] = (
+            df_out['WEIGHT'].astype(str)
+            .str.replace("\"","",regex=False)
+            .str.replace("{","",regex=False)
+            .str.replace("}","",regex=False)
+            .str.replace(":","",regex=False)
+            .str.replace("Cosine","",regex=False)
+            .str.replace("Shannon","",regex=False)
+            .str.replace("Renyi","",regex=False)
+            .str.replace("Tsallis","",regex=False)
+            .str.replace(" ","",regex=False)
+        )
+
+    if return_output:
+        return df_out
+    else:
+        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
+        print(f'Wrote results to {output_path}')
+
+
+
 
 def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
-    # returns accuracy for a given set of parameters
 
     n_top_matches_to_save = 1
 
-    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
     all_similarity_scores = []
     for query_idx in range(0,len(unique_query_ids)):
         print(f'query spectrum #{query_idx} is being identified')
         q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
         q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-        # compute the similarity score between the given query spectrum and all spectra in the reference library
         similarity_scores = []
         for ref_idx in range(0,len(unique_reference_ids)):
             q_spec = q_spec_tmp
             r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
             r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-            # apply spectrum preprocessing transformation in the order specified by user
             is_matched = False
             for transformation in spectrum_preprocessing_order:
                 if np.isinf(q_spec[:,1]).sum() > 0:
                     q_spec[:,1] = np.zeros(q_spec.shape[0])
                 if np.isinf(r_spec[:,1]).sum() > 0:
                     r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                     r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
-                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                     q_spec = m_spec[:,0:2]
                     r_spec = m_spec[:,[0,2]]
                     is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                     r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                     r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = remove_noise(q_spec, nr = noise_threshold)
                     if high_quality_reference_library == False:
                         r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
                     if high_quality_reference_library == False:
                         r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
 
-            # query and reference spectrum intensities
             q_ints = q_spec[:,1]
             r_ints = r_spec[:,1]
             if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
@@ -289,12 +496,10 @@ def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
             similarity_scores.append(similarity_score)
         all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -325,7 +530,6 @@ def get_acc_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
 
 
 def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library):
-    # returns accuracy for a given set of parameters
 
     n_top_matches_to_save = 1
 
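Note: when return_output is left False, the tuning functions above write a tab-separated table (columns shown in the hunks, including ACC) to output_path, defaulting to tuning_param_output.txt in the current working directory. A small, hypothetical sketch for inspecting that file afterwards; the path is an assumption based on the default seen in this diff:

import pandas as pd

# Adjust the path if a custom output_path was passed to the tuning function.
results = pd.read_csv('tuning_param_output.txt', sep='\t')
best = results.sort_values('ACC', ascending=False).iloc[0]
print('best accuracy:', best['ACC'])
print(best.drop('ACC'))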
@@ -348,32 +552,29 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
             r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
             r_spec = convert_spec(r_spec_tmp,mzs)
 
-            # apply spectrum preprocessing transformation in the order specified by user
             for transformation in spectrum_preprocessing_order:
                 if np.isinf(q_spec[:,1]).sum() > 0:
                     q_spec[:,1] = np.zeros(q_spec.shape[0])
                 if np.isinf(r_spec[:,1]).sum() > 0:
                     r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'W':
+                if transformation == 'W':
                     q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
                     r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
-                if transformation == 'L':
+                if transformation == 'L':
                     q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
                     r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
-                if transformation == 'N':
+                if transformation == 'N':
                     q_spec = remove_noise(q_spec, nr = noise_threshold)
                     if high_quality_reference_library == False:
                         r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F':
+                if transformation == 'F':
                     q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                     if high_quality_reference_library == False:
                         r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
 
-            # query and reference spectrum intensities
             q_ints = q_spec[:,1]
             r_ints = r_spec[:,1]
 
-            # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
             if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                 similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
             else:
@@ -382,12 +583,10 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
             similarity_scores.append(similarity_score)
         all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -399,7 +598,6 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
         cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
         df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
 
-        #preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
         preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
         if len(top_ref_specs_tmp.values) == 0:
             scores_tmp.append(0)
@@ -441,11 +639,10 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
     --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
     --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
     --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output
-    --output_similarity_scores: Output
+    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
+    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
     '''
 
-    # load query and reference libraries
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
@@ -477,7 +674,6 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
 
-    ##### process input parameters and ensure they are in a valid format #####
     if spectrum_preprocessing_order is not None:
         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
     else:
@@ -545,7 +741,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
     else:
         q = entropy_dimension
 
-    normalization_method = 'standard'
+    normalization_method = 'standard'
 
     if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
         print('\nError: n_top_matches_to_save should be a positive integer')
@@ -564,15 +760,12 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
         print(f'Warning: writing similarity scores to {output_similarity_scores}')
 
 
-    ####################################### begin spectral library matching #######################################
-    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
     all_similarity_scores = []
     for query_idx in range(0,len(unique_query_ids)):
         print(f'query spectrum #{query_idx} is being identified')
         q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
         q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-        # compute the similarity score between the given query spectrum and all spectra in the reference library
         similarity_scores = []
         for ref_idx in range(0,len(unique_reference_ids)):
             #if ref_idx % 100 == 0:
@@ -581,37 +774,35 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
             r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
             r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-            # apply spectrum preprocessing transformation in the order specified by user
             is_matched = False
             for transformation in spectrum_preprocessing_order:
                 if np.isinf(q_spec[:,1]).sum() > 0:
                     q_spec[:,1] = np.zeros(q_spec.shape[0])
                 if np.isinf(r_spec[:,1]).sum() > 0:
                     r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
                     r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
-                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
                     q_spec = m_spec[:,0:2]
                     r_spec = m_spec[:,[0,2]]
                     is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
                     r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
                     r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = remove_noise(q_spec, nr = noise_threshold)
                     if high_quality_reference_library == False:
                         r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
                     q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
                     if high_quality_reference_library == False:
                         r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
 
-            # query and reference spectrum intensities
             q_ints = q_spec[:,1]
             r_ints = r_spec[:,1]
 
@@ -623,12 +814,10 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
             similarity_scores.append(similarity_score)
         all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -652,29 +841,23 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
     scores = np.array(scores)
     out = np.c_[preds,scores]
 
-    # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
     cnames_preds = []
     cnames_scores = []
     for i in range(0,n_top_matches_to_save):
         cnames_preds.append(f'RANK.{i+1}.PRED')
         cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
 
-    # get pandas dataframe with identifcation results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
     df_top_ref_specs.index.names = ['Query Spectrum ID']
 
     df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
 
-    # print the identification results if the user desires
     if print_id_results == True:
         print(df_top_ref_specs.to_string())
 
     if return_ID_output is False:
-        # write spectral library matching results to disk
         df_top_ref_specs.to_csv(output_identification, sep='\t')
-
-        # write all similarity scores to disk
         df_scores.to_csv(output_similarity_scores, sep='\t')
     else:
         return df_top_ref_specs
@@ -706,11 +889,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     --normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since the objects entropy quantifies the uncertainy of must be probability distributions, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
     --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
     --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output
-    --output_similarity_scores: Output
+    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
+    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
     '''
 
-    # load query and reference libraries
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
@@ -742,7 +924,6 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)
 
 
-    ##### process input parameters and ensure they are in a valid format #####
     if spectrum_preprocessing_order is not None:
         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
     else:
@@ -795,7 +976,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     else:
         q = entropy_dimension
 
-    normalization_method = 'standard'
+    normalization_method = 'standard'
 
     if n_top_matches_to_save <= 0 or isinstance(n_top_matches_to_save,int)==False:
         print('\nError: n_top_matches_to_save should be a positive integer')
@@ -815,14 +996,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
 
 
 
-    ####################################### begin spectral library matching #######################################
-    # get the range of m/z values
     min_mz = int(np.min([np.min(df_query.iloc[:,1]), np.min(df_reference.iloc[:,1])]))
     max_mz = int(np.max([np.max(df_query.iloc[:,1]), np.max(df_reference.iloc[:,1])]))
     mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
 
-    # compute the similarity score between each query library spectrum/spectra and all reference library spectra
-    # for each query spectrum, compute its similarity with all reference spectra
     all_similarity_scores = []
     for query_idx in range(0,len(unique_query_ids)):
         q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
@@ -838,32 +1015,29 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
             r_spec_tmp = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
             r_spec = convert_spec(r_spec_tmp,mzs)
 
-            # apply spectrum preprocessing transformation in the order specified by user
             for transformation in spectrum_preprocessing_order:
                 if np.isinf(q_spec[:,1]).sum() > 0:
                     q_spec[:,1] = np.zeros(q_spec.shape[0])
                 if np.isinf(r_spec[:,1]).sum() > 0:
                     r_spec[:,1] = np.zeros(r_spec.shape[0])
-                if transformation == 'W':
+                if transformation == 'W':
                     q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
                     r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-                if transformation == 'L':
+                if transformation == 'L':
                     q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
                     r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                if transformation == 'N':
+                if transformation == 'N':
                     q_spec = remove_noise(q_spec, nr = noise_threshold)
                     if high_quality_reference_library == False:
                         r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F':
+                if transformation == 'F':
                     q_spec = filter_spec_gcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
                     if high_quality_reference_library == False:
                         r_spec = filter_spec_gcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max)
 
-            # query and reference spectrum intensities
             q_ints = q_spec[:,1]
             r_ints = r_spec[:,1]
 
-            # if there are no non-zero intensities in the query or reference spectrum, their similarity is 0
             if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
                 similarity_score = get_similarity(similarity_measure, q_spec[:,1], r_spec[:,1], weights, entropy_dimension)
             else:
@@ -872,12 +1046,10 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
             similarity_scores.append(similarity_score)
         all_similarity_scores.append(similarity_scores)
 
-    # create pandas dataframe containing all similarity scores computed with one row for each query spectrum and one column for each reference spectrum
    df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
     df_scores.index.names = ['Query Spectrum ID']
 
-    # get predicted identity/identities of each query spectrum and the corresponding maximum similarity score
     preds = []
     scores = []
     for i in range(0, df_scores.shape[0]):
@@ -889,7 +1061,6 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
         cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
         df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
 
-        #preds_tmp.append(';'.join(top_ref_specs_tmp.index.to_list()))
         preds_tmp.append(';'.join(map(str,top_ref_specs_tmp.index.to_list())))
         if len(top_ref_specs_tmp.values) == 0:
             scores_tmp.append(0)
@@ -902,29 +1073,23 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     scores = np.array(scores)
     out = np.c_[preds,scores]
 
-    # get column names for a pandas dataframe with the n_top_matches_to_save top-matches for each query spectrum
     cnames_preds = []
     cnames_scores = []
     for i in range(0,n_top_matches_to_save):
         cnames_preds.append(f'RANK.{i+1}.PRED')
         cnames_scores.append(f'RANK.{i+1}.SIMILARITY.SCORE')
 
-    # get pandas dataframe with identifcation results with each row corresponding to a query spectrum, n_top_matches_to_save columns for the top predictions, and n_top_matches_to_save columns for the similarity scores corresponding to the predictions
     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
     df_top_ref_specs.index.names = ['Query Spectrum ID']
 
-    # print the identification results if the user desires
     if print_id_results == True:
         print(df_top_ref_specs.to_string())
 
     df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
 
     if return_ID_output is False:
-        # write spectral library matching results to disk
         df_top_ref_specs.to_csv(output_identification, sep='\t')
-
-        # write all similarity scores to disk
         df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
         df_scores.to_csv(output_similarity_scores, sep='\t')
     else: