pycompound-0.1.6-py3-none-any.whl → pycompound-0.1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app.py +2589 -237
- pycompound/build_library.py +77 -20
- pycompound/plot_spectra.py +1 -1
- pycompound/processing.py +5 -5
- pycompound/spec_lib_matching.py +245 -471
- pycompound/spec_lib_matching_CLI.py +48 -2
- pycompound/tuning_CLI_DE.py +22 -22
- pycompound/tuning_CLI_grid.py +22 -6
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/METADATA +1 -1
- pycompound-0.1.7.dist-info/RECORD +15 -0
- pycompound-0.1.6.dist-info/RECORD +0 -15
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/WHEEL +0 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.7.dist-info}/top_level.txt +0 -0
pycompound/spec_lib_matching.py
CHANGED
@@ -22,8 +22,10 @@ def _vector_to_full_params(X, default_params, optimize_params):
 def objective_function_HRMS(X, ctx):
     p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
     acc = get_acc_HRMS(
-        ctx["df_query"],
-        ctx["
+        ctx["df_query"],
+        ctx["df_reference"],
+        ctx["precursor_ion_mz_tolerance"],
+        ctx["ionization_mode"], ctx["adduct"],
         ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
         ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
         p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
@@ -35,11 +37,11 @@ def objective_function_HRMS(X, ctx):
     print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
     return 1.0 - acc

+
 def objective_function_NRMS(X, ctx):
     p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
     acc = get_acc_NRMS(
-        ctx["df_query"], ctx["df_reference"],
-        ctx["unique_query_ids"], ctx["unique_reference_ids"],
+        ctx["df_query"], ctx["df_reference"], ctx['unique_query_ids'], ctx['unique_reference_ids'],
         ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
         ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
         p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
@@ -51,16 +53,8 @@ def objective_function_NRMS(X, ctx):



-def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1, de_updating='immediate', log_hook=None):
-
-    def _log(msg):
-        if log_hook:
-            try: log_hook(msg if msg.endswith("\n") else msg + "\n")
-            except: pass

-
-        _log(f"iter callback: conv={conv:.4g}, x={xk}")
-        return False
+def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1):

     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
@@ -68,21 +62,19 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query.iloc[:,0].unique()
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')

     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:,0].unique()
         else:
             dfs = []
             unique_reference_ids = []
@@ -92,6 +84,11 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
                 unique_reference_ids.extend(tmp.iloc[:,0].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)

+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != None and ionization_mode != 'N/A':
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
+    if 'adduct' in df_reference.columns.tolist() and adduct != None and adduct != 'N/A':
+        df_reference = df_reference.loc[df_reference['adduct']==adduct]
+
     unique_query_ids = df_query['id'].unique().tolist()
     unique_reference_ids = df_reference['id'].unique().tolist()

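Note on the reference-library filter added in this hunk: the new ionization_mode/adduct arguments are applied only when the corresponding column exists in the reference table and the caller passed a concrete value (not None or 'N/A'). A minimal, self-contained sketch of the same pandas pattern; the helper name and the toy frame below are illustrative, not part of pycompound:

    import pandas as pd

    def filter_reference(df_reference, ionization_mode=None, adduct=None):
        # Apply each filter only if the column is present and a concrete value was supplied.
        if 'ionization_mode' in df_reference.columns and ionization_mode not in (None, 'N/A'):
            df_reference = df_reference.loc[df_reference['ionization_mode'] == ionization_mode]
        if 'adduct' in df_reference.columns and adduct not in (None, 'N/A'):
            df_reference = df_reference.loc[df_reference['adduct'] == adduct]
        return df_reference

    df = pd.DataFrame({'id': ['a', 'b'],
                       'ionization_mode': ['positive', 'negative'],
                       'adduct': ['M+H', 'M-H']})
    print(filter_reference(df, ionization_mode='positive', adduct='M+H'))  # keeps only row 'a'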
@@ -100,6 +97,9 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
         df_reference=df_reference,
         unique_query_ids=unique_query_ids,
         unique_reference_ids=unique_reference_ids,
+        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance,
+        ionization_mode=ionization_mode,
+        adduct=adduct,
         similarity_measure=similarity_measure,
         weights=weights,
         spectrum_preprocessing_order=spectrum_preprocessing_order,
@@ -111,13 +111,10 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform

     bounds = [param_bounds[p] for p in optimize_params]

-    print('here!!!!!!!!!!!!!!!')
-    print(de_workers)
-    print('here!!!!!!!!!!!!!!!')
     if chromatography_platform == 'HRMS':
-        result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
+        result = differential_evolution(objective_function_HRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')
     else:
-        result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1)
+        result = differential_evolution(objective_function_NRMS, bounds=bounds, args=(ctx,), maxiter=maxiters, tol=0.0, workers=de_workers, seed=1, updating='deferred' if de_workers!=1 else 'immediate')

     best_full_params = _vector_to_full_params(result.x, default_params, optimize_params)
     best_acc = 100.0 - (result.fun * 100.0)
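Background for the updating argument introduced above: scipy.optimize.differential_evolution only evaluates the population in parallel when updating='deferred'; if workers != 1 is combined with the default 'immediate', SciPy overrides it to 'deferred' and emits a warning. Deriving the value from de_workers keeps the call explicit. A minimal sketch of the same call pattern on a toy objective (the objective, bounds, and ctx here are illustrative, not pycompound's):

    import numpy as np
    from scipy.optimize import differential_evolution

    def objective(x, ctx):
        # toy objective: squared distance to a target point carried in ctx
        return float(np.sum((np.asarray(x) - ctx["target"]) ** 2))

    if __name__ == '__main__':
        ctx = {"target": np.array([0.1, 0.2])}
        bounds = [(0.0, 1.0), (0.0, 1.0)]
        de_workers = 2  # any value != 1 requires deferred updating

        result = differential_evolution(
            objective, bounds=bounds, args=(ctx,), maxiter=3, tol=0.0, seed=1,
            workers=de_workers,
            updating='deferred' if de_workers != 1 else 'immediate',
        )
        print(result.x, result.fun)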
@@ -131,14 +128,17 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
     for k, v in best_full_params.items():
         print(f" {k}: {v}")
     print(f"\nBest accuracy: {best_acc:.3f}%")
-
+
+
+


 default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
 default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}


-def _eval_one_HRMS(df_query, df_reference,
+def _eval_one_HRMS(df_query, df_reference,
+                   precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
                    similarity_measure_tmp, weight,
                    spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
                    int_min_tmp, int_max_tmp, noise_threshold_tmp,
@@ -148,7 +148,8 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id

     acc = get_acc_HRMS(
         df_query=df_query, df_reference=df_reference,
-
+        precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
+        ionization_mode=ionization_mode_tmp, adduct=adduct_tmp,
         similarity_measure=similarity_measure_tmp, weights=weight,
         spectrum_preprocessing_order=spectrum_preprocessing_order_tmp,
         mz_min=mz_min_tmp, mz_max=mz_max_tmp,
@@ -160,7 +161,7 @@ def _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_id
         LET_threshold=LET_threshold_tmp,
         entropy_dimension=entropy_dimension_tmp,
         high_quality_reference_library=high_quality_reference_library_tmp,
-        verbose=
+        verbose=False
     )

     return (
@@ -191,6 +192,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
         LET_threshold=LET_threshold_tmp,
         entropy_dimension=entropy_dimension_tmp,
         high_quality_reference_library=high_quality_reference_library_tmp,
+        verbose=False
     )

     return (
@@ -201,16 +203,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id



-def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
+def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
     grid = {**default_HRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
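A usage sketch of the updated grid-tuning entry point, using only the keyword arguments visible in this diff; the file paths, tolerance, and grid values are placeholders:

    from pycompound.spec_lib_matching import tune_params_on_HRMS_data_grid

    tune_params_on_HRMS_data_grid(
        query_data='query_library.txt',          # hypothetical path; raw mgf/mzML/cdf/msp/json inputs are converted per the extension check below
        reference_data='reference_library.txt',  # hypothetical path
        precursor_ion_mz_tolerance=0.01,         # window (in m/z units) around each query precursor ion
        ionization_mode='positive',
        adduct='M+H',
        grid={'noise_threshold': [0.0, 0.1], 'wf_mz': [0.0, 1.0]},
        output_path='tuning_param_output.txt',
    )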
@@ -221,31 +214,37 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()

     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the TXT file of the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference
+            unique_reference_ids = df_reference['id'].unique()
         else:
             dfs = []
             unique_reference_ids = []
             for f in reference_data:
                 tmp = get_reference_df(reference_data=f)
                 dfs.append(tmp)
-                unique_reference_ids.extend(tmp
+                unique_reference_ids.extend(tmp['id'].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)

-
+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode].copy()
+    if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
+        df_reference = df_reference.loc[df_reference['adduct']==adduct].copy()
+    unique_reference_ids_tmp2 = df_reference['id'].unique()
+
+    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, {len(unique_reference_ids)} unique reference spectra, and {len(set(unique_query_ids) & set(unique_reference_ids_tmp2))} of the query and reference spectra IDs are in common.\n')

     if output_path is None:
         output_path = f'{Path.cwd()}/tuning_param_output.txt'
@@ -253,7 +252,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non

     param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold,
                          window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
-    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference,
+    results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params) for params in param_grid)

     df_out = pd.DataFrame(results, columns=[
         'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX','NOISE.THRESHOLD',
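The grid search in this hunk fans every parameter combination out with joblib. A self-contained sketch of the itertools.product plus Parallel/delayed pattern; the worker function below is a stand-in, not the package's _eval_one_HRMS:

    from itertools import product
    from joblib import Parallel, delayed

    def evaluate(noise_threshold, wf_mz):
        # stand-in for a real scoring function; returns one number per combination
        return noise_threshold + wf_mz

    noise_thresholds = [0.0, 0.1]
    wf_mzs = [0.0, 1.0, 2.0]

    param_grid = product(noise_thresholds, wf_mzs)
    results = Parallel(n_jobs=-1, verbose=10)(delayed(evaluate)(*params) for params in param_grid)
    print(results)  # six scores, one per (noise_threshold, wf_mz) pair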
@@ -277,124 +276,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, grid=Non



-def tune_params_on_HRMS_data_grid_shiny(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data with all possible
-    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
-    and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
-      should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
-      other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
-      to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
-      compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
-    local_grid = {**default_HRMS_grid, **(grid or {})}
-    for key, value in local_grid.items():
-        globals()[key] = value
-
-    if query_data is None:
-        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
-        sys.exit()
-    else:
-        extension = query_data.rsplit('.', 1)[-1]
-        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
-            output_path_tmp = query_data[:-3] + 'csv'
-            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        elif extension in ('csv','CSV'):
-            df_query = pd.read_csv(query_data)
-        else:
-            print(f'\nError: Unsupported query_data extension: {extension}')
-            sys.exit()
-        unique_query_ids = df_query.iloc[:, 0].unique()
-
-    if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the data file(s).')
-        sys.exit()
-    else:
-        if isinstance(reference_data, str):
-            df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:, 0].unique()
-        else:
-            dfs = []
-            unique_reference_ids = []
-            for f in reference_data:
-                tmp = get_reference_df(reference_data=f)
-                dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
-            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
-
-    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
-          f'{len(unique_reference_ids)} unique reference spectra, and '
-          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
-
-    if output_path is None:
-        output_path = f'{Path.cwd()}/tuning_param_output.txt'
-        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
-
-    param_grid = product(
-        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
-        noise_threshold, window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold,
-        entropy_dimension, high_quality_reference_library
-    )
-
-    results = []
-    total = (
-        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) *
-        len(int_min) * len(int_max) * len(noise_threshold) * len(window_size_centroiding) *
-        len(window_size_matching) * len(wf_mz) * len(wf_int) * len(LET_threshold) *
-        len(entropy_dimension) * len(high_quality_reference_library)
-    )
-    done = 0
-
-    for params in param_grid:
-        res = _eval_one_HRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
-        results.append(res)
-        done += 1
-        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
-
-    df_out = pd.DataFrame(results, columns=[
-        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
-        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WINDOW.SIZE.CENTROIDING','WINDOW.SIZE.MATCHING',
-        'WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
-    ])
-
-    if 'WEIGHT' in df_out.columns:
-        df_out['WEIGHT'] = (
-            df_out['WEIGHT'].astype(str)
-            .str.replace("\"","",regex=False)
-            .str.replace("{","",regex=False)
-            .str.replace("}","",regex=False)
-            .str.replace(":","",regex=False)
-            .str.replace("Cosine","",regex=False)
-            .str.replace("Shannon","",regex=False)
-            .str.replace("Renyi","",regex=False)
-            .str.replace("Tsallis","",regex=False)
-            .str.replace(" ","",regex=False)
-        )
-
-    if return_output:
-        return df_out
-    else:
-        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
-        print(f'Wrote results to {output_path}')
-
-
 def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
-    """
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file, and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here
-    """
-
     grid = {**default_NRMS_grid, **(grid or {})}
     for key, value in grid.items():
         globals()[key] = value
@@ -405,13 +287,13 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()

     if reference_data is None:
         print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the CSV file of the reference data.')
@@ -419,7 +301,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference
+            unique_reference_ids = df_reference['id'].unique()
         else:
             dfs = []
             unique_reference_ids = []
@@ -439,10 +321,8 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
                          noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
     results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)

-    df_out = pd.DataFrame(results, columns=[
-
-        'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'
-    ])
+    df_out = pd.DataFrame(results, columns=['ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
+                                            'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'])
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("\"","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("{","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("}","",regex=False)
@@ -452,6 +332,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Renyi","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace("Tsallis","",regex=False)
     df_out['WEIGHT'] = df_out['WEIGHT'].str.replace(" ","",regex=False)
+
     if return_output is False:
         df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
     else:
@@ -459,202 +340,116 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non



-def
-    """
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data with all possible
-    combinations of parameters in the grid dict, saves results from each choice of parameters to a TXT file,
-    and prints top-performing parameters
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row
-      should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the
-      other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: mgf, mzML, or csv file of the reference mass spectra. If csv file, each row should correspond
-      to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the
-      compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --grid: dict with all possible parameter values to try.
-    --output_path: accuracy from each choice of parameter set is saved to a TXT file here.
-    """
-
-    local_grid = {**default_NRMS_grid, **(grid or {})}
-    for key, value in local_grid.items():
-        globals()[key] = value
-
-    if query_data is None:
-        print('\nError: No argument passed to the mandatory query_data. Please pass the path to the data file.')
-        sys.exit()
-    else:
-        extension = query_data.rsplit('.', 1)[-1]
-        if extension in ('mgf','MGF','mzML','mzml','MZML','cdf','CDF'):
-            output_path_tmp = query_data[:-3] + 'csv'
-            build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        elif extension in ('csv','CSV'):
-            df_query = pd.read_csv(query_data)
-        else:
-            print(f'\nError: Unsupported query_data extension: {extension}')
-            sys.exit()
-        unique_query_ids = df_query.iloc[:, 0].unique()
+def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):

-
-
-
-
-        if isinstance(reference_data, str):
-            df_reference = get_reference_df(reference_data=reference_data)
-            unique_reference_ids = df_reference.iloc[:, 0].unique()
-        else:
-            dfs = []
-            unique_reference_ids = []
-            for f in reference_data:
-                tmp = get_reference_df(reference_data=f)
-                dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:, 0].unique())
-            df_reference = pd.concat(dfs, axis=0, ignore_index=True)
-
-    print(f'\nNote that there are {len(unique_query_ids)} unique query spectra, '
-          f'{len(unique_reference_ids)} unique reference spectra, and '
-          f'{len(set(unique_query_ids) & set(unique_reference_ids))} of the query and reference spectra IDs are in common.\n')
-
-    if output_path is None:
-        output_path = f'{Path.cwd()}/tuning_param_output.txt'
-        print(f'Warning: since output_path=None, the output will be written to the current working directory: {output_path}')
-
-    param_grid = product(
-        similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
-        noise_threshold, wf_mz, wf_int, LET_threshold,
-        entropy_dimension, high_quality_reference_library
-    )
-
-    results = []
-    total = (
-        len(similarity_measure) * len(weight) * len(spectrum_preprocessing_order) * len(mz_min) * len(mz_max) * len(int_min) *
-        len(int_max) * len(noise_threshold) * len(wf_mz) * len(wf_int) * len(LET_threshold) * len(entropy_dimension) * len(high_quality_reference_library)
-    )
-    done = 0
-    for params in param_grid:
-        res = _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, *params)
-        results.append(res)
-        done += 1
-        print(f'Completed {done}/{total} grid combinations.\n', flush=True)
-
-    df_out = pd.DataFrame(results, columns=[
-        'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER','MZ.MIN','MZ.MAX',
-        'INT.MIN','INT.MAX','NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION','HIGH.QUALITY.REFERENCE.LIBRARY'
-    ])
+    n_top_matches_to_save = 1
+    unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
+    unique_query_ids = df_query['id'].dropna().astype(str).unique().tolist()
+    all_similarity_rows = []

-
-
-
-            .str.replace("\"","",regex=False)
-            .str.replace("{","",regex=False)
-            .str.replace("}","",regex=False)
-            .str.replace(":","",regex=False)
-            .str.replace("Cosine","",regex=False)
-            .str.replace("Shannon","",regex=False)
-            .str.replace("Renyi","",regex=False)
-            .str.replace("Tsallis","",regex=False)
-            .str.replace(" ","",regex=False)
-        )
-
-    if return_output:
-        return df_out
-    else:
-        df_out.to_csv(output_path, index=False, sep='\t', quoting=csv.QUOTE_NONE)
-        print(f'Wrote results to {output_path}')
+    for query_idx, qid in enumerate(unique_query_ids):
+        if verbose:
+            print(f'query spectrum #{query_idx} is being identified')

+        q_mask = (df_query['id'] == qid)
+        q_idxs = np.where(q_mask)[0]
+        if q_idxs.size == 0:
+            all_similarity_rows.append([0.0]*len(unique_reference_ids))
+            continue

+        q_spec_base = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs], df_query['intensity'].iloc[q_idxs]], axis=1).reset_index(drop=True))

+        if 'precursor_ion_mz' in df_query.columns and 'precursor_ion_mz' in df_reference.columns and precursor_ion_mz_tolerance is not None:
+            precursor = float(df_query['precursor_ion_mz'].iloc[q_idxs[0]])
+            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor - precursor_ion_mz_tolerance, precursor + precursor_ion_mz_tolerance, inclusive='both'), ['id', 'mz_ratio', 'intensity']].copy()
+        else:
+            df_reference_tmp = df_reference[['id','mz_ratio','intensity']].copy()

-
+        if df_reference_tmp.empty:
+            all_similarity_rows.append([0.0]*len(unique_reference_ids))
+            continue

-
+        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))

-
-    for query_idx in range(0,len(unique_query_ids)):
-        if verbose is True:
-            print(f'query spectrum #{query_idx} is being identified')
-        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
-        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
-        #q_spec_tmp = q_spec_tmp.astype(float)
+        similarity_by_ref = {}

-
-
-
-            r_idxs_tmp = np.where(df_reference.iloc[:,0] == unique_reference_ids[ref_idx])[0]
-            r_spec = np.asarray(pd.concat([df_reference.iloc[r_idxs_tmp,1], df_reference.iloc[r_idxs_tmp,2]], axis=1).reset_index(drop=True))
-            #print(r_spec)
-            #r_spec = r_spec.astype(float)
+        for ref_id, r_df in ref_groups.items():
+            q_spec = q_spec_base.copy()
+            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))

            is_matched = False
            for transformation in spectrum_preprocessing_order:
-                if np.isinf(q_spec[:,1]).
-                    q_spec[:,1] =
-                if np.isinf(r_spec[:,1]).
-                    r_spec[:,1] =
-
-
-
-
-
-
-
+                if np.isinf(q_spec[:, 1]).any():
+                    q_spec[:, 1] = 0.0
+                if np.isinf(r_spec[:, 1]).any():
+                    r_spec[:, 1] = 0.0
+
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
+                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
+
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    m_spec = match_peaks_in_spectra(
+                        spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching
+                    )
+                    if m_spec.size == 0:
+                        q_spec = np.empty((0,2))
+                        r_spec = np.empty((0,2))
+                    else:
+                        q_spec = m_spec[:, 0:2]
+                        r_spec = m_spec[:, [0, 2]]
                    is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_int)
-                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_int)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method='standard')
-                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method='standard')
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = remove_noise(q_spec, nr = noise_threshold)
-                    if high_quality_reference_library == False:
-                        r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
-                    if high_quality_reference_library == False:
-                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)

-
-
-
-
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_int)
+                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_int)
+
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method='standard')
+                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method='standard')
+
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = remove_noise(q_spec, nr=noise_threshold)
+                    if not high_quality_reference_library:
+                        r_spec = remove_noise(r_spec, nr=noise_threshold)
+
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = filter_spec_lcms(
+                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                    )
+                    if not high_quality_reference_library:
+                        r_spec = filter_spec_lcms(
+                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                        )
+
+            if q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                q_ints = q_spec[:, 1]
+                r_ints = r_spec[:, 1]
+                if np.sum(q_ints) != 0 and np.sum(r_ints) != 0:
+                    sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
+                else:
+                    sim = 0.0
            else:
-
+                sim = 0.0

-
-        all_similarity_scores.append(similarity_scores)
+            similarity_by_ref[str(ref_id)] = float(sim)

-
-
-    df_scores.index.names = ['Query Spectrum ID']
+        row = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
+        all_similarity_rows.append(row)

-
-
-    for i in range(0, df_scores.shape[0]):
-        df_scores_tmp = df_scores
-        preds_tmp = []
-        scores_tmp = []
-        for j in range(0, n_top_matches_to_save):
-            top_ref_specs_tmp = df_scores_tmp.iloc[i,np.where(df_scores_tmp.iloc[i,:] == np.max(df_scores_tmp.iloc[i,:]))[0]]
-            cols_to_keep = np.where(df_scores_tmp.iloc[i,:] != np.max(df_scores_tmp.iloc[i,:]))[0]
-            df_scores_tmp = df_scores_tmp.iloc[:,cols_to_keep]
+    df_scores = pd.DataFrame(all_similarity_rows, index=unique_query_ids, columns=unique_reference_ids)
+    df_scores.index.name = 'QUERY.SPECTRUM.ID'

-
-
-
-            else:
-                scores_tmp.append(top_ref_specs_tmp.values[0])
-        preds.append(preds_tmp)
-        scores.append(scores_tmp)
-
-    preds = np.array(preds)
-    scores = np.array(scores)
-    out = np.c_[unique_query_ids,preds,scores]
-    df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
-    acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
-    return acc
+    top_idx = df_scores.values.argmax(axis=1)
+    top_scores = df_scores.values[np.arange(df_scores.shape[0]), top_idx]
+    top_ids = [df_scores.columns[i] for i in top_idx]

+    df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
+    if verbose:
+        print(df_tmp)

+    acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
+    return acc


 def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
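The rewritten get_acc_HRMS above narrows the reference library per query spectrum with a precursor-ion m/z window and then walks the candidates through a groupby on 'id'. A compact sketch of just that selection step; the column names follow the diff, the data is made up:

    import pandas as pd

    df_reference = pd.DataFrame({
        'id': ['ref1', 'ref1', 'ref2'],
        'precursor_ion_mz': [150.05, 150.05, 300.10],
        'mz_ratio': [50.0, 75.0, 120.0],
        'intensity': [10.0, 20.0, 5.0],
    })

    precursor = 150.06   # precursor m/z of the current query spectrum
    tolerance = 0.02     # precursor_ion_mz_tolerance

    window = df_reference['precursor_ion_mz'].between(precursor - tolerance, precursor + tolerance, inclusive='both')
    candidates = df_reference.loc[window, ['id', 'mz_ratio', 'intensity']]

    # one (m/z, intensity) block per surviving reference spectrum
    ref_groups = dict(tuple(candidates.groupby('id', sort=False)))
    for ref_id, r_df in ref_groups.items():
        print(ref_id, r_df[['mz_ratio', 'intensity']].to_numpy())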
@@ -713,7 +508,7 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,

     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
-    df_scores.index.names = ['
+    df_scores.index.names = ['QUERY.SPECTRUM.ID']

     preds = []
     scores = []
@@ -743,64 +538,40 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,



-def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
-    '''
-    runs spectral library matching on high-resolution mass spectrometry (HRMS) data
-
-    --query_data: mgf, mzML, or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: either string or list of strings with pass to mgf, mzML, sdf, and/or csv file(s) of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
-    --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
-    --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
-    --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of HRMS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL')
-    --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
-    --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
-    --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
-    --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
-    --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
-    --window_size_centroiding: Window size parameter used in centroiding a given spectrum. Default: 0.5
-    --window_size_matching: Window size parameter used in matching a query spectrum and a reference library spectrum. Default: 0.5
-    --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
-    --wf_mz: Mass/charge weight factor parameter. Default: 0.0
-    --wf_intensity: Intensity weight factor parameter. Default: 0.0
-    --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
-    --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
-    --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
-    --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
-    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
-    '''
-
+def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
     else:
         extension = query_data.rsplit('.',1)
         extension = extension[(len(extension)-1)]
-        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-            output_path_tmp = query_data[:-3] + '
+        if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'json' or extension == 'JSON' or extension == 'msp' or extension == 'MSP':
+            output_path_tmp = query_data[:-3] + 'txt'
             build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-            df_query = pd.read_csv(output_path_tmp)
-        if extension == '
-            df_query = pd.read_csv(query_data)
-        unique_query_ids = df_query
+            df_query = pd.read_csv(output_path_tmp, sep='\t')
+        if extension == 'txt' or extension == 'TXT':
+            df_query = pd.read_csv(query_data, sep='\t')
+        unique_query_ids = df_query['id'].unique()

     if reference_data is None:
-        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the
+        print('\nError: No argument passed to the mandatory reference_data. Please pass the path to the reference data.')
         sys.exit()
     else:
         if isinstance(reference_data,str):
             df_reference = get_reference_df(reference_data,likely_reference_ids)
-            unique_reference_ids = df_reference.iloc[:,0].unique()
         else:
             dfs = []
-            unique_reference_ids = []
             for f in reference_data:
                 tmp = get_reference_df(f,likely_reference_ids)
                 dfs.append(tmp)
-                unique_reference_ids.extend(tmp.iloc[:,0].unique())
             df_reference = pd.concat(dfs, axis=0, ignore_index=True)

+    if 'ionization_mode' in df_reference.columns.tolist() and ionization_mode != 'N/A' and ionization_mode != None:
+        df_reference = df_reference.loc[df_reference['ionization_mode']==ionization_mode]
+    if 'adduct' in df_reference.columns.tolist() and adduct != 'N/A' and adduct != None:
+        df_reference = df_reference.loc[df_reference['adduct']==adduct]
+
+    print(df_reference.loc[df_reference['id']=='Hectochlorin M+H'])

     if spectrum_preprocessing_order is not None:
         spectrum_preprocessing_order = list(spectrum_preprocessing_order)
@@ -888,62 +659,91 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
         print(f'Warning: writing similarity scores to {output_similarity_scores}')
 
 
-
-
-
+    unique_reference_ids = df_reference['id'].unique().tolist()
+    all_similarity_scores = []
+
+    for query_idx in range(len(unique_query_ids)):
+        if verbose:
             print(f'query spectrum #{query_idx} is being identified')
-        q_idxs_tmp = np.where(df_query.iloc[:,0] == unique_query_ids[query_idx])[0]
-        q_spec_tmp = np.asarray(pd.concat([df_query.iloc[q_idxs_tmp,1], df_query.iloc[q_idxs_tmp,2]], axis=1).reset_index(drop=True))
 
-
-
-
-
-
+        q_mask = (df_query['id'] == unique_query_ids[query_idx])
+        q_idxs_tmp = np.where(q_mask)[0]
+        q_spec_tmp = np.asarray(pd.concat([df_query['mz_ratio'].iloc[q_idxs_tmp], df_query['intensity'].iloc[q_idxs_tmp]], axis=1).reset_index(drop=True))
+
+        if 'precursor_ion_mz' in df_query.columns.tolist() and 'precursor_ion_mz' in df_reference.columns.tolist() and precursor_ion_mz_tolerance != None:
+            precursor_ion_mz_tmp = df_query['precursor_ion_mz'].iloc[q_idxs_tmp[0]]
+            df_reference_tmp = df_reference.loc[df_reference['precursor_ion_mz'].between(precursor_ion_mz_tmp-precursor_ion_mz_tolerance, precursor_ion_mz_tmp+precursor_ion_mz_tolerance, inclusive='both'),['id','mz_ratio','intensity']].copy()
+        else:
+            df_reference_tmp = df_reference.copy()
+
+        ref_groups = dict(tuple(df_reference_tmp.groupby('id', sort=False)))
+        unique_reference_ids_tmp = list(ref_groups.keys())
+
+        similarity_by_ref = {}
+        for ref_id in unique_reference_ids_tmp:
+            q_spec = q_spec_tmp.copy()
+            r_df = ref_groups[ref_id]
+            r_spec = np.asarray(pd.concat([r_df['mz_ratio'], r_df['intensity']], axis=1).reset_index(drop=True))
+            #print('\nhere!!!!!!!!!!!!!!!')
+            #print(r_spec)
 
             is_matched = False
+
             for transformation in spectrum_preprocessing_order:
-                if np.isinf(q_spec[:,1]).sum() > 0:
-                    q_spec[:,1] = np.zeros(q_spec.shape[0])
-                if np.isinf(r_spec[:,1]).sum() > 0:
-                    r_spec[:,1] = np.zeros(r_spec.shape[0])
-
-
-
-
+                if np.isinf(q_spec[:, 1]).sum() > 0:
+                    q_spec[:, 1] = np.zeros(q_spec.shape[0])
+                if np.isinf(r_spec[:, 1]).sum() > 0:
+                    r_spec[:, 1] = np.zeros(r_spec.shape[0])
+
+                if transformation == 'C' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = centroid_spectrum(q_spec, window_size=window_size_centroiding)
+                    r_spec = centroid_spectrum(r_spec, window_size=window_size_centroiding)
+
+                if transformation == 'M' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
                     m_spec = match_peaks_in_spectra(spec_a=q_spec, spec_b=r_spec, window_size=window_size_matching)
-                    q_spec = m_spec[:,0:2]
-                    r_spec = m_spec[:,[0,2]]
+                    q_spec = m_spec[:, 0:2]
+                    r_spec = m_spec[:, [0, 2]]
                     is_matched = True
-                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = wf_transform(q_spec[:,0], q_spec[:,1], wf_mz, wf_intensity)
-                    r_spec[:,1] = wf_transform(r_spec[:,0], r_spec[:,1], wf_mz, wf_intensity)
-                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec[:,1] = LE_transform(q_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                    r_spec[:,1] = LE_transform(r_spec[:,1], LET_threshold, normalization_method=normalization_method)
-                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = remove_noise(q_spec, nr = noise_threshold)
-                    if high_quality_reference_library == False:
-                        r_spec = remove_noise(r_spec, nr = noise_threshold)
-                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[1] > 1:
-                    q_spec = filter_spec_lcms(q_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
-                    if high_quality_reference_library == False:
-                        r_spec = filter_spec_lcms(r_spec, mz_min = mz_min, mz_max = mz_max, int_min = int_min, int_max = int_max, is_matched = is_matched)
 
-
-
-
-
-
+                if transformation == 'W' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = wf_transform(q_spec[:, 0], q_spec[:, 1], wf_mz, wf_intensity)
+                    r_spec[:, 1] = wf_transform(r_spec[:, 0], r_spec[:, 1], wf_mz, wf_intensity)
+
+                if transformation == 'L' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec[:, 1] = LE_transform(q_spec[:, 1], LET_threshold, normalization_method=normalization_method)
+                    r_spec[:, 1] = LE_transform(r_spec[:, 1], LET_threshold, normalization_method=normalization_method)
+
+                if transformation == 'N' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = remove_noise(q_spec, nr=noise_threshold)
+                    if not high_quality_reference_library:
+                        r_spec = remove_noise(r_spec, nr=noise_threshold)
+
+                if transformation == 'F' and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                    q_spec = filter_spec_lcms(
+                        q_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                    )
+                    if not high_quality_reference_library:
+                        r_spec = filter_spec_lcms(
+                            r_spec, mz_min=mz_min, mz_max=mz_max, int_min=int_min, int_max=int_max, is_matched=is_matched
+                        )
+
+            q_ints = q_spec[:, 1]
+            r_ints = r_spec[:, 1]
+
+            if np.sum(q_ints) != 0 and np.sum(r_ints) != 0 and q_spec.shape[0] > 1 and r_spec.shape[0] > 1:
+                sim = get_similarity(similarity_measure, q_ints, r_ints, weights, entropy_dimension)
             else:
-
+                sim = 0.0
 
-
-        all_similarity_scores.append(similarity_scores)
+            similarity_by_ref[ref_id] = sim
 
-
+        row_scores = [similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids]
+        all_similarity_scores.append(row_scores)
+
+    df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
     df_scores.index = unique_query_ids
-    df_scores.index.names = ['
+    df_scores.index.names = ['QUERY.SPECTRUM.ID']
+
 
     preds = []
     scores = []
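The rewritten loop above scores each query spectrum against reference spectra grouped by 'id': candidates are optionally restricted to references whose precursor_ion_mz lies within precursor_ion_mz_tolerance of the query's precursor, each query/reference pair is run through the preprocessing order (C centroiding, M peak matching, W weight-factor transform, L low-entropy transform, N noise removal, F m/z and intensity filtering), and the scores fill a query-by-reference matrix in which references outside the precursor window default to 0.0. The sketch below mirrors that flow with a stand-in cosine similarity in place of pycompound's preprocessing and get_similarity helpers; the table layout follows the diff, but the data and cosine_sim are hypothetical.

import numpy as np
import pandas as pd

def cosine_sim(q_ints, r_ints):
    # stand-in for get_similarity(similarity_measure, ...)
    denom = np.linalg.norm(q_ints) * np.linalg.norm(r_ints)
    return float(np.dot(q_ints, r_ints) / denom) if denom else 0.0

df_query = pd.DataFrame({'id': ['Q1', 'Q1'], 'mz_ratio': [100.0, 200.0],
                         'intensity': [1.0, 0.5], 'precursor_ion_mz': [201.0, 201.0]})
df_reference = pd.DataFrame({'id': ['R1', 'R1', 'R2'], 'mz_ratio': [100.0, 200.0, 400.0],
                             'intensity': [0.9, 0.6, 1.0], 'precursor_ion_mz': [201.0, 201.0, 401.0]})
precursor_ion_mz_tolerance = 0.5

unique_query_ids = df_query['id'].unique().tolist()
unique_reference_ids = df_reference['id'].unique().tolist()
all_similarity_scores = []
for qid in unique_query_ids:
    q_rows = df_query.loc[df_query['id'] == qid]
    q_ints = q_rows['intensity'].to_numpy()
    # keep only references whose precursor m/z falls inside the tolerance window
    pre = q_rows['precursor_ion_mz'].iloc[0]
    candidates = df_reference.loc[df_reference['precursor_ion_mz'].between(
        pre - precursor_ion_mz_tolerance, pre + precursor_ion_mz_tolerance)]
    similarity_by_ref = {}
    for ref_id, r_rows in candidates.groupby('id', sort=False):
        r_ints = r_rows['intensity'].to_numpy()
        # the real code first applies the C/M/W/L/N/F preprocessing order to both spectra;
        # here the peaks already line up, so the intensity vectors are scored directly
        similarity_by_ref[ref_id] = cosine_sim(q_ints, r_ints) if len(q_ints) == len(r_ints) else 0.0
    all_similarity_scores.append([similarity_by_ref.get(ref_id, 0.0) for ref_id in unique_reference_ids])

df_scores = pd.DataFrame(all_similarity_scores, index=unique_query_ids, columns=unique_reference_ids)
print(df_scores)   # R2 falls outside the precursor window, so its score defaults to 0.0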
@@ -976,7 +776,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
 
     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
-    df_top_ref_specs.index.names = ['
+    df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
 
     df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
 
@@ -993,33 +793,7 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
 
 
 
-def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False):
-    '''
-    runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data
-
-    --query_data: cdf or csv file of query mass spectrum/spectra to be identified. If csv file, each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.
-    --reference_data: cdf of csv file of the reference mass spectra. If csv file, each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.
-    --likely_reference_ids: CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
-    --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
-    --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
-    --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-4 characters chosen from F, N, L, W representing filtering based on mass/charge and intensity values, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WN\' is passed, then each spectrum will undergo a weight factor transformation and then noise removal. Default: FNLW')
-    --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
-    --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
-    --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
-    --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
-    --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
-    --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
-    --wf_mz: Mass/charge weight factor parameter. Default: 0.0
-    --wf_intensity: Intensity weight factor parameter. Default: 0.0
-    --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
-    --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
-    --normalization_method: Method used to normalize the intensities of each spectrum so that the intensities sum to 1. Since the objects entropy quantifies the uncertainy of must be probability distributions, the intensities of a given spectrum must sum to 1 prior to computing the entropy of the given spectrum intensities. Options: \'standard\' and \'softmax\'. Default: standard.
-    --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
-    --print_id_results: Flag that prints identification results if True. Default: False
-    --output_identification: Output TXT file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename \'output_identification.txt\'.
-    --output_similarity_scores: Output TXT file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this TXT file is written to the current working directory with filename \'output_all_similarity_scores\'.txt.')
-    '''
-
+def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False, verbose=True):
     if query_data is None:
         print('\nError: No argument passed to the mandatory query_data. Please pass the path to the CSV file of the query data.')
         sys.exit()
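For orientation, a hypothetical call to the updated NRMS entry point; the keyword names, defaults, and the new verbose flag come from the signature above, while the file paths and chosen values are placeholders.

from pycompound.spec_lib_matching import run_spec_lib_matching_on_NRMS_data

# paths are hypothetical; the query/reference file formats are assumed, not taken from the diff
run_spec_lib_matching_on_NRMS_data(
    query_data='query_spectra.txt',          # hypothetical path
    reference_data='reference_library.csv',  # hypothetical path
    spectrum_preprocessing_order='FNLW',
    similarity_measure='cosine',
    noise_threshold=0.05,
    n_top_matches_to_save=3,
    print_id_results=True,
    verbose=True,
)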
@@ -1027,11 +801,11 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
     extension = query_data.rsplit('.',1)
     extension = extension[(len(extension)-1)]
     if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
-        output_path_tmp = query_data[:-3] + '
+        output_path_tmp = query_data[:-3] + 'txt'
         build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
-        df_query = pd.read_csv(output_path_tmp)
-    if extension == '
-        df_query = pd.read_csv(query_data)
+        df_query = pd.read_csv(output_path_tmp, sep='\t')
+    if extension == 'txt' or extension == 'TXT':
+        df_query = pd.read_csv(query_data, sep='\t')
     unique_query_ids = df_query.iloc[:,0].unique()
 
     if reference_data is None:
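The change above makes the NRMS path read both converted raw files and plain 'txt'/'TXT' query files as tab-separated tables (sep='\t'), with the spectrum identifier still taken from the left-most column. A small sketch of reading such a file, assuming the wide one-column-per-m/z layout described in the docstring removed in the previous hunk; the header names and values are invented.

import io
import pandas as pd

# hypothetical tab-separated query table: identifier first, then one column per m/z value
query_txt = "id\t50\t51\t52\nQ1\t0.0\t10.0\t3.5\nQ2\t1.2\t0.0\t7.8\n"
df_query = pd.read_csv(io.StringIO(query_txt), sep='\t')
unique_query_ids = df_query.iloc[:, 0].unique()
print(unique_query_ids)   # ['Q1' 'Q2']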
@@ -1175,7 +949,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
 
     df_scores = pd.DataFrame(all_similarity_scores, columns = unique_reference_ids)
     df_scores.index = unique_query_ids
-    df_scores.index.names = ['
+    df_scores.index.names = ['QUERY.SPECTRUM.ID']
 
     preds = []
     scores = []
@@ -1208,7 +982,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
 
     df_top_ref_specs = pd.DataFrame(out, columns = [*cnames_preds, *cnames_scores])
     df_top_ref_specs.index = unique_query_ids
-    df_top_ref_specs.index.names = ['
+    df_top_ref_specs.index.names = ['QUERY.SPECTRUM.ID']
 
     if print_id_results == True:
         print(df_top_ref_specs.to_string())