PyPI - pycompound - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl - Mend

pycompound 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

pycompound/build_library.py +77 -20
pycompound/plot_spectra.py +73 -111
pycompound/processing.py +5 -5
pycompound/spec_lib_matching.py +262 -491
pycompound/spec_lib_matching_CLI.py +48 -2
pycompound/tuning_CLI_DE.py +22 -22
pycompound/tuning_CLI_grid.py +22 -6
pycompound-0.1.8.dist-info/METADATA +824 -0
pycompound-0.1.8.dist-info/RECORD +14 -0
{pycompound-0.1.6.dist-info → pycompound-0.1.8.dist-info}/top_level.txt +0 -1
app.py +0 -1519
pycompound-0.1.6.dist-info/METADATA +0 -27
pycompound-0.1.6.dist-info/RECORD +0 -15
{pycompound-0.1.6.dist-info → pycompound-0.1.8.dist-info}/WHEEL +0 -0
{pycompound-0.1.6.dist-info → pycompound-0.1.8.dist-info}/licenses/LICENSE +0 -0

pycompound/spec_lib_matching_CLI.py CHANGED Viewed

@@ -12,6 +12,9 @@ parser = argparse.ArgumentParser()
 parser.add_argument('--query_data', type=str, metavar='\b', help='CSV file of query mass spectrum/spectra to be identified. Each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.')
 parser.add_argument('--reference_data', type=str, metavar='\b', help='CSV file of the reference mass spectra. Each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.')
+parser.add_argument('--precursor_ion_mz_tolerance', type=str, metavar='\b', default=None, help='Precursor ion m/z tolerance (positive real number; only applicable to HRMS)). Default=None')
+parser.add_argument('--ionization_mode', type=str, metavar='\b', default=None, help='Ionization mode (only applicable to HRMS). Options: \'Positive\', \'Negative\', or \'N/A\'.')
+parser.add_argument('--adduct', type=str, metavar='\b', default='H', help='Adduct (only applicable to HRMS). Options: \'H\', \'NH3\', \'NH4\', \'OH\', \'Cl\', \'K\', \'Li\', \'Na\'. Default: \'H\'.')
 parser.add_argument('--likely_reference_ids', type=str, metavar='\b', help='CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: none (i.e. default is to use entire reference library)')
 parser.add_argument('--similarity_measure', type=str, default='cosine', metavar='\b', help='Similarity measure: options are cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger. Default: cosine.')
 parser.add_argument('--weights', type=json.loads, default={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, metavar='\b', help='dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.')
@@ -39,9 +42,52 @@ args = parser.parse_args()
 if args.chromatography_platform == 'HRMS':
-    run_spec_lib_matching_on_HRMS_data(query_data=args.query_data, reference_data=args.reference_data, likely_reference_ids=args.likely_reference_ids, similarity_measure=args.similarity_measure, weights=args.weights, spectrum_preprocessing_order=args.spectrum_preprocessing_order, high_quality_reference_library=args.high_quality_reference_library, mz_min=args.mz_min, mz_max=args.mz_max, int_min=args.int_min, int_max=args.int_max, window_size_centroiding=args.window_size_centroiding, window_size_matching=args.window_size_matching, noise_threshold=args.noise_threshold, wf_mz=args.wf_mz, wf_intensity=args.wf_intensity, LET_threshold=args.LET_threshold, entropy_dimension=args.entropy_dimension, n_top_matches_to_save=args.n_top_matches_to_save, print_id_results=args.print_id_results, output_identification=args.output_identification, output_similarity_scores=args.output_similarity_scores)
+    run_spec_lib_matching_on_HRMS_data(query_data=args.query_data,
+                                       reference_data=args.reference_data,
+                                       precursor_ion_mz_tolerance=args.precursor_ion_mz_tolerance,
+                                       ionization_mode=args.ionization_mode,
+                                       adduct=args.adduct,
+                                       likely_reference_ids=args.likely_reference_ids,
+                                       similarity_measure=args.similarity_measure,
+                                       weights=args.weights,
+                                       spectrum_preprocessing_order=args.spectrum_preprocessing_order,
+                                       high_quality_reference_library=args.high_quality_reference_library,
+                                       mz_min=args.mz_min,
+                                       mz_max=args.mz_max,
+                                       int_min=args.int_min,
+                                       int_max=args.int_max,
+                                       window_size_centroiding=args.window_size_centroiding,
+                                       window_size_matching=args.window_size_matching,
+                                       noise_threshold=args.noise_threshold,
+                                       wf_mz=args.wf_mz,
+                                       wf_intensity=args.wf_intensity,
+                                       LET_threshold=args.LET_threshold,
+                                       entropy_dimension=args.entropy_dimension,
+                                       n_top_matches_to_save=args.n_top_matches_to_save,
+                                       print_id_results=args.print_id_results,
+                                       output_identification=args.output_identification,
+                                       output_similarity_scores=args.output_similarity_scores)
 if args.chromatography_platform == 'NRMS':
-    run_spec_lib_matching_on_NRMS_data(query_data=args.query_data, reference_data=args.reference_data, likely_reference_ids=args.likely_reference_ids, similarity_measure=args.similarity_measure, weights=args.weights, spectrum_preprocessing_order=args.spectrum_preprocessing_order, high_quality_reference_library=args.high_quality_reference_library, mz_min=args.mz_min, mz_max=args.mz_max, int_min=args.int_min, int_max=args.int_max, noise_threshold=args.noise_threshold, wf_mz=args.wf_mz, wf_intensity=args.wf_intensity, LET_threshold=args.LET_threshold, entropy_dimension=args.entropy_dimension, n_top_matches_to_save=args.n_top_matches_to_save, print_id_results=args.print_id_results, output_identification=args.output_identification, output_similarity_scores=args.output_similarity_scores)
+    run_spec_lib_matching_on_NRMS_data(query_data=args.query_data,
+                                       reference_data=args.reference_data,
+                                       likely_reference_ids=args.likely_reference_ids,
+                                       similarity_measure=args.similarity_measure,
+                                       weights=args.weights,
+                                       spectrum_preprocessing_order=args.spectrum_preprocessing_order,
+                                       high_quality_reference_library=args.high_quality_reference_library,
+                                       mz_min=args.mz_min,
+                                       mz_max=args.mz_max,
+                                       int_min=args.int_min,
+                                       int_max=args.int_max,
+                                       noise_threshold=args.noise_threshold,
+                                       wf_mz=args.wf_mz,
+                                       wf_intensity=args.wf_intensity,
+                                       LET_threshold=args.LET_threshold,
+                                       entropy_dimension=args.entropy_dimension,
+                                       n_top_matches_to_save=args.n_top_matches_to_save,
+                                       print_id_results=args.print_id_results,
+                                       output_identification=args.output_identification,
+                                       output_similarity_scores=args.output_similarity_scores)

pycompound/tuning_CLI_DE.py CHANGED Viewed

@@ -42,9 +42,7 @@ DEFAULT_PARAMS = {
 }
-# ---------- Utilities ----------
 def parse_bound(s: str) -> Tuple[str, Tuple[float, float]]:
-    # "name=min:max" → (name, (min, max))
     if "=" not in s or ":" not in s:
         raise argparse.ArgumentTypeError(f"Bad --bound format '{s}'. Use name=min:max")
     name, rng = s.split("=", 1)
@@ -59,7 +57,6 @@ def parse_bound(s: str) -> Tuple[str, Tuple[float, float]]:
 def parse_default(s: str) -> Tuple[str, float]:
-    # "name=value" → (name, value)
     if "=" not in s:
         raise argparse.ArgumentTypeError(f"Bad --default format '{s}'. Use name=value")
     name, val = s.split("=", 1)
@@ -82,7 +79,7 @@ def objective_HRMS(X: np.ndarray, ctx: dict) -> float:
     p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
     acc = get_acc_HRMS(
         ctx["df_query"], ctx["df_reference"],
-        ctx["uq"], ctx["ur"],
+        ctx["precursor_ion_mz_tolerance"], ctx["ionization_mode"], ctx["adduct"],
         ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
         ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
         p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
@@ -103,8 +100,7 @@ def objective_NRMS(X: np.ndarray, ctx: dict) -> float:
         ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
         ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
         p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
-        ctx["high_quality_reference_library"],
-        verbose=False
+        ctx["high_quality_reference_library"]
     )
     print(f"\n{ctx['optimize_params']} = {np.array(X)}\naccuracy: {acc*100}%")
     return 1.0 - acc
@@ -112,15 +108,16 @@ def objective_NRMS(X: np.ndarray, ctx: dict) -> float:
 # ---------- Main CLI ----------
 def main():
-    p = argparse.ArgumentParser(
-        description="Parameter tuning via Differential Evolution for HRMS/NRMS using pycompound."
-    )
+    p = argparse.ArgumentParser(description="Parameter tuning via Differential Evolution for HRMS/NRMS using pycompound.")
     p.add_argument("--chromatography_platform", choices=["HRMS", "NRMS"], default="HRMS", help="Chromatography Platform.")
-    p.add_argument("--query_data", required=True, help="Path to query CSV (must contain 'id' column).")
-    p.add_argument("--reference_data", required=True, nargs="+", help="Path(s) to reference CSV(s) (must contain 'id').")
-    p.add_argument("--similarity_measure", default="cosine", choices=["cosine", "renyi", "tsallis"], help="Similarity measure.")
+    p.add_argument("--query_data", required=True, help="Path to query TXT (must contain 'id' column).")
+    p.add_argument("--reference_data", required=True, nargs="+", help="Path(s) to reference TXT(s) (must contain 'id').")
+    p.add_argument("--precursor_ion_mz_tolerance", type=float, default=None, help='Precursor ion m/z tolerance (positive real number; only applicable to HRMS)). Default=None')
+    p.add_argument("--ionization_mode", choices=['Positive','Negative',None], default=None, help='Ionization mode (only applicable to HRMS). Options: \'Positive\', \'Negative\', or \'None\'. Default=None')
+    p.add_argument("--adduct", choices=['H','NH3','NH4','OH','K','Li','Na',None], default=None, help='Adduct (only applicable to HRMS). Options: \'H\', \'NH3\', \'NH4\', \'OH\', \'Cl\', \'K\', \'Li\', \'Na\'. Default: \'H\'.')
+    p.add_argument("--similarity_measure", default="cosine", choices=["cosine", "shannon", "renyi", "tsallis"], help="Similarity measure.")
     p.add_argument("--weights", default="", help="Weights spec; empty means None.")
-    p.add_argument("--spectrum-order", default="CNMWL", help="Spectrum preprocessing order string.")
+    p.add_argument("--spectrum_preprocessing_order", default="CNMWL", help="Spectrum preprocessing order string.")
     p.add_argument("--mz-min", type=float, default=0.0)
     p.add_argument("--mz-max", type=float, default=999_999_999.0)
     p.add_argument("--int-min", type=float, default=0.0)
@@ -143,19 +140,19 @@ def main():
     qpath = Path(args.query_data)
     if not qpath.exists():
-        sys.exit(f"Query CSV not found: {qpath}")
+        sys.exit(f"Query TXT not found: {qpath}")
-    df_query = pd.read_csv(qpath)
+    df_query = pd.read_csv(qpath,sep='\t')
     if "id" not in df_query.columns:
-        sys.exit("Query CSV must contain an 'id' column.")
+        sys.exit("Query TXT must contain an 'id' column.")
     ref_paths = [Path(pth) for pth in args.reference_data]
     for r in ref_paths:
         if not r.exists():
-            sys.exit(f"Reference CSV not found: {r}")
-    df_reference = pd.concat([pd.read_csv(r) for r in ref_paths], axis=0, ignore_index=True)
+            sys.exit(f"Reference TXT not found: {r}")
+    df_reference = pd.concat([pd.read_csv(r,sep='\t') for r in ref_paths], axis=0, ignore_index=True)
     if "id" not in df_reference.columns:
-        sys.exit("Reference CSV must contain an 'id' column.")
+        sys.exit("Reference TXT must contain an 'id' column.")
     uq = df_query["id"].unique().tolist()
     ur = df_reference["id"].unique().tolist()
@@ -177,11 +174,13 @@ def main():
     ctx = dict(
         df_query=df_query,
         df_reference=df_reference,
-        uq=uq,
-        ur=ur,
+        precursor_ion_mz_tolerance=args.precursor_ion_mz_tolerance,
+        ionization_mode=args.ionization_mode,
+        uq=uq, ur=ur,
+        adduct=args.adduct,
         similarity_measure=args.similarity_measure,
         weights=(None if args.weights.strip() == "" else args.weights),
-        spectrum_preprocessing_order=args.spectrum_order,
+        spectrum_preprocessing_order=args.spectrum_preprocessing_order,
         mz_min=float(args.mz_min),
         mz_max=float(args.mz_max),
         int_min=float(args.int_min),
@@ -211,6 +210,7 @@ def main():
         seed=int(args.seed),
         workers=int(args.workers),
         callback=_cb,
+        updating='deferred' if int(args.workers)!=1 else 'immediate'
     )
     best_params = _vector_to_full_params(result.x, default_params, args.opt)

pycompound/tuning_CLI_grid.py CHANGED Viewed

@@ -10,11 +10,14 @@ parser = argparse.ArgumentParser()
 parser.add_argument('--query_data', type=str, metavar='\b', help='CSV file of query mass spectrum/spectra to be identified. Each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.')
 parser.add_argument('--reference_data', type=str, metavar='\b', help='CSV file of the reference mass spectra. Each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.')
+parser.add_argument('--precursor_ion_mz_tolerance', type=str, metavar='\b', default=None, help='Precursor ion m/z tolerance (positive real number; only applicable to HRMS)). Default=None')
+parser.add_argument('--ionization_mode', type=str, metavar='\b', default=None, help='Ionization mode (only applicable to HRMS). Options: \'Positive\', \'Negative\', or \'N/A\'.')
+parser.add_argument('--adduct', type=str, metavar='\b', default='H', help='Adduct (only applicable to HRMS). Options: \'H\', \'NH3\', \'NH4\', \'OH\', \'Cl\', \'K\', \'Li\', \'Na\'. Default: \'H\'.')
 parser.add_argument('--likely_reference_ids', type=str, metavar='\b', help='CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: none (i.e. default is to use entire reference library)')
 parser.add_argument('--similarity_measure', type=str, default='cosine', metavar='\b', help='Similarity measure: options are cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger. Default: cosine.')
 parser.add_argument('--weights', type=json.loads, default={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, metavar='\b', help='dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.')
 parser.add_argument('--chromatography_platform', type=str, metavar='\b', help='Chromatography platform: options are \'HRMS\' and \'NRMS\'. Mandatory argument.')
-parser.add_argument('--spectrum_preprocessing_order', type=str, metavar='\b', help='The LC-MS/MS spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of LC-MS/MS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL for HRMS, FNLW for NRMS')
+parser.add_argument('--spectrum_preprocessing_order', type=str, metavar='\b', default=None, help='The LC-MS/MS spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of LC-MS/MS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL for HRMS, FNLW for NRMS')
 parser.add_argument('--high_quality_reference_library', type=str, default='False', metavar='\b', help='True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
 parser.add_argument('--mz_min', type=str, default='0', metavar='\b', help='Remove all peaks with mass/charge less than mz_min in each spectrum. Default: 0')
 parser.add_argument('--mz_max', type=str, default='999999999999', metavar='\b', help='Remove all peaks with mass/charge greater than mz_max in each spectrum. Default: 999999999999')
@@ -31,9 +34,9 @@ parser.add_argument('--output_path', type=str, default=f'{Path.cwd()}/output_tun
 args = parser.parse_args()
-if args.chromatography_platform == 'HRMS':
+if args.chromatography_platform == 'HRMS' and args.spectrum_preprocessing_order == None:
     spectrum_preprocessing_order = 'FCNMWL'
-elif args.chromatography_platform == 'NRMS':
+elif args.chromatography_platform == 'NRMS' and args.spectrum_preprocessing_order == None:
     spectrum_preprocessing_order = 'FNLW'
 else:
     print('Error: chromatography_platform must be either \'HRMS\' or \'NRMS\'')
@@ -59,11 +62,24 @@ grid['wf_int'] = [float(x) for x in grid['wf_int']]
 grid['LET_threshold'] = [float(x) for x in grid['LET_threshold']]
 grid['entropy_dimension'] = [float(x) for x in grid['entropy_dimension']]
+if args.precursor_ion_mz_tolerance == None:
+    precursor_ion_mz_tolerance_tmp = None
+else:
+    precursor_ion_mz_tolerance_tmp = float(args.precursor_ion_mz_tolerance)
 if args.chromatography_platform == 'HRMS':
-    tune_params_on_HRMS_data_grid(query_data=args.query_data, reference_data=args.reference_data, grid=grid, output_path=args.output_path)
+    tune_params_on_HRMS_data_grid(query_data=args.query_data,
+                                  reference_data=args.reference_data,
+                                  precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
+                                  ionization_mode=args.ionization_mode,
+                                  adduct=args.adduct,
+                                  grid=grid,
+                                  output_path=args.output_path)
 if args.chromatography_platform == 'NRMS':
-    tune_params_on_NRMS_data_grid(query_data=args.query_data, reference_data=args.reference_data, grid=grid, output_path=args.output_path)
+    tune_params_on_NRMS_data_grid(query_data=args.query_data,
+    reference_data=args.reference_data,
+    grid=grid,
+    output_path=args.output_path)

pycompound 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

pycompound 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl