pycompound-0.1.6-py3-none-any.whl → pycompound-0.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycompound/build_library.py +77 -20
- pycompound/plot_spectra.py +73 -111
- pycompound/processing.py +5 -5
- pycompound/spec_lib_matching.py +262 -491
- pycompound/spec_lib_matching_CLI.py +48 -2
- pycompound/tuning_CLI_DE.py +22 -22
- pycompound/tuning_CLI_grid.py +22 -6
- pycompound-0.1.8.dist-info/METADATA +824 -0
- pycompound-0.1.8.dist-info/RECORD +14 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.8.dist-info}/top_level.txt +0 -1
- app.py +0 -1519
- pycompound-0.1.6.dist-info/METADATA +0 -27
- pycompound-0.1.6.dist-info/RECORD +0 -15
- {pycompound-0.1.6.dist-info → pycompound-0.1.8.dist-info}/WHEEL +0 -0
- {pycompound-0.1.6.dist-info → pycompound-0.1.8.dist-info}/licenses/LICENSE +0 -0
pycompound/spec_lib_matching_CLI.py
CHANGED
|
@@ -12,6 +12,9 @@ parser = argparse.ArgumentParser()
|
|
|
12
12
|
|
|
13
13
|
parser.add_argument('--query_data', type=str, metavar='\b', help='CSV file of query mass spectrum/spectra to be identified. Each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.')
|
|
14
14
|
parser.add_argument('--reference_data', type=str, metavar='\b', help='CSV file of the reference mass spectra. Each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.')
|
|
15
|
+
parser.add_argument('--precursor_ion_mz_tolerance', type=str, metavar='\b', default=None, help='Precursor ion m/z tolerance (positive real number; only applicable to HRMS)). Default=None')
|
|
16
|
+
parser.add_argument('--ionization_mode', type=str, metavar='\b', default=None, help='Ionization mode (only applicable to HRMS). Options: \'Positive\', \'Negative\', or \'N/A\'.')
|
|
17
|
+
parser.add_argument('--adduct', type=str, metavar='\b', default='H', help='Adduct (only applicable to HRMS). Options: \'H\', \'NH3\', \'NH4\', \'OH\', \'Cl\', \'K\', \'Li\', \'Na\'. Default: \'H\'.')
|
|
15
18
|
parser.add_argument('--likely_reference_ids', type=str, metavar='\b', help='CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: none (i.e. default is to use entire reference library)')
|
|
16
19
|
parser.add_argument('--similarity_measure', type=str, default='cosine', metavar='\b', help='Similarity measure: options are cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger. Default: cosine.')
|
|
17
20
|
parser.add_argument('--weights', type=json.loads, default={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, metavar='\b', help='dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.')
|
|
@@ -39,9 +42,52 @@ args = parser.parse_args()
|
|
|
39
42
|
|
|
40
43
|
|
|
41
44
|
if args.chromatography_platform == 'HRMS':
|
|
42
|
-
run_spec_lib_matching_on_HRMS_data(query_data=args.query_data,
|
|
45
|
+
run_spec_lib_matching_on_HRMS_data(query_data=args.query_data,
|
|
46
|
+
reference_data=args.reference_data,
|
|
47
|
+
precursor_ion_mz_tolerance=args.precursor_ion_mz_tolerance,
|
|
48
|
+
ionization_mode=args.ionization_mode,
|
|
49
|
+
adduct=args.adduct,
|
|
50
|
+
likely_reference_ids=args.likely_reference_ids,
|
|
51
|
+
similarity_measure=args.similarity_measure,
|
|
52
|
+
weights=args.weights,
|
|
53
|
+
spectrum_preprocessing_order=args.spectrum_preprocessing_order,
|
|
54
|
+
high_quality_reference_library=args.high_quality_reference_library,
|
|
55
|
+
mz_min=args.mz_min,
|
|
56
|
+
mz_max=args.mz_max,
|
|
57
|
+
int_min=args.int_min,
|
|
58
|
+
int_max=args.int_max,
|
|
59
|
+
window_size_centroiding=args.window_size_centroiding,
|
|
60
|
+
window_size_matching=args.window_size_matching,
|
|
61
|
+
noise_threshold=args.noise_threshold,
|
|
62
|
+
wf_mz=args.wf_mz,
|
|
63
|
+
wf_intensity=args.wf_intensity,
|
|
64
|
+
LET_threshold=args.LET_threshold,
|
|
65
|
+
entropy_dimension=args.entropy_dimension,
|
|
66
|
+
n_top_matches_to_save=args.n_top_matches_to_save,
|
|
67
|
+
print_id_results=args.print_id_results,
|
|
68
|
+
output_identification=args.output_identification,
|
|
69
|
+
output_similarity_scores=args.output_similarity_scores)
|
|
43
70
|
|
|
44
71
|
|
|
45
72
|
if args.chromatography_platform == 'NRMS':
|
|
46
|
-
run_spec_lib_matching_on_NRMS_data(query_data=args.query_data,
|
|
73
|
+
run_spec_lib_matching_on_NRMS_data(query_data=args.query_data,
|
|
74
|
+
reference_data=args.reference_data,
|
|
75
|
+
likely_reference_ids=args.likely_reference_ids,
|
|
76
|
+
similarity_measure=args.similarity_measure,
|
|
77
|
+
weights=args.weights,
|
|
78
|
+
spectrum_preprocessing_order=args.spectrum_preprocessing_order,
|
|
79
|
+
high_quality_reference_library=args.high_quality_reference_library,
|
|
80
|
+
mz_min=args.mz_min,
|
|
81
|
+
mz_max=args.mz_max,
|
|
82
|
+
int_min=args.int_min,
|
|
83
|
+
int_max=args.int_max,
|
|
84
|
+
noise_threshold=args.noise_threshold,
|
|
85
|
+
wf_mz=args.wf_mz,
|
|
86
|
+
wf_intensity=args.wf_intensity,
|
|
87
|
+
LET_threshold=args.LET_threshold,
|
|
88
|
+
entropy_dimension=args.entropy_dimension,
|
|
89
|
+
n_top_matches_to_save=args.n_top_matches_to_save,
|
|
90
|
+
print_id_results=args.print_id_results,
|
|
91
|
+
output_identification=args.output_identification,
|
|
92
|
+
output_similarity_scores=args.output_similarity_scores)
|
|
47
93
|
|
pycompound/tuning_CLI_DE.py
CHANGED
|
@@ -42,9 +42,7 @@ DEFAULT_PARAMS = {
|
|
|
42
42
|
}
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
# ---------- Utilities ----------
|
|
46
45
|
def parse_bound(s: str) -> Tuple[str, Tuple[float, float]]:
|
|
47
|
-
# "name=min:max" → (name, (min, max))
|
|
48
46
|
if "=" not in s or ":" not in s:
|
|
49
47
|
raise argparse.ArgumentTypeError(f"Bad --bound format '{s}'. Use name=min:max")
|
|
50
48
|
name, rng = s.split("=", 1)
|
|
@@ -59,7 +57,6 @@ def parse_bound(s: str) -> Tuple[str, Tuple[float, float]]:
|
|
|
59
57
|
|
|
60
58
|
|
|
61
59
|
def parse_default(s: str) -> Tuple[str, float]:
|
|
62
|
-
# "name=value" → (name, value)
|
|
63
60
|
if "=" not in s:
|
|
64
61
|
raise argparse.ArgumentTypeError(f"Bad --default format '{s}'. Use name=value")
|
|
65
62
|
name, val = s.split("=", 1)
|
|
@@ -82,7 +79,7 @@ def objective_HRMS(X: np.ndarray, ctx: dict) -> float:
|
|
|
82
79
|
p = _vector_to_full_params(X, ctx["default_params"], ctx["optimize_params"])
|
|
83
80
|
acc = get_acc_HRMS(
|
|
84
81
|
ctx["df_query"], ctx["df_reference"],
|
|
85
|
-
ctx["
|
|
82
|
+
ctx["precursor_ion_mz_tolerance"], ctx["ionization_mode"], ctx["adduct"],
|
|
86
83
|
ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
|
|
87
84
|
ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
|
|
88
85
|
p["window_size_centroiding"], p["window_size_matching"], p["noise_threshold"],
|
|
@@ -103,8 +100,7 @@ def objective_NRMS(X: np.ndarray, ctx: dict) -> float:
|
|
|
103
100
|
ctx["similarity_measure"], ctx["weights"], ctx["spectrum_preprocessing_order"],
|
|
104
101
|
ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
|
|
105
102
|
p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
|
|
106
|
-
ctx["high_quality_reference_library"]
|
|
107
|
-
verbose=False
|
|
103
|
+
ctx["high_quality_reference_library"]
|
|
108
104
|
)
|
|
109
105
|
print(f"\n{ctx['optimize_params']} = {np.array(X)}\naccuracy: {acc*100}%")
|
|
110
106
|
return 1.0 - acc
|
|
@@ -112,15 +108,16 @@ def objective_NRMS(X: np.ndarray, ctx: dict) -> float:
|
|
|
112
108
|
|
|
113
109
|
# ---------- Main CLI ----------
|
|
114
110
|
def main():
|
|
115
|
-
p = argparse.ArgumentParser(
|
|
116
|
-
description="Parameter tuning via Differential Evolution for HRMS/NRMS using pycompound."
|
|
117
|
-
)
|
|
111
|
+
p = argparse.ArgumentParser(description="Parameter tuning via Differential Evolution for HRMS/NRMS using pycompound.")
|
|
118
112
|
p.add_argument("--chromatography_platform", choices=["HRMS", "NRMS"], default="HRMS", help="Chromatography Platform.")
|
|
119
|
-
p.add_argument("--query_data", required=True, help="Path to query
|
|
120
|
-
p.add_argument("--reference_data", required=True, nargs="+", help="Path(s) to reference
|
|
121
|
-
p.add_argument("--
|
|
113
|
+
p.add_argument("--query_data", required=True, help="Path to query TXT (must contain 'id' column).")
|
|
114
|
+
p.add_argument("--reference_data", required=True, nargs="+", help="Path(s) to reference TXT(s) (must contain 'id').")
|
|
115
|
+
p.add_argument("--precursor_ion_mz_tolerance", type=float, default=None, help='Precursor ion m/z tolerance (positive real number; only applicable to HRMS)). Default=None')
|
|
116
|
+
p.add_argument("--ionization_mode", choices=['Positive','Negative',None], default=None, help='Ionization mode (only applicable to HRMS). Options: \'Positive\', \'Negative\', or \'None\'. Default=None')
|
|
117
|
+
p.add_argument("--adduct", choices=['H','NH3','NH4','OH','K','Li','Na',None], default=None, help='Adduct (only applicable to HRMS). Options: \'H\', \'NH3\', \'NH4\', \'OH\', \'Cl\', \'K\', \'Li\', \'Na\'. Default: \'H\'.')
|
|
118
|
+
p.add_argument("--similarity_measure", default="cosine", choices=["cosine", "shannon", "renyi", "tsallis"], help="Similarity measure.")
|
|
122
119
|
p.add_argument("--weights", default="", help="Weights spec; empty means None.")
|
|
123
|
-
p.add_argument("--
|
|
120
|
+
p.add_argument("--spectrum_preprocessing_order", default="CNMWL", help="Spectrum preprocessing order string.")
|
|
124
121
|
p.add_argument("--mz-min", type=float, default=0.0)
|
|
125
122
|
p.add_argument("--mz-max", type=float, default=999_999_999.0)
|
|
126
123
|
p.add_argument("--int-min", type=float, default=0.0)
|
|
@@ -143,19 +140,19 @@ def main():
|
|
|
143
140
|
|
|
144
141
|
qpath = Path(args.query_data)
|
|
145
142
|
if not qpath.exists():
|
|
146
|
-
sys.exit(f"Query
|
|
143
|
+
sys.exit(f"Query TXT not found: {qpath}")
|
|
147
144
|
|
|
148
|
-
df_query = pd.read_csv(qpath)
|
|
145
|
+
df_query = pd.read_csv(qpath,sep='\t')
|
|
149
146
|
if "id" not in df_query.columns:
|
|
150
|
-
sys.exit("Query
|
|
147
|
+
sys.exit("Query TXT must contain an 'id' column.")
|
|
151
148
|
|
|
152
149
|
ref_paths = [Path(pth) for pth in args.reference_data]
|
|
153
150
|
for r in ref_paths:
|
|
154
151
|
if not r.exists():
|
|
155
|
-
sys.exit(f"Reference
|
|
156
|
-
df_reference = pd.concat([pd.read_csv(r) for r in ref_paths], axis=0, ignore_index=True)
|
|
152
|
+
sys.exit(f"Reference TXT not found: {r}")
|
|
153
|
+
df_reference = pd.concat([pd.read_csv(r,sep='\t') for r in ref_paths], axis=0, ignore_index=True)
|
|
157
154
|
if "id" not in df_reference.columns:
|
|
158
|
-
sys.exit("Reference
|
|
155
|
+
sys.exit("Reference TXT must contain an 'id' column.")
|
|
159
156
|
|
|
160
157
|
uq = df_query["id"].unique().tolist()
|
|
161
158
|
ur = df_reference["id"].unique().tolist()
|
|
@@ -177,11 +174,13 @@ def main():
|
|
|
177
174
|
ctx = dict(
|
|
178
175
|
df_query=df_query,
|
|
179
176
|
df_reference=df_reference,
|
|
180
|
-
|
|
181
|
-
|
|
177
|
+
precursor_ion_mz_tolerance=args.precursor_ion_mz_tolerance,
|
|
178
|
+
ionization_mode=args.ionization_mode,
|
|
179
|
+
uq=uq, ur=ur,
|
|
180
|
+
adduct=args.adduct,
|
|
182
181
|
similarity_measure=args.similarity_measure,
|
|
183
182
|
weights=(None if args.weights.strip() == "" else args.weights),
|
|
184
|
-
spectrum_preprocessing_order=args.
|
|
183
|
+
spectrum_preprocessing_order=args.spectrum_preprocessing_order,
|
|
185
184
|
mz_min=float(args.mz_min),
|
|
186
185
|
mz_max=float(args.mz_max),
|
|
187
186
|
int_min=float(args.int_min),
|
|
@@ -211,6 +210,7 @@ def main():
|
|
|
211
210
|
seed=int(args.seed),
|
|
212
211
|
workers=int(args.workers),
|
|
213
212
|
callback=_cb,
|
|
213
|
+
updating='deferred' if int(args.workers)!=1 else 'immediate'
|
|
214
214
|
)
|
|
215
215
|
|
|
216
216
|
best_params = _vector_to_full_params(result.x, default_params, args.opt)
|
pycompound/tuning_CLI_grid.py
CHANGED
|
@@ -10,11 +10,14 @@ parser = argparse.ArgumentParser()
|
|
|
10
10
|
|
|
11
11
|
parser.add_argument('--query_data', type=str, metavar='\b', help='CSV file of query mass spectrum/spectra to be identified. Each row should correspond to a mass spectrum, the left-most column should contain an identifier, and each of the other columns should correspond to a single mass/charge ratio. Mandatory argument.')
|
|
12
12
|
parser.add_argument('--reference_data', type=str, metavar='\b', help='CSV file of the reference mass spectra. Each row should correspond to a mass spectrum, the left-most column should contain in identifier (i.e. the CAS registry number or the compound name), and the remaining column should correspond to a single mass/charge ratio. Mandatory argument.')
|
|
13
|
+
parser.add_argument('--precursor_ion_mz_tolerance', type=str, metavar='\b', default=None, help='Precursor ion m/z tolerance (positive real number; only applicable to HRMS)). Default=None')
|
|
14
|
+
parser.add_argument('--ionization_mode', type=str, metavar='\b', default=None, help='Ionization mode (only applicable to HRMS). Options: \'Positive\', \'Negative\', or \'N/A\'.')
|
|
15
|
+
parser.add_argument('--adduct', type=str, metavar='\b', default='H', help='Adduct (only applicable to HRMS). Options: \'H\', \'NH3\', \'NH4\', \'OH\', \'Cl\', \'K\', \'Li\', \'Na\'. Default: \'H\'.')
|
|
13
16
|
parser.add_argument('--likely_reference_ids', type=str, metavar='\b', help='CSV file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: none (i.e. default is to use entire reference library)')
|
|
14
17
|
parser.add_argument('--similarity_measure', type=str, default='cosine', metavar='\b', help='Similarity measure: options are cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger. Default: cosine.')
|
|
15
18
|
parser.add_argument('--weights', type=json.loads, default={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, metavar='\b', help='dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.')
|
|
16
19
|
parser.add_argument('--chromatography_platform', type=str, metavar='\b', help='Chromatography platform: options are \'HRMS\' and \'NRMS\'. Mandatory argument.')
|
|
17
|
-
parser.add_argument('--spectrum_preprocessing_order', type=str, metavar='\b', help='The LC-MS/MS spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of LC-MS/MS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL for HRMS, FNLW for NRMS')
|
|
20
|
+
parser.add_argument('--spectrum_preprocessing_order', type=str, metavar='\b', default=None, help='The LC-MS/MS spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy trannsformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of LC-MS/MS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Default: FCNMWL for HRMS, FNLW for NRMS')
|
|
18
21
|
parser.add_argument('--high_quality_reference_library', type=str, default='False', metavar='\b', help='True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False')
|
|
19
22
|
parser.add_argument('--mz_min', type=str, default='0', metavar='\b', help='Remove all peaks with mass/charge less than mz_min in each spectrum. Default: 0')
|
|
20
23
|
parser.add_argument('--mz_max', type=str, default='999999999999', metavar='\b', help='Remove all peaks with mass/charge greater than mz_max in each spectrum. Default: 999999999999')
|
|
@@ -31,9 +34,9 @@ parser.add_argument('--output_path', type=str, default=f'{Path.cwd()}/output_tun
|
|
|
31
34
|
|
|
32
35
|
args = parser.parse_args()
|
|
33
36
|
|
|
34
|
-
if args.chromatography_platform == 'HRMS':
|
|
37
|
+
if args.chromatography_platform == 'HRMS' and args.spectrum_preprocessing_order == None:
|
|
35
38
|
spectrum_preprocessing_order = 'FCNMWL'
|
|
36
|
-
elif args.chromatography_platform == 'NRMS':
|
|
39
|
+
elif args.chromatography_platform == 'NRMS' and args.spectrum_preprocessing_order == None:
|
|
37
40
|
spectrum_preprocessing_order = 'FNLW'
|
|
38
41
|
else:
|
|
39
42
|
print('Error: chromatography_platform must be either \'HRMS\' or \'NRMS\'')
|
|
@@ -59,11 +62,24 @@ grid['wf_int'] = [float(x) for x in grid['wf_int']]
|
|
|
59
62
|
grid['LET_threshold'] = [float(x) for x in grid['LET_threshold']]
|
|
60
63
|
grid['entropy_dimension'] = [float(x) for x in grid['entropy_dimension']]
|
|
61
64
|
|
|
65
|
+
if args.precursor_ion_mz_tolerance == None:
|
|
66
|
+
precursor_ion_mz_tolerance_tmp = None
|
|
67
|
+
else:
|
|
68
|
+
precursor_ion_mz_tolerance_tmp = float(args.precursor_ion_mz_tolerance)
|
|
69
|
+
|
|
62
70
|
|
|
63
71
|
if args.chromatography_platform == 'HRMS':
|
|
64
|
-
tune_params_on_HRMS_data_grid(query_data=args.query_data,
|
|
72
|
+
tune_params_on_HRMS_data_grid(query_data=args.query_data,
|
|
73
|
+
reference_data=args.reference_data,
|
|
74
|
+
precursor_ion_mz_tolerance=precursor_ion_mz_tolerance_tmp,
|
|
75
|
+
ionization_mode=args.ionization_mode,
|
|
76
|
+
adduct=args.adduct,
|
|
77
|
+
grid=grid,
|
|
78
|
+
output_path=args.output_path)
|
|
65
79
|
|
|
66
80
|
if args.chromatography_platform == 'NRMS':
|
|
67
|
-
tune_params_on_NRMS_data_grid(query_data=args.query_data,
|
|
68
|
-
|
|
81
|
+
tune_params_on_NRMS_data_grid(query_data=args.query_data,
|
|
82
|
+
reference_data=args.reference_data,
|
|
83
|
+
grid=grid,
|
|
84
|
+
output_path=args.output_path)
|
|
69
85
|
|