pycompound 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,7 +14,7 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
14
14
  else:
15
15
  extension = query_data.rsplit('.',1)
16
16
  extension = extension[(len(extension)-1)]
17
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
17
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
18
18
  output_path_tmp = query_data[:-3] + 'txt'
19
19
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=True)
20
20
  df_query = pd.read_csv(output_path_tmp, sep='\t')
@@ -29,7 +29,7 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
29
29
  else:
30
30
  extension = reference_data.rsplit('.',1)
31
31
  extension = extension[(len(extension)-1)]
32
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
32
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
33
33
  output_path_tmp = reference_data[:-3] + 'txt'
34
34
  build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
35
35
  df_reference = pd.read_csv(output_path_tmp, sep='\t')
@@ -298,7 +298,7 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
298
298
  else:
299
299
  extension = query_data.rsplit('.',1)
300
300
  extension = extension[(len(extension)-1)]
301
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
301
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
302
302
  output_path_tmp = query_data[:-3] + 'txt'
303
303
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
304
304
  df_query = pd.read_csv(output_path_tmp, sep='\t')
@@ -312,7 +312,7 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
312
312
  else:
313
313
  extension = reference_data.rsplit('.',1)
314
314
  extension = extension[(len(extension)-1)]
315
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
315
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
316
316
  output_path_tmp = reference_data[:-3] + 'txt'
317
317
  build_library_from_raw_data(input_path=reference_data, output_path=output_path_tmp, is_reference=True)
318
318
  df_reference = pd.read_csv(output_path_tmp, sep='\t')
@@ -395,8 +395,8 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
395
395
  print(f'Warning: plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.')
396
396
  output_path = f'{Path.cwd()}/spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}.pdf'
397
397
 
398
- min_mz = np.min([np.min(df_query['mz_ratio'].tolist()), np.min(df_reference['mz_ratio'].tolist())])
399
- max_mz = np.max([np.max(df_query['mz_ratio'].tolist()), np.max(df_reference['mz_ratio'].tolist())])
398
+ min_mz = int(np.min([np.min(df_query['mz_ratio'].tolist()), np.min(df_reference['mz_ratio'].tolist())]))
399
+ max_mz = int(np.max([np.max(df_query['mz_ratio'].tolist()), np.max(df_reference['mz_ratio'].tolist())]))
400
400
  mzs = np.linspace(min_mz,max_mz,(max_mz-min_mz+1))
401
401
 
402
402
  unique_query_ids = df_query['id'].unique().tolist()
@@ -31,7 +31,8 @@ def objective_function_HRMS(X, ctx):
31
31
  p["wf_mz"], p["wf_int"], p["LET_threshold"],
32
32
  p["entropy_dimension"],
33
33
  ctx["high_quality_reference_library"],
34
- verbose=False
34
+ verbose=False,
35
+ exact_match_required=ctx["exact_match_required"]
35
36
  )
36
37
  print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
37
38
  return 1.0 - acc
@@ -45,7 +46,8 @@ def objective_function_NRMS(X, ctx):
45
46
  ctx["mz_min"], ctx["mz_max"], ctx["int_min"], ctx["int_max"],
46
47
  p["noise_threshold"], p["wf_mz"], p["wf_int"], p["LET_threshold"], p["entropy_dimension"],
47
48
  ctx["high_quality_reference_library"],
48
- verbose=False
49
+ verbose=False,
50
+ exact_match_required=ctx["exact_match_required"]
49
51
  )
50
52
  print(f"\nparams({ctx['optimize_params']}) = {np.array(X)}\naccuracy: {acc*100}%")
51
53
  return 1.0 - acc
@@ -53,7 +55,7 @@ def objective_function_NRMS(X, ctx):
53
55
 
54
56
 
55
57
 
56
- def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1):
58
+ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform='HRMS', precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, similarity_measure='cosine', weights=None, spectrum_preprocessing_order='CNMWL', mz_min=0, mz_max=999999999, int_min=0, int_max=999999999, high_quality_reference_library=False, optimize_params=["window_size_centroiding","window_size_matching","noise_threshold","wf_mz","wf_int","LET_threshold","entropy_dimension"], param_bounds={"window_size_centroiding":(0.0,0.5),"window_size_matching":(0.0,0.5),"noise_threshold":(0.0,0.25),"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0.0,5.0),"entropy_dimension":(1.0,3.0)}, default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1}, maxiters=3, de_workers=1, exact_match_required=False):
57
59
 
58
60
  if query_data is None:
59
61
  print('\nError: No argument passed to the mandatory query_data. Please pass the path to the TXT file of the query data.')
@@ -63,7 +65,7 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
63
65
  extension = extension[(len(extension)-1)]
64
66
  if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
65
67
  output_path_tmp = query_data[:-3] + 'txt'
66
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
68
+ build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=True)
67
69
  df_query = pd.read_csv(output_path_tmp, sep='\t')
68
70
  if extension == 'txt' or extension == 'TXT':
69
71
  df_query = pd.read_csv(query_data, sep='\t')
@@ -106,6 +108,7 @@ def tune_params_DE(query_data=None, reference_data=None, chromatography_platform
106
108
  high_quality_reference_library=high_quality_reference_library,
107
109
  default_params=default_params,
108
110
  optimize_params=optimize_params,
111
+ exact_match_required=exact_match_required
109
112
  )
110
113
 
111
114
  bounds = [param_bounds[p] for p in optimize_params]
@@ -136,14 +139,7 @@ default_HRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'
136
139
  default_NRMS_grid = {'similarity_measure':['cosine'], 'weight':[{'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]}
137
140
 
138
141
 
139
- def _eval_one_HRMS(df_query, df_reference,
140
- precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp,
141
- similarity_measure_tmp, weight,
142
- spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
143
- int_min_tmp, int_max_tmp, noise_threshold_tmp,
144
- window_size_centroiding_tmp, window_size_matching_tmp,
145
- wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
146
- entropy_dimension_tmp, high_quality_reference_library_tmp):
142
+ def _eval_one_HRMS(df_query, df_reference, precursor_ion_mz_tolerance_tmp, ionization_mode_tmp, adduct_tmp, similarity_measure_tmp, weight, spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp, noise_threshold_tmp, window_size_centroiding_tmp, window_size_matching_tmp, wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp, high_quality_reference_library_tmp, exact_match_required_tmp):
147
143
 
148
144
  acc = get_acc_HRMS(
149
145
  df_query=df_query, df_reference=df_reference,
@@ -160,7 +156,8 @@ def _eval_one_HRMS(df_query, df_reference,
160
156
  LET_threshold=LET_threshold_tmp,
161
157
  entropy_dimension=entropy_dimension_tmp,
162
158
  high_quality_reference_library=high_quality_reference_library_tmp,
163
- verbose=False
159
+ verbose=False,
160
+ exact_match_required=exact_match_required_tmp
164
161
  )
165
162
 
166
163
  return (
@@ -172,12 +169,7 @@ def _eval_one_HRMS(df_query, df_reference,
172
169
  )
173
170
 
174
171
 
175
- def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
176
- similarity_measure_tmp, weight,
177
- spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp,
178
- int_min_tmp, int_max_tmp, noise_threshold_tmp,
179
- wf_mz_tmp, wf_int_tmp, LET_threshold_tmp,
180
- entropy_dimension_tmp, high_quality_reference_library_tmp):
172
+ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure_tmp, weight, spectrum_preprocessing_order_tmp, mz_min_tmp, mz_max_tmp, int_min_tmp, int_max_tmp, noise_threshold_tmp, wf_mz_tmp, wf_int_tmp, LET_threshold_tmp, entropy_dimension_tmp, high_quality_reference_library_tmp, exact_match_required_tmp):
181
173
 
182
174
  acc = get_acc_NRMS(
183
175
  df_query=df_query, df_reference=df_reference,
@@ -191,7 +183,8 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
191
183
  LET_threshold=LET_threshold_tmp,
192
184
  entropy_dimension=entropy_dimension_tmp,
193
185
  high_quality_reference_library=high_quality_reference_library_tmp,
194
- verbose=False
186
+ verbose=False,
187
+ exact_match_required=exact_match_required_tmp
195
188
  )
196
189
 
197
190
  return (
@@ -202,7 +195,7 @@ def _eval_one_NRMS(df_query, df_reference, unique_query_ids, unique_reference_id
202
195
 
203
196
 
204
197
 
205
- def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False):
198
+ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, precursor_ion_mz_tolerance=None, ionization_mode=None, adduct=None, grid=None, output_path=None, return_output=False, exact_match_required=False):
206
199
  grid = {**default_HRMS_grid, **(grid or {})}
207
200
  for key, value in grid.items():
208
201
  globals()[key] = value
@@ -251,7 +244,9 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, precurso
251
244
 
252
245
  param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold,
253
246
  window_size_centroiding, window_size_matching, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
254
- results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params) for params in param_grid)
247
+ #results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, (*params for params in param_grid), exact_match_required))
248
+ results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_HRMS)(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, *params, exact_match_required) for params in param_grid)
249
+
255
250
 
256
251
  df_out = pd.DataFrame(results, columns=[
257
252
  'ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX','NOISE.THRESHOLD',
@@ -275,7 +270,7 @@ def tune_params_on_HRMS_data_grid(query_data=None, reference_data=None, precurso
275
270
 
276
271
 
277
272
 
278
- def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False):
273
+ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=None, output_path=None, return_output=False, exact_match_required=False):
279
274
  grid = {**default_NRMS_grid, **(grid or {})}
280
275
  for key, value in grid.items():
281
276
  globals()[key] = value
@@ -318,7 +313,8 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
318
313
 
319
314
  param_grid = product(similarity_measure, weight, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max,
320
315
  noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library)
321
- results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid)
316
+ #results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params) for params in param_grid, exact_match_required)
317
+ results = Parallel(n_jobs=-1, verbose=10)(delayed(_eval_one_NRMS)(df_query, df_reference, unique_query_ids, unique_reference_ids, *params, exact_match_required) for params in param_grid)
322
318
 
323
319
  df_out = pd.DataFrame(results, columns=['ACC','SIMILARITY.MEASURE','WEIGHT','SPECTRUM.PROCESSING.ORDER', 'MZ.MIN','MZ.MAX','INT.MIN','INT.MAX',
324
320
  'NOISE.THRESHOLD','WF.MZ','WF.INT','LET.THRESHOLD','ENTROPY.DIMENSION', 'HIGH.QUALITY.REFERENCE.LIBRARY'])
@@ -339,7 +335,7 @@ def tune_params_on_NRMS_data_grid(query_data=None, reference_data=None, grid=Non
339
335
 
340
336
 
341
337
 
342
- def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
338
+ def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_mode, adduct, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True, exact_match_required=False):
343
339
 
344
340
  n_top_matches_to_save = 1
345
341
  unique_reference_ids = df_reference['id'].dropna().astype(str).unique().tolist()
@@ -445,11 +441,17 @@ def get_acc_HRMS(df_query, df_reference, precursor_ion_mz_tolerance, ionization_
445
441
  df_tmp = pd.DataFrame({'TRUE.ID': df_scores.index.to_list(), 'PREDICTED.ID': top_ids, 'SCORE': top_scores})
446
442
  #if verbose:
447
443
  # print(df_tmp)
448
- acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
444
+ if exact_match_required == True:
445
+ acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
446
+ else:
447
+ true_lower = df_tmp['TRUE.ID'].str.lower()
448
+ pred_lower = df_tmp['PREDICTED.ID'].str.lower()
449
+ matches = [t in p for t, p in zip(true_lower, pred_lower)]
450
+ acc = sum(matches) / len(matches)
449
451
  return acc
450
452
 
451
453
 
452
- def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True):
454
+ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids, similarity_measure, weights, spectrum_preprocessing_order, mz_min, mz_max, int_min, int_max, noise_threshold, wf_mz, wf_int, LET_threshold, entropy_dimension, high_quality_reference_library, verbose=True, exact_match_required=False):
453
455
 
454
456
  n_top_matches_to_save = 1
455
457
 
@@ -532,7 +534,13 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
532
534
  df_tmp = pd.DataFrame(out, columns=['TRUE.ID','PREDICTED.ID','SCORE'])
533
535
  #if verbose:
534
536
  # print(df_tmp)
535
- acc = (df_tmp['TRUE.ID']==df_tmp['PREDICTED.ID']).mean()
537
+ if exact_match_required == True:
538
+ acc = (df_tmp['TRUE.ID'] == df_tmp['PREDICTED.ID']).mean()
539
+ else:
540
+ true_lower = df_tmp['TRUE.ID'].str.lower()
541
+ pred_lower = df_tmp['PREDICTED.ID'].str.lower()
542
+ matches = [t in p for t, p in zip(true_lower, pred_lower)]
543
+ acc = sum(matches) / len(matches)
536
544
  return acc
537
545
 
538
546
 
@@ -797,7 +805,7 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
797
805
  else:
798
806
  extension = query_data.rsplit('.',1)
799
807
  extension = extension[(len(extension)-1)]
800
- if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
808
+ if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF' or extension == 'msp' or extension == 'MSP' or extension == 'json' or extension == 'JSON':
801
809
  output_path_tmp = query_data[:-3] + 'txt'
802
810
  build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
803
811
  df_query = pd.read_csv(output_path_tmp, sep='\t')
@@ -0,0 +1,28 @@
1
+ Metadata-Version: 2.4
2
+ Name: pycompound
3
+ Version: 0.1.10
4
+ Summary: Python package to perform compound identification in mass spectrometry via spectral library matching.
5
+ Author-email: Hunter Dlugas <fy7392@wayne.edu>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/hdlugas/pycompound
8
+ Project-URL: Issues, https://github.com/hdlugas/pycompound/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: matplotlib==3.8.4
15
+ Requires-Dist: numpy==1.26.4
16
+ Requires-Dist: pandas==2.2.2
17
+ Requires-Dist: scipy==1.13.1
18
+ Requires-Dist: pyteomics==4.7.2
19
+ Requires-Dist: netCDF4==1.6.5
20
+ Requires-Dist: lxml>=5.1.0
21
+ Requires-Dist: orjson==3.11.0
22
+ Requires-Dist: shiny==1.4.0
23
+ Requires-Dist: joblib==1.5.2
24
+ Dynamic: license-file
25
+
26
+ # PyCompound
27
+
28
+ A Python-based tool for spectral library matching, PyCompound is available as a Python package (pycompound) with a command-line interface (CLI) available and as a GUI application built with Python/Shiny. It performs spectral library matching to identify chemical compounds, offering a range of spectrum preprocessing transformations and similarity measures, including Cosine, three entropy-based similarity measures, and a plethora of binary similarity measures. PyCompound also includes functionality to tune parameters commonly used in a compound identification workflow given a query library of spectra with known ID. PyCompound supports both high-resolution mass spectrometry (HRMS) data (e.g., LC-MS/MS) and nominal-resolution mass spectrometry (NRMS) data (e.g., GC-MS). For the full documentation, see the GitHub repository https://github.com/hdlugas/pycompound.
@@ -1,14 +1,14 @@
1
1
  pycompound/build_library.py,sha256=4S8hT8FSrS_13daCdsva5UCEU-1qy9pD7kaaG-vaxvE,6815
2
- pycompound/plot_spectra.py,sha256=Y_onxNnw36LU5JnY_Frywgh6KLEawlfEjy4vfc67MII,32151
2
+ pycompound/plot_spectra.py,sha256=XW_UHgmPCdjsTD5oiNrszuW5GHyfBslcpXM9_bYg66I,32521
3
3
  pycompound/plot_spectra_CLI.py,sha256=ObaLad5Z5DmfQB-j0HSCg1mLORbYj2BM3hb5Yd0ZdDI,8395
4
4
  pycompound/processing.py,sha256=NsLI994MRlDq7M13LE-1RkfAfgVjHLrLLPbu2SvArKg,10684
5
5
  pycompound/similarity_measures.py,sha256=NbeVIy9DE_KWlDMXXylekjKuYVrtzbeEXbTutKFxmfU,10460
6
- pycompound/spec_lib_matching.py,sha256=WWos0UEBKTOwhNEMH6Ftb0n1Wy8Ec1BC8UddOxBzArQ,53282
6
+ pycompound/spec_lib_matching.py,sha256=y-4sWt7fphzFKXeURWgosLj8cNIBrcfUOaiCToELvgQ,54646
7
7
  pycompound/spec_lib_matching_CLI.py,sha256=L1D1j3MDdIe7th5n47z4uyvR5tL_8lN_22kbc-J7CF8,12053
8
8
  pycompound/tuning_CLI_DE.py,sha256=VRxoPLvuvE1gTRMC_lrOOK8TjINinMV_f4q69uDK2oE,9916
9
9
  pycompound/tuning_CLI_grid.py,sha256=lavROwKfJSi7xLaUX0zEaphlq7sJ-1FVY3hY3tWwoV4,9735
10
- pycompound-0.1.8.dist-info/licenses/LICENSE,sha256=fPFFlkSGg60VQWyWqTSv8yoJnpLzppzdihVWY5NKom8,1064
11
- pycompound-0.1.8.dist-info/METADATA,sha256=FUVY4wc3167HIT0f8gxL2DKu1aksJwVG9Wifv13TekU,40942
12
- pycompound-0.1.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- pycompound-0.1.8.dist-info/top_level.txt,sha256=x7XjCCdlf335pIEf2TkiFHDrpbSaogk0zg1gPzKe-ic,11
14
- pycompound-0.1.8.dist-info/RECORD,,
10
+ pycompound-0.1.10.dist-info/licenses/LICENSE,sha256=fPFFlkSGg60VQWyWqTSv8yoJnpLzppzdihVWY5NKom8,1064
11
+ pycompound-0.1.10.dist-info/METADATA,sha256=v1A52MHLSpf1Ty2sMxx-FwdPY7pFIxFvYFWNb86Od4s,1747
12
+ pycompound-0.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
+ pycompound-0.1.10.dist-info/top_level.txt,sha256=x7XjCCdlf335pIEf2TkiFHDrpbSaogk0zg1gPzKe-ic,11
14
+ pycompound-0.1.10.dist-info/RECORD,,
@@ -1,824 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: pycompound
3
- Version: 0.1.8
4
- Summary: Python package to perform compound identification in mass spectrometry via spectral library matching.
5
- Author-email: Hunter Dlugas <fy7392@wayne.edu>
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/hdlugas/pycompound
8
- Project-URL: Issues, https://github.com/hdlugas/pycompound/issues
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.9
12
- Description-Content-Type: text/markdown
13
- License-File: LICENSE
14
- Requires-Dist: matplotlib==3.8.4
15
- Requires-Dist: numpy==1.26.4
16
- Requires-Dist: pandas==2.2.2
17
- Requires-Dist: scipy==1.13.1
18
- Requires-Dist: pyteomics==4.7.2
19
- Requires-Dist: netCDF4==1.6.5
20
- Requires-Dist: lxml>=5.1.0
21
- Requires-Dist: orjson==3.11.0
22
- Requires-Dist: shiny==1.4.0
23
- Requires-Dist: joblib==1.5.2
24
- Dynamic: license-file
25
-
26
- # PyCompound
27
- A Python-based tool for spectral library matching, PyCompound is available as a Python package with a command-line interface (CLI) available and as a GUI application built with Python/Shiny. It performs spectral library matching to identify chemical compounds, offering a range of spectrum preprocessing transformations and similarity measures, including Cosine and three entropy-based similarity measures. PyCompound supports both high-resolution mass spectrometry (HRMS) data (e.g., LC-MS/MS) and nominal-resolution mass spectrometry (NRMS) data (e.g., GC-MS).
28
-
29
- ## Table of Contents
30
- - [1. Install dependencies](#create-conda-env)
31
- - [2. Functionality](#functionality)
32
- - [2.1 Spectrum Preprocessing Transformations](#spec-preprocessing-transformations)
33
- - [2.2 Similarity Measures](#similarity-measures)
34
- - [3. Usage](#usage)
35
- - [3.1 Parameter descriptions](#param_descriptions)
36
- - [3.2 Obtain LC-MS/MS or GC-MS library from MGF, mzML, or cdf file](#process-data)
37
- - [3.3 Run spectral library matching](#run-spec-lib-matching)
38
- - [3.4 Tune parameters](#tuning)
39
- - [3.5 Plot a query spectrum against a reference spectrum before and after spectrum preprocessing transformations](#plotting)
40
- - [3.6 Shiny application](#shiny)
41
- - [4. Bugs/Questions?](#bugs-questions)
42
-
43
- <a name="create-conda-env"></a>
44
- ## 1. Install dependencies
45
- PyCompound requires the Python dependencies Matplotlib, NumPy, Pandas, SciPy, Pyteomics, and netCDF4. Specifically, this software was validated with python=3.12.4, matplotlib=3.8.4, numpy=1.26.4, pandas=2.2.2, scipy=1.13.1, pyteomics=4.7.2, netCDF4=1.6.5, lxml=5.1.0, joblib=1.5.2, and shiny=1.4.0, although it may work with other versions of these tools. A user may consider creating a conda environment (see [https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) for guidance on getting started with conda if you are unfamiliar). For a system with conda installed, one can create the environment pycompound_env, activate it, and install the necessary dependencies with:
46
- ```
47
- conda create -n pycompound_env python=3.12
48
- conda activate pycompound_env
49
- pip install pycompound==0.1.7
50
- ```
51
-
52
- <a name="functionality"></a>
53
- ## 2. Functionality
54
-
55
- <a name="spec-preprocessing-transformations"></a>
56
- ## 2.1 Spectrum Preprocessing Transformations
57
- The following spectrum preprocessing transformations are offered:
58
-
59
- - Filtering: Given user-defined parameters (mz_min,mz_max),
60
- (int_min,int_max) and spectrum $I$ with m/z values
61
- $(m_{1},m_{2},...,m_{n})$ and intensities $(x_{1},x_{2},...,x_{n})$,
62
- the transformed spectrum $I^{\star}$ consists of the peaks
63
- $(m_{i},x_{i})$ in $I$ such that mz_min $\leq m_{i}\leq$ mz_max and
64
- int_min $\leq x_{i}\leq$ int_max.
65
-
66
- - Weight Factor Transformation: Given a pair of user-defined weight
67
- factor parameters $(\text{a,b})$ and spectrum $I$ with m/z values
68
- $(m_{1},m_{2},...,m_{n})$ and intensities $(x_{1},x_{2},...,x_{n})$,
69
- the transformed spectrum $I^{\star}$ has the same m/z values as $I$
70
- and has intensities given by
71
- $I^{\star}:=(m_{1}^{\text{a}}\cdot x_{1}^{\text{b}},m_{2}^{\text{a}}\cdot x_{2}^{\text{b}},...,m_{n}^{\text{a}}\cdot x_{n}^{\text{b}})$.
72
-
73
- - Low-Entropy Transformation: Given a user-defined low-entropy
74
- threshold parameter $T$ and spectrum $I$ with intensities
75
- $(x_{1},x_{2},...,x_{n})$, $\sum_{i=1}^nx_i = 1$, and Shannon
76
- entropy $H_{Shannon}(I)=-\sum_{i=1}^{n}x_{i}\cdot ln(x_{i})$, the
77
- transformed spectrum intensities
78
- $I^{\star}=(x_{1}^{\star},x_{2}^{\star},...,x_{n}^{\star})$ are such
79
- that, for all $i\in\{1,2,...,n\}$, $x_{i}^{\star}=x_{i}$ if
80
- $H_{Shannon}(I)\geq T$ and
81
- $x_{i}^{\star}=x_{i}^{\frac{1+H_{Shannon}(I)}{1+T}}$ if
82
- $H_{Shannon}(I)<T$.
83
-
84
- - Centroiding (only applicable to HRMS data): Given a user-defined
85
- window-size parameter $w_{centroiding}$ and a spectrum $I$ with m/z
86
- values $(m_{1},m_{2},...,m_{n})$ and intensities
87
- $(x_{1},x_{2},...,x_{n})$, the transformed spectrum $I^{\star}$
88
- merges adjacent peaks $(m_{i},x_{i}),(m_{i+1},x_{i+1})$ into the
89
- peak
90
- $(\frac{m_{i}\cdot x_{i}+m_{i+1}\cdot x_{i+1}}{x_{i}+x_{i+1}},x_{i}+x_{i+1})$
91
- if $|m_{i}-m_{i+1}|< w_{centroiding}$ for
92
- $i\in\{1,2,...,n-1\}$. This centroiding procedure generalizes to
93
- more than two peaks whose m/z values are within a distance
94
- $w_{centroiding}$ of each other.
95
-
96
- - Noise Removal: Given a user-defined noise removal parameter $r$ and
97
- a spectrum $I$ with intensities $(x_{1},x_{2},...,x_{n})$, noise
98
- removal removes peaks from $I$ with
99
- $x_{j}< r\cdot\text{max}(\{x_{1},x_{2},...,x_{n}\})$ for
100
- $j\in\{1,2,...,n\}$.
101
-
102
- - Matching (only applicable to HRMS data): Given a user-defined
103
- window-size parameter $w_{matching}$ and two spectra $I$, $J$ with
104
- m/z ratios $(a_{1},a_{2},...,a_{n}), (b_{1},b_{2},...,b_{m})$ and
105
- intensities $(x_{1},x_{2},...,x_{n}), (y_{1},y_{2},...,y_{m})$,
106
- respectively, of which we would like to measure the similarity
107
- between, the matching procedure outputs two spectra
108
- $I^{\star},J^{\star}$ containing the same number of peaks with
109
- $I^{\star}$ and $J^{\star}$ having intensities and
110
- identical m/z ratios. Specifically, for a given peak $(a_{i},x_{i})$
111
- of $I$, if there are no peaks $(b_{j},y_{j})$ in $J$ with
112
- $|a_{i}-b_{j}|< w_{matching}$, then the peak $(a_{i},x_{i})$
113
- remains in $I^{\star}$ and the peak $(a_{i},0)$ is included in
114
- $J^{\star}$. If there is at least one peak $(b_{j},y_{j})$ with
115
- $|a_{i}-b_{j}|< w_{matching}$, then the peak $(a_{i},x_{i})$
116
- remains in $I^{\star}$ and the peak
117
- $(a_{i},\sum_{j\text{ such that }|a_{i}-b_{j}|< w_{matching}}b_{j})$
118
- is included in $J^{\star}$. This procedure is applied when
119
- transposing the roles of $I$ and $J$ as well.
120
-
121
- <a name="similarity-measures"></a>
122
- ## 2.2 Similarity Measures
123
- Given a pair of processed spectra intensities
124
- $I=(a_{1},a_{2},...,a_{n}), J=(b_{1},b_{2},...,b_{n})\in\mathbb{R}^{n}$
125
- with $0\leq a_{i},b_{i}\leq 1$ for all $i\in\{1,2,...,n\}$ and
126
- $\sum_{i=1}^{n}a_{i}=\sum_{i=1}^{n}b_{i}=1$, PyCompound provides
127
- functionality for computing the following similarity measures:
128
-
129
- - Cosine Similarity Measure:
130
-
131
- ```math
132
- S_{Cosine}(I,J)=\frac{I\circ J}{|I|_{2}\cdot |J|_{2}}
133
- ```
134
- where multiplication in the numerator refers to the dot product $I\circ J=a_{1}b_{1}+a_{2}b_{2}+...+a_{n}b_{n}$ of $I$ and $J$ and multiplication in the denominator refers to multiplication of the $L^{2}$-norms of $I$ and $J$, $\vert I\vert_{2}=\sqrt{a_{1}^{2}+a_{2}^{2}+...+a_{n}^{2}}, \vert J\vert_{2}=\sqrt{b_{1}^{2}+b_{2}^{2}+...+b_{n}^{2}}$.
135
-
136
- - Shannon Entropy Similarity Measure:
137
-
138
- ```math
139
- S_{Shannon}(I,J) = 1-\frac{2\cdot H_{Shannon}\left(\frac{I+J}{2}\right) - H_{Shannon}(I)-H_{Shannon}(J)}{ln(4)},
140
- ```
141
-
142
- ```math
143
- H_{Shannon}(I)=-\sum_{i=1}^{n}a_{i}\cdot ln(a_{i})
144
- ```
145
-
146
- - Tsallis Entropy Similarity Measure:
147
-
148
- ```math
149
- S_{Tsallis}(I,J,q)=1-\frac{2\times H_{Tsallis}(I/2+J/2,q)-H_{Tsallis}(I,q)-H_{Tsallis}(J,q)}{N_{Tsallis}(I,J,q)},
150
- ```
151
-
152
- ```math
153
- N_{Tsallis}(I,J,q):=\frac{\sum_{i=1}^{n}\left(2\left(\frac{a_{i}}{2}\right)^{q}+2\left(\frac{b_{i}}{2}\right)^{q}-a_{i}^{q}-b_{i}^{q}\right)}{1-q},
154
- ```
155
-
156
- ```math
157
- H_{Tsallis}(I,q)=\frac{\left(\sum_{i=1}^{n}a_{i}^{q}\right)-1}{1-q},
158
- ```
159
-
160
- ```math
161
- q\neq 1, \ q>0
162
- ```
163
-
164
- - Rényi Entropy Similarity Measure:
165
-
166
- ```math
167
- S_{Renyi}(I,J,q)=1-\frac{2\times H_{Renyi}(I/2+J/2,q)-H_{Renyi}(I,q)-H_{Renyi}(J,q)}{N_{Renyi}(I,J,q)},
168
- ```
169
-
170
- ```math
171
- N_{Renyi}(I,J,q):=\left(\frac{1}{1-q}\right)\left(2\times ln\left(\sum_{i}(a_{i}/2)^{q}+\sum_{j}(b_{j}/2)^{q}\right)-ln(\sum_{i}a_{i}^{q})-ln(\sum_{i}b_{i}^{q})\right),
172
- ```
173
-
174
- ```math
175
- H_{Renyi}(I,q)=\frac{1}{1-q}ln(\sum_{i=1}^{n}a_{i}^{q}),
176
- ```
177
-
178
- ```math
179
- q\neq 1, \ q>0
180
- ```
181
-
182
- Additionally, the plethora of binary similarity measures considered in https://doi.org/10.3390/metabo12080694 are available along with a mixture similarity measure that is a weighted sum of the four non-binary similarity measures (i.e. Cosine, Shannon Entropy, Renyi, and Tsallis).
183
-
184
- <a name="usage"></a>
185
- ## 3. Usage
186
- PyCompound has three main capabilities:
187
- 1. Plotting a query spectrum vs. a reference spectrum before and after preprocessing transformations.
188
- 2. Running spectral library matching to identify compounds based on their mass spectrometry data
189
- 3. Tuning parameters to maximize accuracy given a query dataset with known compound IDs (e.g. from targeted metabolomics experiments).
190
-
191
- These tasks are implemented separately for the cases of (i) NRMS and (ii) HRMS data due to the different spectrum preprocessing transformations stemming from a different format in the mass to charge (m/z) ratios in NRMS vs HRMS data. Example scripts which implement these tasks can be found in the pycompound/tests directory.
192
-
193
- <a name="param_descriptions"></a>
194
- ### 3.1 Parameter descriptions
195
-
196
- For the function build_library_from_raw_data:
197
- ```
198
- --input_path: Path to input file (must be either mgf, mzML, msp, cdf, or json file). Mandatory argument.
199
-
200
- --output_path: Path to output text file. Default: current working directory.
201
-
202
- --is_reference: Boolean flag indicating whether IDs of spectra should be written to output. Only pass True if building a library with known compound IDs. Only applicable to MGF files. Options: \'True\', \'False\'. Optional argument. Default: False.
203
- ```
204
-
205
- Common parameters:
206
- ```
207
- --query_data (mandatory argument):
208
- * HRMS case: mgf, mzML, msp, json, or txt file of query mass spectrum/spectra to be identified. If txt file, must have at least 3 columns with each row corresponding to a single ion fragment of a mass spectrum, one 'id' column containing an identifier, one 'mz_ratio' column corresponding to the mass to charge (m/z) ratios, and one 'intensity' column containing the intensities. For example, if spectrum A has 3 ion fragments, then there would be three rows in this text file corresponding to spectrum A. Optional columns for the text file are 'precursor_ion_mz', 'ionization_mode', and 'adduct'.
209
- * NRMS case: cdf or txt file of query mass spectrum/spectra to be identified. If txt file, same format as in HRMS case is required.
210
-
211
- --reference_data (mandatory argument): Same format text file as query_data except of reference library spectra. We recommend using the reference libraries from our Zenodo database ([https://zenodo.org/records/12786324](https://zenodo.org/records/12786324); stored on Zenodo due to file size limitations on GitHub).
212
-
213
- --precursor_ion_mz_tolerance (only applicable to HRMS): positive float representing a window size around each query spectrum's precursor ion mass:charge ratio in which candidate reference spectra must lie to be considered in compound identification. Default: None.
214
-
215
- --ionization_mode (only applicable to HRMS): Positive, Negative, or None. Default: None.
216
-
217
- --adduct (only applicable to HRMS): Options: H, NH3, NH4, Na, K, N/A. Default: N/A.
218
-
219
- --likely_reference_IDs: text file with one column containing the IDs of a subset of all compounds in the reference_data to be used in spectral library matching. Each ID in this file must be an ID in the reference library. Default: None (i.e. default is to use entire reference library)
220
-
221
- --similarity_measure: cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, hellinger. Default: cosine.
222
-
223
- --weights: dict of weights to give to each non-binary similarity measure (i.e. cosine, shannon, renyi, and tsallis) when the mixture similarity measure is specified. Default: 0.25 for each of the four non-binary similarity measures.
224
-
225
- --spectrum_preprocessing_order: The spectrum preprocessing transformations and the order in which they are to be applied. Note that these transformations are applied prior to computing similarity scores. Format must be a string with 2-6 characters chosen from C, F, M, N, L, W representing centroiding, filtering based on mass/charge and intensity values, matching, noise removal, low-entropy transformation, and weight-factor-transformation, respectively. For example, if \'WCM\' is passed, then each spectrum will undergo a weight factor transformation, then centroiding, and then matching. Note that if an argument is passed, then \'M\' must be contained in the argument, since matching is a required preprocessing step in spectral library matching of HRMS data. Furthermore, \'C\' must be performed before matching since centroiding can change the number of ion fragments in a given spectrum. Note that C and M are not applicable to NRMS data. Default: FCNMWL for HRMS and FNLW for NRMS.
226
-
227
- --high_quality_reference_library: True/False flag indicating whether the reference library is considered to be of high quality. If True, then the spectrum preprocessing transformations of filtering and noise removal are performed only on the query spectrum/spectra. If False, all spectrum preprocessing transformations specified will be applied to both the query and reference spectra. Default: False
228
-
229
- --mz_min: Remove all peaks with mass/charge value less than mz_min in each spectrum. Default: 0
230
-
231
- --mz_max: Remove all peaks with mass/charge value greater than mz_max in each spectrum. Default: 9999999
232
-
233
- --int_min: Remove all peaks with intensity value less than int_min in each spectrum. Default: 0
234
-
235
- --int_max: Remove all peaks with intensity value greater than int_max in each spectrum. Default: 9999999
236
-
237
- --window_size_centroiding (only for HRMS): Window size parameter used in centroiding a given spectrum. Default: 0.5
238
-
239
- --window_size_matching (only for HRMS): Window size parameter used in matching a query spectrum and a reference library spectrum. Default: 0.5
240
-
241
- --noise_threshold: Ion fragments (i.e. points in a given mass spectrum) with intensity less than max(intensities)*noise_threshold are removed. Default: 0.0
242
-
243
- --wf_mz: Mass/charge weight factor parameter. Default: 0.0
244
-
245
- --wf_intensity: Intensity weight factor parameter. Default: 0.0
246
-
247
- --LET_threshold: Low-entropy transformation threshold parameter. Spectra with Shannon entropy less than LET_threshold are transformed according to intensitiesNew=intensitiesOriginal^{(1+S)/(1+LET_threshold)}. Default: 0.0
248
-
249
- --entropy_dimension: Entropy dimension parameter. Must have positive value other than 1. When the entropy dimension is 1, then Renyi and Tsallis entropy are equivalent to Shannon entropy. Therefore, this parameter only applies to the renyi and tsallis similarity measures. This parameter will be ignored if similarity measure cosine or shannon is chosen. Default: 1.1
250
- ```
251
-
252
- Parameters specific to run_spec_lib_matching_on_HRMS_data and run_spec_lib_matching_on_NRMS_data:
253
-
254
- ```
255
- --n_top_matches_to_save: The number of top matches to report. For example, if n_top_matches_to_save=5, then for each query spectrum, the five reference spectra with the largest similarity with the given query spectrum will be reported. Default: 1
256
-
257
- --print_id_results: Flag that prints identification results if True. Default: False
258
-
259
- --output_identification: Output text file containing the most-similar reference spectra for each query spectrum along with the corresponding similarity scores. Default is to save identification output in current working directory with filename 'output_identification.txt'.
260
-
261
- --output_similarity_scores: Output text file containing similarity scores between all query spectrum/spectra and all reference spectra. Each row corresponds to a query spectrum, the left-most column contains the query spectrum/spectra identifier, and the remaining column contain the similarity scores with respect to all reference library spectra. If no argument passed, then this text file is written to the current working directory with filename output_all_similarity_scores.txt.
262
- ```
263
-
264
- Parameters specific to tune_params_on_HRMS_data_grid and tune_params_on_NRMS_data_grid:
265
- ```
266
- --grid: dict object such as {'similarity_measure':['cosine','shannon'], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.5], 'noise_threshold':[0.0,0.1], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]} with all possible combinations of parameters being utilized.
267
-
268
- --output_path: path to output text file containing the accuracies for each possible combination of parameters. If no argument is passed, then the plots will be saved to ./tuning_param_output.txt in the current working directory.
269
- ```
270
-
271
- Parameters specific to tune_params_DE:
272
- ```
273
- -- optimize_params: list of continuous parameters (i.e. window_size_centroiding, window_size_matching, noise_threshold, wf_mz, wf_int, LET_threshold; window_size parameters only applicable to HRMS data) to optimize via differential evolution.
274
-
275
- -- param_bounds: dict with keys being the parameters to optimize and values being a tuple of length 2 of the lower and upper bounds of acceptable parameter values.
276
-
277
- -- maxiters: maximum number of iterations of differential evolution.
278
-
279
- -- de_workers: number of CPUs to utilize.
280
- ```
281
-
282
- Parameters specific to generate_plots_on_HRMS_data and generate_plots_on_NRMS_data:
283
- ```
284
- --spectrum_ID1: ID of one spectrum to be plotted. Default is first spectrum in the query library. Optional argument.
285
-
286
- --spectrum_ID2: ID of another spectrum to be plotted. Default is first spectrum in the reference library. Optional argument.
287
-
288
- --y_axis_transformation: transformation to apply to y-axis (i.e. intensity axis) of plots. Options: 'normalized', 'none', 'log10', and 'sqrt'. Default: 'normalized'.
289
-
290
- --output_path: path to output PDF file containing the plots of the spectra before and after preprocessing transformations. If no argument is passed, then the plots will be saved to the PDF ./spectrum1_{spectrum_ID1}_spectrum2_{spectrum_ID2}_plot.pdf in the current working directory.
291
- ```
292
-
293
-
294
- <a name="process-data"></a>
295
- ### 3.2 Obtain LC-MS/MS or GC-MS library from MGF, mzML, cdf, msp, or json file
296
- To obtain a text file of LC-MS/MS spectra in the format necessary for spectral library matching from raw data in the form of an mgf, mzML, msp, json, or cdf file inside Python, one can run:
297
- ```
298
- from pycompound.build_library import build_library_from_raw_data
299
-
300
- build_library_from_raw_data(input_path='path_to_input_file', output_path='path_to_output_file', is_reference=False)
301
- ```
302
-
303
- Since the other functionality provided by pycompound is capable of being directly run on mgf, mzML, msp, json, and cdf files, you may not need to directly build a library yourself. Some example mgf and json files one can use to build an LC-MS/MS library can be found from the Global Natural Products Social Molecular Networking (GNPS) databases here: [https://external.gnps2.org/gnpslibrary](https://external.gnps2.org/gnpslibrary). Some example mzML files one can use to build an LC-MS/MS library can be found in this repository: [https://github.com/HUPO-PSI/mzML](https://github.com/HUPO-PSI/mzML). Some example MSP files can be found here: [https://mona.fiehnlab.ucdavis.edu/downloads](https://mona.fiehnlab.ucdavis.edu/downloads). The mgf, mzML, msp, and json files provided in this repository are trimmed versions of files found in these referenced repositories. The script tests/test_build_libraries.py demonstrates this usage.
304
-
305
- Full LC-MS/MS and GC-MS reference libraries are available at the Zenodo database ([https://zenodo.org/records/12786324](https://zenodo.org/records/12786324)).
306
-
307
- <a name="run-spec-lib-matching"></a>
308
- ### 3.3 Run spectral library matching
309
- The files tests/test_spec_lib_matching.py, tests/test_spec_lib_matching_CLI, and tests/example_code_for_python_use.py demonstrate how some of the spectrum preprocessing functionality and similarity measures can be implemented either directly in Python or in the CLI wrapper. The two main functions - one for HRMS data and one for NRMS data - can be implemented as shown below inside Python:
310
- ```
311
- from pycompound.spec_lib_matching import run_spec_lib_matching_on_HRMS_data
312
- from pycompound.spec_lib_matching import run_spec_lib_matching_on_NRMS_data
313
-
314
- run_spec_lib_matching_on_HRMS_data(
315
- query_data='path_to_query_library',
316
- reference_data='path_to_reference_library',
317
- likely_reference_IDs=None,
318
- similarity_measure='cosine',
319
- spectrum_preprocessing_order='FCNMWL',
320
- high_quality_reference_library=False,
321
- mz_min=0,
322
- mz_max=9999999,
323
- int_min=0,
324
- int_max=9999999,
325
- window_size_centroiding=0.5,
326
- window_size_matching=0.5,
327
- noise_threshold=0.0,
328
- wf_mz=0.0,
329
- wf_intensity=1.0,
330
- LET_threshold=0.0,
331
- entropy_dimension=1.1,
332
- n_top_matches_to_save=1,
333
- print_id_results=False,
334
- output_identification=None,
335
- output_similarity_scores=None)
336
-
337
- run_spec_lib_matching_on_NRMS_data(
338
- query_data='path_to_query_library',
339
- reference_data='path_to_reference_library',
340
- likely_reference_IDs=None,
341
- similarity_measure='cosine',
342
- spectrum_preprocessing_order='FNLW',
343
- high_quality_reference_library=False,
344
- mz_min=0,
345
- mz_max=9999999,
346
- int_min=0,
347
- int_max=9999999,
348
- noise_threshold=0.0,
349
- wf_mz=0.0,
350
- wf_intensity=1.0,
351
- LET_threshold=0.0,
352
- entropy_dimension=1.1,
353
- n_top_matches_to_save=1,
354
- print_id_results=False,
355
- output_identification=None,
356
- output_similarity_scores=None)
357
- ```
358
-
359
- To use the CLI version, one can run the following from the terminal:
360
- ```
361
- python spec_lib_matching_CLI.py \
362
- --query_data ${PWD}/../tests/data/lcms_query_library.txt \
363
- --reference_data ${PWD}/../tests/data/full_GNPS_reference_library.txt \
364
- --chromatography_platform HRMS \
365
- --likely_reference_IDs None \
366
- --similarity_measure cosine \
367
- --spectrum_preprocessing_order FCNMWL \
368
- --high_quality_reference_library False \
369
- --mz_min 0 \
370
- --mz_max 9999999 \
371
- --int_min 0 \
372
- --int_max 9999999 \
373
- --window_size_centroiding 0.5 \
374
- --window_size_matching 0.5 \
375
- --noise_threshold 0.0 \
376
- --wf_mz 0.0 \
377
- --wf_intensity 1.0 \
378
- --LET_threshold 0.0 \
379
- --entropy_dimension 1.1 \
380
- --n_top_matches_to_save 1 \
381
- --print_id_results False \
382
- --output_identification ${PWD}/../tests/output_identification_HRMS.txt \
383
- --output_similarity_scores ${PWD}/../tests/output_similarity_scores_HRMS.txt
384
-
385
- python spec_lib_matching_CLI.py \
386
- --query_data ${PWD}/../tests/data/lcms_query_library.txt \
387
- --reference_data ${PWD}/../tests/data/full_GNPS_reference_library.txt \
388
- --chromatography_platform NRMS \
389
- --likely_reference_IDs None \
390
- --similarity_measure cosine \
391
- --spectrum_preprocessing_order FCNMWL \
392
- --high_quality_reference_library False \
393
- --mz_min 0 \
394
- --mz_max 9999999 \
395
- --int_min 0 \
396
- --int_max 9999999 \
397
- --noise_threshold 0.0 \
398
- --wf_mz 0.0 \
399
- --wf_intensity 1.0 \
400
- --LET_threshold 0.0 \
401
- --entropy_dimension 1.1 \
402
- --n_top_matches_to_save 1 \
403
- --print_id_results False \
404
- --output_identification ${PWD}/../tests/output_identification_NRMS.txt \
405
- --output_similarity_scores ${PWD}/../tests/output_similarity_scores_NRMS.txt
406
- ```
407
-
408
- For a user who may wish to incorporate our transformations and similarity measures directly in their python code similar to the example script tests/example_code_for_python_use.py, the available transformations and similarity measures are:
409
- ```
410
- # Weight factor transformation
411
- wf_transform(spec_mzs, spec_ints, wf_mz, wf_int)
412
- """
413
- Perform weight factor transformation on a spectrum
414
- Args:
415
- spec_mzs: 1d numpy array representing mass/charge values
416
- spec_ints: 1d numpy array representing intensity values
417
- wf_mz: float
418
- wf_int: float
419
- Returns:
420
- np.ndarray: 1d numpy array of weight-factor-transformed spectrum intensities
421
- """
422
-
423
- # Low-entropy transformation
424
- LE_transform(intensity, thresh, normalization_method)
425
- """
426
- Transforms spectrum's intensities if the Shannon entropy of the intensities is below some threshold
427
- Args:
428
- intensity: 1d numpy array
429
- thresh: nonnegative float
430
- normalization_method: either 'standard' or 'softmax'
431
- Returns:
432
- np.ndarray: 1d numpy array of transformed intensities
433
- """
434
-
435
- # Filter HR-MS such as LC-MS/MS spectrum
436
- filter_spec_lcms(spec, mz_min, mz_max, int_min, int_max, is_matched)
437
- """
438
- Filter an MS/MS spectrum based on m/z and intensity values
439
- Args:
440
- spec: N x 2 numpy array with first column being m/z and second column being intensity
441
- mz_min: minimum m/z value
442
- mz_max: maximum m/z value
443
- int_min: minimum intensity value
444
- int_max: maximum intensity value
445
- is_matched: flag to indicate whether the given spectrum has already been matched to another spectrum
446
- Returns:
447
- np.ndarray: N x 2 numpy array with intensity of 0 put anywhere outside of the m/z and/or intensity bounds
448
- """
449
-
450
- # Filter NR-MS such as GC-MS spectrum
451
- filter_spec_gcms(spec, mz_min, mz_max, int_min, int_max)
452
- """
453
- Filter an MS spectrum based on m/z and intensity values
454
- Args:
455
- spec: N x 2 numpy array with first column being m/z and second column being intensity
456
- mz_min: minimum m/z value
457
- mz_max: maximum m/z value
458
- int_min: minimum intensity value
459
- int_max: maximum intensity value
460
- Returns:
461
- np.ndarray: N x 2 numpy array with intensity of 0 put anywhere outside of the m/z and/or intensity bounds
462
- """
463
-
464
- # Remove low-intensity noise
465
- remove_noise(spec, nr)
466
- """
467
- Remove low-intensity ion fragments
468
- Args:
469
- spec: N x 2 numpy array with first column being m/z and second column being intensity
470
- nr: noise removal parameter; ion fragments with intensity less than max(intensity)*nr have intensity set to 0
471
- Returns:
472
- np.ndarray: N x 2 numpy array
473
- """
474
-
475
- # Centroid spectrum by merging close m/z peaks
476
- centroid_spectrum(spec, window_size)
477
- """
478
- Centroid a spectrum by merging ion fragments that are 'close' with respect to m/z value
479
- Args:
480
- spec: N x 2 numpy array with the first column being mass/charge and the second column being intensity
481
- window_size: window-size parameter
482
- Returns:
483
- np.ndarray: M x 2 numpy array with M <= N due to peaks being merged
484
- """
485
-
486
- # Match peaks between two spectra
487
- match_peaks_in_spectra(spec_a, spec_b, window_size)
488
- """
489
- Align two spectra so that we obtain a list of intensity values from each spectrum of the same length
490
- Args:
491
- spec_a: N x 2 numpy array with the first column being mass/charge and the second column being intensity
492
- spec_b: M x 2 numpy array with the first column being mass/charge and the second column being intensity
493
- window_size: window-size parameter
494
- Returns:
495
- np.ndarray: K x 3 numpy array with first column being mass/charge, second column being matched intensities of spec_a, and third column being matched intensities of spec_b
496
- """
497
-
498
- # Assign 0 to the intensities without m/z values
499
- convert_spec(spec, mzs)
500
- """
501
- Set intensity values to 0 where m/z values are missing
502
- Args:
503
- spec: N x 2 dimensional numpy array
504
- mzs: length M list of entire span of mass/charge values considering both the query and reference libraries
505
- Returns:
506
- np.ndarray: M x 2 dimensional numpy array
507
- """
508
-
509
- # Cosine similarity
510
- S_cos(ints_a, ints_b)
511
- """
512
- Cosine similarity measure
513
- Args:
514
- ints_a: 1d numpy array of intensities of a spectrum
515
- ints_b: 1d numpy array of intensities of a spectrum
516
- Returns:
517
- float: float between 0 and 1 indicating the similarity of the two spectra
518
- """
519
-
520
- # Shannon entropy similarity
521
- S_shannon(ints_a, ints_b)
522
- """
523
- Shannon entropy similarity measure
524
- Args:
525
- ints_a: 1d numpy array of intensities of a spectrum
526
- ints_b: 1d numpy array of intensities of a spectrum
527
- Returns:
528
- float: float between 0 and 1 indicating the similarity of the two spectra
529
- """
530
-
531
- # Renyi entropy similarity
532
- S_renyi(ints_a, ints_b, q)
533
- """
534
- Renyi entropy similarity measure
535
- Args:
536
- ints_a: 1d numpy array of intensities of a spectrum
537
- ints_b: 1d numpy array of intensities of a spectrum
538
- q: positive float representing 'entropy dimension'
539
- Returns:
540
- float: float between 0 and 1 indicating the similarity of the two spectra
541
- """
542
-
543
- # Tsallis entropy similarity
544
- S_tsallis(ints_a, ints_b, q)
545
- """
546
- Tsallis entropy similarity measure
547
- Args:
548
- ints_a: 1d numpy array of intensities of a spectrum
549
- ints_b: 1d numpy array of intensities of a spectrum
550
- q: positive float representing 'entropy dimension'
551
- Returns:
552
- float: float between 0 and 1 indicating the similarity of the two spectra
553
- """
554
- ```
555
-
556
-
557
- <a name="tuning"></a>
558
- ### 3.4 Tune parameters
559
- Note that in order to tune parameters such as noise_threshold, LET_threshold etc., one must have a query library with compounds whose ground truth ID is known (e.g. from targeted metabolomics experiments). PyCompound offers two different methods of tuning parameters: one being an exhaustive grid search of pre-specified values, and the other being an optimization approach using differential evolution to optimize continuous parameters with respect to accuracy. The usage of the functions to tune parameters within Python is:
560
- ```
561
- from pycompound.spec_lib_matching import tune_params_on_HRMS_data_grid
562
- from pycompound.spec_lib_matching import tune_params_on_NRMS_data_grid
563
- from pycompound.spec_lib_matching import tune_params_DE
564
- from pathlib import Path
565
-
566
- tune_params_on_HRMS_data_grid(
567
- query_data=f'{Path.cwd()}/tests/data/lcms_query_library_tuning.txt',
568
- reference_data=f'{Path.cwd()}/tests/data/full_GNPS_reference_library.txt',
569
- precursor_ion_mz_tolerance=0.5,
570
- ionization_mode='Positive',
571
- adduct='H',
572
- grid={'similarity_measure':['cosine'], 'spectrum_preprocessing_order':['FCNMWL'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'window_size_centroiding':[0.5], 'window_size_matching':[0.1,0.5], 'noise_threshold':[0.0], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]},
573
- output_path=f'{Path.cwd()}/tuning_param_output_HRMS.txt'
574
- )
575
-
576
- tune_params_on_NRMS_data_grid(
577
- query_data=f'{Path.cwd()}/tests/data/gcms_query_library_tuning.txt',
578
- reference_data=f'{Path.cwd()}/tests/data/gcms_reference_library.txt',
579
- grid={'similarity_measure':['cosine','shannon'], 'spectrum_preprocessing_order':['FNLW'], 'mz_min':[0], 'mz_max':[9999999], 'int_min':[0], 'int_max':[99999999], 'noise_threshold':[0.0,0.1], 'wf_mz':[0.0], 'wf_int':[1.0], 'LET_threshold':[0.0,3.0], 'entropy_dimension':[1.1], 'high_quality_reference_library':[False]},
580
- output_path=f'{Path.cwd()}/tuning_param_output_NRMS.txt'
581
- )
582
-
583
- tune_params_DE(
584
- query_data=f'{Path.cwd()}/tests/data/lcms_query_library_tuning.txt',
585
- reference_data=f'{Path.cwd()}/tests/data/full_GNPS_reference_library.txt',
586
- precursor_ion_mz_tolerance=0.1,
587
- ionization_mode='Positive',
588
- adduct='H',
589
- chromatography_platform='HRMS',
590
- similarity_measure='shannon',
591
- optimize_params=["wf_mz","wf_int"],
592
- param_bounds={"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0)},
593
- default_params={"window_size_centroiding": 0.5, "window_size_matching":0.5, "noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1},
594
- maxiters=2,
595
- de_workers=-1
596
- )
597
-
598
- tune_params_DE(
599
- query_data=f'{Path.cwd()}/tests/data/gcms_query_library_tuning.txt',
600
- reference_data=f'{Path.cwd()}/tests/data/gcms_reference_library.txt',
601
- chromatography_platform='NRMS',
602
- similarity_measure='renyi',
603
- optimize_params=["wf_mz","wf_int","LET_threshold","entropy_dimension"],
604
- param_bounds={"wf_mz":(0.0,5.0),"wf_int":(0.0,5.0),"LET_threshold":(0,5),"entropy_dimension":(1.01,3)},
605
- default_params={"noise_threshold":0.10, "wf_mz":0.0, "wf_int":1.0, "LET_threshold":0.0, "entropy_dimension":1.1},
606
- de_workers=-1
607
- )
608
- ```
609
-
610
- The CLI version can be run with:
611
- ```
612
- python ../src/tuning_CLI_grid.py \
613
- --query_data ${PWD}/tests/data/lcms_query_library_tuning.txt \
614
- --reference_data ${PWD}/tests/data/full_GNPS_reference_library.txt \
615
- --precursor_ion_mz_tolerance 0.1 \
616
- --ionization_mode Positive \
617
- --adduct H \
618
- --chromatography_platform HRMS \
619
- --similarity_measure cosine \
620
- --spectrum_preprocessing_order FCNMWL \
621
- --high_quality_reference_library False \
622
- --mz_min 0 \
623
- --mz_max 9999999 \
624
- --int_min 0 \
625
- --int_max 9999999 \
626
- --window_size_centroiding 0.5 \
627
- --window_size_matching 0.1,0.5 \
628
- --noise_threshold 0.0 \
629
- --wf_mz 2,3 \
630
- --wf_intensity 1.0 \
631
- --LET_threshold 0.0 \
632
- --entropy_dimension 1.1 \
633
- --output_path ${PWD}/output_tuning_HRMS_grid.txt \
634
-
635
- python ../src/pycompound/tuning_CLI_grid.py \
636
- --query_data ${PWD}/tests/data/gcms_query_library_tuning.txt \
637
- --reference_data ${PWD}/tests/data/gcms_reference_library.txt \
638
- --chromatography_platform NRMS \
639
- --similarity_measure cosine,shannon \
640
- --spectrum_preprocessing_order FCNMWL \
641
- --high_quality_reference_library False \
642
- --mz_min 0 \
643
- --mz_max 9999999 \
644
- --int_min 0 \
645
- --int_max 9999999 \
646
- --noise_threshold 0.0,0.1 \
647
- --wf_mz 0 \
648
- --wf_intensity 1.0 \
649
- --LET_threshold 0.0 \
650
- --entropy_dimension 1.1 \
651
- --output_path ${PWD}/output_tuning_NRMS_grid.txt \
652
-
653
- python ../src/pycompound/tuning_CLI_DE.py \
654
- --chromatography_platform HRMS \
655
- --query_data ${PWD}/data/lcms_query_library_tuning.txt \
656
- --reference_data ${PWD}/data/full_GNPS_reference_library.txt \
657
- --precursor_ion_mz_tolerance 0.1 \
658
- --ionization_mode Positive \
659
- --adduct H \
660
- --similarity_measure cosine \
661
- --opt window_size_centroiding noise_threshold wf_mz \
662
- --bound window_size_centroiding=0.0:0.4 \
663
- --bound noise_threshold=0.0:0.20 \
664
- --bound wf_mz=0.0:5.0 \
665
- --maxiter 3 \
666
- --seed 1 \
667
- --workers 5
668
-
669
- python ../src/pycompound/tuning_CLI_DE.py \
670
- --query_data ${PWD}/tests/data/gcms_query_library_tuning.txt \
671
- --reference_data ${PWD}/tests/data/gcms_reference_library.txt \
672
- --chromatography_platform NRMS \
673
- --similarity_measure cosine \
674
- --opt noise_threshold wf_mz \
675
- --bound noise_threshold=0.0:0.20 \
676
- --bound wf_mz=0.0:5.0 \
677
- --maxiter 3 \
678
- --seed 1 \
679
- --workers 4
680
-
681
- ```
682
-
683
-
684
- <a name="plotting"></a>
685
- ### 3.5 Plot a query spectrum against a reference spectrum before and after spectrum preprocessing transformations
686
- These functions plot a query spectrum against a reference spectrum, both before and after preprocessing. They support HRMS and NRMS data and can be used directly within Python with usage:
687
- ```
688
- from pycompound.plot_spectra import generate_plots_on_HRMS_data
689
- from pycompound.plot_spectra import generate_plots_on_NRMS_data
690
-
691
- generate_plots_on_HRMS_data(
692
- query_data='path_to_query_library',
693
- reference_data='path_to_reference_data',
694
- spectrum_ID1=None,
695
- spectrum_ID2=None,
696
- similarity_measure='cosine',
697
- spectrum_preprocessing_order='FCNMWL',
698
- high_quality_reference_library=False,
699
- mz_min=0,
700
- mz_max=9999999,
701
- int_min=0,
702
- int_max=9999999,
703
- window_size_centroiding=0.5,
704
- window_size_matching=0.5,
705
- noise_threshold=0.0,
706
- wf_mz=0.0,
707
- wf_intensity=1.0,
708
- LET_threshold=0.0,
709
- entropy_dimension=1.1,
710
- y_axis_transformation='normalized',
711
- output_path=None
712
- )
713
-
714
- generate_plots_on_NRMS_data(
715
- query_data='path_to_query_library',
716
- reference_data='path_to_reference_data',
717
- spectrum_ID1=None,
718
- spectrum_ID2=None,
719
- similarity_measure='cosine',
720
- spectrum_preprocessing_order='FNLW',
721
- high_quality_reference_library=False,
722
- mz_min=0,
723
- mz_max=9999999,
724
- int_min=0,
725
- int_max=9999999,
726
- noise_threshold=0.0,
727
- wf_mz=0.0,
728
- wf_intensity=1.0,
729
- LET_threshold=0.0,
730
- entropy_dimension=1.1,
731
- y_axis_transformation='normalized',
732
- output_path=None
733
- )
734
- ```
735
-
736
- To use the command line version, one can run the following from the terminal:
737
- ```
738
- python plot_spectra_CLI.py \
739
- --query_data ${PWD}/tests/data/lcms_query_library.txt \
740
- --reference_data ${PWD}/tests/data/full_GNPS_reference_library.txt \
741
- --spectrum_ID1 463514 \
742
- --spectrum_ID2 112312 \
743
- --chromatography_platform HRMS \
744
- --similarity_measure cosine \
745
- --spectrum_preprocessing_order FCNMWL \
746
- --high_quality_reference_library False \
747
- --mz_min 0 \
748
- --mz_max 9999999 \
749
- --int_min 0 \
750
- --int_max 9999999 \
751
- --window_size_centroiding 0.5 \
752
- --window_size_matching 0.5 \
753
- --noise_threshold 0.0 \
754
- --wf_mz 0.0 \
755
- --wf_intensity 1.0 \
756
- --LET_threshold 0.0 \
757
- --entropy_dimension 1.1 \
758
- --output_path ${PWD}/output_plotting_HRMS.pdf \
759
-
760
- python plot_spectra_CLI.py \
761
- --query_data ${PWD}/data/gcms_query_library.txt \
762
- --reference_data ${PWD}/data/gcms_reference_library.txt \
763
- --spectrum_ID1 463514 \
764
- --spectrum_ID2 112312 \
765
- --chromatography_platform NRMS \
766
- --similarity_measure tsallis \
767
- --spectrum_preprocessing_order FCNMWL \
768
- --high_quality_reference_library False \
769
- --mz_min 0 \
770
- --mz_max 9999999 \
771
- --int_min 0 \
772
- --int_max 9999999 \
773
- --noise_threshold 0.0 \
774
- --wf_mz 0.0 \
775
- --wf_intensity 1.0 \
776
- --LET_threshold 0.0 \
777
- --entropy_dimension 1.1 \
778
- --output_path ${PWD}/output_plotting_NRMS.pdf \
779
- ```
780
-
781
- An example of such a generated plot is seen below.
782
-
783
- <br />
784
-
785
- ![image](https://github.com/user-attachments/assets/de22a402-1329-4bb3-a664-9423159264c8)
786
-
787
- <br />
788
-
789
- This plot compares two MS/MS spectra: Spectrum ID 1 (unknown, in blue) and Spectrum ID 2 (Hectochlorin M+H, in red). The top panel displays the untransformed spectra, while the bottom panel shows the transformed spectra following preprocessing steps. The footnote details are as follows:
790
-
791
- - Filtering: Peaks falling outside the user-defined m/z bounds (mz_min, mz_max) were removed.
792
-
793
- - Similarity Measure: Cosine -- The similarity measure used is cosine correlation.
794
-
795
- - Similarity Score: 0.9946 -- The cosine similarity score between the two transformed spectra.
796
-
797
- - Spectrum Preprocessing Order: FCNMWL -- The sequence of preprocessing steps applied: Filtering (F), Centroiding (C), Noise removal (N), Matching (M), Weight factor transformation (W), and Low-entropy transformation (L).
798
-
799
- - High Quality Reference Library: False -- Both query and reference spectra underwent the same preprocessing transformations.
800
-
801
- - Window Size (Centroiding): 0.5 -- A 0.5 Da window was used for centroiding peaks.
802
-
803
- - Window Size (Matching): 0.5 -- Peaks were aligned using a 0.5 Da m/z tolerance window.
804
-
805
- - Raw-Scale M/Z Range: [217.7, 628.8] -- The minimum and maximum m/z values of peaks with non-zero intensities.
806
-
807
- - Raw-Scale Intensity Range: [3885.0, 5549140] -- The minimum and maximum absolute non-zero intensity values of the raw spectra before normalization.
808
-
809
- - Noise Threshold: 0.0 -- No noise threshold was applied.
810
-
811
- - Weight Factors (m/z, intensity): (0.0, 1.0) -- Non-zero intensities were transformed using weights of 0.0 for m/z and 1.0 for intensity.
812
-
813
- - Low-Entropy Threshold: 0.0 -- No low-entropy transformation was applied.
814
-
815
-
816
- <a name="shiny"></a>
817
- ### 3.6 Shiny application
818
- PyCompound is also available as a Shiny application. The Shiny application offers the same functionality as the Python package and its CLI interface. Simply run the Python script src/pycompound_shiny.py with a command such as `shiny run --launch-browser pycompound_shiny.py` to launch the Shiny application. Alternatively, one can use the publicly available web version at [https://0199ee0c-c2ce-4fdc-5ade-623633df1622.share.connect.posit.cloud/](https://0199ee0c-c2ce-4fdc-5ade-623633df1622.share.connect.posit.cloud/). If you plan to perform some heavy computations such as parameter tuning on large datasets, we recommend either using the Python package, its CLI wrapper, or running the Shiny app on your local machine to take advantage of multithreading (which isn't offered on the POSIT-hosted Shiny app).
819
-
820
-
821
- <a name="bugs-questions"></a>
822
- ## 4. Bugs/Questions?
823
- If you notice any bugs in this software or have any questions, please create a new issue in this repository.
824
-