pycompound 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
app.py CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- from shiny import App, ui, reactive, render
2
+ from shiny import App, ui, reactive, render, req
3
3
  from pycompound.spec_lib_matching import run_spec_lib_matching_on_HRMS_data
4
4
  from pycompound.spec_lib_matching import run_spec_lib_matching_on_NRMS_data
5
5
  from pycompound.spec_lib_matching import tune_params_on_HRMS_data
@@ -7,11 +7,149 @@ from pycompound.spec_lib_matching import tune_params_on_NRMS_data
7
7
  from pycompound.plot_spectra import generate_plots_on_HRMS_data
8
8
  from pycompound.plot_spectra import generate_plots_on_NRMS_data
9
9
  from pathlib import Path
10
+ from contextlib import redirect_stdout, redirect_stderr
10
11
  import subprocess
11
12
  import traceback
12
13
  import asyncio
13
14
  import io
15
+ import os
16
+ import sys
14
17
  import matplotlib.pyplot as plt
18
+ import pandas as pd
19
+ import numpy as np
20
+ import netCDF4 as nc
21
+ from pyteomics import mgf
22
+ from pyteomics import mzml
23
+
24
+
25
def build_library(input_path=None, output_path=None):
    """Load a spectral library file into a long-format peak table.

    The file type is chosen from the (case-insensitive) ending of
    ``input_path``: ``csv`` files are read with ``pandas.read_csv`` and
    returned unchanged; ``mgf``, ``mzML``, ``cdf`` and ``msp`` files are
    parsed into one row per peak.

    Args:
        input_path: Path of the file to read.
        output_path: Accepted for interface compatibility; this function
            does not write anything and never reads this argument.

    Returns:
        For csv input, the DataFrame produced by ``pandas.read_csv``.
        Otherwise a DataFrame with the columns ``id``, ``mz_ratio`` and
        ``intensity``.

    Raises:
        ValueError: If ``input_path`` is None or does not end in a supported
            extension. (Raised instead of calling ``sys.exit()`` so that a
            caller's ``except Exception`` can report the problem rather than
            having ``SystemExit`` propagate through the application.)
    """
    if input_path is None:
        raise ValueError('input_path must be provided')

    lowered = str(input_path).lower()
    if lowered.endswith('csv'):
        return pd.read_csv(input_path)

    if lowered.endswith('mgf'):
        ids, mzs, ints = _peaks_from_pyteomics(input_path, 'mgf')
    elif lowered.endswith('mzml'):
        ids, mzs, ints = _peaks_from_pyteomics(input_path, 'mzML')
    elif lowered.endswith('cdf'):
        ids, mzs, ints = _peaks_from_cdf(input_path)
    elif lowered.endswith('msp'):
        ids, mzs, ints = _peaks_from_msp(input_path)
    else:
        raise ValueError('ERROR: either an \'mgf\', \'mzML\', \'cdf\', or \'msp\' file must be passed to --input_path')

    return pd.DataFrame({'id': ids, 'mz_ratio': mzs, 'intensity': ints})


def _peaks_from_pyteomics(input_path, input_file_type):
    """Collect (ids, mzs, ints) from an mgf or mzML file read via pyteomics."""
    if input_file_type == 'mgf':
        reader_cm = mgf.read(input_path, index_by_scans=True)
    else:
        reader_cm = mzml.read(input_path)

    ids, mzs, ints = [], [], []
    with reader_cm as reader:
        for i, spec in enumerate(reader):
            spec_mzs = spec['m/z array']
            n_peaks = len(spec_mzs)
            if n_peaks == 0:
                # No peaks means no rows; the spectrum name is not looked up.
                continue
            if input_file_type == 'mzML':
                # mzML spectra are labelled by their 1-based position.
                spec_id = f'ID_{i+1}'
            else:
                spec_id = spec['params']['name']
            ids.extend([spec_id] * n_peaks)
            mzs.extend(spec_mzs)
            ints.extend(spec['intensity array'][:n_peaks])
    return ids, mzs, ints


def _peaks_from_cdf(input_path):
    """Collect (ids, mzs, ints) from a netCDF file, one ID per scan."""
    # The context manager closes the dataset even if a variable is missing.
    with nc.Dataset(input_path, 'r') as dataset:
        all_mzs = dataset.variables['mass_values'][:]
        all_ints = dataset.variables['intensity_values'][:]
        scan_idxs = dataset.variables['scan_index'][:]

    n_scans = len(scan_idxs)
    ids, mzs, ints = [], [], []
    for i in range(n_scans):
        if i % 1000 == 0:
            print(f'analyzed {i} out of {n_scans} scans')
        s_idx = scan_idxs[i]
        # Each scan ends where the next one starts; the final scan runs to the
        # end of the peak arrays (it was previously dropped).
        # NOTE(review): assumes scan_index holds each scan's start offset - confirm against the files in use.
        e_idx = scan_idxs[i + 1] if i + 1 < n_scans else len(all_mzs)

        mzs_tmp = all_mzs[s_idx:e_idx]
        ids.extend([f'ID_{i+1}'] * len(mzs_tmp))
        mzs.extend(mzs_tmp)
        ints.extend(all_ints[s_idx:e_idx])
    return ids, mzs, ints


def _peaks_from_msp(input_path):
    """Collect (ids, mzs, ints) from an msp text file."""
    ids, mzs, ints = [], [], []
    spectrum_id = None
    with open(input_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.lower().startswith('name:'):
                # Accept 'Name:', 'NAME:' and a missing space after the colon.
                spectrum_id = line.split(':', 1)[1].strip()
            elif line and line[0].isdigit():
                if spectrum_id is None:
                    # Numeric line before any 'Name:' record: it belongs to no
                    # spectrum, so skip it instead of failing on an unset ID.
                    continue
                try:
                    mz, intensity = map(float, line.split()[:2])
                except ValueError:
                    # Not a parsable "mz intensity" pair; ignore the line.
                    continue
                ids.append(spectrum_id)
                mzs.append(mz)
                ints.append(intensity)
    return ids, mzs, ints
112
+
113
+
114
+
115
def extract_first_column_ids(file_path: str, max_ids: int = 20000):
    """Return the distinct spectrum IDs found in a library file, in file order.

    For ``.csv`` files the IDs are the values of the first column. For any
    other file the text is scanned for lines starting with ``TITLE=`` or
    (case-insensitively) ``name:`` and the text after the separator is used.

    Args:
        file_path: Path of the file to scan.
        max_ids: Maximum number of distinct IDs to return.

    Returns:
        A list of at most ``max_ids`` unique, non-blank ID strings in order
        of first appearance. An empty list if a non-CSV file cannot be
        opened or contains no IDs.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == ".csv":
        first_col = pd.read_csv(file_path, usecols=[0]).iloc[:, 0]
        # Drop missing values BEFORE converting to str; afterwards they would
        # already be the literal string 'nan' and survive as a fake ID.
        candidates = first_col.dropna().astype(str)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        unique = dict.fromkeys(x for x in candidates if x.strip() != "")
        return list(unique)[:max_ids]

    unique = {}
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                # Cap on distinct IDs, matching the CSV branch.
                if len(unique) >= max_ids:
                    break
                ls = line.strip()
                if ls.startswith("TITLE="):
                    found = ls.split("=", 1)[1].strip()
                elif ls.lower().startswith("name:"):
                    found = ls.split(":", 1)[1].strip()
                else:
                    continue
                if found:
                    unique.setdefault(found, None)
    except (OSError, ValueError):
        # Best effort: an unreadable file yields whatever was collected so far
        # (normally nothing) rather than an error.
        pass

    return list(unique)
15
153
 
16
154
 
17
155
  def plot_spectra_ui(platform: str):
@@ -19,8 +157,20 @@ def plot_spectra_ui(platform: str):
19
157
  base_inputs = [
20
158
  ui.input_file("query_data", "Upload query dataset (mgf, mzML, cdf, msp, or csv):"),
21
159
  ui.input_file("reference_data", "Upload reference dataset (mgf, mzML, cdf, msp, or csv):"),
22
- ui.input_text("spectrum_ID1", "Input ID of one spectrum to be plotted:", None),
23
- ui.input_text("spectrum_ID2", "Input ID of another spectrum to be plotted:", None),
160
+ ui.input_selectize(
161
+ "spectrum_ID1",
162
+ "Select spectrum ID 1:",
163
+ choices=[],
164
+ multiple=False,
165
+ options={"placeholder": "Upload a query file to load IDs..."},
166
+ ),
167
+ ui.input_selectize(
168
+ "spectrum_ID2",
169
+ "Select spectrum ID 2 (optional):",
170
+ choices=[],
171
+ multiple=False,
172
+ options={"placeholder": "Upload a reference file to load IDs..."},
173
+ ),
24
174
  ui.input_select("similarity_measure", "Select similarity measure:", ["cosine","shannon","renyi","tsallis","mixture","jaccard","dice","3w_jaccard","sokal_sneath","binary_cosine","mountford","mcconnaughey","driver_kroeber","simpson","braun_banquet","fager_mcgowan","kulczynski","intersection","hamming","hellinger"]),
25
175
  ui.input_select(
26
176
  "high_quality_reference_library",
@@ -71,10 +221,9 @@ def plot_spectra_ui(platform: str):
71
221
  )
72
222
 
73
223
  # Run and Back buttons
74
- run_button = ui.input_action_button("run_btn", "Run", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
224
+ run_button_plot_spectra = ui.download_button("run_btn_plot_spectra", "Run", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
75
225
  back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
76
226
 
77
- #print(len(extra_inputs))
78
227
  # Layout base_inputs and extra_inputs in columns
79
228
  if platform == "HRMS":
80
229
  inputs_columns = ui.layout_columns(
@@ -98,8 +247,9 @@ def plot_spectra_ui(platform: str):
98
247
  ui.TagList(
99
248
  ui.h2("Plot Spectra"),
100
249
  inputs_columns,
101
- run_button,
102
- back_button
250
+ run_button_plot_spectra,
251
+ back_button,
252
+ ui.div(ui.output_text("plot_query_status"), style="margin-top:8px; font-size:14px")
103
253
  ),
104
254
  )
105
255
 
@@ -155,10 +305,9 @@ def run_spec_lib_matching_ui(platform: str):
155
305
 
156
306
 
157
307
  # Run and Back buttons
158
- run_button = ui.input_action_button("run_btn", "Run", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
308
+ run_button_spec_lib_matching = ui.download_button("run_btn_spec_lib_matching", "Run", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
159
309
  back_button = ui.input_action_button("back", "Back to main menu", style="font-size:16px; padding:15px 30px; width:200px; height:80px")
160
310
 
161
- #print(len(extra_inputs))
162
311
  # Layout base_inputs and extra_inputs in columns
163
312
  if platform == "HRMS":
164
313
  inputs_columns = ui.layout_columns(
@@ -177,13 +326,20 @@ def run_spec_lib_matching_ui(platform: str):
177
326
  col_widths=(3, 3, 3, 3),
178
327
  )
179
328
 
329
+ log_panel = ui.card(
330
+ ui.card_header("Identification log"),
331
+ ui.output_text_verbatim("match_log"),
332
+ style="max-height:300px; overflow:auto"
333
+ )
334
+
180
335
  # Combine everything
181
336
  return ui.div(
182
337
  ui.TagList(
183
338
  ui.h2("Run Spectral Library Matching"),
184
339
  inputs_columns,
185
- run_button,
186
- back_button
340
+ run_button_spec_lib_matching,
341
+ back_button,
342
+ log_panel,
187
343
  ),
188
344
  )
189
345
 
@@ -197,19 +353,114 @@ app_ui = ui.page_fluid(
197
353
 
198
354
  def server(input, output, session):
199
355
 
200
- # Track which page to show
201
356
  current_page = reactive.Value("main_menu")
202
357
 
203
- # Track button clicks
204
358
  plot_clicks = reactive.Value(0)
205
359
  match_clicks = reactive.Value(0)
206
360
  back_clicks = reactive.Value(0)
207
361
 
208
- run_status = reactive.Value("Waiting for input...")
362
+ run_status_plot_spectra = reactive.Value("")
363
+ run_status_spec_lib_matching = reactive.Value("")
364
+ match_log_rv = reactive.Value("")
365
+ is_matching_rv = reactive.Value(False)
366
+
367
+ query_ids_rv = reactive.Value([])
368
+ query_file_path_rv = reactive.Value(None)
369
+ query_result_rv = reactive.Value(None)
370
+ query_status_rv = reactive.Value("")
371
+ reference_ids_rv = reactive.Value([])
372
+ reference_file_path_rv = reactive.Value(None)
373
+ reference_result_rv = reactive.Value(None)
374
+ reference_status_rv = reactive.Value("")
375
+
376
+ converted_query_path_rv = reactive.Value(None)
377
+ converted_reference_path_rv = reactive.Value(None)
378
+
379
+
380
+ def process_database(file_path: str):
381
+ suffix = Path(file_path).suffix.lower()
382
+ return {"path": file_path, "suffix": suffix}
383
+
384
+ @render.text
385
+ def plot_query_status():
386
+ return query_status_rv.get() or ""
387
+
388
+
389
+ @reactive.effect
390
+ @reactive.event(input.query_data)
391
+ async def _on_query_upload():
392
+ if current_page() != "plot_spectra":
393
+ return
394
+
395
+ files = input.query_data()
396
+ req(files and len(files) > 0)
397
+
398
+ file_path = files[0]["datapath"]
399
+ query_file_path_rv.set(file_path)
400
+
401
+ query_status_rv.set(f"Processing query database: {Path(file_path).name} …")
402
+ await reactive.flush()
403
+
404
+ try:
405
+ result = await asyncio.to_thread(process_database, file_path)
406
+ query_result_rv.set(result)
407
+ query_status_rv.set("✅ Query database processed.")
408
+ await reactive.flush()
409
+ except Exception as e:
410
+ query_status_rv.set(f"❌ Failed to process query database: {e}")
411
+ await reactive.flush()
412
+
413
+
414
+ @reactive.effect
415
+ @reactive.event(input.reference_data)
416
+ async def _on_reference_upload():
417
+ if current_page() != "plot_spectra":
418
+ return
419
+
420
+ files = input.reference_data()
421
+ req(files and len(files) > 0)
422
+
423
+ file_path = files[0]["datapath"]
424
+ reference_file_path_rv.set(file_path)
425
+
426
+ reference_status_rv.set(f"Processing reference database: {Path(file_path).name} …")
427
+ await reactive.flush()
428
+
429
+ try:
430
+ result = await asyncio.to_thread(process_database, file_path)
431
+ reference_result_rv.set(result)
432
+ reference_status_rv.set("✅ Reference database processed.")
433
+ await reactive.flush()
434
+ except Exception as e:
435
+ reference_status_rv.set(f"❌ Failed to process reference database: {e}")
436
+ await reactive.flush()
437
+
438
+
439
+ @render.text
440
+ def match_log():
441
+ return match_log_rv.get()
442
+
443
+
444
+ class ReactiveWriter(io.TextIOBase):
445
+ def __init__(self, rv):
446
+ self.rv = rv
447
+ def write(self, s: str):
448
+ if not s:
449
+ return 0
450
+ self.rv.set(self.rv.get() + s)
451
+ try:
452
+ loop = asyncio.get_running_loop()
453
+ loop.create_task(reactive.flush())
454
+ except RuntimeError:
455
+ pass
456
+ return len(s)
457
+ def flush(self):
458
+ pass
459
+
460
+
209
461
 
210
462
  @reactive.Effect
211
463
  def _():
212
- # Main menu buttons
213
464
  if input.plot_spectra() > plot_clicks.get():
214
465
  current_page.set("plot_spectra")
215
466
  plot_clicks.set(input.plot_spectra())
@@ -220,6 +471,7 @@ def server(input, output, session):
220
471
  current_page.set("main_menu")
221
472
  back_clicks.set(input.back())
222
473
 
474
+
223
475
  @render.image
224
476
  def image():
225
477
  from pathlib import Path
@@ -228,6 +480,7 @@ def server(input, output, session):
228
480
  img: ImgData = {"src": str(dir / "www/emblem.png"), "width": "320px", "height": "250px"}
229
481
  return img
230
482
 
483
+
231
484
  @output
232
485
  @render.ui
233
486
  def main_ui():
@@ -310,53 +563,227 @@ def server(input, output, session):
310
563
  elif current_page() == "run_spec_lib_matching":
311
564
  return run_spec_lib_matching_ui(input.chromatography_platform())
312
565
 
566
+
567
+
313
568
  @reactive.effect
314
- @reactive.event(input.run_btn)
315
- def _():
316
- if current_page() == "plot_spectra":
317
- if len(input.spectrum_ID1())==0:
318
- spectrum_ID1 = None
569
+ @reactive.event(input.query_data)
570
+ async def _populate_ids_from_query_upload():
571
+ if current_page() != "plot_spectra":
572
+ return
573
+
574
+ files = input.query_data()
575
+ if not files:
576
+ return
577
+
578
+ in_path = Path(files[0]["datapath"])
579
+ suffix = in_path.suffix.lower()
580
+
581
+ # Decide what CSV to read IDs from
582
+ try:
583
+ if suffix == ".csv":
584
+ csv_path = in_path
585
+ converted_query_path_rv.set(str(csv_path))
319
586
  else:
320
- spectrum_ID1 = input.spectrum_ID1()
321
- if len(input.spectrum_ID2())==0:
322
- spectrum_ID2 = None
587
+ query_status_rv.set(f"Converting {in_path.name} → CSV …")
588
+ await reactive.flush()
589
+
590
+ # Choose an output temp path next to the upload
591
+ tmp_csv_path = in_path.with_suffix(".converted.csv")
592
+
593
+ out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_csv_path))
594
+
595
+ # out_obj may be a path (str/PathLike) OR a DataFrame. Normalize to a path.
596
+ if isinstance(out_obj, (str, os.PathLike, Path)):
597
+ csv_path = Path(out_obj)
598
+ elif isinstance(out_obj, pd.DataFrame):
599
+ # Write the DF to our chosen path
600
+ out_obj.to_csv(tmp_csv_path, index=False)
601
+ csv_path = tmp_csv_path
602
+ else:
603
+ raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
604
+
605
+ converted_query_path_rv.set(str(csv_path))
606
+
607
+ query_status_rv.set(f"Reading IDs from: {csv_path.name} …")
608
+ await reactive.flush()
609
+
610
+ # Extract IDs from the CSV’s first column
611
+ ids = await asyncio.to_thread(extract_first_column_ids, str(csv_path))
612
+ query_ids_rv.set(ids)
613
+
614
+ # Update dropdowns
615
+ ui.update_selectize("spectrum_ID1", choices=ids, selected=(ids[0] if ids else None))
616
+
617
+ query_status_rv.set(
618
+ f"✅ Loaded {len(ids)} IDs from {csv_path.name}" if ids else f"⚠️ No IDs found in {csv_path.name}"
619
+ )
620
+ await reactive.flush()
621
+
622
+ except Exception as e:
623
+ query_status_rv.set(f"❌ Failed: {e}")
624
+ await reactive.flush()
625
+ raise
626
+
627
+
628
+ @reactive.effect
629
+ @reactive.event(input.reference_data)
630
+ async def _populate_ids_from_reference_upload():
631
+ if current_page() != "plot_spectra":
632
+ return
633
+
634
+ files = input.reference_data()
635
+ if not files:
636
+ return
637
+
638
+ in_path = Path(files[0]["datapath"])
639
+ suffix = in_path.suffix.lower()
640
+
641
+ # Decide what CSV to read IDs from
642
+ try:
643
+ if suffix == ".csv":
644
+ csv_path = in_path
645
+ converted_reference_path_rv.set(str(csv_path))
323
646
  else:
324
- spectrum_ID2 = input.spectrum_ID2()
325
-
326
- if input.chromatography_platform() == "HRMS":
327
- try:
328
- fig = generate_plots_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=input.high_quality_reference_library(), mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
329
- plt.show()
330
- run_status.set(f"✅ Plotting has finished.")
331
- except Exception as e:
332
- run_status.set(f"❌ Error: {traceback.format_exc()}")
333
- elif input.chromatography_platform() == "NRMS":
334
- try:
335
- generate_plots_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], spectrum_ID1=spectrum_ID1, spectrum_ID2=spectrum_ID2, similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=input.high_quality_reference_library(), mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), y_axis_transformation=input.y_axis_transformation(), return_plot=True)
336
- plt.show()
337
- run_status.set(f"✅ Plotting has finished.")
338
- except Exception as e:
339
- run_status.set(f"❌ Error: {traceback.format_exc()}")
340
-
341
- elif current_page() == 'run_spec_lib_matching':
342
- if input.chromatography_platform() == 'HRMS':
343
- try:
344
- run_spec_lib_matching_on_HRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], likely_reference_ids=None, similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=input.high_quality_reference_library(), mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), window_size_centroiding=input.window_size_centroiding(), window_size_matching=input.window_size_matching(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), n_top_matches_to_save=input.n_top_matches_to_save(), print_id_results=False, output_identification=f'{Path.cwd()}/output_identification.csv', output_similarity_scores=f'{Path.cwd()}/')
345
- run_status.set(f"✅ Spectral library matching has finished and results were written to {Path.cwd()}/output_similarity_scores.csv.")
346
- except Exception as e:
347
- run_status.set(f"❌ Error: {traceback.format_exc()}")
348
- elif input.chromatography_platform() == 'NRMS':
349
- try:
350
- run_spec_lib_matching_on_NRMS_data(query_data=input.query_data()[0]['datapath'], reference_data=input.reference_data()[0]['datapath'], likely_reference_ids=None, similarity_measure=input.similarity_measure(), spectrum_preprocessing_order=input.spectrum_preprocessing_order(), high_quality_reference_library=input.high_quality_reference_library(), mz_min=input.mz_min(), mz_max=input.mz_max(), int_min=input.int_min(), int_max=input.int_max(), noise_threshold=input.noise_threshold(), wf_mz=input.wf_mz(), wf_intensity=input.wf_int(), LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(), n_top_matches_to_save=input.n_top_matches_to_save(), print_id_results=False, output_identification=f'{Path.cwd()}/output_identification.csv', output_similarity_scores=f'{Path.cwd()}/output_similarity_scores.csv')
351
- run_status.set(f"✅ Spectral library matching has finished and results were written to {Path.cwd()}/")
352
- except Exception as e:
353
- run_status.set(f"❌ Error: {traceback.format_exc()}")
647
+ reference_status_rv.set(f"Converting {in_path.name} → CSV …")
648
+ await reactive.flush()
649
+
650
+ # Choose an output temp path next to the upload
651
+ tmp_csv_path = in_path.with_suffix(".converted.csv")
652
+
653
+ out_obj = await asyncio.to_thread(build_library, str(in_path), str(tmp_csv_path))
654
+
655
+ # out_obj may be a path (str/PathLike) OR a DataFrame. Normalize to a path.
656
+ if isinstance(out_obj, (str, os.PathLike, Path)):
657
+ csv_path = Path(out_obj)
658
+ elif isinstance(out_obj, pd.DataFrame):
659
+ # Write the DF to our chosen path
660
+ out_obj.to_csv(tmp_csv_path, index=False)
661
+ csv_path = tmp_csv_path
662
+ else:
663
+ raise TypeError(f"build_library returned unsupported type: {type(out_obj)}")
664
+
665
+ converted_reference_path_rv.set(str(csv_path))
666
+
667
+ reference_status_rv.set(f"Reading IDs from: {csv_path.name} …")
668
+ await reactive.flush()
669
+
670
+ # Extract IDs from the CSV’s first column
671
+ ids = await asyncio.to_thread(extract_first_column_ids, str(csv_path))
672
+ reference_ids_rv.set(ids)
673
+
674
+ # Update dropdowns
675
+ ui.update_selectize("spectrum_ID2", choices=ids, selected=(ids[0] if ids else None))
676
+
677
+ reference_status_rv.set(
678
+ f"✅ Loaded {len(ids)} IDs from {csv_path.name}" if ids else f"⚠️ No IDs found in {csv_path.name}"
679
+ )
680
+ await reactive.flush()
681
+
682
+ except Exception as e:
683
+ reference_status_rv.set(f"❌ Failed: {e}")
684
+ await reactive.flush()
685
+ raise
686
+
687
+
688
+
689
@render.download(filename="plot.png")
def run_btn_plot_spectra():
    """Build the spectrum plot for the current inputs and yield it as PNG bytes.

    Reads the uploaded query/reference files and the preprocessing inputs,
    calls the HRMS or NRMS plotting function according to
    ``input.chromatography_platform()``, and yields the rendered figure.

    Raises:
        ValueError: If the selected platform is neither "HRMS" nor "NRMS"
            (previously this fell through and failed on an unbound ``fig``).
    """
    # An empty selection is passed to the plotting functions as None.
    spectrum_ID1 = input.spectrum_ID1() or None
    spectrum_ID2 = input.spectrum_ID2() or None

    # Arguments shared by the HRMS and NRMS plotting functions.
    shared_kwargs = dict(
        query_data=input.query_data()[0]['datapath'],
        reference_data=input.reference_data()[0]['datapath'],
        spectrum_ID1=spectrum_ID1,
        spectrum_ID2=spectrum_ID2,
        similarity_measure=input.similarity_measure(),
        spectrum_preprocessing_order=input.spectrum_preprocessing_order(),
        high_quality_reference_library=input.high_quality_reference_library(),
        mz_min=input.mz_min(),
        mz_max=input.mz_max(),
        int_min=input.int_min(),
        int_max=input.int_max(),
        noise_threshold=input.noise_threshold(),
        wf_mz=input.wf_mz(),
        wf_intensity=input.wf_int(),
        LET_threshold=input.LET_threshold(),
        entropy_dimension=input.entropy_dimension(),
        y_axis_transformation=input.y_axis_transformation(),
        return_plot=True,
    )

    platform = input.chromatography_platform()
    if platform == "HRMS":
        # The window-size inputs are only read for HRMS, as before.
        fig = generate_plots_on_HRMS_data(
            window_size_centroiding=input.window_size_centroiding(),
            window_size_matching=input.window_size_matching(),
            **shared_kwargs,
        )
    elif platform == "NRMS":
        fig = generate_plots_on_NRMS_data(**shared_kwargs)
    else:
        raise ValueError(f"Unsupported chromatography platform: {platform!r}")

    try:
        with io.BytesIO() as buf:
            fig.savefig(buf, format="png", dpi=150, bbox_inches="tight")
            yield buf.getvalue()
    finally:
        # Release the figure; pyplot otherwise keeps every figure it created
        # alive, so each download would leak one.
        plt.close(fig)
354
702
 
355
703
 
356
704
@render.text
def status_output():
    """Return the run-status text for the page currently shown.

    The spectral-library-matching status is returned on the
    "run_spec_lib_matching" page and the plotting status otherwise.
    Previously the matching status sat behind an unreachable second
    ``return`` and could never be displayed.
    """
    if current_page() == "run_spec_lib_matching":
        return run_status_spec_lib_matching.get()
    return run_status_plot_spectra.get()
708
+
709
+
710
+ class ReactiveWriter(io.TextIOBase):
711
+ def __init__(self, rv: reactive.Value, loop: asyncio.AbstractEventLoop):
712
+ self.rv = rv
713
+ self.loop = loop
714
+
715
+ def write(self, s: str):
716
+ if not s:
717
+ return 0
718
+ def _apply():
719
+ self.rv.set(self.rv.get() + s)
720
+ self.loop.create_task(reactive.flush())
721
+
722
+ self.loop.call_soon_threadsafe(_apply)
723
+ return len(s)
724
+
725
+ def flush(self):
726
+ pass
727
+
728
+
729
@render.download(filename="identification_output.csv")
async def run_btn_spec_lib_matching():
    """Run spectral library matching and yield the identification table as CSV.

    Output printed by the matching function is redirected into
    ``match_log_rv`` through ``ReactiveWriter`` while the function runs in a
    worker thread. The resulting DataFrame is yielded as CSV text including
    its index.

    Raises:
        RuntimeError: If the matching function terminates via ``sys.exit()``.
        Exception: Any other error from the matching function is logged to
            ``match_log_rv`` and re-raised.
    """
    # Show something in the log before the long-running work starts.
    match_log_rv.set("Starting identification...\n")
    await reactive.flush()

    # The select input may deliver the flag as text or a number; coerce to bool.
    hq = input.high_quality_reference_library()
    if isinstance(hq, str):
        hq = hq.lower() == "true"
    elif isinstance(hq, (int, float)):
        hq = bool(hq)

    common_kwargs = dict(
        query_data=input.query_data()[0]["datapath"],
        reference_data=input.reference_data()[0]["datapath"],
        likely_reference_ids=None,
        similarity_measure=input.similarity_measure(),
        spectrum_preprocessing_order=input.spectrum_preprocessing_order(),
        high_quality_reference_library=hq,
        mz_min=input.mz_min(), mz_max=input.mz_max(),
        int_min=input.int_min(), int_max=input.int_max(),
        noise_threshold=input.noise_threshold(),
        wf_mz=input.wf_mz(), wf_intensity=input.wf_int(),
        LET_threshold=input.LET_threshold(), entropy_dimension=input.entropy_dimension(),
        n_top_matches_to_save=input.n_top_matches_to_save(),
        print_id_results=True,  # ensure the library actually prints progress
        output_identification=str(Path.cwd() / "identification_output.csv"),
        output_similarity_scores=str(Path.cwd() / "similarity_scores.csv"),
        return_ID_output=True,
    )

    loop = asyncio.get_running_loop()
    rw = ReactiveWriter(match_log_rv, loop)

    # Run the heavy function in a thread so the event loop stays responsive.
    try:
        with redirect_stdout(rw), redirect_stderr(rw):
            if input.chromatography_platform() == "HRMS":
                df_out = await asyncio.to_thread(
                    run_spec_lib_matching_on_HRMS_data,
                    window_size_centroiding=input.window_size_centroiding(),
                    window_size_matching=input.window_size_matching(),
                    **common_kwargs
                )
            else:
                df_out = await asyncio.to_thread(
                    run_spec_lib_matching_on_NRMS_data, **common_kwargs
                )
        match_log_rv.set(match_log_rv.get() + "\n✅ Identification finished.\n")
        await reactive.flush()
    except SystemExit as e:
        # The matching functions report invalid arguments by printing a
        # message and calling sys.exit(); SystemExit is not an Exception, so
        # convert it into an ordinary error instead of letting it escape.
        match_log_rv.set(match_log_rv.get() + "\n❌ Error: identification aborted; see the messages above.\n")
        await reactive.flush()
        raise RuntimeError("Spectral library matching aborted") from e
    except Exception as e:
        match_log_rv.set(match_log_rv.get() + f"\n❌ Error: {e}\n")
        await reactive.flush()
        raise

    # Keep the index in the CSV: the matching function stores the query
    # spectrum IDs there, so index=False would strip them from the download.
    yield df_out.to_csv()
360
787
 
361
788
 
362
789
  app = App(app_ui, server)
@@ -45,7 +45,7 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
45
45
  extension = extension[(len(extension)-1)]
46
46
  if extension == 'mgf' or extension == 'MGF' or extension == 'mzML' or extension == 'mzml' or extension == 'MZML' or extension == 'cdf' or extension == 'CDF':
47
47
  output_path_tmp = query_data[:-3] + 'csv'
48
- build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=False)
48
+ build_library_from_raw_data(input_path=query_data, output_path=output_path_tmp, is_reference=True)
49
49
  df_query = pd.read_csv(output_path_tmp)
50
50
  if extension == 'csv' or extension == 'CSV':
51
51
  df_query = pd.read_csv(query_data)
@@ -96,8 +96,8 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
96
96
  print(f'Error: spectrum_preprocessing_order must contain only \'C\', \'F\', \'M\', \'N\', \'L\', \'W\'.')
97
97
  sys.exit()
98
98
 
99
- if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
100
- print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
99
+ if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','interection','hamming','hellinger']:
100
+ print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, interection, hamming, or hellinger.')
101
101
  sys.exit()
102
102
 
103
103
  if isinstance(int_min,int) is True:
@@ -177,6 +177,8 @@ def generate_plots_on_HRMS_data(query_data=None, reference_data=None, spectrum_I
177
177
  spec_tmp = spectrum_ID1
178
178
  spectrum_ID1 = spectrum_ID2
179
179
  spectrum_ID2 = spec_tmp
180
+ print(unique_query_ids)
181
+ print(spectrum_ID1)
180
182
  query_idx = unique_query_ids.index(spectrum_ID1)
181
183
  reference_idx = unique_reference_ids.index(spectrum_ID2)
182
184
  q_idxs_tmp = np.where(df_query.iloc[:,0].astype(str) == unique_query_ids[query_idx])[0]
@@ -400,8 +402,8 @@ def generate_plots_on_NRMS_data(query_data=None, reference_data=None, spectrum_I
400
402
  print(f'Error: spectrum_preprocessing_order must contain only \'F\', \'N\', \'W\', \'L\'.')
401
403
  sys.exit()
402
404
 
403
- if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','intersection','hamming','hellinger']:
404
- print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, intersection, hamming, or hellinger.')
405
+ if similarity_measure not in ['cosine','shannon','renyi','tsallis','mixture','jaccard','dice','3w_jaccard','sokal_sneath','binary_cosine','mountford','mcconnaughey','driver_kroeber','simpson','braun_banquet','fager_mcgowan','kulczynski','interection','hamming','hellinger']:
406
+ print('\nError: similarity_measure must be either cosine, shannon, renyi, tsallis, mixture, jaccard, dice, 3w_jaccard, sokal_sneath, binary_cosine, mountford, mcconnaughey, driver_kroeber, simpson, braun_banquet, fager_mcgowan, kulczynski, interection, hamming, or hellinger.')
405
407
  sys.exit()
406
408
 
407
409
  if isinstance(int_min,int) is True:
@@ -389,7 +389,7 @@ def get_acc_NRMS(df_query, df_reference, unique_query_ids, unique_reference_ids,
389
389
 
390
390
 
391
391
 
392
- def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None):
392
+ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, spectrum_preprocessing_order='FCNMWL', high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, window_size_centroiding=0.5, window_size_matching=0.5, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False):
393
393
  '''
394
394
  runs spectral library matching on high-resolution mass spectrometry (HRMS) data
395
395
 
@@ -636,22 +636,26 @@ def run_spec_lib_matching_on_HRMS_data(query_data=None, reference_data=None, lik
636
636
  df_top_ref_specs.index = unique_query_ids
637
637
  df_top_ref_specs.index.names = ['Query Spectrum ID']
638
638
 
639
+ df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
640
+
639
641
  # print the identification results if the user desires
640
642
  if print_id_results == True:
641
643
  print(df_top_ref_specs.to_string())
642
644
 
643
- # write spectral library matching results to disk
644
- df_top_ref_specs.to_csv(output_identification)
645
+ if return_ID_output is False:
646
+ # write spectral library matching results to disk
647
+ df_top_ref_specs.to_csv(output_identification)
645
648
 
646
- # write all similarity scores to disk
647
- df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
648
- df_scores.to_csv(output_similarity_scores)
649
+ # write all similarity scores to disk
650
+ df_scores.to_csv(output_similarity_scores)
651
+ else:
652
+ return df_top_ref_specs
649
653
 
650
654
 
651
655
 
652
656
 
653
657
 
654
- def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None):
658
+ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, likely_reference_ids=None, spectrum_preprocessing_order='FNLW', similarity_measure='cosine', weights={'Cosine':0.25,'Shannon':0.25,'Renyi':0.25,'Tsallis':0.25}, high_quality_reference_library=False, mz_min=0, mz_max=9999999, int_min=0, int_max=9999999, noise_threshold=0.0, wf_mz=0.0, wf_intensity=1.0, LET_threshold=0.0, entropy_dimension=1.1, n_top_matches_to_save=1, print_id_results=False, output_identification=None, output_similarity_scores=None, return_ID_output=False):
655
659
  '''
656
660
  runs spectral library matching on nominal-resolution mass spectrometry (NRMS) data
657
661
 
@@ -886,11 +890,15 @@ def run_spec_lib_matching_on_NRMS_data(query_data=None, reference_data=None, lik
886
890
  if print_id_results == True:
887
891
  print(df_top_ref_specs.to_string())
888
892
 
889
- # write spectral library matching results to disk
890
- df_top_ref_specs.to_csv(output_identification)
891
-
892
- # write all similarity scores to disk
893
893
  df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
894
- df_scores.to_csv(output_similarity_scores)
895
894
 
895
+ if return_ID_output is False:
896
+ # write spectral library matching results to disk
897
+ df_top_ref_specs.to_csv(output_identification)
898
+
899
+ # write all similarity scores to disk
900
+ df_scores.columns = ['Reference Spectrum ID: ' + col for col in list(map(str,df_scores.columns.tolist()))]
901
+ df_scores.to_csv(output_similarity_scores)
902
+ else:
903
+ return df_top_ref_specs
896
904
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pycompound
3
- Version: 0.0.8
3
+ Version: 0.0.10
4
4
  Summary: Python package to perform compound identification in mass spectrometry via spectral library matching.
5
5
  Author-email: Hunter Dlugas <fy7392@wayne.edu>
6
6
  License-Expression: MIT
@@ -19,10 +19,9 @@ Requires-Dist: pyteomics==4.7.2
19
19
  Requires-Dist: netCDF4==1.6.5
20
20
  Requires-Dist: lxml>=5.1.0
21
21
  Requires-Dist: orjson==3.11.0
22
+ Requires-Dist: shiny==1.4.0
22
23
  Requires-Dist: joblib==1.5.2
23
24
  Dynamic: license-file
24
25
 
25
26
  # PyCompound
26
-
27
27
  A Python-based tool for spectral library matching, PyCompound is available as a Python package with a command-line interface (CLI) and as a GUI application built with Python/Shiny. It performs spectral library matching to identify chemical compounds, offering a range of spectrum preprocessing transformations and similarity measures, including Cosine, three entropy-based similarity measures, and a plethora of binary similarity measures. PyCompound also includes functionality to tune parameters commonly used in a compound identification workflow given a query library of spectra with known IDs. PyCompound supports both high-resolution mass spectrometry (HRMS) data (e.g., LC-MS/MS) and nominal-resolution mass spectrometry (NRMS) data (e.g., GC-MS). For the full documentation, see the GitHub repository: https://github.com/hdlugas/pycompound.
28
-
@@ -0,0 +1,14 @@
1
+ app.py,sha256=ab1hII23lVwAmMh4bfzdni50vz-bK-ODbJT_b1VjGMA,34678
2
+ pycompound/build_library.py,sha256=8ghpX8wfj6u-3V5X2IdJ-e8G_FRSla1lO0pzLj7hOtI,5373
3
+ pycompound/plot_spectra.py,sha256=_5r9YR3AA2IfTbcyfyTnPxxxA92T4hQ9olOgaw7FE6A,42082
4
+ pycompound/plot_spectra_CLI.py,sha256=ObaLad5Z5DmfQB-j0HSCg1mLORbYj2BM3hb5Yd0ZdDI,8395
5
+ pycompound/processing.py,sha256=vqtKaZ6vot6wlnKNTYUQFX7ccPpnCAl0L6bN289vZoM,11068
6
+ pycompound/similarity_measures.py,sha256=TuvtEXWwyxE6dfpmuAqRC6gOHvHg3Jf21099pVaNBAs,10702
7
+ pycompound/spec_lib_matching.py,sha256=p8gj-72fjkf0p7XrqEl9hnYUGNSbyr7BXugvRT7Y5OA,60311
8
+ pycompound/spec_lib_matching_CLI.py,sha256=EdXM0dRQfwGQAK4OKxhcVytuUnX9pRyJROwC6rloZ9s,9915
9
+ pycompound/tuning_CLI.py,sha256=lkFBRZ5VxCBteIh_KTkQFdUBVZA0dL-BLiyMZce1vzE,8539
10
+ pycompound-0.0.10.dist-info/licenses/LICENSE,sha256=fPFFlkSGg60VQWyWqTSv8yoJnpLzppzdihVWY5NKom8,1064
11
+ pycompound-0.0.10.dist-info/METADATA,sha256=Gb0d0ZbClc4AFRcDjMnNWcb4TCuq84CJl-AKCNjY2wU,1733
12
+ pycompound-0.0.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
+ pycompound-0.0.10.dist-info/top_level.txt,sha256=wFBLVrqpC07HghIU8tsEdgdvgkdOE3GN_1Gfjk-uEUc,15
14
+ pycompound-0.0.10.dist-info/RECORD,,
@@ -1,14 +0,0 @@
1
- app.py,sha256=DCaQEp8_1-oldlhzEKo5HpKC2S-orV9gJxuSEHga9MY,21493
2
- pycompound/build_library.py,sha256=8ghpX8wfj6u-3V5X2IdJ-e8G_FRSla1lO0pzLj7hOtI,5373
3
- pycompound/plot_spectra.py,sha256=9s6bDgNv_CZsgMlM_CDToJMxJCasVJbFAGoUrZPfnW8,42027
4
- pycompound/plot_spectra_CLI.py,sha256=ObaLad5Z5DmfQB-j0HSCg1mLORbYj2BM3hb5Yd0ZdDI,8395
5
- pycompound/processing.py,sha256=vqtKaZ6vot6wlnKNTYUQFX7ccPpnCAl0L6bN289vZoM,11068
6
- pycompound/similarity_measures.py,sha256=TuvtEXWwyxE6dfpmuAqRC6gOHvHg3Jf21099pVaNBAs,10702
7
- pycompound/spec_lib_matching.py,sha256=pfDPmH1aQ11_25T80U9i0OUbgjCvvkzNEcDeBrDWNtA,59962
8
- pycompound/spec_lib_matching_CLI.py,sha256=EdXM0dRQfwGQAK4OKxhcVytuUnX9pRyJROwC6rloZ9s,9915
9
- pycompound/tuning_CLI.py,sha256=lkFBRZ5VxCBteIh_KTkQFdUBVZA0dL-BLiyMZce1vzE,8539
10
- pycompound-0.0.8.dist-info/licenses/LICENSE,sha256=fPFFlkSGg60VQWyWqTSv8yoJnpLzppzdihVWY5NKom8,1064
11
- pycompound-0.0.8.dist-info/METADATA,sha256=N76PE3DJwBvDV2VU4qXrbMvVQc7tTHaxZE_fpBxuC84,1706
12
- pycompound-0.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- pycompound-0.0.8.dist-info/top_level.txt,sha256=wFBLVrqpC07HghIU8tsEdgdvgkdOE3GN_1Gfjk-uEUc,15
14
- pycompound-0.0.8.dist-info/RECORD,,