cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1299 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LCMS Report Analyzer
4
+ Parses Waters MassLynx PDF reports using pdfplumber with spatial word-level
5
+ extraction. Extracts peak tables from all three detectors (TAC, 220nm, 254nm),
6
+ mass spectra (ESI+/ESI-), and UV lambda-max data. Optionally identifies
7
+ SM/product peaks by expected mass.
8
+
9
+ PDF layout handling:
10
+ - Pages 1-2: chromatograms + peak tables (TAC, 220nm, 254nm) — parsed from
11
+ full-page extracted text.
12
+ - Pages 3+: mass spectra + UV panels in a 2-column × 4-row grid — parsed
13
+ using word-level coordinates to avoid column interleaving. Each panel is
14
+ isolated by bounding box (left column x < 306, right column x >= 306).
15
+
16
+ Data structures:
17
+ - LCMSReport: header info (sample name, date, instrument, method) + peaks
18
+ - ChromPeak: RT, area/area% for each detector, MS spectra, UV lambda-max
19
+ - peak_num is a string: "4", or "2a"/"2b" when a table has duplicate numbers
20
+ - MassSpectrum: mode ("ES+"/"ES-") + top_ions (m/z values, descending intensity)
21
+
22
+ Usage:
23
+ python lcms_analyzer.py \\
24
+ --sm-mass 445 \\
25
+ --product-mass 345 \\
26
+ --procedure "KL-7003-008 (100 mg, 224 umol) was dissolved in..." \\
27
+ file1.pdf file2.pdf ...
28
+ """
29
+
30
+ import argparse
31
+ import re
32
+ import os
33
+ import sys
34
+ from dataclasses import dataclass, field
35
+ from typing import List, Optional, Tuple, Dict
36
+ from datetime import datetime
37
+ from collections import defaultdict
38
+
39
+ from cdxml_toolkit.constants import (
40
+ LCMS_COLUMN_BOUNDARY,
41
+ LCMS_MS_AXIS_TICKS,
42
+ LCMS_UV_AXIS_TICKS,
43
+ LCMS_UV_WAVELENGTH_MIN,
44
+ LCMS_UV_WAVELENGTH_MAX,
45
+ )
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Data structures
49
+ # ---------------------------------------------------------------------------
50
+
51
@dataclass
class MassSpectrum:
    """Labelled ions of one ESI+ or ESI- spectrum for a chromatographic peak."""

    # Ionisation mode marker as printed in the report: "ES+" or "ES-".
    mode: str
    # Labelled m/z values in the order the parser collected them.
    # NOTE(review): originally documented as "up to 2 m/z values, tallest
    # first"; nothing in this class enforces a length or an ordering.
    top_ions: List[float] = field(default_factory=list)
56
+
57
@dataclass
class ChromPeak:
    """One integrated chromatogram peak, merged across the detector tables."""

    # String peak id, e.g. "4", or "2a"/"2b" when a raw number is reused.
    peak_num: str
    # Retention time as printed in the peak table.
    rt: float
    # TAC detector area and area %.
    area: Optional[float] = None
    area_pct: Optional[float] = None
    # Width / height / "Mass Found" text taken from the peak-table row.
    width: Optional[float] = None
    height: Optional[float] = None
    mass_found: Optional[str] = None
    # Spectra and UV lambda-max values attached from the spectrum pages.
    ms_spectra: List[MassSpectrum] = field(default_factory=list)
    uv_lambda_max: List[float] = field(default_factory=list)
    # 220 nm channel area and area %.
    area_220nm: Optional[float] = None
    area_pct_220nm: Optional[float] = None
    # 254 nm channel area and area %.
    area_254nm: Optional[float] = None
    area_pct_254nm: Optional[float] = None
73
+
74
@dataclass
class LCMSReport:
    """Everything parsed out of a single MassLynx PDF report."""

    # Base name of the PDF file the report was parsed from.
    filename: str
    # Header fields ("Unknown" when the header line was not found).
    sample_name: str
    date: str
    instrument: str
    method_path: str
    # Abbreviated method name used for annotation.
    method_short: str
    # Peaks in the order produced by the peak-table parser.
    peaks: List[ChromPeak] = field(default_factory=list)
    # File modification time formatted as "YYYY-MM-DD HH:MM".
    file_modified: Optional[str] = None
    # "HH:MM:SS" taken from the PDF header, when present.
    run_time: Optional[str] = None
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # PDF text extraction
89
+ # ---------------------------------------------------------------------------
90
+
91
def extract_all_text(pdf_path: str) -> str:
    """Return the text of every page of *pdf_path*, separated by blank lines.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are left out of the result.
    """
    import pdfplumber

    with pdfplumber.open(pdf_path) as document:
        per_page = [page.extract_text() for page in document.pages]
    return "\n\n".join(chunk for chunk in per_page if chunk)
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # Parsing logic
104
+ # ---------------------------------------------------------------------------
105
+
106
def parse_method_short(method_path: str) -> str:
    """
    Extract a short method description from the full MassLynx method path.

    e.g. '...21_CSH_C18_AmF_5to100_ACN_220_254nm_TAC_TIC_1p9min.olp'
         -> 'CSH C18, AmF, 5-100%, 1.9 min'

    e.g. '...21_CSH_C18_AmB_50to100_ACN_220_254nm_TAC_TIC_1p9min.olp'
         -> 'CSH C18, AmB, 50-100%, 1.9 min'

    The file name is split on '_' and each token is classified as column
    chemistry, buffer, gradient range or run time. When no token is
    recognised the bare file name (without '.olp') is returned.
    """
    # Method paths in the report header are Windows-style. ntpath splits on
    # both '\\' and '/', so the directory part is removed on every host OS
    # (os.path.basename leaves backslash-separated directories in place on
    # POSIX, which let directory names leak into the token scan below).
    import ntpath

    basename = ntpath.basename(method_path).replace('.olp', '')

    column = ""
    buffer_type = ""
    gradient = ""
    runtime = ""

    for p in basename.split('_'):
        pl = p.lower()

        # Column chemistry: every matching token is appended, e.g. "CSH C18".
        if any(kw in pl for kw in ('c18', 'c8', 'beh', 'csh', 'hss')):
            column = (column + " " + p).strip()

        # Buffer/modifier: the first recognised token wins.
        if not buffer_type:
            if pl.startswith('amb'):
                buffer_type = "AmB"
            elif pl.startswith('amf'):
                buffer_type = "AmF"
            elif pl == 'fa':
                buffer_type = "FA"
            elif pl.startswith('tfa'):
                buffer_type = "TFA"

        # Gradient range: 5to100, 5to50, 50to100
        m_grad = re.match(r'(\d+)to(\d+)', pl)
        if m_grad and not gradient:
            gradient = f"{m_grad.group(1)}-{m_grad.group(2)}%"

        # Runtime: 1p9min -> 1.9 min
        if 'min' in pl and not runtime:
            runtime = p.replace('p', '.').replace('min', ' min')

    pieces = [x for x in (column, buffer_type, gradient, runtime) if x]
    return ", ".join(pieces) if pieces else basename
151
+
152
+
153
def method_basename(method_path: str) -> str:
    """Return the method filename without directory and extension, lowercased.

    Used for grouping files by exact method — files with the same method
    basename are comparable (same column, buffer, gradient, runtime).
    """
    # ntpath splits on both '\\' and '/', so the Windows-style method path is
    # reduced to its file name on every host OS and the grouping key does not
    # depend on where the parser runs.
    import ntpath

    return ntpath.basename(method_path).replace('.olp', '').lower()
160
+
161
+
162
def parse_header(text: str) -> dict:
    """Pull the header fields out of the full report text.

    Returns a dict with the keys 'sample_name', 'date', 'run_time',
    'instrument' and 'method_path'. Missing fields are "Unknown", except
    'run_time', which is None when no "Time: HH:MM:SS" entry is found.
    """
    def first_group(pattern: str) -> Optional[str]:
        found = re.search(pattern, text)
        return found.group(1) if found else None

    sample = first_group(r'Sample Name:\s*(\S+)')
    date = first_group(r'Date:\s*(\S+)')
    run_time = first_group(r'Time:\s*(\d{1,2}:\d{2}:\d{2})')
    # The instrument name sits on the line after "Page 1", ahead of a suffix
    # such as "_UPLC", e.g. "PPIMSA05_UPLC-PDA-MS Open Access ...".
    instrument_token = first_group(r'Page\s+1\s*\n(\w+)')
    method = first_group(r'Method:\s*(.+?)(?:\n|Report)')

    return {
        'sample_name': "Unknown" if sample is None else sample,
        'date': "Unknown" if date is None else date,
        'run_time': run_time,
        'instrument': ("Unknown" if instrument_token is None
                       else instrument_token.split('_')[0]),
        'method_path': "Unknown" if method is None else method.strip(),
    }
184
+
185
+
186
+ # ---------------------------------------------------------------------------
187
+ # Peak table parsing — all three detectors (TAC, 220nm, 254nm)
188
+ # ---------------------------------------------------------------------------
189
+
190
+ _ROW_PATTERN = re.compile(
191
+ r'^\s*(\d+)\s+' # peak number
192
+ r'(\d+\.\d+)\s+' # retention time
193
+ r'(\d+)\s+' # area
194
+ r'(\d+\.\d+)\s+' # area %
195
+ r'(\d+\.\d+)\s+' # width
196
+ r'(\d+)\s+' # height
197
+ r'(.+?)$', # mass found
198
+ re.MULTILINE
199
+ )
200
+
201
+ _TABLE_HEADER = re.compile(r'Peak\s+Time\s+Area\s+Area\s*%', re.IGNORECASE)
202
+
203
+
204
+ def _parse_table_rows(text_block: str) -> List[dict]:
205
+ """Parse peak rows from a single table text block."""
206
+ rows = []
207
+ for m in _ROW_PATTERN.finditer(text_block):
208
+ rows.append({
209
+ 'peak_num_raw': int(m.group(1)),
210
+ 'rt': float(m.group(2)),
211
+ 'area': float(m.group(3)),
212
+ 'area_pct': float(m.group(4)),
213
+ 'width': float(m.group(5)),
214
+ 'height': float(m.group(6)),
215
+ 'mass_found': m.group(7).strip(),
216
+ })
217
+ return rows
218
+
219
+
220
+ def _identify_detector(text: str, header_start: int) -> str:
221
+ """Look backward from a table header to identify detector type."""
222
+ before = text[:header_start]
223
+ tac_pos = max((m.start() for m in re.finditer(r'TAC:\s*Wavelength|UV Detector:\s*TAC', before)), default=-1)
224
+ ch1_pos = max((m.start() for m in re.finditer(r'Ch1\s*220nm|PDA\s*Ch1', before)), default=-1)
225
+ ch2_pos = max((m.start() for m in re.finditer(r'Ch2\s*254nm|PDA\s*Ch2', before)), default=-1)
226
+
227
+ best = max(('TAC', tac_pos), ('220nm', ch1_pos), ('254nm', ch2_pos), key=lambda x: x[1])
228
+ return best[0] if best[1] >= 0 else 'TAC'
229
+
230
+
231
+ def _build_peak_id_map(tables_raw: Dict[str, List[dict]]) -> Dict[Tuple[int, float], str]:
232
+ """
233
+ Build mapping from (raw_peak_num, rt) -> string peak_id.
234
+ Assigns 'a', 'b' suffixes when a peak number appears at multiple distinct RTs.
235
+ """
236
+ all_pairs = set()
237
+ for rows in tables_raw.values():
238
+ for row in rows:
239
+ all_pairs.add((row['peak_num_raw'], row['rt']))
240
+
241
+ by_num: Dict[int, List[float]] = defaultdict(list)
242
+ for num, rt in all_pairs:
243
+ by_num[num].append(rt)
244
+
245
+ mapping = {}
246
+ for num, rts in by_num.items():
247
+ rts_sorted = sorted(set(rts))
248
+ if len(rts_sorted) == 1:
249
+ mapping[(num, rts_sorted[0])] = str(num)
250
+ else:
251
+ for i, rt in enumerate(rts_sorted):
252
+ mapping[(num, rt)] = f"{num}{chr(ord('a') + i)}"
253
+
254
+ return mapping
255
+
256
+
257
+ def _lookup_peak_id(id_map: Dict[Tuple[int, float], str], raw_num: int, rt: float,
258
+ tolerance: float = 0.02) -> str:
259
+ """Look up string peak ID with RT tolerance for fuzzy matching."""
260
+ if (raw_num, rt) in id_map:
261
+ return id_map[(raw_num, rt)]
262
+ for (num, map_rt), pid in id_map.items():
263
+ if num == raw_num and abs(map_rt - rt) < tolerance:
264
+ return pid
265
+ return str(raw_num)
266
+
267
+
268
def parse_all_peak_tables(text: str) -> Tuple[List[ChromPeak], Dict[Tuple[int, float], str]]:
    """
    Parse every UV peak integration table (TAC, 220nm, 254nm) in *text*.

    Returns (peaks, id_map): peaks are sorted by (rt, peak_num) and id_map
    maps (raw_num, rt) -> string peak_id. Both are empty when the text holds
    no peak-table header.
    """
    header_matches = list(_TABLE_HEADER.finditer(text))
    if not header_matches:
        return [], {}

    # A table body runs from its header to the next header (or end of text).
    body_ends = [nxt.start() for nxt in header_matches[1:]] + [len(text)]
    tables_raw: Dict[str, List[dict]] = {}
    for header, body_end in zip(header_matches, body_ends):
        detector = _identify_detector(text, header.start())
        parsed_rows = _parse_table_rows(text[header.end():body_end])
        tables_raw.setdefault(detector, []).extend(parsed_rows)

    id_map = _build_peak_id_map(tables_raw)

    # Which ChromPeak attributes receive each detector's area / area %.
    area_attrs = {
        'TAC': ('area', 'area_pct'),
        '220nm': ('area_220nm', 'area_pct_220nm'),
        '254nm': ('area_254nm', 'area_pct_254nm'),
    }

    peaks_by_id: Dict[str, ChromPeak] = {}
    for detector, rows in tables_raw.items():
        targets = area_attrs.get(detector)
        for row in rows:
            pid = id_map[(row['peak_num_raw'], row['rt'])]
            peak = peaks_by_id.get(pid)
            if peak is None:
                # The first table mentioning a peak supplies the shared fields.
                peak = ChromPeak(
                    peak_num=pid,
                    rt=row['rt'],
                    width=row['width'],
                    height=row['height'],
                    mass_found=row['mass_found'],
                )
                peaks_by_id[pid] = peak
            if targets is not None:
                setattr(peak, targets[0], row['area'])
                setattr(peak, targets[1], row['area_pct'])

    ordered = sorted(peaks_by_id.values(), key=lambda p: (p.rt, p.peak_num))
    return ordered, id_map
316
+
317
+
318
+ # ---------------------------------------------------------------------------
319
+ # Spatial mass spectrum + UV parsing (fixes two-column layout)
320
+ # ---------------------------------------------------------------------------
321
+
322
# Module-private aliases for the shared constants. Values found in these
# collections are dropped by the m/z and UV wavelength extractors below.
_MS_AXIS_TICKS, _UV_AXIS_TICKS = LCMS_MS_AXIS_TICKS, LCMS_UV_AXIS_TICKS
324
+
325
+
326
+ def _find_panel_headers(words: List[dict]) -> List[dict]:
327
+ """Find 'Peak' words that are part of spectrum 'Peak Time Mass' headers.
328
+
329
+ Rejects peak-table headers ('Peak Time Area Area% Width Height Mass Found')
330
+ which also contain 'Peak', 'Time', and 'Mass' but are NOT spectrum panels.
331
+ The discriminator is the presence of 'Area' or 'Height' as neighbours.
332
+ """
333
+ results = []
334
+ for w in words:
335
+ if w['text'] != 'Peak':
336
+ continue
337
+ y = w['top']
338
+ neighbors = [nw['text'] for nw in words
339
+ if abs(nw['top'] - y) < 3 and nw['x0'] > w['x0']
340
+ and nw['x0'] < w['x0'] + 400]
341
+ if 'Time' in neighbors and 'Mass' in neighbors:
342
+ # Reject peak-table headers which have "Area" / "Height" / "Width"
343
+ if 'Area' in neighbors or 'Height' in neighbors or 'Width' in neighbors:
344
+ continue
345
+ results.append(w)
346
+ return results
347
+
348
+
349
+ def _group_headers_into_rows(headers: List[dict], y_tolerance: float = 5.0):
350
+ """Group panel headers by y-coordinate into rows. Returns [(y, [headers])]."""
351
+ if not headers:
352
+ return []
353
+ sorted_h = sorted(headers, key=lambda w: w['top'])
354
+ rows = []
355
+ current = [sorted_h[0]]
356
+ current_y = sorted_h[0]['top']
357
+ for h in sorted_h[1:]:
358
+ if abs(h['top'] - current_y) < y_tolerance:
359
+ current.append(h)
360
+ else:
361
+ rows.append((current_y, current))
362
+ current = [h]
363
+ current_y = h['top']
364
+ rows.append((current_y, current))
365
+ return rows
366
+
367
+
368
def _extract_mz_values(word_text: str) -> List[float]:
    """
    Pull m/z values out of one word, splitting numbers that were run together.

    Every "digits, dot, one digit" run is read as a value, so
    '569.1814.6874.9' gives [569.1, 814.6, 874.9]. Values outside the open
    range (50, 2000) and values listed in _MS_AXIS_TICKS are discarded.
    """
    candidates = (float(token) for token in re.findall(r'(\d+\.\d)', word_text))
    return [
        mz for mz in candidates
        if 50 < mz < 2000 and mz not in _MS_AXIS_TICKS
    ]
380
+
381
+
382
def _parse_ms_from_words(panel_words: List[dict]) -> List[MassSpectrum]:
    """
    Build the MassSpectrum entries found in one panel's words.

    Each 'ES+' / 'ES-' word opens a section that runs down to the next mode
    marker. The last section ends at the topmost UV-related word, or at the
    bottom of the panel when there is none. The m/z values in a section are
    collected in reading order (top to bottom, then left to right), and a
    section without any value produces no spectrum.
    """
    ordered = sorted(panel_words, key=lambda w: (w['top'], w['x0']))

    # (y, mode) for every word that is exactly an ionisation-mode marker.
    markers = []
    for word in ordered:
        hit = re.match(r'(ES[+-])$', word['text'])
        if hit is not None:
            markers.append((word['top'], hit.group(1)))
    if not markers:
        return []

    spectra = []
    last_index = len(markers) - 1
    for idx, (marker_y, mode) in enumerate(markers):
        if idx < last_index:
            lower_y = markers[idx + 1][0]
        else:
            uv_tops = [w['top'] for w in ordered
                       if 'UV' in w['text'] or w['text'] == 'Nm']
            lower_y = min(uv_tops) if uv_tops else float('inf')

        # Values strictly between this marker and the section's lower edge.
        ions = [
            mz
            for word in ordered
            if marker_y < word['top'] < lower_y
            for mz in _extract_mz_values(word['text'])
        ]
        if ions:
            spectra.append(MassSpectrum(mode=mode, top_ions=ions))

    return spectra
426
+
427
+
428
def _parse_uv_from_words(panel_words: List[dict]) -> List[float]:
    """Collect UV lambda-max wavelengths from one panel's words.

    Only words strictly below the first 'AU' word (in reading order) are
    considered. A word counts when it parses as a float inside
    [LCMS_UV_WAVELENGTH_MIN, LCMS_UV_WAVELENGTH_MAX] and is not listed in
    _UV_AXIS_TICKS. Returns [] when the panel has no 'AU' word.
    """
    ordered = sorted(panel_words, key=lambda w: (w['top'], w['x0']))

    au_top = next((w['top'] for w in ordered if w['text'] == 'AU'), None)
    if au_top is None:
        return []

    found = []
    for word in ordered:
        if word['top'] <= au_top:
            continue
        try:
            value = float(word['text'])
        except ValueError:
            continue  # not a number: label text, units, etc.
        in_range = LCMS_UV_WAVELENGTH_MIN <= value <= LCMS_UV_WAVELENGTH_MAX
        if in_range and value not in _UV_AXIS_TICKS:
            found.append(value)

    return found
454
+
455
+
456
def _parse_spectrum_pages(pdf) -> Tuple[Dict[int, Tuple[float, list]],
                                        Dict[int, Tuple[float, list]]]:
    """
    Parse mass spectra and UV lambda-max from the spectrum pages by cropping
    each panel out of the page's word list.

    Working on word coordinates avoids the joined-number artifacts that
    extract_text() produces for the two-column layout.

    Returns:
        ms_data: {raw_peak_num: (rt, [MassSpectrum, ...])}
        uv_data: {raw_peak_num: (rt, [wavelength, ...])}
    The rt stored for a peak number is the one from the first panel seen
    for that number.
    """
    def first_peak_and_rt(panel_words):
        # First non-zero integer word that has a decimal word (the RT) among
        # the next three words on the same line, e.g. "4" then "0.64".
        ordered = sorted(panel_words, key=lambda w: (w['top'], w['x0']))
        for pos, word in enumerate(ordered):
            if word['text'] == '0' or not re.match(r'^\d+$', word['text']):
                continue
            for follower in ordered[pos + 1:pos + 4]:
                same_line = abs(follower['top'] - word['top']) < 3
                if same_line and re.match(r'^\d+\.\d+$', follower['text']):
                    return int(word['text']), float(follower['text'])
        return None, None

    ms_data = {}
    uv_data = {}

    # Start from page 2 (index 1): MassLynx sometimes places the first
    # peak's mass spectrum at the bottom of page 2 after the peak tables.
    # Panel header detection rejects peak-table headers via the "Area" filter.
    for page_idx in range(1, len(pdf.pages)):
        page = pdf.pages[page_idx]
        words = page.extract_words()
        if not words:
            continue

        headers = _find_panel_headers(words)
        if not headers:
            continue

        rows = _group_headers_into_rows(headers)
        page_width = float(page.width)
        page_height = float(page.height)
        col_mid = LCMS_COLUMN_BOUNDARY

        # A row of panels extends down to the next row (or the page bottom).
        row_bottoms = [next_y for next_y, _ in rows[1:]] + [page_height]

        for (y_start, headers_in_row), y_end in zip(rows, row_bottoms):
            for hdr in headers_in_row:
                # The header's horizontal centre decides the column.
                if (hdr['x0'] + hdr['x1']) / 2 < col_mid:
                    x_start, x_end = 0, col_mid
                else:
                    x_start, x_end = col_mid, page_width

                # Words inside this panel's bounding box.
                panel_words = [
                    w for w in words
                    if x_start <= w['x0'] and w['x1'] <= x_end
                    and y_start - 2 <= w['top'] < y_end
                ]
                if not panel_words:
                    continue

                peak_num, rt = first_peak_and_rt(panel_words)
                if peak_num is None:
                    continue

                # Decide what the panel holds from its word texts.
                panel_texts = [w['text'] for w in panel_words]
                has_ms = any('ES+' in t or 'ES-' in t for t in panel_texts)
                has_uv = (any('UV' in t for t in panel_texts)
                          and any('Detector' in t for t in panel_texts))

                if has_ms:
                    ms_list = _parse_ms_from_words(panel_words)
                    if ms_list:
                        ms_data.setdefault(peak_num, (rt, []))[1].extend(ms_list)

                if has_uv:
                    wavelengths = _parse_uv_from_words(panel_words)
                    if wavelengths:
                        uv_data.setdefault(peak_num, (rt, []))[1].extend(wavelengths)

    return ms_data, uv_data
546
+
547
+
548
def is_waters_report(pdf_path: str) -> bool:
    """Quick content-based check: is this PDF a standard Waters MassLynx report?

    Manually integrated chromatograms (e.g. LC-only or MS-only exports) lack
    the structured headers of a full UPLC-PDA-MS Open Access report, so only
    the first page is read and tested for report markers.

    Returns True when at least two of the three marker groups are present.
    Returns False otherwise, and also when the PDF has no pages or cannot be
    opened or read.
    """
    import pdfplumber

    try:
        with pdfplumber.open(pdf_path) as pdf:
            if not pdf.pages:
                return False
            first_page_text = pdf.pages[0].extract_text() or ""
    except Exception:
        # Best-effort probe: an unreadable file is simply "not a report".
        return False

    # Standard reports show these markers on page 1; two of three is enough.
    has_sample = "Sample Name:" in first_page_text
    has_instrument = "Instrument:" in first_page_text or "UPLC" in first_page_text
    has_timestamp = "Date:" in first_page_text and "Time:" in first_page_text
    return sum((has_sample, has_instrument, has_timestamp)) >= 2
574
+
575
+
576
def parse_report(pdf_path: str) -> LCMSReport:
    """Parse a complete MassLynx PDF report into an LCMSReport.

    Header fields and peak tables come from the concatenated page text. Mass
    spectra and UV lambda-max values come from the word-level spectrum
    parser and are attached to the peak with the matching id.
    """
    import pdfplumber

    with pdfplumber.open(pdf_path) as pdf:
        # Full text drives the header and peak-table parsing.
        page_texts = [page.extract_text() for page in pdf.pages]
        text = "\n\n".join(chunk for chunk in page_texts if chunk)

        header = parse_header(text)
        peaks, id_map = parse_all_peak_tables(text)

        # Spectra and UV data need the open PDF for word coordinates.
        ms_data, uv_data = _parse_spectrum_pages(pdf)

    # First peak per id, mirroring a first-match linear search.
    first_peak_by_id = {}
    for peak in peaks:
        first_peak_by_id.setdefault(peak.peak_num, peak)

    for panel_data, attribute in ((ms_data, 'ms_spectra'),
                                  (uv_data, 'uv_lambda_max')):
        for raw_num, (rt, values) in panel_data.items():
            target = first_peak_by_id.get(_lookup_peak_id(id_map, raw_num, rt))
            if target is not None:
                setattr(target, attribute, values)

    # File modification time, to the minute.
    modified = datetime.fromtimestamp(
        os.path.getmtime(pdf_path)
    ).strftime("%Y-%m-%d %H:%M")

    method_path = header['method_path']
    return LCMSReport(
        filename=os.path.basename(pdf_path),
        sample_name=header['sample_name'],
        date=header['date'],
        instrument=header['instrument'],
        method_path=method_path,
        method_short=parse_method_short(method_path),
        peaks=peaks,
        file_modified=modified,
        run_time=header.get('run_time'),
    )
626
+
627
+ # ---------------------------------------------------------------------------
628
+ # Manual integration reports (LC-only / MS-only MassLynx exports)
629
+ # ---------------------------------------------------------------------------
630
+
631
@dataclass
class ManualPeak:
    """One peak of a manually integrated chromatogram."""

    # Peak id as a string.
    peak_num: str
    # Retention time.
    rt: float
    # Integrated area and its share of the total, in percent.
    area: float
    area_pct: float
    # Peak height; None when the source provides no height.
    height: Optional[float] = None
639
+
640
+
641
@dataclass
class ManualLCMSSample:
    """A single chromatogram section of a manual integration PDF."""

    # Sample name for this section.
    sample_name: str
    # Peaks found in this section.
    peaks: List[ManualPeak] = field(default_factory=list)
    # Detector description, e.g. "Diode Array 290nm".
    detector: str = ""
    # True if parsed from RT;Area labels (best-effort).
    from_labels: bool = False
648
+
649
+
650
@dataclass
class ManualLCMSReport:
    """Everything parsed from a manually integrated MassLynx PDF."""

    # Base name of the PDF file.
    filename: str
    # Header fields of the export.
    instrument: str
    date: str
    # One entry per chromatogram section.
    samples: List[ManualLCMSSample] = field(default_factory=list)
    # Run time from the header, when present.
    run_time: Optional[str] = None
658
+
659
+
660
def is_manual_integration(pdf_path: str) -> bool:
    """Check whether this PDF is a MassLynx manual integration export.

    The first page of such an export mentions "Diode Array" but has no
    "Sample Name:" header, which a full Waters report would carry.
    Returns False when the PDF has no pages or cannot be opened or read.
    """
    import pdfplumber

    try:
        with pdfplumber.open(pdf_path) as pdf:
            if not pdf.pages:
                return False
            first_page_text = pdf.pages[0].extract_text() or ""
    except Exception:
        # Best-effort probe: an unreadable file is not a manual export.
        return False

    return "Diode Array" in first_page_text and "Sample Name:" not in first_page_text
677
+
678
+
679
+ def parse_manual_report(pdf_path: str) -> ManualLCMSReport:
680
+ """Parse a manually integrated MassLynx PDF.
681
+
682
+ Handles three variants:
683
+ 1. Single sample with peak table (Time Height Area Area%)
684
+ 2. Multi-sample with peak tables per section
685
+ 3. Multi-sample with only RT;Area chromatogram labels (no table)
686
+
687
+ Returns a ManualLCMSReport with one ManualLCMSSample per chromatogram.
688
+ """
689
+ text = extract_all_text(pdf_path)
690
+ filename = os.path.basename(pdf_path)
691
+
692
+ # --- Header: first line is "SampleName Instrument Date" ---
693
+ # Instrument is an alphanumeric code (e.g. PPIMSA05, UPLCMS01, SQD2)
694
+ # immediately followed by a date in DD-Mon-YYYY format.
695
+ header_match = re.match(
696
+ r'(.+?)\s+([A-Z][A-Za-z0-9]+)\s+'
697
+ r'(\d{1,2}-\w{3}-\d{4})\s*\n\s*(\d{2}:\d{2}:\d{2})?',
698
+ text
699
+ )
700
+ instrument = header_match.group(2) if header_match else ""
701
+ date_str = header_match.group(3) if header_match else ""
702
+ run_time = header_match.group(4) if header_match else None
703
+
704
+ # --- Split into per-sample sections ---
705
+ # Each section starts with a sample name followed by optional smoothing
706
+ # params and "3: Diode Array" or similar detector marker.
707
+ # Pattern: "SampleName [Sm (Mn, 2x3)] 3: Diode Array"
708
+ section_pattern = re.compile(
709
+ r'^([\w][\w\-]+(?:\s+Sm\s*\([^)]+\))?)\s+'
710
+ r'(\d+:\s*Diode Array)\s*\n'
711
+ r'(.*?)(?=^[\w][\w\-]+(?:\s+Sm\s*\([^)]+\))?\s+\d+:\s*Diode Array|\Z)',
712
+ re.MULTILINE | re.DOTALL
713
+ )
714
+
715
+ samples = []
716
+ for m in section_pattern.finditer(text):
717
+ raw_name = m.group(1).strip()
718
+ detector_str = m.group(2).strip()
719
+ section_text = m.group(3)
720
+
721
+ # Clean sample name: strip smoothing params
722
+ sample_name = re.sub(r'\s+Sm\s*\([^)]+\)', '', raw_name).strip()
723
+
724
+ # Try to extract detector wavelength
725
+ wl_match = re.search(r'(\d{3})', detector_str)
726
+ detector = f"Diode Array {wl_match.group(1)}nm" if wl_match else detector_str
727
+
728
+ # --- Try peak table first (Time Height Area Area%) ---
729
+ # Table rows may be interleaved with Y-axis tick labels (e.g.
730
+ # "5.5e+1") from pdfplumber. We search for the header, then
731
+ # scan subsequent lines for 4-number rows that look like
732
+ # Time Height Area Area% data.
733
+ peaks = []
734
+ header_match = re.search(r'Time\s+Height\s+Area\s+Area%', section_text)
735
+ if header_match:
736
+ after_header = section_text[header_match.end():]
737
+ # Find rows of 4 numbers where RT is plausible (<20 min)
738
+ # and Area% is 0-100
739
+ row_pattern = re.compile(
740
+ r'(\d+\.\d+)\s+(\d+)\s+(\d+(?:\.\d+)?)\s+(\d+\.\d+)'
741
+ )
742
+ for rm in row_pattern.finditer(after_header):
743
+ rt = float(rm.group(1))
744
+ height = float(rm.group(2))
745
+ area = float(rm.group(3))
746
+ area_pct = float(rm.group(4))
747
+ # Sanity: RT < 20 min, area% <= 100
748
+ if rt < 20.0 and area_pct <= 100.0:
749
+ peaks.append(ManualPeak(
750
+ peak_num=str(len(peaks) + 1),
751
+ rt=rt,
752
+ height=height,
753
+ area=area,
754
+ area_pct=area_pct,
755
+ ))
756
+ else:
757
+ # --- Fallback: parse RT;Area labels from chromatogram ---
758
+ # Labels appear as "RT;Area" or "RT\nArea" (area on next line)
759
+ label_pattern = re.compile(
760
+ r'([\d.]+);([\d.]+)' # "0.56;404"
761
+ )
762
+ raw_peaks = []
763
+ for lm in label_pattern.finditer(section_text):
764
+ raw_peaks.append((float(lm.group(1)), float(lm.group(2))))
765
+
766
+ # Also catch "RT\nArea" patterns (RT alone, area on next line)
767
+ # These show up when the label wraps, e.g. "1.32\n20"
768
+ # But we need to avoid matching axis ticks. Axis ticks are on
769
+ # lines starting with "-0.00" or in sequences.
770
+ # Strategy: look for floating numbers that aren't matched by
771
+ # the RT;Area pattern and aren't axis-like.
772
+ standalone_rt_pattern = re.compile(
773
+ r'(?<!\d[;.])(?:^|\s)((?:0\.\d{2}|1\.\d{2}))\s*\n\s*(\d+)(?:\s|$)',
774
+ re.MULTILINE
775
+ )
776
+ for sm in standalone_rt_pattern.finditer(section_text):
777
+ rt_val = float(sm.group(1))
778
+ area_val = float(sm.group(2))
779
+ # Deduplicate: skip if we already have a peak at this RT
780
+ if not any(abs(rt_val - rp[0]) < 0.02 for rp in raw_peaks):
781
+ raw_peaks.append((rt_val, area_val))
782
+
783
+ # Sort by RT and compute area%
784
+ raw_peaks.sort(key=lambda x: x[0])
785
+ total_area = sum(a for _, a in raw_peaks) if raw_peaks else 1.0
786
+ for i, (rt, area) in enumerate(raw_peaks, 1):
787
+ peaks.append(ManualPeak(
788
+ peak_num=str(i),
789
+ rt=rt,
790
+ area=area,
791
+ area_pct=(area / total_area * 100) if total_area > 0 else 0.0,
792
+ ))
793
+
794
+ used_labels = not bool(header_match)
795
+ samples.append(ManualLCMSSample(
796
+ sample_name=sample_name,
797
+ peaks=peaks,
798
+ detector=detector,
799
+ from_labels=used_labels,
800
+ ))
801
+
802
+ return ManualLCMSReport(
803
+ filename=filename,
804
+ instrument=instrument,
805
+ date=date_str,
806
+ samples=samples,
807
+ run_time=run_time,
808
+ )
809
+
810
+
811
def format_manual_table(report: ManualLCMSReport) -> str:
    """Render a manual-integration report as markdown for LLM consumption.

    The output starts with three bold key/value header lines (file,
    instrument, date with the optional run time appended), followed by one
    ``###`` section per sample.  Each section holds either ``(no peaks)`` or
    a three-column table (peak number, RT, area%).  A warning is appended
    when the listed area% values sum to less than 95; otherwise a note is
    appended when the sample's peaks came from chromatogram labels.
    """
    stamp = report.date
    if report.run_time:
        stamp = stamp + f" {report.run_time}"

    out = [
        f"**File:** {report.filename} (manual integration)",
        f"**Instrument:** {report.instrument}",
        f"**Date:** {stamp}",
    ]

    for sample in report.samples:
        out.append("")
        out.append(f"### {sample.sample_name}")
        if not sample.peaks:
            out.append("(no peaks)")
            continue

        out.append("| # | RT | Area% |")
        out.append("|---|------|-------|")

        # Accumulate with plain float addition while emitting the rows.
        running_pct = 0.0
        for entry in sample.peaks:
            out.append(f"| {entry.peak_num} | {entry.rt:.2f} | {entry.area_pct:.1f} |")
            running_pct += entry.area_pct

        if running_pct < 95.0:
            out.append("")
            out.append(f"*Warning: parsed peaks sum to {running_pct:.1f}% — some peaks may not have been extracted.*")
        elif sample.from_labels:
            out.append("")
            out.append("*Note: area% computed from chromatogram labels (no peak table in PDF). Some small peaks may be missing.*")

    return "\n".join(out)
843
+
844
+
845
+ # ---------------------------------------------------------------------------
846
+ # Peak identification by expected mass
847
+ # ---------------------------------------------------------------------------
848
+
849
def identify_peak(peak: ChromPeak, sm_mass: float, product_mass: float,
                  tolerance: float = 1.5) -> Optional[str]:
    """
    Try to identify a peak as SM or desired product based on ESI mass data.

    Every ion in each spectrum's ``top_ions`` is compared against the
    expected m/z of [M+H]+ and [M+Na]+ (for "ES+" spectra) or [M-H]- and
    [M+formate]- (for "ES-" spectra) of both candidate masses.  Spectra with
    any other mode, or with no ions, contribute nothing.

    Args:
        peak: Object exposing ``ms_spectra``; each spectrum exposes ``mode``
            and ``top_ions`` (first ion is treated as the base peak).
        sm_mass: Mass of the starting material.
        product_mass: Mass of the desired product.
        tolerance: Maximum absolute m/z difference (exclusive) for a match.

    Returns:
        "SM", "DP" (desired product), or None when no ion matches.

    Key subtlety: if ESI+ matches product but ESI- matches SM for the same
    peak, that's likely SM with in-source fragmentation (e.g. Boc loss), so
    negative-mode evidence takes priority when the identities conflict.
    """
    adducts_by_mode = {
        "ES+": (("M+H", 1.008), ("M+Na", 22.990)),
        "ES-": (("M-H", -1.008), ("M+formate", 44.998)),
    }
    candidates = (("DP", product_mass), ("SM", sm_mass))

    # Evidence entries: (identity, mode, adduct_name, mz, is_base_peak)
    evidence = []
    for spec in peak.ms_spectra:
        adducts = adducts_by_mode.get(spec.mode)
        if not spec.top_ions or adducts is None:
            continue
        for rank, mz in enumerate(spec.top_ions):
            is_base = rank == 0  # First ion is the tallest
            for adduct_name, adduct_mass in adducts:
                for identity, mass in candidates:
                    if abs(mz - (mass + adduct_mass)) < tolerance:
                        evidence.append((identity, spec.mode, adduct_name, mz, is_base))

    if not evidence:
        return None

    identities_found = {e[0] for e in evidence}
    if len(identities_found) == 1:
        return identities_found.pop()

    # From here on both "SM" and "DP" have evidence.  Common case: SM
    # fragments in ESI+ to look like product, so ESI- evidence is trusted
    # first (SM before DP).
    if any(e[0] == "SM" and e[1] == "ES-" for e in evidence):
        return "SM"
    if any(e[0] == "DP" and e[1] == "ES-" for e in evidence):
        return "DP"

    # Both only in positive mode — prefer the one backed by a base peak.
    sm_base = any(e[0] == "SM" and e[4] for e in evidence)
    dp_base = any(e[0] == "DP" and e[4] for e in evidence)
    if dp_base and not sm_base:
        return "DP"
    if sm_base and not dp_base:
        return "SM"

    # Default: the identity with more evidence; ties go to SM.
    sm_count = sum(1 for e in evidence if e[0] == "SM")
    dp_count = len(evidence) - sm_count
    return "SM" if sm_count >= dp_count else "DP"
931
+
932
+ # ---------------------------------------------------------------------------
933
+ # Output formatting
934
+ # ---------------------------------------------------------------------------
935
+
936
def format_annotation(report: LCMSReport, sm_mass: float, product_mass: float) -> str:
    """
    Format section (1): LCMS annotation line.
    Template: [Instrument], [Method short], SM RT = X.XX min, ESI+/- XXX.X; DP RT = X.XX min, ESI+/- XXX.X

    Every peak in ``report.peaks`` is classified with ``identify_peak``; for
    SM and for DP the match with the highest ``area_pct`` is reported (the
    earliest one wins on ties).  Peaks whose ``area_pct`` is None rank below
    every peak that has an area, so a missing area can no longer make the
    ranking raise ``TypeError``.

    Returns:
        Comma-separated annotation string: instrument (text before the first
        '#'), method, then the SM and DP parts when such peaks were found.
    """
    # str.split returns the whole string when there is no '#'.
    instrument_short = report.instrument.split('#')[0]

    sm_parts = []
    dp_parts = []
    for peak in report.peaks:
        identity = identify_peak(peak, sm_mass, product_mass)
        if identity == "SM":
            best_ion = _find_best_ion_for(peak, sm_mass)
            sm_parts.append((peak.rt, peak.area_pct, best_ion))
        elif identity == "DP":
            best_ion = _find_best_ion_for(peak, product_mass)
            dp_parts.append((peak.rt, peak.area_pct, best_ion))

    def _area_rank(entry):
        # Missing area% sorts below any real value.
        area_pct = entry[1]
        return float("-inf") if area_pct is None else area_pct

    parts = [f"{instrument_short}", f"{report.method_short}"]

    # max() keeps the first of equally ranked entries, matching a stable
    # descending sort.
    if sm_parts:
        rt, _, ion_str = max(sm_parts, key=_area_rank)
        parts.append(f"SM RT = {rt:.2f} min, {ion_str}")

    if dp_parts:
        rt, _, ion_str = max(dp_parts, key=_area_rank)
        parts.append(f"DP RT = {rt:.2f} min, {ion_str}")

    return ", ".join(parts)
972
+
973
+
974
+ def _find_best_ion_for(peak: ChromPeak, exact_mass: float) -> str:
975
+ """Find the best matching ion and return formatted string like 'ESI+ 346.0' or 'ESI- 444.1'."""
976
+ tolerance = 1.5
977
+
978
+ for spec in peak.ms_spectra:
979
+ for mz in spec.top_ions:
980
+ if spec.mode == "ES+":
981
+ if abs(mz - (exact_mass + 1.008)) < tolerance:
982
+ return f"ESI+ {mz:.1f}"
983
+ elif spec.mode == "ES-":
984
+ if abs(mz - (exact_mass - 1.008)) < tolerance:
985
+ return f"ESI- {mz:.1f}"
986
+
987
+ return "mass not confirmed"
988
+
989
+
990
def format_peak_summary(report: LCMSReport, sm_mass: float, product_mass: float) -> str:
    """Return one line per peak: number, RT, area%, base ions and identity.

    The identity comes from ``identify_peak`` ("SM"/"DP"), falling back to
    "unknown".  Only the first ion of each non-empty spectrum is listed; a
    peak without any ions shows "no MS data", and a peak whose ``area_pct``
    is None shows "-" in place of the percentage.
    """
    summary = []
    for peak in report.peaks:
        label = identify_peak(peak, sm_mass, product_mass) or "unknown"

        base_ions = [
            f"ESI{'+' if spectrum.mode == 'ES+' else '-'} {spectrum.top_ions[0]:.1f}"
            for spectrum in peak.ms_spectra
            if spectrum.top_ions
        ]
        ion_info = "; ".join(base_ions) if base_ions else "no MS data"

        if peak.area_pct is None:
            area_str = "-"
        else:
            area_str = f"{peak.area_pct:.1f}%"

        summary.append(f" Peak {peak.peak_num}: RT {peak.rt:.2f} min, {area_str}, {ion_info} → {label}")

    return "\n".join(summary)
1007
+
1008
+
1009
def analyze_reaction_progress(reports: List[LCMSReport], sm_mass: float, product_mass: float) -> str:
    """
    Summarise SM/DP balance for each report and return the notes section.

    For every report the area% of peaks identified as SM, as DP, and as
    neither is totalled (peaks whose ``area_pct`` is None are ignored).  One
    note line per report states conversion, SM consumption, absence of
    product, or that neither species was found, with a timepoint hint
    derived from the lower-cased sample name.
    """
    notes = []

    for report in reports:
        totals = {"SM": 0.0, "DP": 0.0, "other": 0.0}
        for peak in report.peaks:
            if peak.area_pct is None:
                continue  # Skip peaks not in TAC table
            identity = identify_peak(peak, sm_mass, product_mass)
            bucket = identity if identity in ("SM", "DP") else "other"
            totals[bucket] += peak.area_pct

        sm_area = totals["SM"]
        dp_area = totals["DP"]
        unknown_area = totals["other"]

        # Timepoint / action hint comes from the sample name.
        timepoint = _infer_timepoint(report.sample_name.lower())
        prefix = f"{report.sample_name} ({report.date}): "

        if sm_area > 0 and dp_area > 0:
            conversion = dp_area / (dp_area + sm_area) * 100
            note = prefix + f"~{conversion:.0f}% conversion{timepoint}."
            if unknown_area > 2:
                note += f" ({unknown_area:.0f}% unidentified)"
        elif dp_area > 0 and sm_area == 0:
            note = prefix + f"SM consumed{timepoint}. DP {dp_area:.0f}%"
            if unknown_area > 2:
                note += f", impurities {unknown_area:.0f}%"
            note += "."
        elif sm_area > 0 and dp_area == 0:
            note = prefix + f"No product detected{timepoint}. SM {sm_area:.0f}%."
        else:
            note = prefix + f"Neither SM nor DP identified in major peaks{timepoint}."

        notes.append(note)

    return "\n".join(notes)
1054
+
1055
+
1056
+ def _infer_timepoint(name: str) -> str:
1057
+ """Try to infer timepoint or action from sample name."""
1058
+ # Common patterns in LCMS filenames
1059
+ patterns = [
1060
+ (r'(\d+)\s*h\b', lambda m: f" after {m.group(1)}h"),
1061
+ (r'(\d+)\s*min\b', lambda m: f" after {m.group(1)} min"),
1062
+ (r't(\d+)', lambda m: f" at t={m.group(1)}"),
1063
+ (r'overnight|o/?n', lambda m: " after overnight"),
1064
+ (r'ea\s*wash', lambda m: " (after EtOAc wash)"),
1065
+ (r'dcm\s*wash', lambda m: " (after DCM wash)"),
1066
+ (r'purif', lambda m: " (after purification)"),
1067
+ (r'c18', lambda m: " (C18 purification)"),
1068
+ (r'crude', lambda m: " (crude)"),
1069
+ (r'addmore', lambda m: " (after adding more reagent)"),
1070
+ ]
1071
+
1072
+ for pattern, formatter in patterns:
1073
+ m = re.search(pattern, name, re.IGNORECASE)
1074
+ if m:
1075
+ return formatter(m)
1076
+
1077
+ return ""
1078
+
1079
+
1080
def format_basic_report(report: LCMSReport) -> str:
    """
    Render one LCMS report as plain text, without species identification.

    The result is a banner, the report's file/sample/date/instrument/method
    lines, and a peak table.  Each peak lists its RT, whichever of the TAC,
    220nm and 254nm area% values are present, the first three ions of every
    non-empty spectrum, and the UV maxima rounded half-up to whole numbers.
    Used when SM/product masses are not available.
    """
    import math

    heavy_rule = "=" * 60
    light_rule = "-" * 60

    date_line = f"Date: {report.date}"
    if report.run_time:
        date_line += f" {report.run_time}"

    out = [
        heavy_rule,
        "SINGLE-FILE LCMS REPORT",
        heavy_rule,
        "",
        f"File: {report.filename}",
        f"Sample: {report.sample_name}",
        date_line,
        f"Instrument: {report.instrument}",
        f"Method: {report.method_short}",
        "",
        light_rule,
        "PEAK TABLE",
        light_rule,
        "",
    ]

    if not report.peaks:
        out.append(" (no peaks detected)")
        return "\n".join(out)

    for peak in report.peaks:
        # Area columns: TAC, 220nm, 254nm — only those that are present.
        labelled_areas = (
            ("TAC", peak.area_pct),
            ("220nm", peak.area_pct_220nm),
            ("254nm", peak.area_pct_254nm),
        )
        areas = [f"{tag} {value:.1f}%" for tag, value in labelled_areas if value is not None]
        area_str = ", ".join(areas) if areas else "(no area)"

        # Ions: up to three per spectrum, spectra separated by "; ".
        ion_strs = []
        for spectrum in peak.ms_spectra:
            if not spectrum.top_ions:
                continue
            polarity = "ESI+" if spectrum.mode == "ES+" else "ESI-"
            leading = ", ".join(f"{mz:.1f}" for mz in spectrum.top_ions[:3])
            ion_strs.append(f"{polarity} {leading}")
        ion_info = "; ".join(ion_strs) if ion_strs else "no MS"

        # UV lambda max, ascending, rounded half-up.
        uv_str = ""
        if peak.uv_lambda_max:
            rounded = (math.floor(wl + 0.5) for wl in sorted(peak.uv_lambda_max))
            uv_str = f" λmax {', '.join(map(str, rounded))} nm"

        out.append(f" Peak {peak.peak_num}: RT {peak.rt:.2f} min, {area_str}")
        out.append(f" Ions: {ion_info}{uv_str}")
        out.append("")

    return "\n".join(out)
1143
+
1144
+
1145
def format_table(report: LCMSReport) -> str:
    """Format an LCMS report as a markdown table for LLM consumption.

    Output is pure data — no peak identification, no conversion, no
    interpretation. Header key-value lines followed by a 7-column table:
    peak#, RT, TAC%, 220nm%, 254nm%, ESI+ ions, ESI- ions.

    Missing area values and modes without ions are shown as an em dash.
    Up to three ions are listed per mode; if a peak carries several spectra
    of the same mode, the last non-empty one is shown.
    """
    missing = "\u2014"

    def _pct(value):
        # One-decimal percentage, or the placeholder when absent.
        return f"{value:.1f}" if value is not None else missing

    lines = []

    # Header metadata
    lines.append(f"**Sample:** {report.sample_name}")
    lines.append(f"**Instrument:** {report.instrument}")
    lines.append(f"**Method:** {report.method_short}")
    date_str = report.date
    if report.run_time:
        date_str += f" {report.run_time}"
    lines.append(f"**Date:** {date_str}")
    lines.append("")

    if not report.peaks:
        lines.append("(no peaks detected)")
        return "\n".join(lines)

    # Table header
    lines.append("| # | RT | TAC% | 220nm% | 254nm% | ESI+ | ESI\u2212 |")
    lines.append("|---|------|-------|--------|--------|------|------|")

    for peak in report.peaks:
        # Area columns
        tac = _pct(peak.area_pct)
        a220 = _pct(peak.area_pct_220nm)
        a254 = _pct(peak.area_pct_254nm)

        # ESI columns: top 2-3 ions per mode, comma-separated
        esi_plus = missing
        esi_minus = missing
        for spec in peak.ms_spectra:
            ions_str = ", ".join(f"{mz:.1f}" for mz in spec.top_ions[:3])
            if not ions_str:
                continue
            if spec.mode == "ES+":
                esi_plus = ions_str
            elif spec.mode == "ES-":
                esi_minus = ions_str

        lines.append(
            f"| {peak.peak_num} | {peak.rt:.2f} "
            f"| {tac} | {a220} | {a254} "
            f"| {esi_plus} | {esi_minus} |"
        )

    return "\n".join(lines)
1199
+
1200
+
1201
+ # ---------------------------------------------------------------------------
1202
+ # Main
1203
+ # ---------------------------------------------------------------------------
1204
+
1205
def _report_sort_key(report):
    """Return a chronological sort key built from a report's date and run time.

    Dates that parse (DD-Mon-YYYY, DD-Mon-YY or YYYY-MM-DD, with the run
    time or midnight) sort first in true time order; anything else sorts
    after them by its raw text so ordering stays deterministic.
    """
    from datetime import datetime

    stamp = f"{report.date} {report.run_time or '00:00:00'}"
    for fmt in ("%d-%b-%Y %H:%M:%S", "%d-%b-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S"):
        try:
            return (0, datetime.strptime(stamp, fmt), "")
        except ValueError:
            continue
    return (1, datetime.min, stamp)


def main(argv=None) -> int:
    """Command-line entry point: parse LCMS PDF reports and emit a summary.

    Args:
        argv: Argument list for argparse; None means use ``sys.argv``.

    Returns:
        0 on success, 1 when none of the given files could be parsed.

    Files that fail to parse are reported on stderr and skipped.  With
    ``--format table`` (default) every report is rendered as a markdown
    table.  With ``--format basic`` the standard reports are rendered as
    plain peak tables, or — when both ``--sm-mass`` and ``--product-mass``
    are given — as the three-section annotated output.  Manual-integration
    reports are only rendered in table format; a warning is printed on
    stderr when they are left out.
    """
    parser = argparse.ArgumentParser(description="LCMS Report Analyzer")
    parser.add_argument('files', nargs='+', help='MassLynx PDF report files')
    parser.add_argument('--sm-mass', type=float, default=None,
                        help='Exact mass of starting material')
    parser.add_argument('--product-mass', type=float, default=None,
                        help='Exact mass of desired product')
    parser.add_argument('--procedure', type=str, default='',
                        help='Original procedure text (for context)')
    parser.add_argument('--output', type=str, default=None,
                        help='Output file path (default: stdout)')
    parser.add_argument('--format', type=str, default='table',
                        choices=['basic', 'table'],
                        help='Output format: table (default, markdown for LLM) or basic')

    args = parser.parse_args(argv)

    # Parse all reports (auto-detect manual integration vs Waters)
    reports = []          # standard Waters reports
    manual_reports = []   # manual integration exports
    for f in sorted(args.files):
        try:
            if is_manual_integration(f):
                manual_reports.append(parse_manual_report(f))
            else:
                reports.append(parse_report(f))
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the run.
            print(f"Warning: Could not parse {f}: {e}", file=sys.stderr)

    if not reports and not manual_reports:
        print("Error: No reports could be parsed.", file=sys.stderr)
        return 1

    # Sort by date/time (chronologically, not by the raw date string).
    reports.sort(key=_report_sort_key)

    # Table format: pure data for LLM consumption, ignores SM/product masses
    if args.format == 'table':
        parts = [format_table(r) for r in reports]
        parts += [format_manual_table(r) for r in manual_reports]
        result = "\n\n".join(parts)
    else:
        if manual_reports:
            # Make the omission visible instead of dropping the data silently.
            print(
                f"Warning: {len(manual_reports)} manual integration report(s) "
                f"omitted; they are only rendered with --format table.",
                file=sys.stderr,
            )

        if args.sm_mass is None or args.product_mass is None:
            result = "\n\n".join(format_basic_report(r) for r in reports)
        else:
            # Build full annotated output
            rule = "=" * 60
            output_lines = []

            # Section 1: Annotation
            output_lines.append(rule)
            output_lines.append("(1) LCMS ANNOTATION")
            output_lines.append(rule)
            for report in reports:
                # Header line: sample name + date/time
                time_str = f" {report.run_time}" if report.run_time else ""
                output_lines.append(
                    f"{report.sample_name} (Date: {report.date}{time_str}, "
                    f"{report.instrument}):"
                )
                annotation = format_annotation(report, args.sm_mass, args.product_mass)
                output_lines.append(f" {annotation}")
                # Also show peak breakdown
                output_lines.append(format_peak_summary(report, args.sm_mass, args.product_mass))
                output_lines.append("")

            # Section 2: Tentative procedure
            output_lines.append(rule)
            output_lines.append("(2) TENTATIVE PROCEDURE")
            output_lines.append(rule)
            if args.procedure:
                output_lines.append(args.procedure)
            else:
                output_lines.append("[No procedure provided]")
            output_lines.append("")

            # Section 3: Notes
            output_lines.append(rule)
            output_lines.append("(3) NOTES")
            output_lines.append(rule)
            output_lines.append(analyze_reaction_progress(reports, args.sm_mass, args.product_mass))

            result = "\n".join(output_lines)

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(result)
        print(f"Output written to {args.output}", file=sys.stderr)
    else:
        # Write bytes so the output is UTF-8 whatever the console encoding is.
        sys.stdout.buffer.write(result.encode('utf-8'))
        sys.stdout.buffer.write(b'\n')

    return 0
1296
+
1297
+
1298
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)