cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1299 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
LCMS Report Analyzer
|
|
4
|
+
Parses Waters MassLynx PDF reports using pdfplumber with spatial word-level
|
|
5
|
+
extraction. Extracts peak tables from all three detectors (TAC, 220nm, 254nm),
|
|
6
|
+
mass spectra (ESI+/ESI-), and UV lambda-max data. Optionally identifies
|
|
7
|
+
SM/product peaks by expected mass.
|
|
8
|
+
|
|
9
|
+
PDF layout handling:
|
|
10
|
+
- Pages 1-2: chromatograms + peak tables (TAC, 220nm, 254nm) — parsed from
|
|
11
|
+
full-page extracted text.
|
|
12
|
+
- Pages 3+ (sometimes starting at the bottom of page 2): mass spectra + UV panels in a 2-column × 4-row grid — parsed
|
|
13
|
+
using word-level coordinates to avoid column interleaving. Each panel is
|
|
14
|
+
isolated by bounding box (left column x < 306, right column x >= 306).
|
|
15
|
+
|
|
16
|
+
Data structures:
|
|
17
|
+
- LCMSReport: header info (sample name, date, instrument, method) + peaks
|
|
18
|
+
- ChromPeak: RT, area/area% for each detector, MS spectra, UV lambda-max
|
|
19
|
+
- peak_num is a string: "4", or "2a"/"2b" when a table has duplicate numbers
|
|
20
|
+
- MassSpectrum: mode ("ES+"/"ES-") + top_ions (m/z values, descending intensity)
|
|
21
|
+
|
|
22
|
+
Usage:
|
|
23
|
+
python lcms_analyzer.py \\
|
|
24
|
+
--sm-mass 445 \\
|
|
25
|
+
--product-mass 345 \\
|
|
26
|
+
--procedure "KL-7003-008 (100 mg, 224 umol) was dissolved in..." \\
|
|
27
|
+
file1.pdf file2.pdf ...
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
import argparse
|
|
31
|
+
import re
|
|
32
|
+
import os
|
|
33
|
+
import sys
|
|
34
|
+
from dataclasses import dataclass, field
|
|
35
|
+
from typing import List, Optional, Tuple, Dict
|
|
36
|
+
from datetime import datetime
|
|
37
|
+
from collections import defaultdict
|
|
38
|
+
|
|
39
|
+
from cdxml_toolkit.constants import (
|
|
40
|
+
LCMS_COLUMN_BOUNDARY,
|
|
41
|
+
LCMS_MS_AXIS_TICKS,
|
|
42
|
+
LCMS_UV_AXIS_TICKS,
|
|
43
|
+
LCMS_UV_WAVELENGTH_MIN,
|
|
44
|
+
LCMS_UV_WAVELENGTH_MAX,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
# Data structures
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
@dataclass
class MassSpectrum:
    """ESI+ or ESI- spectrum for a single chromatographic peak."""
    # Ionisation mode label exactly as it appears in the report.
    mode: str                          # "ES+" or "ES-"
    # Labelled m/z values in the order the parser collected them.
    # NOTE(review): the original comment said "up to 2 values, tallest first";
    # the dataclass itself does not enforce a limit — confirm against the parser.
    top_ions: List[float] = field(default_factory=list)  # Up to 2 m/z values, tallest first
|
|
56
|
+
|
|
57
|
+
@dataclass
class ChromPeak:
    """A single integrated peak from the UV chromatogram.

    Area figures are kept per detector: the unsuffixed ``area`` / ``area_pct``
    hold the TAC values, while the ``*_220nm`` and ``*_254nm`` fields hold the
    values from the two wavelength channels. Any of them stays ``None`` when
    the corresponding table did not list this peak.
    """
    peak_num: str                # e.g. "4", "2a", "2b"
    rt: float                    # retention time as printed in the peak table
    area: Optional[float] = None       # TAC area
    area_pct: Optional[float] = None   # TAC area %
    width: Optional[float] = None      # "Width" column of the peak table
    height: Optional[float] = None     # "Height" column of the peak table
    mass_found: Optional[str] = None   # raw text of the "Mass Found" column
    ms_spectra: List[MassSpectrum] = field(default_factory=list)  # attached mass spectra
    uv_lambda_max: List[float] = field(default_factory=list)      # attached UV lambda-max values
    area_220nm: Optional[float] = None      # 220nm channel area
    area_pct_220nm: Optional[float] = None  # 220nm channel area %
    area_254nm: Optional[float] = None      # 254nm channel area
    area_pct_254nm: Optional[float] = None  # 254nm channel area %
|
|
73
|
+
|
|
74
|
+
@dataclass
class LCMSReport:
    """Parsed contents of one MassLynx PDF report.

    Holds the header fields extracted from the report text together with the
    list of chromatographic peaks found in its integration tables.
    """
    filename: str                # name of the PDF file the report was read from
    sample_name: str             # value of the "Sample Name:" header field
    date: str                    # value of the "Date:" header field (kept as text)
    instrument: str              # instrument code taken from the report header
    method_path: str             # full method path as printed after "Method:"
    method_short: str            # abbreviated method name for annotation
    peaks: List[ChromPeak] = field(default_factory=list)  # parsed peaks
    file_modified: Optional[str] = None  # modification time of the PDF, as text
    run_time: Optional[str] = None       # "HH:MM:SS" from PDF header
|
|
86
|
+
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
# PDF text extraction
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
def extract_all_text(pdf_path: str) -> str:
    """Return the text of every page in *pdf_path* as one string.

    Pages are joined with a blank line between them; pages for which
    pdfplumber yields no text are left out.
    """
    import pdfplumber

    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]
    return "\n\n".join(chunk for chunk in page_texts if chunk)
|
|
101
|
+
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
# Parsing logic
|
|
104
|
+
# ---------------------------------------------------------------------------
|
|
105
|
+
|
|
106
|
+
def parse_method_short(method_path: str) -> str:
    """
    Extract a short method description from the full MassLynx method path.

    e.g. '...21_CSH_C18_AmF_5to100_ACN_220_254nm_TAC_TIC_1p9min.olp'
         -> 'CSH C18, AmF, 5-100%, 1.9 min'

    e.g. '...21_CSH_C18_AmB_50to100_ACN_220_254nm_TAC_TIC_1p9min.olp'
         -> 'CSH C18, AmB, 50-100%, 1.9 min'

    The file name is split on underscores and each token is checked for a
    column keyword, a buffer name, a gradient range and a runtime. Only the
    first buffer, gradient and runtime found are kept; all column tokens are
    joined with spaces. If nothing is recognised the bare file name (without
    directory and '.olp') is returned.
    """
    # MassLynx prints Windows-style paths. Split on both separators so the
    # directory part is discarded even when this code runs on a POSIX host
    # (os.path.basename would keep 'C:\\dir\\...' attached to the file name
    # there, letting directory tokens pollute the column/buffer detection).
    basename = re.split(r'[\\/]', method_path)[-1].replace('.olp', '')
    parts = basename.split('_')

    column_tokens = []
    buffer_type = ""
    gradient = ""
    runtime = ""

    for p in parts:
        pl = p.lower()

        # Column type: every token mentioning a known stationary phase is kept.
        if any(kw in pl for kw in ('c18', 'c8', 'beh', 'csh', 'hss')):
            column_tokens.append(p)

        # Buffer/modifier: first recognised token wins.
        if not buffer_type:
            if pl.startswith('amb'):
                buffer_type = "AmB"
            elif pl.startswith('amf'):
                buffer_type = "AmF"
            elif pl == 'fa':
                buffer_type = "FA"
            elif pl.startswith('tfa'):
                buffer_type = "TFA"

        # Gradient range: 5to100, 5to50, 50to100
        if not gradient:
            m_grad = re.match(r'(\d+)to(\d+)', pl)
            if m_grad:
                gradient = f"{m_grad.group(1)}-{m_grad.group(2)}%"

        # Runtime: 1p9min -> 1.9 min. The token must start with the number so
        # that words merely containing "min" (e.g. "Amine") are not mistaken
        # for a runtime, and only the decimal-separator 'p' is converted.
        if not runtime:
            m_run = re.match(r'(\d+)(?:[p.](\d+))?min', pl)
            if m_run:
                whole, fraction = m_run.groups()
                runtime = f"{whole}.{fraction} min" if fraction else f"{whole} min"

    column = " ".join(column_tokens).strip()
    pieces = [x for x in [column, buffer_type, gradient, runtime] if x]
    return ", ".join(pieces) if pieces else basename
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def method_basename(method_path: str) -> str:
    """Return the method filename without directory and extension, lowercased.

    Used for grouping files by exact method — files with the same method
    basename are comparable (same column, buffer, gradient, runtime).

    Both '/' and '\\' are treated as directory separators so that Windows
    paths printed by MassLynx are reduced correctly on any host OS, and the
    name is lowercased before the '.olp' extension is removed so '.OLP' and
    '.olp' group together.
    """
    name = re.split(r'[\\/]', method_path)[-1]
    return name.lower().replace('.olp', '')
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def parse_header(text: str) -> dict:
    """Pull the header fields out of the full report text.

    Returns a dict with keys 'sample_name', 'date', 'run_time', 'instrument'
    and 'method_path'. Fields that cannot be found fall back to "Unknown",
    except 'run_time' which falls back to None.
    """
    def first_group(pattern, fallback):
        # First capture group of the first hit, or the fallback when absent.
        hit = re.search(pattern, text)
        return hit.group(1) if hit else fallback

    # Instrument name is the first word on the line after "Page 1"; anything
    # from the first underscore on (e.g. "_UPLC-PDA-MS ...") is dropped.
    instrument_word = first_group(r'Page\s+1\s*\n(\w+)', None)
    method_text = first_group(r'Method:\s*(.+?)(?:\n|Report)', None)

    return {
        'sample_name': first_group(r'Sample Name:\s*(\S+)', "Unknown"),
        'date': first_group(r'Date:\s*(\S+)', "Unknown"),
        'run_time': first_group(r'Time:\s*(\d{1,2}:\d{2}:\d{2})', None),
        'instrument': (instrument_word.split('_')[0]
                       if instrument_word is not None else "Unknown"),
        'method_path': (method_text.strip()
                        if method_text is not None else "Unknown"),
    }
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# ---------------------------------------------------------------------------
|
|
187
|
+
# Peak table parsing — all three detectors (TAC, 220nm, 254nm)
|
|
188
|
+
# ---------------------------------------------------------------------------
|
|
189
|
+
|
|
190
|
+
_ROW_PATTERN = re.compile(
|
|
191
|
+
r'^\s*(\d+)\s+' # peak number
|
|
192
|
+
r'(\d+\.\d+)\s+' # retention time
|
|
193
|
+
r'(\d+)\s+' # area
|
|
194
|
+
r'(\d+\.\d+)\s+' # area %
|
|
195
|
+
r'(\d+\.\d+)\s+' # width
|
|
196
|
+
r'(\d+)\s+' # height
|
|
197
|
+
r'(.+?)$', # mass found
|
|
198
|
+
re.MULTILINE
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
_TABLE_HEADER = re.compile(r'Peak\s+Time\s+Area\s+Area\s*%', re.IGNORECASE)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _parse_table_rows(text_block: str) -> List[dict]:
    """Turn the text of one peak table into a list of row dicts.

    Each dict carries 'peak_num_raw' (int), the numeric columns 'rt', 'area',
    'area_pct', 'width', 'height' (floats) and 'mass_found' (stripped text).
    """
    numeric_fields = ('rt', 'area', 'area_pct', 'width', 'height')
    parsed = []
    for hit in _ROW_PATTERN.finditer(text_block):
        # Group 1 is the peak number, groups 2-6 the numeric columns,
        # group 7 the free-text mass column.
        number, *measurements, mass = hit.groups()
        record = {'peak_num_raw': int(number)}
        record.update(zip(numeric_fields, map(float, measurements)))
        record['mass_found'] = mass.strip()
        parsed.append(record)
    return parsed
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _identify_detector(text: str, header_start: int) -> str:
|
|
221
|
+
"""Look backward from a table header to identify detector type."""
|
|
222
|
+
before = text[:header_start]
|
|
223
|
+
tac_pos = max((m.start() for m in re.finditer(r'TAC:\s*Wavelength|UV Detector:\s*TAC', before)), default=-1)
|
|
224
|
+
ch1_pos = max((m.start() for m in re.finditer(r'Ch1\s*220nm|PDA\s*Ch1', before)), default=-1)
|
|
225
|
+
ch2_pos = max((m.start() for m in re.finditer(r'Ch2\s*254nm|PDA\s*Ch2', before)), default=-1)
|
|
226
|
+
|
|
227
|
+
best = max(('TAC', tac_pos), ('220nm', ch1_pos), ('254nm', ch2_pos), key=lambda x: x[1])
|
|
228
|
+
return best[0] if best[1] >= 0 else 'TAC'
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _build_peak_id_map(tables_raw: Dict[str, List[dict]]) -> Dict[Tuple[int, float], str]:
|
|
232
|
+
"""
|
|
233
|
+
Build mapping from (raw_peak_num, rt) -> string peak_id.
|
|
234
|
+
Assigns 'a', 'b' suffixes when a peak number appears at multiple distinct RTs.
|
|
235
|
+
"""
|
|
236
|
+
all_pairs = set()
|
|
237
|
+
for rows in tables_raw.values():
|
|
238
|
+
for row in rows:
|
|
239
|
+
all_pairs.add((row['peak_num_raw'], row['rt']))
|
|
240
|
+
|
|
241
|
+
by_num: Dict[int, List[float]] = defaultdict(list)
|
|
242
|
+
for num, rt in all_pairs:
|
|
243
|
+
by_num[num].append(rt)
|
|
244
|
+
|
|
245
|
+
mapping = {}
|
|
246
|
+
for num, rts in by_num.items():
|
|
247
|
+
rts_sorted = sorted(set(rts))
|
|
248
|
+
if len(rts_sorted) == 1:
|
|
249
|
+
mapping[(num, rts_sorted[0])] = str(num)
|
|
250
|
+
else:
|
|
251
|
+
for i, rt in enumerate(rts_sorted):
|
|
252
|
+
mapping[(num, rt)] = f"{num}{chr(ord('a') + i)}"
|
|
253
|
+
|
|
254
|
+
return mapping
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _lookup_peak_id(id_map: Dict[Tuple[int, float], str], raw_num: int, rt: float,
|
|
258
|
+
tolerance: float = 0.02) -> str:
|
|
259
|
+
"""Look up string peak ID with RT tolerance for fuzzy matching."""
|
|
260
|
+
if (raw_num, rt) in id_map:
|
|
261
|
+
return id_map[(raw_num, rt)]
|
|
262
|
+
for (num, map_rt), pid in id_map.items():
|
|
263
|
+
if num == raw_num and abs(map_rt - rt) < tolerance:
|
|
264
|
+
return pid
|
|
265
|
+
return str(raw_num)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def parse_all_peak_tables(text: str) -> Tuple[List[ChromPeak], Dict[Tuple[int, float], str]]:
    """
    Parse every UV peak integration table (TAC, 220nm, 254nm) in *text*.

    Returns (peaks, id_map): peaks are sorted by retention time then id, and
    id_map maps (raw_num, rt) -> string peak_id. Both are empty when the text
    contains no table header.
    """
    header_hits = list(_TABLE_HEADER.finditer(text))
    if not header_hits:
        return [], {}

    # Each table body runs from the end of its header to the start of the
    # next header, or to the end of the text for the last table.
    body_ends = [hit.start() for hit in header_hits[1:]] + [len(text)]
    rows_by_detector: Dict[str, List[dict]] = {}
    for hit, body_end in zip(header_hits, body_ends):
        detector = _identify_detector(text, hit.start())
        body_rows = _parse_table_rows(text[hit.end():body_end])
        rows_by_detector.setdefault(detector, []).extend(body_rows)

    id_map = _build_peak_id_map(rows_by_detector)

    # Which ChromPeak attributes receive area / area % for each detector.
    area_attrs = {
        'TAC': ('area', 'area_pct'),
        '220nm': ('area_220nm', 'area_pct_220nm'),
        '254nm': ('area_254nm', 'area_pct_254nm'),
    }

    merged: Dict[str, ChromPeak] = {}
    for detector, detector_rows in rows_by_detector.items():
        targets = area_attrs.get(detector)
        for entry in detector_rows:
            pid = id_map[(entry['peak_num_raw'], entry['rt'])]
            peak = merged.get(pid)
            if peak is None:
                # First sighting of this peak fixes rt/width/height/mass.
                peak = ChromPeak(
                    peak_num=pid,
                    rt=entry['rt'],
                    width=entry['width'],
                    height=entry['height'],
                    mass_found=entry['mass_found'],
                )
                merged[pid] = peak
            if targets is not None:
                area_name, pct_name = targets
                setattr(peak, area_name, entry['area'])
                setattr(peak, pct_name, entry['area_pct'])

    ordered = sorted(merged.values(), key=lambda pk: (pk.rt, pk.peak_num))
    return ordered, id_map
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# ---------------------------------------------------------------------------
|
|
319
|
+
# Spatial mass spectrum + UV parsing (fixes two-column layout)
|
|
320
|
+
# ---------------------------------------------------------------------------
|
|
321
|
+
|
|
322
|
+
# Module-private aliases for the axis-tick collections imported from
# cdxml_toolkit.constants; values found in them are excluded when collecting
# m/z values and UV wavelengths below.
_MS_AXIS_TICKS = LCMS_MS_AXIS_TICKS
_UV_AXIS_TICKS = LCMS_UV_AXIS_TICKS
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _find_panel_headers(words: List[dict]) -> List[dict]:
|
|
327
|
+
"""Find 'Peak' words that are part of spectrum 'Peak Time Mass' headers.
|
|
328
|
+
|
|
329
|
+
Rejects peak-table headers ('Peak Time Area Area% Width Height Mass Found')
|
|
330
|
+
which also contain 'Peak', 'Time', and 'Mass' but are NOT spectrum panels.
|
|
331
|
+
The discriminator is the presence of 'Area' or 'Height' as neighbours.
|
|
332
|
+
"""
|
|
333
|
+
results = []
|
|
334
|
+
for w in words:
|
|
335
|
+
if w['text'] != 'Peak':
|
|
336
|
+
continue
|
|
337
|
+
y = w['top']
|
|
338
|
+
neighbors = [nw['text'] for nw in words
|
|
339
|
+
if abs(nw['top'] - y) < 3 and nw['x0'] > w['x0']
|
|
340
|
+
and nw['x0'] < w['x0'] + 400]
|
|
341
|
+
if 'Time' in neighbors and 'Mass' in neighbors:
|
|
342
|
+
# Reject peak-table headers which have "Area" / "Height" / "Width"
|
|
343
|
+
if 'Area' in neighbors or 'Height' in neighbors or 'Width' in neighbors:
|
|
344
|
+
continue
|
|
345
|
+
results.append(w)
|
|
346
|
+
return results
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _group_headers_into_rows(headers: List[dict], y_tolerance: float = 5.0):
|
|
350
|
+
"""Group panel headers by y-coordinate into rows. Returns [(y, [headers])]."""
|
|
351
|
+
if not headers:
|
|
352
|
+
return []
|
|
353
|
+
sorted_h = sorted(headers, key=lambda w: w['top'])
|
|
354
|
+
rows = []
|
|
355
|
+
current = [sorted_h[0]]
|
|
356
|
+
current_y = sorted_h[0]['top']
|
|
357
|
+
for h in sorted_h[1:]:
|
|
358
|
+
if abs(h['top'] - current_y) < y_tolerance:
|
|
359
|
+
current.append(h)
|
|
360
|
+
else:
|
|
361
|
+
rows.append((current_y, current))
|
|
362
|
+
current = [h]
|
|
363
|
+
current_y = h['top']
|
|
364
|
+
rows.append((current_y, current))
|
|
365
|
+
return rows
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def _extract_mz_values(word_text: str) -> List[float]:
    """
    Pull m/z values out of one extracted word, splitting run-together numbers.

    Values are read as digits followed by a point and exactly one decimal
    digit, so '569.1814.6874.9' -> [569.1, 814.6, 874.9]. Only values
    strictly between 50 and 2000 that are not axis tick labels are kept.
    """
    candidates = (float(token) for token in re.findall(r'(\d+\.\d)', word_text))
    return [mz for mz in candidates if 50 < mz < 2000 and mz not in _MS_AXIS_TICKS]
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def _parse_ms_from_words(panel_words: List[dict]) -> List[MassSpectrum]:
    """
    Build the mass spectra found in one panel's words.

    The words are ordered top to bottom. Every 'ES+' / 'ES-' word opens a
    section that extends down to the next such marker; the last section
    extends to the first UV-related word, or to the end of the panel when
    there is none. A spectrum is emitted for each section that yields at
    least one m/z value.
    """
    ordered = sorted(panel_words, key=lambda w: (w['top'], w['x0']))

    # (top, mode) for every ionisation-mode marker, in reading order.
    markers = []
    for word in ordered:
        hit = re.match(r'(ES[+-])$', word['text'])
        if hit:
            markers.append((word['top'], hit.group(1)))
    if not markers:
        return []

    # Lower limit of the final section: top of the first UV word, if any.
    uv_tops = [w['top'] for w in ordered if 'UV' in w['text'] or w['text'] == 'Nm']
    final_limit = min(uv_tops) if uv_tops else float('inf')
    limits = [top for top, _ in markers[1:]] + [final_limit]

    spectra = []
    for (marker_top, mode), limit in zip(markers, limits):
        ions = []
        for word in ordered:
            # Only words strictly below the marker and above the limit count.
            if marker_top < word['top'] < limit:
                ions.extend(_extract_mz_values(word['text']))
        if ions:
            spectra.append(MassSpectrum(mode=mode, top_ions=ions))
    return spectra
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def _parse_uv_from_words(panel_words: List[dict]) -> List[float]:
    """Collect UV lambda-max wavelengths from one panel's words.

    Only numeric words below the first 'AU' word are considered; values must
    fall inside the configured wavelength range and must not be axis tick
    labels. Returns an empty list when the panel has no 'AU' word.
    """
    ordered = sorted(panel_words, key=lambda w: (w['top'], w['x0']))

    au_top = next((w['top'] for w in ordered if w['text'] == 'AU'), None)
    if au_top is None:
        return []

    maxima = []
    for word in ordered:
        if word['top'] <= au_top:
            continue
        try:
            value = float(word['text'])
        except ValueError:
            # Not a number — labels, units and the like.
            continue
        in_range = LCMS_UV_WAVELENGTH_MIN <= value <= LCMS_UV_WAVELENGTH_MAX
        if in_range and value not in _UV_AXIS_TICKS:
            maxima.append(value)
    return maxima
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def _parse_spectrum_pages(pdf) -> Tuple[Dict[int, Tuple[float, list]],
                                        Dict[int, Tuple[float, list]]]:
    """
    Parse mass spectra and UV lambda-max from spectrum pages using spatial cropping.
    Uses word-level extraction to avoid joined-number artifacts from extract_text().

    Args:
        pdf: an open PDF object exposing ``pages``; each page must provide
            ``extract_words()``, ``width`` and ``height`` (as used below).

    Returns:
        ms_data: {raw_peak_num: (rt, [MassSpectrum, ...])}
        uv_data: {raw_peak_num: (rt, [wavelength, ...])}

    NOTE(review): both dicts are keyed by the raw peak number only. If two
    panels carry the same raw number at different retention times, their
    data is merged under the RT of the first panel seen — confirm whether
    that is intended for reports with duplicate peak numbers.
    """
    ms_data = {}
    uv_data = {}

    # Start from page 2 (index 1): MassLynx sometimes places the first
    # peak's mass spectrum at the bottom of page 2 after the peak tables.
    # Panel header detection rejects peak-table headers via "Area" filter.
    for page_idx in range(1, len(pdf.pages)):
        page = pdf.pages[page_idx]
        words = page.extract_words()
        if not words:
            continue

        headers = _find_panel_headers(words)
        if not headers:
            continue

        rows = _group_headers_into_rows(headers)
        page_width = float(page.width)
        page_height = float(page.height)
        # x coordinate separating the left and right panel columns.
        col_mid = LCMS_COLUMN_BOUNDARY

        for i, (y_start, headers_in_row) in enumerate(rows):
            # A panel row ends where the next header row starts (or at the
            # bottom of the page for the last row).
            y_end = rows[i + 1][0] if i + 1 < len(rows) else page_height

            for hdr in headers_in_row:
                # Decide the column from the centre of the header's 'Peak' word.
                x_center = (hdr['x0'] + hdr['x1']) / 2
                if x_center < col_mid:
                    x_start, x_end = 0, col_mid
                else:
                    x_start, x_end = col_mid, page_width

                # Filter words to this panel's bounding box
                # (2 units of slack above the header row).
                panel_words = [w for w in words
                               if w['x0'] >= x_start and w['x1'] <= x_end
                               and w['top'] >= y_start - 2 and w['top'] < y_end]

                if not panel_words:
                    continue

                # Extract peak number and RT from panel words
                # Look for the first integer followed by a decimal (e.g. "4" then "0.64")
                peak_num = None
                rt = None
                num_words = sorted(panel_words, key=lambda w: (w['top'], w['x0']))
                for j, w in enumerate(num_words):
                    if peak_num is not None:
                        break
                    if re.match(r'^\d+$', w['text']) and w['text'] != '0':
                        # Check if next word at similar y is a decimal (RT)
                        # (only the following three words are inspected).
                        for nw in num_words[j+1:j+4]:
                            if abs(nw['top'] - w['top']) < 3 and re.match(r'^\d+\.\d+$', nw['text']):
                                peak_num = int(w['text'])
                                rt = float(nw['text'])
                                break

                # Without a peak number the panel cannot be attributed; skip it.
                if peak_num is None:
                    continue

                # Check panel content type from word texts
                panel_texts = [w['text'] for w in panel_words]
                has_ms = any('ES+' in t or 'ES-' in t for t in panel_texts)
                has_uv = any('UV' in t for t in panel_texts) and any('Detector' in t for t in panel_texts)

                # Parse MS data
                if has_ms:
                    ms_list = _parse_ms_from_words(panel_words)
                    if ms_list:
                        # First panel for this number fixes the stored RT;
                        # later panels only append spectra.
                        if peak_num not in ms_data:
                            ms_data[peak_num] = (rt, [])
                        ms_data[peak_num][1].extend(ms_list)

                # Parse UV data
                if has_uv:
                    wavelengths = _parse_uv_from_words(panel_words)
                    if wavelengths:
                        if peak_num not in uv_data:
                            uv_data[peak_num] = (rt, [])
                        uv_data[peak_num][1].extend(wavelengths)

    return ms_data, uv_data
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def is_waters_report(pdf_path: str) -> bool:
    """Tell whether a PDF looks like a standard Waters MassLynx report.

    Only the first page is read. The page is accepted when at least two of
    these three markers are present: a "Sample Name:" field, an instrument
    marker ("Instrument:" or "UPLC"), and both "Date:" and "Time:" fields.

    Returns True for standard reports, and False for manually integrated
    exports, other non-standard PDFs, empty PDFs, or when the file cannot be
    read at all.
    """
    import pdfplumber
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if not pdf.pages:
                return False
            first_page = pdf.pages[0].extract_text() or ""
            has_sample = "Sample Name:" in first_page
            has_instrument = "Instrument:" in first_page or "UPLC" in first_page
            has_timestamp = "Date:" in first_page and "Time:" in first_page
            # Two out of three is enough, for robustness against layout variations.
            return (has_sample + has_instrument + has_timestamp) >= 2
    except Exception:
        # Best-effort probe: any failure means "not a Waters report".
        return False
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
def parse_report(pdf_path: str) -> LCMSReport:
    """Parse a complete MassLynx PDF report into an LCMSReport.

    Header fields and peak tables come from the concatenated page text; mass
    spectra and UV lambda-max values come from the word-level spectrum parser
    and are attached to the peak with the matching id.
    """
    import pdfplumber

    with pdfplumber.open(pdf_path) as pdf:
        # Full text (pages separated by a blank line) for header and tables.
        page_texts = [page.extract_text() for page in pdf.pages]
        text = "\n\n".join(chunk for chunk in page_texts if chunk)

        header = parse_header(text)
        peaks, id_map = parse_all_peak_tables(text)

        # Spectra need word coordinates, so they are read while the PDF is open.
        ms_data, uv_data = _parse_spectrum_pages(pdf)

    # First peak per id, for direct lookup when attaching spectra.
    peak_by_id = {}
    for peak in peaks:
        peak_by_id.setdefault(peak.peak_num, peak)

    for raw_num, (rt, ms_list) in ms_data.items():
        target = peak_by_id.get(_lookup_peak_id(id_map, raw_num, rt))
        if target is not None:
            target.ms_spectra = ms_list

    for raw_num, (rt, wavelengths) in uv_data.items():
        target = peak_by_id.get(_lookup_peak_id(id_map, raw_num, rt))
        if target is not None:
            target.uv_lambda_max = wavelengths

    # Modification time of the PDF on disk, formatted for display.
    modified = datetime.fromtimestamp(os.path.getmtime(pdf_path)).strftime("%Y-%m-%d %H:%M")

    method_path = header['method_path']
    return LCMSReport(
        filename=os.path.basename(pdf_path),
        sample_name=header['sample_name'],
        date=header['date'],
        instrument=header['instrument'],
        method_path=method_path,
        method_short=parse_method_short(method_path),
        peaks=peaks,
        file_modified=modified,
        run_time=header.get('run_time'),
    )
|
|
626
|
+
|
|
627
|
+
# ---------------------------------------------------------------------------
|
|
628
|
+
# Manual integration reports (LC-only / MS-only MassLynx exports)
|
|
629
|
+
# ---------------------------------------------------------------------------
|
|
630
|
+
|
|
631
|
+
@dataclass
class ManualPeak:
    """A peak from a manually integrated chromatogram."""
    peak_num: str                    # peak identifier, kept as text
    rt: float                        # retention time
    area: float                      # integrated area
    area_pct: float                  # area as a percentage
    height: Optional[float] = None   # peak height, when available
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
@dataclass
class ManualLCMSSample:
    """One chromatogram section from a manual integration PDF."""
    sample_name: str                 # name of the sample this section belongs to
    peaks: List[ManualPeak] = field(default_factory=list)  # peaks of this chromatogram
    detector: str = ""          # e.g. "Diode Array 290nm"
    from_labels: bool = False   # True if parsed from RT;Area labels (best-effort)
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
@dataclass
class ManualLCMSReport:
    """Everything extracted from one manually integrated MassLynx PDF."""
    # Base name of the PDF file.
    filename: str
    # Instrument code taken from the PDF header ("" when not recognised).
    instrument: str
    # Date string taken from the PDF header ("" when not recognised).
    date: str
    # One entry per chromatogram section; every instance gets its own list.
    samples: List[ManualLCMSSample] = field(default_factory=list)
    # Run time string from the header, when one was found.
    run_time: Optional[str] = None
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
def is_manual_integration(pdf_path: str) -> bool:
    """Tell whether a PDF looks like a MassLynx manual integration export.

    Only the text of the first page is inspected. A manual integration
    export mentions "Diode Array" but does not carry the "Sample Name:"
    header line that a full Waters report has.

    Returns False for a PDF without pages and whenever opening or reading
    the file raises; a missing ``pdfplumber`` installation still raises,
    because the import happens outside the ``try``.
    """
    import pdfplumber
    try:
        with pdfplumber.open(pdf_path) as document:
            if not document.pages:
                return False
            first_page_text = document.pages[0].extract_text() or ""
            if "Diode Array" not in first_page_text:
                return False
            return "Sample Name:" not in first_page_text
    except Exception:
        # Best-effort detection: an unreadable file is simply "not manual".
        return False
|
|
677
|
+
|
|
678
|
+
|
|
679
|
+
def parse_manual_report(pdf_path: str) -> ManualLCMSReport:
    """Parse a manually integrated MassLynx PDF.

    Handles three variants:
      1. Single sample with peak table (Time Height Area Area%)
      2. Multi-sample with peak tables per section
      3. Multi-sample with only RT;Area chromatogram labels (no table)

    Args:
        pdf_path: Path of the PDF; its text is obtained via
            ``extract_all_text``.

    Returns:
        A ManualLCMSReport with one ManualLCMSSample per chromatogram
        section. ``instrument`` and ``date`` are "" and ``run_time`` is
        None when the first line does not match the expected header.
        Samples whose peaks came from chromatogram labels rather than a
        peak table have ``from_labels=True``.
    """
    text = extract_all_text(pdf_path)
    filename = os.path.basename(pdf_path)

    # --- Header: first line is "SampleName Instrument Date" ---
    # Instrument is an alphanumeric code (e.g. PPIMSA05, UPLCMS01, SQD2)
    # immediately followed by a date in DD-Mon-YYYY format.
    file_header = re.match(
        r'(.+?)\s+([A-Z][A-Za-z0-9]+)\s+'
        r'(\d{1,2}-\w{3}-\d{4})\s*\n\s*(\d{2}:\d{2}:\d{2})?',
        text
    )
    instrument = file_header.group(2) if file_header else ""
    date_str = file_header.group(3) if file_header else ""
    run_time = file_header.group(4) if file_header else None

    # --- Patterns (compiled once; none of them depends on the section) ---
    # Each section starts with a sample name followed by optional smoothing
    # params and "3: Diode Array" or similar detector marker.
    # Pattern: "SampleName [Sm (Mn, 2x3)] 3: Diode Array"
    section_pattern = re.compile(
        r'^([\w][\w\-]+(?:\s+Sm\s*\([^)]+\))?)\s+'
        r'(\d+:\s*Diode Array)\s*\n'
        r'(.*?)(?=^[\w][\w\-]+(?:\s+Sm\s*\([^)]+\))?\s+\d+:\s*Diode Array|\Z)',
        re.MULTILINE | re.DOTALL
    )
    table_header_pattern = re.compile(r'Time\s+Height\s+Area\s+Area%')
    # Rows of 4 numbers: Time Height Area Area%
    row_pattern = re.compile(
        r'(\d+\.\d+)\s+(\d+)\s+(\d+(?:\.\d+)?)\s+(\d+\.\d+)'
    )
    # Chromatogram labels such as "0.56;404"
    label_pattern = re.compile(
        r'([\d.]+);([\d.]+)'
    )
    # "RT\nArea" labels (RT alone, area on the next line), e.g. "1.32\n20".
    # Restricted to 0.xx / 1.xx values and guarded by the lookbehind so
    # that axis ticks and halves of an RT;Area label are not picked up.
    standalone_rt_pattern = re.compile(
        r'(?<!\d[;.])(?:^|\s)((?:0\.\d{2}|1\.\d{2}))\s*\n\s*(\d+)(?:\s|$)',
        re.MULTILINE
    )

    samples = []
    for m in section_pattern.finditer(text):
        raw_name = m.group(1).strip()
        detector_str = m.group(2).strip()
        section_text = m.group(3)

        # Clean sample name: strip smoothing params
        sample_name = re.sub(r'\s+Sm\s*\([^)]+\)', '', raw_name).strip()

        # Try to extract detector wavelength.
        # NOTE(review): detector_str is only the "<n>: Diode Array" marker
        # matched above, so a 3-digit run is found only if that marker
        # itself contains one - confirm where the wavelength really lives.
        wl_match = re.search(r'(\d{3})', detector_str)
        detector = f"Diode Array {wl_match.group(1)}nm" if wl_match else detector_str

        # --- Try peak table first (Time Height Area Area%) ---
        # Table rows may be interleaved with Y-axis tick labels (e.g.
        # "5.5e+1") from pdfplumber. We search for the header, then
        # scan the following text for 4-number rows that look like
        # Time Height Area Area% data.
        peaks = []
        table_header = table_header_pattern.search(section_text)
        if table_header:
            after_header = section_text[table_header.end():]
            for rm in row_pattern.finditer(after_header):
                rt = float(rm.group(1))
                height = float(rm.group(2))
                area = float(rm.group(3))
                area_pct = float(rm.group(4))
                # Sanity: RT < 20 min, area% <= 100
                if rt < 20.0 and area_pct <= 100.0:
                    peaks.append(ManualPeak(
                        peak_num=str(len(peaks) + 1),
                        rt=rt,
                        height=height,
                        area=area,
                        area_pct=area_pct,
                    ))
        else:
            # --- Fallback: parse RT;Area labels from chromatogram ---
            raw_peaks = []
            for lm in label_pattern.finditer(section_text):
                try:
                    raw_peaks.append((float(lm.group(1)), float(lm.group(2))))
                except ValueError:
                    # "[\d.]+" also matches non-numbers such as "." or
                    # "1.2.3"; skip those instead of failing the whole file.
                    continue

            for sm in standalone_rt_pattern.finditer(section_text):
                rt_val = float(sm.group(1))
                area_val = float(sm.group(2))
                # Deduplicate: skip if we already have a peak at this RT
                if not any(abs(rt_val - rp[0]) < 0.02 for rp in raw_peaks):
                    raw_peaks.append((rt_val, area_val))

            # Sort by RT and compute area% from the label areas
            raw_peaks.sort(key=lambda x: x[0])
            total_area = sum(a for _, a in raw_peaks) if raw_peaks else 1.0
            for i, (rt, area) in enumerate(raw_peaks, 1):
                peaks.append(ManualPeak(
                    peak_num=str(i),
                    rt=rt,
                    area=area,
                    area_pct=(area / total_area * 100) if total_area > 0 else 0.0,
                ))

        samples.append(ManualLCMSSample(
            sample_name=sample_name,
            peaks=peaks,
            detector=detector,
            # Labels were used exactly when no peak-table header was found.
            from_labels=table_header is None,
        ))

    return ManualLCMSReport(
        filename=filename,
        instrument=instrument,
        date=date_str,
        samples=samples,
        run_time=run_time,
    )
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
def format_manual_table(report: ManualLCMSReport) -> str:
    """Render a manual integration report as markdown for LLM consumption.

    Emits three bold key/value header lines (file, instrument, date with
    the run time appended when present), then one "###" section per
    sample holding a "# / RT / Area%" table. A sample without peaks gets
    "(no peaks)". After a table, a warning is added when the listed area
    percentages sum to less than 95; otherwise a note is added when the
    peaks came from chromatogram labels.
    """
    stamp = report.date
    if report.run_time:
        stamp = stamp + f" {report.run_time}"

    out = [
        f"**File:** {report.filename} (manual integration)",
        f"**Instrument:** {report.instrument}",
        f"**Date:** {stamp}",
    ]

    for sample in report.samples:
        out.append("")
        out.append(f"### {sample.sample_name}")
        if not sample.peaks:
            out.append("(no peaks)")
            continue

        out.append("| # | RT | Area% |")
        out.append("|---|------|-------|")
        covered = 0.0
        for pk in sample.peaks:
            out.append(f"| {pk.peak_num} | {pk.rt:.2f} | {pk.area_pct:.1f} |")
            covered += pk.area_pct

        if covered < 95.0:
            out.append("")
            out.append(f"*Warning: parsed peaks sum to {covered:.1f}% — some peaks may not have been extracted.*")
        elif sample.from_labels:
            out.append("")
            out.append("*Note: area% computed from chromatogram labels (no peak table in PDF). Some small peaks may be missing.*")

    return "\n".join(out)
|
|
843
|
+
|
|
844
|
+
|
|
845
|
+
# ---------------------------------------------------------------------------
|
|
846
|
+
# Peak identification by expected mass
|
|
847
|
+
# ---------------------------------------------------------------------------
|
|
848
|
+
|
|
849
|
+
def identify_peak(peak: ChromPeak, sm_mass: float, product_mass: float,
                  tolerance: float = 1.5) -> Optional[str]:
    """
    Label a peak as starting material or desired product from its ESI ions.

    Every ion in ``top_ions`` of every spectrum is compared with the
    expected m/z of [M+H]+ / [M+Na]+ (ES+ spectra) or [M-H]- /
    [M+formate]- (ES- spectra) for both masses; a hit is a difference
    strictly below ``tolerance``. Spectra with any other mode, or with no
    ions, are ignored.

    Returns: "SM", "DP" (desired product), or None when nothing matches.

    When both identities are supported, the tie is broken in this order:
      1. any SM hit in ES-  -> "SM" (an ES+ "product" ion is then taken to
         be in-source fragmentation of the SM, e.g. Boc loss);
      2. any DP hit in ES-  -> "DP";
      3. the identity that alone is backed by a base-peak (first) ion;
      4. the identity with more hits, "SM" winning an exact tie.
    """
    positive_adducts = (("M+H", 1.008), ("M+Na", 22.990))
    negative_adducts = (("M-H", -1.008), ("M+formate", 44.998))
    # Product is checked before SM for each ion/adduct pair.
    candidates = (("DP", product_mass), ("SM", sm_mass))

    # Each hit: (identity, mode, adduct_name, mz, is_base_peak)
    hits = []
    for spectrum in peak.ms_spectra:
        if not spectrum.top_ions:
            continue
        polarity = spectrum.mode
        if polarity == "ES+":
            adducts = positive_adducts
        elif polarity == "ES-":
            adducts = negative_adducts
        else:
            continue
        for rank, mz in enumerate(spectrum.top_ions):
            tallest = (rank == 0)  # the first ion is the tallest one
            for adduct_name, shift in adducts:
                for identity, mass in candidates:
                    if abs(mz - (mass + shift)) < tolerance:
                        hits.append((identity, polarity, adduct_name, mz, tallest))

    if not hits:
        return None

    identities = {hit[0] for hit in hits}
    if len(identities) == 1:
        return next(iter(identities))

    # From here on both "SM" and "DP" have supporting ions.
    def supported(identity, polarity):
        return any(h[0] == identity and h[1] == polarity for h in hits)

    if supported("SM", "ES-"):
        # ESI- says SM, so the ESI+ "product" signal is treated as fragmentation.
        return "SM"
    if supported("DP", "ES-"):
        return "DP"

    # Both only seen in the same polarity: prefer base-peak evidence.
    sm_has_base = any(h[0] == "SM" and h[4] for h in hits)
    dp_has_base = any(h[0] == "DP" and h[4] for h in hits)
    if dp_has_base and not sm_has_base:
        return "DP"
    if sm_has_base and not dp_has_base:
        return "SM"

    # Last resort: majority of hits, ties going to SM.
    sm_votes = len([h for h in hits if h[0] == "SM"])
    dp_votes = len([h for h in hits if h[0] == "DP"])
    return "DP" if dp_votes > sm_votes else "SM"
|
|
931
|
+
|
|
932
|
+
# ---------------------------------------------------------------------------
|
|
933
|
+
# Output formatting
|
|
934
|
+
# ---------------------------------------------------------------------------
|
|
935
|
+
|
|
936
|
+
def format_annotation(report: LCMSReport, sm_mass: float, product_mass: float) -> str:
    """
    Build the one-line LCMS annotation for section (1).

    Template: [Instrument], [Method short], SM RT = X.XX min, ESI+/- XXX.X, DP RT = X.XX min, ESI+/- XXX.X

    The instrument is cut at the first '#'. Each peak is classified with
    ``identify_peak``; for SM and for DP the matching peak with the
    largest area% is reported, and that part is left out entirely when no
    peak matched.
    """
    instrument_short = report.instrument
    if '#' in instrument_short:
        instrument_short = instrument_short.split('#')[0]

    # Candidates as (rt, area_pct, ion string), kept in peak order.
    sm_hits = []
    dp_hits = []
    for peak in report.peaks:
        identity = identify_peak(peak, sm_mass, product_mass)
        if identity == "SM":
            sm_hits.append((peak.rt, peak.area_pct, _find_best_ion_for(peak, sm_mass)))
        elif identity == "DP":
            dp_hits.append((peak.rt, peak.area_pct, _find_best_ion_for(peak, product_mass)))

    # Largest area first; the stable sort keeps the earlier peak on ties.
    sm_ranked = sorted(sm_hits, key=lambda hit: hit[1], reverse=True)
    dp_ranked = sorted(dp_hits, key=lambda hit: hit[1], reverse=True)

    segments = [f"{instrument_short}", f"{report.method_short}"]
    if sm_ranked:
        best_rt, _, best_ion = sm_ranked[0]
        segments.append(f"SM RT = {best_rt:.2f} min, {best_ion}")
    if dp_ranked:
        best_rt, _, best_ion = dp_ranked[0]
        segments.append(f"DP RT = {best_rt:.2f} min, {best_ion}")

    return ", ".join(segments)
|
|
972
|
+
|
|
973
|
+
|
|
974
|
+
def _find_best_ion_for(peak: ChromPeak, exact_mass: float,
                       tolerance: float = 1.5) -> str:
    """Find the ion that confirms ``exact_mass`` and format it for display.

    Ions are scanned in spectrum order, then in ``top_ions`` order.

    1. The first ion within ``tolerance`` of [M+H]+ (ES+ spectra) or
       [M-H]- (ES- spectra) is returned as 'ESI+ 346.0' / 'ESI- 444.1'.
    2. If no such ion exists, the first ion matching [M+Na]+ (ES+) or
       [M+formate]- (ES-) is returned with the adduct named, e.g.
       'ESI+ 368.0 (M+Na)'. These are the other adducts ``identify_peak``
       accepts, so a peak identified through them is no longer reported
       as unconfirmed.
    3. Otherwise 'mass not confirmed' is returned.

    Args:
        peak: Object exposing ``ms_spectra``; each spectrum has ``mode``
            ("ES+" / "ES-") and ``top_ions``.
        exact_mass: Neutral exact mass to confirm.
        tolerance: Maximum |observed - expected| m/z, exclusive. The
            default equals the previously hard-coded 1.5.
    """
    # mode -> (display label, [M+/-H] shift, fallback adduct name, fallback shift)
    polarity_table = {
        "ES+": ("ESI+", 1.008, "M+Na", 22.990),
        "ES-": ("ESI-", -1.008, "M+formate", 44.998),
    }

    def first_match(shift_index):
        """Return (entry, mz) of the first ion matching the chosen shift."""
        for spec in peak.ms_spectra:
            entry = polarity_table.get(spec.mode)
            if entry is None:
                continue
            expected = exact_mass + entry[shift_index]
            for mz in spec.top_ions or ():
                if abs(mz - expected) < tolerance:
                    return entry, mz
        return None

    # Pass 1: protonated / deprotonated molecule (the original behaviour).
    hit = first_match(1)
    if hit is not None:
        entry, mz = hit
        return f"{entry[0]} {mz:.1f}"

    # Pass 2: sodium / formate adducts, labelled so the reader can tell.
    hit = first_match(3)
    if hit is not None:
        entry, mz = hit
        return f"{entry[0]} {mz:.1f} ({entry[2]})"

    return "mass not confirmed"
|
|
988
|
+
|
|
989
|
+
|
|
990
|
+
def format_peak_summary(report: LCMSReport, sm_mass: float, product_mass: float) -> str:
    """List every peak on its own line together with its identification.

    Each line shows the peak number, RT, area% ("-" when the peak has no
    area), the first (base) ion of every spectrum that has ions, and the
    label from ``identify_peak`` ("unknown" when it returns nothing).
    """
    def describe(peak):
        label = identify_peak(peak, sm_mass, product_mass) or "unknown"

        base_ions = [
            f"ESI{'+' if spec.mode == 'ES+' else '-'} {spec.top_ions[0]:.1f}"
            for spec in peak.ms_spectra
            if spec.top_ions
        ]
        ion_info = "; ".join(base_ions) if base_ions else "no MS data"

        if peak.area_pct is None:
            area_str = "-"
        else:
            area_str = f"{peak.area_pct:.1f}%"

        return f" Peak {peak.peak_num}: RT {peak.rt:.2f} min, {area_str}, {ion_info} → {label}"

    return "\n".join([describe(peak) for peak in report.peaks])
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
def analyze_reaction_progress(reports: List[LCMSReport], sm_mass: float, product_mass: float) -> str:
    """
    Summarise reaction progress, one note per report, for the notes section.

    For each report the area% of peaks identified as SM, as DP, and as
    neither are totalled (peaks without an area% are skipped). The note
    states a conversion estimate DP / (DP + SM) when both are present,
    "SM consumed" when only DP is, "No product detected" when only SM is,
    and otherwise that neither was identified. A timepoint or work-up
    hint derived from the sample name is inserted where available.
    """
    notes = []

    for report in reports:
        totals = {"SM": 0.0, "DP": 0.0, "other": 0.0}
        for peak in report.peaks:
            if peak.area_pct is None:
                continue  # Skip peaks not in TAC table
            identity = identify_peak(peak, sm_mass, product_mass)
            bucket = identity if identity in ("SM", "DP") else "other"
            totals[bucket] += peak.area_pct

        sm_area = totals["SM"]
        dp_area = totals["DP"]
        unknown_area = totals["other"]

        # Timepoint / action hint comes from the lower-cased sample name
        timepoint = _infer_timepoint(report.sample_name.lower())
        heading = f"{report.sample_name} ({report.date}): "

        sm_present = sm_area > 0
        dp_present = dp_area > 0

        if sm_present and dp_present:
            conversion = dp_area / (dp_area + sm_area) * 100
            note = heading + f"~{conversion:.0f}% conversion{timepoint}."
            if unknown_area > 2:
                note += f" ({unknown_area:.0f}% unidentified)"
        elif dp_present and sm_area == 0:
            note = heading + f"SM consumed{timepoint}. DP {dp_area:.0f}%"
            if unknown_area > 2:
                note += f", impurities {unknown_area:.0f}%"
            note += "."
        elif sm_present and dp_area == 0:
            note = heading + f"No product detected{timepoint}. SM {sm_area:.0f}%."
        else:
            note = heading + f"Neither SM nor DP identified in major peaks{timepoint}."

        notes.append(note)

    return "\n".join(notes)
|
|
1054
|
+
|
|
1055
|
+
|
|
1056
|
+
def _infer_timepoint(name: str) -> str:
    """Try to infer a timepoint or work-up action from a sample name.

    The patterns are tried in order, case-insensitively, and the phrase
    for the first one that matches is returned (with its leading space,
    ready to be appended to a sentence). Returns "" when nothing matches.

    The "tN" and "on" / "o/n" shorthands must not be glued to a letter:
    without that guard "on" matched inside ordinary words such as
    "reaction" or "solution" and "t3" matched inside "et3n", which
    pre-empted the more specific patterns further down the list.
    """
    # Common patterns in LCMS filenames
    patterns = [
        (r'(\d+)\s*h\b', lambda m: f" after {m.group(1)}h"),
        (r'(\d+)\s*min\b', lambda m: f" after {m.group(1)} min"),
        (r'(?<![a-z])t(\d+)', lambda m: f" at t={m.group(1)}"),
        (r'overnight|(?<![a-z])o/?n(?![a-z])', lambda m: " after overnight"),
        (r'ea\s*wash', lambda m: " (after EtOAc wash)"),
        (r'dcm\s*wash', lambda m: " (after DCM wash)"),
        (r'purif', lambda m: " (after purification)"),
        (r'c18', lambda m: " (C18 purification)"),
        (r'crude', lambda m: " (crude)"),
        (r'addmore', lambda m: " (after adding more reagent)"),
    ]

    for pattern, formatter in patterns:
        # IGNORECASE also makes the [a-z] lookarounds cover upper case.
        m = re.search(pattern, name, re.IGNORECASE)
        if m:
            return formatter(m)

    return ""
|
|
1078
|
+
|
|
1079
|
+
|
|
1080
|
+
def format_basic_report(report: LCMSReport) -> str:
    """
    Render one LCMS report as plain text, without species identification.

    The output is a banner, the file / sample / date / instrument /
    method header, and then two lines per peak: RT with the available
    area percentages (TAC, 220nm, 254nm), followed by up to three ions
    per spectrum and the UV lambda-max values rounded half-up to whole
    nanometres. Used when SM/product masses are not available.
    """
    import math

    heavy_rule = "=" * 60
    light_rule = "-" * 60
    date_line = f"Date: {report.date}" + (f" {report.run_time}" if report.run_time else "")

    out = [
        heavy_rule,
        "SINGLE-FILE LCMS REPORT",
        heavy_rule,
        "",
        f"File: {report.filename}",
        f"Sample: {report.sample_name}",
        date_line,
        f"Instrument: {report.instrument}",
        f"Method: {report.method_short}",
        "",
        light_rule,
        "PEAK TABLE",
        light_rule,
        "",
    ]

    if not report.peaks:
        out.append(" (no peaks detected)")
        return "\n".join(out)

    for peak in report.peaks:
        # Area columns, in fixed order: TAC, 220nm, 254nm
        area_columns = (
            ("TAC", peak.area_pct),
            ("220nm", peak.area_pct_220nm),
            ("254nm", peak.area_pct_254nm),
        )
        areas = [f"{tag} {value:.1f}%" for tag, value in area_columns if value is not None]
        area_str = ", ".join(areas) if areas else "(no area)"

        # Ions: at most three per spectrum; anything that is not ES+ is shown as ESI-
        ion_groups = []
        for spec in peak.ms_spectra:
            if not spec.top_ions:
                continue
            polarity = "ESI+" if spec.mode == "ES+" else "ESI-"
            listed = ", ".join(f"{mz:.1f}" for mz in spec.top_ions[:3])
            ion_groups.append(f"{polarity} {listed}")
        ion_info = "; ".join(ion_groups) if ion_groups else "no MS"

        # UV lambda max, ascending, rounded half-up
        uv_str = ""
        if peak.uv_lambda_max:
            wl_strs = [str(math.floor(wl + 0.5)) for wl in sorted(peak.uv_lambda_max)]
            uv_str = f" λmax {', '.join(wl_strs)} nm"

        out.append(f" Peak {peak.peak_num}: RT {peak.rt:.2f} min, "
                   f"{area_str}")
        out.append(f" Ions: {ion_info}{uv_str}")
        out.append("")

    return "\n".join(out)
|
|
1143
|
+
|
|
1144
|
+
|
|
1145
|
+
def format_table(report: LCMSReport) -> str:
    """Render an LCMS report as a markdown table for LLM consumption.

    Pure data only: no peak identification, no conversion estimate, no
    interpretation. Four bold key/value header lines are followed by a
    7-column table: peak#, RT, TAC%, 220nm%, 254nm%, ESI+ ions, ESI- ions.
    Missing values are shown as an em dash. Each ESI cell lists up to
    three ions; if a peak carries several spectra of one mode, the last
    one that has ions is shown.
    """
    missing = "\u2014"

    def pct(value):
        # One decimal place, or the em dash placeholder for "no value".
        return missing if value is None else f"{value:.1f}"

    # Header metadata
    date_str = report.date
    if report.run_time:
        date_str += f" {report.run_time}"
    rows = [
        f"**Sample:** {report.sample_name}",
        f"**Instrument:** {report.instrument}",
        f"**Method:** {report.method_short}",
        f"**Date:** {date_str}",
        "",
    ]

    if not report.peaks:
        rows.append("(no peaks detected)")
        return "\n".join(rows)

    # Table header
    rows.append("| # | RT | TAC% | 220nm% | 254nm% | ESI+ | ESI\u2212 |")
    rows.append("|---|------|-------|--------|--------|------|------|")

    for peak in report.peaks:
        ions_by_mode = {"ES+": missing, "ES-": missing}
        for spec in peak.ms_spectra:
            ions_str = ", ".join(f"{mz:.1f}" for mz in spec.top_ions[:3])
            if ions_str and spec.mode in ions_by_mode:
                ions_by_mode[spec.mode] = ions_str

        cells = [
            f"{peak.peak_num}",
            f"{peak.rt:.2f}",
            pct(peak.area_pct),
            pct(peak.area_pct_220nm),
            pct(peak.area_pct_254nm),
            ions_by_mode["ES+"],
            ions_by_mode["ES-"],
        ]
        rows.append("| " + " | ".join(cells) + " |")

    return "\n".join(rows)
|
|
1199
|
+
|
|
1200
|
+
|
|
1201
|
+
# ---------------------------------------------------------------------------
|
|
1202
|
+
# Main
|
|
1203
|
+
# ---------------------------------------------------------------------------
|
|
1204
|
+
|
|
1205
|
+
def main(argv=None) -> int:
    """Command-line entry point of the LCMS report analyzer.

    Parses every PDF given on the command line (files that fail to parse
    are reported on stderr and skipped), then renders the result in one
    of three shapes:

    * ``--format table`` (default): markdown tables, masses ignored.
    * ``--format basic`` without both masses: plain-text peak tables.
    * ``--format basic`` with ``--sm-mass`` and ``--product-mass``: the
      three-section annotated output (annotation, procedure, notes).

    Manual integration exports are included in every shape. Previously
    they were parsed but silently left out of the two ``basic`` shapes,
    so a run containing only such files produced empty output.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on success, 1 when no file could be parsed.
    """
    parser = argparse.ArgumentParser(description="LCMS Report Analyzer")
    parser.add_argument('files', nargs='+', help='MassLynx PDF report files')
    parser.add_argument('--sm-mass', type=float, default=None,
                        help='Exact mass of starting material')
    parser.add_argument('--product-mass', type=float, default=None,
                        help='Exact mass of desired product')
    parser.add_argument('--procedure', type=str, default='',
                        help='Original procedure text (for context)')
    parser.add_argument('--output', type=str, default=None,
                        help='Output file path (default: stdout)')
    parser.add_argument('--format', type=str, default='table',
                        choices=['basic', 'table'],
                        help='Output format: table (default, markdown for LLM) or basic')

    args = parser.parse_args(argv)

    # Parse all reports (auto-detect manual integration vs Waters)
    reports = []         # standard Waters reports
    manual_reports = []  # manual integration exports
    for f in sorted(args.files):
        try:
            if is_manual_integration(f):
                manual_reports.append(parse_manual_report(f))
            else:
                reports.append(parse_report(f))
        except Exception as e:
            # Top-level boundary: one bad file must not abort the batch.
            print(f"Warning: Could not parse {f}: {e}", file=sys.stderr)

    if not reports and not manual_reports:
        print("Error: No reports could be parsed.", file=sys.stderr)
        return 1

    # Order by the report's date string (plain string comparison).
    reports.sort(key=lambda r: r.date)

    if args.format == 'table':
        # Table format: pure data for LLM consumption, ignores SM/product masses
        parts = [format_table(r) for r in reports]
        parts += [format_manual_table(r) for r in manual_reports]
        result = "\n\n".join(parts)
    elif args.sm_mass is None or args.product_mass is None:
        parts = [format_basic_report(r) for r in reports]
        # Manual exports have no plain-text renderer; reuse their table
        # so their data is not dropped.
        parts += [format_manual_table(r) for r in manual_reports]
        result = "\n\n".join(parts)
    else:
        # Build full annotated output
        banner = "=" * 60
        output_lines = []

        # Section 1: Annotation
        output_lines.append(banner)
        output_lines.append("(1) LCMS ANNOTATION")
        output_lines.append(banner)
        for report in reports:
            # Header line: sample name + date/time
            time_str = f" {report.run_time}" if report.run_time else ""
            output_lines.append(
                f"{report.sample_name} (Date: {report.date}{time_str}, "
                f"{report.instrument}):"
            )
            annotation = format_annotation(report, args.sm_mass, args.product_mass)
            output_lines.append(f" {annotation}")
            # Also show peak breakdown
            output_lines.append(format_peak_summary(report, args.sm_mass, args.product_mass))
            output_lines.append("")

        # Section 2: Tentative procedure
        output_lines.append(banner)
        output_lines.append("(2) TENTATIVE PROCEDURE")
        output_lines.append(banner)
        if args.procedure:
            output_lines.append(args.procedure)
        else:
            output_lines.append("[No procedure provided]")
        output_lines.append("")

        # Section 3: Notes
        output_lines.append(banner)
        output_lines.append("(3) NOTES")
        output_lines.append(banner)
        output_lines.append(analyze_reaction_progress(reports, args.sm_mass, args.product_mass))

        # Section 4 (only when present): manual integration exports cannot
        # be annotated by mass, so they are appended as plain tables.
        if manual_reports:
            output_lines.append("")
            output_lines.append(banner)
            output_lines.append("(4) MANUAL INTEGRATION REPORTS")
            output_lines.append(banner)
            output_lines.append(
                "\n\n".join(format_manual_table(r) for r in manual_reports)
            )

        result = "\n".join(output_lines)

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(result)
        print(f"Output written to {args.output}", file=sys.stderr)
    else:
        # Write bytes so the text is always emitted as UTF-8.
        sys.stdout.buffer.write(result.encode('utf-8'))
        sys.stdout.buffer.write(b'\n')

    return 0
|
|
1296
|
+
|
|
1297
|
+
|
|
1298
|
+
# Script entry point: the return value of main() becomes the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|