cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,701 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Lab Book Formatter — Output Section Generation for Procedure Writer
|
|
4
|
+
|
|
5
|
+
Builds the three sections of a lab book entry:
|
|
6
|
+
PROCEDURE — setup text, tracking narrative, workup, purification, isolation
|
|
7
|
+
CHARACTERIZATION — LCMS annotations (tracking + purified) + NMR data
|
|
8
|
+
NOTES — conversion timeline, unidentified compounds, diagnostics
|
|
9
|
+
|
|
10
|
+
Also contains formatting helpers for LCMS method names, purity reporting,
|
|
11
|
+
UV lambda-max values, and narrative inference from LCMS filenames.
|
|
12
|
+
|
|
13
|
+
Usage:
|
|
14
|
+
from lab_book_formatter import (
|
|
15
|
+
build_procedure_section, build_characterization_section,
|
|
16
|
+
build_notes_section, assemble_output,
|
|
17
|
+
)
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import math
|
|
21
|
+
import os
|
|
22
|
+
import re
|
|
23
|
+
import sys
|
|
24
|
+
from typing import List, Optional, Dict
|
|
25
|
+
|
|
26
|
+
from cdxml_toolkit.constants import MIN_REPORT_AREA_PCT
|
|
27
|
+
from .mass_resolver import ExpectedSpecies
|
|
28
|
+
from .lcms_identifier import (
|
|
29
|
+
IdentifiedCompound, IdentifiedPeak,
|
|
30
|
+
TrackingAnalysis, PurifiedAnalysis,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# Constants
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
SECTION_SEP = "=" * 60
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# Method formatting
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
def format_method_name(method_path: str) -> str:
    """Extract a short method name like 'AmF 2 min' from an LCMS method path.

    Scans the underscore-separated tokens of the basename (minus a '.olp'
    extension) for a buffer-type keyword (AmF / AmB / FA / TFA) and a
    runtime token ending in 'min' ('p' acts as a decimal point, e.g.
    '1p5min'). Falls back to the bare basename when neither is found.
    """
    basename = os.path.basename(method_path).replace('.olp', '')

    buffer_type = ""
    runtime = ""

    for token in basename.split('_'):
        tl = token.lower()
        # Buffer identification. NOTE: 'ambic' necessarily contains 'amb',
        # so one substring test covers both spellings (the original
        # "'amb' in pl or 'ambic' in pl" second clause was dead code).
        if 'amf' in tl:
            buffer_type = "AmF"
        elif 'amb' in tl:
            buffer_type = "AmB"
        elif tl == 'fa':
            buffer_type = "FA"
        elif 'tfa' in tl:
            buffer_type = "TFA"

        if 'min' in tl:
            time_str = tl.replace('min', '').replace('p', '.')
            try:
                runtime = f"{round(float(time_str))} min"
            except ValueError:
                # Unparseable time token — keep it verbatim.
                runtime = token

    pieces = [x for x in (buffer_type, runtime) if x]
    return " ".join(pieces) if pieces else basename


# ---------------------------------------------------------------------------
# Tracking narrative helpers
# ---------------------------------------------------------------------------
def _infer_event_from_filename(filename: str) -> Optional[str]:
|
|
78
|
+
"""Infer a reaction event from an LCMS filename suffix."""
|
|
79
|
+
name = filename.lower()
|
|
80
|
+
|
|
81
|
+
# Additional reagent addition
|
|
82
|
+
m = re.search(r'add\s*(\d+)\s*mg\s*(\w+)', name)
|
|
83
|
+
if m:
|
|
84
|
+
amount = m.group(1)
|
|
85
|
+
reagent = m.group(2).upper()
|
|
86
|
+
return f"Additional {reagent} ({amount} mg) was added"
|
|
87
|
+
|
|
88
|
+
if 'addmore' in name:
|
|
89
|
+
return "Additional reagent was added"
|
|
90
|
+
|
|
91
|
+
# Quenching
|
|
92
|
+
m = re.search(r'(\d+)\s*ml\s*me', name)
|
|
93
|
+
if m:
|
|
94
|
+
return f"The reaction was quenched with MeOH ({m.group(1)} mL)"
|
|
95
|
+
|
|
96
|
+
return None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _infer_timepoint_desc(filename: str) -> str:
|
|
100
|
+
"""Extract a timepoint description from filename for narrative use."""
|
|
101
|
+
name = filename.lower()
|
|
102
|
+
|
|
103
|
+
# Remove experiment prefix and extension
|
|
104
|
+
base = os.path.splitext(os.path.basename(name))[0]
|
|
105
|
+
|
|
106
|
+
# Extract time patterns
|
|
107
|
+
m = re.search(r'(\d+)\s*min', base)
|
|
108
|
+
if m:
|
|
109
|
+
if 'premix' in base:
|
|
110
|
+
return f"after {m.group(1)} min premixing"
|
|
111
|
+
return f"after {m.group(1)} min"
|
|
112
|
+
|
|
113
|
+
m = re.search(r'(\d+)\s*h\b', base)
|
|
114
|
+
if m:
|
|
115
|
+
return f"after {m.group(1)} h"
|
|
116
|
+
|
|
117
|
+
if re.search(r'\bON\b', os.path.basename(filename)) or 'overnight' in name:
|
|
118
|
+
return "after overnight stirring"
|
|
119
|
+
|
|
120
|
+
if 'beforeadd' in name:
|
|
121
|
+
return "before adding more reagent"
|
|
122
|
+
|
|
123
|
+
if 'premix' in name:
|
|
124
|
+
return "after premixing"
|
|
125
|
+
|
|
126
|
+
return ""
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _timepoint_for_file(files, file_index: int) -> str:
    """Get the timepoint description for a file by index ('' if out of range)."""
    try:
        entry = files[file_index]
    except IndexError:
        return ""
    return _infer_timepoint_desc(entry.filename)
def build_tracking_narrative(exp, tracking: TrackingAnalysis) -> str:
    """
    Build a concise reaction monitoring narrative from multi-LCMS data.

    Walks the area% timeline of the identified starting material (SM)
    and product (DP) and emits sentences for notable conversion events;
    falls back to a one-line trend summary when no events qualify.
    """
    if not tracking.result or not tracking.identified:
        return ""

    # Locate SM / product compounds (the last match of each role wins,
    # mirroring the assignment order of the scan).
    sm_ic = None
    dp_ic = None
    for cand in tracking.identified:
        if cand.species.role == "substrate":
            sm_ic = cand
        elif cand.species.role == "product":
            dp_ic = cand

    if sm_ic is None and dp_ic is None:
        return ""

    sentences = []

    # Reagent-addition events inferred from tracking filenames
    # (only the first detected addition is reported).
    for lcms in exp.lcms_files:
        if lcms.category != "tracking":
            continue
        event = _infer_event_from_filename(lcms.filename)
        if event:
            sentences.append(event + ".")
            break

    sm_areas = sm_ic.compound.area_pct_by_file if sm_ic else {}
    dp_areas = dp_ic.compound.area_pct_by_file if dp_ic else {}

    prev_conversion = None
    reported_complete = False

    for idx, entry in enumerate(tracking.result.files):
        tp = _infer_timepoint_desc(entry.filename)
        sm_area = sm_areas.get(idx)
        dp_area = dp_areas.get(idx)

        # Conversion estimate: DP/(SM+DP) when both are tracked,
        # otherwise SM consumption when only SM is tracked.
        if sm_ic and dp_ic:
            sa = sm_area if sm_area is not None else 0
            da = dp_area if dp_area is not None else 0
            combined = sa + da
            conversion = (da / combined * 100) if combined > 1.0 else None
        elif sm_ic and sm_area is not None:
            conversion = 100 - sm_area if sm_area < 95 else None
        else:
            conversion = None

        # Premix injections are baselines, not conversion datapoints;
        # skipped files also do not update prev_conversion.
        if conversion is None or 'premix' in entry.filename.lower():
            continue

        if prev_conversion is None:
            # First usable datapoint: report only with a timepoint phrase.
            if conversion > 0 and tp:
                if conversion >= 95:
                    sentences.append(
                        f"LCMS {tp} indicated complete consumption "
                        f"of starting material.")
                    reported_complete = True
                else:
                    sentences.append(
                        f"LCMS {tp} indicated ~{conversion:.0f}% "
                        f"conversion.")
        else:
            delta = conversion - prev_conversion
            if conversion >= 95 and not reported_complete:
                sentences.append(
                    f"LCMS {tp} indicated complete consumption "
                    f"of starting material.")
                reported_complete = True
            elif reported_complete:
                # Completion already reported — stay quiet afterwards.
                pass
            elif abs(delta) < 5:
                # Stalled conversion is mentioned at most once.
                if not any('did not improve' in s for s in sentences):
                    sentences.append(
                        f"Conversion did not improve {tp} "
                        f"(~{conversion:.0f}%).")
            elif delta > 10:
                sentences.append(
                    f"LCMS {tp} showed ~{conversion:.0f}% conversion.")

        prev_conversion = conversion

    if sentences:
        return " ".join(sentences)

    # Fallback: report per-compound trends.
    trend_bits = []
    if sm_ic:
        trend_bits.append(f"SM {sm_ic.compound.trend}")
    if dp_ic:
        trend_bits.append(f"DP {dp_ic.compound.trend}")
    if trend_bits:
        return "LCMS tracking: " + ", ".join(trend_bits) + "."
    return ""


# ---------------------------------------------------------------------------
# Workup / purification inference
# ---------------------------------------------------------------------------
def _infer_workup_steps(exp) -> List[str]:
|
|
249
|
+
"""Infer workup steps from LCMS filenames."""
|
|
250
|
+
steps = []
|
|
251
|
+
seen = set()
|
|
252
|
+
|
|
253
|
+
for lf in exp.lcms_files:
|
|
254
|
+
name = lf.filename.lower()
|
|
255
|
+
|
|
256
|
+
if 'eawash' in name and 'ea_wash' not in seen:
|
|
257
|
+
steps.append("washed with EtOAc")
|
|
258
|
+
seen.add('ea_wash')
|
|
259
|
+
elif 'dcmwash' in name and 'dcm_wash' not in seen:
|
|
260
|
+
steps.append("washed with DCM")
|
|
261
|
+
seen.add('dcm_wash')
|
|
262
|
+
elif 'wash' in name and 'rewash' not in name and 'wash' not in seen:
|
|
263
|
+
steps.append("washed")
|
|
264
|
+
seen.add('wash')
|
|
265
|
+
elif 'rewash' in name and 'rewash' not in seen:
|
|
266
|
+
steps.append("re-washed")
|
|
267
|
+
seen.add('rewash')
|
|
268
|
+
elif 'pellet' in name and 'pellet' not in seen:
|
|
269
|
+
steps.append("precipitate collected")
|
|
270
|
+
seen.add('pellet')
|
|
271
|
+
elif 'super' in name and 'super' not in seen:
|
|
272
|
+
steps.append("supernatant separated")
|
|
273
|
+
seen.add('super')
|
|
274
|
+
|
|
275
|
+
return steps
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def _infer_purification(exp) -> Optional[str]:
|
|
279
|
+
"""Infer purification method from LCMS filenames."""
|
|
280
|
+
names = [lf.filename.lower() for lf in exp.lcms_files]
|
|
281
|
+
all_names = " ".join(names)
|
|
282
|
+
|
|
283
|
+
methods = []
|
|
284
|
+
if 'nppurif' in all_names or 'np-' in all_names:
|
|
285
|
+
methods.append("normal phase column chromatography")
|
|
286
|
+
if 'c18' in all_names:
|
|
287
|
+
methods.append("reversed phase (C18) column chromatography")
|
|
288
|
+
if 'peakcomb' in all_names:
|
|
289
|
+
methods.append("fractions combined based on LCMS")
|
|
290
|
+
|
|
291
|
+
has_purified = any('purified' in n or 'pure' in n for n in names)
|
|
292
|
+
|
|
293
|
+
if methods:
|
|
294
|
+
return "Purified by " + " followed by ".join(methods) + "."
|
|
295
|
+
elif has_purified:
|
|
296
|
+
return "Purified (method not specified in LCMS filenames)."
|
|
297
|
+
|
|
298
|
+
return None
|
|
299
|
+
|
|
300
|
+
# ---------------------------------------------------------------------------
|
|
301
|
+
# Procedure section
|
|
302
|
+
# ---------------------------------------------------------------------------
|
|
303
|
+
|
|
304
|
+
def _strip_nmr_from_procedure(text: str) -> str:
|
|
305
|
+
"""Remove NMR data strings from procedure text.
|
|
306
|
+
|
|
307
|
+
NMR data (e.g. '1H NMR (400 MHz, DMSO-d6) δ ...') belongs in
|
|
308
|
+
CHARACTERIZATION, not PROCEDURE. If the ELN CSV procedure text
|
|
309
|
+
contains NMR data, strip it out to avoid duplication.
|
|
310
|
+
"""
|
|
311
|
+
# Pattern matches "1H NMR (...) δ ..." through to the end of the data
|
|
312
|
+
# (terminated by a period + whitespace/newline, or end of string)
|
|
313
|
+
nmr_pattern = re.compile(
|
|
314
|
+
r'\d+[A-Z]\s+NMR\s*\([^)]+\)\s*[\u03b4\u00b4d]\s*.+?'
|
|
315
|
+
r'(?=\.\s*(?:\d+[A-Z]\s+NMR|\n|$)|\Z)',
|
|
316
|
+
re.DOTALL
|
|
317
|
+
)
|
|
318
|
+
cleaned = nmr_pattern.sub('', text)
|
|
319
|
+
# Clean up leftover whitespace / blank lines from removal
|
|
320
|
+
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
|
|
321
|
+
return cleaned.strip()
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _procedure_has_tracking(text: str) -> bool:
|
|
325
|
+
keywords = ['lcms', 'tracking', 'conversion', 'consumption of sm',
|
|
326
|
+
'reaction complete', 'indicated', 'lc-ms']
|
|
327
|
+
tl = text.lower()
|
|
328
|
+
return any(kw in tl for kw in keywords)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _procedure_has_workup(text: str) -> bool:
|
|
332
|
+
keywords = ['was concentrated', 'was quenched', 'was washed',
|
|
333
|
+
'was extracted', 'aqueous', 'was dried', 'workup']
|
|
334
|
+
tl = text.lower()
|
|
335
|
+
return any(kw in tl for kw in keywords)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _procedure_has_isolation(text: str) -> bool:
|
|
339
|
+
keywords = ['was purified', 'to give', 'to afford',
|
|
340
|
+
'column chromatography', 'chromatography', 'purified by']
|
|
341
|
+
tl = text.lower()
|
|
342
|
+
return any(kw in tl for kw in keywords)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def build_procedure_section(exp, tracking: TrackingAnalysis) -> str:
    """Build the complete PROCEDURE section.

    Assembles, in order: the ELN setup text (NMR stripped out), a
    tracking narrative, workup, purification, and product isolation.
    Each inferred paragraph is added only when the original procedure
    text does not already cover that topic.
    """
    paragraphs = []

    # 1. Setup text from the ELN CSV; NMR belongs in CHARACTERIZATION.
    setup = (_strip_nmr_from_procedure(exp.procedure_text)
             if exp.procedure_text else "")
    paragraphs.append(setup if setup
                      else "[Procedure text not available in CSV.]")

    # 2. Reaction monitoring narrative (if not already present).
    if not _procedure_has_tracking(setup):
        narrative = build_tracking_narrative(exp, tracking)
        if narrative:
            paragraphs.append(narrative)

    # 3. Workup inferred from LCMS filenames (if not already described).
    if not _procedure_has_workup(setup):
        steps = _infer_workup_steps(exp)
        if steps:
            paragraphs.append(
                "The reaction mixture was " + ", ".join(steps) + ".")

    # 4. Purification inferred from LCMS filenames.
    if not _procedure_has_isolation(setup):
        purification = _infer_purification(exp)
        if purification:
            paragraphs.append(purification)

    # 5. Product isolation summary from CSV yield/mass fields.
    if exp.product and not _procedure_has_isolation(setup):
        product = exp.product
        if product.obtained_mass and product.yield_pct:
            paragraphs.append(
                f"Obtained {product.name} "
                f"({product.obtained_mass}, {product.yield_pct}).")
        elif product.obtained_mass:
            paragraphs.append(
                f"Obtained {product.name} "
                f"({product.obtained_mass}).")

    return "\n\n".join(paragraphs)


# ---------------------------------------------------------------------------
# Characterization section
# ---------------------------------------------------------------------------
def _format_lambda_max(wavelengths: List[float]) -> str:
|
|
396
|
+
"""Format UV lambda max values, e.g. 'λmax 218, 254 nm'.
|
|
397
|
+
|
|
398
|
+
Uses arithmetic rounding (half-up) rather than Python's default
|
|
399
|
+
banker's rounding, so 222.5 -> 223 not 222.
|
|
400
|
+
"""
|
|
401
|
+
if not wavelengths:
|
|
402
|
+
return ""
|
|
403
|
+
wl_strs = [str(math.floor(wl + 0.5)) for wl in sorted(wavelengths)]
|
|
404
|
+
return f"\u03bbmax {', '.join(wl_strs)} nm"
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _format_purity(purified: PurifiedAnalysis) -> str:
    """Format purity from all available detectors ('' if none reported)."""
    readings = (
        ("TAC", purified.purity_tac),
        ("220nm", purified.purity_220nm),
        ("254nm", purified.purity_254nm),
    )
    fields = [f"{label} {value:.0f}%"
              for label, value in readings if value is not None]
    return "; purity " + ", ".join(fields) if fields else ""
def build_characterization_section(
    exp,
    expected: List[ExpectedSpecies],
    tracking: TrackingAnalysis,
    purified: PurifiedAnalysis,
) -> str:
    """Build the CHARACTERIZATION section with LCMS and NMR data.

    Reports tracking-LCMS species (ordered SM, product, reactant),
    purified/crude-product LCMS peaks (deduplicated per species, purity
    appended), and any NMR data strings. Entries below
    MIN_REPORT_AREA_PCT are omitted.
    """
    if not expected:
        return "[Expected species masses not available — cannot annotate LCMS.]"

    def _annotate(name, rt, adduct, mz, lambda_max):
        # One "<name> RT <rt>, <adduct> <mz>[, λmax ...]" fragment.
        text = f"{name} RT {rt:.2f}, {adduct} {mz:.1f}"
        lm = _format_lambda_max(lambda_max)
        return f"{text}, {lm}" if lm else text

    lines = []

    # --- Tracking LCMS: identified species from the multi-file analysis.
    if tracking.result and tracking.identified:
        role_rank = {"substrate": 0, "product": 1, "reactant": 2}
        ordered = sorted(
            tracking.identified,
            key=lambda ic: (role_rank.get(ic.species.role, 3),
                            ic.compound.canonical_rt))
        fragments = [
            _annotate(ic.species.name, ic.compound.canonical_rt,
                      ic.adduct, ic.matched_mz, ic.compound.uv_lambda_max)
            for ic in ordered
            if ic.compound.max_area >= MIN_REPORT_AREA_PCT
        ]
        if fragments:
            lines.append(
                f"Reaction tracking LCMS ({tracking.result.instrument}, "
                f"{tracking.result.method_short}): " + "; ".join(fragments))

    product_label = ("Crude product LCMS" if purified.is_crude_fallback
                     else "Purified product LCMS")

    # --- Product LCMS with MS identifications: dedupe per species,
    # keeping the largest-area peak above the reporting threshold.
    if purified.report and purified.identified:
        best: Dict[str, IdentifiedPeak] = {}
        for ip in purified.identified:
            area = ip.peak.area_pct or 0
            if area < MIN_REPORT_AREA_PCT:
                continue
            incumbent = best.get(ip.species.name)
            if incumbent is None or area > (incumbent.peak.area_pct or 0):
                best[ip.species.name] = ip

        fragments = [
            _annotate(ip.species.name, ip.peak.rt, ip.adduct,
                      ip.matched_mz, ip.peak.uv_lambda_max)
            for ip in best.values()
        ]
        if fragments:
            lines.append(
                f"{product_label} ({purified.report.instrument}, "
                f"{purified.report.method_short}): "
                + "; ".join(fragments) + _format_purity(purified))

    # --- Product LCMS without MS identifications: report the dominant
    # peak's RT/UV/purity when it is large enough to plausibly be the
    # purified product.
    elif purified.report and not purified.identified:
        dominant = None
        dominant_area = 0.0
        for peak in purified.report.peaks:
            area = peak.area_pct or 0
            if area > dominant_area:
                dominant_area = area
                dominant = peak

        if dominant is not None and dominant_area >= MIN_REPORT_AREA_PCT:
            entry = f"RT {dominant.rt:.2f}"
            lm = _format_lambda_max(dominant.uv_lambda_max)
            if lm:
                entry += f", {lm}"

            # Purity built directly from the dominant peak's detectors.
            detector_fields = []
            if dominant.area_pct is not None:
                detector_fields.append(f"TAC {dominant.area_pct:.0f}%")
            if dominant.area_pct_220nm is not None:
                detector_fields.append(f"220nm {dominant.area_pct_220nm:.0f}%")
            if dominant.area_pct_254nm is not None:
                detector_fields.append(f"254nm {dominant.area_pct_254nm:.0f}%")
            purity_str = ("; purity " + ", ".join(detector_fields)
                          if detector_fields else "")

            lines.append(
                f"{product_label} ({purified.report.instrument}, "
                f"{purified.report.method_short}): "
                f"{entry}{purity_str} [no MS data]")

    # --- NMR data strings pass through verbatim.
    lines.extend(exp.nmr_data)

    if not lines:
        return "[No characterization data available.]"
    return "\n\n".join(lines)


# ---------------------------------------------------------------------------
# Notes section
# ---------------------------------------------------------------------------
def build_notes_section(
    exp,
    expected: List[ExpectedSpecies],
    tracking: TrackingAnalysis,
    purified: PurifiedAnalysis,
) -> str:
    """Build the NOTES section with observations and inferences.

    Covers: the per-file conversion timeline, significant unidentified
    tracking compounds, unknown product peaks, the product-LCMS source
    file, mass provenance, analysis warnings/exclusions, an LCMS file
    census, and missing-data flags.
    """
    notes = []

    # Conversion timeline from multi-LCMS tracking data.
    if tracking.result and tracking.identified:
        sm_ic = next((ic for ic in tracking.identified
                      if ic.species.role == "substrate"), None)
        dp_ic = next((ic for ic in tracking.identified
                      if ic.species.role == "product"), None)

        if sm_ic or dp_ic:
            for idx, entry in enumerate(tracking.result.files):
                sm_area = (sm_ic.compound.area_pct_by_file.get(idx)
                           if sm_ic else None)
                dp_area = (dp_ic.compound.area_pct_by_file.get(idx)
                           if dp_ic else None)

                # conversion = 1 - SM/(SM+DP); a missing DP maps to 0%
                # and a missing SM to 100%.
                conv_str = None
                if sm_area is not None and dp_area is not None:
                    combined = sm_area + dp_area
                    if combined > 0:
                        conv = (1.0 - sm_area / combined) * 100
                        conv_str = f"conversion {conv:.0f}%"
                elif sm_area is not None:
                    conv_str = "conversion 0%"
                elif dp_area is not None:
                    conv_str = "conversion 100%"

                if conv_str is not None:
                    tp = _infer_timepoint_desc(entry.filename)
                    notes.append(f"- {entry.filename}: {conv_str}"
                                 f"{' ' + tp if tp else ''}")

    # Significant unidentified compounds in the tracking runs.
    for comp in tracking.unidentified or []:
        if comp.max_area < MIN_REPORT_AREA_PCT:
            continue
        ion_bits = []
        for ion in (comp.recurring_ions or comp.other_ions)[:3]:
            polarity = "ESI+" if ion.mode == "ES+" else "ESI-"
            ion_bits.append(f"{polarity} {ion.mean_mz:.1f}")
        ions_desc = ", ".join(ion_bits) if ion_bits else "no ions"
        notes.append(
            f"- Unidentified compound RT {comp.canonical_rt:.2f} "
            f"({comp.trend}, max {comp.max_area:.1f}%): {ions_desc}")

    # Peaks in the product LCMS that matched no expected species.
    if purified.report and purified.identified:
        known_rts = {ip.peak.rt for ip in purified.identified}
        product_type = ("crude product" if purified.is_crude_fallback
                        else "purified product")
        for peak in purified.report.peaks:
            if peak.rt in known_rts or (peak.area_pct or 0) <= 1.0:
                continue
            ion_bits = []
            for spec in peak.ms_spectra:
                if spec.top_ions:
                    polarity = "ESI+" if spec.mode == "ES+" else "ESI-"
                    ion_bits.append(f"{polarity} {spec.top_ions[0]:.1f}")
            ions_desc = ", ".join(ion_bits) if ion_bits else "no MS"
            notes.append(
                f"- Unknown peak in {product_type}: RT {peak.rt:.2f}, "
                f"{peak.area_pct:.1f}% ({ions_desc})")

    # Which file the product-LCMS annotation came from.
    if purified.file_info:
        label = ("Crude product LCMS" if purified.is_crude_fallback
                 else "Purified product LCMS")
        notes.append(f"- {label}: {purified.file_info.filename}")

    # Provenance of the expected masses (structure file vs CSV MW).
    source_files = {sp.source_file for sp in expected if sp.source_file}
    if source_files:
        source = next(iter(source_files))
        notes.append(f"- Expected masses from structure file "
                     f"({os.path.basename(source)})")
    elif expected:
        notes.append("- Expected masses from CSV MW (no CDX/RXN available)")

    # Warnings and excluded files from the multi-LCMS analysis.
    if tracking.result:
        for warning in tracking.result.warnings or []:
            notes.append(f"- LCMS analysis: {warning}")
        if tracking.result.discarded_files:
            discarded = [f.filename
                         for f in tracking.result.discarded_files]
            notes.append(
                f"- {len(discarded)} LCMS file(s) from different instrument/method "
                f"excluded from tracking: {', '.join(discarded)}")

    # LCMS file census by category.
    categories = {}
    for lf in exp.lcms_files:
        categories.setdefault(lf.category, []).append(lf.filename)
    if categories:
        breakdown = ', '.join(f'{len(v)} {k}'
                              for k, v in categories.items())
        notes.append(f"- LCMS files: {len(exp.lcms_files)} total "
                     f"({breakdown})")

    # Missing-data flags.
    if not exp.procedure_text:
        notes.append("- Procedure text was empty")
    if not exp.lcms_files:
        notes.append("- No LCMS files found")
    if not exp.nmr_pdfs:
        notes.append("- No NMR PDFs found")
    elif not exp.nmr_data:
        notes.append("- NMR PDFs present but no reported data string found "
                     "— needs manual extraction")
    if exp.product and not exp.product.obtained_mass:
        notes.append("- Yield/mass obtained not recorded in CSV")
    if not expected:
        notes.append("- No expected species masses available")

    return "\n".join(notes) if notes else "No notes."


# ---------------------------------------------------------------------------
# Output assembly
# ---------------------------------------------------------------------------
def assemble_output(procedure: str, characterization: str,
                    notes: str) -> str:
    """Assemble the three sections into the final lab-book entry."""
    def _banner(title: str) -> List[str]:
        # Section title framed by separator rules, then a blank line.
        return [SECTION_SEP, title, SECTION_SEP, ""]

    chunks = (
        _banner("PROCEDURE") + [procedure, ""]
        + _banner("CHARACTERIZATION") + [characterization, ""]
        + _banner("NOTES") + [notes]
    )
    return "\n".join(chunks)


# ---------------------------------------------------------------------------
# CLI placeholder
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # This module is a library; the CLI entry point lives in
    # procedure_writer.py.
    print("lab_book_formatter: no standalone CLI — "
          "import from procedure_writer.py")