cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1404 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
scheme_refine.py — Tier 2 LLM-based refinement of scheme_reader output.
|
|
4
|
+
|
|
5
|
+
Takes a SchemeDescription (Tier 1 deterministic output) and produces a refined
|
|
6
|
+
version with corrections from an LLM. The module:
|
|
7
|
+
|
|
8
|
+
1. Generates a structured prompt with the Tier 1 JSON + context.
|
|
9
|
+
2. Accepts a correction dict (from any LLM — Claude API, local model, etc.).
|
|
10
|
+
3. Applies corrections to produce a refined SchemeDescription.
|
|
11
|
+
|
|
12
|
+
The correction format is designed to be simple and LLM-friendly:
|
|
13
|
+
|
|
14
|
+
{
|
|
15
|
+
"content_type": "synthesis", # override content type
|
|
16
|
+
"topology": "linear", # override topology
|
|
17
|
+
"species_corrections": {
|
|
18
|
+
"species_5": {"text_category": "condition_ref"},
|
|
19
|
+
"species_8": {"text_category": "citation"},
|
|
20
|
+
},
|
|
21
|
+
"narrative_override": "...", # replace narrative entirely
|
|
22
|
+
"notes": "..." # free-form LLM reasoning
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
CLI:
|
|
26
|
+
# Generate prompt for LLM review
|
|
27
|
+
python -m cdxml_toolkit.perception.scheme_refine prompt scheme.json
|
|
28
|
+
|
|
29
|
+
# Apply corrections
|
|
30
|
+
python -m cdxml_toolkit.perception.scheme_refine apply scheme.json corrections.json -o refined.json
|
|
31
|
+
|
|
32
|
+
Python API:
|
|
33
|
+
from cdxml_toolkit.perception.scheme_refine import generate_prompt, apply_corrections
|
|
34
|
+
prompt = generate_prompt(desc)
|
|
35
|
+
refined = apply_corrections(desc, corrections_dict)
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
from __future__ import annotations
|
|
39
|
+
|
|
40
|
+
import argparse
|
|
41
|
+
import json
|
|
42
|
+
import os
|
|
43
|
+
import re
|
|
44
|
+
import sys
|
|
45
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
46
|
+
|
|
47
|
+
from .scheme_reader import SchemeDescription, SpeciesRecord, ScopeEntry
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
# Aligned IUPAC name enrichment
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
|
|
54
|
+
def enrich_aligned_names(desc: SchemeDescription, verbose: bool = False) -> int:
    """Replace canonical IUPAC names with aligned alternatives per step.

    For each step, every reactant/product pair of ``fragment`` species that
    carry SMILES is passed to ``find_aligned_names()``.  A species'
    ``iupac_name`` is overwritten whenever a pair yields a name with a higher
    ``best_similarity`` than any pair seen so far for that species in this
    call.  Alignment quality is not used as a filter: higher-similarity
    pairs simply win, and a low-similarity name is still preferred over
    showing raw SMILES.

    Transformation diffs produced by ``format_name_diff()`` are stored in
    ``desc._alignment_diffs`` keyed by ``(reactant_id, product_id)`` for
    display in the narrative.

    A second pass (``_name_rgroup_cores``) then names species that are still
    unnamed and contain R-group placeholders.

    Requires ``cdxml_toolkit.naming.aligned_namer`` (which in turn requires
    ChemScript).  Returns 0 silently if that module cannot be imported.

    Parameters
    ----------
    desc : SchemeDescription
        Parsed scheme with species SMILES populated.  Modified in place.
    verbose : bool
        Print alignment progress to stderr.

    Returns
    -------
    int
        Number of distinct species whose ``iupac_name`` was updated by the
        alignment pass, plus the count reported by the R-group pass.
    """
    try:
        from ..naming.aligned_namer import find_aligned_names, format_name_diff
    except Exception:
        # Optional dependency: enrichment is best-effort by design.
        return 0

    # species_id -> best similarity that has produced its current name.
    # Its size is the number of distinct species renamed by this pass.
    best_sim: Dict[str, float] = {}

    # Store transformation diffs: (reactant_id, product_id) -> diff string
    if not hasattr(desc, '_alignment_diffs'):
        desc._alignment_diffs = {}

    def _fragments_with_smiles(species_ids):
        """Return the drawn-structure species among *species_ids* that have SMILES."""
        found = (desc.species.get(sid) for sid in species_ids)
        return [sp for sp in found
                if sp and sp.smiles and sp.element_type == "fragment"]

    for step in desc.steps:
        reactants = _fragments_with_smiles(step.reactant_ids)
        products = _fragments_with_smiles(step.product_ids)
        if not reactants or not products:
            continue

        # Pair each reactant with each product
        for r_sp in reactants:
            for p_sp in products:
                try:
                    result = find_aligned_names(r_sp.smiles, p_sp.smiles,
                                                verbose=verbose)
                except Exception as exc:
                    # One failing pair must not abort the whole enrichment.
                    if verbose:
                        print(f" Alignment failed {r_sp.id} \u2194 {p_sp.id}: "
                              f"{exc}", file=sys.stderr)
                    continue

                quality = result.alignment_quality
                sim = result.best_similarity

                # Update reactant name if this alignment is better
                if (result.best_sm_name
                        and sim > best_sim.get(r_sp.id, -1)):
                    r_sp.iupac_name = result.best_sm_name
                    best_sim[r_sp.id] = sim

                # Update product name if this alignment is better
                if (result.best_prod_name
                        and sim > best_sim.get(p_sp.id, -1)):
                    p_sp.iupac_name = result.best_prod_name
                    best_sim[p_sp.id] = sim

                # Store the transformation diff for this pair
                if result.best_sm_name and result.best_prod_name:
                    try:
                        diff_str = format_name_diff(
                            result.best_sm_name, result.best_prod_name)
                        if diff_str and diff_str != "(identical)":
                            desc._alignment_diffs[
                                (r_sp.id, p_sp.id)] = diff_str
                    except Exception:
                        # The diff is display-only; the names are still usable.
                        pass

                if verbose:
                    print(f" Aligned [{quality}] {r_sp.id} \u2194 {p_sp.id}: "
                          f"sim={sim:.2f}", file=sys.stderr)
                    print(f" SM: {result.best_sm_name}",
                          file=sys.stderr)
                    print(f" Prod: {result.best_prod_name}",
                          file=sys.stderr)

    # Second pass: name R-group species by replacing * with H and naming
    # the "core" structure.
    return len(best_sim) + _name_rgroup_cores(desc, verbose=verbose)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# Bracketed R-group placeholders as they appear in SMILES strings.
# Compound-label brackets such as [2.21] are not matched by these patterns.
_RGROUP_ALTERNATIVES = (
    r'\[R\d*\'*\]',      # [R], [R'], [R3], [R'']
    r'\[OR\'*\]',        # [OR'], [OR'']
    r'\[Het\]',          # [Het]
    r'\[COOR\'*\]',      # [COOR''], [COOR']
    r'\[F,Cl,Br,I\]',    # halide variable
)

# Finds R-group placeholder tokens anywhere inside a SMILES string.
_RGROUP_TOKEN_RE = re.compile('|'.join(_RGROUP_ALTERNATIVES))

# ChemDraw generic group abbreviations that prevent SMILES parsing.
# A SMILES containing one of these comes from a generic methodology scheme
# (protecting groups, leaving groups, etc.).
_CHEMDRAW_ABBREV_RE = re.compile('|'.join((
    r'\[G\]',            # generic group
    r'\[LG\]',           # leaving group
    r'\[P\d*\'*\]',      # protecting group [P], [P1'], [P']
    r'\[EWG\]',          # electron-withdrawing group
    r'\[EDG\]',          # electron-donating group
    r'\[Nu\]',           # nucleophile
    r'\[E\+?\]',         # electrophile
    r'\[Base\]',         # base
)))

# Matches only when the ENTIRE string is one R-group token (or a lone "*").
_BARE_RGROUP_RE = re.compile('|'.join(
    f'^{alternative}$' for alternative in _RGROUP_ALTERNATIVES + (r'\*',)
))
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# Dummy atoms written as SMILES bracket atoms: [*], [*:1], [1*], [1*:2].
# These must be swapped for [H] as a unit; replacing only the inner "*"
# would leave an unparseable "[[H]:1]".
_BRACKETED_DUMMY_RE = re.compile(r'\[\d*\*(?::\d+)?\]')


def _name_rgroup_cores(desc: SchemeDescription,
                       verbose: bool = False) -> int:
    """Name species that contain R-group atoms by naming the H-core.

    Handles dummy-atom notation (``*``, ``[*]``, ``[*:1]``, ``[1*]``),
    bracketed R-group labels (``[R]``, ``[R']``, ``[R3]``, ``[OR']``,
    ``[Het]``, etc.) and ChemDraw generic abbreviations (``[P]``, ``[LG]``,
    ...).  Species that already have an ``iupac_name`` or have no SMILES
    are left untouched.

    For each remaining species:

    * a SMILES that is nothing but one placeholder is named after the
      placeholder itself (``[R3]`` -> ``R3``, ``*`` -> ``R``);
    * otherwise every placeholder is replaced with ``[H]``, the result is
      canonicalised with RDKit and named via ``ChemScriptBridge.get_name()``.
      The stored name is ``"R-substituted <core name>"`` for one placeholder
      and ``"<core name> derivative"`` for two or more;
    * if the core cannot be parsed or named, the species is labelled
      ``"generic intermediate"``.

    Parameters
    ----------
    desc : SchemeDescription
        Scheme whose ``species`` records are updated in place.
    verbose : bool
        Print progress (and skipped species) to stderr.

    Returns
    -------
    int
        Number of species updated.  0 if RDKit or the ChemScript bridge is
        unavailable.
    """
    try:
        from rdkit import Chem
        from ..chemdraw.chemscript_bridge import ChemScriptBridge
        bridge = ChemScriptBridge()
    except Exception:
        # Optional dependencies: naming is best-effort by design.
        return 0

    n = 0
    for sid, sp in desc.species.items():
        if getattr(sp, 'iupac_name', None):
            continue  # already named
        smiles = sp.smiles
        if not smiles:
            continue

        # Check for R-group tokens (both * and [R]-style) and
        # ChemDraw generic abbreviations ([P], [G], [LG], etc.)
        has_star = '*' in smiles
        rgroup_matches = _RGROUP_TOKEN_RE.findall(smiles)
        abbrev_matches = _CHEMDRAW_ABBREV_RE.findall(smiles)
        if not (has_star or rgroup_matches or abbrev_matches):
            continue

        try:
            # Bare placeholder: just show the label directly
            stripped = smiles.strip()
            if (_BARE_RGROUP_RE.match(stripped)
                    or _BRACKETED_DUMMY_RE.fullmatch(stripped)):
                # [R] -> R, [R3] -> R3; any dummy-atom form -> R
                label = stripped.strip('[]')
                if '*' in label:
                    label = 'R'
                sp.iupac_name = label
                n += 1
                continue

            # Replace all R-group and abbreviation tokens with [H].
            # Bracketed dummies go before the bare "*" replacement so the
            # brackets are consumed together with the star.
            core_smiles = _RGROUP_TOKEN_RE.sub('[H]', smiles)
            core_smiles = _CHEMDRAW_ABBREV_RE.sub('[H]', core_smiles)
            core_smiles = _BRACKETED_DUMMY_RE.sub('[H]', core_smiles)
            core_smiles = core_smiles.replace('*', '[H]')

            mol = Chem.MolFromSmiles(core_smiles)
            if mol is None:
                # Can't parse even after stripping — label as generic
                sp.iupac_name = "generic intermediate"
                n += 1
                if verbose:
                    print(f" R-group core: {sid} -> generic intermediate "
                          f"(unparseable: {smiles[:50]})", file=sys.stderr)
                continue

            core_canon = Chem.MolToSmiles(mol)
            core_name = bridge.get_name(core_canon)
            if not core_name:
                # Naming the core failed — still better than raw SMILES
                sp.iupac_name = "generic intermediate"
                n += 1
                continue

            # Each dummy atom contains exactly one "*", so counting stars
            # covers both the bare and the bracketed forms.
            n_rgroups = (len(rgroup_matches) + len(abbrev_matches)
                         + smiles.count('*'))
            if n_rgroups == 1:
                sp.iupac_name = f"R-substituted {core_name}"
            else:
                sp.iupac_name = f"{core_name} derivative"
            n += 1

            if verbose:
                print(f" R-group core: {sid} -> {sp.iupac_name}",
                      file=sys.stderr)
        except Exception as exc:
            # Best-effort: one bad species must not stop the others, but
            # make the failure visible when the caller asked for progress.
            if verbose:
                print(f" R-group core: {sid} -> skipped ({exc})",
                      file=sys.stderr)
            continue

    return n
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
# ---------------------------------------------------------------------------
|
|
295
|
+
# Prompt generation
|
|
296
|
+
# ---------------------------------------------------------------------------
|
|
297
|
+
|
|
298
|
+
def generate_prompt(desc: SchemeDescription,
                    image_path: Optional[str] = None) -> str:
    """Render a Tier 1 result as a Markdown prompt asking an LLM for fixes.

    The prompt contains the scheme's topology/content-type summary, a
    species table, one bullet per reaction step, the current narrative and
    a JSON template describing the accepted correction format.

    Parameters
    ----------
    desc : SchemeDescription
        Tier 1 deterministic output.
    image_path : str, optional
        Path to rendered scheme image (for vision models).  When given it
        is mentioned near the top of the prompt.

    Returns
    -------
    str
        Structured prompt text.
    """
    def _id_list(ids) -> str:
        # An empty id list is shown explicitly rather than as "[]".
        return ", ".join(ids) or "(none)"

    correction_template = {
        "content_type": "<correct type: synthesis | sar_design | "
                        "biological_pathway | target_array | "
                        "literature_comparison | composite | investigation>",
        "topology": "<correct topology if wrong>",
        "species_corrections": {
            "<species_id>": {
                "text_category": "<condition_ref | citation | bioactivity | "
                                 "chemical | conditions_block>"
            }
        },
        "narrative_override": "<better narrative if the current one is wrong>",
        "notes": "<your reasoning>"
    }

    # Header
    lines = [
        "# Scheme Refinement Task\n",
        "You are reviewing the output of a deterministic chemical "
        "scheme parser. Your job is to identify and correct any "
        "misclassifications in the structured output.\n",
    ]
    if image_path:
        lines.append(f"**Rendered image**: {image_path}\n")

    # Summary and species table header
    lines += [
        "## Tier 1 Parser Output\n",
        f"- **Topology**: {desc.topology}",
        f"- **Content type**: {desc.content_type or 'unknown'}",
        f"- **Steps**: {desc.num_steps}",
        f"- **Species**: {len(desc.species)}\n",
        "### Species Registry\n",
        "| ID | Type | Category | Label | Name (first 60 chars) | "
        "SMILES (first 40 chars) | MW |",
        "|" + "|".join(["---"] * 7) + "|",
    ]

    # One table row per species; names are truncated before newlines are
    # flattened so the row stays on a single line.
    for sp_id, sp in desc.species.items():
        name_cell = (sp.name or "")[:60].replace("\n", " / ")
        smiles_cell = (sp.smiles or "")[:40]
        lines.append(
            f"| {sp_id} | {sp.element_type} | {sp.text_category or '-'} | "
            f"{sp.label or '-'} | {name_cell} | {smiles_cell} | "
            f"{sp.mw or '-'} |"
        )

    # Steps summary
    lines.append("\n### Reaction Steps\n")
    lines.extend(
        f"- **Step {step.step_index}**: "
        f"R=[{_id_list(step.reactant_ids)}] → P=[{_id_list(step.product_ids)}] | "
        f"reagents=[{_id_list(step.reagent_ids)}] | "
        f"conditions={step.conditions} | "
        f"yield={step.yield_text or '-'} | "
        f"arrow={step.arrow_style}"
        for step in desc.steps
    )

    # Narrative, instructions and correction template
    lines += [
        f"\n### Current Narrative\n{desc.narrative}\n",
        "## Your Task\n",
        "Review the above and return a JSON correction object with "
        "any needed fixes. Only include fields that need changing.\n",
        "Correction format:\n```json",
        json.dumps(correction_template, indent=2),
        "```\n",
        "If the Tier 1 output is correct, return: `{}`\n",
    ]

    return "\n".join(lines)
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
# ---------------------------------------------------------------------------
|
|
388
|
+
# Apply corrections
|
|
389
|
+
# ---------------------------------------------------------------------------
|
|
390
|
+
|
|
391
|
+
def apply_corrections(desc: SchemeDescription,
                      corrections: Dict[str, Any]) -> SchemeDescription:
    """Apply LLM corrections to a SchemeDescription.

    Returns a new SchemeDescription with corrections applied.
    The original is not modified: the copy is made through a
    ``to_dict()`` / ``from_dict()`` round-trip.

    Recognised keys in *corrections* (all optional):

    * ``content_type`` / ``topology`` — replace the corresponding attribute.
    * ``species_corrections`` — mapping of species id to a dict that may
      override ``text_category``, ``name``, ``smiles`` and ``label``.
      Unknown species ids and malformed (non-dict) entries are skipped.
    * ``narrative_override`` — replaces the narrative.  When absent, the
      narrative is regenerated from the corrected data.

    Because the dict usually comes straight from an LLM, a JSON ``null``
    for ``content_type``, ``topology``, ``species_corrections`` or
    ``narrative_override`` is treated the same as the key being absent.

    Parameters
    ----------
    desc : SchemeDescription
        Tier 1 deterministic output.
    corrections : dict
        LLM correction dict (see module docstring for format).

    Returns
    -------
    SchemeDescription
        Refined description.
    """
    # Deep copy via dict round-trip
    refined = SchemeDescription.from_dict(desc.to_dict())

    if not corrections:
        return refined

    # Scheme-level overrides
    for attr in ("content_type", "topology"):
        value = corrections.get(attr)
        if value is not None:
            setattr(refined, attr, value)

    # Per-species overrides
    species_fixes = corrections.get("species_corrections") or {}
    if isinstance(species_fixes, dict):
        for sp_id, fixes in species_fixes.items():
            sp = refined.species.get(sp_id)
            if sp is None or not isinstance(fixes, dict):
                continue
            for field in ("text_category", "name", "smiles", "label"):
                # Key presence (not truthiness) decides, so a field can be
                # deliberately cleared.
                if field in fixes:
                    setattr(sp, field, fixes[field])

    # Narrative: explicit override wins, otherwise rebuild from the
    # corrected data.
    narrative = corrections.get("narrative_override")
    if narrative is not None:
        refined.narrative = narrative
    else:
        from .scheme_reader import _generate_narrative
        refined.narrative = _generate_narrative(refined)

    return refined
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
# ---------------------------------------------------------------------------
|
|
452
|
+
# Batch refinement: apply a corrections file to multiple schemes
|
|
453
|
+
# ---------------------------------------------------------------------------
|
|
454
|
+
|
|
455
|
+
def load_corrections_file(path: str) -> Dict[str, Dict[str, Any]]:
    """Read a UTF-8 JSON corrections file and return its parsed content.

    The file is expected to map source filenames to correction dicts:

        {
            "oleObject1.cdxml": { ... corrections ... },
            "oleObject2.cdxml": { ... corrections ... },
        }

    The parsed JSON is returned as-is; no structural validation is done.
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def refine_scheme(desc: SchemeDescription,
                  corrections: Optional[Dict[str, Any]] = None) -> SchemeDescription:
    """Return *desc* refined by *corrections*, if any were supplied.

    With a non-empty corrections dict the work is delegated to
    ``apply_corrections``.  With ``None`` or an empty dict the very same
    *desc* object is handed back untouched.
    """
    if not corrections:
        return desc
    return apply_corrections(desc, corrections)
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
# ---------------------------------------------------------------------------
|
|
481
|
+
# LLM-quality narrative generation
|
|
482
|
+
# ---------------------------------------------------------------------------
|
|
483
|
+
|
|
484
|
+
# Reagent -> reaction type mapping: (pattern, reaction_name, notes).
#
# _classify_reaction() walks this list in order and stops at the first entry
# whose pattern matches AND whose reaction_name is not None, so the order of
# entries is significant.  Entries with reaction_name=None never produce a
# classification from this table.  All patterns are case-insensitive.
_REACTION_PATTERNS: List[Tuple[re.Pattern, Optional[str], str]] = [
    # Pd-catalysed cross-couplings (generic Pd source; no name assigned here)
    (re.compile(r"Pd.*(?:dba|PPh3|dppf|dppp|OAc|Cl2)", re.I),
     None, "Pd-catalysed"),  # refined below
    # Buchwald-Hartwig: recognised by its typical phosphine ligands
    (re.compile(r"(?:BINAP|XPhos|SPhos|DavePhos|RuPhos|BrettPhos|JohnPhos"
                r"|XantPhos|t-?Bu[23]?P)", re.I),
     "Buchwald-Hartwig amination", "Pd/ligand system"),
    # Suzuki: boronic acid / ester / trifluoroborate partner
    (re.compile(r"B\(OH\)2|boronic|Bpin|BF3K|potassium trifluoroborate", re.I),
     "Suzuki coupling", "boronic acid coupling partner"),
    # Sonogashira: named explicitly, or Pd and CuI mentioned together
    (re.compile(r"(?:Sonogashira|CuI.*Pd|PdCl2.*CuI)", re.I),
     "Sonogashira coupling", ""),
    # Heck
    (re.compile(r"(?:Heck|acrylate.*Pd|Pd.*vinyl)", re.I),
     "Heck reaction", ""),
    # NBS bromination
    (re.compile(r"\bNBS\b", re.I),
     "NBS bromination", "electrophilic aromatic bromination"),
    # Boc deprotection: TFA/DCM, HCl in dioxane/Et2O/MeOH, or explicit wording
    (re.compile(r"\bTFA\b.*(?:DCM|CH2Cl2)|HCl.*(?:dioxane|Et2O|MeOH)|"
                r"Boc.*(?:deprot|remov)", re.I),
     "Boc deprotection", "acidic removal of tert-butoxycarbonyl"),
    # Cbz deprotection / hydrogenolysis
    (re.compile(r"H2.*Pd/?C|Pd/?C.*H2|hydrogenolysis|Cbz.*deprot", re.I),
     "hydrogenolysis", "Pd/C-catalysed H2 reduction"),
    # Amide coupling: common coupling reagents
    (re.compile(r"\b(?:HATU|HBTU|EDCI|EDC|DCC|T3P|COMU|PyBOP|TBTU|HOBt"
                r"|HOAt|TFFH|SOCl2.*amine|CDI)\b", re.I),
     "amide coupling", "peptide bond formation"),
    # Reductive amination
    (re.compile(r"NaBH(?:3CN|OAc|\(OAc\)3)|reductive amin", re.I),
     "reductive amination", "imine formation + reduction"),
    # Mitsunobu
    # NOTE(review): DMAP is not a typical Mitsunobu reagent — confirm whether
    # another azodicarboxylate (e.g. DBAD) was intended here.
    (re.compile(r"(?:DIAD|DEAD|DMAP).*PPh3|Mitsunobu", re.I),
     "Mitsunobu reaction", "stereoinversion of alcohol"),
    # Grignard
    (re.compile(r"\bMgBr\b|\bMgCl\b|Grignard", re.I),
     "Grignard addition", "organomagnesium addition"),
    # Wittig / HWE
    (re.compile(r"(?:Wittig|ylide|PPh3.*CHO|HWE|Horner)", re.I),
     "Wittig/HWE olefination", ""),
    # SNAr (no name assigned: base/DMF alone is ambiguous)
    (re.compile(r"(?:SNAr|nucleophilic aromatic|K2CO3.*DMF|Cs2CO3.*DMF"
                r"|NaH.*DMF)", re.I),
     None, ""),  # needs context to distinguish from Buchwald
    # Reduction (general hydride reagents)
    (re.compile(r"\bLiAlH4\b|LiAlH\(OtBu\)3|NaBH4|DIBAL", re.I),
     "reduction", "hydride reduction"),
    # Oxidation
    (re.compile(r"\b(?:mCPBA|Dess.?Martin|Swern|TEMPO|PDC|PCC|Jones)\b", re.I),
     "oxidation", ""),
    # Halogenation
    (re.compile(r"\bNCS\b", re.I),
     "NCS chlorination", "electrophilic aromatic chlorination"),
    # Alkylation: a base followed later in the text by an alkylating agent
    (re.compile(r"\b(?:NaH|K2CO3|Cs2CO3)\b.*(?:alkyl|benzyl|methyl|BnBr"
                r"|MeI|allyl)", re.I),
     "alkylation", "base-mediated alkylation"),
    # Ring closure / cyclisation
    (re.compile(r"(?:exo-trig|exo-dig|endo-trig|endo-dig|cycliz|ring.?clos"
                r"|lacton)", re.I),
     "cyclisation", "intramolecular ring closure"),
]
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def _build_reaction_smiles(step, species: Dict[str, SpeciesRecord]) -> Optional[str]:
    """Assemble a ``lhs>>rhs`` reaction SMILES for one step.

    The left-hand side joins (with ``.``) the SMILES of the step's
    reactants followed by its reagents; the right-hand side joins the
    SMILES of its products.  Ids missing from *species* and species
    without SMILES are skipped.

    Returns None when either side ends up empty.
    """
    def _smiles_of(species_ids):
        records = (species.get(sid) for sid in species_ids)
        return [rec.smiles for rec in records if rec and rec.smiles]

    left = _smiles_of([*step.reactant_ids, *step.reagent_ids])
    right = _smiles_of(step.product_ids)

    if left and right:
        return ">>".join((".".join(left), ".".join(right)))
    return None
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
# Scans a SMILES string atom-wise for nitrogen: a whole bracket atom
# (captured), or a bare "N" that is either not followed by a lowercase
# letter or is followed by an aromatic c/n/o/s atom.
_NITROGEN_SCAN_RE = re.compile(r"\[([^\]]*)\]|N(?![a-z])|N(?=[cnos])")
# Inside a bracket atom: optional isotope digits, then "N" as a complete
# element symbol (so Na, Ni, Nu, ... are excluded).
_BRACKET_NITROGEN_RE = re.compile(r"\d*N(?![a-z])")


def _smiles_has_nitrogen(smiles: str) -> bool:
    """Return True if *smiles* contains a nitrogen atom symbol."""
    for match in _NITROGEN_SCAN_RE.finditer(smiles):
        bracket = match.group(1)
        if bracket is None:
            return True  # bare N outside any bracket atom
        if _BRACKET_NITROGEN_RE.match(bracket):
            return True
    return False


def _classify_reaction(condition_text_raw: List[str],
                       reagent_species: List[SpeciesRecord],
                       desc: SchemeDescription,
                       ml_data: Optional[Dict] = None) -> Optional[str]:
    """Try to classify a reaction step from its conditions/reagents.

    The condition text and the names/SMILES of the reagent species are
    concatenated and inspected:

    1. Pd plus a boronic species -> ``"Suzuki coupling"``.
    2. Pd plus a coupling ligand plus a nitrogen-containing reagent ->
       ``"Buchwald-Hartwig amination"``; without the nitrogen reagent ->
       ``"Pd-catalysed cross-coupling"``.
    3. Otherwise the first named match in ``_REACTION_PATTERNS``, falling
       back to ``"Pd-catalysed transformation"`` when only Pd is present.

    The regex result takes precedence.  When *ml_data* is supplied, its
    ``reaction_name`` is used only if the regex heuristic finds nothing,
    and the generic values ``"unrecognized"`` / ``"other"`` are ignored.

    Parameters
    ----------
    condition_text_raw : list of str
        Raw condition text lines for the step.
    reagent_species : list of SpeciesRecord
        Reagent species of the step; ``name`` and ``smiles`` are read.
    desc : SchemeDescription
        Currently unused; kept for interface compatibility.
    ml_data : dict, optional
        ML classification result containing ``reaction_name``.

    Returns
    -------
    str or None
        Reaction name, or None when nothing could be determined.
    """
    # --- ML classification (fallback when the regex finds nothing) ---
    ml_name = None
    if ml_data:
        ml_name = ml_data.get("reaction_name") or None
        # Generic class names carry no information; ignore those
        if ml_name and ml_name.lower() in ("unrecognized", "", "other"):
            ml_name = None

    # Combine all text for pattern matching
    text_parts = list(condition_text_raw)
    for sp in reagent_species:
        if sp.name:
            text_parts.append(sp.name)
        if sp.smiles:
            text_parts.append(sp.smiles)
    all_text = " ".join(text_parts)

    # Check for Pd + specific ligand patterns first (Buchwald vs Suzuki)
    has_pd = bool(re.search(r"Pd", all_text))
    has_boronic = bool(re.search(r"B\(OH\)2|boronic|Bpin", all_text, re.I))
    has_amine = any(
        (sp.smiles and _smiles_has_nitrogen(sp.smiles))
        or (sp.name and re.search(
            r"morpholin|piperid|pyrrolid|piperazin|amine|aniline|indol",
            sp.name, re.I))
        for sp in reagent_species
    )
    has_coupling_ligand = bool(re.search(
        r"BINAP|XPhos|SPhos|DavePhos|RuPhos|BrettPhos|dppf|dppp", all_text, re.I))

    regex_name = None
    if has_pd and has_boronic:
        regex_name = "Suzuki coupling"
    elif has_pd and has_coupling_ligand and has_amine:
        regex_name = "Buchwald-Hartwig amination"
    elif has_pd and has_coupling_ligand:
        regex_name = "Pd-catalysed cross-coupling"
    else:
        # Pattern-based classification: first named match wins
        for pat, name, _notes in _REACTION_PATTERNS:
            if pat.search(all_text) and name:
                regex_name = name
                break
        # Fallback: Pd present but nothing more specific recognised
        if regex_name is None and has_pd:
            regex_name = "Pd-catalysed transformation"

    # Trust the regex for medchem-specific names; ML name is the fallback.
    return regex_name or ml_name
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
# Compound label that ended up as SMILES (e.g. [2.21], [(R,S,S)-5.2], [5.1]).
# Matches only when the whole string is one bracket whose content starts with
# a digit or "(" and continues with word characters, ".", ",", "-", "(", ")",
# "/" or spaces.
# NOTE(review): single-atom isotope SMILES such as [2H] or [13C] also fit this
# shape — confirm whether those can reach _species_display.
_COMPOUND_LABEL_RE = re.compile(
    r'^\[[\d(][\w.,\-()/ ]*\]$'
)
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def _species_display(sp: SpeciesRecord) -> str:
    """Return the human-readable text used for a species in the narrative.

    Exactly one naming source is chosen, in this order: compound label
    (followed by the IUPAC name in parentheses when present), IUPAC name,
    ``sp.name`` when shorter than 40 characters, then ``sp.formula``.

    The SMILES string is used only when none of those produced anything.
    A SMILES value that is really a bracketed compound label (``[2.21]``)
    is rendered as ``compound 2.21`` rather than ``[SMILES: ...]``.

    A ``[MW ...]`` tag is appended for species that have a molecular weight
    but no label.  If nothing at all can be shown, ``sp.id`` is returned.
    """
    iupac_name = getattr(sp, "iupac_name", None)
    pieces = []

    if sp.label:
        pieces.append(f"compound {sp.label}")
        if iupac_name:
            # Labelled compounds carry the IUPAC name as a parenthetical.
            pieces.append(f"({iupac_name})")
    elif iupac_name:
        pieces.append(sp.iupac_name)
    elif sp.name and len(sp.name) < 40:
        pieces.append(sp.name)
    elif sp.formula:
        pieces.append(sp.formula)

    # SMILES is the identification of last resort.
    if not pieces and sp.smiles:
        if _COMPOUND_LABEL_RE.match(sp.smiles):
            pieces.append("compound " + sp.smiles.strip('[]'))
        else:
            pieces.append(f"[SMILES: {sp.smiles}]")

    if sp.mw and not sp.label:
        pieces.append(f"[MW {sp.mw:.1f}]")

    if not pieces:
        return sp.id
    return " ".join(pieces)
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
def _parse_step_reagents(step, species: Dict[str, SpeciesRecord]) -> Dict[str, list]:
    """Decompose all reagent and condition information into categorised bins.

    Collects data from:
    1. Fragment reagent species (drawn structures above/below arrow)
    2. Text reagent species (multi-line text blocks with reagent names,
       solvents, conditions, and workup instructions)
    3. Parsed ``step.conditions`` (extracted physical conditions)

    Entries are de-duplicated case-insensitively across all three sources;
    the first occurrence wins.

    Parameters
    ----------
    step
        Step object; ``reagent_ids`` and ``conditions`` are read.
    species : dict
        Species registry keyed by species id.

    Returns
    -------
    dict
        ``catalysts``  – [(display_name, equiv_or_loading), ...]
        ``ligands``    – [(display_name, equiv_or_loading), ...]
        ``bases``      – [(display_name, equiv_or_loading), ...]
        ``reagents``   – [(display_name, equiv_or_loading), ...]  (catch-all)
        ``solvents``   – [display_name, ...]
        ``conditions`` – [str, ...]  (temperature, time, atmosphere, ...)
        ``workup``     – [str, ...]  (quench/workup instructions)
    """
    from ..resolve.reagent_db import get_reagent_db
    from .reaction_parser import _is_condition_token
    db = get_reagent_db()

    cats: Dict[str, list] = {
        "catalysts": [], "ligands": [], "bases": [], "reagents": [],
        "solvents": [], "conditions": [], "workup": [],
    }
    # Role → bin mapping
    _ROLE_BIN = {
        "catalyst": "catalysts",
        "ligand": "ligands",
        "base": "bases",
        "lewis_acid": "catalysts",
        "solvent": "solvents",
        "coupling_reagent": "reagents",
        "reducing_agent": "reagents",
        "reductant": "reagents",
        "oxidant": "reagents",
        "halogenating_agent": "reagents",
        "fluorinating_agent": "reagents",
        "borylating_agent": "reagents",
        "activating_agent": "reagents",
        "deprotecting_agent": "reagents",
        "protecting_group": "reagents",
        "drying_agent": "reagents",
        "acid": "reagents",
        "additive": "reagents",
        "reagent": "reagents",
    }

    # Track names we've already added (avoid duplicates)
    _seen_names: set = set()

    # Patterns are compiled once per call instead of once per token.
    percent_prefix_re = re.compile(r"^\d+\.?\d*\s*%")
    quant_re = re.compile(r"^quant\.?$", re.IGNORECASE)
    rxn_name_re = re.compile(
        r"^(?:Rieche|Mitsunobu|Swern|Wittig|Grignard|Heck|Suzuki|"
        r"Buchwald|Sonogashira|Negishi|Stille|Kumada|Chan.Lam|"
        r"Ullmann|Goldberg|Appel|Gabriel|Finkelstein|"
        r"Curtius|Arndt.Eistert|Barton|Dess.Martin|"
        r"Williamson|Fischer|Mannich|Strecker|Reformatsky)\b",
        re.IGNORECASE)
    # Trailing "(1.5 eq)", "(5 mol%)", ... annotation.  A bare "(cat.)" is
    # accepted too: the numeric-only form could never capture it.
    loading_re = re.compile(
        r"^(.+?)\s*\((\d+\.?\d*\s*(?:eq\.?|equiv\.?|mol\s*%|cat\.)|cat\.?)\)\s*$",
        re.IGNORECASE)
    # "-78 to RT", "0 C to RT", "-78 C to rt", "-78°C to RT"
    temp_range_re = re.compile(
        r"^-?\d+\s*[°\u00b0]?\s*C?\s+to\s+(?:r\.?t\.?|-?\d+\s*[°\u00b0]?\s*C?)\s*$",
        re.IGNORECASE)
    workup_re = re.compile(r"^then\b", re.IGNORECASE)
    # Solvent mixture such as "dioxane/H2O (3:1)"
    solvent_mix_re = re.compile(
        r"^[A-Za-z0-9,\-]+(/[A-Za-z0-9,\-]+)+(\s*\(\d+:\d+\))?$")
    # Split on ";" and "," but not on a comma sitting directly between two
    # digits, so locants ("1,4-dioxane", "2,6-lutidine") stay intact.
    list_sep_re = re.compile(r";\s*|(?<!\d),\s*|,(?!\d)\s*")

    def _file_under_role(role, display, loading: str) -> None:
        """Append *display* to the bin for *role* (catch-all when no role)."""
        bin_name = _ROLE_BIN.get(role, "reagents") if role else "reagents"
        if bin_name == "solvents":
            cats["solvents"].append(display)
        else:
            cats[bin_name].append((display, loading))

    def _add_token(raw_token: str) -> None:
        """Classify a single token and add to the right bin."""
        token = raw_token.strip()
        if not token:
            return

        # Skip yield tokens (e.g. "72%", "quant.", "95% yield") -- but keep a
        # percentage-prefixed token when it, or the text after the
        # percentage, is a known reagent (e.g. "10% Pd/C").
        if quant_re.match(token):
            return
        if percent_prefix_re.match(token):
            remainder = percent_prefix_re.sub("", token, count=1).strip()
            is_known = db.entry_for_name(token.lower()) or (
                remainder and db.entry_for_name(remainder.lower()))
            if not is_known:
                return

        # Skip reaction name labels (e.g. "Rieche formylation", "Mitsunobu")
        if rxn_name_re.search(token):
            return

        # Strip equiv/loading annotations for lookup, but preserve for display
        loading = ""
        token_clean = token
        m = loading_re.match(token)
        if m:
            token_clean = m.group(1).strip()
            loading = m.group(2).strip()

        # Normalise key for lookup
        lookup_key = token_clean.lower().strip()
        if lookup_key in _seen_names:
            return
        _seen_names.add(lookup_key)

        # Physical conditions, including temperature ranges that
        # _is_condition_token does not catch.
        if _is_condition_token(token_clean) or temp_range_re.match(token_clean):
            cats["conditions"].append(token_clean)
            return

        # Workup detection (the untouched token, annotation included, is kept)
        if workup_re.match(token_clean):
            cats["workup"].append(token)
            return

        # Reagent DB lookup
        role = db.role_for_name(lookup_key)
        entry = db.entry_for_name(lookup_key)
        display = entry.get("display", token_clean) if entry else token_clean

        if role:
            _file_under_role(role, display, loading)
        elif solvent_mix_re.match(token_clean):
            # Unknown, but shaped like a solvent ratio
            cats["solvents"].append(token_clean)
        else:
            # Unknown -- treat as reagent, keeping any loading that was found
            cats["reagents"].append((display, loading))

    # 1. Fragment reagent species (drawn structures)
    for rid in step.reagent_ids:
        sp = species.get(rid)
        if not sp or sp.element_type != "fragment":
            continue

        display_name = None
        role = None

        # Try reagent_db by name first
        if sp.name:
            name_key = sp.name.lower()
            role = db.role_for_name(name_key)
            entry = db.entry_for_name(name_key)
            if entry:
                display_name = entry.get("display", sp.name)

        # Try reagent_db by SMILES
        if not role and sp.smiles:
            role = db.role_for_smiles(sp.smiles)
            sr = db.smiles_role_display(sp.smiles)
            if sr:
                if not display_name:
                    display_name = sr[1]
                if not role:
                    role = sr[0]

        # Fallback display: IUPAC > name > short SMILES > id
        if not display_name:
            display_name = (
                getattr(sp, "iupac_name", None)
                or sp.name
                or (sp.smiles if sp.smiles and len(sp.smiles) <= 40 else None)
                or sp.id
            )

        lookup_key = display_name.lower().strip()
        if lookup_key in _seen_names:
            continue
        _seen_names.add(lookup_key)

        # Drawn structures carry no loading annotation.
        _file_under_role(role, display_name, "")

    # 2. Text reagent species (multi-line text blocks)
    for rid in step.reagent_ids:
        sp = species.get(rid)
        if not sp or sp.element_type != "text" or not sp.name:
            continue
        for line in sp.name.split("\n"):
            line = line.strip()
            if not line:
                continue
            # A line that is itself a known name is never split.
            if db.entry_for_name(line.lower()):
                _add_token(line)
                continue
            for part in list_sep_re.split(line):
                _add_token(part)

    # 3. Physical conditions from parsed step.conditions
    for cond in step.conditions:
        cond_key = cond.lower().strip()
        if cond_key not in _seen_names:
            _seen_names.add(cond_key)
            cats["conditions"].append(cond)

    return cats
|
|
900
|
+
|
|
901
|
+
|
|
902
|
+
def _format_conditions(step, species: Dict[str, SpeciesRecord]) -> str:
    """Render a step's reagents and conditions as one comma-separated line.

    The structured bins come from ``_parse_step_reagents``.  They are
    flattened in the order catalysts, ligands, bases, reagents, solvents,
    conditions, followed by the yield when ``step.yield_text`` is set.
    A loading, when present, is shown in parentheses after the name.
    Returns ``"(conditions not specified)"`` when nothing was collected.
    """
    cats = _parse_step_reagents(step, species)

    summary = [
        f"{name} ({equiv})" if equiv else name
        for bin_name in ("catalysts", "ligands", "bases", "reagents")
        for name, equiv in cats[bin_name]
    ]
    summary += cats["solvents"]
    summary += cats["conditions"]
    if step.yield_text:
        summary.append(f"{step.yield_text} yield")

    if not summary:
        return "(conditions not specified)"
    return ", ".join(summary)
|
|
923
|
+
|
|
924
|
+
|
|
925
|
+
def analyze_bond_changes(mapped_rxn: str) -> Dict[str, list]:
    """Analyze bond changes from an atom-mapped reaction SMILES.

    Uses RDKit to compare bonds between mapped atoms in reactants vs products.
    Only bonds whose two atoms both carry a non-zero atom-map number are
    compared; bonds are keyed by the (smaller, larger) map-number pair.

    Parameters
    ----------
    mapped_rxn : str
        Reaction SMILES of the form ``reactants>>products``.

    Returns
    -------
    dict
        ``formed``        : list of (sym1, map1, sym2, map2, bond_order)
        ``broken``        : list of (sym1, map1, sym2, map2, bond_order)
        ``changed_order`` : list of (sym1, map1, sym2, map2, old_order, new_order)
        ``leaving``       : list of (group_symbol, nbr_symbol, nbr_map)

        An empty dict is returned when RDKit cannot be imported, when the
        string does not split into exactly two parts on ``>>``, or when
        either side fails the parse check.
    """
    try:
        from rdkit import Chem
    except ImportError:
        return {}

    sides = mapped_rxn.split(">>")
    if len(sides) != 2:
        return {}

    reactants = Chem.MolFromSmiles(sides[0])
    products = Chem.MolFromSmiles(sides[1])
    if not reactants or not products:
        return {}

    def _symbols_by_map(mol):
        # Built once per molecule; the previous per-lookup rebuild made the
        # overall analysis quadratic in the number of atoms.
        return {a.GetAtomMapNum(): a.GetSymbol()
                for a in mol.GetAtoms() if a.GetAtomMapNum()}

    def _bonds_by_map(mol):
        bonds = {}
        for bond in mol.GetBonds():
            m1 = bond.GetBeginAtom().GetAtomMapNum()
            m2 = bond.GetEndAtom().GetAtomMapNum()
            if m1 and m2:
                bonds[(min(m1, m2), max(m1, m2))] = bond.GetBondTypeAsDouble()
        return bonds

    r_sym = _symbols_by_map(reactants)
    p_sym = _symbols_by_map(products)
    r_bonds = _bonds_by_map(reactants)
    p_bonds = _bonds_by_map(products)

    formed = [
        (p_sym.get(a, "?"), a, p_sym.get(b, "?"), b, p_bonds[(a, b)])
        for a, b in sorted(set(p_bonds) - set(r_bonds))
    ]
    broken = [
        (r_sym.get(a, "?"), a, r_sym.get(b, "?"), b, r_bonds[(a, b)])
        for a, b in sorted(set(r_bonds) - set(p_bonds))
    ]
    changed = [
        (p_sym.get(a, "?"), a, p_sym.get(b, "?"), b,
         r_bonds[(a, b)], p_bonds[(a, b)])
        for a, b in sorted(set(r_bonds) & set(p_bonds))
        if r_bonds[(a, b)] != p_bonds[(a, b)]
    ]

    # Leaving groups: unmapped reactant atoms bonded to a mapped atom.  Each
    # unmapped atom is reported once, against its first mapped neighbour.
    leaving = []
    for atom in reactants.GetAtoms():
        if atom.GetAtomMapNum():
            continue
        for nbr in atom.GetNeighbors():
            mn = nbr.GetAtomMapNum()
            if mn:
                leaving.append((atom.GetSymbol(), nbr.GetSymbol(), mn))
                break

    return {
        "formed": formed,
        "broken": broken,
        "changed_order": changed,
        "leaving": leaving,
    }
|
|
1011
|
+
|
|
1012
|
+
|
|
1013
|
+
def describe_transformation(changes: Dict[str, list],
                            max_changes: int = 5) -> str:
    """Turn a bond-change analysis into a short English description.

    *changes* is a dict with optional ``formed``, ``broken``,
    ``changed_order`` and ``leaving`` lists (missing keys count as empty).
    Phrases are joined with ``"; "``.

    When the number of formed + broken + order-changed bonds exceeds
    *max_changes* the atom mapping is assumed to be unreliable, so only up
    to two single-bond formations and up to two leaving groups are
    reported; if neither exists the result is
    ``"complex rearrangement (N bond changes)"``.

    Otherwise every formed, broken and order-changed bond is listed,
    followed by every leaving group.

    Example: "C-N single bond formed; Br displaced from C"
    """
    if not changes:
        return ""

    formed = changes.get("formed", [])
    broken = changes.get("broken", [])
    changed = changes.get("changed_order", [])
    leaving = changes.get("leaving", [])
    n_changes = len(formed) + len(broken) + len(changed)

    order_words = {
        1.0: "single", 1.5: "aromatic", 2.0: "double", 3.0: "triple",
    }

    if n_changes > max_changes:
        # Too noisy to list in full: keep only coupling-type formations and
        # displaced groups.
        couplings = [rec for rec in formed if rec[4] == 1.0]
        phrases = [
            f"{a}-{b} bond formed"
            for a, _ma, b, _mb, _order in couplings[:2]
        ]
        phrases.extend(
            f"{group} displaced from {anchor}"
            for group, anchor, _mn in leaving[:2]
        )
        if not phrases:
            phrases.append(f"complex rearrangement ({n_changes} bond changes)")
        return "; ".join(phrases)

    phrases = []
    for verb, records in (("formed", formed), ("broken", broken)):
        for a, _ma, b, _mb, order in records:
            word = order_words.get(order, f"order-{order}")
            phrases.append(f"{a}-{b} {word} bond {verb}")

    for a, _ma, b, _mb, before, after in changed:
        old_word = order_words.get(before, str(before))
        new_word = order_words.get(after, str(after))
        phrases.append(f"{a}-{b} bond changed {old_word} -> {new_word}")

    phrases.extend(
        f"{group} displaced from {anchor}" for group, anchor, _mn in leaving
    )
    return "; ".join(phrases)
|
|
1071
|
+
|
|
1072
|
+
|
|
1073
|
+
def _narrate_stepless_scheme(desc) -> str:
    """Describe a scheme with no reaction steps (target arrays, etc.)."""
    all_species = list(desc.species.values())
    fragments = [sp for sp in all_species if sp.element_type == "fragment"]
    n_text = sum(1 for sp in all_species if sp.element_type == "text")
    if not fragments and n_text == 0:
        return "Empty scheme with no chemical content detected."

    heading = {
        "target_array": "Target structure array",
        "sar_design": "SAR design diagram",
        "synthesis": "Structure collection",
    }.get(desc.content_type or "", "Non-reaction scheme")

    lines = [f"{heading} containing {len(fragments)} structure(s)."]
    for sp in fragments:
        shown = _species_display(sp)
        # "[*]" dummy atoms mark a generic scaffold.
        if sp.smiles and "[*]" in sp.smiles:
            if sp.name and "variable:" in sp.name:
                lines.append(f" - {shown} [generic scaffold — {sp.name}]")
            else:
                lines.append(f" - {shown} [generic scaffold]")
        else:
            lines.append(f" - {shown}")

    # Text annotations, first line of each only
    annotations = [sp for sp in all_species
                   if sp.element_type == "text" and sp.name]
    if annotations:
        lines.append("")
        lines.append("Annotations:")
        for sp in annotations:
            first_line = sp.name.split("\n")[0].strip()
            lines.append(f" - {first_line}")

    return "\n".join(lines)


def _narrate_step(step, desc, ml_data) -> list:
    """Return the narrative lines for one step, ending with a blank line."""
    lines = []

    # Classify reaction (regex + optional ML) over reagents and reactants
    step_species = [desc.species[rid] for rid in step.reagent_ids
                    if rid in desc.species]
    step_species = step_species + [
        desc.species[rid] for rid in step.reactant_ids
        if rid in desc.species]
    rxn_type = _classify_reaction(
        step.condition_text_raw, step_species, desc,
        ml_data=ml_data)

    reactant_names = []
    for rid in step.reactant_ids:
        sp = desc.species.get(rid)
        if sp:
            reactant_names.append(_species_display(sp))
    r_str = " + ".join(reactant_names) if reactant_names else ""

    product_names = []
    for pid in step.product_ids:
        sp = desc.species.get(pid)
        if sp:
            product_names.append(_species_display(sp))
    p_str = " + ".join(product_names) if product_names else ""

    # Protocol-only step: nothing drawn on either side of the arrow
    is_protocol = (
        not step.reactant_ids and not step.product_ids
        and desc.content_type in (
            "composite", "literature_comparison", "investigation"))

    # Step header
    step_num = step.step_index + 1
    if rxn_type:
        header = f"Step {step_num} -- {rxn_type}:"
    elif is_protocol:
        header = f"Method {step_num}:"
    else:
        header = f"Step {step_num}:"

    if step.arrow_style == "failed":
        header += " [FAILED]"
    elif step.arrow_style == "dashed":
        header += " [tentative/planned]"
    lines.append(header)

    cats = _parse_step_reagents(step, desc.species)

    # Reactant -> product line (or protocol marker)
    if is_protocol:
        lines.append(" Protocol:")
    elif r_str and p_str:
        lines.append(f" {r_str} -> {p_str}")
    elif r_str:
        lines.append(f" {r_str} -> (product)")
    elif p_str:
        lines.append(f" (starting material) -> {p_str}")
    else:
        lines.append(" (starting material) -> (product)")

    # Name-alignment diffs, one line per reactant/product pair that has one
    alignment_diffs = getattr(desc, '_alignment_diffs', {})
    if alignment_diffs:
        for rid in step.reactant_ids:
            for pid in step.product_ids:
                diff_str = alignment_diffs.get((rid, pid))
                if diff_str:
                    shown = diff_str.replace(" -> ", " \u2192 ")
                    lines.append(f" Transformation: {shown}")

    # Reagents line (catalysts, ligands, bases, then everything else)
    reagent_parts = [
        f"{name} ({equiv})" if equiv else name
        for bin_name in ("catalysts", "ligands", "bases", "reagents")
        for name, equiv in cats[bin_name]
    ]
    if reagent_parts:
        lines.append(f" Reagents: {', '.join(reagent_parts)}")

    if cats["solvents"]:
        lines.append(f" Solvent: {', '.join(cats['solvents'])}")

    # Physical conditions line, with the yield appended
    cond_parts = list(cats["conditions"])
    if step.yield_text:
        cond_parts.append(f"{step.yield_text} yield")
    if cond_parts:
        lines.append(f" Conditions: {', '.join(cond_parts)}")
    elif not reagent_parts and not cats["solvents"]:
        lines.append(" Conditions: (not specified)")

    if cats["workup"]:
        lines.append(f" Workup: {'; '.join(cats['workup'])}")

    # ML grounding block (when enrichment available)
    if ml_data:
        ml_parts = []
        rc = ml_data.get("reaction_class")
        rn = ml_data.get("reaction_name")
        conf = ml_data.get("confidence", 0)
        if rc or rn:
            label = rn or rc
            ml_parts.append(f'rxn-insight="{label}"')
        if conf:
            ml_parts.append(f"atom-map confidence={conf:.2f}")
        byproducts = ml_data.get("byproducts", [])
        if byproducts:
            ml_parts.append(f"byproducts=[{', '.join(byproducts)}]")
        if ml_parts:
            lines.append(f" [ML: {'; '.join(ml_parts)}]")

        # Bond-change description derived from the atom-mapped reaction
        mapped_rxn = ml_data.get("mapped_rxn", "")
        if mapped_rxn:
            xform_desc = describe_transformation(
                analyze_bond_changes(mapped_rxn))
            if xform_desc:
                lines.append(f" Bond changes: {xform_desc}")

    lines.append("")
    return lines


def _narrate_scope_entries(desc) -> list:
    """Return the substrate-scope section lines, or [] when there is none."""
    scope_entries = getattr(desc, 'scope_entries', None)
    if not scope_entries:
        return []

    lines = ["Substrate scope:"]
    for entry in scope_entries:
        sp = desc.species.get(entry.species_id) if entry.species_id else None
        shown = _species_display(sp) if sp else None

        # Identification: entry label, else species display, else entry id
        if entry.label:
            fields = [entry.label]
        elif shown:
            fields = [shown]
        else:
            fields = [entry.entry_id]

        if entry.conditions_variant:
            fields.append(f"({entry.conditions_variant})")
        if entry.yield_text:
            fields.append(f"— {entry.yield_text}")
        if entry.mass_text:
            fields.append(f"({entry.mass_text})")
        if entry.notes:
            fields.append(f"[{entry.notes}]")

        lines.append(f" - {' '.join(fields)}")
    lines.append("")
    return lines


def generate_llm_narrative(desc: SchemeDescription,
                           ml_enrichment: Optional[Dict[int, Dict]] = None,
                           ) -> str:
    """Generate a chemist-quality natural language narrative.

    Produces the Layer 3 output: readable text for LLM consumption, built
    from the species registry and the parsed steps of *desc*.

    Parameters
    ----------
    desc : SchemeDescription
        Parsed scheme (Tier 1 or Tier 2).
    ml_enrichment : dict, optional
        Per-step ML grounding data keyed by ``step_index``.  The keys read
        here are ``reaction_class``, ``reaction_name``, ``confidence``,
        ``byproducts`` and ``mapped_rxn``; the whole entry is also passed
        on to the reaction classifier.

    Returns
    -------
    str
        For a scheme without steps: a listing of its structures and text
        annotations.  Otherwise: an opening line, one block per step
        (header, reactant/product line, reagents, solvent, conditions,
        workup, ML block) and, when scope entries exist, a substrate-scope
        section.  Trailing whitespace is stripped.
    """
    ml_enrichment = ml_enrichment or {}
    if not desc.steps:
        return _narrate_stepless_scheme(desc)

    heading = {
        "synthesis": "Synthetic route",
        "sar_design": "SAR exploration",
        "biological_pathway": "Biological pathway",
        "literature_comparison": "Literature method comparison",
        "composite": "Composite methodology overview",
        "investigation": "Methodology investigation",
    }.get(desc.content_type or "", "Reaction scheme")

    topo_adj = {
        "linear": "sequential",
        "divergent": "divergent",
        "convergent": "convergent",
        "parallel": "parallel",
        "mixed": "multi-pathway",
    }.get(desc.topology, "")

    # Products of the last step; the first one is named in the opening line
    final_products = []
    for pid in desc.steps[-1].product_ids:
        sp = desc.species.get(pid)
        if sp:
            final_products.append(_species_display(sp))

    opening = f"{heading}"
    if desc.num_steps > 0:
        opening += f" ({desc.num_steps} step{'s' if desc.num_steps > 1 else ''}"
        if topo_adj:
            opening += f", {topo_adj}"
        opening += ")"
    if final_products and desc.content_type in ("synthesis", "", None):
        opening += f" toward {final_products[0]}"
    opening += "."

    parts = [opening, ""]
    for step in desc.steps:
        parts.extend(_narrate_step(step, desc, ml_enrichment.get(step.step_index)))
    parts.extend(_narrate_scope_entries(desc))

    return "\n".join(parts).rstrip()
|
|
1349
|
+
|
|
1350
|
+
|
|
1351
|
+
# ---------------------------------------------------------------------------
|
|
1352
|
+
# CLI
|
|
1353
|
+
# ---------------------------------------------------------------------------
|
|
1354
|
+
|
|
1355
|
+
def main(argv: Optional[list] = None) -> int:
    """Command-line entry point for ``scheme_refine``.

    Sub-commands:

    ``prompt INPUT [--image PATH]``
        Load the Tier 1 JSON and print the refinement prompt.
    ``apply INPUT CORRECTIONS [-o OUTPUT]``
        Apply a corrections JSON to the Tier 1 JSON; write the result to
        OUTPUT, or to stdout as UTF-8 JSON when no output path is given.

    Returns 0 after a sub-command ran, or 1 (after printing help) when no
    sub-command was given.
    """
    parser = argparse.ArgumentParser(
        prog="scheme_refine",
        description="LLM refinement of scheme_reader output.",
    )
    subparsers = parser.add_subparsers(dest="command")

    prompt_cmd = subparsers.add_parser(
        "prompt", help="Generate refinement prompt for LLM")
    prompt_cmd.add_argument("input", help="Tier 1 JSON file")
    prompt_cmd.add_argument("--image", help="Path to rendered scheme image")

    apply_cmd = subparsers.add_parser(
        "apply", help="Apply corrections to Tier 1 output")
    apply_cmd.add_argument("input", help="Tier 1 JSON file")
    apply_cmd.add_argument("corrections", help="Corrections JSON file")
    apply_cmd.add_argument("-o", "--output", help="Output refined JSON")

    args = parser.parse_args(argv)

    if args.command == "prompt":
        scheme = SchemeDescription.from_json(args.input)
        print(generate_prompt(scheme, image_path=args.image))
        return 0

    if args.command == "apply":
        scheme = SchemeDescription.from_json(args.input)
        with open(args.corrections, "r", encoding="utf-8") as handle:
            corrections = json.load(handle)
        refined = apply_corrections(scheme, corrections)
        if args.output:
            refined.to_json(args.output)
            print(f"Written to {args.output}", file=sys.stderr)
            return 0
        # Write bytes directly so the output is UTF-8 whatever the
        # console's text encoding is.
        payload = json.dumps(refined.to_dict(), indent=2,
                             ensure_ascii=False)
        sys.stdout.buffer.write(payload.encode("utf-8"))
        sys.stdout.buffer.write(b"\n")
        return 0

    parser.print_help()
    return 1
|
|
1401
|
+
|
|
1402
|
+
|
|
1403
|
+
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|