cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1404 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ scheme_refine.py — Tier 2 LLM-based refinement of scheme_reader output.
4
+
5
+ Takes a SchemeDescription (Tier 1 deterministic output) and produces a refined
6
+ version with corrections from an LLM. The module:
7
+
8
+ 1. Generates a structured prompt with the Tier 1 JSON + context.
9
+ 2. Accepts a correction dict (from any LLM — Claude API, local model, etc.).
10
+ 3. Applies corrections to produce a refined SchemeDescription.
11
+
12
+ The correction format is designed to be simple and LLM-friendly:
13
+
14
+ {
15
+ "content_type": "synthesis", # override content type
16
+ "topology": "linear", # override topology
17
+ "species_corrections": {
18
+ "species_5": {"text_category": "condition_ref"},
19
+ "species_8": {"text_category": "citation"},
20
+ },
21
+ "narrative_override": "...", # replace narrative entirely
22
+ "notes": "..." # free-form LLM reasoning
23
+ }
24
+
25
+ CLI:
26
+ # Generate prompt for LLM review
27
+ python -m cdxml_toolkit.perception.scheme_refine prompt scheme.json
28
+
29
+ # Apply corrections
30
+ python -m cdxml_toolkit.perception.scheme_refine apply scheme.json corrections.json -o refined.json
31
+
32
+ Python API:
33
+ from cdxml_toolkit.perception.scheme_refine import generate_prompt, apply_corrections
34
+ prompt = generate_prompt(desc)
35
+ refined = apply_corrections(desc, corrections_dict)
36
+ """
37
+
38
+ from __future__ import annotations
39
+
40
+ import argparse
41
+ import json
42
+ import os
43
+ import re
44
+ import sys
45
+ from typing import Any, Dict, List, Optional, Tuple
46
+
47
+ from .scheme_reader import SchemeDescription, SpeciesRecord, ScopeEntry
48
+
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # Aligned IUPAC name enrichment
52
+ # ---------------------------------------------------------------------------
53
+
54
def enrich_aligned_names(desc: SchemeDescription, verbose: bool = False) -> int:
    """Replace canonical IUPAC names with aligned alternatives per step.

    For each step, every reactant/product pair of fragment species that
    carry SMILES is passed to ``find_aligned_names()``.  The returned
    ``best_sm_name`` / ``best_prod_name`` overwrite the species'
    ``iupac_name`` regardless of alignment quality.  When a species takes
    part in several pairs, the name coming from the pair with the highest
    ``best_similarity`` wins.

    Transformation diffs produced by ``format_name_diff()`` are stored in
    ``desc._alignment_diffs`` keyed by ``(reactant_id, product_id)``; the
    attribute is created on *desc* when missing.

    After the pairwise pass, ``_name_rgroup_cores()`` is run over the whole
    description so R-group species that are still unnamed get a name too.

    The aligned namer lives in ``cdxml_toolkit.naming.aligned_namer``.  If it
    cannot be imported for any reason the function returns 0 without
    touching *desc*.

    Parameters
    ----------
    desc : SchemeDescription
        Parsed scheme with species SMILES populated.  Modified in place.
    verbose : bool
        Print alignment progress to stderr.

    Returns
    -------
    int
        Number of distinct species renamed by the pairwise alignment pass
        plus the count reported by ``_name_rgroup_cores()``.
    """
    try:
        from ..naming.aligned_namer import find_aligned_names, format_name_diff
    except Exception:
        # Deliberate best effort: naming is an optional enrichment.
        return 0

    # species_id -> similarity of the alignment that supplied its current
    # name.  Its size is also the number of distinct species renamed, which
    # avoids counting a species once per overwrite.
    best_sim: Dict[str, float] = {}

    # Transformation diffs: (reactant_id, product_id) -> diff string
    if not hasattr(desc, '_alignment_diffs'):
        desc._alignment_diffs = {}

    def _fragments_with_smiles(species_ids):
        """Return the drawn-structure species among *species_ids* that have SMILES."""
        found = []
        for species_id in species_ids:
            sp = desc.species.get(species_id)
            if sp and sp.smiles and sp.element_type == "fragment":
                found.append(sp)
        return found

    for step in desc.steps:
        reactants = _fragments_with_smiles(step.reactant_ids)
        products = _fragments_with_smiles(step.product_ids)
        if not reactants or not products:
            continue

        # Pair each reactant with each product
        for r_sp in reactants:
            for p_sp in products:
                try:
                    result = find_aligned_names(r_sp.smiles, p_sp.smiles,
                                                verbose=verbose)
                except Exception:
                    # One failed pair must not abort the remaining pairs.
                    continue

                quality = result.alignment_quality
                # Similarity doubles as the priority of this naming.
                sim = result.best_similarity

                # Update reactant name if this alignment is better
                if result.best_sm_name and sim > best_sim.get(r_sp.id, -1):
                    r_sp.iupac_name = result.best_sm_name
                    best_sim[r_sp.id] = sim

                # Update product name if this alignment is better
                if result.best_prod_name and sim > best_sim.get(p_sp.id, -1):
                    p_sp.iupac_name = result.best_prod_name
                    best_sim[p_sp.id] = sim

                # Store the transformation diff for this pair
                if result.best_sm_name and result.best_prod_name:
                    try:
                        diff_str = format_name_diff(
                            result.best_sm_name, result.best_prod_name)
                    except Exception as exc:
                        # The diff is cosmetic; keep going but say so when asked.
                        diff_str = None
                        if verbose:
                            print(f" Name diff failed for {r_sp.id} / "
                                  f"{p_sp.id}: {exc}", file=sys.stderr)
                    if diff_str and diff_str != "(identical)":
                        desc._alignment_diffs[(r_sp.id, p_sp.id)] = diff_str

                if verbose:
                    print(f" Aligned [{quality}] {r_sp.id} \u2194 {p_sp.id}: "
                          f"sim={sim:.2f}", file=sys.stderr)
                    print(f" SM: {result.best_sm_name}",
                          file=sys.stderr)
                    print(f" Prod: {result.best_prod_name}",
                          file=sys.stderr)

    # Second pass: name R-group species by naming their "core" structure.
    # It skips species that already have a name, so the two counts never
    # overlap.
    return len(best_sim) + _name_rgroup_cores(desc, verbose=verbose)
164
+
165
+
166
# Regex matching bracketed R-group placeholder tokens inside a SMILES string:
# [R], [R'], [R3], [OR'], [Het], [COOR''], [F,Cl,Br,I].
# It is used with ``findall`` to locate every such token in a species SMILES.
# NOTE: compound-label brackets such as [2.21] or [(R,S,S)-5.2] are NOT
# matched here; they are recognised separately by ``_COMPOUND_LABEL_RE``.
_RGROUP_TOKEN_RE = re.compile(
    r'\[R\d*\'*\]'       # [R], [R'], [R3], [R'']
    r'|\[OR\'*\]'        # [OR'], [OR'']
    r'|\[Het\]'          # [Het]
    r'|\[COOR\'*\]'      # [COOR''], [COOR']
    r'|\[F,Cl,Br,I\]'    # halide variable
)

# ChemDraw generic group abbreviations that prevent SMILES parsing.
# When a SMILES contains these, the species is a generic methodology scheme
# (protecting groups, leaving groups, etc.) and cannot be named.
_CHEMDRAW_ABBREV_RE = re.compile(
    r'\[G\]'             # generic group
    r'|\[LG\]'           # leaving group
    r'|\[P\d*\'*\]'      # protecting group [P], [P1'], [P']
    r'|\[EWG\]'          # electron-withdrawing group
    r'|\[EDG\]'          # electron-donating group
    r'|\[Nu\]'           # nucleophile
    r'|\[E\+?\]'         # electrophile
    r'|\[Base\]'         # base
)

# Tokens that are just bare R-group labels (entire SMILES is the token).
# Every alternative is anchored with ^...$, so the pattern only succeeds when
# the whole string is a single placeholder (or a lone ``*``).
_BARE_RGROUP_RE = re.compile(
    r'^\[R\d*\'*\]$'
    r'|^\[OR\'*\]$'
    r'|^\[Het\]$'
    r'|^\[COOR\'*\]$'
    r'|^\[F,Cl,Br,I\]$'
    r'|^\*$'
)
199
+
200
+
201
# Wildcard (dummy) atoms as written in SMILES: a bare ``*`` or a bracketed
# form such as ``[*]``, ``[1*]`` or ``[*:1]``.  The bracketed alternative is
# listed first so the whole bracket atom is consumed as one token; replacing
# only the inner ``*`` would leave an invalid ``[[H]]`` behind.
_WILDCARD_ATOM_RE = re.compile(r'\[\d*\*(?::\d+)?\]|\*')


def _name_rgroup_cores(desc: SchemeDescription,
                       verbose: bool = False) -> int:
    """Name species that contain R-group atoms by naming the H-core.

    Handles wildcard notation (``*``, ``[*]``, ``[1*]``, ``[*:1]``) as well
    as bracketed placeholder notation (``[R]``, ``[R']``, ``[R3]``,
    ``[OR']``, ``[Het]``, ...) and ChemDraw generic abbreviations
    (``[P]``, ``[G]``, ``[LG]``, ...).

    Species that already have an ``iupac_name`` or have no SMILES are left
    alone.  For the rest:

    * a SMILES that is nothing but a single placeholder is named after the
      placeholder itself (``[R3]`` -> ``R3``, ``*`` -> ``R``);
    * otherwise every placeholder is replaced with ``[H]``, the result is
      canonicalised with RDKit and passed to ``ChemScriptBridge.get_name()``.
      The name is stored as ``"R-substituted <core name>"`` for exactly one
      placeholder and ``"<core name> derivative"`` for more;
    * if the core cannot be parsed or named the species is labelled
      ``"generic intermediate"``.

    Returns 0 without touching *desc* when RDKit or the ChemScript bridge
    cannot be set up.

    Parameters
    ----------
    desc : SchemeDescription
        Scheme whose ``species`` mapping is updated in place.
    verbose : bool
        Print naming progress (and per-species failures) to stderr.

    Returns
    -------
    int
        Number of species whose ``iupac_name`` was set.
    """
    try:
        from rdkit import Chem
        from ..chemdraw.chemscript_bridge import ChemScriptBridge
        bridge = ChemScriptBridge()
    except Exception:
        # Deliberate best effort: naming is an optional enrichment.
        return 0

    n = 0
    for sid, sp in desc.species.items():
        if getattr(sp, 'iupac_name', None):
            continue  # already named
        smiles = sp.smiles
        if not smiles:
            continue

        # Collect every kind of placeholder present in this SMILES.
        wildcard_matches = _WILDCARD_ATOM_RE.findall(smiles)
        rgroup_matches = _RGROUP_TOKEN_RE.findall(smiles)
        abbrev_matches = _CHEMDRAW_ABBREV_RE.findall(smiles)
        if not (wildcard_matches or rgroup_matches or abbrev_matches):
            continue

        try:
            # Bare placeholder: just show the label directly
            stripped = smiles.strip()
            if _WILDCARD_ATOM_RE.fullmatch(stripped):
                # A lone wildcard, bracketed or not, carries no label text.
                sp.iupac_name = 'R'
                n += 1
                continue
            if _BARE_RGROUP_RE.match(stripped):
                # Show as-is but cleaned up: [R] -> R, [R3] -> R3, etc.
                sp.iupac_name = stripped.strip('[]')
                n += 1
                continue

            # Replace all placeholder tokens with an explicit hydrogen
            core_smiles = _RGROUP_TOKEN_RE.sub('[H]', smiles)
            core_smiles = _CHEMDRAW_ABBREV_RE.sub('[H]', core_smiles)
            core_smiles = _WILDCARD_ATOM_RE.sub('[H]', core_smiles)

            mol = Chem.MolFromSmiles(core_smiles)
            if mol is None:
                # Can't parse even after stripping — label as generic
                sp.iupac_name = "generic intermediate"
                n += 1
                if verbose:
                    print(f" R-group core: {sid} -> generic intermediate "
                          f"(unparseable: {smiles[:50]})", file=sys.stderr)
                continue

            core_canon = Chem.MolToSmiles(mol)
            core_name = bridge.get_name(core_canon)
            if not core_name:
                # Naming the core failed — still better than raw SMILES
                sp.iupac_name = "generic intermediate"
                n += 1
                continue

            n_rgroups = (len(rgroup_matches) + len(abbrev_matches)
                         + len(wildcard_matches))
            if n_rgroups == 1:
                sp.iupac_name = f"R-substituted {core_name}"
            else:
                sp.iupac_name = f"{core_name} derivative"
            n += 1

            if verbose:
                print(f" R-group core: {sid} -> {sp.iupac_name}",
                      file=sys.stderr)
        except Exception as exc:
            # Best effort per species: skip this one, keep naming the others.
            if verbose:
                print(f" R-group core: {sid} skipped ({exc})",
                      file=sys.stderr)
            continue

    return n
292
+
293
+
294
+ # ---------------------------------------------------------------------------
295
+ # Prompt generation
296
+ # ---------------------------------------------------------------------------
297
+
298
def generate_prompt(desc: SchemeDescription,
                    image_path: Optional[str] = None) -> str:
    """Build the text prompt that asks an LLM to review Tier 1 output.

    The prompt contains, in order: a task statement, an optional pointer to
    a rendered image, the headline facts of the scheme, a markdown table of
    all species, one bullet per reaction step, the current narrative, and
    the JSON correction template the model should answer with.

    Parameters
    ----------
    desc : SchemeDescription
        Tier 1 deterministic output.
    image_path : str, optional
        Path to rendered scheme image (for vision models).  Only mentioned
        in the prompt when given.

    Returns
    -------
    str
        The prompt sections joined with newlines.
    """
    def _id_list(ids) -> str:
        # An empty id list is rendered as a visible placeholder.
        return ", ".join(ids) or "(none)"

    # JSON skeleton shown to the model as the expected answer shape.
    correction_template = {
        "content_type": "<correct type: synthesis | sar_design | "
                        "biological_pathway | target_array | "
                        "literature_comparison | composite | investigation>",
        "topology": "<correct topology if wrong>",
        "species_corrections": {
            "<species_id>": {
                "text_category": "<condition_ref | citation | bioactivity | "
                                 "chemical | conditions_block>"
            }
        },
        "narrative_override": "<better narrative if the current one is wrong>",
        "notes": "<your reasoning>"
    }

    sections: List[str] = [
        "# Scheme Refinement Task\n",
        "You are reviewing the output of a deterministic chemical "
        "scheme parser. Your job is to identify and correct any "
        "misclassifications in the structured output.\n",
    ]
    if image_path:
        sections.append(f"**Rendered image**: {image_path}\n")

    # Headline facts
    sections.extend([
        "## Tier 1 Parser Output\n",
        f"- **Topology**: {desc.topology}",
        f"- **Content type**: {desc.content_type or 'unknown'}",
        f"- **Steps**: {desc.num_steps}",
        f"- **Species**: {len(desc.species)}\n",
    ])

    # Species table: header, separator, then one row per species
    sections.extend([
        "### Species Registry\n",
        "| ID | Type | Category | Label | Name (first 60 chars) | "
        "SMILES (first 40 chars) | MW |",
        "|---|---|---|---|---|---|---|",
    ])
    for species_id, record in desc.species.items():
        # Truncate first, then flatten embedded newlines for the table cell.
        shown_name = (record.name or "")[:60].replace("\n", " / ")
        shown_smiles = (record.smiles or "")[:40]
        sections.append(
            f"| {species_id} | {record.element_type} | "
            f"{record.text_category or '-'} | "
            f"{record.label or '-'} | {shown_name} | {shown_smiles} | "
            f"{record.mw or '-'} |"
        )

    # One bullet per reaction step
    sections.append("\n### Reaction Steps\n")
    for rxn in desc.steps:
        sections.append(
            f"- **Step {rxn.step_index}**: "
            f"R=[{_id_list(rxn.reactant_ids)}] → "
            f"P=[{_id_list(rxn.product_ids)}] | "
            f"reagents=[{_id_list(rxn.reagent_ids)}] | "
            f"conditions={rxn.conditions} | "
            f"yield={rxn.yield_text or '-'} | "
            f"arrow={rxn.arrow_style}"
        )

    # Narrative followed by the instructions for the reviewer
    sections.extend([
        f"\n### Current Narrative\n{desc.narrative}\n",
        "## Your Task\n",
        "Review the above and return a JSON correction object with "
        "any needed fixes. Only include fields that need changing.\n",
        "Correction format:\n```json",
        json.dumps(correction_template, indent=2),
        "```\n",
        "If the Tier 1 output is correct, return: `{}`\n",
    ])

    return "\n".join(sections)
385
+
386
+
387
+ # ---------------------------------------------------------------------------
388
+ # Apply corrections
389
+ # ---------------------------------------------------------------------------
390
+
391
def apply_corrections(desc: SchemeDescription,
                      corrections: Dict[str, Any]) -> SchemeDescription:
    """Apply LLM corrections to a SchemeDescription.

    Returns a new SchemeDescription with corrections applied.
    The original is not modified.

    Recognised keys of *corrections* (all optional):

    * ``content_type`` / ``topology`` — replace the corresponding attribute.
    * ``species_corrections`` — mapping of species id to a dict that may set
      ``text_category``, ``name``, ``smiles`` and ``label``.  Unknown species
      ids are ignored; a ``null`` mapping or a per-species entry that is not
      a dict is ignored as well, because this data comes straight from an
      LLM and is not guaranteed to be well-formed.
    * ``narrative_override`` — replaces the narrative.  When absent, the
      narrative is regenerated from the corrected data.

    Parameters
    ----------
    desc : SchemeDescription
        Tier 1 deterministic output.
    corrections : dict
        LLM correction dict (see module docstring for format).  An empty or
        ``None`` value yields an unmodified copy.

    Returns
    -------
    SchemeDescription
        Refined description.
    """
    # Independent copy via a to_dict()/from_dict() round-trip
    refined = SchemeDescription.from_dict(desc.to_dict())

    if not corrections:
        return refined

    # Scheme-level overrides
    for attr in ("content_type", "topology"):
        if attr in corrections:
            setattr(refined, attr, corrections[attr])

    # Per-species overrides
    sp_corr = corrections.get("species_corrections") or {}
    if isinstance(sp_corr, dict):
        for sp_id, fixes in sp_corr.items():
            sp = refined.species.get(sp_id)
            if sp is None or not isinstance(fixes, dict):
                continue
            for field in ("text_category", "name", "smiles", "label"):
                if field in fixes:
                    setattr(sp, field, fixes[field])

    # Narrative: explicit override wins, otherwise rebuild from refined data
    if "narrative_override" in corrections:
        refined.narrative = corrections["narrative_override"]
    else:
        # Imported lazily, only on the path that needs it.
        from .scheme_reader import _generate_narrative
        refined.narrative = _generate_narrative(refined)

    return refined
449
+
450
+
451
+ # ---------------------------------------------------------------------------
452
+ # Batch refinement: apply a corrections file to multiple schemes
453
+ # ---------------------------------------------------------------------------
454
+
455
def load_corrections_file(path: str) -> Dict[str, Dict[str, Any]]:
    """Read a UTF-8 JSON corrections file and return its parsed content.

    The file is expected to map source filenames to correction dicts:

        {
            "oleObject1.cdxml": { ... corrections ... },
            "oleObject2.cdxml": { ... corrections ... },
        }

    The parsed JSON is returned as-is; its structure is not validated here.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
466
+
467
+
468
def refine_scheme(desc: SchemeDescription,
                  corrections: Optional[Dict[str, Any]] = None) -> SchemeDescription:
    """Return *desc* refined by *corrections*, if there are any.

    With a non-empty corrections dict the work is delegated to
    ``apply_corrections()``, which returns a new object.  With ``None`` or
    an empty dict the very same *desc* object is handed back untouched.
    """
    if not corrections:
        return desc
    return apply_corrections(desc, corrections)
478
+
479
+
480
+ # ---------------------------------------------------------------------------
481
+ # LLM-quality narrative generation
482
+ # ---------------------------------------------------------------------------
483
+
484
# Reagent -> reaction type mapping (pattern, reaction_name, notes).
#
# ``_classify_reaction`` walks this list in order and takes the first entry
# whose pattern is found in the combined condition/reagent text AND whose
# reaction_name is not None, so earlier entries take precedence over later
# ones.  Entries with a None name never produce a classification from this
# table.  All patterns are compiled case-insensitively.
_REACTION_PATTERNS: List[Tuple[re.Pattern, Optional[str], str]] = [
    # Pd-catalysed cross-couplings
    (re.compile(r"Pd.*(?:dba|PPh3|dppf|dppp|OAc|Cl2)", re.I),
     None, "Pd-catalysed"),  # refined below
    # Buchwald-Hartwig: Pd + amine + base
    (re.compile(r"(?:BINAP|XPhos|SPhos|DavePhos|RuPhos|BrettPhos|JohnPhos"
                r"|XantPhos|t-?Bu[23]?P)", re.I),
     "Buchwald-Hartwig amination", "Pd/ligand system"),
    # Suzuki: boronic acid
    (re.compile(r"B\(OH\)2|boronic|Bpin|BF3K|potassium trifluoroborate", re.I),
     "Suzuki coupling", "boronic acid coupling partner"),
    # Sonogashira: alkyne + Pd/Cu
    (re.compile(r"(?:Sonogashira|CuI.*Pd|PdCl2.*CuI)", re.I),
     "Sonogashira coupling", ""),
    # Heck
    (re.compile(r"(?:Heck|acrylate.*Pd|Pd.*vinyl)", re.I),
     "Heck reaction", ""),
    # NBS bromination
    (re.compile(r"\bNBS\b", re.I),
     "NBS bromination", "electrophilic aromatic bromination"),
    # Boc deprotection
    (re.compile(r"\bTFA\b.*(?:DCM|CH2Cl2)|HCl.*(?:dioxane|Et2O|MeOH)|"
                r"Boc.*(?:deprot|remov)", re.I),
     "Boc deprotection", "acidic removal of tert-butoxycarbonyl"),
    # Cbz deprotection / hydrogenolysis
    (re.compile(r"H2.*Pd/?C|Pd/?C.*H2|hydrogenolysis|Cbz.*deprot", re.I),
     "hydrogenolysis", "Pd/C-catalysed H2 reduction"),
    # Amide coupling
    (re.compile(r"\b(?:HATU|HBTU|EDCI|EDC|DCC|T3P|COMU|PyBOP|TBTU|HOBt"
                r"|HOAt|TFFH|SOCl2.*amine|CDI)\b", re.I),
     "amide coupling", "peptide bond formation"),
    # Reductive amination
    (re.compile(r"NaBH(?:3CN|OAc|\(OAc\)3)|reductive amin", re.I),
     "reductive amination", "imine formation + reduction"),
    # Mitsunobu
    (re.compile(r"(?:DIAD|DEAD|DMAP).*PPh3|Mitsunobu", re.I),
     "Mitsunobu reaction", "stereoinversion of alcohol"),
    # Grignard
    (re.compile(r"\bMgBr\b|\bMgCl\b|Grignard", re.I),
     "Grignard addition", "organomagnesium addition"),
    # Wittig / HWE
    (re.compile(r"(?:Wittig|ylide|PPh3.*CHO|HWE|Horner)", re.I),
     "Wittig/HWE olefination", ""),
    # SNAr
    (re.compile(r"(?:SNAr|nucleophilic aromatic|K2CO3.*DMF|Cs2CO3.*DMF"
                r"|NaH.*DMF)", re.I),
     None, ""),  # needs context to distinguish from Buchwald
    # Reduction (general)
    (re.compile(r"\bLiAlH4\b|LiAlH\(OtBu\)3|NaBH4|DIBAL", re.I),
     "reduction", "hydride reduction"),
    # Oxidation
    (re.compile(r"\b(?:mCPBA|Dess.?Martin|Swern|TEMPO|PDC|PCC|Jones)\b", re.I),
     "oxidation", ""),
    # Halogenation
    (re.compile(r"\bNCS\b", re.I),
     "NCS chlorination", "electrophilic aromatic chlorination"),
    # Alkylation
    (re.compile(r"\b(?:NaH|K2CO3|Cs2CO3)\b.*(?:alkyl|benzyl|methyl|BnBr"
                r"|MeI|allyl)", re.I),
     "alkylation", "base-mediated alkylation"),
    # Ring closure / cyclisation
    (re.compile(r"(?:exo-trig|exo-dig|endo-trig|endo-dig|cycliz|ring.?clos"
                r"|lacton)", re.I),
     "cyclisation", "intramolecular ring closure"),
]
550
+
551
+
552
+ def _build_reaction_smiles(step, species: Dict[str, SpeciesRecord]) -> Optional[str]:
553
+ """Build reaction SMILES from a step's species references.
554
+
555
+ Constructs ``R1.R2.reagent1>>P1`` from the step's reactant, reagent,
556
+ and product species. Only species with SMILES are included.
557
+
558
+ Returns None if either side has no SMILES.
559
+ """
560
+ lhs = []
561
+ for sid in list(step.reactant_ids) + list(step.reagent_ids):
562
+ sp = species.get(sid)
563
+ if sp and sp.smiles:
564
+ lhs.append(sp.smiles)
565
+ rhs = []
566
+ for sid in step.product_ids:
567
+ sp = species.get(sid)
568
+ if sp and sp.smiles:
569
+ rhs.append(sp.smiles)
570
+ if not lhs or not rhs:
571
+ return None
572
+ return ".".join(lhs) + ">>" + ".".join(rhs)
573
+
574
+
575
+ def _classify_reaction(condition_text_raw: List[str],
576
+ reagent_species: List[SpeciesRecord],
577
+ desc: SchemeDescription,
578
+ ml_data: Optional[Dict] = None) -> Optional[str]:
579
+ """Try to classify a reaction step from its conditions/reagents.
580
+
581
+ When *ml_data* is supplied (from RXN Insight via ``enrich_steps``), its
582
+ ``reaction_name`` is preferred. The regex heuristic still runs as a
583
+ cross-check and fallback.
584
+ """
585
+ # --- ML classification (preferred when available) ---
586
+ ml_name = None
587
+ if ml_data:
588
+ ml_name = ml_data.get("reaction_name") or None
589
+ # rxn-insight sometimes returns generic class as name; ignore those
590
+ if ml_name and ml_name.lower() in ("unrecognized", "", "other"):
591
+ ml_name = None
592
+
593
+ # Combine all text for pattern matching
594
+ all_text = " ".join(condition_text_raw)
595
+ for sp in reagent_species:
596
+ if sp.name:
597
+ all_text += " " + sp.name
598
+ if sp.smiles:
599
+ all_text += " " + sp.smiles
600
+
601
+ # Check for Pd + specific ligand patterns first (Buchwald vs Suzuki)
602
+ has_pd = bool(re.search(r"Pd", all_text))
603
+ has_boronic = bool(re.search(r"B\(OH\)2|boronic|Bpin", all_text, re.I))
604
+ has_amine = any(
605
+ (sp.smiles and re.search(r"N[^a-z]|NH", sp.smiles or ""))
606
+ or (sp.name and re.search(
607
+ r"morpholin|piperid|pyrrolid|piperazin|amine|aniline|indol",
608
+ sp.name or "", re.I))
609
+ for sp in reagent_species
610
+ )
611
+ has_coupling_ligand = bool(re.search(
612
+ r"BINAP|XPhos|SPhos|DavePhos|RuPhos|BrettPhos|dppf|dppp", all_text, re.I))
613
+
614
+ regex_name = None
615
+ if has_pd and has_boronic:
616
+ regex_name = "Suzuki coupling"
617
+ elif has_pd and has_coupling_ligand and has_amine:
618
+ regex_name = "Buchwald-Hartwig amination"
619
+ elif has_pd and has_coupling_ligand:
620
+ regex_name = "Pd-catalysed cross-coupling"
621
+ else:
622
+ # Pattern-based classification
623
+ for pat, name, _notes in _REACTION_PATTERNS:
624
+ if pat.search(all_text) and name:
625
+ regex_name = name
626
+ break
627
+ # Fallback: check if it's a coupling with base
628
+ if regex_name is None and has_pd:
629
+ regex_name = "Pd-catalysed transformation"
630
+
631
+ # Prefer ML name when available; regex serves as cross-check
632
+ if ml_name and regex_name:
633
+ return regex_name # trust regex for medchem-specific names
634
+ if regex_name:
635
+ return regex_name
636
+ if ml_name:
637
+ return ml_name
638
+ return None
639
+
640
+
641
# Compound label that ended up as SMILES (e.g. [2.21], [(R,S,S)-5.2], [5.1]).
# The whole string must be one bracketed token whose first inner character
# is a digit or "(", followed only by word characters, ".", ",", "-",
# parentheses, "/" or spaces.  Used by ``_species_display`` to show such a
# value as a compound label rather than as a SMILES string.
_COMPOUND_LABEL_RE = re.compile(
    r'^\[[\d(][\w.,\-()/ ]*\]$'
)
645
+
646
+
647
+ def _species_display(sp: SpeciesRecord) -> str:
648
+ """Format a species for narrative display.
649
+
650
+ Priority: label > IUPAC name > common name > formula > SMILES.
651
+ SMILES is only shown as a fallback when no readable name is available.
652
+ Compound labels disguised as SMILES (e.g. ``[2.21]``) are shown as
653
+ ``compound 2.21`` instead of ``[SMILES: ...]``.
654
+ """
655
+ parts = []
656
+ if sp.label:
657
+ parts.append(f"compound {sp.label}")
658
+ # Add IUPAC name as parenthetical when available
659
+ iupac = getattr(sp, "iupac_name", None)
660
+ if iupac:
661
+ parts.append(f"({iupac})")
662
+ elif getattr(sp, "iupac_name", None):
663
+ parts.append(sp.iupac_name)
664
+ elif sp.name and len(sp.name) < 40:
665
+ parts.append(sp.name)
666
+ elif sp.formula:
667
+ parts.append(sp.formula)
668
+ # Only show SMILES as last-resort identification when no readable name
669
+ has_readable_name = bool(parts)
670
+ if sp.smiles and not has_readable_name:
671
+ # Detect compound labels disguised as SMILES
672
+ if _COMPOUND_LABEL_RE.match(sp.smiles):
673
+ label_text = sp.smiles.strip('[]')
674
+ parts.append(f"compound {label_text}")
675
+ else:
676
+ parts.append(f"[SMILES: {sp.smiles}]")
677
+ if sp.mw and not sp.label:
678
+ parts.append(f"[MW {sp.mw:.1f}]")
679
+ return " ".join(parts) if parts else sp.id
680
+
681
+
682
def _parse_step_reagents(step, species: Dict[str, SpeciesRecord]) -> Dict[str, list]:
    """Decompose all reagent and condition information into categorised bins.

    Collects data from, in this order:

    1. Fragment reagent species (drawn structures above/below arrow)
    2. Text reagent species (multi-line text blocks with reagent names,
       solvents, conditions, and workup instructions)
    3. Parsed ``step.conditions`` (extracted physical conditions)

    Names are de-duplicated case-insensitively across all three sources,
    so the first occurrence of a name wins.

    Parameters
    ----------
    step
        Step object; ``reagent_ids`` and ``conditions`` are read.
    species
        Species registry keyed by species id.  ``element_type``, ``name``
        and ``smiles`` (and ``iupac_name`` when present) are read.

    Returns
    -------
    dict
        ``catalysts``  – [(display_name, equiv_or_loading), ...]
        ``ligands``    – [(display_name, equiv_or_loading), ...]
        ``bases``      – [(display_name, equiv_or_loading), ...]
        ``reagents``   – [(display_name, equiv_or_loading), ...] (catch-all)
        ``solvents``   – [display_name, ...]
        ``conditions`` – [str, ...] (temperature, time, atmosphere, ...)
        ``workup``     – [str, ...] (quench/workup instructions)
    """
    from ..resolve.reagent_db import get_reagent_db
    from .reaction_parser import _is_condition_token
    db = get_reagent_db()

    cats: Dict[str, list] = {
        "catalysts": [], "ligands": [], "bases": [], "reagents": [],
        "solvents": [], "conditions": [], "workup": [],
    }
    # Role → bin mapping
    _ROLE_BIN = {
        "catalyst": "catalysts",
        "ligand": "ligands",
        "base": "bases",
        "lewis_acid": "catalysts",
        "solvent": "solvents",
        "coupling_reagent": "reagents",
        "reducing_agent": "reagents",
        "reductant": "reagents",
        "oxidant": "reagents",
        "halogenating_agent": "reagents",
        "fluorinating_agent": "reagents",
        "borylating_agent": "reagents",
        "activating_agent": "reagents",
        "deprotecting_agent": "reagents",
        "protecting_group": "reagents",
        "drying_agent": "reagents",
        "acid": "reagents",
        "additive": "reagents",
        "reagent": "reagents",
    }

    # Patterns are compiled once per call rather than once per token.

    # Yield tokens ("72%", "95% yield", "72% (2 steps)", "quant.").  The
    # percent figure must stand alone or be followed by a yield-style
    # qualifier, so percent-prefixed reagents such as "10% Pd/C" or
    # "5% HCl" are NOT mistaken for yields and silently dropped.
    _yield_pct_re = re.compile(
        r"^\d+\.?\d*\s*%\s*"
        r"(?:$|\(|(?:isolated\s+)?yield\b|over\b|brsm\b|ee\b|de\b|conv)",
        re.IGNORECASE)
    _yield_quant_re = re.compile(r"^quant\.?$", re.IGNORECASE)

    # Reaction name labels (e.g. "Rieche formylation", "Mitsunobu")
    _rxn_name_patterns = re.compile(
        r"^(?:Rieche|Mitsunobu|Swern|Wittig|Grignard|Heck|Suzuki|"
        r"Buchwald|Sonogashira|Negishi|Stille|Kumada|Chan.Lam|"
        r"Ullmann|Goldberg|Appel|Gabriel|Finkelstein|"
        r"Curtius|Arndt.Eistert|Barton|Dess.Martin|"
        r"Williamson|Fischer|Mannich|Strecker|Reformatsky)\b",
        re.IGNORECASE)

    # Trailing "(1.5 eq)", "(5 mol%)", ... annotation.  A bare "(cat.)" is
    # accepted as well: catalytic loadings are normally written without a
    # number, and the name must be stripped of it for the DB lookup to hit.
    _loading_re = re.compile(
        r"^(.+?)\s*\(((?:\d+\.?\d*\s*(?:eq\.?|equiv\.?|mol\s*%|cat\.)"
        r"|cat\.?))\)\s*$",
        re.IGNORECASE)

    # Temperature range patterns not caught by _is_condition_token:
    # "-78 to RT", "0 C to RT", "-78 C to rt", "-78°C to RT"
    _temp_range_re = re.compile(
        r"^-?\d+\s*[°\u00b0]?\s*C?\s+to\s+(?:r\.?t\.?|-?\d+\s*[°\u00b0]?\s*C?)\s*$",
        re.IGNORECASE)

    _workup_re = re.compile(r"^then\b", re.IGNORECASE)

    # Solvent ratio such as "dioxane/H2O (3:1)"
    _solvent_ratio_re = re.compile(
        r"^[A-Za-z0-9,\-]+(/[A-Za-z0-9,\-]+)+(\s*\(\d+:\d+\))?$")

    # Track names we've already added (avoid duplicates)
    _seen_names: set = set()

    def _add_token(raw_token: str) -> None:
        """Classify a single token and add to the right bin."""
        token = raw_token.strip()
        if not token:
            return

        # Skip yield tokens and reaction name labels
        if _yield_pct_re.match(token) or _yield_quant_re.match(token):
            return
        if _rxn_name_patterns.search(token):
            return

        # Strip equiv/loading annotations for lookup, but preserve for display
        equiv_str = ""
        m = _loading_re.match(token)
        if m:
            token_clean = m.group(1).strip()
            equiv_str = m.group(2).strip()
        else:
            token_clean = token

        # Normalise key for lookup
        lookup_key = token_clean.lower().strip()
        if lookup_key in _seen_names:
            return
        _seen_names.add(lookup_key)

        # Physical condition (either recognised by the parser or a
        # temperature range)
        if _is_condition_token(token_clean) or _temp_range_re.match(token_clean):
            cats["conditions"].append(token_clean)
            return

        # Workup detection — keeps the full token, including any loading
        if _workup_re.match(token_clean):
            cats["workup"].append(token)
            return

        # Reagent DB lookup
        role = db.role_for_name(lookup_key)
        entry = db.entry_for_name(lookup_key)
        display = entry.get("display", token_clean) if entry else token_clean

        if role:
            bin_name = _ROLE_BIN.get(role, "reagents")
            if bin_name == "solvents":
                cats["solvents"].append(display)
            else:
                cats[bin_name].append((display, equiv_str))
        elif _solvent_ratio_re.match(token_clean):
            # Unknown name that looks like a solvent mixture
            cats["solvents"].append(token_clean)
        else:
            # Unknown — treat as a reagent, with its loading if one was given
            cats["reagents"].append((display, equiv_str))

    # 1. Fragment reagent species (drawn structures)
    for rid in step.reagent_ids:
        sp = species.get(rid)
        if not sp or sp.element_type != "fragment":
            continue

        # Build best display name: reagent_db display > IUPAC > name > SMILES
        display_name = None
        role = None

        # Try reagent_db by name first
        if sp.name:
            role = db.role_for_name(sp.name.lower())
            entry = db.entry_for_name(sp.name.lower())
            if entry:
                display_name = entry.get("display", sp.name)

        # Try reagent_db by SMILES
        if not role and sp.smiles:
            role = db.role_for_smiles(sp.smiles)
            sr = db.smiles_role_display(sp.smiles)
            if sr:
                if not display_name:
                    display_name = sr[1]
                if not role:
                    role = sr[0]

        # Fallback display: IUPAC > name > short SMILES > id
        if not display_name:
            display_name = (
                getattr(sp, "iupac_name", None)
                or sp.name
                or (sp.smiles if sp.smiles and len(sp.smiles) <= 40 else None)
                or sp.id
            )

        lookup_key = display_name.lower().strip()
        if lookup_key in _seen_names:
            continue
        _seen_names.add(lookup_key)

        bin_name = _ROLE_BIN.get(role, "reagents") if role else "reagents"
        if bin_name == "solvents":
            cats["solvents"].append(display_name)
        else:
            cats[bin_name].append((display_name, ""))

    # 2. Text reagent species (multi-line text blocks)
    for rid in step.reagent_ids:
        sp = species.get(rid)
        if not sp or sp.element_type != "text":
            continue
        if not sp.name:
            continue
        # Split multi-line block into individual tokens
        for line in sp.name.split("\n"):
            line = line.strip()
            if not line:
                continue
            # Split on comma/semicolon (but protect names like "1,4-dioxane"):
            # if the whole line is a known name, keep it; else try splitting
            if db.entry_for_name(line.lower()):
                _add_token(line)
                continue
            # A line without separators splits into itself
            for part in re.split(r"[;,]\s*", line):
                _add_token(part)

    # 3. Physical conditions from parsed step.conditions
    for cond in step.conditions:
        cond_lower = cond.lower().strip()
        if cond_lower not in _seen_names:
            _seen_names.add(cond_lower)
            cats["conditions"].append(cond)

    return cats
900
+
901
+
902
def _format_conditions(step, species: Dict[str, SpeciesRecord]) -> str:
    """Render a step's reagents and conditions as one comma-separated line.

    The structured bins come from ``_parse_step_reagents``; they are
    flattened in the fixed order catalysts, ligands, bases, reagents,
    solvents, conditions, followed by the yield when ``step.yield_text``
    is set.  Workup entries are not included.  Returns the placeholder
    ``"(conditions not specified)"`` when nothing was collected.
    """
    bins = _parse_step_reagents(step, species)

    # (name, loading) bins: show the loading in parentheses when present.
    pieces = [
        f"{label} ({amount})" if amount else label
        for bin_key in ("catalysts", "ligands", "bases", "reagents")
        for label, amount in bins[bin_key]
    ]
    # Plain-string bins are appended as they are.
    pieces += bins["solvents"]
    pieces += bins["conditions"]

    if step.yield_text:
        pieces.append(f"{step.yield_text} yield")

    if not pieces:
        return "(conditions not specified)"
    return ", ".join(pieces)
923
+
924
+
925
def analyze_bond_changes(mapped_rxn: str) -> Dict[str, list]:
    """Analyze bond changes from an atom-mapped reaction SMILES.

    Uses RDKit to compare bonds between mapped atoms in reactants vs
    products.  Only bonds whose two atoms both carry a non-zero atom-map
    number are compared; bonds are keyed by the (low, high) map-number pair.

    Parameters
    ----------
    mapped_rxn : str
        Reaction SMILES of the form ``reactants>>products``.

    Returns
    -------
    dict
        ``formed`` : list of (sym1, map1, sym2, map2, bond_order)
        ``broken`` : list of (sym1, map1, sym2, map2, bond_order)
        ``changed_order`` : list of (sym1, map1, sym2, map2, old_order, new_order)
        ``leaving`` : list of (group_symbol, nbr_symbol, nbr_map)

        An empty dict is returned when the input is empty, does not contain
        exactly one ``>>`` separator, either side fails to parse, or RDKit
        is not installed.
    """
    # Validate the string before paying for the RDKit import.
    if not mapped_rxn:
        return {}
    sides = mapped_rxn.split(">>")
    if len(sides) != 2:
        return {}

    try:
        from rdkit import Chem
    except ImportError:
        return {}

    reactants = Chem.MolFromSmiles(sides[0])
    products = Chem.MolFromSmiles(sides[1])
    # MolFromSmiles signals a parse failure by returning None.
    if reactants is None or products is None:
        return {}

    def _symbols_by_map(mol):
        # map number -> element symbol, for mapped atoms only
        return {a.GetAtomMapNum(): a.GetSymbol()
                for a in mol.GetAtoms() if a.GetAtomMapNum()}

    def _bonds_by_map(mol):
        # (low map, high map) -> bond order, for bonds between mapped atoms
        idx_to_map = {a.GetIdx(): a.GetAtomMapNum() for a in mol.GetAtoms()}
        bonds = {}
        for bond in mol.GetBonds():
            m1 = idx_to_map.get(bond.GetBeginAtomIdx(), 0)
            m2 = idx_to_map.get(bond.GetEndAtomIdx(), 0)
            if m1 and m2:
                bonds[(min(m1, m2), max(m1, m2))] = bond.GetBondTypeAsDouble()
        return bonds

    # Build each lookup table once instead of once per reported atom.
    r_sym = _symbols_by_map(reactants)
    p_sym = _symbols_by_map(products)
    r_bonds = _bonds_by_map(reactants)
    p_bonds = _bonds_by_map(products)

    formed = [
        (p_sym.get(k[0], "?"), k[0], p_sym.get(k[1], "?"), k[1], p_bonds[k])
        for k in sorted(set(p_bonds) - set(r_bonds))
    ]
    broken = [
        (r_sym.get(k[0], "?"), k[0], r_sym.get(k[1], "?"), k[1], r_bonds[k])
        for k in sorted(set(r_bonds) - set(p_bonds))
    ]
    changed = [
        (p_sym.get(k[0], "?"), k[0], p_sym.get(k[1], "?"), k[1],
         r_bonds[k], p_bonds[k])
        for k in sorted(set(r_bonds) & set(p_bonds))
        if r_bonds[k] != p_bonds[k]
    ]

    # Leaving groups: unmapped atoms bonded to mapped atoms in reactants.
    # Each unmapped atom is reported at most once, against its first
    # mapped neighbour.
    leaving = []
    for atom in reactants.GetAtoms():
        if atom.GetAtomMapNum() != 0:
            continue
        for nbr in atom.GetNeighbors():
            mn = nbr.GetAtomMapNum()
            if mn:
                leaving.append((atom.GetSymbol(), nbr.GetSymbol(), mn))
                break

    return {
        "formed": formed,
        "broken": broken,
        "changed_order": changed,
        "leaving": leaving,
    }
1011
+
1012
+
1013
def describe_transformation(changes: Dict[str, list],
                            max_changes: int = 5) -> str:
    """Turn a bond-change analysis into a short English summary.

    *changes* is the dict produced by ``analyze_bond_changes`` (keys
    ``formed``, ``broken``, ``changed_order``, ``leaving``; missing keys
    count as empty).  Phrases are joined with ``"; "``.

    When formed + broken + changed-order entries number more than
    *max_changes* the mapping is treated as unreliable: only the first two
    single-bond formations and the first two leaving groups are reported,
    or a "complex rearrangement" note if neither exists.  Otherwise every
    formed, broken and re-ordered bond is listed, followed by all leaving
    groups.

    Example: "C-N bond formed; Br displaced from C"

    Returns an empty string for an empty *changes* dict or when there is
    nothing to report.
    """
    if not changes:
        return ""

    made = changes.get("formed", [])
    cut = changes.get("broken", [])
    reordered = changes.get("changed_order", [])
    departed = changes.get("leaving", [])
    n_changes = len(made) + len(cut) + len(reordered)

    order_words = {
        1.0: "single", 1.5: "aromatic", 2.0: "double", 3.0: "triple",
    }

    def _displacements(groups):
        return [f"{grp} displaced from {anchor}"
                for grp, anchor, _mapnum in groups]

    if n_changes > max_changes:
        # Noisy mapping: keep only the most telling facts.
        single_bonds = [rec for rec in made if rec[4] == 1.0]
        phrases = [f"{left}-{right} bond formed"
                   for left, _ml, right, _mr, _order in single_bonds[:2]]
        phrases += _displacements(departed[:2])
        if not phrases:
            phrases = [f"complex rearrangement ({n_changes} bond changes)"]
        return "; ".join(phrases)

    phrases = []
    for verb, records in (("formed", made), ("broken", cut)):
        for left, _ml, right, _mr, order in records:
            word = order_words.get(order, f"order-{order}")
            phrases.append(f"{left}-{right} {word} bond {verb}")

    for left, _ml, right, _mr, before, after in reordered:
        was = order_words.get(before, str(before))
        now = order_words.get(after, str(after))
        phrases.append(f"{left}-{right} bond changed {was} -> {now}")

    phrases += _displacements(departed)
    return "; ".join(phrases)
1071
+
1072
+
1073
def generate_llm_narrative(desc: SchemeDescription,
                           ml_enrichment: Optional[Dict[int, Dict]] = None,
                           ) -> str:
    """Generate a chemist-quality natural language narrative.

    This function produces Layer 3 output: readable text that an LLM can
    consume for chemical reasoning, grounded in SMILES from the species
    registry.

    Two output shapes exist:

    * Schemes without steps get a flat listing: a header line, one bullet
      per fragment species (generic scaffolds flagged), and the first line
      of every named text species under "Annotations:".
    * Schemes with steps get an opening line followed by one block per
      step (header, reactant -> product line, optional transformation
      diffs, reagents, solvent, conditions, workup, ML grounding, bond
      changes) and, when ``desc.scope_entries`` is non-empty, a
      "Substrate scope:" section.

    Parameters
    ----------
    desc : SchemeDescription
        Parsed scheme (Tier 1 or Tier 2).
    ml_enrichment : dict, optional
        Per-step ML grounding data keyed by step_index. Each entry is
        the dict returned by ``classify_roles_enriched()`` (RXNMapper +
        rxn-insight). Keys include ``reaction_class``, ``reaction_name``,
        ``confidence``, ``byproducts``, ``components``.  The key
        ``mapped_rxn`` is also read here to derive the bond-change line.

    Returns
    -------
    str
        Natural language narrative with embedded SMILES for grounding.
        Lines are joined with newlines; for stepped schemes trailing
        whitespace is stripped.
    """
    # Treat a missing enrichment mapping as "no ML data for any step".
    ml_enrichment = ml_enrichment or {}
    if not desc.steps:
        # No-step schemes (target arrays, etc.)
        n_frag = sum(1 for sp in desc.species.values()
                     if sp.element_type == "fragment")
        n_text = sum(1 for sp in desc.species.values()
                     if sp.element_type == "text")
        if n_frag == 0 and n_text == 0:
            return "Empty scheme with no chemical content detected."

        # Header wording for step-less schemes; note that "synthesis"
        # maps to a different label here than in the stepped case below.
        ctype_label = {
            "target_array": "Target structure array",
            "sar_design": "SAR design diagram",
            "synthesis": "Structure collection",
        }.get(desc.content_type or "", "Non-reaction scheme")

        parts = [f"{ctype_label} containing {n_frag} structure(s)."]

        for sp in desc.species.values():
            if sp.element_type != "fragment":
                continue
            display = _species_display(sp)
            # Flag generic scaffolds (contain [*] dummy atoms from R-groups)
            is_generic = sp.smiles and "[*]" in sp.smiles
            if is_generic:
                # Check if variable position info is in the name
                if sp.name and "variable:" in sp.name:
                    parts.append(f" - {display} [generic scaffold — {sp.name}]")
                else:
                    parts.append(f" - {display} [generic scaffold]")
            else:
                parts.append(f" - {display}")

        # Include text annotations if present
        text_sps = [sp for sp in desc.species.values()
                    if sp.element_type == "text" and sp.name]
        if text_sps:
            parts.append("")
            parts.append("Annotations:")
            for sp in text_sps:
                # Only the first line of a multi-line text block is shown.
                first_line = sp.name.split("\n")[0].strip()
                parts.append(f" - {first_line}")

        return "\n".join(parts)

    # Build narrative
    ctype_label = {
        "synthesis": "Synthetic route",
        "sar_design": "SAR exploration",
        "biological_pathway": "Biological pathway",
        "literature_comparison": "Literature method comparison",
        "composite": "Composite methodology overview",
        "investigation": "Methodology investigation",
    }.get(desc.content_type or "", "Reaction scheme")

    # Opening line
    # Unknown topologies yield an empty adjective, which is then omitted.
    topo_adj = {
        "linear": "sequential",
        "divergent": "divergent",
        "convergent": "convergent",
        "parallel": "parallel",
        "mixed": "multi-pathway",
    }.get(desc.topology, "")

    # Identify final product(s) for the opening
    # (taken from the last entry of desc.steps)
    final_products = []
    if desc.steps:
        last_step = desc.steps[-1]
        for pid in last_step.product_ids:
            sp = desc.species.get(pid)
            if sp:
                final_products.append(_species_display(sp))

    opening = f"{ctype_label}"
    if desc.num_steps > 0:
        opening += f" ({desc.num_steps} step{'s' if desc.num_steps > 1 else ''}"
        if topo_adj:
            opening += f", {topo_adj}"
        opening += ")"
    # Only synthesis-like (or untyped) schemes name a target; just the
    # first product of the last step is mentioned.
    if final_products and desc.content_type in ("synthesis", "", None):
        opening += f" toward {final_products[0]}"
    opening += "."

    parts = [opening, ""]

    # Step-by-step description
    for step in desc.steps:
        # None when no enrichment entry exists for this step.
        ml_data = ml_enrichment.get(step.step_index)

        # Classify reaction (regex + optional ML)
        reagent_sps = [desc.species[rid] for rid in step.reagent_ids
                       if rid in desc.species]
        # Also check text species in reactants for amine detection
        all_step_sps = reagent_sps + [
            desc.species[rid] for rid in step.reactant_ids
            if rid in desc.species]
        rxn_type = _classify_reaction(
            step.condition_text_raw, all_step_sps, desc,
            ml_data=ml_data)

        # Reactant display
        r_names = []
        for rid in step.reactant_ids:
            sp = desc.species.get(rid)
            if sp:
                r_names.append(_species_display(sp))
        r_str = " + ".join(r_names) if r_names else ""

        # Product display
        p_names = []
        for pid in step.product_ids:
            sp = desc.species.get(pid)
            if sp:
                p_names.append(_species_display(sp))
        p_str = " + ".join(p_names) if p_names else ""

        # Detect protocol-only steps (no substrate/product drawn)
        _is_protocol_step = (
            not step.reactant_ids and not step.product_ids
            and desc.content_type in (
                "composite", "literature_comparison", "investigation"))

        # Step header
        # step_index is zero-based; the narrative numbers steps from 1.
        step_num = step.step_index + 1
        if rxn_type:
            step_line = f"Step {step_num} -- {rxn_type}:"
        elif _is_protocol_step:
            step_line = f"Method {step_num}:"
        else:
            step_line = f"Step {step_num}:"

        # Arrow annotation
        if step.arrow_style == "failed":
            step_line += " [FAILED]"
        elif step.arrow_style == "dashed":
            step_line += " [tentative/planned]"

        parts.append(step_line)

        # Transformation description with structured conditions
        cats = _parse_step_reagents(step, desc.species)

        # Reactant → product line (or protocol description for method-only steps)
        if _is_protocol_step:
            # No substrate/product drawn — describe the protocol directly
            desc_line = " Protocol:"
        elif r_str and p_str:
            desc_line = f" {r_str} -> {p_str}"
        elif r_str:
            desc_line = f" {r_str} -> (product)"
        elif p_str:
            desc_line = f" (starting material) -> {p_str}"
        else:
            desc_line = f" (starting material) -> (product)"
        parts.append(desc_line)

        # Transformation diff (when aligned names show what changed)
        # `_alignment_diffs` is an optional attribute looked up by
        # (reactant_id, product_id); one line is emitted per pair found.
        _diffs = getattr(desc, '_alignment_diffs', {})
        if _diffs:
            for rid in step.reactant_ids:
                for pid in step.product_ids:
                    diff_str = _diffs.get((rid, pid))
                    if diff_str:
                        # Replace " -> " with " → " for readability
                        diff_display = diff_str.replace(" -> ", " \u2192 ")
                        parts.append(f" Transformation: {diff_display}")

        # Reagents line (catalysts, ligands, bases, coupling/reducing agents)
        # Each entry is (name, loading); the loading is shown in
        # parentheses only when non-empty.
        reagent_parts = []
        for name, equiv in cats["catalysts"]:
            reagent_parts.append(f"{name} ({equiv})" if equiv else name)
        for name, equiv in cats["ligands"]:
            reagent_parts.append(f"{name} ({equiv})" if equiv else name)
        for name, equiv in cats["bases"]:
            reagent_parts.append(f"{name} ({equiv})" if equiv else name)
        for name, equiv in cats["reagents"]:
            reagent_parts.append(f"{name} ({equiv})" if equiv else name)
        if reagent_parts:
            parts.append(f" Reagents: {', '.join(reagent_parts)}")

        # Solvent line
        if cats["solvents"]:
            parts.append(f" Solvent: {', '.join(cats['solvents'])}")

        # Physical conditions line (temp, time, atmosphere)
        cond_parts = list(cats["conditions"])
        if step.yield_text:
            cond_parts.append(f"{step.yield_text} yield")
        if cond_parts:
            parts.append(f" Conditions: {', '.join(cond_parts)}")
        elif not reagent_parts and not cats["solvents"]:
            # Placeholder only when the step has nothing else to show.
            parts.append(" Conditions: (not specified)")

        # Workup line
        if cats["workup"]:
            parts.append(f" Workup: {'; '.join(cats['workup'])}")

        # ML grounding block (when enrichment available)
        if ml_data:
            ml_parts = []
            rc = ml_data.get("reaction_class")
            rn = ml_data.get("reaction_name")
            conf = ml_data.get("confidence", 0)
            if rc or rn:
                # The reaction name is preferred over the broader class.
                label = rn or rc
                ml_parts.append(f'rxn-insight="{label}"')
            if conf:
                ml_parts.append(f"atom-map confidence={conf:.2f}")
            bp = ml_data.get("byproducts", [])
            if bp:
                ml_parts.append(f"byproducts=[{', '.join(bp)}]")
            if ml_parts:
                parts.append(f" [ML: {'; '.join(ml_parts)}]")

            # Tier B: bond-change description from atom maps
            mapped_rxn = ml_data.get("mapped_rxn", "")
            if mapped_rxn:
                changes = analyze_bond_changes(mapped_rxn)
                xform_desc = describe_transformation(changes)
                if xform_desc:
                    parts.append(f" Bond changes: {xform_desc}")

        # Blank line separates consecutive step blocks.
        parts.append("")

    # Substrate scope table section (when scope entries detected)
    if hasattr(desc, 'scope_entries') and desc.scope_entries:
        parts.append("Substrate scope:")
        for entry in desc.scope_entries:
            sp = desc.species.get(entry.species_id) if entry.species_id else None
            display = _species_display(sp) if sp else None

            # Entry heading preference: explicit label, then the species
            # display text, then the raw entry id.
            line_parts = []
            if entry.label:
                line_parts.append(entry.label)
            elif display:
                line_parts.append(display)
            else:
                line_parts.append(entry.entry_id)

            if entry.conditions_variant:
                line_parts.append(f"({entry.conditions_variant})")
            if entry.yield_text:
                line_parts.append(f"— {entry.yield_text}")
            if entry.mass_text:
                line_parts.append(f"({entry.mass_text})")
            if entry.notes:
                line_parts.append(f"[{entry.notes}]")

            parts.append(f" - {' '.join(line_parts)}")
        parts.append("")

    # rstrip() removes the trailing blank separator line(s).
    return "\n".join(parts).rstrip()
1349
+
1350
+
1351
+ # ---------------------------------------------------------------------------
1352
+ # CLI
1353
+ # ---------------------------------------------------------------------------
1354
+
1355
def main(argv: Optional[list] = None) -> int:
    """Command-line entry point for ``scheme_refine``.

    Sub-commands:

    ``prompt INPUT [--image PATH]``
        Load the Tier 1 JSON and print the LLM refinement prompt.
    ``apply INPUT CORRECTIONS [-o OUTPUT]``
        Load the Tier 1 JSON and a corrections JSON, apply the
        corrections, and write the refined scheme to OUTPUT (reporting
        the path on stderr) or, without ``-o``, to stdout as UTF-8 JSON.

    *argv* is handed to ``argparse`` unchanged (``None`` means
    ``sys.argv[1:]``).  Returns 0 on success, or 1 after printing the help
    text when no sub-command was given.
    """
    cli = argparse.ArgumentParser(
        prog="scheme_refine",
        description="LLM refinement of scheme_reader output.",
    )
    commands = cli.add_subparsers(dest="command")

    # prompt subcommand
    prompt_cmd = commands.add_parser(
        "prompt", help="Generate refinement prompt for LLM")
    prompt_cmd.add_argument("input", help="Tier 1 JSON file")
    prompt_cmd.add_argument("--image", help="Path to rendered scheme image")

    # apply subcommand
    apply_cmd = commands.add_parser(
        "apply", help="Apply corrections to Tier 1 output")
    apply_cmd.add_argument("input", help="Tier 1 JSON file")
    apply_cmd.add_argument("corrections", help="Corrections JSON file")
    apply_cmd.add_argument("-o", "--output", help="Output refined JSON")

    opts = cli.parse_args(argv)

    # Guard: without a recognised sub-command there is nothing to do.
    if opts.command not in ("prompt", "apply"):
        cli.print_help()
        return 1

    # Both sub-commands start from the same Tier 1 description.
    scheme = SchemeDescription.from_json(opts.input)

    if opts.command == "prompt":
        print(generate_prompt(scheme, image_path=opts.image))
        return 0

    with open(opts.corrections, "r", encoding="utf-8") as handle:
        fixes = json.load(handle)
    result = apply_corrections(scheme, fixes)

    if opts.output:
        result.to_json(opts.output)
        print(f"Written to {opts.output}", file=sys.stderr)
        return 0

    # Emit raw UTF-8 bytes so the JSON is independent of console encoding.
    payload = json.dumps(result.to_dict(), indent=2, ensure_ascii=False)
    sys.stdout.buffer.write(payload.encode("utf-8"))
    sys.stdout.buffer.write(b"\n")
    return 0
1401
+
1402
+
1403
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())