cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1160 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ scheme_reader_verify.py — Visual verification report for scheme_reader output.
4
+
5
+ Generates an HTML report that shows each CDXML scheme as a rendered image
6
+ alongside scheme_reader's parsed narrative, species list, and step graph.
7
+ This lets a chemist visually confirm that the parser understood the scheme
8
+ correctly.
9
+
10
+ Two modes:
11
+ 1. Directory mode: point at a folder of .cdxml files
12
+ 2. Document mode: point at a .pptx or .docx; objects are extracted first
13
+
14
+ CLI:
15
+ python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_verify dir_of_cdxml/ -o report.html
16
+ python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_verify slides.pptx -o report.html
17
+ python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_verify slides.pptx thesis.docx -o report.html
18
+ python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_verify dir/ --render # also renders images via ChemDraw
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import base64
25
+ import json
26
+ import os
27
+ import sys
28
+ import tempfile
29
+ import traceback
30
+ from pathlib import Path
31
+ from typing import List, Optional, Tuple
32
+
33
+ from cdxml_toolkit.perception.scheme_reader import read_scheme, SchemeDescription
34
+ from cdxml_toolkit.perception.scheme_refine import (
35
+ apply_corrections, generate_llm_narrative, _build_reaction_smiles,
36
+ enrich_aligned_names,
37
+ )
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # ML enrichment (optional — requires chem-pipeline's experiments modules)
41
+ # ---------------------------------------------------------------------------
42
_ML_AVAILABLE = False


def _try_load_ml():
    """Report whether the optional ML role-classification helper is importable.

    If a ``chem-pipeline`` directory exists under the user's home directory
    and is not yet on ``sys.path``, it is inserted at the front so that the
    ``experiments`` modules inside it can be imported.  A successful import
    of ``classify_roles_enriched`` is remembered in ``_ML_AVAILABLE`` and
    short-circuits later calls; a failed import is retried on every call.

    Returns:
        True if the import succeeded (now or on a previous call), else False.
    """
    global _ML_AVAILABLE
    if not _ML_AVAILABLE:
        # chem-pipeline experiments/ is not a proper package — add path
        home_dir = os.path.expanduser("~")
        checkout = os.path.normpath(os.path.join(home_dir, "chem-pipeline"))
        if os.path.isdir(checkout) and checkout not in sys.path:
            sys.path.insert(0, checkout)
        try:
            from experiments.role_classification.rxn_role_classifier import (  # noqa: F401
                classify_roles_enriched,
            )
        except ImportError:
            return False
        _ML_AVAILABLE = True
    return True
62
+
63
+
64
def _enrich_step(rxn_smiles: str, timeout: int = 120) -> Optional[dict]:
    """Classify one reaction SMILES with the external enrichment helper.

    Args:
        rxn_smiles: Reaction SMILES handed to ``classify_roles_enriched``.
        timeout: Forwarded unchanged as the helper's ``timeout`` keyword.

    Returns:
        Whatever ``classify_roles_enriched`` returns, or ``None`` when the
        import or the call raises any ``Exception`` (best-effort enrichment).
    """
    outcome = None
    try:
        from experiments.role_classification.rxn_role_classifier import (
            classify_roles_enriched as _classify,
        )
        outcome = _classify(rxn_smiles, timeout=timeout)
    except Exception:
        # Enrichment is optional: any failure degrades to "no result".
        outcome = None
    return outcome
73
+
74
+
75
def enrich_scheme(desc: SchemeDescription,
                  verbose: bool = False) -> dict:
    """Collect ML enrichment results for every step of one scheme.

    Each step is converted to a reaction SMILES with
    ``_build_reaction_smiles`` and passed to ``_enrich_step``.  Steps with no
    reaction SMILES, or whose enrichment yields a falsy result, are left out.

    Args:
        desc: Parsed scheme; its ``steps`` and ``species`` are read.
        verbose: When true, progress and failure notes go to stderr.

    Returns:
        Dict mapping ``step.step_index`` to the enrichment result.  Empty when
        the ML helpers cannot be loaded.
    """
    per_step = {}
    if not _try_load_ml():
        if verbose:
            print(" ML enrichment unavailable (chem-pipeline not found)",
                  file=sys.stderr)
        return per_step

    for step in desc.steps:
        reaction = _build_reaction_smiles(step, desc.species)
        if reaction:
            if verbose:
                print(f" Step {step.step_index}: {reaction[:80]}...",
                      file=sys.stderr)
            outcome = _enrich_step(reaction)
            if outcome:
                per_step[step.step_index] = outcome
            elif verbose:
                print(f" Step {step.step_index}: ML enrichment failed",
                      file=sys.stderr)
        elif verbose:
            print(f" Step {step.step_index}: no SMILES for rxn SMILES",
                  file=sys.stderr)
    return per_step
105
+
106
+
107
def batch_enrich_schemes(descs: list, verbose: bool = False) -> list:
    """Run ML enrichment over several schemes with one atom-mapping call.

    All usable reaction SMILES are gathered first and handed to
    ``map_reactions_batch`` in a single call; roles are then derived per
    reaction with ``classify_roles_from_mapping`` and, where available,
    extended with reaction class/name data from ``_run_rxn_insight``.

    Args:
        descs: List of (index, SchemeDescription) tuples.
        verbose: When true, progress is reported on stderr.

    Returns:
        List of (index, enrichment_dict) tuples in the order of ``descs``;
        each enrichment dict maps a step index to its role result.  Every
        dict is empty when the ML helpers are unavailable or no reaction
        SMILES could be collected.
    """
    if not _try_load_ml():
        if verbose:
            print("ML enrichment unavailable", file=sys.stderr)
        return [(i, {}) for i, _ in descs]

    from experiments.atom_mapping.rxn_atom_mapper import (
        map_reactions_batch, classify_roles_from_mapping,
    )

    def _valid_rxn_smiles(rxn_smi: str) -> bool:
        """Check that both sides of reaction contain valid SMILES."""
        try:
            from rdkit import Chem
        except ImportError:
            return True  # can't validate, let RXNMapper try
        sides = rxn_smi.split(">>")
        if len(sides) != 2:
            return False
        # Empty fragments are ignored; the first unparsable one rejects.
        return all(
            Chem.MolFromSmiles(frag) is not None
            for side in sides
            for frag in side.split(".")
            if frag
        )

    def _blank_insight(target) -> None:
        """Fill the reaction class/name/byproduct fields with empty values."""
        target["reaction_class"] = ""
        target["reaction_name"] = ""
        target["byproducts"] = []

    # Phase 1: collect (desc_idx, step_idx, rxn_smiles), dropping R-group/invalid
    pending = []
    skipped = 0
    for desc_idx, desc in descs:
        for step in desc.steps:
            reaction = _build_reaction_smiles(step, desc.species)
            if not reaction:
                continue
            if _valid_rxn_smiles(reaction):
                pending.append((desc_idx, step.step_index, reaction))
            else:
                skipped += 1

    if not pending:
        return [(i, {}) for i, _ in descs]

    if verbose:
        banner = f"Batch mapping {len(pending)} reactions via RXNMapper..."
        if skipped:
            banner += f" ({skipped} skipped: invalid/R-group SMILES)"
        print(banner, file=sys.stderr)

    # Phase 2: batch atom mapping (one call for every reaction)
    mapped = map_reactions_batch([entry[2] for entry in pending], timeout=600)

    if verbose:
        n_ok = sum(1 for item in mapped if item is not None)
        print(f" {n_ok}/{len(mapped)} reactions mapped",
              file=sys.stderr)

    # Phase 3: role classification from atom maps + RXN Insight enrichment
    by_scheme = {i: {} for i, _ in descs}

    for (desc_idx, step_idx, reaction), mapping in zip(pending, mapped):
        if mapping is None:
            continue

        roles = classify_roles_from_mapping(
            original_rxn=reaction,
            mapped_rxn=mapping["mapped_rxn"],
            confidence=mapping["confidence"],
        )

        # RXN Insight is optional and still invoked once per step.
        try:
            from experiments.role_classification.rxn_role_classifier import (
                _run_rxn_insight,
            )
            insight = _run_rxn_insight(reaction, timeout=60)
            if insight:
                roles["reaction_class"] = insight.get("reaction_class", "")
                roles["reaction_name"] = insight.get("reaction_name", "")
                roles["byproducts"] = insight.get("byproducts", [])
                roles["functional_groups_reactants"] = insight.get(
                    "functional_groups_reactants", [])
            else:
                _blank_insight(roles)
        except ImportError:
            _blank_insight(roles)

        by_scheme[desc_idx][step_idx] = roles

        if verbose and (step_idx == 0 or desc_idx % 10 == 0):
            rc = roles.get("reaction_class", "?")
            print(f" [{desc_idx}] step {step_idx}: {rc}",
                  file=sys.stderr)

    return [(i, by_scheme[i]) for i, _ in descs]
220
+
221
+
222
+ # ---------------------------------------------------------------------------
223
+ # SMILES -> structure image (RDKit SVG)
224
+ # ---------------------------------------------------------------------------
225
+
226
# Cache: (smiles, width, height) -> base64 data-URI SVG ("" for failures)
_smiles_svg_cache: dict = {}


def _smiles_to_svg_b64(smiles: str, width: int = 200, height: int = 120) -> str:
    """Render a SMILES string to an inline SVG data-URI via RDKit.

    Results (including failures) are cached per ``(smiles, width, height)``
    so that a request for a different canvas size is never answered with an
    image that was drawn at another size.

    Args:
        smiles: SMILES to draw; an empty value returns "" without caching.
        width: SVG canvas width passed to ``MolDraw2DSVG``.
        height: SVG canvas height passed to ``MolDraw2DSVG``.

    Returns:
        A ``data:image/svg+xml;base64,...`` string, or "" when RDKit is not
        importable, the SMILES cannot be parsed, or drawing raises.
    """
    if not smiles:
        return ""
    cache_key = (smiles, width, height)
    if cache_key in _smiles_svg_cache:
        return _smiles_svg_cache[cache_key]

    uri = ""
    try:
        from rdkit import Chem
        from rdkit.Chem.Draw import rdMolDraw2D

        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is not None:
            # Partial sanitisation — tolerate dummy atoms / R-groups
            try:
                Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_ALL
                                 ^ Chem.SanitizeFlags.SANITIZE_PROPERTIES)
            except Exception:
                pass  # best effort: draw the unsanitised molecule instead

            try:
                Chem.rdDepictor.Compute2DCoords(mol)
            except Exception:
                pass  # best effort: leave coordinate generation to the drawer

            drawer = rdMolDraw2D.MolDraw2DSVG(width, height)
            opts = drawer.drawOptions()
            opts.clearBackground = True
            opts.bondLineWidth = 1.2
            opts.padding = 0.15
            # Make dummy atoms (R-groups) visible
            opts.dummyIsotopeLabels = False
            drawer.DrawMolecule(mol)
            drawer.FinishDrawing()
            svg_text = drawer.GetDrawingText()

            b64 = base64.b64encode(svg_text.encode("utf-8")).decode("ascii")
            uri = f"data:image/svg+xml;base64,{b64}"
    except Exception:
        # Missing RDKit or any drawing error degrades to "no image".
        uri = ""

    _smiles_svg_cache[cache_key] = uri
    return uri
281
+
282
+
283
+ # ---------------------------------------------------------------------------
284
+ # Image rendering (optional, requires ChemDraw COM)
285
+ # ---------------------------------------------------------------------------
286
+
287
def _render_cdxml_to_png(cdxml_path: str, output_path: str) -> bool:
    """Render a CDXML file to PNG, in-process first and then via subprocess.

    The in-process path imports ``cdxml_to_png`` from
    ``cdxml_toolkit.chemdraw.cdxml_to_image``.  If that import or call raises,
    the same module is run as ``python -m`` in a child interpreter with a
    30-second timeout.  The fallback targets the ``chemdraw`` sub-package,
    the same module the in-process import uses.

    Args:
        cdxml_path: Input CDXML file.
        output_path: Destination PNG path.

    Returns:
        True if the in-process call returned without raising, or if the
        subprocess exited with code 0 and ``output_path`` exists.  False
        otherwise; this function never raises ``Exception`` subclasses.
    """
    try:
        from cdxml_toolkit.chemdraw.cdxml_to_image import cdxml_to_png
        cdxml_to_png(cdxml_path, output_path)
        return True
    except Exception:
        # Deliberate best effort: any failure falls through to the subprocess.
        pass

    try:
        import subprocess
        completed = subprocess.run(
            [sys.executable, "-m", "cdxml_toolkit.chemdraw.cdxml_to_image",
             cdxml_path, "-o", output_path],
            capture_output=True, timeout=30,
        )
        return completed.returncode == 0 and os.path.exists(output_path)
    except Exception:
        # Covers launch errors and subprocess.TimeoutExpired.
        return False
306
+
307
+
308
def _embed_image_b64(img_path: str) -> str:
    """Return the file at ``img_path`` as a base64 ``data:`` URI.

    The MIME type is chosen from the lower-cased file extension (png, jpg,
    jpeg, gif, svg); any other extension is labelled ``image/png``.

    Args:
        img_path: Path of the image file to embed.

    Returns:
        ``data:<mime>;base64,<payload>``, or "" if the path does not exist.
    """
    if not os.path.exists(img_path):
        return ""

    mime_by_ext = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "svg": "image/svg+xml",
    }
    suffix = os.path.splitext(img_path)[1].lower().lstrip(".")
    mime = mime_by_ext.get(suffix, "image/png")

    with open(img_path, "rb") as handle:
        payload = base64.b64encode(handle.read()).decode("ascii")
    return f"data:{mime};base64,{payload}"
318
+
319
+
320
+ # ---------------------------------------------------------------------------
321
+ # OLE extraction helpers
322
+ # ---------------------------------------------------------------------------
323
+
324
def _extract_from_document(doc_path: str, out_dir: str) -> List[str]:
    """Extract ChemDraw objects from an Office document as CDXML files.

    Delegates to ``extract_from_office`` with ``output_format="cdxml"`` and
    ``convert_method="auto"``.

    Args:
        doc_path: Path of the document to extract from.
        out_dir: Directory handed to the extractor for its output.

    Returns:
        Paths of the CDXML outputs that exist on disk.  A result with no
        usable output but with an ``error`` is reported on stderr and skipped.
    """
    from cdxml_toolkit.office.ole_extractor import extract_from_office

    extracted = extract_from_office(doc_path, out_dir,
                                    output_format="cdxml",
                                    convert_method="auto")
    cdxml_files: List[str] = []
    for item in extracted:
        if item.cdxml_output and os.path.exists(item.cdxml_output):
            cdxml_files.append(item.cdxml_output)
            continue
        if item.error:
            print(f" Warning: {item.source_path}: {item.error}", file=sys.stderr)
    return cdxml_files
336
+
337
+
338
+ # ---------------------------------------------------------------------------
339
+ # Helpers
340
+ # ---------------------------------------------------------------------------
341
+
342
def _build_species_summary(desc) -> list:
    """Flatten ``desc.species`` into a list of display dicts.

    Every entry carries ``id`` and ``element_type``.  ``label``, ``name``
    (first 80 characters), ``smiles`` (first 120 characters), ``formula``,
    ``text_category`` and ``iupac_name`` are added only when truthy; ``mw``
    is added, rounded to one decimal, whenever it is not ``None``.

    Args:
        desc: Object whose ``species`` attribute maps species id -> species.

    Returns:
        One dict per species, in the iteration order of ``desc.species``.
    """
    rows = []
    for species_id, species in desc.species.items():
        row = {"id": species_id, "element_type": species.element_type}

        leading = (
            ("label", species.label),
            ("name", species.name[:80] if species.name else None),
            ("smiles", species.smiles[:120] if species.smiles else None),
            ("formula", species.formula),
        )
        row.update((key, value) for key, value in leading if value)

        # A molecular weight of 0 is still recorded; only None is omitted.
        if species.mw is not None:
            row["mw"] = round(species.mw, 1)

        trailing = (
            ("text_category", species.text_category),
            # iupac_name is optional on the species object itself.
            ("iupac_name", getattr(species, "iupac_name", None)),
        )
        row.update((key, value) for key, value in trailing if value)

        rows.append(row)
    return rows
363
+
364
+
365
+ # ---------------------------------------------------------------------------
366
+ # Parse one CDXML and return structured result
367
+ # ---------------------------------------------------------------------------
368
+
369
def _parse_one(cdxml_path: str, render: bool = False,
               img_dir: Optional[str] = None,
               use_chemscript: bool = False,
               enrich: bool = False,
               segment: bool = False) -> dict:
    """Parse one CDXML file into the dict consumed by the report.

    Parsing never raises ``Exception`` subclasses: a failure is recorded in
    ``result["error"]`` (with the traceback printed to stderr) and whatever
    was filled in before the failure is kept.

    Args:
        cdxml_path: CDXML file to read.
        render: When true (and ``img_dir`` is set) also render a PNG.
        img_dir: Directory that receives ``<stem>.png``.
        use_chemscript: Forwarded to ``read_scheme``.
        enrich: When true and the scheme has steps, run ``enrich_scheme``.
        segment: Forwarded to ``read_scheme``.

    Returns:
        Dict with ``file``, ``path``, ``error``, ``narrative``, ``topology``,
        ``num_steps``, ``species_summary``, ``steps_summary``, ``warnings``,
        ``image_b64`` and ``json_full``.  A successful parse adds
        ``content_type``, ``_desc`` and ``llm_narrative``, plus
        ``sub_schemes`` / ``ml_enrichment`` when applicable.
    """

    def _step_row(step) -> dict:
        """Summarise one reaction step for the steps table."""
        row = {
            "idx": step.step_index,
            "reactants": step.reactant_ids,
            "products": step.product_ids,
            "reagents": step.reagent_ids,
            "arrow": step.arrow_style,
        }
        if step.conditions:
            row["conditions"] = step.conditions[:5]  # cap for display
        if step.yield_text:
            row["yield"] = step.yield_text
        return row

    result = {
        "file": os.path.basename(cdxml_path),
        "path": cdxml_path,
        "error": None,
        "narrative": "",
        "topology": "",
        "num_steps": 0,
        "species_summary": [],
        "steps_summary": [],
        "warnings": [],
        "image_b64": "",
        "json_full": None,
    }

    # Parse
    try:
        desc = read_scheme(cdxml_path, use_network=False,
                           use_chemscript=use_chemscript, verbose=False,
                           segment=segment)
        result["narrative"] = desc.narrative
        result["topology"] = desc.topology
        result["content_type"] = desc.content_type or "unknown"
        result["num_steps"] = desc.num_steps
        result["warnings"] = desc.warnings
        result["_desc"] = desc  # keep for Tier 2 corrections

        result["species_summary"] = _build_species_summary(desc)

        # Appended one at a time so earlier rows survive a later failure.
        for step in desc.steps:
            result["steps_summary"].append(_step_row(step))

        result["json_full"] = desc.to_dict()

        # Sub-scheme data (when segmentation is active)
        if desc.sub_schemes:
            result["sub_schemes"] = []
            for sub in desc.sub_schemes:
                sub_info = {
                    "num_steps": sub.num_steps,
                    "topology": sub.topology,
                    "content_type": sub.content_type or "unknown",
                    "num_species": len(sub.species),
                    "narrative": sub.narrative,
                    "species_summary": _build_species_summary(sub),
                    "steps_summary": [],
                }
                for step in sub.steps:
                    sub_info["steps_summary"].append(_step_row(step))
                result["sub_schemes"].append(sub_info)

        # ML enrichment (optional); a failure is reported but not fatal
        ml_enrichment = {}
        if enrich and desc.steps:
            try:
                ml_enrichment = enrich_scheme(desc, verbose=True)
                result["ml_enrichment"] = ml_enrichment
            except Exception as exc:
                print(f" ML enrichment error: {exc}", file=sys.stderr)

        # LLM-quality narrative (with ML grounding when available)
        try:
            result["llm_narrative"] = generate_llm_narrative(
                desc, ml_enrichment=ml_enrichment)
        except Exception:
            result["llm_narrative"] = ""

    except Exception as e:
        result["error"] = f"{type(e).__name__}: {e}"
        traceback.print_exc(file=sys.stderr)

    # Render image (attempted even when parsing failed)
    if render and img_dir:
        png_path = os.path.join(img_dir, f"{Path(cdxml_path).stem}.png")
        if _render_cdxml_to_png(cdxml_path, png_path):
            result["image_b64"] = _embed_image_b64(png_path)

    return result
476
+
477
+
478
+ # ---------------------------------------------------------------------------
479
+ # HTML report generation
480
+ # ---------------------------------------------------------------------------
481
+
482
+ # Stylesheet text for the HTML report.  It declares the colour custom
+ # properties (--bg, --card, --border, --accent, --green, --red, --muted) and
+ # the rules for the class names emitted by the HTML helpers in this module
+ # (e.g. badge*, struct-img, no-struct, smiles, narrative, section-title,
+ # diff-highlight, correction-note).  ".card-body" is hidden unless its card
+ # also carries the "open" class.
+ # NOTE(review): the code that embeds this string in the page is outside this
+ # view — presumably a <style> element; confirm before changing whitespace.
+ _CSS = """
483
+ :root { --bg: #f8f9fa; --card: #fff; --border: #dee2e6; --accent: #0d6efd;
484
+ --green: #198754; --red: #dc3545; --muted: #6c757d; }
485
+ * { box-sizing: border-box; margin: 0; padding: 0; }
486
+ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
487
+ background: var(--bg); color: #212529; line-height: 1.5; padding: 24px; }
488
+ h1 { font-size: 1.6rem; margin-bottom: 8px; }
489
+ .subtitle { color: var(--muted); margin-bottom: 24px; }
490
+ .stats { display: flex; gap: 16px; margin-bottom: 24px; flex-wrap: wrap; }
491
+ .stat { background: var(--card); border: 1px solid var(--border);
492
+ border-radius: 8px; padding: 12px 20px; min-width: 140px; }
493
+ .stat-value { font-size: 1.5rem; font-weight: 700; }
494
+ .stat-label { font-size: 0.85rem; color: var(--muted); }
495
+ .card { background: var(--card); border: 1px solid var(--border);
496
+ border-radius: 8px; margin-bottom: 20px; overflow: hidden; }
497
+ .card-header { padding: 12px 16px; border-bottom: 1px solid var(--border);
498
+ display: flex; align-items: center; gap: 12px;
499
+ cursor: pointer; user-select: none; }
500
+ .card-header:hover { background: #f1f3f5; }
501
+ .card-header h2 { font-size: 1rem; flex: 1; }
502
+ .badge { padding: 2px 8px; border-radius: 12px; font-size: 0.75rem;
503
+ font-weight: 600; }
504
+ .badge-topo { background: #e7f1ff; color: var(--accent); }
505
+ .badge-content { background: #f3e8ff; color: #6f42c1; }
506
+ .badge-steps { background: #d1e7dd; color: var(--green); }
507
+ .badge-error { background: #f8d7da; color: var(--red); }
508
+ .badge-warn { background: #fff3cd; color: #856404; }
509
+ .badge-cat { background: #e2e3e5; color: #41464b; font-size: 0.7rem; padding: 1px 6px; }
510
+ .badge-cat-cond { background: #cff4fc; color: #055160; }
511
+ .badge-cat-cite { background: #e2d9f3; color: #432874; }
512
+ .badge-cat-bio { background: #f8d7da; color: var(--red); }
513
+ .card-body { padding: 16px; display: none; }
514
+ .card.open .card-body { display: block; }
515
+ .two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; }
516
+ @media (max-width: 900px) { .two-col { grid-template-columns: 1fr; } }
517
+ .img-box { text-align: center; }
518
+ .img-box img { max-width: 100%; border: 1px solid var(--border); border-radius: 4px; }
519
+ .no-img { padding: 40px; text-align: center; color: var(--muted);
520
+ background: #f1f3f5; border-radius: 4px; }
521
+ .narrative { background: #f8f9fa; padding: 12px; border-radius: 4px;
522
+ white-space: pre-wrap; font-size: 0.9rem; line-height: 1.6; }
523
+ .section-title { font-weight: 600; font-size: 0.9rem; margin: 12px 0 6px;
524
+ color: var(--muted); text-transform: uppercase;
525
+ letter-spacing: 0.5px; }
526
+ table { width: 100%; border-collapse: collapse; font-size: 0.85rem; margin-top: 4px; }
527
+ th, td { padding: 6px 10px; text-align: left; border-bottom: 1px solid var(--border); }
528
+ th { background: #f1f3f5; font-weight: 600; position: sticky; top: 0; }
529
+ .smiles { font-family: "Courier New", monospace; font-size: 0.75rem;
530
+ word-break: break-all; max-width: 280px; color: var(--muted); }
531
+ .struct-cell { padding: 2px 4px; }
532
+ .struct-img { width: 160px; height: 96px; object-fit: contain;
533
+ border: 1px solid #e9ecef; border-radius: 3px; background: #fff;
534
+ vertical-align: middle; }
535
+ .no-struct { color: #ccc; }
536
+ .json-toggle { color: var(--accent); cursor: pointer; font-size: 0.85rem;
537
+ text-decoration: underline; margin-top: 8px; display: inline-block; }
538
+ .json-block { display: none; background: #f1f3f5; padding: 12px;
539
+ border-radius: 4px; font-family: monospace; font-size: 0.8rem;
540
+ white-space: pre-wrap; max-height: 400px; overflow: auto;
541
+ margin-top: 6px; }
542
+ .chevron { transition: transform 0.2s; display: inline-block; }
543
+ .card.open .chevron { transform: rotate(90deg); }
544
+ .verdict { padding: 3px 10px; border-radius: 4px; font-size: 0.8rem;
545
+ font-weight: 600; display: inline-block; }
546
+ .verdict-ok { background: #d1e7dd; color: var(--green); }
547
+ .verdict-warn { background: #fff3cd; color: #856404; }
548
+ .verdict-fail { background: #f8d7da; color: var(--red); }
549
+ .tier-label { font-weight: 700; font-size: 0.8rem; text-transform: uppercase;
550
+ letter-spacing: 0.5px; margin-bottom: 4px; }
551
+ .tier-1 .tier-label { color: var(--accent); }
552
+ .tier-2 .tier-label { color: #198754; }
553
+ .tier-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px;
554
+ margin-top: 8px; }
555
+ .tier-col { padding: 10px; border-radius: 6px; }
556
+ .tier-1 { background: #f8f9fa; border: 1px solid #dee2e6; }
557
+ .tier-2 { background: #f0faf4; border: 1px solid #a3cfbb; }
558
+ .correction-note { font-size: 0.8rem; color: #495057; font-style: italic;
559
+ margin-top: 6px; padding: 6px 10px; background: #fff3cd;
560
+ border-radius: 4px; }
561
+ .diff-highlight { background: #fff3cd; padding: 1px 4px; border-radius: 2px; }
562
+ .badge-t2 { background: #d1e7dd; color: var(--green); }
563
+ """
564
+
565
+ # Script text for the HTML report:
+ #   * clicking a ".card-header" toggles the "open" class on its parent element;
+ #   * clicking a ".json-toggle" stops the click from propagating and flips the
+ #     inline display of its next sibling element between "block" and "none";
+ #   * expandAll() / collapseAll() add / remove "open" on every ".card".
+ # NOTE(review): where this string is injected into the page is outside this
+ # view — presumably a <script> element placed after the cards; confirm.
+ _JS = """
566
+ document.querySelectorAll('.card-header').forEach(h => {
567
+ h.addEventListener('click', () => h.parentElement.classList.toggle('open'));
568
+ });
569
+ document.querySelectorAll('.json-toggle').forEach(t => {
570
+ t.addEventListener('click', e => {
571
+ e.stopPropagation();
572
+ const block = t.nextElementSibling;
573
+ block.style.display = block.style.display === 'block' ? 'none' : 'block';
574
+ });
575
+ });
576
+ function expandAll() {
577
+ document.querySelectorAll('.card').forEach(c => c.classList.add('open'));
578
+ }
579
+ function collapseAll() {
580
+ document.querySelectorAll('.card').forEach(c => c.classList.remove('open'));
581
+ }
582
+ """
583
+
584
+
585
def _species_table_html(species: list) -> str:
    """Render species summary dicts as an HTML table.

    All text taken from the species dicts (id, type, label, name, IUPAC name,
    SMILES, formula, MW) is HTML-escaped before interpolation, so characters
    such as ``<``, ``&`` or ``"`` in scheme text cannot break the markup or
    the ``alt`` attribute of the structure image.

    Args:
        species: Dicts with an ``id`` key and optional ``label``, ``name``,
            ``smiles``, ``formula``, ``mw``, ``element_type``,
            ``text_category`` and ``iupac_name`` keys.

    Returns:
        A ``<table>`` fragment, or a "No species detected" paragraph when
        ``species`` is empty.
    """
    from html import escape

    if not species:
        return '<p style="color:var(--muted)">No species detected</p>'

    category_css = {"condition_ref": "badge-cat-cond",
                    "citation": "badge-cat-cite",
                    "bioactivity": "badge-cat-bio"}

    def cell(value) -> str:
        """Stringify and HTML-escape one value (quotes included)."""
        return escape(str(value))

    rows = []
    for sp in species:
        smiles = sp.get("smiles", "")
        etype = sp.get("element_type", "")
        tcat = sp.get("text_category", "")

        # Type/category badge: text elements get a per-category colour.
        if etype == "text":
            badge_css = category_css.get(tcat, "badge-cat")
            badge_text = tcat or "text"
        else:
            badge_css = "badge-cat"
            badge_text = etype or "?"
        type_html = f'<span class="badge {badge_css}">{cell(badge_text)}</span>'

        # Render SMILES to SVG structure image
        svg_uri = _smiles_to_svg_b64(smiles) if smiles else ""
        if svg_uri:
            struct_html = (f'<img src="{svg_uri}" class="struct-img" '
                           f'alt="{cell(smiles)}">')
        else:
            struct_html = '<span class="no-struct">-</span>'

        name_html = cell(sp.get("name", ""))
        iupac = sp.get("iupac_name", "")
        if iupac:
            name_html += ('<br><span style="color:#0d6efd;font-size:0.85em">'
                          f'{cell(iupac)}</span>')

        rows.append(
            "<tr>\n"
            f"<td>{cell(sp['id'])}</td>\n"
            f"<td>{type_html}</td>\n"
            f"<td><b>{cell(sp.get('label', ''))}</b></td>\n"
            f"<td>{name_html}</td>\n"
            f'<td class="struct-cell">{struct_html}</td>\n'
            f'<td class="smiles">{cell(smiles)}</td>\n'
            f"<td>{cell(sp.get('formula', ''))}</td>\n"
            f"<td>{cell(sp.get('mw', ''))}</td>\n"
            "</tr>"
        )

    header = ("<tr><th>ID</th><th>Type</th><th>Label</th><th>Name / IUPAC</th>"
              "<th>Structure</th><th>SMILES</th><th>Formula</th><th>MW</th></tr>")
    return f"<table>\n{header}\n{''.join(rows)}\n</table>"
632
+
633
+
634
def _steps_table_html(steps: list) -> str:
    """Render step summary dicts as an HTML table.

    Species ids, condition text, yield text and the arrow glyph are
    HTML-escaped before interpolation, so ``<``, ``>`` or ``&`` in condition
    strings cannot corrupt the table.  At most the first three conditions of
    a step are shown, joined with "; ".

    Args:
        steps: Dicts with an integer ``idx`` (zero-based; displayed one-based)
            and optional ``reactants``, ``products``, ``reagents``,
            ``conditions``, ``yield`` and ``arrow`` keys.

    Returns:
        A ``<table>`` fragment, or a "No steps detected" paragraph when
        ``steps`` is empty.
    """
    from html import escape

    if not steps:
        return '<p style="color:var(--muted)">No steps detected</p>'

    arrow_icons = {"solid": "->", "dashed": "-->", "failed": "X->"}

    def joined(values, sep: str) -> str:
        """Join values as text and HTML-escape the result."""
        return escape(sep.join(str(v) for v in values))

    rows = []
    for s in steps:
        cells = (
            str(s['idx'] + 1),
            joined(s.get("reactants", []), ", "),
            # Unknown arrow styles fall back to the solid-arrow glyph.
            escape(arrow_icons.get(s.get("arrow", "solid"), "->")),
            joined(s.get("products", []), ", "),
            joined(s.get("reagents", []), ", "),
            joined(s.get("conditions", [])[:3], "; "),
            escape(str(s.get("yield", ""))),
        )
        rows.append("<tr>\n"
                    + "".join(f"<td>{c}</td>\n" for c in cells)
                    + "</tr>")

    header = ("<tr><th>#</th><th>Reactants</th><th></th><th>Products</th>\n"
              "<th>Reagents</th><th>Conditions</th><th>Yield</th></tr>")
    return f"<table>\n{header}\n{''.join(rows)}\n</table>"
660
+
661
+
662
def _sub_schemes_html(sub_schemes: list) -> str:
    """Generate HTML for sub-scheme display (collapsible sections).

    Each sub-scheme becomes a ``<details>`` element holding its badges,
    narrative, species table and steps table.  Topology, content type,
    counts and narrative are HTML-escaped before interpolation so that
    scheme text containing ``<`` or ``&`` cannot break the markup.

    Args:
        sub_schemes: Dicts with optional ``topology``, ``content_type``,
            ``num_steps``, ``num_species``, ``narrative``,
            ``species_summary`` and ``steps_summary`` keys.

    Returns:
        The HTML fragment, or "" when ``sub_schemes`` is empty.
    """
    from html import escape

    if not sub_schemes:
        return ""

    parts = [f'<div class="section-title" style="color:#6f42c1">'
             f'Composite Scheme: {len(sub_schemes)} independent sub-schemes'
             f'</div>']
    for i, sub in enumerate(sub_schemes):
        topo = escape(str(sub.get("topology", "?")))
        ctype = escape(str(sub.get("content_type", "unknown")))
        n_steps = escape(str(sub.get("num_steps", 0)))
        n_species = escape(str(sub.get("num_species", 0)))
        narrative = escape(str(sub.get("narrative", "")))
        species_summary = sub.get("species_summary", [])
        steps_summary = sub.get("steps_summary", [])
        parts.append(f"""
<details style="margin:8px 0;border:1px solid #ddd;border-radius:4px;padding:8px">
<summary style="cursor:pointer;font-weight:600">
Sub-scheme {i + 1}
<span class="badge badge-topo">{topo}</span>
<span class="badge badge-content">{ctype}</span>
<span class="badge badge-steps">{n_steps} steps, {n_species} species</span>
</summary>
<div style="margin-top:8px">
<div class="narrative">{narrative}</div>
<div class="section-title" style="font-size:0.85rem">
Species ({len(species_summary)})
</div>
{_species_table_html(species_summary)}
<div class="section-title" style="font-size:0.85rem">
Steps
</div>
{_steps_table_html(steps_summary)}
</div>
</details>
""")
    return "\n".join(parts)
699
+
700
+
701
def _verdict(result: dict) -> Tuple[str, str]:
    """Classify a parse result as a ``(css_class, label)`` pair for its badge.

    The checks run in priority order: a truthy ``error`` wins, then a
    ``num_steps`` of zero, then a non-empty ``warnings`` list; anything else
    is a clean "OK". Keys are only read as far as the first matching check.
    """
    if result["error"]:
        outcome = ("verdict-fail", "PARSE ERROR")
    elif result["num_steps"] == 0:
        outcome = ("verdict-warn", "NO STEPS")
    elif result["warnings"]:
        warning_count = len(result["warnings"])
        outcome = ("verdict-warn", f"OK ({warning_count} warnings)")
    else:
        outcome = ("verdict-ok", "OK")
    return outcome
710
+
711
+
712
def _tier2_summary_html(t1: dict, t2_corrections: dict, t2_desc) -> str:
    """Generate the Tier 1 vs. Tier 2 comparison and correction summary HTML.

    Args:
        t1: Tier 1 result dict; ``content_type``, ``topology`` and
            ``narrative`` are read with fallbacks.
        t2_corrections: Correction dict. ``content_type`` / ``topology`` are
            reported only when they differ from the Tier 1 value; every entry
            under ``species_corrections`` (``{species_id: {field: value}}``)
            is listed; ``notes`` is shown as a trailing note when non-empty.
        t2_desc: Object whose ``narrative`` attribute is shown in the Tier 2
            column, or a falsy value for an empty column.

    Returns:
        An HTML fragment, or ``""`` when *t2_corrections* is empty or ``None``.
    """
    if not t2_corrections:
        return ""

    # Function-scope import so this helper does not depend on the module's
    # top-level import list.
    from html import escape

    def _text(value) -> str:
        # All interpolated values are plain text; escape them so "<" / "&"
        # in corrections or narratives cannot break the markup.
        return escape(str(value))

    changes = []
    for key, label, fallback in (("content_type", "Content type", "unknown"),
                                 ("topology", "Topology", "?")):
        if key not in t2_corrections:
            continue
        old = t1.get(key, fallback)
        new = t2_corrections[key]
        if old != new:
            changes.append(
                f'<b>{label}</b>: '
                f'<span class="diff-highlight">{_text(old)} &rarr; {_text(new)}</span>')

    # "or {}" tolerates an explicit null in the corrections JSON.
    sp_corr = t2_corrections.get("species_corrections") or {}
    for sp_id, fixes in sp_corr.items():
        for field, val in fixes.items():
            changes.append(
                f'<b>{_text(sp_id)}.{_text(field)}</b>: '
                f'<span class="diff-highlight">&rarr; {_text(val)}</span>')

    notes = t2_corrections.get("notes", "")
    t1_narrative = _text(t1.get("narrative", ""))
    t2_narrative = _text(t2_desc.narrative if t2_desc else "")

    changes_html = "<br>".join(changes) if changes else "No field changes"
    note_html = (f'<div class="correction-note">{_text(notes)}</div>'
                 if notes else "")

    return f"""
<div class="tier-row">
  <div class="tier-col tier-1">
    <div class="tier-label">Tier 1 (Deterministic)</div>
    <div class="narrative">{t1_narrative}</div>
  </div>
  <div class="tier-col tier-2">
    <div class="tier-label">Tier 2 (LLM-Refined)</div>
    <div class="narrative">{t2_narrative}</div>
  </div>
</div>
<div style="margin-top:8px">
  <b style="font-size:0.85rem">LLM Corrections:</b><br>
  <span style="font-size:0.85rem">{changes_html}</span>
  {note_html}
</div>
"""
765
+
766
+
767
def _card_html(idx: int, result: dict) -> str:
    """Generate the HTML for one scheme card.

    Args:
        idx: Index of the card; used for the ``card-{idx}`` element id.
        result: Result dict for one scheme. ``error``, ``image_b64``,
            ``file`` and ``num_steps`` are read by direct indexing, as are
            ``species_summary`` / ``steps_summary`` when ``error`` is falsy
            and ``narrative`` when no Tier 2 corrections are attached.
            Optional keys: ``_t2_corrections``, ``_t2_desc``,
            ``llm_narrative``, ``warnings``, ``sub_schemes``, ``json_full``,
            ``ml_enrichment``, ``topology``, ``content_type``.

    Returns:
        The card's HTML: header badges, then image and body side by side,
        then an optional JSON block.
    """
    # Function-scope import so this helper does not depend on the module's
    # top-level import list.
    from html import escape

    def _text(value) -> str:
        # File names, error messages, warnings and narratives are plain text;
        # escape them so "<" / "&" cannot corrupt the card markup.
        return escape(str(value))

    v_class, v_text = _verdict(result)
    t2_corrections = result.get("_t2_corrections")
    has_t2 = t2_corrections is not None

    # Image section
    if result["image_b64"]:
        img_html = f'<img src="{result["image_b64"]}" alt="Rendered scheme">'
    else:
        img_html = ('<div class="no-img">No rendered image<br>'
                    '(use --render to enable ChemDraw rendering)</div>')

    if result["error"]:
        # Error display replaces the narrative and tables entirely.
        error_text = _text(result["error"])
        body_html = (f'<div class="narrative" style="color:var(--red)">'
                     f'{error_text}</div>')
    else:
        llm_nar = result.get("llm_narrative", "")
        if has_t2:
            # Dual-tier display, optionally followed by the LLM narrative.
            intro_html = _tier2_summary_html(
                result, t2_corrections, result.get("_t2_desc"))
            if llm_nar:
                intro_html += f"""
<div style="margin-top:10px">
  <div class="tier-col tier-2" style="margin-bottom:8px">
    <div class="tier-label">LLM Narrative</div>
    <div class="narrative">{_text(llm_nar)}</div>
  </div>
</div>
"""
        elif llm_nar:
            intro_html = f"""
<div class="tier-row">
  <div class="tier-col tier-1">
    <div class="tier-label">Parser Output</div>
    <div class="narrative">{_text(result['narrative'])}</div>
  </div>
  <div class="tier-col tier-2">
    <div class="tier-label">LLM Narrative</div>
    <div class="narrative">{_text(llm_nar)}</div>
  </div>
</div>
"""
        else:
            intro_html = f"""
<div class="section-title">Narrative</div>
<div class="narrative">{_text(result['narrative'])}</div>
"""

        # Species table, steps table and warning badges are the same for
        # every non-error card, so they are built once here.
        species_summary = result["species_summary"]
        warnings_html = "".join(
            f'<div class="badge badge-warn" style="margin-top:4px">{_text(w)}</div>'
            for w in result.get("warnings", []))
        body_html = f"""
{intro_html}

<div class="section-title">Species Registry ({len(species_summary)} species)</div>
{_species_table_html(species_summary)}

<div class="section-title">Reaction Steps</div>
{_steps_table_html(result['steps_summary'])}

{warnings_html}
"""

    # Sub-schemes display (when segmentation is active). "or []" tolerates an
    # explicit None stored under the key.
    sub_schemes = result.get("sub_schemes") or []
    sub_html = _sub_schemes_html(sub_schemes)
    if sub_html:
        body_html += sub_html

    # JSON toggle
    json_html = ""
    if result.get("json_full"):
        # quote=False escapes exactly "&", "<" and ">".
        json_str = escape(
            json.dumps(result["json_full"], indent=2, ensure_ascii=False),
            quote=False)
        json_html = f"""
<span class="json-toggle">Show full JSON</span>
<div class="json-block">{json_str}</div>
"""

    # Header badges: show Tier 2 values if corrected
    badge_source = t2_corrections or {}
    topo_display = _text(
        badge_source.get("topology", result.get("topology", "?")))
    ctype_display = _text(
        badge_source.get("content_type", result.get("content_type", "unknown")))
    t2_badge = ' <span class="badge badge-t2">T2</span>' if has_t2 else ""
    ml_badge = (' <span class="badge" style="background:#cce5ff;color:#004085">ML</span>'
                if result.get("ml_enrichment") else "")
    n_sub = len(sub_schemes)
    seg_badge = (f' <span class="badge" style="background:#e8daef;color:#6f42c1">'
                 f'{n_sub} sub-schemes</span>' if n_sub > 0 else "")
    file_label = _text(result["file"])

    return f"""
<div class="card" id="card-{idx}">
  <div class="card-header">
    <span class="chevron">&#9654;</span>
    <h2>{file_label}</h2>
    <span class="badge badge-topo">{topo_display}</span>
    <span class="badge badge-content">{ctype_display}</span>
    <span class="badge badge-steps">{result['num_steps']} steps</span>
    <span class="verdict {v_class}">{v_text}</span>{t2_badge}{ml_badge}{seg_badge}
  </div>
  <div class="card-body">
    <div class="two-col">
      <div class="img-box">{img_html}</div>
      <div>{body_html}</div>
    </div>
    {json_html}
  </div>
</div>
"""
885
+
886
+
887
def generate_report(results: List[dict], output_path: str,
                    title: str = "Scheme Reader Verification Report") -> None:
    """Generate the HTML report from a list of result dicts.

    Args:
        results: One dict per scheme. ``error``, ``num_steps``, ``warnings``
            and ``file`` are read by direct indexing for the summary counters
            and the sort order; ``_t2_corrections`` and ``ml_enrichment`` are
            optional.
        output_path: Path of the HTML file to write (UTF-8).
        title: Report title, used for both ``<title>`` and the page heading.

    The report lists cards with errors first, then the rest, each group
    ordered by ``file``. A one-line confirmation is printed to stdout.
    """
    # Imported as a bare function so no local name shadows the ``html`` module.
    from html import escape

    # The title is user-supplied plain text (e.g. from --title); escape it so
    # characters such as "&" or "<" do not produce invalid markup.
    safe_title = escape(str(title))

    n_total = len(results)
    n_ok = sum(1 for r in results if not r["error"] and r["num_steps"] > 0)
    n_warn = sum(1 for r in results if not r["error"] and r["warnings"])
    n_err = sum(1 for r in results if r["error"])
    n_empty = sum(1 for r in results if not r["error"] and r["num_steps"] == 0)
    n_t2 = sum(1 for r in results if r.get("_t2_corrections"))
    n_ml = sum(1 for r in results if r.get("ml_enrichment"))

    # Sort: errors first, then by filename
    results_sorted = sorted(
        results, key=lambda r: (0 if r["error"] else 1, r["file"]))

    cards = "\n".join(_card_html(i, r) for i, r in enumerate(results_sorted))

    document = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{safe_title}</title>
<style>{_CSS}</style>
</head>
<body>
<h1>{safe_title}</h1>
<p class="subtitle">Visual verification of scheme_reader output.
Click a card to expand. Compare the rendered image with the parsed narrative.</p>

<div class="stats">
  <div class="stat">
    <div class="stat-value">{n_total}</div>
    <div class="stat-label">Total schemes</div>
  </div>
  <div class="stat">
    <div class="stat-value" style="color:var(--green)">{n_ok}</div>
    <div class="stat-label">Parsed OK</div>
  </div>
  <div class="stat">
    <div class="stat-value" style="color:#856404">{n_warn}</div>
    <div class="stat-label">With warnings</div>
  </div>
  <div class="stat">
    <div class="stat-value" style="color:var(--muted)">{n_empty}</div>
    <div class="stat-label">No steps found</div>
  </div>
  <div class="stat">
    <div class="stat-value" style="color:var(--red)">{n_err}</div>
    <div class="stat-label">Parse errors</div>
  </div>
  <div class="stat">
    <div class="stat-value" style="color:#198754">{n_t2}</div>
    <div class="stat-label">LLM-refined</div>
  </div>
  <div class="stat">
    <div class="stat-value" style="color:#0d6efd">{n_ml}</div>
    <div class="stat-label">ML-enriched</div>
  </div>
</div>

<div style="margin-bottom: 16px;">
  <button onclick="expandAll()" style="padding:6px 14px;cursor:pointer;border:1px solid var(--border);border-radius:4px;background:var(--card)">Expand All</button>
  <button onclick="collapseAll()" style="padding:6px 14px;cursor:pointer;border:1px solid var(--border);border-radius:4px;background:var(--card);margin-left:6px">Collapse All</button>
</div>

{cards}

<script>{_JS}</script>
</body>
</html>"""

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(document)
    print(f"Report written to {output_path} ({n_total} schemes)")
961
+
962
+
963
+ # ---------------------------------------------------------------------------
964
+ # Main entry point
965
+ # ---------------------------------------------------------------------------
966
+
967
def main():
    """Command-line entry point: parse inputs and write the HTML report.

    Stages, in order:
      1. Collect CDXML paths from the positional inputs (CDXML files,
         directories, or PPTX/DOCX documents extracted into temp dirs).
      2. Parse each file with ``_parse_one`` and, when ``--corrections`` has
         a matching key, attach the Tier 2 corrections and corrected desc.
      3. Regenerate the LLM narrative from the corrected desc.
      4. With ``--chemscript``: aligned IUPAC name enrichment.
      5. With ``--enrich``: batch ML enrichment.
      6. Write the report via ``generate_report``.

    Every temporary directory created along the way is removed on exit,
    including when a stage raises or the run stops through ``sys.exit``.
    """
    # Function-scope import, as in the original cleanup code.
    import shutil

    parser = argparse.ArgumentParser(
        description="Generate a visual verification report for scheme_reader")
    parser.add_argument("inputs", nargs="+",
                        help="CDXML files, directories of CDXML files, "
                             "or PPTX/DOCX documents")
    parser.add_argument("-o", "--output", default="scheme_reader_report.html",
                        help="Output HTML file (default: scheme_reader_report.html)")
    parser.add_argument("--render", action="store_true",
                        help="Render CDXML to PNG via ChemDraw COM "
                             "(requires ChemDraw to be closed)")
    parser.add_argument("--chemscript", action="store_true",
                        help="Use ChemScript for SMILES (best abbreviation "
                             "resolution, requires ChemDraw 16+ on Windows)")
    parser.add_argument("--corrections",
                        help="Tier 2 corrections JSON file "
                             "(maps source_key to correction dict)")
    parser.add_argument("--enrich", action="store_true",
                        help="Run RXNMapper + RXN Insight ML enrichment "
                             "per step (requires chem-pipeline rxn-experiments)")
    parser.add_argument("--segment", action="store_true",
                        help="Auto-segment multi-panel CDXML files into "
                             "independent sub-schemes")
    parser.add_argument("--title", default="Scheme Reader Verification Report",
                        help="Report title")
    args = parser.parse_args()

    # Each entry is a (cdxml_path, source_label) pair; source_label is None
    # for CDXML files given directly on the command line.
    cdxml_paths: List[tuple] = []
    tmp_dirs: List[str] = []

    try:
        # Collect all CDXML paths
        for inp in args.inputs:
            inp = os.path.abspath(inp)
            ext = os.path.splitext(inp)[1].lower()

            if ext in (".pptx", ".docx"):
                # Extract from document
                doc_name = Path(inp).stem
                tmp = tempfile.mkdtemp(prefix=f"sr_verify_{doc_name}_")
                tmp_dirs.append(tmp)
                print(f"Extracting from {os.path.basename(inp)}...",
                      file=sys.stderr)
                extracted = _extract_from_document(inp, tmp)
                # Tag with source document
                for p in extracted:
                    cdxml_paths.append((p, os.path.basename(inp)))
                print(f" -> {len(extracted)} ChemDraw objects", file=sys.stderr)

            elif ext == ".cdxml":
                cdxml_paths.append((inp, None))

            elif os.path.isdir(inp):
                for fn in sorted(os.listdir(inp)):
                    if fn.lower().endswith(".cdxml"):
                        cdxml_paths.append(
                            (os.path.join(inp, fn), os.path.basename(inp)))
            else:
                print(f"Skipping unknown input: {inp}", file=sys.stderr)

        if not cdxml_paths:
            print("No CDXML files found.", file=sys.stderr)
            sys.exit(1)

        # Optional image rendering directory
        img_dir = None
        if args.render:
            img_dir = tempfile.mkdtemp(prefix="sr_verify_img_")
            tmp_dirs.append(img_dir)

        # Load Tier 2 corrections if provided
        corrections_map = {}
        if args.corrections:
            with open(args.corrections, "r", encoding="utf-8") as f:
                corrections_map = json.load(f)
            print(f"Loaded {len(corrections_map)} Tier 2 corrections",
                  file=sys.stderr)

        # Parse all (Phase 1: deterministic parsing, no ML enrichment yet)
        results = []
        for i, (cdxml_path, source_doc) in enumerate(cdxml_paths):
            name = os.path.basename(cdxml_path)
            display_name = f"[{source_doc}] {name}" if source_doc else name
            print(f" [{i+1}/{len(cdxml_paths)}] {display_name}", file=sys.stderr)
            result = _parse_one(cdxml_path, render=args.render, img_dir=img_dir,
                                use_chemscript=args.chemscript,
                                enrich=False,  # ML enrichment handled in batch below
                                segment=args.segment)
            if source_doc:
                result["file"] = display_name

            # Apply Tier 2 corrections if available. Candidate keys are tried
            # in order: "<source>/<name>", "<name>", then "docx/", "pptx/"
            # and "showcase/" prefixes when the source label contains that tag.
            doc_lower = source_doc.lower() if source_doc else ""
            candidates = [f"{source_doc}/{name}" if source_doc else name, name]
            candidates.extend(f"{tag}/{name}"
                              for tag in ("docx", "pptx", "showcase")
                              if tag in doc_lower)
            corr_key = next(
                (c for c in candidates if c and c in corrections_map), None)

            if corr_key and result.get("_desc"):
                corr = corrections_map[corr_key]
                try:
                    t2_desc = apply_corrections(result["_desc"], corr)
                    result["_t2_corrections"] = corr
                    result["_t2_desc"] = t2_desc
                except Exception as e:
                    print(f" Warning: Tier 2 correction failed for {name}: {e}",
                          file=sys.stderr)

            results.append(result)

        # Regenerate LLM narrative from corrected desc (Tier 2) where available
        # This ensures content_type/topology corrections flow into the narrative
        for r in results:
            t2 = r.get("_t2_desc")
            if t2:
                try:
                    r["llm_narrative"] = generate_llm_narrative(t2)
                except Exception as e:
                    # Best effort: keep the existing narrative, but say so.
                    print(f" Warning: narrative regeneration failed for "
                          f"{r.get('file')}: {e}", file=sys.stderr)

        # Phase 1.5: Aligned IUPAC name enrichment (requires ChemScript)
        if args.chemscript:
            n_aligned_total = 0
            for r in results:
                desc = r.get("_t2_desc") or r.get("_desc")
                if desc and desc.steps:
                    try:
                        n = enrich_aligned_names(desc)
                        if n:
                            n_aligned_total += n
                            r["llm_narrative"] = generate_llm_narrative(desc)
                            # Rebuild species summary to include updated names
                            r["species_summary"] = _build_species_summary(desc)
                    except Exception as e:
                        # Best effort: leave this result as parsed.
                        print(f" Warning: aligned name enrichment failed for "
                              f"{r.get('file')}: {e}", file=sys.stderr)
            if n_aligned_total:
                print(f" Aligned IUPAC names: {n_aligned_total} species updated",
                      file=sys.stderr)

        # Phase 2: Batch ML enrichment (single RXNMapper subprocess for all reactions)
        if args.enrich:
            # Collect schemes with steps for enrichment
            descs_for_enrich = []
            for i, r in enumerate(results):
                desc = r.get("_desc")
                if desc and desc.steps:
                    descs_for_enrich.append((i, desc))

            if descs_for_enrich:
                print(f"\nBatch ML enrichment for {len(descs_for_enrich)} schemes...",
                      file=sys.stderr)
                batch_results = batch_enrich_schemes(descs_for_enrich, verbose=True)

                # Apply enrichment and regenerate narratives
                # Use corrected T2 desc when available so corrections flow into narrative
                for desc_idx, enrichment in batch_results:
                    if not enrichment:
                        continue
                    target = results[desc_idx]
                    target["ml_enrichment"] = enrichment
                    desc = target.get("_t2_desc") or target.get("_desc")
                    if desc:
                        try:
                            target["llm_narrative"] = generate_llm_narrative(
                                desc, ml_enrichment=enrichment)
                        except Exception as e:
                            # Best effort: keep the pre-enrichment narrative.
                            print(f" Warning: enriched narrative failed for "
                                  f"{target.get('file')}: {e}", file=sys.stderr)
                n_enriched = sum(1 for _, e in batch_results if e)
                print(f" {n_enriched} schemes enriched", file=sys.stderr)

        # Clean up internal fields before report
        for r in results:
            r.pop("_desc", None)

        # Generate report
        generate_report(results, args.output, title=args.title)
    finally:
        # Remove extraction and rendering temp dirs on every exit path.
        for d in tmp_dirs:
            shutil.rmtree(d, ignore_errors=True)


if __name__ == "__main__":
    main()