cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1160 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
scheme_reader_verify.py — Visual verification report for scheme_reader output.
|
|
4
|
+
|
|
5
|
+
Generates an HTML report that shows each CDXML scheme as a rendered image
|
|
6
|
+
alongside scheme_reader's parsed narrative, species list, and step graph.
|
|
7
|
+
This lets a chemist visually confirm that the parser understood the scheme
|
|
8
|
+
correctly.
|
|
9
|
+
|
|
10
|
+
Two modes:
|
|
11
|
+
1. Directory mode: point at a folder of .cdxml files
|
|
12
|
+
2. Document mode: point at a .pptx or .docx; objects are extracted first
|
|
13
|
+
|
|
14
|
+
CLI:
|
|
15
|
+
python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_verify dir_of_cdxml/ -o report.html
|
|
16
|
+
python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_verify slides.pptx -o report.html
|
|
17
|
+
python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_verify slides.pptx thesis.docx -o report.html
|
|
18
|
+
python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_verify dir/ --render # also renders images via ChemDraw
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import base64
|
|
25
|
+
import json
|
|
26
|
+
import os
|
|
27
|
+
import sys
|
|
28
|
+
import tempfile
|
|
29
|
+
import traceback
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import List, Optional, Tuple
|
|
32
|
+
|
|
33
|
+
from cdxml_toolkit.perception.scheme_reader import read_scheme, SchemeDescription
|
|
34
|
+
from cdxml_toolkit.perception.scheme_refine import (
|
|
35
|
+
apply_corrections, generate_llm_narrative, _build_reaction_smiles,
|
|
36
|
+
enrich_aligned_names,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# ML enrichment (optional — requires chem-pipeline's experiments modules)
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
_ML_AVAILABLE = False
|
|
43
|
+
|
|
44
|
+
def _try_load_ml():
|
|
45
|
+
"""Try to import RXNMapper + RXN Insight from chem-pipeline experiments."""
|
|
46
|
+
global _ML_AVAILABLE
|
|
47
|
+
if _ML_AVAILABLE:
|
|
48
|
+
return True
|
|
49
|
+
# chem-pipeline experiments/ is not a proper package — add path
|
|
50
|
+
_pipeline_root = os.path.normpath(
|
|
51
|
+
os.path.join(os.path.expanduser("~"), "chem-pipeline"))
|
|
52
|
+
if os.path.isdir(_pipeline_root) and _pipeline_root not in sys.path:
|
|
53
|
+
sys.path.insert(0, _pipeline_root)
|
|
54
|
+
try:
|
|
55
|
+
from experiments.role_classification.rxn_role_classifier import ( # noqa: F401
|
|
56
|
+
classify_roles_enriched,
|
|
57
|
+
)
|
|
58
|
+
_ML_AVAILABLE = True
|
|
59
|
+
return True
|
|
60
|
+
except ImportError:
|
|
61
|
+
return False
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _enrich_step(rxn_smiles: str, timeout: int = 120) -> Optional[dict]:
|
|
65
|
+
"""Run RXNMapper + RXN Insight on a single reaction SMILES."""
|
|
66
|
+
try:
|
|
67
|
+
from experiments.role_classification.rxn_role_classifier import (
|
|
68
|
+
classify_roles_enriched,
|
|
69
|
+
)
|
|
70
|
+
return classify_roles_enriched(rxn_smiles, timeout=timeout)
|
|
71
|
+
except Exception:
|
|
72
|
+
return None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def enrich_scheme(desc: SchemeDescription,
                  verbose: bool = False) -> dict:
    """Run ML enrichment over every step of one scheme.

    Args:
        desc: Parsed scheme; its ``steps`` and ``species`` are read.
        verbose: When true, progress and failure notes go to stderr.

    Returns:
        Dict keyed by ``step.step_index`` holding the result of
        ``_enrich_step`` for each step that produced a truthy result.
        Empty when the ML modules are unavailable.
    """
    per_step = {}
    if not _try_load_ml():
        if verbose:
            print(" ML enrichment unavailable (chem-pipeline not found)",
                  file=sys.stderr)
        return per_step

    for step in desc.steps:
        rxn_smi = _build_reaction_smiles(step, desc.species)
        if rxn_smi:
            if verbose:
                print(f" Step {step.step_index}: {rxn_smi[:80]}...",
                      file=sys.stderr)
            outcome = _enrich_step(rxn_smi)
            if outcome:
                per_step[step.step_index] = outcome
            elif verbose:
                print(f" Step {step.step_index}: ML enrichment failed",
                      file=sys.stderr)
        elif verbose:
            # No reaction SMILES could be built for this step; skip it.
            print(f" Step {step.step_index}: no SMILES for rxn SMILES",
                  file=sys.stderr)
    return per_step
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def batch_enrich_schemes(descs: list, verbose: bool = False) -> list:
    """Batch ML enrichment for multiple SchemeDescriptions.

    All reaction SMILES are collected first and handed to
    ``map_reactions_batch`` in a single call.  Roles are then derived per
    reaction with ``classify_roles_from_mapping`` and, when the optional
    RXN Insight runner is importable, annotated with reaction class, name,
    byproducts and reactant functional groups.

    Args:
        descs: List of (index, SchemeDescription) tuples.
        verbose: When true, progress messages are written to stderr.

    Returns:
        List of (index, enrichment_dict) tuples, one per input entry and in
        input order.  Each enrichment_dict maps step_index to the role
        result of that step and is empty when nothing could be enriched.
    """
    if not _try_load_ml():
        if verbose:
            print("ML enrichment unavailable", file=sys.stderr)
        return [(i, {}) for i, _ in descs]

    from experiments.atom_mapping.rxn_atom_mapper import (
        map_reactions_batch, classify_roles_from_mapping,
    )

    # Resolve RDKit once instead of once per validated reaction.
    try:
        from rdkit import Chem
    except ImportError:
        Chem = None

    # Phase 1: Collect all reaction SMILES (filter out R-group/invalid)
    def _valid_rxn_smiles(rxn_smi: str) -> bool:
        """Check that both sides of reaction contain valid SMILES."""
        if Chem is None:
            return True  # can't validate, let RXNMapper try
        parts = rxn_smi.split(">>")
        if len(parts) != 2:
            return False
        for side in parts:
            for frag in side.split("."):
                if frag and Chem.MolFromSmiles(frag) is None:
                    return False
        return True

    all_rxns = []  # (desc_idx, step_idx, rxn_smiles)
    n_skipped = 0
    for desc_idx, desc in descs:
        for step in desc.steps:
            rxn_smi = _build_reaction_smiles(step, desc.species)
            if not rxn_smi:
                continue
            if _valid_rxn_smiles(rxn_smi):
                all_rxns.append((desc_idx, step.step_index, rxn_smi))
            else:
                n_skipped += 1

    if not all_rxns:
        return [(i, {}) for i, _ in descs]

    if verbose:
        msg = f"Batch mapping {len(all_rxns)} reactions via RXNMapper..."
        if n_skipped:
            msg += f" ({n_skipped} skipped: invalid/R-group SMILES)"
        print(msg, file=sys.stderr)

    # Phase 2: Batch atom mapping (one call for every collected reaction)
    rxn_smiles_list = [r[2] for r in all_rxns]
    batch_results = map_reactions_batch(rxn_smiles_list, timeout=600)

    if verbose:
        n_ok = sum(1 for r in batch_results if r is not None)
        print(f" {n_ok}/{len(batch_results)} reactions mapped",
              file=sys.stderr)

    # Phase 3: Role classification from atom maps + RXN Insight enrichment
    enrichments = {i: {} for i, _ in descs}

    # Resolve the optional RXN Insight runner once.  Python does not cache a
    # failed import, so importing inside the loop would repeat the failing
    # module search for every reaction.
    try:
        from experiments.role_classification.rxn_role_classifier import (
            _run_rxn_insight,
        )
    except ImportError:
        _run_rxn_insight = None

    for (desc_idx, step_idx, rxn_smi), map_result in zip(all_rxns, batch_results):
        if map_result is None:
            continue

        # Classify roles from atom maps
        role_result = classify_roles_from_mapping(
            original_rxn=rxn_smi,
            mapped_rxn=map_result["mapped_rxn"],
            confidence=map_result["confidence"],
        )

        # Try RXN Insight for reaction class/name (still called per step)
        insight = None
        if _run_rxn_insight is not None:
            try:
                insight = _run_rxn_insight(rxn_smi, timeout=60)
            except ImportError:
                # A dependency missing at call time is treated like an
                # unavailable runner, as before.
                insight = None

        if insight:
            role_result["reaction_class"] = insight.get("reaction_class", "")
            role_result["reaction_name"] = insight.get("reaction_name", "")
            role_result["byproducts"] = insight.get("byproducts", [])
            role_result["functional_groups_reactants"] = insight.get(
                "functional_groups_reactants", [])
        else:
            role_result["reaction_class"] = ""
            role_result["reaction_name"] = ""
            role_result["byproducts"] = []

        enrichments[desc_idx][step_idx] = role_result

        if verbose and (step_idx == 0 or desc_idx % 10 == 0):
            rc = role_result.get("reaction_class", "?")
            print(f" [{desc_idx}] step {step_idx}: {rc}",
                  file=sys.stderr)

    return [(i, enrichments[i]) for i, _ in descs]
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# ---------------------------------------------------------------------------
|
|
223
|
+
# SMILES -> structure image (RDKit SVG)
|
|
224
|
+
# ---------------------------------------------------------------------------
|
|
225
|
+
|
|
226
|
+
# Cache: smiles -> base64 data-URI SVG
|
|
227
|
+
_smiles_svg_cache: dict = {}
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _smiles_to_svg_b64(smiles: str, width: int = 200, height: int = 120) -> str:
|
|
231
|
+
"""Render a SMILES string to an inline SVG data-URI via RDKit.
|
|
232
|
+
|
|
233
|
+
Returns a data:image/svg+xml;base64,... string, or "" on failure.
|
|
234
|
+
Results are cached so duplicate SMILES are rendered only once.
|
|
235
|
+
"""
|
|
236
|
+
if not smiles:
|
|
237
|
+
return ""
|
|
238
|
+
if smiles in _smiles_svg_cache:
|
|
239
|
+
return _smiles_svg_cache[smiles]
|
|
240
|
+
|
|
241
|
+
try:
|
|
242
|
+
from rdkit import Chem
|
|
243
|
+
from rdkit.Chem.Draw import rdMolDraw2D
|
|
244
|
+
|
|
245
|
+
mol = Chem.MolFromSmiles(smiles, sanitize=False)
|
|
246
|
+
if mol is None:
|
|
247
|
+
_smiles_svg_cache[smiles] = ""
|
|
248
|
+
return ""
|
|
249
|
+
|
|
250
|
+
# Partial sanitisation — tolerate dummy atoms / R-groups
|
|
251
|
+
try:
|
|
252
|
+
Chem.SanitizeMol(mol, Chem.SanitizeFlags.SANITIZE_ALL
|
|
253
|
+
^ Chem.SanitizeFlags.SANITIZE_PROPERTIES)
|
|
254
|
+
except Exception:
|
|
255
|
+
pass
|
|
256
|
+
|
|
257
|
+
try:
|
|
258
|
+
Chem.rdDepictor.Compute2DCoords(mol)
|
|
259
|
+
except Exception:
|
|
260
|
+
pass
|
|
261
|
+
|
|
262
|
+
drawer = rdMolDraw2D.MolDraw2DSVG(width, height)
|
|
263
|
+
opts = drawer.drawOptions()
|
|
264
|
+
opts.clearBackground = True
|
|
265
|
+
opts.bondLineWidth = 1.2
|
|
266
|
+
opts.padding = 0.15
|
|
267
|
+
# Make dummy atoms (R-groups) visible
|
|
268
|
+
opts.dummyIsotopeLabels = False
|
|
269
|
+
drawer.DrawMolecule(mol)
|
|
270
|
+
drawer.FinishDrawing()
|
|
271
|
+
svg_text = drawer.GetDrawingText()
|
|
272
|
+
|
|
273
|
+
b64 = base64.b64encode(svg_text.encode("utf-8")).decode("ascii")
|
|
274
|
+
uri = f"data:image/svg+xml;base64,{b64}"
|
|
275
|
+
_smiles_svg_cache[smiles] = uri
|
|
276
|
+
return uri
|
|
277
|
+
|
|
278
|
+
except Exception:
|
|
279
|
+
_smiles_svg_cache[smiles] = ""
|
|
280
|
+
return ""
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
# ---------------------------------------------------------------------------
|
|
284
|
+
# Image rendering (optional, requires ChemDraw COM)
|
|
285
|
+
# ---------------------------------------------------------------------------
|
|
286
|
+
|
|
287
|
+
def _render_cdxml_to_png(cdxml_path: str, output_path: str) -> bool:
|
|
288
|
+
"""Render a CDXML file to PNG via cdxml_to_image. Returns True on success."""
|
|
289
|
+
try:
|
|
290
|
+
from cdxml_toolkit.chemdraw.cdxml_to_image import cdxml_to_png
|
|
291
|
+
cdxml_to_png(cdxml_path, output_path)
|
|
292
|
+
return True
|
|
293
|
+
except Exception:
|
|
294
|
+
# Fall back to subprocess call
|
|
295
|
+
try:
|
|
296
|
+
import subprocess
|
|
297
|
+
python = sys.executable
|
|
298
|
+
result = subprocess.run(
|
|
299
|
+
[python, "-m", "cdxml_toolkit.cdxml_to_image",
|
|
300
|
+
cdxml_path, "-o", output_path],
|
|
301
|
+
capture_output=True, timeout=30,
|
|
302
|
+
)
|
|
303
|
+
return result.returncode == 0 and os.path.exists(output_path)
|
|
304
|
+
except Exception:
|
|
305
|
+
return False
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def _embed_image_b64(img_path: str) -> str:
|
|
309
|
+
"""Read image file and return base64 data-URI string."""
|
|
310
|
+
if not os.path.exists(img_path):
|
|
311
|
+
return ""
|
|
312
|
+
with open(img_path, "rb") as f:
|
|
313
|
+
data = base64.b64encode(f.read()).decode("ascii")
|
|
314
|
+
ext = os.path.splitext(img_path)[1].lower()
|
|
315
|
+
mime = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
|
|
316
|
+
"gif": "image/gif", "svg": "image/svg+xml"}.get(ext.lstrip("."), "image/png")
|
|
317
|
+
return f"data:{mime};base64,{data}"
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
# ---------------------------------------------------------------------------
|
|
321
|
+
# OLE extraction helpers
|
|
322
|
+
# ---------------------------------------------------------------------------
|
|
323
|
+
|
|
324
|
+
def _extract_from_document(doc_path: str, out_dir: str) -> List[str]:
    """Extract ChemDraw objects from a PPTX/DOCX and list the CDXML outputs.

    Args:
        doc_path: Office document handed to ``extract_from_office``.
        out_dir: Directory handed to ``extract_from_office`` for its output.

    Returns:
        The ``cdxml_output`` path of every extraction result whose file
        exists on disk.  A result without a usable output but with an
        ``error`` set is reported as a warning on stderr and left out.
    """
    from cdxml_toolkit.office.ole_extractor import extract_from_office

    extracted = extract_from_office(
        doc_path,
        out_dir,
        output_format="cdxml",
        convert_method="auto",
    )

    cdxml_files = []
    for item in extracted:
        if item.cdxml_output and os.path.exists(item.cdxml_output):
            cdxml_files.append(item.cdxml_output)
            continue
        if item.error:
            print(f" Warning: {item.source_path}: {item.error}", file=sys.stderr)
    return cdxml_files
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
# ---------------------------------------------------------------------------
|
|
339
|
+
# Helpers
|
|
340
|
+
# ---------------------------------------------------------------------------
|
|
341
|
+
|
|
342
|
+
def _build_species_summary(desc) -> list:
|
|
343
|
+
"""Build a species summary list from a SchemeDescription."""
|
|
344
|
+
summary = []
|
|
345
|
+
for sid, sp in desc.species.items():
|
|
346
|
+
entry = {"id": sid, "element_type": sp.element_type}
|
|
347
|
+
if sp.label:
|
|
348
|
+
entry["label"] = sp.label
|
|
349
|
+
if sp.name:
|
|
350
|
+
entry["name"] = sp.name[:80]
|
|
351
|
+
if sp.smiles:
|
|
352
|
+
entry["smiles"] = sp.smiles[:120]
|
|
353
|
+
if sp.formula:
|
|
354
|
+
entry["formula"] = sp.formula
|
|
355
|
+
if sp.mw is not None:
|
|
356
|
+
entry["mw"] = round(sp.mw, 1)
|
|
357
|
+
if sp.text_category:
|
|
358
|
+
entry["text_category"] = sp.text_category
|
|
359
|
+
if getattr(sp, "iupac_name", None):
|
|
360
|
+
entry["iupac_name"] = sp.iupac_name
|
|
361
|
+
summary.append(entry)
|
|
362
|
+
return summary
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
# ---------------------------------------------------------------------------
|
|
366
|
+
# Parse one CDXML and return structured result
|
|
367
|
+
# ---------------------------------------------------------------------------
|
|
368
|
+
|
|
369
|
+
def _parse_one(cdxml_path: str, render: bool = False,
               img_dir: Optional[str] = None,
               use_chemscript: bool = False,
               enrich: bool = False,
               segment: bool = False) -> dict:
    """Parse one CDXML file into the dict consumed by the HTML report.

    Args:
        cdxml_path: CDXML file to read.
        render: When true (and ``img_dir`` is set) also render a PNG preview.
        img_dir: Directory that receives the rendered PNG.
        use_chemscript: Forwarded to ``read_scheme``.
        enrich: When true, run ML enrichment on schemes that have steps.
        segment: Forwarded to ``read_scheme``.

    Returns:
        Result dict.  On a parsing failure ``error`` holds
        ``"<ExceptionType>: <message>"``, the traceback is printed to
        stderr, and whatever was filled in before the failure is kept.
        Rendering is attempted regardless of whether parsing succeeded.
    """

    def _step_entry(step) -> dict:
        # Shared by top-level steps and sub-scheme steps.
        entry = {
            "idx": step.step_index,
            "reactants": step.reactant_ids,
            "products": step.product_ids,
            "reagents": step.reagent_ids,
            "arrow": step.arrow_style,
        }
        if step.conditions:
            entry["conditions"] = step.conditions[:5]  # cap for display
        if step.yield_text:
            entry["yield"] = step.yield_text
        return entry

    result = dict(
        file=os.path.basename(cdxml_path),
        path=cdxml_path,
        error=None,
        narrative="",
        topology="",
        num_steps=0,
        species_summary=[],
        steps_summary=[],
        warnings=[],
        image_b64="",
        json_full=None,
    )

    # Parse
    try:
        desc = read_scheme(cdxml_path, use_network=False,
                           use_chemscript=use_chemscript, verbose=False,
                           segment=segment)
        result["narrative"] = desc.narrative
        result["topology"] = desc.topology
        result["content_type"] = desc.content_type or "unknown"
        result["num_steps"] = desc.num_steps
        result["warnings"] = desc.warnings
        result["_desc"] = desc  # keep for Tier 2 corrections

        result["species_summary"] = _build_species_summary(desc)

        for step in desc.steps:
            result["steps_summary"].append(_step_entry(step))

        result["json_full"] = desc.to_dict()

        # Sub-scheme data (when segmentation is active)
        if desc.sub_schemes:
            result["sub_schemes"] = []
            for sub in desc.sub_schemes:
                sub_info = {
                    "num_steps": sub.num_steps,
                    "topology": sub.topology,
                    "content_type": sub.content_type or "unknown",
                    "num_species": len(sub.species),
                    "narrative": sub.narrative,
                    "species_summary": _build_species_summary(sub),
                    "steps_summary": [],
                }
                for step in sub.steps:
                    sub_info["steps_summary"].append(_step_entry(step))
                result["sub_schemes"].append(sub_info)

        # ML enrichment (optional); a failure here is reported but does not
        # fail the whole parse.
        ml_enrichment = {}
        if enrich and desc.steps:
            try:
                ml_enrichment = enrich_scheme(desc, verbose=True)
                result["ml_enrichment"] = ml_enrichment
            except Exception as exc:
                print(f" ML enrichment error: {exc}", file=sys.stderr)

        # LLM-quality narrative (with ML grounding when available)
        try:
            result["llm_narrative"] = generate_llm_narrative(
                desc, ml_enrichment=ml_enrichment)
        except Exception:
            result["llm_narrative"] = ""

    except Exception as e:
        result["error"] = f"{type(e).__name__}: {e}"
        traceback.print_exc(file=sys.stderr)

    # Render image
    if render and img_dir:
        png_name = Path(cdxml_path).stem + ".png"
        png_path = os.path.join(img_dir, png_name)
        if _render_cdxml_to_png(cdxml_path, png_path):
            result["image_b64"] = _embed_image_b64(png_path)

    return result
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
# ---------------------------------------------------------------------------
|
|
479
|
+
# HTML report generation
|
|
480
|
+
# ---------------------------------------------------------------------------
|
|
481
|
+
|
|
482
|
+
_CSS = """
|
|
483
|
+
:root { --bg: #f8f9fa; --card: #fff; --border: #dee2e6; --accent: #0d6efd;
|
|
484
|
+
--green: #198754; --red: #dc3545; --muted: #6c757d; }
|
|
485
|
+
* { box-sizing: border-box; margin: 0; padding: 0; }
|
|
486
|
+
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
|
487
|
+
background: var(--bg); color: #212529; line-height: 1.5; padding: 24px; }
|
|
488
|
+
h1 { font-size: 1.6rem; margin-bottom: 8px; }
|
|
489
|
+
.subtitle { color: var(--muted); margin-bottom: 24px; }
|
|
490
|
+
.stats { display: flex; gap: 16px; margin-bottom: 24px; flex-wrap: wrap; }
|
|
491
|
+
.stat { background: var(--card); border: 1px solid var(--border);
|
|
492
|
+
border-radius: 8px; padding: 12px 20px; min-width: 140px; }
|
|
493
|
+
.stat-value { font-size: 1.5rem; font-weight: 700; }
|
|
494
|
+
.stat-label { font-size: 0.85rem; color: var(--muted); }
|
|
495
|
+
.card { background: var(--card); border: 1px solid var(--border);
|
|
496
|
+
border-radius: 8px; margin-bottom: 20px; overflow: hidden; }
|
|
497
|
+
.card-header { padding: 12px 16px; border-bottom: 1px solid var(--border);
|
|
498
|
+
display: flex; align-items: center; gap: 12px;
|
|
499
|
+
cursor: pointer; user-select: none; }
|
|
500
|
+
.card-header:hover { background: #f1f3f5; }
|
|
501
|
+
.card-header h2 { font-size: 1rem; flex: 1; }
|
|
502
|
+
.badge { padding: 2px 8px; border-radius: 12px; font-size: 0.75rem;
|
|
503
|
+
font-weight: 600; }
|
|
504
|
+
.badge-topo { background: #e7f1ff; color: var(--accent); }
|
|
505
|
+
.badge-content { background: #f3e8ff; color: #6f42c1; }
|
|
506
|
+
.badge-steps { background: #d1e7dd; color: var(--green); }
|
|
507
|
+
.badge-error { background: #f8d7da; color: var(--red); }
|
|
508
|
+
.badge-warn { background: #fff3cd; color: #856404; }
|
|
509
|
+
.badge-cat { background: #e2e3e5; color: #41464b; font-size: 0.7rem; padding: 1px 6px; }
|
|
510
|
+
.badge-cat-cond { background: #cff4fc; color: #055160; }
|
|
511
|
+
.badge-cat-cite { background: #e2d9f3; color: #432874; }
|
|
512
|
+
.badge-cat-bio { background: #f8d7da; color: var(--red); }
|
|
513
|
+
.card-body { padding: 16px; display: none; }
|
|
514
|
+
.card.open .card-body { display: block; }
|
|
515
|
+
.two-col { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; }
|
|
516
|
+
@media (max-width: 900px) { .two-col { grid-template-columns: 1fr; } }
|
|
517
|
+
.img-box { text-align: center; }
|
|
518
|
+
.img-box img { max-width: 100%; border: 1px solid var(--border); border-radius: 4px; }
|
|
519
|
+
.no-img { padding: 40px; text-align: center; color: var(--muted);
|
|
520
|
+
background: #f1f3f5; border-radius: 4px; }
|
|
521
|
+
.narrative { background: #f8f9fa; padding: 12px; border-radius: 4px;
|
|
522
|
+
white-space: pre-wrap; font-size: 0.9rem; line-height: 1.6; }
|
|
523
|
+
.section-title { font-weight: 600; font-size: 0.9rem; margin: 12px 0 6px;
|
|
524
|
+
color: var(--muted); text-transform: uppercase;
|
|
525
|
+
letter-spacing: 0.5px; }
|
|
526
|
+
table { width: 100%; border-collapse: collapse; font-size: 0.85rem; margin-top: 4px; }
|
|
527
|
+
th, td { padding: 6px 10px; text-align: left; border-bottom: 1px solid var(--border); }
|
|
528
|
+
th { background: #f1f3f5; font-weight: 600; position: sticky; top: 0; }
|
|
529
|
+
.smiles { font-family: "Courier New", monospace; font-size: 0.75rem;
|
|
530
|
+
word-break: break-all; max-width: 280px; color: var(--muted); }
|
|
531
|
+
.struct-cell { padding: 2px 4px; }
|
|
532
|
+
.struct-img { width: 160px; height: 96px; object-fit: contain;
|
|
533
|
+
border: 1px solid #e9ecef; border-radius: 3px; background: #fff;
|
|
534
|
+
vertical-align: middle; }
|
|
535
|
+
.no-struct { color: #ccc; }
|
|
536
|
+
.json-toggle { color: var(--accent); cursor: pointer; font-size: 0.85rem;
|
|
537
|
+
text-decoration: underline; margin-top: 8px; display: inline-block; }
|
|
538
|
+
.json-block { display: none; background: #f1f3f5; padding: 12px;
|
|
539
|
+
border-radius: 4px; font-family: monospace; font-size: 0.8rem;
|
|
540
|
+
white-space: pre-wrap; max-height: 400px; overflow: auto;
|
|
541
|
+
margin-top: 6px; }
|
|
542
|
+
.chevron { transition: transform 0.2s; display: inline-block; }
|
|
543
|
+
.card.open .chevron { transform: rotate(90deg); }
|
|
544
|
+
.verdict { padding: 3px 10px; border-radius: 4px; font-size: 0.8rem;
|
|
545
|
+
font-weight: 600; display: inline-block; }
|
|
546
|
+
.verdict-ok { background: #d1e7dd; color: var(--green); }
|
|
547
|
+
.verdict-warn { background: #fff3cd; color: #856404; }
|
|
548
|
+
.verdict-fail { background: #f8d7da; color: var(--red); }
|
|
549
|
+
.tier-label { font-weight: 700; font-size: 0.8rem; text-transform: uppercase;
|
|
550
|
+
letter-spacing: 0.5px; margin-bottom: 4px; }
|
|
551
|
+
.tier-1 .tier-label { color: var(--accent); }
|
|
552
|
+
.tier-2 .tier-label { color: #198754; }
|
|
553
|
+
.tier-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px;
|
|
554
|
+
margin-top: 8px; }
|
|
555
|
+
.tier-col { padding: 10px; border-radius: 6px; }
|
|
556
|
+
.tier-1 { background: #f8f9fa; border: 1px solid #dee2e6; }
|
|
557
|
+
.tier-2 { background: #f0faf4; border: 1px solid #a3cfbb; }
|
|
558
|
+
.correction-note { font-size: 0.8rem; color: #495057; font-style: italic;
|
|
559
|
+
margin-top: 6px; padding: 6px 10px; background: #fff3cd;
|
|
560
|
+
border-radius: 4px; }
|
|
561
|
+
.diff-highlight { background: #fff3cd; padding: 1px 4px; border-radius: 2px; }
|
|
562
|
+
.badge-t2 { background: #d1e7dd; color: var(--green); }
|
|
563
|
+
"""
|
|
564
|
+
|
|
565
|
+
_JS = """
|
|
566
|
+
document.querySelectorAll('.card-header').forEach(h => {
|
|
567
|
+
h.addEventListener('click', () => h.parentElement.classList.toggle('open'));
|
|
568
|
+
});
|
|
569
|
+
document.querySelectorAll('.json-toggle').forEach(t => {
|
|
570
|
+
t.addEventListener('click', e => {
|
|
571
|
+
e.stopPropagation();
|
|
572
|
+
const block = t.nextElementSibling;
|
|
573
|
+
block.style.display = block.style.display === 'block' ? 'none' : 'block';
|
|
574
|
+
});
|
|
575
|
+
});
|
|
576
|
+
function expandAll() {
|
|
577
|
+
document.querySelectorAll('.card').forEach(c => c.classList.add('open'));
|
|
578
|
+
}
|
|
579
|
+
function collapseAll() {
|
|
580
|
+
document.querySelectorAll('.card').forEach(c => c.classList.remove('open'));
|
|
581
|
+
}
|
|
582
|
+
"""
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def _species_table_html(species: list) -> str:
    """Render species summary dicts as an HTML ``<table>``.

    Args:
        species: List of dicts. The keys read are ``id``, ``label``,
            ``name``, ``iupac_name``, ``smiles``, ``formula``, ``mw``,
            ``element_type`` and ``text_category``; all are optional.

    Returns:
        The table markup, or a muted "No species detected" paragraph when
        *species* is empty.

    All free-text values are HTML-escaped before insertion so that labels or
    names containing ``<``, ``>``, ``&`` or quotes cannot break the markup.
    Missing or ``None`` values render as an empty cell.
    """
    from html import escape

    if not species:
        return '<p style="color:var(--muted)">No species detected</p>'

    def _txt(value) -> str:
        # Values may be numbers (e.g. MW) or None; normalise, then escape.
        return escape("" if value is None else str(value))

    # CSS class of the badge shown for text elements, keyed by category.
    text_badge_css = {
        "condition_ref": "badge-cat-cond",
        "citation": "badge-cat-cite",
        "bioactivity": "badge-cat-bio",
    }

    rows = []
    for sp in species:
        smiles = sp.get("smiles", "")
        etype = sp.get("element_type", "")
        tcat = sp.get("text_category", "")

        # Type/category badge
        if etype == "text":
            cat_css = text_badge_css.get(tcat, "badge-cat")
            type_html = (f'<span class="badge {cat_css}">'
                         f'{_txt(tcat or "text")}</span>')
        else:
            type_html = (f'<span class="badge badge-cat">'
                         f'{_txt(etype or "?")}</span>')

        # Render SMILES to an embedded SVG image; the helper is only called
        # when a SMILES string is present.
        svg_uri = _smiles_to_svg_b64(smiles) if smiles else ""
        if svg_uri:
            struct_html = (f'<img src="{svg_uri}" class="struct-img" '
                           f'alt="{_txt(smiles)}">')
        else:
            struct_html = '<span class="no-struct">-</span>'

        iupac = sp.get("iupac_name", "")
        iupac_html = (
            f'<br><span style="color:#0d6efd;font-size:0.85em">'
            f'{_txt(iupac)}</span>'
            if iupac else "")

        id_html = _txt(sp.get("id", ""))
        label_html = _txt(sp.get("label", ""))
        name_html = _txt(sp.get("name", ""))
        smiles_html = _txt(smiles)
        formula_html = _txt(sp.get("formula", ""))
        mw_html = _txt(sp.get("mw", ""))

        rows.append(f"""<tr>
            <td>{id_html}</td>
            <td>{type_html}</td>
            <td><b>{label_html}</b></td>
            <td>{name_html}{iupac_html}</td>
            <td class="struct-cell">{struct_html}</td>
            <td class="smiles">{smiles_html}</td>
            <td>{formula_html}</td>
            <td>{mw_html}</td>
        </tr>""")

    return f"""<table>
        <tr><th>ID</th><th>Type</th><th>Label</th><th>Name / IUPAC</th><th>Structure</th><th>SMILES</th><th>Formula</th><th>MW</th></tr>
        {''.join(rows)}
    </table>"""
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
def _steps_table_html(steps: list) -> str:
    """Render reaction-step summary dicts as an HTML ``<table>``.

    Args:
        steps: List of dicts. ``idx`` (zero-based step index) is required;
            ``reactants``, ``products``, ``reagents``, ``conditions``,
            ``yield`` and ``arrow`` are optional.

    Returns:
        The table markup, or a muted "No steps detected" paragraph when
        *steps* is empty.

    Only the first three conditions of a step are shown.  All free-text
    values are HTML-escaped so text such as ``<rt`` or ``a & b`` cannot
    corrupt the markup.  Missing or ``None`` lists are treated as empty.
    """
    from html import escape

    if not steps:
        return '<p style="color:var(--muted)">No steps detected</p>'

    arrow_icons = {"solid": "->", "dashed": "-->", "failed": "X->"}

    def _joined(items, sep: str) -> str:
        # Tolerate None / non-string entries, then escape the joined text.
        return escape(sep.join(str(item) for item in (items or [])))

    rows = []
    for s in steps:
        r_ids = _joined(s.get("reactants"), ", ")
        p_ids = _joined(s.get("products"), ", ")
        rg_ids = _joined(s.get("reagents"), ", ")
        conds = _joined((s.get("conditions") or [])[:3], "; ")
        raw_yield = s.get("yield", "")
        yld = escape("" if raw_yield is None else str(raw_yield))
        # Unknown arrow styles fall back to the plain solid arrow.
        arrow_icon = arrow_icons.get(s.get("arrow", "solid"), "->")
        step_no = s["idx"] + 1  # displayed one-based
        rows.append(f"""<tr>
            <td>{step_no}</td>
            <td>{r_ids}</td>
            <td>{arrow_icon}</td>
            <td>{p_ids}</td>
            <td>{rg_ids}</td>
            <td>{conds}</td>
            <td>{yld}</td>
        </tr>""")

    return f"""<table>
        <tr><th>#</th><th>Reactants</th><th></th><th>Products</th>
        <th>Reagents</th><th>Conditions</th><th>Yield</th></tr>
        {''.join(rows)}
    </table>"""
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def _sub_schemes_html(sub_schemes: list) -> str:
    """Build the composite-scheme section of a card.

    The result is a purple heading stating how many sub-schemes there are,
    followed by one collapsible ``<details>`` element per sub-scheme holding
    its badges, narrative, species table and steps table.  An empty input
    yields an empty string.
    """
    if not sub_schemes:
        return ""

    def _details_block(number: int, scheme: dict) -> str:
        # One collapsible panel; *number* is the one-based display index.
        topology = scheme.get("topology", "?")
        content_type = scheme.get("content_type", "unknown")
        step_count = scheme.get("num_steps", 0)
        species_count = scheme.get("num_species", 0)
        story = scheme.get("narrative", "")
        species_rows = scheme.get("species_summary", [])
        step_rows = scheme.get("steps_summary", [])
        return f"""
        <details style="margin:8px 0;border:1px solid #ddd;border-radius:4px;padding:8px">
        <summary style="cursor:pointer;font-weight:600">
        Sub-scheme {number}
        <span class="badge badge-topo">{topology}</span>
        <span class="badge badge-content">{content_type}</span>
        <span class="badge badge-steps">{step_count} steps, {species_count} species</span>
        </summary>
        <div style="margin-top:8px">
        <div class="narrative">{story}</div>
        <div class="section-title" style="font-size:0.85rem">
        Species ({len(species_rows)})
        </div>
        {_species_table_html(species_rows)}
        <div class="section-title" style="font-size:0.85rem">
        Steps
        </div>
        {_steps_table_html(step_rows)}
        </div>
        </details>
        """

    heading = (
        '<div class="section-title" style="color:#6f42c1">'
        f'Composite Scheme: {len(sub_schemes)} independent sub-schemes'
        '</div>'
    )
    panels = [_details_block(number, scheme)
              for number, scheme in enumerate(sub_schemes, start=1)]
    return "\n".join([heading, *panels])
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
def _verdict(result: dict) -> Tuple[str, str]:
    """Classify a parse result for the verdict badge.

    Returns a ``(css_class, text)`` pair.  Checks are applied in priority
    order: a parse error wins, then a scheme with zero steps, then the
    presence of warnings; anything else is a plain "OK".
    """
    if result["error"]:
        verdict = ("verdict-fail", "PARSE ERROR")
    elif result["num_steps"] == 0:
        verdict = ("verdict-warn", "NO STEPS")
    elif result["warnings"]:
        warning_count = len(result["warnings"])
        verdict = ("verdict-warn", f"OK ({warning_count} warnings)")
    else:
        verdict = ("verdict-ok", "OK")
    return verdict
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
def _tier2_summary_html(t1: dict, t2_corrections: dict, t2_desc) -> str:
    """Build the side-by-side Tier 1 / Tier 2 comparison block.

    Args:
        t1: Tier 1 result dict; ``narrative``, ``content_type`` and
            ``topology`` are read from it.
        t2_corrections: Correction dict; ``content_type``, ``topology``,
            ``species_corrections`` and ``notes`` are read from it.
        t2_desc: Object whose ``narrative`` attribute supplies the Tier 2
            text, or a falsy value when unavailable.

    Returns:
        HTML with both narratives and a list of changed fields, or an empty
        string when *t2_corrections* is empty.
    """
    if not t2_corrections:
        return ""

    changes = []

    # Scheme-level fields: listed only when the corrected value differs.
    scheme_fields = (
        ("content_type", "Content type", "unknown"),
        ("topology", "Topology", "?"),
    )
    for key, title, fallback in scheme_fields:
        if key not in t2_corrections:
            continue
        before = t1.get(key, fallback)
        after = t2_corrections[key]
        if before != after:
            changes.append(
                f'<b>{title}</b>: '
                f'<span class="diff-highlight">{before} → {after}</span>')

    # Per-species fixes are always listed (only the new value is known).
    species_fixes = t2_corrections.get("species_corrections", {})
    changes.extend(
        f'<b>{sp_id}.{field}</b>: '
        f'<span class="diff-highlight">→ {val}</span>'
        for sp_id, fixes in species_fixes.items()
        for field, val in fixes.items())

    notes = t2_corrections.get("notes", "")
    if notes:
        note_html = f'<div class="correction-note">{notes}</div>'
    else:
        note_html = ""

    if t2_desc:
        t2_narrative = t2_desc.narrative
    else:
        t2_narrative = ""

    if changes:
        changes_html = "<br>".join(changes)
    else:
        changes_html = "No field changes"

    t1_narrative = t1.get('narrative', '')

    return f"""
    <div class="tier-row">
    <div class="tier-col tier-1">
    <div class="tier-label">Tier 1 (Deterministic)</div>
    <div class="narrative">{t1_narrative}</div>
    </div>
    <div class="tier-col tier-2">
    <div class="tier-label">Tier 2 (LLM-Refined)</div>
    <div class="narrative">{t2_narrative}</div>
    </div>
    </div>
    <div style="margin-top:8px">
    <b style="font-size:0.85rem">LLM Corrections:</b><br>
    <span style="font-size:0.85rem">{changes_html}</span>
    {note_html}
    </div>
    """
|
|
765
|
+
|
|
766
|
+
|
|
767
|
+
def _card_html(idx: int, result: dict) -> str:
    """Generate the HTML for one collapsible scheme card.

    Args:
        idx: Position of the card in the report; used for the element id.
        result: Parse-result dict.  ``error``, ``image_b64``, ``file`` and
            ``num_steps`` are required; ``species_summary``, ``steps_summary``
            and (when no Tier 2 corrections exist) ``narrative`` are required
            for non-error results.  ``warnings``, ``llm_narrative``,
            ``sub_schemes``, ``json_full``, ``ml_enrichment``,
            ``_t2_corrections`` and ``_t2_desc`` are optional.

    Returns:
        The card markup: header badges, rendered image, body and an optional
        JSON dump.

    Notes:
        * Error text, warnings, the file name and the JSON dump are
          HTML-escaped.  Narratives are inserted unchanged.
          NOTE(review): confirm whether narratives may carry markup before
          escaping them as well.
        * A result counts as Tier 2-corrected only when its corrections dict
          is non-empty, which matches the "LLM-refined" counter in
          ``generate_report``.  An empty dict previously produced a card with
          no narrative at all.
    """
    from html import escape

    v_class, v_text = _verdict(result)
    t2_corr = result.get("_t2_corrections") or {}
    has_t2 = bool(t2_corr)

    # Image section
    if result["image_b64"]:
        img_src = result["image_b64"]
        img_html = f'<img src="{img_src}" alt="Rendered scheme">'
    else:
        img_html = ('<div class="no-img">No rendered image<br>'
                    '(use --render to enable ChemDraw rendering)</div>')

    if result["error"]:
        # Error display: exception text often contains '<' and '>'.
        error_text = escape(str(result["error"]))
        body_html = (f'<div class="narrative" style="color:var(--red)">'
                     f'{error_text}</div>')
    else:
        llm_nar = result.get("llm_narrative", "")
        if has_t2:
            # Dual-tier display, optionally followed by the LLM narrative.
            intro_html = _tier2_summary_html(
                result, t2_corr, result.get("_t2_desc"))
            if llm_nar:
                intro_html += f"""
                <div style="margin-top:10px">
                <div class="tier-col tier-2" style="margin-bottom:8px">
                <div class="tier-label">LLM Narrative</div>
                <div class="narrative">{llm_nar}</div>
                </div>
                </div>
                """
        elif llm_nar:
            parser_nar = result['narrative']
            intro_html = f"""
            <div class="tier-row">
            <div class="tier-col tier-1">
            <div class="tier-label">Parser Output</div>
            <div class="narrative">{parser_nar}</div>
            </div>
            <div class="tier-col tier-2">
            <div class="tier-label">LLM Narrative</div>
            <div class="narrative">{llm_nar}</div>
            </div>
            </div>
            """
        else:
            parser_nar = result['narrative']
            intro_html = f"""
            <div class="section-title">Narrative</div>
            <div class="narrative">{parser_nar}</div>
            """

        # Species, steps and warnings are shared by every non-error layout.
        species = result['species_summary']
        warnings_html = "".join(
            f'<div class="badge badge-warn" style="margin-top:4px">'
            f'{escape(str(w))}</div>'
            for w in result.get('warnings', []))
        body_html = f"""
        {intro_html}

        <div class="section-title">Species Registry ({len(species)} species)</div>
        {_species_table_html(species)}

        <div class="section-title">Reaction Steps</div>
        {_steps_table_html(result['steps_summary'])}

        {warnings_html}
        """

    # Sub-schemes display (when segmentation is active)
    sub_schemes = result.get("sub_schemes", [])
    body_html += _sub_schemes_html(sub_schemes)

    # JSON toggle
    json_html = ""
    if result.get("json_full"):
        json_str = json.dumps(result["json_full"], indent=2,
                              ensure_ascii=False)
        # Escape &, < and > so the dump is shown literally, not parsed.
        json_str = escape(json_str, quote=False)
        json_html = f"""
        <span class="json-toggle">Show full JSON</span>
        <div class="json-block">{json_str}</div>
        """

    # Header badges: show Tier 2 values if corrected.
    topo_display = t2_corr.get("topology", result.get("topology", "?"))
    ctype_display = t2_corr.get("content_type",
                                result.get("content_type", "unknown"))
    t2_badge = ' <span class="badge badge-t2">T2</span>' if has_t2 else ""
    ml_badge = (
        ' <span class="badge" style="background:#cce5ff;color:#004085">ML</span>'
        if result.get("ml_enrichment") else "")
    n_sub = len(sub_schemes)
    seg_badge = (
        f' <span class="badge" style="background:#e8daef;color:#6f42c1">'
        f'{n_sub} sub-schemes</span>' if n_sub > 0 else "")

    file_label = escape(str(result['file']))
    num_steps = result['num_steps']

    return f"""
    <div class="card" id="card-{idx}">
    <div class="card-header">
    <span class="chevron">▶</span>
    <h2>{file_label}</h2>
    <span class="badge badge-topo">{topo_display}</span>
    <span class="badge badge-content">{ctype_display}</span>
    <span class="badge badge-steps">{num_steps} steps</span>
    <span class="verdict {v_class}">{v_text}</span>{t2_badge}{ml_badge}{seg_badge}
    </div>
    <div class="card-body">
    <div class="two-col">
    <div class="img-box">{img_html}</div>
    <div>{body_html}</div>
    </div>
    {json_html}
    </div>
    </div>
    """
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
def generate_report(results: List[dict], output_path: str,
                    title: str = "Scheme Reader Verification Report") -> None:
    """Write the HTML verification report for *results* to *output_path*.

    The page shows a row of summary counters, Expand All / Collapse All
    buttons and one card per result.  Cards for results with an error come
    first; within each group cards are ordered by their ``file`` value.
    A confirmation line is printed to stdout once the file is written.
    """
    n_total = len(results)
    n_ok = n_warn = n_err = n_empty = n_t2 = n_ml = 0
    for entry in results:
        if entry["error"]:
            n_err += 1
        else:
            if entry["num_steps"] > 0:
                n_ok += 1
            if entry["warnings"]:
                n_warn += 1
            if entry["num_steps"] == 0:
                n_empty += 1
        if entry.get("_t2_corrections"):
            n_t2 += 1
        if entry.get("ml_enrichment"):
            n_ml += 1

    # Sort: errors first, then by filename
    def _sort_key(entry: dict):
        return (0 if entry["error"] else 1, entry["file"])

    ordered = sorted(results, key=_sort_key)
    cards = "\n".join(
        _card_html(position, entry) for position, entry in enumerate(ordered))

    # (value, caption, optional colour) for each counter tile, in page order.
    tiles = (
        (n_total, "Total schemes", None),
        (n_ok, "Parsed OK", "var(--green)"),
        (n_warn, "With warnings", "#856404"),
        (n_empty, "No steps found", "var(--muted)"),
        (n_err, "Parse errors", "var(--red)"),
        (n_t2, "LLM-refined", "#198754"),
        (n_ml, "ML-enriched", "#0d6efd"),
    )
    tile_chunks = []
    for value, caption, colour in tiles:
        style_attr = f' style="color:{colour}"' if colour else ""
        tile_chunks.append(
            '<div class="stat">\n'
            f'<div class="stat-value"{style_attr}>{value}</div>\n'
            f'<div class="stat-label">{caption}</div>\n'
            '</div>')
    stats_html = "\n".join(tile_chunks)

    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{title}</title>
<style>{_CSS}</style>
</head>
<body>
<h1>{title}</h1>
<p class="subtitle">Visual verification of scheme_reader output.
Click a card to expand. Compare the rendered image with the parsed narrative.</p>

<div class="stats">
{stats_html}
</div>

<div style="margin-bottom: 16px;">
<button onclick="expandAll()" style="padding:6px 14px;cursor:pointer;border:1px solid var(--border);border-radius:4px;background:var(--card)">Expand All</button>
<button onclick="collapseAll()" style="padding:6px 14px;cursor:pointer;border:1px solid var(--border);border-radius:4px;background:var(--card);margin-left:6px">Collapse All</button>
</div>

{cards}

<script>{_JS}</script>
</body>
</html>"""

    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(page)
    print(f"Report written to {output_path} ({n_total} schemes)")
|
|
961
|
+
|
|
962
|
+
|
|
963
|
+
# ---------------------------------------------------------------------------
|
|
964
|
+
# Main entry point
|
|
965
|
+
# ---------------------------------------------------------------------------
|
|
966
|
+
|
|
967
|
+
def _collect_cdxml_inputs(inputs, tmp_dirs: list) -> list:
    """Expand CLI inputs into ``(cdxml_path, source_label)`` pairs.

    PPTX/DOCX documents are extracted into fresh temporary directories, which
    are appended to *tmp_dirs* so the caller can remove them afterwards.
    The label is the document or directory base name, or ``None`` for a
    CDXML file given directly.
    """
    pairs = []
    for inp in inputs:
        inp = os.path.abspath(inp)
        ext = os.path.splitext(inp)[1].lower()

        if ext in (".pptx", ".docx"):
            # Extract from document
            doc_name = Path(inp).stem
            tmp = tempfile.mkdtemp(prefix=f"sr_verify_{doc_name}_")
            tmp_dirs.append(tmp)
            print(f"Extracting from {os.path.basename(inp)}...",
                  file=sys.stderr)
            extracted = _extract_from_document(inp, tmp)
            # Tag with source document
            pairs.extend((p, os.path.basename(inp)) for p in extracted)
            print(f" -> {len(extracted)} ChemDraw objects", file=sys.stderr)
        elif ext == ".cdxml":
            pairs.append((inp, None))
        elif os.path.isdir(inp):
            folder = os.path.basename(inp)
            pairs.extend(
                (os.path.join(inp, fn), folder)
                for fn in sorted(os.listdir(inp))
                if fn.lower().endswith(".cdxml"))
        else:
            print(f"Skipping unknown input: {inp}", file=sys.stderr)
    return pairs


def _find_correction_key(corrections_map, name: str, source_doc):
    """Return the first key of *corrections_map* matching this file, or None.

    Candidates, in priority order: ``<source_doc>/<name>``, ``<name>``, then
    ``docx/<name>``, ``pptx/<name>`` and ``showcase/<name>`` for each tag that
    occurs (case-insensitively) in *source_doc*.
    """
    if source_doc:
        lowered = source_doc.lower()
        candidates = [f"{source_doc}/{name}", name]
        candidates.extend(f"{tag}/{name}"
                          for tag in ("docx", "pptx", "showcase")
                          if tag in lowered)
    else:
        candidates = [name]
    return next((c for c in candidates if c and c in corrections_map), None)


def main():
    """Command-line entry point for the verification report.

    Steps: collect CDXML inputs, parse each one, apply optional Tier 2
    corrections, optionally enrich with aligned IUPAC names (``--chemscript``)
    and ML annotations (``--enrich``), then write the HTML report.

    Temporary directories are removed on every exit path, including the
    early ``sys.exit(1)`` taken when no CDXML files are found.  Failures in
    optional post-processing steps are reported on stderr and do not abort
    the run.
    """
    import shutil  # only needed for temp-dir cleanup

    parser = argparse.ArgumentParser(
        description="Generate a visual verification report for scheme_reader")
    parser.add_argument("inputs", nargs="+",
                        help="CDXML files, directories of CDXML files, "
                             "or PPTX/DOCX documents")
    parser.add_argument("-o", "--output", default="scheme_reader_report.html",
                        help="Output HTML file (default: scheme_reader_report.html)")
    parser.add_argument("--render", action="store_true",
                        help="Render CDXML to PNG via ChemDraw COM "
                             "(requires ChemDraw to be closed)")
    parser.add_argument("--chemscript", action="store_true",
                        help="Use ChemScript for SMILES (best abbreviation "
                             "resolution, requires ChemDraw 16+ on Windows)")
    parser.add_argument("--corrections",
                        help="Tier 2 corrections JSON file "
                             "(maps source_key to correction dict)")
    parser.add_argument("--enrich", action="store_true",
                        help="Run RXNMapper + RXN Insight ML enrichment "
                             "per step (requires chem-pipeline rxn-experiments)")
    parser.add_argument("--segment", action="store_true",
                        help="Auto-segment multi-panel CDXML files into "
                             "independent sub-schemes")
    parser.add_argument("--title", default="Scheme Reader Verification Report",
                        help="Report title")
    args = parser.parse_args()

    tmp_dirs: List[str] = []
    try:
        # Collect all CDXML paths as (path, source label) pairs
        cdxml_paths = _collect_cdxml_inputs(args.inputs, tmp_dirs)
        if not cdxml_paths:
            print("No CDXML files found.", file=sys.stderr)
            sys.exit(1)

        # Optional image rendering directory
        img_dir = None
        if args.render:
            img_dir = tempfile.mkdtemp(prefix="sr_verify_img_")
            tmp_dirs.append(img_dir)

        # Load Tier 2 corrections if provided
        corrections_map = {}
        if args.corrections:
            with open(args.corrections, "r", encoding="utf-8") as f:
                corrections_map = json.load(f)
            print(f"Loaded {len(corrections_map)} Tier 2 corrections",
                  file=sys.stderr)

        # Parse all (Phase 1: deterministic parsing, no ML enrichment yet)
        results = []
        total = len(cdxml_paths)
        for position, (cdxml_path, source_doc) in enumerate(cdxml_paths,
                                                            start=1):
            name = os.path.basename(cdxml_path)
            display_name = f"[{source_doc}] {name}" if source_doc else name
            print(f" [{position}/{total}] {display_name}", file=sys.stderr)
            result = _parse_one(cdxml_path, render=args.render,
                                img_dir=img_dir,
                                use_chemscript=args.chemscript,
                                enrich=False,  # ML enrichment handled in batch below
                                segment=args.segment)
            if source_doc:
                result["file"] = display_name

            # Apply Tier 2 corrections if available
            corr_key = _find_correction_key(corrections_map, name, source_doc)
            if corr_key and result.get("_desc"):
                corr = corrections_map[corr_key]
                try:
                    t2_desc = apply_corrections(result["_desc"], corr)
                except Exception as e:
                    print(f" Warning: Tier 2 correction failed for {name}: {e}",
                          file=sys.stderr)
                else:
                    result["_t2_corrections"] = corr
                    result["_t2_desc"] = t2_desc

            results.append(result)

        # Regenerate LLM narrative from corrected desc (Tier 2) where
        # available, so content_type/topology corrections flow into it.
        for r in results:
            t2 = r.get("_t2_desc")
            if not t2:
                continue
            try:
                r["llm_narrative"] = generate_llm_narrative(t2)
            except Exception as e:
                # Best effort: keep the existing narrative.
                print(f" Warning: narrative regeneration failed for "
                      f"{r.get('file')}: {e}", file=sys.stderr)

        # Phase 1.5: Aligned IUPAC name enrichment (requires ChemScript)
        if args.chemscript:
            n_aligned_total = 0
            for r in results:
                desc = r.get("_t2_desc") or r.get("_desc")
                if not (desc and desc.steps):
                    continue
                try:
                    n = enrich_aligned_names(desc)
                    if n:
                        n_aligned_total += n
                        r["llm_narrative"] = generate_llm_narrative(desc)
                        # Rebuild species summary to include updated names
                        r["species_summary"] = _build_species_summary(desc)
                except Exception as e:
                    # Best effort: this scheme keeps its unaligned names.
                    print(f" Warning: aligned-name enrichment failed for "
                          f"{r.get('file')}: {e}", file=sys.stderr)
            if n_aligned_total:
                print(f" Aligned IUPAC names: {n_aligned_total} species updated",
                      file=sys.stderr)

        # Phase 2: Batch ML enrichment (one batch call for all reactions)
        if args.enrich:
            # Collect schemes with steps for enrichment
            descs_for_enrich = [
                (i, r["_desc"]) for i, r in enumerate(results)
                if r.get("_desc") and r["_desc"].steps]

            if descs_for_enrich:
                print(f"\nBatch ML enrichment for {len(descs_for_enrich)} schemes...",
                      file=sys.stderr)
                batch_results = batch_enrich_schemes(descs_for_enrich,
                                                     verbose=True)

                # Apply enrichment and regenerate narratives.  Use the
                # corrected T2 desc when available so corrections flow into
                # the narrative.  Count in the same pass so a one-shot
                # iterable is not consumed twice.
                n_enriched = 0
                for desc_idx, enrichment in batch_results:
                    if not enrichment:
                        continue
                    n_enriched += 1
                    target = results[desc_idx]
                    target["ml_enrichment"] = enrichment
                    desc = target.get("_t2_desc") or target.get("_desc")
                    if not desc:
                        continue
                    try:
                        target["llm_narrative"] = generate_llm_narrative(
                            desc, ml_enrichment=enrichment)
                    except Exception as e:
                        print(f" Warning: narrative regeneration failed for "
                              f"{target.get('file')}: {e}", file=sys.stderr)
                print(f" {n_enriched} schemes enriched", file=sys.stderr)

        # Clean up internal fields before report ("_t2_desc" and
        # "_t2_corrections" stay: the card renderer reads them).
        for r in results:
            r.pop("_desc", None)

        # Generate report
        generate_report(results, args.output, title=args.title)
    finally:
        # Images are embedded in the report as base64, so temporary
        # directories are never needed after this point.
        for d in tmp_dirs:
            shutil.rmtree(d, ignore_errors=True)
|
|
1157
|
+
|
|
1158
|
+
|
|
1159
|
+
# Run the command-line interface only when this module is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
|