cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,931 @@
|
|
|
1
|
+
"""Audit scheme_reader Mode A (deterministic) quality on showcase CDXMLs.
|
|
2
|
+
|
|
3
|
+
Runs ``read_scheme()`` against every showcase file and checks:
|
|
4
|
+
- Parse success (steps, species, narrative present)
|
|
5
|
+
- Step completeness (each step has reactants AND products)
|
|
6
|
+
- Species coverage (fraction of species used in steps)
|
|
7
|
+
- Topology correctness (matches filename convention)
|
|
8
|
+
- Conditions extraction (non-empty conditions per step)
|
|
9
|
+
- Arrow style accuracy (dashed/failed detected)
|
|
10
|
+
- Narrative quality (no leftover ``[SMILES: ...]`` fragments)
|
|
11
|
+
|
|
12
|
+
Usage::
|
|
13
|
+
|
|
14
|
+
python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_audit [showcase_dir]
|
|
15
|
+
python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_audit --html report.html --render
|
|
16
|
+
python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_audit --json -o audit_results.json
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import base64
|
|
21
|
+
import json
|
|
22
|
+
import os
|
|
23
|
+
import re
|
|
24
|
+
import subprocess
|
|
25
|
+
import sys
|
|
26
|
+
import tempfile
|
|
27
|
+
import time
|
|
28
|
+
import traceback
|
|
29
|
+
from dataclasses import dataclass, field, asdict
|
|
30
|
+
from html import escape as html_escape
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
from typing import Dict, List, Optional, Tuple
|
|
33
|
+
|
|
34
|
+
from ..perception.scheme_reader import read_scheme, SchemeDescription
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
# Expected topology from filename conventions
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
# Ordered (pattern, topology) rules: the first pattern that matches a file's
# base name decides the expected topology, so order matters.
_TOPOLOGY_RULES: List[Tuple[re.Pattern, str]] = [
    (re.compile(source, re.IGNORECASE), topology)
    for source, topology in (
        (r"divergent|_sar", "divergent"),
        (r"stacked|parallel|comparison|different_routes", "parallel"),
        # Serpentine/wrap describe the page LAYOUT only; the underlying
        # topology of such schemes is still linear.
        (
            r"serpentine|wrap|linear|sequential|letter|compact|"
            r"name_resolution|reductive|mitsunobu|grignard|"
            r"two_step|three_step|run_arrows|failed|above_structures",
            "linear",
        ),
    )
]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _expected_topology(filename: str) -> Optional[str]:
    """Return the topology implied by the file's name, or None if no rule matches.

    Only the base name without its extension is inspected.  The rules in
    ``_TOPOLOGY_RULES`` are tried in order and the first hit wins.
    """
    stem, _extension = os.path.splitext(os.path.basename(filename))
    return next(
        (topology for regex, topology in _TOPOLOGY_RULES if regex.search(stem)),
        None,
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _expected_step_count(filename: str) -> Optional[int]:
    """Guess how many steps a showcase file should contain from its name.

    Checks, in order: a numeric ``<N>step`` hint, a spelled-out
    ``<word>_step`` hint, a numeric ``<N>product`` hint, then known
    single-step reaction names.  Divergent/SAR names without a numeric hint
    yield None, as does any name that matches nothing.
    """
    stem = os.path.splitext(os.path.basename(filename))[0]
    lowered = stem.lower()

    # Numeric hints such as "4step" or "7 Steps".
    numeric_hint = re.search(r"(\d+)\s*step", stem, re.I)
    if numeric_hint is not None:
        return int(numeric_hint.group(1))

    # Spelled-out hints such as "two_step" or "three_step".
    spelled_counts = (
        ("two", 2), ("three", 3), ("four", 4), ("five", 5),
        ("six", 6), ("seven", 7), ("eight", 8),
    )
    for word, count in spelled_counts:
        if f"{word}_step" in lowered:
            return count

    # Divergent schemes may advertise their size as e.g. "4products".
    product_hint = re.search(r"(\d+)\s*product", stem, re.I)
    if product_hint is not None:
        return int(product_hint.group(1))

    # Divergent/SAR schemes have one step per product; without a number in
    # the name the count is unknowable, so skip the single-step heuristic.
    if re.search(r"divergent|_sar", stem, re.I):
        return None

    # Named reactions that the showcase treats as single-step schemes.
    is_single_step = re.search(
        r"buchwald|suzuki|snar|amide_coupling|boc_deprotection|"
        r"reductive_amination|mitsunobu|grignard|name_resolution|"
        r"failed_arrow",
        stem,
        re.I,
    )
    return 1 if is_single_step else None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# Per-file audit result
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
@dataclass
class FileAuditResult:
    """Outcome of auditing a single CDXML file."""
    filename: str
    cdxml_path: str = ""
    status: str = "PASS"  # one of "PASS", "WARN", "FAIL", "ERROR"
    num_steps: int = 0
    num_species: int = 0
    topology: str = ""
    content_type: str = ""
    expected_topology: Optional[str] = None
    expected_steps: Optional[int] = None
    topology_match: bool = True
    step_count_match: bool = True
    all_steps_complete: bool = True  # each step has >=1 reactant AND product
    species_coverage: float = 1.0  # fraction of species referenced by steps
    orphan_species: List[str] = field(default_factory=list)
    conditions_coverage: float = 1.0  # fraction of steps carrying conditions
    steps_missing_conditions: List[int] = field(default_factory=list)
    arrow_styles: List[str] = field(default_factory=list)
    smiles_in_narrative: int = 0  # number of "[SMILES:" markers in narrative
    warnings: List[str] = field(default_factory=list)
    parse_time_ms: float = 0.0
    error: Optional[str] = None
    # Rich payloads kept for the HTML report; excluded from JSON output.
    _desc: Optional[SchemeDescription] = field(default=None, repr=False)
    _image_b64: str = field(default="", repr=False)

    @property
    def detail_line(self) -> str:
        """Compact comma-separated summary used by the terminal report."""
        plural = "" if self.num_steps == 1 else "s"

        # The topology entry is annotated only when the filename implied one.
        topology_text = f"{self.topology}"
        if self.expected_topology:
            if self.topology_match:
                topology_text += " OK"
            else:
                topology_text += f" MISMATCH (expected {self.expected_topology})"

        summary = [
            f"{self.num_steps} step{plural}",
            f"{self.num_species} species",
            topology_text,
        ]

        # Remaining entries appear only when the corresponding check deviates.
        step_count_off = (self.expected_steps is not None
                          and not self.step_count_match)
        if step_count_off:
            summary.append(f"steps: {self.num_steps}/{self.expected_steps}")
        if not self.all_steps_complete:
            summary.append("incomplete steps")
        if self.species_coverage < 1.0:
            summary.append(f"coverage {self.species_coverage:.0%}")
        if self.smiles_in_narrative > 0:
            summary.append(f"{self.smiles_in_narrative} SMILES in narrative")
        return ", ".join(summary)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
# Aggregate report
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
@dataclass
class AuditReport:
    """Aggregate quality report across all audited files."""
    showcase_dir: str = ""
    total_files: int = 0
    pass_count: int = 0
    warn_count: int = 0
    fail_count: int = 0
    error_count: int = 0
    results: List["FileAuditResult"] = field(default_factory=list)
    total_time_ms: float = 0.0

    def to_dict(self) -> dict:
        """Return a JSON-friendly dict of the report.

        Per-file entries contain only the public dataclass fields of each
        result; fields whose name starts with ``_`` are left out.

        Private fields are skipped *before* copying.  Running ``asdict`` on
        the whole result would first recursively convert / deep-copy the
        private payloads only to throw them away, and would raise if such a
        payload cannot be deep-copied.
        """
        import copy
        from dataclasses import fields

        d = {
            "showcase_dir": self.showcase_dir,
            "total_files": self.total_files,
            "pass": self.pass_count,
            "warn": self.warn_count,
            "fail": self.fail_count,
            "error": self.error_count,
            "total_time_ms": round(self.total_time_ms, 1),
            "results": [],
        }
        for r in self.results:
            # deepcopy keeps the exported lists independent of the live
            # result objects, as asdict() did for the public fields.
            d["results"].append({
                f.name: copy.deepcopy(getattr(r, f.name))
                for f in fields(r)
                if not f.name.startswith("_")
            })
        return d
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# ---------------------------------------------------------------------------
|
|
178
|
+
# Image rendering helpers
|
|
179
|
+
# ---------------------------------------------------------------------------
|
|
180
|
+
|
|
181
|
+
def _render_cdxml_to_png(cdxml_path: str, output_path: str) -> bool:
    """Render a CDXML file to PNG.  Returns True on success, False otherwise.

    The in-process ``cdxml_to_png`` helper is tried first.  If importing or
    calling it raises, the same module is run as a script in a child
    interpreter (30 s timeout); that attempt counts as a success only when
    the child exits with status 0 and ``output_path`` exists afterwards.

    This is a best-effort helper: every failure is reported as False rather
    than raised, so a missing renderer never aborts the audit.
    """
    try:
        from ..chemdraw.cdxml_to_image import cdxml_to_png
        cdxml_to_png(cdxml_path, output_path)
        return True
    except Exception:
        # Deliberately broad: fall through to the subprocess attempt.
        pass

    try:
        completed = subprocess.run(
            # The renderer lives in the ``chemdraw`` subpackage (same module
            # as the import above); the former top-level module path does
            # not exist, so the fallback could never succeed.
            [sys.executable, "-m", "cdxml_toolkit.chemdraw.cdxml_to_image",
             cdxml_path, "-o", output_path],
            capture_output=True, timeout=30,
        )
    except Exception:
        return False
    return completed.returncode == 0 and os.path.exists(output_path)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _embed_image_b64(img_path: str) -> str:
    """Return the image at ``img_path`` as a base64 ``data:`` URI.

    A missing file yields the empty string.  The MIME type is derived from
    the file extension (case-insensitive): ``jpg``/``jpeg`` map to
    ``image/jpeg`` and everything else is labelled ``image/png``.
    """
    if not os.path.exists(img_path):
        return ""

    with open(img_path, "rb") as handle:
        raw_bytes = handle.read()

    suffix = os.path.splitext(img_path)[1].lower().lstrip(".")
    if suffix in ("jpg", "jpeg"):
        mime = "image/jpeg"
    else:
        mime = "image/png"

    payload = base64.b64encode(raw_bytes).decode("ascii")
    return f"data:{mime};base64,{payload}"
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
# Core audit logic
|
|
214
|
+
# ---------------------------------------------------------------------------
|
|
215
|
+
|
|
216
|
+
def _audit_one(cdxml_path: str,
               use_chemscript: bool = False,
               verbose: bool = False,
               render: bool = False,
               img_dir: Optional[str] = None) -> FileAuditResult:
    """Run the quality audit on a single CDXML file.

    Parameters
    ----------
    cdxml_path : str
        Path of the CDXML file to audit.
    use_chemscript : bool
        Forwarded to ``read_scheme``.
    verbose : bool
        Forwarded to ``read_scheme``.
    render : bool
        When true (and ``img_dir`` is set) the file is also rendered to PNG
        and the image is embedded in the result as a data URI.
    img_dir : Optional[str]
        Directory that receives the rendered PNG.

    Returns
    -------
    FileAuditResult
        Status is "ERROR" if ``read_scheme`` raised, "FAIL" if no steps or
        no species were parsed or a hard check failed, "WARN" for soft
        deviations and "PASS" otherwise.
    """
    filename = os.path.basename(cdxml_path)
    result = FileAuditResult(filename=filename, cdxml_path=cdxml_path)
    result.expected_topology = _expected_topology(filename)
    result.expected_steps = _expected_step_count(filename)

    t0 = time.perf_counter()
    try:
        desc = read_scheme(cdxml_path,
                           use_network=False,
                           use_chemscript=use_chemscript,
                           verbose=verbose)
    except Exception as exc:
        # Any parser failure is recorded on the result instead of aborting
        # the whole audit run.
        result.status = "ERROR"
        result.error = f"{type(exc).__name__}: {exc}"
        result.parse_time_ms = (time.perf_counter() - t0) * 1000
        return result
    result.parse_time_ms = (time.perf_counter() - t0) * 1000
    result._desc = desc

    # Render image (done before the early FAIL returns below so that failed
    # files still get a picture in the HTML report).
    if render and img_dir:
        png_path = os.path.join(img_dir, Path(cdxml_path).stem + ".png")
        if _render_cdxml_to_png(cdxml_path, png_path):
            result._image_b64 = _embed_image_b64(png_path)

    # Basic parse success
    result.num_steps = desc.num_steps
    result.num_species = len(desc.species)
    result.topology = desc.topology
    result.content_type = desc.content_type or "unknown"

    if desc.num_steps < 1:
        result.status = "FAIL"
        result.warnings.append("No steps parsed")
        return result
    if not desc.species:
        result.status = "FAIL"
        result.warnings.append("No species found")
        return result
    if not desc.narrative:
        result.warnings.append("Empty narrative")

    # Topology correctness (only checked when the filename implies one)
    if result.expected_topology:
        result.topology_match = (desc.topology == result.expected_topology)
        if not result.topology_match:
            result.warnings.append(
                f"Topology mismatch: got '{desc.topology}', "
                f"expected '{result.expected_topology}'"
            )

    # Step count correctness (only checked when the filename implies one)
    if result.expected_steps is not None:
        result.step_count_match = (desc.num_steps == result.expected_steps)
        if not result.step_count_match:
            result.warnings.append(
                f"Step count mismatch: got {desc.num_steps}, "
                f"expected {result.expected_steps}"
            )

    # Step completeness: every step needs reactants and products
    for step in desc.steps:
        if not step.reactant_ids or not step.product_ids:
            result.all_steps_complete = False
            missing = "reactants" if not step.reactant_ids else "products"
            result.warnings.append(f"Step {step.step_index}: missing {missing}")

    # Species coverage: fraction of fragment species referenced by any step
    referenced_ids = set()
    for step in desc.steps:
        referenced_ids.update(step.reactant_ids)
        referenced_ids.update(step.product_ids)
        referenced_ids.update(step.reagent_ids)

    fragment_ids = [sid for sid, sp in desc.species.items()
                    if sp.element_type == "fragment"]
    if fragment_ids:
        result.orphan_species = [
            sid for sid in fragment_ids if sid not in referenced_ids
        ]
        covered = len(fragment_ids) - len(result.orphan_species)
        result.species_coverage = covered / len(fragment_ids)
        if result.species_coverage < 0.8:
            result.warnings.append(
                f"Low species coverage: {result.species_coverage:.0%} "
                f"({len(result.orphan_species)} orphans)"
            )

    # Conditions coverage: a step counts if it has parsed or raw conditions
    steps_with_conds = 0
    for step in desc.steps:
        if step.conditions or step.condition_text_raw:
            steps_with_conds += 1
        else:
            result.steps_missing_conditions.append(step.step_index)
    if desc.num_steps > 0:
        result.conditions_coverage = steps_with_conds / desc.num_steps

    # Arrow styles
    result.arrow_styles = [s.arrow_style for s in desc.steps]

    # Narrative quality.  A falsy narrative only produced a warning above,
    # so it may still be None here; fall back to "" so the regex scan cannot
    # raise TypeError.
    narrative = desc.narrative or ""
    result.smiles_in_narrative = len(re.findall(r"\[SMILES:", narrative))
    if result.smiles_in_narrative > 0:
        result.warnings.append(
            f"{result.smiles_in_narrative} raw SMILES in narrative"
        )

    # Scheme warnings are passed through for display; they do not affect
    # the status.
    if desc.warnings:
        for w in desc.warnings:
            result.warnings.append(f"scheme warning: {w}")

    # Determine overall status
    has_fail = False
    has_warn = False

    if not result.topology_match:
        has_fail = True
    if not result.step_count_match:
        # Finding at most half of the expected steps is a hard failure;
        # any other count mismatch is only a warning.
        if result.expected_steps and result.num_steps <= result.expected_steps // 2:
            has_fail = True
        else:
            has_warn = True
    if not result.all_steps_complete:
        has_warn = True
    if result.species_coverage < 0.5:
        has_fail = True
    elif result.species_coverage < 0.8:
        has_warn = True
    if result.smiles_in_narrative > 0:
        has_warn = True

    if has_fail:
        result.status = "FAIL"
    elif has_warn:
        result.status = "WARN"
    else:
        result.status = "PASS"

    return result
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def audit_showcase(showcase_dir: str,
                   use_chemscript: bool = False,
                   verbose: bool = False,
                   render: bool = False) -> AuditReport:
    """Audit every ``*.cdxml`` file found directly inside a directory.

    Parameters
    ----------
    showcase_dir : str
        Directory containing the ``*.cdxml`` showcase files.  Files are
        processed in sorted name order.
    use_chemscript : bool
        Passed through to the per-file audit.
    verbose : bool
        Passed through to the per-file audit.
    render : bool
        Also render each file to PNG in a fresh temporary directory and
        report progress on stderr.  (The original documentation states the
        rendering goes through ChemDraw COM and requires ChemDraw to be
        closed.)

    Returns
    -------
    AuditReport
        Per-file results plus PASS/WARN/FAIL/ERROR tallies and total time.
    """
    report = AuditReport(showcase_dir=showcase_dir)

    names = [entry for entry in os.listdir(showcase_dir)
             if entry.endswith(".cdxml")]
    names.sort()
    total = len(names)
    report.total_files = total

    # A scratch directory for PNGs is only needed when rendering.
    img_dir = None
    if render:
        img_dir = tempfile.mkdtemp(prefix="audit_imgs_")
        print(f" Rendering to {img_dir}", file=sys.stderr)

    # Any status other than PASS/WARN/FAIL is tallied as an error.
    counter_for = {"PASS": "pass_count", "WARN": "warn_count",
                   "FAIL": "fail_count"}

    started = time.perf_counter()
    for position, fname in enumerate(names, start=1):
        if render:
            print(f" [{position}/{total}] {fname}", file=sys.stderr)
        outcome = _audit_one(os.path.join(showcase_dir, fname),
                             use_chemscript=use_chemscript,
                             verbose=verbose, render=render, img_dir=img_dir)
        report.results.append(outcome)

        attr = counter_for.get(outcome.status, "error_count")
        setattr(report, attr, getattr(report, attr) + 1)

    report.total_time_ms = (time.perf_counter() - started) * 1000
    return report
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
# ---------------------------------------------------------------------------
|
|
425
|
+
# Terminal output
|
|
426
|
+
# ---------------------------------------------------------------------------
|
|
427
|
+
|
|
428
|
+
# ANSI escape that restores the terminal's default text attributes.
_RESET = "\033[0m"

# ANSI colour escape used for each audit status in terminal output.
_STATUS_COLORS = dict(
    PASS="\033[92m",   # green
    WARN="\033[93m",   # yellow
    FAIL="\033[91m",   # red
    ERROR="\033[91m",  # red
)
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def _print_report(report: AuditReport, color: bool = True) -> None:
    """Write the human-readable audit report to stdout.

    One line is printed per file (status tag, padded filename, detail or
    error text), followed by its warnings, then a summary of the non-zero
    status tallies and the total parse time.  With ``color`` enabled the
    status tag and warnings are wrapped in ANSI colour codes.
    """
    banner = "=" * 70
    print()
    print(banner)
    print(" Scheme Reader Audit: Mode A (Deterministic)")
    print(f" {report.total_files} showcase files evaluated")
    print(banner)
    print()

    # Pad every filename to the longest one so the detail column lines up.
    name_width = max((len(res.filename) for res in report.results), default=30)
    warn_color = _STATUS_COLORS.get('WARN', '')

    for res in report.results:
        tag = f"{res.status:5s}"
        if color:
            tag = f"{_STATUS_COLORS.get(res.status, '')}{tag}{_RESET}"

        padded_name = res.filename.ljust(name_width)
        detail = f"ERROR: {res.error}" if res.error else res.detail_line
        print(f" {tag} {padded_name} {detail}")

        # Warnings follow the file line, one per row.
        for note in res.warnings:
            if color:
                print(f" {warn_color}-> {note}{_RESET}")
            else:
                print(f" -> {note}")

    print()
    print("-" * 70)
    tallies = (
        (report.pass_count, "PASS"),
        (report.warn_count, "WARN"),
        (report.fail_count, "FAIL"),
        (report.error_count, "ERROR"),
    )
    summary_parts = [f"{count} {label}" for count, label in tallies if count]
    print(f" Summary: {', '.join(summary_parts)}")
    print(f" Total parse time: {report.total_time_ms:.0f} ms")
    print()
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
# ---------------------------------------------------------------------------
|
|
486
|
+
# HTML helpers
|
|
487
|
+
# ---------------------------------------------------------------------------
|
|
488
|
+
|
|
489
|
+
# Per-status colours for the HTML report.  FAIL and ERROR share a palette.
_STATUS_BG = {          # badge background
    "PASS": "#d4edda",
    "WARN": "#fff3cd",
    "FAIL": "#f8d7da",
    "ERROR": "#f8d7da",
}
_STATUS_FG = {          # badge text
    "PASS": "#155724",
    "WARN": "#856404",
    "FAIL": "#721c24",
    "ERROR": "#721c24",
}
_STATUS_BORDER = {      # card left border
    "PASS": "#c3e6cb",
    "WARN": "#ffeeba",
    "FAIL": "#f5c6cb",
    "ERROR": "#f5c6cb",
}
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def _species_table_html(desc: "SchemeDescription") -> str:
    """Build an HTML table for the species registry of ``desc``.

    Returns a short "No species" paragraph when the registry is empty.
    The Name column shows the first non-empty value among the IUPAC name,
    the name and the formula.  Every value taken from the description is
    HTML-escaped before it is interpolated.
    """
    if not desc.species:
        return '<p style="color:#6c757d;font-size:0.85rem">No species</p>'
    rows = []
    for sid, sp in desc.species.items():
        label = html_escape(sp.label or "")
        name = html_escape(sp.name or "")
        iupac = html_escape(getattr(sp, "iupac_name", "") or "")
        formula = html_escape(sp.formula or "")
        mw_str = f"{sp.mw:.1f}" if sp.mw else ""
        # Escaped like every other cell.
        etype = html_escape(sp.element_type or "")
        tcat = html_escape(sp.text_category or "")
        # Choose display name
        display = iupac or name or formula or ""

        raw_smiles = sp.smiles or ""
        smiles = html_escape(raw_smiles)
        # Truncate the RAW string and escape afterwards: slicing escaped
        # text could cut an entity such as "&amp;" in half.
        smiles_short = (html_escape(raw_smiles[:60])
                        + ("..." if len(raw_smiles) > 60 else ""))

        rows.append(f"""<tr>
  <td class="mono">{html_escape(sid)}</td>
  <td>{label}</td>
  <td>{display}</td>
  <td class="mono" title="{smiles}">{smiles_short}</td>
  <td>{formula}</td>
  <td style="text-align:right">{mw_str}</td>
  <td>{etype}{(' / ' + tcat) if tcat else ''}</td>
</tr>""")
    return f"""<table class="inner-table">
<tr><th>ID</th><th>Label</th><th>Name</th><th>SMILES</th>
<th>Formula</th><th>MW</th><th>Type</th></tr>
{''.join(rows)}
</table>"""
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def _steps_table_html(desc: SchemeDescription) -> str:
    """Render the reaction steps of ``desc`` as an HTML table.

    Returns a short "No steps" paragraph when there are none.  Each row
    shows the 1-based step number, reactant/product/reagent ids, an arrow
    glyph chosen from the arrow style (unknown styles use the solid arrow),
    at most the first three conditions, and the yield text.
    """
    if not desc.steps:
        return '<p style="color:#6c757d;font-size:0.85rem">No steps</p>'

    arrow_glyphs = {"solid": "→", "dashed": "⇝", "failed": "✗→"}

    def _row(step) -> str:
        """Render a single step as one table row."""
        reactants = html_escape(", ".join(step.reactant_ids))
        products = html_escape(", ".join(step.product_ids))
        reagents = html_escape(", ".join(step.reagent_ids))
        conditions = html_escape("; ".join(step.conditions[:3]))
        yield_text = html_escape(step.yield_text or "")
        glyph = arrow_glyphs.get(step.arrow_style, "→")
        return f"""<tr>
  <td style="text-align:center">{step.step_index + 1}</td>
  <td class="mono">{reactants}</td>
  <td style="text-align:center;font-size:1.1rem">{glyph}</td>
  <td class="mono">{products}</td>
  <td class="mono">{reagents}</td>
  <td>{conditions}</td>
  <td>{yield_text}</td>
</tr>"""

    table_rows = "".join(_row(step) for step in desc.steps)
    return f"""<table class="inner-table">
<tr><th>#</th><th>Reactants</th><th></th><th>Products</th>
<th>Reagents</th><th>Conditions</th><th>Yield</th></tr>
{table_rows}
</table>"""
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def _card_html(idx: int, r: FileAuditResult) -> str:
    """Generate one expandable HTML card for an audited scheme file.

    The header carries status, topology, content-type, step-count and
    parse-time badges.  The body shows the rendered image (or a
    placeholder) next to either the error text or the quality checklist,
    warnings, narrative, species table and steps table.

    ``idx`` is accepted for the caller's convenience but is not used.
    Status, topology and content type are HTML-escaped like the filename,
    error text and warnings.
    """
    bg = _STATUS_BG.get(r.status, "#fff")
    fg = _STATUS_FG.get(r.status, "#000")
    border = _STATUS_BORDER.get(r.status, "#dee2e6")

    # Escape once; the f-string conversion keeps the text that would have
    # been rendered for non-str values.
    status_text = html_escape(f"{r.status}")
    topology_text = html_escape(f"{r.topology}")

    # Status badge
    status_badge = (f'<span class="badge" style="background:{bg};color:{fg}">'
                    f'{status_text}</span>')

    # Topology badge
    if r.expected_topology and not r.topology_match:
        expected_text = html_escape(f"{r.expected_topology}")
        topo_badge = (f'<span class="badge badge-fail">{topology_text}'
                      f' (expected {expected_text})</span>')
    else:
        topo_badge = f'<span class="badge badge-info">{topology_text}</span>'

    ctype_badge = (f'<span class="badge badge-muted">'
                   f'{html_escape(f"{r.content_type}")}</span>'
                   if r.content_type else "")
    steps_badge = (f'<span class="badge badge-info">'
                   f'{r.num_steps} step{"s" if r.num_steps != 1 else ""}</span>')

    # Image section
    if r._image_b64:
        img_html = f'<img src="{r._image_b64}" alt="Rendered scheme">'
    else:
        img_html = ('<div class="no-img">No rendered image<br>'
                    '<small>(use --render)</small></div>')

    # Body content
    if r.error:
        body_html = (f'<div class="narrative" style="color:#721c24">'
                     f'{html_escape(r.error)}</div>')
    else:
        desc = r._desc
        # The audit tolerates a falsy narrative, so guard against None here.
        narrative = html_escape(desc.narrative or "") if desc else ""

        # Quality checklist (_check_item escapes label and detail itself)
        checks = []
        checks.append(_check_item("Steps parsed", r.num_steps >= 1,
                                  f"{r.num_steps} steps"))
        checks.append(_check_item("Species found", r.num_species >= 1,
                                  f"{r.num_species} species"))
        if r.expected_topology:
            checks.append(_check_item("Topology correct", r.topology_match,
                                      f"{r.topology}"
                                      + (f" (expected {r.expected_topology})"
                                         if not r.topology_match else "")))
        if r.expected_steps is not None:
            checks.append(_check_item("Step count correct", r.step_count_match,
                                      f"{r.num_steps}"
                                      + (f"/{r.expected_steps}"
                                         if not r.step_count_match else "")))
        checks.append(_check_item("All steps complete", r.all_steps_complete))
        checks.append(_check_item("Species coverage",
                                  r.species_coverage >= 0.8,
                                  f"{r.species_coverage:.0%}"))
        checks.append(_check_item("Conditions extracted",
                                  r.conditions_coverage >= 0.5,
                                  f"{r.conditions_coverage:.0%}"))
        checks.append(_check_item("No raw SMILES in narrative",
                                  r.smiles_in_narrative == 0,
                                  f"{r.smiles_in_narrative} found"
                                  if r.smiles_in_narrative else ""))

        checklist_html = '<div class="checklist">' + ''.join(checks) + '</div>'

        # Warnings
        warn_html = ""
        if r.warnings:
            warn_items = "".join(
                f'<div class="warn-item">{html_escape(w)}</div>'
                for w in r.warnings
            )
            warn_html = f'<div class="warn-box">{warn_items}</div>'

        # Narrative
        nar_html = ""
        if narrative:
            nar_html = (f'<div class="section-title">Narrative</div>'
                        f'<div class="narrative">{narrative}</div>')

        # Species table
        sp_html = ""
        if desc and desc.species:
            sp_html = (f'<div class="section-title">'
                       f'Species Registry ({len(desc.species)})</div>'
                       + _species_table_html(desc))

        # Steps table
        st_html = ""
        if desc and desc.steps:
            st_html = (f'<div class="section-title">'
                       f'Reaction Steps ({len(desc.steps)})</div>'
                       + _steps_table_html(desc))

        body_html = f"""
{checklist_html}
{warn_html}
{nar_html}
{sp_html}
{st_html}
"""

    # Parse time
    time_str = f"{r.parse_time_ms:.0f} ms"

    return f"""
<div class="card" style="border-left:4px solid {border}">
  <div class="card-header" onclick="this.parentElement.classList.toggle('open')">
    <span class="chevron">▶</span>
    {status_badge}
    <span class="card-title">{html_escape(r.filename)}</span>
    {topo_badge} {ctype_badge} {steps_badge}
    <span class="badge badge-muted">{time_str}</span>
  </div>
  <div class="card-body">
    <div class="two-col">
      <div class="img-box">{img_html}</div>
      <div class="detail-box">{body_html}</div>
    </div>
  </div>
</div>
"""
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def _check_item(label: str, ok: bool, detail: str = "") -> str:
|
|
692
|
+
"""Render one quality check item."""
|
|
693
|
+
icon = "✓" if ok else "✗"
|
|
694
|
+
color = "#155724" if ok else "#dc3545"
|
|
695
|
+
detail_span = f' <span class="check-detail">{html_escape(detail)}</span>' if detail else ""
|
|
696
|
+
return (f'<div class="check-item">'
|
|
697
|
+
f'<span style="color:{color};font-weight:700">{icon}</span> '
|
|
698
|
+
f'{html_escape(label)}{detail_span}</div>')
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
# ---------------------------------------------------------------------------
|
|
702
|
+
# HTML report
|
|
703
|
+
# ---------------------------------------------------------------------------
|
|
704
|
+
|
|
705
|
+
def _html_report(report: AuditReport) -> str:
|
|
706
|
+
"""Generate a self-contained HTML audit report with scheme cards."""
|
|
707
|
+
pass_pct = (report.pass_count / report.total_files * 100
|
|
708
|
+
if report.total_files else 0)
|
|
709
|
+
warn_pct = (report.warn_count / report.total_files * 100
|
|
710
|
+
if report.total_files else 0)
|
|
711
|
+
fail_pct = ((report.fail_count + report.error_count)
|
|
712
|
+
/ report.total_files * 100
|
|
713
|
+
if report.total_files else 0)
|
|
714
|
+
|
|
715
|
+
cards_html = "\n".join(
|
|
716
|
+
_card_html(i, r) for i, r in enumerate(report.results)
|
|
717
|
+
)
|
|
718
|
+
|
|
719
|
+
return f"""<!DOCTYPE html>
|
|
720
|
+
<html lang="en">
|
|
721
|
+
<head>
|
|
722
|
+
<meta charset="utf-8">
|
|
723
|
+
<title>Mode A Audit Report</title>
|
|
724
|
+
<style>
|
|
725
|
+
* {{ box-sizing: border-box; margin: 0; padding: 0; }}
|
|
726
|
+
body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
|
|
727
|
+
Helvetica, Arial, sans-serif; background: #f8f9fa; color: #212529;
|
|
728
|
+
padding: 24px; max-width: 1440px; margin: 0 auto; }}
|
|
729
|
+
h1 {{ font-size: 1.5rem; margin-bottom: 4px; }}
|
|
730
|
+
.subtitle {{ color: #6c757d; font-size: 0.9rem; margin-bottom: 20px; }}
|
|
731
|
+
|
|
732
|
+
/* Summary */
|
|
733
|
+
.summary-bar {{ display: flex; gap: 16px; margin-bottom: 20px; flex-wrap: wrap; }}
|
|
734
|
+
.summary-card {{ background: #fff; border-radius: 8px; padding: 14px 20px;
|
|
735
|
+
box-shadow: 0 1px 3px rgba(0,0,0,0.08);
|
|
736
|
+
min-width: 110px; text-align: center; }}
|
|
737
|
+
.summary-card .num {{ font-size: 2rem; font-weight: 700; }}
|
|
738
|
+
.summary-card .label {{ font-size: 0.75rem; color: #6c757d;
|
|
739
|
+
text-transform: uppercase; letter-spacing: 0.5px; }}
|
|
740
|
+
.progress-bar {{ height: 10px; border-radius: 5px; overflow: hidden;
|
|
741
|
+
display: flex; margin-bottom: 24px; background: #e9ecef; }}
|
|
742
|
+
.progress-bar .seg {{ height: 100%; }}
|
|
743
|
+
|
|
744
|
+
/* Badges */
|
|
745
|
+
.badge {{ display: inline-block; padding: 2px 8px; border-radius: 4px;
|
|
746
|
+
font-size: 0.78rem; font-weight: 600; margin: 0 2px;
|
|
747
|
+
vertical-align: middle; }}
|
|
748
|
+
.badge-info {{ background: #d1ecf1; color: #0c5460; }}
|
|
749
|
+
.badge-muted {{ background: #e9ecef; color: #6c757d; }}
|
|
750
|
+
.badge-fail {{ background: #f8d7da; color: #721c24; }}
|
|
751
|
+
|
|
752
|
+
/* Cards */
|
|
753
|
+
.card {{ background: #fff; border-radius: 8px; margin-bottom: 10px;
|
|
754
|
+
box-shadow: 0 1px 3px rgba(0,0,0,0.06); overflow: hidden; }}
|
|
755
|
+
.card-header {{ padding: 10px 16px; cursor: pointer; display: flex;
|
|
756
|
+
align-items: center; gap: 8px; user-select: none; }}
|
|
757
|
+
.card-header:hover {{ background: #f1f3f5; }}
|
|
758
|
+
.card-title {{ font-weight: 600; font-size: 0.92rem; font-family: monospace; }}
|
|
759
|
+
.chevron {{ font-size: 0.7rem; color: #6c757d; transition: transform 0.15s;
|
|
760
|
+
display: inline-block; width: 14px; }}
|
|
761
|
+
.card.open .chevron {{ transform: rotate(90deg); }}
|
|
762
|
+
.card-body {{ display: none; padding: 0 16px 16px 16px; }}
|
|
763
|
+
.card.open .card-body {{ display: block; }}
|
|
764
|
+
|
|
765
|
+
/* Two-column layout */
|
|
766
|
+
.two-col {{ display: grid; grid-template-columns: minmax(250px,420px) 1fr;
|
|
767
|
+
gap: 16px; margin-top: 8px; }}
|
|
768
|
+
@media (max-width: 900px) {{ .two-col {{ grid-template-columns: 1fr; }} }}
|
|
769
|
+
.img-box {{ text-align: center; }}
|
|
770
|
+
.img-box img {{ max-width: 100%; border: 1px solid #dee2e6; border-radius: 4px; }}
|
|
771
|
+
.no-img {{ background: #f8f9fa; border: 1px dashed #dee2e6; border-radius: 4px;
|
|
772
|
+
padding: 40px 20px; color: #adb5bd; text-align: center;
|
|
773
|
+
font-size: 0.85rem; }}
|
|
774
|
+
|
|
775
|
+
/* Quality checklist */
|
|
776
|
+
.checklist {{ display: flex; flex-wrap: wrap; gap: 2px 16px;
|
|
777
|
+
margin-bottom: 10px; }}
|
|
778
|
+
.check-item {{ font-size: 0.84rem; white-space: nowrap; }}
|
|
779
|
+
.check-detail {{ color: #6c757d; }}
|
|
780
|
+
|
|
781
|
+
/* Warnings */
|
|
782
|
+
.warn-box {{ background: #fff3cd; border-radius: 4px; padding: 6px 10px;
|
|
783
|
+
margin-bottom: 10px; }}
|
|
784
|
+
.warn-item {{ font-size: 0.82rem; color: #856404; padding: 1px 0; }}
|
|
785
|
+
.warn-item::before {{ content: "\\26A0 "; }}
|
|
786
|
+
|
|
787
|
+
/* Sections */
|
|
788
|
+
.section-title {{ font-size: 0.82rem; font-weight: 700; color: #495057;
|
|
789
|
+
text-transform: uppercase; letter-spacing: 0.4px;
|
|
790
|
+
margin: 12px 0 4px 0; }}
|
|
791
|
+
.narrative {{ background: #f8f9fa; border-radius: 4px; padding: 10px;
|
|
792
|
+
font-size: 0.85rem; line-height: 1.5; white-space: pre-wrap;
|
|
793
|
+
max-height: 300px; overflow-y: auto; margin-bottom: 8px; }}
|
|
794
|
+
|
|
795
|
+
/* Inner tables */
|
|
796
|
+
.inner-table {{ width: 100%; border-collapse: collapse; font-size: 0.82rem; }}
|
|
797
|
+
.inner-table th {{ background: #495057; color: #fff; padding: 5px 8px;
|
|
798
|
+
font-size: 0.74rem; text-transform: uppercase;
|
|
799
|
+
letter-spacing: 0.3px; text-align: left; }}
|
|
800
|
+
.inner-table td {{ padding: 4px 8px; border-bottom: 1px solid #e9ecef;
|
|
801
|
+
vertical-align: top; }}
|
|
802
|
+
.inner-table tr:hover td {{ background: #f8f9fa; }}
|
|
803
|
+
.mono {{ font-family: "SFMono-Regular", Consolas, monospace; font-size: 0.8rem; }}
|
|
804
|
+
|
|
805
|
+
.footer {{ margin-top: 20px; font-size: 0.8rem; color: #6c757d; }}
|
|
806
|
+
</style>
|
|
807
|
+
</head>
|
|
808
|
+
<body>
|
|
809
|
+
<h1>Scheme Reader Audit: Mode A (Deterministic)</h1>
|
|
810
|
+
<p class="subtitle">{report.total_files} showcase files ·
|
|
811
|
+
{report.total_time_ms:.0f} ms total parse time ·
|
|
812
|
+
{os.path.basename(report.showcase_dir)}/</p>
|
|
813
|
+
|
|
814
|
+
<div class="summary-bar">
|
|
815
|
+
<div class="summary-card">
|
|
816
|
+
<div class="num" style="color:#155724">{report.pass_count}</div>
|
|
817
|
+
<div class="label">Pass</div>
|
|
818
|
+
</div>
|
|
819
|
+
<div class="summary-card">
|
|
820
|
+
<div class="num" style="color:#856404">{report.warn_count}</div>
|
|
821
|
+
<div class="label">Warn</div>
|
|
822
|
+
</div>
|
|
823
|
+
<div class="summary-card">
|
|
824
|
+
<div class="num" style="color:#721c24">{report.fail_count}</div>
|
|
825
|
+
<div class="label">Fail</div>
|
|
826
|
+
</div>
|
|
827
|
+
<div class="summary-card">
|
|
828
|
+
<div class="num" style="color:#721c24">{report.error_count}</div>
|
|
829
|
+
<div class="label">Error</div>
|
|
830
|
+
</div>
|
|
831
|
+
<div class="summary-card">
|
|
832
|
+
<div class="num" style="color:#004085">{pass_pct:.0f}%</div>
|
|
833
|
+
<div class="label">Pass Rate</div>
|
|
834
|
+
</div>
|
|
835
|
+
</div>
|
|
836
|
+
|
|
837
|
+
<div class="progress-bar">
|
|
838
|
+
<div class="seg" style="width:{pass_pct}%;background:#28a745"></div>
|
|
839
|
+
<div class="seg" style="width:{warn_pct}%;background:#ffc107"></div>
|
|
840
|
+
<div class="seg" style="width:{fail_pct}%;background:#dc3545"></div>
|
|
841
|
+
</div>
|
|
842
|
+
|
|
843
|
+
{cards_html}
|
|
844
|
+
|
|
845
|
+
<div class="footer">
|
|
846
|
+
<p><b>Quality checks:</b> Steps parsed, species found, topology correct,
|
|
847
|
+
step count correct, all steps have reactants+products, species coverage
|
|
848
|
+
≥80%, conditions extracted, no raw [SMILES:...] in narrative.</p>
|
|
849
|
+
</div>
|
|
850
|
+
|
|
851
|
+
<script>
|
|
852
|
+
// Expand all FAIL/WARN cards by default
|
|
853
|
+
document.querySelectorAll('.card').forEach(function(c) {{
|
|
854
|
+
var hdr = c.querySelector('.card-header');
|
|
855
|
+
if (hdr && (hdr.innerHTML.indexOf('FAIL') >= 0 ||
|
|
856
|
+
hdr.innerHTML.indexOf('WARN') >= 0)) {{
|
|
857
|
+
c.classList.add('open');
|
|
858
|
+
}}
|
|
859
|
+
}});
|
|
860
|
+
</script>
|
|
861
|
+
</body>
|
|
862
|
+
</html>"""
|
|
863
|
+
|
|
864
|
+
|
|
865
|
+
# ---------------------------------------------------------------------------
|
|
866
|
+
# CLI
|
|
867
|
+
# ---------------------------------------------------------------------------
|
|
868
|
+
|
|
869
|
+
def main():
    """Command-line entry point: audit a showcase directory and report.

    Parses the command line, runs ``audit_showcase`` on the resolved
    directory, and emits every output format that was requested:

    * ``--html FILE`` writes the HTML report to ``FILE``.
    * ``-o FILE`` writes JSON to ``FILE``; ``--json`` without ``-o`` dumps
      JSON to stdout.
    * With none of the above, the terminal report is printed.

    HTML and JSON outputs may be combined; previously a JSON request was
    silently dropped whenever ``--html`` was also given.

    Exits with status 1 when the directory does not exist or when the
    report contains any FAIL/ERROR result; otherwise returns normally.
    """
    parser = argparse.ArgumentParser(
        description="Audit scheme_reader Mode A quality on showcase CDXMLs"
    )
    parser.add_argument(
        "showcase_dir", nargs="?",
        default=os.path.join(os.path.dirname(__file__),
                             "..", "experiments", "scheme_dsl", "showcase"),
        help="Directory of showcase CDXML files (default: experiments/scheme_dsl/showcase)"
    )
    parser.add_argument("--chemscript", action="store_true",
                        help="Use ChemScript for SMILES extraction")
    parser.add_argument("--render", action="store_true",
                        help="Render CDXMLs to PNG via ChemDraw COM "
                             "(requires ChemDraw closed)")
    parser.add_argument("--json", action="store_true",
                        help="Output JSON instead of terminal report")
    parser.add_argument("--html",
                        help="Write HTML report to file")
    parser.add_argument("-o", "--output",
                        help="Write JSON output to file (implies --json)")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print debug info during parsing")
    parser.add_argument("--no-color", action="store_true",
                        help="Disable terminal colors")

    args = parser.parse_args()

    # Resolve path
    showcase_dir = os.path.abspath(args.showcase_dir)
    if not os.path.isdir(showcase_dir):
        print(f"Error: not a directory: {showcase_dir}", file=sys.stderr)
        sys.exit(1)

    report = audit_showcase(showcase_dir,
                            use_chemscript=args.chemscript,
                            verbose=args.verbose,
                            render=args.render)

    want_json = bool(args.json or args.output)
    json_to_stdout = want_json and not args.output
    # Keep stdout clean when it carries the JSON document: the HTML status
    # notice goes to stderr in that one case, and to stdout otherwise.
    notice_stream = sys.stderr if json_to_stdout else sys.stdout

    if args.html:
        html = _html_report(report)
        with open(args.html, "w", encoding="utf-8") as f:
            f.write(html)
        print(f"HTML audit report written to {args.html}", file=notice_stream)

    if want_json:
        data = report.to_dict()
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f"Audit results written to {args.output}")
        else:
            json.dump(data, sys.stdout, indent=2, ensure_ascii=False)
            print()

    # The terminal report is the fallback when no file/JSON output was asked for.
    if not args.html and not want_json:
        _print_report(report, color=not args.no_color)

    # Exit code: 0 if all PASS/WARN, 1 if any FAIL/ERROR
    if report.fail_count + report.error_count > 0:
        sys.exit(1)
|
|
928
|
+
|
|
929
|
+
|
|
930
|
+
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|