cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,931 @@
1
+ """Audit scheme_reader Mode A (deterministic) quality on showcase CDXMLs.
2
+
3
+ Runs ``read_scheme()`` against every showcase file and checks:
4
+ - Parse success (steps, species, narrative present)
5
+ - Step completeness (each step has reactants AND products)
6
+ - Species coverage (fraction of species used in steps)
7
+ - Topology correctness (matches filename convention)
8
+ - Conditions extraction (non-empty conditions per step)
9
+ - Arrow style accuracy (dashed/failed detected)
10
+ - Narrative quality (no leftover ``[SMILES: ...]`` fragments)
11
+
12
+ Usage::
13
+
14
+ python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_audit [showcase_dir]
15
+ python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_audit --html report.html --render
16
+ python -m cdxml_toolkit.deterministic_pipeline.scheme_reader_audit --json -o audit_results.json
17
+ """
18
+
19
+ import argparse
20
+ import base64
21
+ import json
22
+ import os
23
+ import re
24
+ import subprocess
25
+ import sys
26
+ import tempfile
27
+ import time
28
+ import traceback
29
+ from dataclasses import dataclass, field, asdict
30
+ from html import escape as html_escape
31
+ from pathlib import Path
32
+ from typing import Dict, List, Optional, Tuple
33
+
34
+ from ..perception.scheme_reader import read_scheme, SchemeDescription
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Expected topology from filename conventions
39
+ # ---------------------------------------------------------------------------
40
+
41
# Ordered (pattern, topology) pairs.  Rules are tried top to bottom and the
# first pattern found anywhere in the file's base name wins, so "divergent"
# takes precedence over "parallel", which takes precedence over "linear".
_TOPOLOGY_RULES: List[Tuple[re.Pattern, str]] = [
    (re.compile(source, re.I), topology)
    for source, topology in (
        (r"divergent|_sar", "divergent"),
        (r"stacked|parallel|comparison|different_routes", "parallel"),
        # serpentine/wrap are LAYOUTS, topology is still linear
        (r"serpentine|wrap|linear|sequential|letter|compact|"
         r"name_resolution|reductive|mitsunobu|grignard|"
         r"two_step|three_step|run_arrows|failed|above_structures",
         "linear"),
    )
]


def _expected_topology(filename: str) -> Optional[str]:
    """Return the topology implied by the file name, or None if no rule hits.

    Only the base name without directory and extension is inspected, and the
    match is case-insensitive.
    """
    stem, _extension = os.path.splitext(os.path.basename(filename))
    return next(
        (topology for regex, topology in _TOPOLOGY_RULES if regex.search(stem)),
        None,
    )
59
+
60
+
61
def _expected_step_count(filename: str) -> Optional[int]:
    """Guess how many steps a scheme should have from its file name.

    The checks run in a fixed order and the first one that applies decides:

    1. a digit hint such as ``4step``;
    2. a spelled-out hint such as ``two_step`` (two .. eight);
    3. a product count such as ``4products``;
    4. divergent/SAR names without a count -> ``None``;
    5. well-known single-step reaction names -> ``1``.

    Returns ``None`` when the name carries no usable hint.
    """
    stem = os.path.splitext(os.path.basename(filename))[0]

    # Digit hints: "4step", "5step", "7step", "3step"
    numeric_hint = re.search(r"(\d+)\s*step", stem, re.I)
    if numeric_hint is not None:
        return int(numeric_hint.group(1))

    # Spelled-out hints: "two_step", "three_step", ...
    lowered = stem.lower()
    spelled_counts = (("two", 2), ("three", 3), ("four", 4), ("five", 5),
                      ("six", 6), ("seven", 7), ("eight", 8))
    for word, count in spelled_counts:
        if f"{word}_step" in lowered:
            return count

    # Divergent schemes advertise their size as e.g. "4products"
    product_hint = re.search(r"(\d+)\s*product", stem, re.I)
    if product_hint is not None:
        return int(product_hint.group(1))

    # A divergent/SAR scheme has one step per product; without a product
    # count the file name alone cannot tell us, so skip the single-step
    # heuristic below.
    if re.search(r"divergent|_sar", stem, re.I) is not None:
        return None

    # Named single-step reactions (linear schemes with no explicit count)
    single_step = re.search(
        r"buchwald|suzuki|snar|amide_coupling|boc_deprotection|"
        r"reductive_amination|mitsunobu|grignard|name_resolution|"
        r"failed_arrow", stem, re.I)
    return 1 if single_step is not None else None
88
+
89
+
90
+ # ---------------------------------------------------------------------------
91
+ # Per-file audit result
92
+ # ---------------------------------------------------------------------------
93
+
94
@dataclass
class FileAuditResult:
    """Outcome of the quality audit for a single CDXML file.

    Field order and defaults are part of the constructor signature and are
    kept exactly as declared.
    """
    filename: str
    cdxml_path: str = ""
    status: str = "PASS"  # one of "PASS", "WARN", "FAIL", "ERROR"
    num_steps: int = 0
    num_species: int = 0
    topology: str = ""
    content_type: str = ""
    expected_topology: Optional[str] = None
    expected_steps: Optional[int] = None
    topology_match: bool = True
    step_count_match: bool = True
    all_steps_complete: bool = True  # every step has >=1 reactant AND product
    species_coverage: float = 1.0  # fraction of fragment species used in steps
    orphan_species: List[str] = field(default_factory=list)
    conditions_coverage: float = 1.0  # fraction of steps with conditions
    steps_missing_conditions: List[int] = field(default_factory=list)
    arrow_styles: List[str] = field(default_factory=list)
    smiles_in_narrative: int = 0  # count of [SMILES: ...] in narrative
    warnings: List[str] = field(default_factory=list)
    parse_time_ms: float = 0.0
    error: Optional[str] = None
    # Rich data (stored for HTML, not serialised to JSON)
    _desc: Optional[SchemeDescription] = field(default=None, repr=False)
    _image_b64: str = field(default="", repr=False)

    @property
    def detail_line(self) -> str:
        """Comma-separated one-line summary used by the terminal report."""
        plural = "" if self.num_steps == 1 else "s"

        # The topology entry gets a verdict only when an expectation exists.
        topology_text = f"{self.topology}"
        if self.expected_topology:
            if self.topology_match:
                topology_text += " OK"
            else:
                topology_text += f" MISMATCH (expected {self.expected_topology})"

        pieces = [
            f"{self.num_steps} step{plural}",
            f"{self.num_species} species",
            topology_text,
        ]
        step_count_wrong = (self.expected_steps is not None
                            and not self.step_count_match)
        if step_count_wrong:
            pieces.append(f"steps: {self.num_steps}/{self.expected_steps}")
        if not self.all_steps_complete:
            pieces.append("incomplete steps")
        if self.species_coverage < 1.0:
            pieces.append(f"coverage {self.species_coverage:.0%}")
        if self.smiles_in_narrative > 0:
            pieces.append(f"{self.smiles_in_narrative} SMILES in narrative")
        return ", ".join(pieces)
141
+
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # Aggregate report
145
+ # ---------------------------------------------------------------------------
146
+
147
@dataclass
class AuditReport:
    """Aggregate quality report across all audited files."""
    showcase_dir: str = ""
    total_files: int = 0
    pass_count: int = 0
    warn_count: int = 0
    fail_count: int = 0
    error_count: int = 0
    results: List[FileAuditResult] = field(default_factory=list)
    total_time_ms: float = 0.0

    def to_dict(self) -> dict:
        """Return a plain-dict view of the report for JSON export.

        Per-file results contain every field whose name does not start with
        an underscore, in declaration order.  Private fields (the parsed
        scheme object and the embedded image) are skipped up front rather
        than converted and thrown away: ``dataclasses.asdict`` would
        recursively convert or deep-copy the whole parsed scheme first,
        which is wasted work and fails if any member cannot be deep-copied.
        """
        # Local import: ``fields`` is not among the module-level imports.
        from dataclasses import fields

        def _public_view(result) -> dict:
            # One result as a dict of its public fields only.
            row = {}
            for spec in fields(result):
                if spec.name.startswith("_"):
                    continue
                value = getattr(result, spec.name)
                # Copy lists so the exported dict does not alias the result.
                row[spec.name] = list(value) if isinstance(value, list) else value
            return row

        return {
            "showcase_dir": self.showcase_dir,
            "total_files": self.total_files,
            "pass": self.pass_count,
            "warn": self.warn_count,
            "fail": self.fail_count,
            "error": self.error_count,
            "total_time_ms": round(self.total_time_ms, 1),
            "results": [_public_view(r) for r in self.results],
        }
175
+
176
+
177
+ # ---------------------------------------------------------------------------
178
+ # Image rendering helpers
179
+ # ---------------------------------------------------------------------------
180
+
181
def _render_cdxml_to_png(cdxml_path: str, output_path: str) -> bool:
    """Render a CDXML file to PNG.  Returns True on success.

    Rendering is best-effort: the in-process ``cdxml_to_png`` helper is tried
    first and, if it raises for any reason, the same module is run in a child
    interpreter.  No exception escapes this function; any failure yields
    ``False``.

    Parameters
    ----------
    cdxml_path : str
        Path of the CDXML file to render.
    output_path : str
        Path of the PNG file to create.
    """
    try:
        from ..chemdraw.cdxml_to_image import cdxml_to_png
        cdxml_to_png(cdxml_path, output_path)
        return True
    except Exception:
        # Deliberately broad: a missing import or a renderer failure both
        # just mean "try the subprocess route instead".
        pass

    try:
        # The renderer lives in the ``chemdraw`` subpackage; there is no
        # top-level ``cdxml_toolkit.cdxml_to_image`` module, so ``-m`` must
        # be given the full dotted path or the fallback can never succeed.
        completed = subprocess.run(
            [sys.executable, "-m", "cdxml_toolkit.chemdraw.cdxml_to_image",
             cdxml_path, "-o", output_path],
            capture_output=True, timeout=30,
        )
        return completed.returncode == 0 and os.path.exists(output_path)
    except Exception:
        # Covers a timeout or a failure to launch the interpreter.
        return False
198
+
199
+
200
def _embed_image_b64(img_path: str) -> str:
    """Return the image at *img_path* as a base64 ``data:`` URI.

    A path that does not exist yields an empty string.  The MIME type is
    chosen from the (case-insensitive) file extension; anything other than
    png/jpg/jpeg is labelled ``image/png``.
    """
    if not os.path.exists(img_path):
        return ""

    mime_by_suffix = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
    }
    suffix = os.path.splitext(img_path)[1].lower()
    mime = mime_by_suffix.get(suffix, "image/png")

    with open(img_path, "rb") as handle:
        payload = base64.b64encode(handle.read()).decode("ascii")
    return "data:" + mime + ";base64," + payload
210
+
211
+
212
+ # ---------------------------------------------------------------------------
213
+ # Core audit logic
214
+ # ---------------------------------------------------------------------------
215
+
216
def _audit_one(cdxml_path: str,
               use_chemscript: bool = False,
               verbose: bool = False,
               render: bool = False,
               img_dir: Optional[str] = None) -> FileAuditResult:
    """Run the quality audit on a single CDXML file.

    Parameters
    ----------
    cdxml_path : str
        Path of the CDXML file to audit.
    use_chemscript : bool
        Forwarded to ``read_scheme``.
    verbose : bool
        Forwarded to ``read_scheme``.
    render : bool
        When true (and *img_dir* is given) a PNG is rendered into *img_dir*
        and embedded into the result as a base64 data URI.
    img_dir : str, optional
        Directory that receives the rendered PNG.

    Returns
    -------
    FileAuditResult
        Status is ``"ERROR"`` when ``read_scheme`` raises, ``"FAIL"`` when no
        steps or no species were parsed or a hard check fails, ``"WARN"`` for
        soft problems, and ``"PASS"`` otherwise.
    """
    filename = os.path.basename(cdxml_path)
    result = FileAuditResult(filename=filename, cdxml_path=cdxml_path)
    result.expected_topology = _expected_topology(filename)
    result.expected_steps = _expected_step_count(filename)

    t0 = time.perf_counter()
    try:
        desc = read_scheme(cdxml_path,
                           use_network=False,
                           use_chemscript=use_chemscript,
                           verbose=verbose)
    except Exception as exc:
        # Audit boundary: any parser failure is recorded, never propagated,
        # so one bad file cannot abort the whole run.
        result.status = "ERROR"
        result.error = f"{type(exc).__name__}: {exc}"
        result.parse_time_ms = (time.perf_counter() - t0) * 1000
        return result
    result.parse_time_ms = (time.perf_counter() - t0) * 1000
    result._desc = desc

    # Render image
    if render and img_dir:
        png_path = os.path.join(img_dir, Path(cdxml_path).stem + ".png")
        if _render_cdxml_to_png(cdxml_path, png_path):
            result._image_b64 = _embed_image_b64(png_path)

    # Basic parse success
    result.num_steps = desc.num_steps
    result.num_species = len(desc.species)
    result.topology = desc.topology
    result.content_type = desc.content_type or "unknown"

    if desc.num_steps < 1:
        result.status = "FAIL"
        result.warnings.append("No steps parsed")
        return result
    if not desc.species:
        result.status = "FAIL"
        result.warnings.append("No species found")
        return result

    # Normalise once: a missing (None) narrative is treated like an empty
    # one everywhere below instead of crashing the SMILES count.
    narrative = desc.narrative or ""
    if not narrative:
        result.warnings.append("Empty narrative")

    # Topology correctness
    if result.expected_topology:
        result.topology_match = (desc.topology == result.expected_topology)
        if not result.topology_match:
            result.warnings.append(
                f"Topology mismatch: got '{desc.topology}', "
                f"expected '{result.expected_topology}'"
            )

    # Step count correctness
    if result.expected_steps is not None:
        result.step_count_match = (desc.num_steps == result.expected_steps)
        if not result.step_count_match:
            result.warnings.append(
                f"Step count mismatch: got {desc.num_steps}, "
                f"expected {result.expected_steps}"
            )

    # Step completeness
    for step in desc.steps:
        if not step.reactant_ids or not step.product_ids:
            result.all_steps_complete = False
            result.warnings.append(
                f"Step {step.step_index}: missing "
                f"{'reactants' if not step.reactant_ids else 'products'}"
            )

    # Species coverage: only "fragment" species count, and a species is
    # covered when any step references it as reactant, product or reagent.
    referenced_ids = set()
    for step in desc.steps:
        referenced_ids.update(step.reactant_ids)
        referenced_ids.update(step.product_ids)
        referenced_ids.update(step.reagent_ids)

    fragment_species = {sid: sp for sid, sp in desc.species.items()
                        if sp.element_type == "fragment"}
    if fragment_species:
        result.orphan_species = [
            sid for sid in fragment_species if sid not in referenced_ids
        ]
        covered = len(fragment_species) - len(result.orphan_species)
        result.species_coverage = covered / len(fragment_species)
        if result.species_coverage < 0.8:
            result.warnings.append(
                f"Low species coverage: {result.species_coverage:.0%} "
                f"({len(result.orphan_species)} orphans)"
            )

    # Conditions coverage
    steps_with_conds = 0
    for step in desc.steps:
        if step.conditions or step.condition_text_raw:
            steps_with_conds += 1
        else:
            result.steps_missing_conditions.append(step.step_index)
    if desc.num_steps > 0:
        result.conditions_coverage = steps_with_conds / desc.num_steps

    # Arrow styles
    result.arrow_styles = [s.arrow_style for s in desc.steps]

    # Narrative quality: count leftover "[SMILES: ...]" fragments.  The
    # marker is a literal, so a plain substring count is sufficient.
    result.smiles_in_narrative = narrative.count("[SMILES:")
    if result.smiles_in_narrative > 0:
        result.warnings.append(
            f"{result.smiles_in_narrative} raw SMILES in narrative"
        )

    # Scheme warnings
    if desc.warnings:
        for w in desc.warnings:
            result.warnings.append(f"scheme warning: {w}")

    # Determine overall status
    has_fail = False
    has_warn = False

    if not result.topology_match:
        has_fail = True
    if not result.step_count_match:
        # Finding at most half of the expected steps is a hard failure;
        # any other count mismatch is only a warning.
        if result.expected_steps and result.num_steps <= result.expected_steps // 2:
            has_fail = True
        else:
            has_warn = True
    if not result.all_steps_complete:
        has_warn = True
    if result.species_coverage < 0.5:
        has_fail = True
    elif result.species_coverage < 0.8:
        has_warn = True
    if result.smiles_in_narrative > 0:
        has_warn = True

    if has_fail:
        result.status = "FAIL"
    elif has_warn:
        result.status = "WARN"
    else:
        result.status = "PASS"

    return result
365
+
366
+
367
def audit_showcase(showcase_dir: str,
                   use_chemscript: bool = False,
                   verbose: bool = False,
                   render: bool = False) -> AuditReport:
    """Audit every ``*.cdxml`` file in a directory and tally the outcomes.

    Parameters
    ----------
    showcase_dir : str
        Directory containing ``*.cdxml`` showcase files.
    use_chemscript : bool
        Use ChemScript for SMILES extraction.
    verbose : bool
        Print debug info during parsing.
    render : bool
        Render CDXMLs to PNG via ChemDraw COM (requires ChemDraw closed).

    Returns
    -------
    AuditReport
        Aggregate quality report.
    """
    report = AuditReport(showcase_dir=showcase_dir)

    # Files are audited in sorted name order.
    cdxml_files = sorted(
        entry for entry in os.listdir(showcase_dir) if entry.endswith(".cdxml")
    )
    file_total = len(cdxml_files)
    report.total_files = file_total

    # Rendered PNGs go to a fresh temporary directory.
    img_dir = None
    if render:
        img_dir = tempfile.mkdtemp(prefix="audit_imgs_")
        print(f" Rendering to {img_dir}", file=sys.stderr)

    # Statuses without their own counter (i.e. "ERROR") fall into error_count.
    counter_for_status = {
        "PASS": "pass_count",
        "WARN": "warn_count",
        "FAIL": "fail_count",
    }

    started = time.perf_counter()
    for position, fname in enumerate(cdxml_files, start=1):
        if render:
            print(f" [{position}/{file_total}] {fname}", file=sys.stderr)
        outcome = _audit_one(os.path.join(showcase_dir, fname),
                             use_chemscript=use_chemscript,
                             verbose=verbose, render=render, img_dir=img_dir)
        report.results.append(outcome)

        counter = counter_for_status.get(outcome.status, "error_count")
        setattr(report, counter, getattr(report, counter) + 1)

    report.total_time_ms = (time.perf_counter() - started) * 1000
    return report
422
+
423
+
424
+ # ---------------------------------------------------------------------------
425
+ # Terminal output
426
+ # ---------------------------------------------------------------------------
427
+
428
+ _STATUS_COLORS = {
429
+ "PASS": "\033[92m", # green
430
+ "WARN": "\033[93m", # yellow
431
+ "FAIL": "\033[91m", # red
432
+ "ERROR": "\033[91m", # red
433
+ }
434
+ _RESET = "\033[0m"
435
+
436
+
437
+ def _print_report(report: AuditReport, color: bool = True) -> None:
438
+ """Print human-readable audit report to stdout."""
439
+ print()
440
+ print("=" * 70)
441
+ print(" Scheme Reader Audit: Mode A (Deterministic)")
442
+ print(f" {report.total_files} showcase files evaluated")
443
+ print("=" * 70)
444
+ print()
445
+
446
+ max_name_len = max((len(r.filename) for r in report.results), default=30)
447
+
448
+ for r in report.results:
449
+ if color:
450
+ c = _STATUS_COLORS.get(r.status, "")
451
+ tag = f"{c}{r.status:5s}{_RESET}"
452
+ else:
453
+ tag = f"{r.status:5s}"
454
+
455
+ name = r.filename.ljust(max_name_len)
456
+ if r.error:
457
+ detail = f"ERROR: {r.error}"
458
+ else:
459
+ detail = r.detail_line
460
+ print(f" {tag} {name} {detail}")
461
+
462
+ # Print warnings indented
463
+ for w in r.warnings:
464
+ if color:
465
+ print(f" {_STATUS_COLORS.get('WARN', '')}-> {w}{_RESET}")
466
+ else:
467
+ print(f" -> {w}")
468
+
469
+ print()
470
+ print("-" * 70)
471
+ summary_parts = []
472
+ if report.pass_count:
473
+ summary_parts.append(f"{report.pass_count} PASS")
474
+ if report.warn_count:
475
+ summary_parts.append(f"{report.warn_count} WARN")
476
+ if report.fail_count:
477
+ summary_parts.append(f"{report.fail_count} FAIL")
478
+ if report.error_count:
479
+ summary_parts.append(f"{report.error_count} ERROR")
480
+ print(f" Summary: {', '.join(summary_parts)}")
481
+ print(f" Total parse time: {report.total_time_ms:.0f} ms")
482
+ print()
483
+
484
+
485
+ # ---------------------------------------------------------------------------
486
+ # HTML helpers
487
+ # ---------------------------------------------------------------------------
488
+
489
# Per-status colours for the HTML report: badge background, badge text,
# and the card's left border.
_STATUS_BG = dict(
    PASS="#d4edda",
    WARN="#fff3cd",
    FAIL="#f8d7da",
    ERROR="#f8d7da",
)
_STATUS_FG = dict(
    PASS="#155724",
    WARN="#856404",
    FAIL="#721c24",
    ERROR="#721c24",
)
_STATUS_BORDER = dict(
    PASS="#c3e6cb",
    WARN="#ffeeba",
    FAIL="#f5c6cb",
    ERROR="#f5c6cb",
)
501
+
502
+
503
+ def _species_table_html(desc: SchemeDescription) -> str:
504
+ """Build an HTML table for the species registry."""
505
+ if not desc.species:
506
+ return '<p style="color:#6c757d;font-size:0.85rem">No species</p>'
507
+ rows = []
508
+ for sid, sp in desc.species.items():
509
+ label = html_escape(sp.label or "")
510
+ name = html_escape(sp.name or "")
511
+ iupac = html_escape(getattr(sp, "iupac_name", "") or "")
512
+ smiles = html_escape(sp.smiles or "")
513
+ formula = html_escape(sp.formula or "")
514
+ mw_str = f"{sp.mw:.1f}" if sp.mw else ""
515
+ etype = sp.element_type or ""
516
+ tcat = html_escape(sp.text_category or "")
517
+ # Choose display name
518
+ display = iupac or name or formula or ""
519
+ # Truncate long SMILES for display
520
+ smiles_short = smiles[:60] + ("..." if len(smiles) > 60 else "")
521
+ rows.append(f"""<tr>
522
+ <td class="mono">{html_escape(sid)}</td>
523
+ <td>{label}</td>
524
+ <td>{display}</td>
525
+ <td class="mono" title="{smiles}">{smiles_short}</td>
526
+ <td>{formula}</td>
527
+ <td style="text-align:right">{mw_str}</td>
528
+ <td>{etype}{(' / ' + tcat) if tcat else ''}</td>
529
+ </tr>""")
530
+ return f"""<table class="inner-table">
531
+ <tr><th>ID</th><th>Label</th><th>Name</th><th>SMILES</th>
532
+ <th>Formula</th><th>MW</th><th>Type</th></tr>
533
+ {''.join(rows)}
534
+ </table>"""
535
+
536
+
537
+ def _steps_table_html(desc: SchemeDescription) -> str:
538
+ """Build an HTML table for the reaction steps."""
539
+ if not desc.steps:
540
+ return '<p style="color:#6c757d;font-size:0.85rem">No steps</p>'
541
+ rows = []
542
+ for step in desc.steps:
543
+ r_ids = ", ".join(step.reactant_ids)
544
+ p_ids = ", ".join(step.product_ids)
545
+ rg_ids = ", ".join(step.reagent_ids)
546
+ conds = "; ".join(step.conditions[:3])
547
+ yld = step.yield_text or ""
548
+ arrow_icon = {"solid": "&#8594;", "dashed": "&#8669;",
549
+ "failed": "&#10007;&#8594;"}.get(step.arrow_style, "&#8594;")
550
+ rows.append(f"""<tr>
551
+ <td style="text-align:center">{step.step_index + 1}</td>
552
+ <td class="mono">{html_escape(r_ids)}</td>
553
+ <td style="text-align:center;font-size:1.1rem">{arrow_icon}</td>
554
+ <td class="mono">{html_escape(p_ids)}</td>
555
+ <td class="mono">{html_escape(rg_ids)}</td>
556
+ <td>{html_escape(conds)}</td>
557
+ <td>{html_escape(yld)}</td>
558
+ </tr>""")
559
+ return f"""<table class="inner-table">
560
+ <tr><th>#</th><th>Reactants</th><th></th><th>Products</th>
561
+ <th>Reagents</th><th>Conditions</th><th>Yield</th></tr>
562
+ {''.join(rows)}
563
+ </table>"""
564
+
565
+
566
def _card_html(idx: int, r: FileAuditResult) -> str:
    """Generate one expandable card for a scheme file.

    The header shows status, file name, topology, content type, step count
    and parse time.  The body shows the rendered image (if any) next to
    either the error message or the quality checklist, warnings, narrative,
    species table and steps table.

    Parameters
    ----------
    idx : int
        Position of the card in the report.  Currently unused; kept so the
        call signature stays stable.
    r : FileAuditResult
        Audit result to render.
    """
    bg = _STATUS_BG.get(r.status, "#fff")
    fg = _STATUS_FG.get(r.status, "#000")
    border = _STATUS_BORDER.get(r.status, "#dee2e6")

    # Escape every text value that ends up in markup, consistent with how
    # file names, errors and warnings are treated.
    topology = html_escape(r.topology or "")
    content_type = html_escape(r.content_type or "")

    # Status badge
    status_badge = (f'<span class="badge" style="background:{bg};color:{fg}">'
                    f'{r.status}</span>')

    # Topology badge
    if r.expected_topology and not r.topology_match:
        topo_badge = (f'<span class="badge badge-fail">{topology}'
                      f' (expected {html_escape(r.expected_topology)})</span>')
    else:
        topo_badge = f'<span class="badge badge-info">{topology}</span>'

    ctype_badge = (f'<span class="badge badge-muted">{content_type}</span>'
                   if content_type else "")
    steps_badge = (f'<span class="badge badge-info">'
                   f'{r.num_steps} step{"s" if r.num_steps != 1 else ""}</span>')

    # Image section
    if r._image_b64:
        img_html = f'<img src="{r._image_b64}" alt="Rendered scheme">'
    else:
        img_html = ('<div class="no-img">No rendered image<br>'
                    '<small>(use --render)</small></div>')

    # Body content
    if r.error:
        body_html = (f'<div class="narrative" style="color:#721c24">'
                     f'{html_escape(r.error)}</div>')
    else:
        desc = r._desc
        # A missing description or a None narrative both render as "no
        # narrative" rather than crashing the escape call.
        narrative = html_escape(desc.narrative or "") if desc else ""

        # Quality checklist (_check_item escapes label and detail itself,
        # so raw values are passed here)
        checks = [
            _check_item("Steps parsed", r.num_steps >= 1,
                        f"{r.num_steps} steps"),
            _check_item("Species found", r.num_species >= 1,
                        f"{r.num_species} species"),
        ]
        if r.expected_topology:
            checks.append(_check_item(
                "Topology correct", r.topology_match,
                f"{r.topology}"
                + (f" (expected {r.expected_topology})"
                   if not r.topology_match else "")))
        if r.expected_steps is not None:
            checks.append(_check_item(
                "Step count correct", r.step_count_match,
                f"{r.num_steps}"
                + (f"/{r.expected_steps}"
                   if not r.step_count_match else "")))
        checks.append(_check_item("All steps complete", r.all_steps_complete))
        checks.append(_check_item("Species coverage",
                                  r.species_coverage >= 0.8,
                                  f"{r.species_coverage:.0%}"))
        checks.append(_check_item("Conditions extracted",
                                  r.conditions_coverage >= 0.5,
                                  f"{r.conditions_coverage:.0%}"))
        checks.append(_check_item("No raw SMILES in narrative",
                                  r.smiles_in_narrative == 0,
                                  f"{r.smiles_in_narrative} found"
                                  if r.smiles_in_narrative else ""))

        checklist_html = '<div class="checklist">' + ''.join(checks) + '</div>'

        # Warnings
        warn_html = ""
        if r.warnings:
            warn_items = "".join(
                f'<div class="warn-item">{html_escape(w)}</div>'
                for w in r.warnings
            )
            warn_html = f'<div class="warn-box">{warn_items}</div>'

        # Narrative
        nar_html = ""
        if narrative:
            nar_html = (f'<div class="section-title">Narrative</div>'
                        f'<div class="narrative">{narrative}</div>')

        # Species table
        sp_html = ""
        if desc and desc.species:
            sp_html = (f'<div class="section-title">'
                       f'Species Registry ({len(desc.species)})</div>'
                       + _species_table_html(desc))

        # Steps table
        st_html = ""
        if desc and desc.steps:
            st_html = (f'<div class="section-title">'
                       f'Reaction Steps ({len(desc.steps)})</div>'
                       + _steps_table_html(desc))

        body_html = "\n".join(
            ["", checklist_html, warn_html, nar_html, sp_html, st_html, ""]
        )

    # Parse time
    time_str = f"{r.parse_time_ms:.0f} ms"

    # Clicking the header toggles the "open" class on the card element.
    return "\n".join([
        "",
        f'<div class="card" style="border-left:4px solid {border}">',
        '<div class="card-header" '
        'onclick="this.parentElement.classList.toggle(\'open\')">',
        '<span class="chevron">&#9654;</span>',
        f"{status_badge}",
        f'<span class="card-title">{html_escape(r.filename)}</span>',
        f"{topo_badge} {ctype_badge} {steps_badge}",
        f'<span class="badge badge-muted">{time_str}</span>',
        "</div>",
        '<div class="card-body">',
        '<div class="two-col">',
        f'<div class="img-box">{img_html}</div>',
        f'<div class="detail-box">{body_html}</div>',
        "</div>",
        "</div>",
        "</div>",
        "",
    ])
689
+
690
+
691
def _check_item(label: str, ok: bool, detail: str = "") -> str:
    """Return the HTML for one checklist entry.

    A passing check gets a green check mark, a failing one a red cross.
    *label* and the optional *detail* are HTML-escaped; the detail, when
    non-empty, is appended in its own ``check-detail`` span.
    """
    if ok:
        icon, color = "&#10003;", "#155724"
    else:
        icon, color = "&#10007;", "#dc3545"

    pieces = [
        '<div class="check-item">',
        f'<span style="color:{color};font-weight:700">{icon}</span> ',
        html_escape(label),
    ]
    if detail:
        pieces.append(f' <span class="check-detail">{html_escape(detail)}</span>')
    pieces.append('</div>')
    return "".join(pieces)
699
+
700
+
701
+ # ---------------------------------------------------------------------------
702
+ # HTML report
703
+ # ---------------------------------------------------------------------------
704
+
705
+ def _html_report(report: AuditReport) -> str:
706
+ """Generate a self-contained HTML audit report with scheme cards."""
707
+ pass_pct = (report.pass_count / report.total_files * 100
708
+ if report.total_files else 0)
709
+ warn_pct = (report.warn_count / report.total_files * 100
710
+ if report.total_files else 0)
711
+ fail_pct = ((report.fail_count + report.error_count)
712
+ / report.total_files * 100
713
+ if report.total_files else 0)
714
+
715
+ cards_html = "\n".join(
716
+ _card_html(i, r) for i, r in enumerate(report.results)
717
+ )
718
+
719
+ return f"""<!DOCTYPE html>
720
+ <html lang="en">
721
+ <head>
722
+ <meta charset="utf-8">
723
+ <title>Mode A Audit Report</title>
724
+ <style>
725
+ * {{ box-sizing: border-box; margin: 0; padding: 0; }}
726
+ body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
727
+ Helvetica, Arial, sans-serif; background: #f8f9fa; color: #212529;
728
+ padding: 24px; max-width: 1440px; margin: 0 auto; }}
729
+ h1 {{ font-size: 1.5rem; margin-bottom: 4px; }}
730
+ .subtitle {{ color: #6c757d; font-size: 0.9rem; margin-bottom: 20px; }}
731
+
732
+ /* Summary */
733
+ .summary-bar {{ display: flex; gap: 16px; margin-bottom: 20px; flex-wrap: wrap; }}
734
+ .summary-card {{ background: #fff; border-radius: 8px; padding: 14px 20px;
735
+ box-shadow: 0 1px 3px rgba(0,0,0,0.08);
736
+ min-width: 110px; text-align: center; }}
737
+ .summary-card .num {{ font-size: 2rem; font-weight: 700; }}
738
+ .summary-card .label {{ font-size: 0.75rem; color: #6c757d;
739
+ text-transform: uppercase; letter-spacing: 0.5px; }}
740
+ .progress-bar {{ height: 10px; border-radius: 5px; overflow: hidden;
741
+ display: flex; margin-bottom: 24px; background: #e9ecef; }}
742
+ .progress-bar .seg {{ height: 100%; }}
743
+
744
+ /* Badges */
745
+ .badge {{ display: inline-block; padding: 2px 8px; border-radius: 4px;
746
+ font-size: 0.78rem; font-weight: 600; margin: 0 2px;
747
+ vertical-align: middle; }}
748
+ .badge-info {{ background: #d1ecf1; color: #0c5460; }}
749
+ .badge-muted {{ background: #e9ecef; color: #6c757d; }}
750
+ .badge-fail {{ background: #f8d7da; color: #721c24; }}
751
+
752
+ /* Cards */
753
+ .card {{ background: #fff; border-radius: 8px; margin-bottom: 10px;
754
+ box-shadow: 0 1px 3px rgba(0,0,0,0.06); overflow: hidden; }}
755
+ .card-header {{ padding: 10px 16px; cursor: pointer; display: flex;
756
+ align-items: center; gap: 8px; user-select: none; }}
757
+ .card-header:hover {{ background: #f1f3f5; }}
758
+ .card-title {{ font-weight: 600; font-size: 0.92rem; font-family: monospace; }}
759
+ .chevron {{ font-size: 0.7rem; color: #6c757d; transition: transform 0.15s;
760
+ display: inline-block; width: 14px; }}
761
+ .card.open .chevron {{ transform: rotate(90deg); }}
762
+ .card-body {{ display: none; padding: 0 16px 16px 16px; }}
763
+ .card.open .card-body {{ display: block; }}
764
+
765
+ /* Two-column layout */
766
+ .two-col {{ display: grid; grid-template-columns: minmax(250px,420px) 1fr;
767
+ gap: 16px; margin-top: 8px; }}
768
+ @media (max-width: 900px) {{ .two-col {{ grid-template-columns: 1fr; }} }}
769
+ .img-box {{ text-align: center; }}
770
+ .img-box img {{ max-width: 100%; border: 1px solid #dee2e6; border-radius: 4px; }}
771
+ .no-img {{ background: #f8f9fa; border: 1px dashed #dee2e6; border-radius: 4px;
772
+ padding: 40px 20px; color: #adb5bd; text-align: center;
773
+ font-size: 0.85rem; }}
774
+
775
+ /* Quality checklist */
776
+ .checklist {{ display: flex; flex-wrap: wrap; gap: 2px 16px;
777
+ margin-bottom: 10px; }}
778
+ .check-item {{ font-size: 0.84rem; white-space: nowrap; }}
779
+ .check-detail {{ color: #6c757d; }}
780
+
781
+ /* Warnings */
782
+ .warn-box {{ background: #fff3cd; border-radius: 4px; padding: 6px 10px;
783
+ margin-bottom: 10px; }}
784
+ .warn-item {{ font-size: 0.82rem; color: #856404; padding: 1px 0; }}
785
+ .warn-item::before {{ content: "\\26A0 "; }}
786
+
787
+ /* Sections */
788
+ .section-title {{ font-size: 0.82rem; font-weight: 700; color: #495057;
789
+ text-transform: uppercase; letter-spacing: 0.4px;
790
+ margin: 12px 0 4px 0; }}
791
+ .narrative {{ background: #f8f9fa; border-radius: 4px; padding: 10px;
792
+ font-size: 0.85rem; line-height: 1.5; white-space: pre-wrap;
793
+ max-height: 300px; overflow-y: auto; margin-bottom: 8px; }}
794
+
795
+ /* Inner tables */
796
+ .inner-table {{ width: 100%; border-collapse: collapse; font-size: 0.82rem; }}
797
+ .inner-table th {{ background: #495057; color: #fff; padding: 5px 8px;
798
+ font-size: 0.74rem; text-transform: uppercase;
799
+ letter-spacing: 0.3px; text-align: left; }}
800
+ .inner-table td {{ padding: 4px 8px; border-bottom: 1px solid #e9ecef;
801
+ vertical-align: top; }}
802
+ .inner-table tr:hover td {{ background: #f8f9fa; }}
803
+ .mono {{ font-family: "SFMono-Regular", Consolas, monospace; font-size: 0.8rem; }}
804
+
805
+ .footer {{ margin-top: 20px; font-size: 0.8rem; color: #6c757d; }}
806
+ </style>
807
+ </head>
808
+ <body>
809
+ <h1>Scheme Reader Audit: Mode A (Deterministic)</h1>
810
+ <p class="subtitle">{report.total_files} showcase files &middot;
811
+ {report.total_time_ms:.0f} ms total parse time &middot;
812
+ {os.path.basename(report.showcase_dir)}/</p>
813
+
814
+ <div class="summary-bar">
815
+ <div class="summary-card">
816
+ <div class="num" style="color:#155724">{report.pass_count}</div>
817
+ <div class="label">Pass</div>
818
+ </div>
819
+ <div class="summary-card">
820
+ <div class="num" style="color:#856404">{report.warn_count}</div>
821
+ <div class="label">Warn</div>
822
+ </div>
823
+ <div class="summary-card">
824
+ <div class="num" style="color:#721c24">{report.fail_count}</div>
825
+ <div class="label">Fail</div>
826
+ </div>
827
+ <div class="summary-card">
828
+ <div class="num" style="color:#721c24">{report.error_count}</div>
829
+ <div class="label">Error</div>
830
+ </div>
831
+ <div class="summary-card">
832
+ <div class="num" style="color:#004085">{pass_pct:.0f}%</div>
833
+ <div class="label">Pass Rate</div>
834
+ </div>
835
+ </div>
836
+
837
+ <div class="progress-bar">
838
+ <div class="seg" style="width:{pass_pct}%;background:#28a745"></div>
839
+ <div class="seg" style="width:{warn_pct}%;background:#ffc107"></div>
840
+ <div class="seg" style="width:{fail_pct}%;background:#dc3545"></div>
841
+ </div>
842
+
843
+ {cards_html}
844
+
845
+ <div class="footer">
846
+ <p><b>Quality checks:</b> Steps parsed, species found, topology correct,
847
+ step count correct, all steps have reactants+products, species coverage
848
+ &ge;80%, conditions extracted, no raw [SMILES:...] in narrative.</p>
849
+ </div>
850
+
851
+ <script>
852
+ // Expand all FAIL/WARN cards by default
853
+ document.querySelectorAll('.card').forEach(function(c) {{
854
+ var hdr = c.querySelector('.card-header');
855
+ if (hdr && (hdr.innerHTML.indexOf('FAIL') >= 0 ||
856
+ hdr.innerHTML.indexOf('WARN') >= 0)) {{
857
+ c.classList.add('open');
858
+ }}
859
+ }});
860
+ </script>
861
+ </body>
862
+ </html>"""
863
+
864
+
865
+ # ---------------------------------------------------------------------------
866
+ # CLI
867
+ # ---------------------------------------------------------------------------
868
+
869
def main(argv=None):
    """Command-line entry point for the Mode A showcase audit.

    Parses the options, runs ``audit_showcase`` on the chosen directory and
    emits the result in every requested form: an HTML file (``--html``),
    JSON to a file (``-o``) or to stdout (``--json``). The terminal report
    is printed only when neither HTML nor JSON was requested.

    Previously ``--html`` took precedence and a simultaneous ``--json`` /
    ``-o`` was silently ignored; the outputs are now independent.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, so existing ``main()`` callers are
            unaffected.

    Raises:
        SystemExit: With status 1 when the showcase path is not a directory
            or when the report contains any FAIL or ERROR result; argparse
            itself also exits on invalid options.
    """
    parser = argparse.ArgumentParser(
        description="Audit scheme_reader Mode A quality on showcase CDXMLs"
    )
    parser.add_argument(
        "showcase_dir", nargs="?",
        default=os.path.join(os.path.dirname(__file__),
                             "..", "experiments", "scheme_dsl", "showcase"),
        help="Directory of showcase CDXML files (default: experiments/scheme_dsl/showcase)"
    )
    parser.add_argument("--chemscript", action="store_true",
                        help="Use ChemScript for SMILES extraction")
    parser.add_argument("--render", action="store_true",
                        help="Render CDXMLs to PNG via ChemDraw COM "
                             "(requires ChemDraw closed)")
    parser.add_argument("--json", action="store_true",
                        help="Output JSON instead of terminal report")
    parser.add_argument("--html",
                        help="Write HTML report to file")
    parser.add_argument("-o", "--output",
                        help="Write JSON output to file (implies --json)")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print debug info during parsing")
    parser.add_argument("--no-color", action="store_true",
                        help="Disable terminal colors")

    args = parser.parse_args(argv)

    # Resolve path
    showcase_dir = os.path.abspath(args.showcase_dir)
    if not os.path.isdir(showcase_dir):
        print(f"Error: not a directory: {showcase_dir}", file=sys.stderr)
        sys.exit(1)

    report = audit_showcase(showcase_dir,
                            use_chemscript=args.chemscript,
                            verbose=args.verbose,
                            render=args.render)

    want_json = bool(args.json or args.output)
    # When the JSON document itself goes to stdout, status lines are sent to
    # stderr so that stdout stays parseable.
    json_on_stdout = want_json and not args.output
    status_stream = sys.stderr if json_on_stdout else sys.stdout

    if args.html:
        html = _html_report(report)
        with open(args.html, "w", encoding="utf-8") as f:
            f.write(html)
        print(f"HTML audit report written to {args.html}", file=status_stream)

    if want_json:
        data = report.to_dict()
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f"Audit results written to {args.output}")
        else:
            json.dump(data, sys.stdout, indent=2, ensure_ascii=False)
            print()

    if not args.html and not want_json:
        _print_report(report, color=not args.no_color)

    # Exit code: 0 if all PASS/WARN, 1 if any FAIL/ERROR
    if report.fail_count + report.error_count > 0:
        sys.exit(1)
928
+
929
+
930
# Run the audit CLI only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()