cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1711 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ structure_from_image.py — Extract chemical structures from images using DECIMER.
4
+
5
+ Takes a PNG/JPG image or a PDF page and returns SMILES + 2D atom coordinates
6
+ for every detected chemical structure.
7
+
8
+ Pipeline
9
+ --------
10
+ 1. Input : image file (PNG/JPG) or PDF (one page extracted per run)
11
+ 2. Segment: detect and crop individual structure regions using OpenCV
12
+ (white-background connected-component / contour approach)
13
+ 3. DECIMER: convert each cropped image to SMILES
14
+ (DECIMER Image Transformer v2, ~285 MB model, downloaded on first run)
15
+ 4. RDKit : SMILES → 2D coordinates (Compute2DCoords)
16
+ 5. Output : JSON with SMILES + normalised atom/bond data per structure,
17
+ ready for cdxml_builder.py; optionally write CDXML directly.
18
+
19
+ Usage
20
+ -----
21
+ Single image, JSON output:
22
+ python structure_from_image.py --input image.png --output structures.json
23
+
24
+ PDF page (0-indexed):
25
+ python structure_from_image.py --input paper.pdf --page 0 --output out.json
26
+
27
+ Pipe straight to CDXML builder (multi-molecule page):
28
+ python structure_from_image.py --input image.png | python cdxml_builder.py --mode multi
29
+
30
+ Hand-drawn structures (uses DECIMER hand-drawn model):
31
+ python structure_from_image.py --input sketch.png --hand-drawn
32
+
33
+ Skip segmentation (whole image is one structure):
34
+ python structure_from_image.py --input single_structure.png --no-segment
35
+
36
+ Output JSON format
37
+ ------------------
38
+ [
39
+ {
40
+ "index": 0,
41
+ "smiles": "c1ccccc1",
42
+ "bbox": [x0, y0, x1, y1], # pixel coords in the input image
43
+ "atoms": [
44
+ {"index": 1, "symbol": "C", "x": 200.0, "y": 300.0},
45
+ ...
46
+ ],
47
+ "bonds": [
48
+ {"index": 1, "order": 1, "atom1": 1, "atom2": 2},
49
+ ...
50
+ ]
51
+ },
52
+ ...
53
+ ]
54
+
55
+ Notes
56
+ -----
57
+ - DECIMER models download to ~/.data/DECIMER-V2/ on first run (~570 MB total).
58
+ - TensorFlow 2.20 prints hardware-capability warnings to stderr; these are harmless.
59
+ - Segmentation uses an OpenCV contour approach tuned for white-background publication
60
+ figures. For densely packed figures (multiple overlapping structures) it works best
61
+ on clean, high-resolution images (≥150 DPI equivalent).
62
+ - Coordinates are normalised to ACS 1996 style (bond length 14.40 pt) by
63
+ coord_normalizer.normalize_coords().
64
+ """
65
+
66
+ import argparse
67
+ import json
68
+ import math
69
+ import os
70
+ import sys
71
+ import tempfile
72
+ from copy import deepcopy
73
+ from typing import Dict, List, Optional, Tuple
74
+
75
+ # ---------------------------------------------------------------------------
76
+ # Optional heavy imports (warn gracefully if missing)
77
+ # ---------------------------------------------------------------------------
78
+
79
+ try:
80
+ import cv2
81
+ import numpy as np
82
+ HAS_CV2 = True
83
+ except ImportError:
84
+ HAS_CV2 = False
85
+
86
+ try:
87
+ from PIL import Image
88
+ HAS_PIL = True
89
+ except ImportError:
90
+ HAS_PIL = False
91
+
92
+ try:
93
+ import pymupdf # PyMuPDF — PDF rendering
94
+ HAS_PYMUPDF = True
95
+ except ImportError:
96
+ HAS_PYMUPDF = False
97
+
98
+ try:
99
+ from rdkit import Chem
100
+ from rdkit.Chem import AllChem
101
+ HAS_RDKIT = True
102
+ except ImportError:
103
+ HAS_RDKIT = False
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # DECIMER lazy loader — loads ONE model on demand, not both at import time.
107
+ #
108
+ # Upstream DECIMER eagerly loads both the standard AND hand-drawn models
109
+ # (~332 MB each) at import time via module-level get_models(). This takes
110
+ # ~50 s on CPU. We bypass that by loading only the model we need, only
111
+ # when predict is first called, cutting cold-start roughly in half.
112
+ # ---------------------------------------------------------------------------
113
+ import threading as _threading
114
+
115
+ _decimer_predict = None
116
+ _decimer_mode = None # tracks which model is loaded: "standard" or "hand_drawn"
117
+ _decimer_lock = _threading.Lock()
118
+
119
+
120
def _load_decimer(hand_drawn: bool = False):
    """Lazy-load DECIMER, loading only the requested model (not both).

    On first call, loads TensorFlow + one DECIMER SavedModel (~25 s instead
    of ~50 s).  Subsequent calls with the same ``hand_drawn`` flag return
    instantly.  If the flag changes, the other model is loaded on demand and
    replaces the cached one.

    Thread-safe: a lock prevents duplicate loads when the MCP server's
    background preload thread and a tool call race.  The lock-free fast path
    reads a single global whose mode is stamped on the predictor itself, so a
    caller can never observe a predictor paired with the wrong mode while
    another thread is switching models.

    Parameters
    ----------
    hand_drawn : load the hand-drawn model instead of the standard one.

    Returns
    -------
    A callable ``predict(image_input, confidence=False, hand_drawn=False)``
    that returns a SMILES string, or ``(smiles, confidence_pairs)`` when
    ``confidence`` is true.

    Raises
    ------
    ImportError : if TensorFlow, pystow or the DECIMER package is missing.
    """
    global _decimer_predict, _decimer_mode
    requested = "hand_drawn" if hand_drawn else "standard"

    # Fast path without the lock.  Read the global exactly once and compare
    # the mode attached to that very object; comparing two separate globals
    # here could pair a freshly swapped predictor with a stale mode.
    cached = _decimer_predict
    if cached is not None and getattr(cached, "decimer_mode", None) == requested:
        return cached

    with _decimer_lock:
        # Double-check after acquiring the lock (another thread may have
        # finished loading while we waited).  Both globals are only written
        # while holding the lock, so they are consistent here.
        if _decimer_predict is not None and _decimer_mode == requested:
            return _decimer_predict

        try:
            import tensorflow as tf
            import pystow
        except ImportError as exc:
            raise ImportError(
                "DECIMER is not installed. Run:\n"
                "  pip install cdxml-toolkit[decimer]\n"
                f"Original error: {exc}"
            ) from exc

        # Bypass DECIMER's __init__.py which eagerly loads BOTH models (~50 s).
        # Instead, load only the two helper submodules we need (utils,
        # pre_process) via importlib, then tf.saved_model.load for ONE model.
        import importlib.util
        import pickle
        import sys
        import types

        spec = importlib.util.find_spec("DECIMER")
        if spec is None or spec.submodule_search_locations is None:
            raise ImportError("DECIMER package not found")
        pkg_dir = spec.submodule_search_locations[0]

        # Register a stub DECIMER package so submodule imports resolve
        if "DECIMER" not in sys.modules:
            stub = types.ModuleType("DECIMER")
            stub.__path__ = [pkg_dir]
            stub.__package__ = "DECIMER"
            sys.modules["DECIMER"] = stub

        def _load_submodule(name):
            """Import ``DECIMER.<name>`` straight from its file, once."""
            fqn = f"DECIMER.{name}"
            if fqn in sys.modules:
                return sys.modules[fqn]
            sub_spec = importlib.util.spec_from_file_location(
                fqn, os.path.join(pkg_dir, f"{name}.py"),
            )
            mod = importlib.util.module_from_spec(sub_spec)
            # Register before executing so imports inside the module resolve.
            sys.modules[fqn] = mod
            sub_spec.loader.exec_module(mod)
            return mod

        utils = _load_submodule("utils")
        pre_process = _load_submodule("pre_process")

        # Locate models on disk (downloads ~570 MB on first run)
        default_path = pystow.join("DECIMER-V2")
        model_urls = {
            "DECIMER": "https://zenodo.org/record/8300489/files/models.zip",
            "DECIMER_HandDrawn": "https://zenodo.org/records/10781330/files/DECIMER_HandDrawn_model.zip",
        }
        model_paths = utils.ensure_models(
            default_path=default_path, model_urls=model_urls,
        )

        # Load tokenizer (fast, ~0 s).
        # NOTE(security): this unpickles a file taken from the downloaded
        # model archive; pickle executes arbitrary code, so the archive
        # source must be trusted.
        tokenizer_path = os.path.join(
            model_paths["DECIMER"], "assets", "tokenizer_SMILES.pkl"
        )
        try:
            with open(tokenizer_path, "rb") as f:
                tokenizer = pickle.load(f)
        except ModuleNotFoundError:
            # Keras 2→3 compat: redirect keras.preprocessing.text
            class _K2Unpickler(pickle.Unpickler):
                def find_class(self, module, name):
                    if module.startswith("keras."):
                        module = module.replace("keras.", "tensorflow.keras.", 1)
                    return super().find_class(module, name)

            with open(tokenizer_path, "rb") as f:
                tokenizer = _K2Unpickler(f).load()

        # Load only the requested model (~25 s instead of ~50 s for both)
        model_key = "DECIMER_HandDrawn" if hand_drawn else "DECIMER"
        model = tf.saved_model.load(model_paths[model_key])

        def _predict(image_input, confidence=False, hand_drawn=False):
            """Predict SMILES from an image (numpy array or file path).

            ``hand_drawn`` is accepted for call compatibility only; the model
            used is the one bound when this predictor was created.
            """
            chemical_structure = pre_process.decode_image(image_input)
            predicted_tokens, confidence_values = model(
                tf.constant(chemical_structure)
            )
            token_ids = predicted_tokens[0].numpy()
            outputs = [tokenizer.index_word[i] for i in token_ids]
            smiles = (
                "".join(str(t) for t in outputs)
                .replace("<start>", "")
                .replace("<end>", "")
            )
            smiles = utils.decoder(smiles)

            if confidence:
                conf_pairs = [
                    (
                        utils.decoder(tokenizer.index_word[token_ids[i]]),
                        confidence_values[i].numpy(),
                    )
                    for i in range(len(confidence_values))
                ]
                # strip <start>/<end> tokens
                conf_pairs = conf_pairs[1:-1]
                return smiles, conf_pairs

            return smiles

        # Stamp the mode on the predictor BEFORE publishing it, so the
        # lock-free fast path always sees a matching (predictor, mode) pair.
        _predict.decimer_mode = requested
        _decimer_predict = _predict
        _decimer_mode = requested
        return _predict
248
+
249
+
250
+ # ---------------------------------------------------------------------------
251
+ # Image I/O
252
+ # ---------------------------------------------------------------------------
253
+
254
def load_image(path: str, page: int = 0) -> "np.ndarray":
    """
    Load an image from a PNG/JPG file or from a specific page of a PDF.

    Returns a BGR numpy array (H x W x 3) in OpenCV channel order: image
    files are read with ``cv2.imread`` and PDF pages are rendered as RGB
    and then converted to BGR.

    Parameters
    ----------
    path : path to a PNG/JPG image or a PDF (chosen by the ``.pdf`` extension)
    page : PDF page number (0-indexed); ignored for image files

    Raises
    ------
    RuntimeError      : OpenCV (or PyMuPDF, for PDF input) is not installed
    ValueError        : ``page`` is beyond the last page of the PDF
    FileNotFoundError : the image file could not be read by OpenCV
    """
    if not HAS_CV2:
        raise RuntimeError("opencv-python is required. Run: pip install opencv-python")

    ext = os.path.splitext(path)[1].lower()

    if ext == ".pdf":
        if not HAS_PYMUPDF:
            raise RuntimeError("PyMuPDF is required for PDF input. Run: pip install pymupdf")
        doc = pymupdf.open(path)
        # try/finally so the document is closed even when the page check or
        # the rendering raises.
        try:
            if page >= len(doc):
                raise ValueError(f"PDF has {len(doc)} pages; requested page {page} (0-indexed)")
            pg = doc[page]
            # Render at 150 DPI (matrix scale = 150/72)
            matrix = pymupdf.Matrix(150 / 72, 150 / 72)
            pix = pg.get_pixmap(matrix=matrix, alpha=False)
            arr = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
            # PyMuPDF returns RGB; convert to BGR for OpenCV
            return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
        finally:
            doc.close()

    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        raise FileNotFoundError(f"Cannot read image: {path}")
    return img
284
+
285
+
286
+ # ---------------------------------------------------------------------------
287
+ # Segmentation
288
+ # ---------------------------------------------------------------------------
289
+
290
+ _MIN_STRUCTURE_AREA_PX = 1500 # ignore regions smaller than this (noise)
291
+ _MIN_SIDE_PX = 40 # ignore regions thinner than this
292
+ _PADDING_PX = 12 # padding around detected bounding boxes
293
+ _MAX_AREA_FRACTION = 0.90 # ignore boxes covering >90% of image (whole-page)
294
+ _MIN_ASPECT_RATIO = 0.15 # reject very wide/tall thin strips (text lines, arrows)
295
+ _MAX_SMILES_LEN = 500 # truncated/garbage DECIMER outputs above this length
296
+
297
+
298
def _to_gray_binary(bgr: "np.ndarray") -> "np.ndarray":
    """Return an inverted Otsu mask of a BGR image (ink → 255, background → 0)."""
    grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    # Otsu picks the threshold itself; BINARY_INV makes the dark side (ink on
    # a white-background figure) the foreground of the mask.
    mode = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    _threshold_used, mask = cv2.threshold(grayscale, 0, 255, mode)
    return mask
304
+
305
+
306
+ def _merge_nearby_boxes(
307
+ boxes: List[Tuple[int, int, int, int]],
308
+ gap: int = 30,
309
+ ) -> List[Tuple[int, int, int, int]]:
310
+ """
311
+ Iteratively merge bounding boxes that are close together (within `gap` pixels).
312
+ This groups fragmented bond lines and labels that belong to one structure.
313
+ """
314
+ if not boxes:
315
+ return []
316
+
317
+ changed = True
318
+ while changed:
319
+ changed = False
320
+ merged: List[Tuple[int, int, int, int]] = []
321
+ used = [False] * len(boxes)
322
+ for i, (x0, y0, x1, y1) in enumerate(boxes):
323
+ if used[i]:
324
+ continue
325
+ # Expand by gap for proximity test
326
+ ex0, ey0, ex1, ey1 = x0 - gap, y0 - gap, x1 + gap, y1 + gap
327
+ for j, (ax0, ay0, ax1, ay1) in enumerate(boxes):
328
+ if used[j] or j == i:
329
+ continue
330
+ # Overlap test on expanded box
331
+ if ax0 <= ex1 and ax1 >= ex0 and ay0 <= ey1 and ay1 >= ey0:
332
+ x0 = min(x0, ax0)
333
+ y0 = min(y0, ay0)
334
+ x1 = max(x1, ax1)
335
+ y1 = max(y1, ay1)
336
+ ex0, ey0, ex1, ey1 = x0 - gap, y0 - gap, x1 + gap, y1 + gap
337
+ used[j] = True
338
+ changed = True
339
+ merged.append((x0, y0, x1, y1))
340
+ used[i] = True
341
+ boxes = merged
342
+ return boxes
343
+
344
+
345
+ def _adaptive_gap(boxes: List[Tuple[int, int, int, int]]) -> int:
346
+ """
347
+ Compute an adaptive merge gap based on inter-box distances.
348
+
349
+ Finds the minimum edge-to-edge distance between each pair of boxes,
350
+ then uses the median of these nearest-neighbour distances. The gap is
351
+ set to 50% of that median, clamped to [8, 40]. This prevents merging
352
+ truly distinct structures in dense figures while still grouping
353
+ fragments that belong to one molecule.
354
+
355
+ Falls back to 25 if there are fewer than 3 boxes.
356
+ """
357
+ n = len(boxes)
358
+ if n < 3:
359
+ return 25 # reasonable default for sparse images
360
+
361
+ # Compute minimum edge-to-edge distance for each box to its nearest neighbour
362
+ def _edge_dist(a: Tuple[int, int, int, int], b: Tuple[int, int, int, int]) -> float:
363
+ ax0, ay0, ax1, ay1 = a
364
+ bx0, by0, bx1, by1 = b
365
+ dx = max(0, max(ax0 - bx1, bx0 - ax1))
366
+ dy = max(0, max(ay0 - by1, by0 - ay1))
367
+ return (dx**2 + dy**2) ** 0.5
368
+
369
+ nn_dists = []
370
+ for i in range(n):
371
+ min_d = float("inf")
372
+ for j in range(n):
373
+ if i == j:
374
+ continue
375
+ d = _edge_dist(boxes[i], boxes[j])
376
+ if d < min_d:
377
+ min_d = d
378
+ nn_dists.append(min_d)
379
+
380
+ nn_dists.sort()
381
+ median_nn = nn_dists[len(nn_dists) // 2]
382
+
383
+ # Gap = 50% of median nearest-neighbour distance
384
+ gap = int(median_nn * 0.50)
385
+ return max(8, min(40, gap))
386
+
387
+
388
def segment_structures(
    bgr: "np.ndarray",
    merge_gap: Optional[int] = None,
) -> List[Tuple["np.ndarray", Tuple[int, int, int, int]]]:
    """
    Locate candidate chemical-structure regions in a BGR image.

    Returns a list of ``(cropped_bgr, (x0, y0, x1, y1))`` tuples.  The box is
    the padded crop rectangle clamped to the image.  Results are ordered by
    100-pixel row band of the box top, then by left edge.  When no contour
    survives filtering, the whole image is returned as a single region with
    box ``(0, 0, width, height)``.

    Parameters
    ----------
    bgr       : BGR image array (from OpenCV)
    merge_gap : pixel gap handed to the box merger.  None = adaptive
                (derived from nearest-neighbour distances between boxes).
                With 0 only touching or overlapping boxes are merged.

    Strategy: threshold → morphological close to bridge small gaps → external
    contours → filter by area / side length / aspect ratio → merge nearby
    boxes → crop with padding.

    Raises RuntimeError when OpenCV is not available.
    """
    if not HAS_CV2:
        raise RuntimeError("opencv-python is required.")

    img_h, img_w = bgr.shape[:2]
    area_ceiling = _MAX_AREA_FRACTION * (img_h * img_w)

    ink_mask = _to_gray_binary(bgr)

    # Closing joins ink pixels that sit close together (bond lines, letters)
    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    joined = cv2.morphologyEx(ink_mask, cv2.MORPH_CLOSE, close_kernel, iterations=3)

    outlines, _hierarchy = cv2.findContours(
        joined, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    candidates: List[Tuple[int, int, int, int]] = []
    for outline in outlines:
        left, top, box_w, box_h = cv2.boundingRect(outline)
        box_area = box_w * box_h
        too_small = (
            box_area < _MIN_STRUCTURE_AREA_PX
            or box_w < _MIN_SIDE_PX
            or box_h < _MIN_SIDE_PX
        )
        if too_small or box_area > area_ceiling:
            continue
        # Very elongated boxes are dropped (text lines, arrows)
        if min(box_w, box_h) / max(box_w, box_h) < _MIN_ASPECT_RATIO:
            continue
        candidates.append((left, top, left + box_w, top + box_h))

    if not candidates:
        # Nothing usable detected: hand back the full image as one region
        return [(bgr.copy(), (0, 0, img_w, img_h))]

    if merge_gap is None:
        gap = _adaptive_gap(candidates)
    else:
        gap = merge_gap
    grouped = _merge_nearby_boxes(candidates, gap=gap)

    # Row band (100 px) first, then column
    ordered = sorted(grouped, key=lambda box: (box[1] // 100, box[0]))

    regions = []
    for bx0, by0, bx1, by1 in ordered:
        # Pad the box, then clamp it to the image
        crop_x0 = max(0, bx0 - _PADDING_PX)
        crop_y0 = max(0, by0 - _PADDING_PX)
        crop_x1 = min(img_w, bx1 + _PADDING_PX)
        crop_y1 = min(img_h, by1 + _PADDING_PX)
        regions.append(
            (bgr[crop_y0:crop_y1, crop_x0:crop_x1], (crop_x0, crop_y0, crop_x1, crop_y1))
        )

    return regions
458
+
459
+
460
+ # ---------------------------------------------------------------------------
461
+ # SMILES → atom/bond data via RDKit
462
+ # ---------------------------------------------------------------------------
463
+
464
def _ring_double_bond_side(
    mol: "Chem.Mol",
    bond: "Chem.Bond",
) -> Optional[str]:
    """
    Choose the side ("Right" or "Left", relative to the begin→end direction)
    on which the second line of a ring double bond is drawn.

    Returns None when the bond is not in a ring, when no ring contains both
    bond atoms, or when the begin atom has no other neighbour in the chosen
    ring.

    Method: take the smallest ring containing both bond atoms, pick the
    first neighbour of the begin atom that is in that ring and is not the
    end atom, and evaluate the z-component of (end - begin) × (neighbour -
    begin) in the conformer's coordinates.  A positive value gives "Right",
    anything else (including exactly zero) gives "Left".
    NOTE(review): the sign-to-side mapping presumably anticipates the y-flip
    applied when coordinates are converted for CDXML — confirm against
    ChemDraw's DoublePosition convention.
    """
    if not bond.IsInRing():
        return None

    conformer = mol.GetConformer()
    begin_idx = bond.GetBeginAtomIdx()
    end_idx = bond.GetEndAtomIdx()
    begin_pos = conformer.GetAtomPosition(begin_idx)
    end_pos = conformer.GetAtomPosition(end_idx)
    begin_x, begin_y = begin_pos.x, begin_pos.y
    bond_dx = end_pos.x - begin_x
    bond_dy = end_pos.y - begin_y

    # Restrict to the smallest ring holding both atoms.  In fused systems
    # (e.g. thienopyrimidine) this keeps the offset pointing into the ring
    # the bond belongs to rather than into a larger enclosing ring.
    rings_with_bond = [
        ring
        for ring in mol.GetRingInfo().AtomRings()
        if begin_idx in ring and end_idx in ring
    ]
    if not rings_with_bond:
        return None
    ring_members = set(min(rings_with_bond, key=len))

    for neighbour in mol.GetAtomWithIdx(begin_idx).GetNeighbors():
        nbr_idx = neighbour.GetIdx()
        if nbr_idx == end_idx or nbr_idx not in ring_members:
            continue
        nbr_pos = conformer.GetAtomPosition(nbr_idx)
        # z of (bond vector) × (begin→neighbour vector)
        cross_z = bond_dx * (nbr_pos.y - begin_y) - bond_dy * (nbr_pos.x - begin_x)
        if cross_z > 0:
            return "Right"
        return "Left"

    return None
516
+
517
+
518
def _rdkit_mol_to_atom_bond_dicts(
    mol: "Chem.Mol",
    offset_index: int = 0,
) -> Tuple[List[Dict], List[Dict]]:
    """
    Turn an RDKit Mol carrying a 2D conformer into atom and bond dicts in the
    format consumed by coord_normalizer / cdxml_builder.

    Atom and bond indices are 1-based and shifted by `offset_index`, so
    several molecules can share one numbering space.

    Atom dicts hold "index", "symbol", "x", "y" (rounded to 4 decimals) plus
    "charge" when non-zero, "num_hydrogens" for every non-carbon atom, and
    "isotope" when set.  Bond dicts hold "index", "order", "atom1", "atom2",
    "cfg" (1 = wedge begin, 6 = dash begin, 0 otherwise) and, for ring
    double bonds, "double_pos".

    The mol is expected to have been Kekulized with clearAromaticFlags=True
    beforehand, so every bond is SINGLE/DOUBLE/TRIPLE.  ChemDraw 16 does not
    recognise Order="1.5" as an aromatic bond.  Any AROMATIC bond still
    present is written as order 2; unknown bond types as order 1.
    """
    conformer = mol.GetConformer()

    atom_dicts = []
    local_index: Dict[int, int] = {}  # RDKit 0-based idx → emitted 1-based idx
    for rd_idx, rd_atom in enumerate(mol.GetAtoms()):
        position = conformer.GetAtomPosition(rd_idx)
        out_idx = rd_idx + 1 + offset_index
        local_index[rd_idx] = out_idx
        symbol = rd_atom.GetSymbol()

        record: Dict = {
            "index": out_idx,
            "symbol": symbol,
            "x": round(float(position.x), 4),
            "y": round(float(position.y), 4),
        }
        formal_charge = rd_atom.GetFormalCharge()
        if formal_charge != 0:
            record["charge"] = formal_charge
        # GetTotalNumHs works even after Kekulize
        hydrogen_count = rd_atom.GetTotalNumHs(includeNeighbors=False)
        if symbol != "C":
            record["num_hydrogens"] = hydrogen_count
        mass_number = rd_atom.GetIsotope()
        if mass_number:
            record["isotope"] = mass_number
        atom_dicts.append(record)

    bond_orders = {
        Chem.BondType.SINGLE: 1,
        Chem.BondType.DOUBLE: 2,
        Chem.BondType.TRIPLE: 3,
        # AROMATIC should be gone after Kekulize; kept as a fallback
        Chem.BondType.AROMATIC: 2,
    }

    bond_dicts = []
    for out_bond_idx, rd_bond in enumerate(mol.GetBonds(), start=1 + offset_index):
        order = bond_orders.get(rd_bond.GetBondType(), 1)

        # Wedge/dash stereo marker taken from the bond direction
        direction = rd_bond.GetBondDir()
        if direction == Chem.BondDir.BEGINWEDGE:
            stereo_cfg = 1
        elif direction == Chem.BondDir.BEGINDASH:
            stereo_cfg = 6
        else:
            stereo_cfg = 0

        record = {
            "index": out_bond_idx,
            "order": order,
            "atom1": local_index[rd_bond.GetBeginAtomIdx()],
            "atom2": local_index[rd_bond.GetEndAtomIdx()],
            "cfg": stereo_cfg,
        }

        # Ring double bonds get a DoublePosition hint so the second line is
        # drawn on the intended side.
        if order == 2:
            side = _ring_double_bond_side(mol, rd_bond)
            if side:
                record["double_pos"] = side

        bond_dicts.append(record)

    return atom_dicts, bond_dicts
596
+
597
+
598
def smiles_to_coords(smiles: str, offset_index: int = 0) -> Optional[Dict]:
    """
    Build 2D atom/bond data for a SMILES string with RDKit.

    Returns ``{"atoms": [...], "bonds": [...]}`` in raw RDKit coordinates,
    or None when the SMILES is empty, is one of the placeholders
    "FAILED" / "N/A", cannot be parsed, or coordinate generation does not
    return 0.  `offset_index` is forwarded to the atom/bond numbering.

    Raises RuntimeError when RDKit is not available.
    """
    if not HAS_RDKIT:
        raise RuntimeError("RDKit is required. Activate the LLMChem conda environment.")

    if not smiles:
        return None
    if smiles.strip() in ("", "FAILED", "N/A"):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Lay out the heavy atoms directly.  An earlier AddHs → Compute2DCoords →
    # RemoveHs sequence made RDKit draw alkyl chains as straight lines
    # instead of a zigzag, because the explicit hydrogens were spaced out
    # around the backbone.
    if AllChem.Compute2DCoords(mol) != 0:
        return None

    # Kekulize only after the layout.  clearAromaticFlags=True makes
    # GetBondType() report SINGLE/DOUBLE rather than AROMATIC, which ChemDraw
    # needs (it has no Order="1.5").
    try:
        Chem.Kekulize(mol, clearAromaticFlags=True)
    except Exception:
        # Deliberate best effort: on failure the aromatic bonds are written
        # as order 2 by _rdkit_mol_to_atom_bond_dicts.
        pass

    atom_dicts, bond_dicts = _rdkit_mol_to_atom_bond_dicts(
        mol, offset_index=offset_index
    )
    return {"atoms": atom_dicts, "bonds": bond_dicts}
636
+
637
+
638
+ # ---------------------------------------------------------------------------
639
+ # Coordinate normalisation (inline, no import dependency on coord_normalizer)
640
+ # ---------------------------------------------------------------------------
641
+
642
+ from ..constants import (
643
+ ACS_BOND_LENGTH as ACS_BOND_LENGTH_PT,
644
+ CDXML_HEADER as _CDXML_HEADER,
645
+ CDXML_FOOTER as _CDXML_FOOTER,
646
+ ACS_LABEL_FONT, ACS_LABEL_SIZE, ACS_LABEL_FACE,
647
+ ACS_CAPTION_SIZE, ACS_HASH_SPACING, ACS_MARGIN_WIDTH,
648
+ ACS_LINE_WIDTH, ACS_BOLD_WIDTH, ACS_BOND_LENGTH_STR,
649
+ ACS_BOND_SPACING, ACS_CHAIN_ANGLE_STR,
650
+ )
651
+
652
+
653
+ def _average_bond_length(atoms: List[Dict], bonds: List[Dict]) -> float:
654
+ if not bonds:
655
+ return 1.0
656
+ xy = {a["index"]: (a["x"], a["y"]) for a in atoms}
657
+ lengths = [
658
+ math.hypot(
659
+ xy.get(b["atom1"], (0, 0))[0] - xy.get(b["atom2"], (0, 0))[0],
660
+ xy.get(b["atom1"], (0, 0))[1] - xy.get(b["atom2"], (0, 0))[1],
661
+ )
662
+ for b in bonds
663
+ if math.hypot(
664
+ xy.get(b["atom1"], (0, 0))[0] - xy.get(b["atom2"], (0, 0))[0],
665
+ xy.get(b["atom1"], (0, 0))[1] - xy.get(b["atom2"], (0, 0))[1],
666
+ ) > 1e-6
667
+ ]
668
+ return sum(lengths) / len(lengths) if lengths else 1.0
669
+
670
+
671
def normalize_for_cdxml(
    atoms: List[Dict],
    bonds: List[Dict],
    center_x: float = 200.0,
    center_y: float = 300.0,
) -> Tuple[List[Dict], List[Dict]]:
    """
    Convert layout coordinates to CDXML page coordinates (ACS 1996 bond length).

    Works on deep copies, so the inputs are left untouched.  Three steps:
    negate y (RDKit is y-up, CDXML is y-down), scale so the average bond
    length equals ACS_BOND_LENGTH_PT, then translate so the bounding-box
    centre lands on (`center_x`, `center_y`).  Final coordinates are rounded
    to 3 decimals.  An empty atom list is returned as-is.
    """
    out_atoms = deepcopy(atoms)
    out_bonds = deepcopy(bonds)

    if not out_atoms:
        return out_atoms, out_bonds

    # Mirror vertically
    for atom in out_atoms:
        atom["y"] = -atom["y"]

    # Rescale to the target bond length
    mean_length = _average_bond_length(out_atoms, out_bonds)
    if mean_length > 1e-6:
        factor = ACS_BOND_LENGTH_PT / mean_length
        for atom in out_atoms:
            atom["x"] *= factor
            atom["y"] *= factor

    # Move the bounding-box midpoint onto the requested centre
    x_values = [atom["x"] for atom in out_atoms]
    y_values = [atom["y"] for atom in out_atoms]
    mid_x = (min(x_values) + max(x_values)) / 2.0
    mid_y = (min(y_values) + max(y_values)) / 2.0
    for atom in out_atoms:
        atom["x"] = round(atom["x"] - mid_x + center_x, 3)
        atom["y"] = round(atom["y"] - mid_y + center_y, 3)

    return out_atoms, out_bonds
709
+
710
+
711
+ # ---------------------------------------------------------------------------
712
+ # Mass data enrichment
713
+ # ---------------------------------------------------------------------------
714
+
715
def enrich_with_mass_data(results: List[Dict]) -> None:
    """Add formula, mw, exact_mass, and adducts to each extracted structure.

    Mutates *results* in place.  Requires RDKit; silently skips if unavailable.
    Entries whose "smiles" is missing, None, blank, or unparsable are left
    untouched.

    "formula", "mw" and "exact_mass_full" describe the whole SMILES.  For a
    multi-fragment SMILES (e.g. a salt), "exact_mass" and the adduct m/z
    values are based on the fragment with the most heavy atoms.
    """
    if not HAS_RDKIT:
        return

    from rdkit.Chem import Descriptors, rdMolDescriptors

    # Mass shift applied to the neutral exact mass for each reported adduct.
    adduct_shifts = (
        ("[M+H]+", 1.00728),
        ("[M-H]-", -1.00728),
        ("[M+Na]+", 22.98922),
        ("[M+formate]-", 44.99820),
    )

    for entry in results:
        # `or ""` also covers an explicit None stored under "smiles", which
        # would otherwise fail on .strip().
        smiles = (entry.get("smiles") or "").strip()
        if not smiles:
            continue
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue

        exact_mass_full = Descriptors.ExactMolWt(mol)
        mw = Descriptors.MolWt(mol)
        formula = rdMolDescriptors.CalcMolFormula(mol)

        # Salt splitting: neutral = largest fragment
        frags = Chem.GetMolFrags(mol, asMols=True)
        if len(frags) > 1:
            neutral_mol = max(frags, key=lambda m: m.GetNumHeavyAtoms())
            exact_mass = Descriptors.ExactMolWt(neutral_mol)
        else:
            exact_mass = exact_mass_full

        entry["formula"] = formula
        entry["mw"] = round(mw, 4)
        entry["exact_mass"] = round(exact_mass, 5)
        entry["exact_mass_full"] = round(exact_mass_full, 5)
        entry["adducts"] = {
            label: round(exact_mass + shift, 5) for label, shift in adduct_shifts
        }
755
+
756
+
757
+ # ---------------------------------------------------------------------------
758
+ # Main extraction pipeline
759
+ # ---------------------------------------------------------------------------
760
+
761
def _extract_structures_raw(
    image_path: str,
    page: int = 0,
    segment: bool = True,
    hand_drawn: bool = False,
    verbose: bool = False,
    merge_gap: Optional[int] = None,
) -> List[Dict]:
    """
    Full pipeline: image → segmented crops → SMILES → 2D coords.

    This is the internal low-level function.  Call
    extract_structures_from_image() for the public API that returns
    structured JSON.

    Parameters
    ----------
    image_path : path to PNG/JPG/PDF
    page       : PDF page number (0-indexed); ignored for image files
    segment    : if False, treat whole image as one structure
    hand_drawn : use DECIMER hand-drawn model
    verbose    : print progress to stderr
    merge_gap  : pixel gap for merging nearby boxes during segmentation.
                 None = adaptive (based on median box size).  0 = no merging.

    Returns
    -------
    List of dicts, one per detected region:
        {
          "index": int,
          "smiles": str,              # "" when prediction failed or was discarded
          "confidence": float or None,
          "bbox": [x0, y0, x1, y1],
          "atoms": [...],             # [] when no coordinates could be built
          "bonds": [...]
        }
    The list is passed through enrich_with_mass_data() before it is
    returned, which may add further keys to each entry.

    A DECIMER failure on one region (on the direct call or on the temp-file
    fallback) is logged and yields an empty SMILES for that region; it does
    not abort the remaining regions.
    """
    def log(msg: str):
        if verbose:
            print(f"[structure_from_image] {msg}", file=sys.stderr)

    # 1. Load image
    log(f"Loading {image_path}" + (f" page {page}" if image_path.lower().endswith(".pdf") else ""))
    bgr = load_image(image_path, page=page)
    h, w = bgr.shape[:2]
    log(f"Image size: {w}x{h} px")

    # 2. Segment
    if segment:
        log("Segmenting structures...")
        regions = segment_structures(bgr, merge_gap=merge_gap)
        log(f"Found {len(regions)} candidate region(s)")
    else:
        regions = [(bgr.copy(), (0, 0, w, h))]
        log("Skipping segmentation (--no-segment)")

    # 3. Load DECIMER (deferred)
    log("Loading DECIMER model (may take a moment on first call)...")
    predict_fn = _load_decimer(hand_drawn=hand_drawn)

    # 4. Process each region
    results: List[Dict] = []
    atom_offset = 0

    for i, (crop, bbox) in enumerate(regions):
        log(f"Processing region {i+1}/{len(regions)} — bbox {bbox}")

        smiles = ""
        raw_confidence = None
        # The outer try is the single failure boundary for this region.  The
        # fallback lives in the *inner* try so that an error raised while
        # falling back is still caught here; an exception raised inside an
        # ``except`` clause is not handled by sibling clauses of that same
        # ``try`` statement.
        try:
            try:
                # Preferred call: in-memory crop with per-token confidence.
                result = predict_fn(crop, confidence=True, hand_drawn=hand_drawn)
            except TypeError:
                # The callable rejected that signature/argument type: retry
                # through a temporary PNG, first with and then without the
                # confidence keyword.
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tf:
                    tmp_path = tf.name
                try:
                    cv2.imwrite(tmp_path, crop)
                    try:
                        result = predict_fn(tmp_path, confidence=True)
                    except TypeError:
                        result = predict_fn(tmp_path)
                finally:
                    os.unlink(tmp_path)

            if isinstance(result, tuple):
                smiles, raw_confidence = result[0], result[1]
            else:
                smiles = result
        except Exception as exc:
            log(f" DECIMER failed: {exc}")
            smiles = ""
            raw_confidence = None

        smiles = smiles.strip() if smiles else ""

        # Sanity check: abnormally long SMILES usually means DECIMER is reading
        # text/arrows/noise rather than a chemical structure.
        if len(smiles) > _MAX_SMILES_LEN:
            log(f" SMILES too long ({len(smiles)} chars) — discarding as noise")
            smiles = ""
            raw_confidence = None

        # Compute a single confidence scalar from per-token scores
        confidence = _compute_confidence_score(raw_confidence)

        log(f" SMILES: {smiles or '(none)'}"
            + (f" confidence: {confidence:.3f}" if confidence is not None else ""))

        # 5. SMILES → 2D coordinates
        mol_data = None
        if smiles:
            mol_data = smiles_to_coords(smiles, offset_index=atom_offset)
            if mol_data is None:
                log(f" RDKit could not parse SMILES: {smiles}")
            else:
                # Normalise to ACS 1996 CDXML coords.
                # Use a fixed origin here; final placement is done in
                # results_to_cdxml() based on actual bounding boxes.
                atoms_norm, bonds_norm = normalize_for_cdxml(
                    mol_data["atoms"],
                    mol_data["bonds"],
                    center_x=200.0,
                    center_y=300.0,
                )
                mol_data["atoms"] = atoms_norm
                mol_data["bonds"] = bonds_norm
                atom_offset += len(mol_data["atoms"])

        entry: Dict = {
            "index": i,
            "smiles": smiles,
            "confidence": confidence,
            "bbox": list(bbox),
            "atoms": mol_data["atoms"] if mol_data else [],
            "bonds": mol_data["bonds"] if mol_data else [],
        }
        results.append(entry)

    # Enrich with mass data (formula, MW, exact_mass, adducts)
    enrich_with_mass_data(results)

    log(f"Done. {len(results)} structure(s) extracted.")
    return results
911
+
912
+
913
+ # ---------------------------------------------------------------------------
914
+ # Confidence scoring
915
+ # ---------------------------------------------------------------------------
916
+
917
def _compute_confidence_score(
    raw_confidence: Optional[list],
) -> Optional[float]:
    """
    Reduce DECIMER's per-token confidence list to a single scalar in [0, 1].

    Each item of *raw_confidence* is either a ``(token, score)`` pair (the
    second element is used) or a bare score.  Items that cannot be converted
    to ``float`` and NaN scores are ignored.  The remaining scores are
    clamped to ``[1e-9, 1.0]`` and combined with a geometric mean, which is
    more sensitive to low-confidence tokens than the arithmetic mean.

    Returns the geometric mean rounded to 4 decimals, or None when
    *raw_confidence* is empty/None or contains no usable score.
    """
    # Function-scope import keeps this helper independent of module imports.
    import math

    if not raw_confidence:
        return None

    scores = []
    for item in raw_confidence:
        is_pair = isinstance(item, (tuple, list)) and len(item) >= 2
        candidate = item[1] if is_pair else item
        try:
            value = float(candidate)
        except (TypeError, ValueError):
            continue
        if math.isnan(value):
            # NaN would poison the mean.
            continue
        # Lower clamp keeps log() finite; upper clamp enforces the [0, 1]
        # contract even if a score arrives slightly above 1.
        scores.append(min(max(value, 1e-9), 1.0))

    if not scores:
        return None

    # Geometric mean (log-space to avoid underflow)
    log_sum = sum(math.log(s) for s in scores)
    return round(math.exp(log_sum / len(scores)), 4)
953
+
954
+
955
+ # ---------------------------------------------------------------------------
956
+ # Nearby text label detection
957
+ # ---------------------------------------------------------------------------
958
+
959
def _detect_nearby_labels(
    bgr: "np.ndarray",
    structure_bboxes: List[Tuple[int, int, int, int]],
    search_margin: int = 80,
) -> List[Optional[str]]:
    """
    Detect text labels near each structure bounding box in the image.

    Uses a two-phase strategy:
      1. Find candidate text regions via OpenCV contours (elongated blobs of
         moderate area whose centre lies outside every structure box).
      2. OCR each candidate with the callable from _get_ocr_fn() and attach
         the text to the nearest structure (edge-to-edge distance) among
         those whose box, grown by *search_margin*, contains the blob centre.

    Parameters
    ----------
    bgr              : BGR image array
    structure_bboxes : list of (x0, y0, x1, y1) for each detected structure
    search_margin    : how many pixels outside the structure bbox to search
                       for associated text labels

    Returns
    -------
    List of str|None, one per structure.  Several labels for the same
    structure are joined with a space.  Every entry is None when OpenCV or
    an OCR backend is unavailable, or when no candidate text is found.
    """
    # Nothing to label: checked first so an empty request never needs OpenCV.
    if not structure_bboxes or not HAS_CV2:
        return [None] * len(structure_bboxes)

    h, w = bgr.shape[:2]

    # --- Phase 1: Find candidate text regions ---
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Use a small morphological kernel to preserve text character separations
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
    contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    max_area = 0.05 * h * w
    text_blobs: List[Tuple[int, int, int, int]] = []
    for cnt in contours:
        cx, cy, cw, ch = cv2.boundingRect(cnt)
        area = cw * ch

        # Skip tiny noise and huge blobs
        if area < 50 or area > max_area:
            continue

        # Text lines are wider than they are tall (aspect > 1.5), or are
        # narrow vertical labels.  Very square blobs are likely structure parts.
        aspect = max(cw, ch) / max(min(cw, ch), 1)
        if aspect < 1.5:
            continue

        # Skip blobs whose centre lies inside any structure bbox
        blob_cx = cx + cw // 2
        blob_cy = cy + ch // 2
        if any(
            sx0 <= blob_cx <= sx1 and sy0 <= blob_cy <= sy1
            for (sx0, sy0, sx1, sy1) in structure_bboxes
        ):
            continue

        text_blobs.append((cx, cy, cx + cw, cy + ch))

    if not text_blobs:
        return [None] * len(structure_bboxes)

    # --- Phase 2: Try OCR on the candidates ---
    ocr_fn = _get_ocr_fn()
    if ocr_fn is None:
        # No OCR available; nothing can be read from the candidates.
        return [None] * len(structure_bboxes)

    labels: List[Optional[str]] = [None] * len(structure_bboxes)

    for (bx0, by0, bx1, by1) in text_blobs:
        # The blob centre does not depend on the structure being tested.
        bcx, bcy = (bx0 + bx1) // 2, (by0 + by1) // 2

        best_dist = float("inf")
        best_idx = -1
        for si, (sx0, sy0, sx1, sy1) in enumerate(structure_bboxes):
            # Only structures whose margin-expanded bbox contains the centre
            inside_x = sx0 - search_margin <= bcx <= sx1 + search_margin
            inside_y = sy0 - search_margin <= bcy <= sy1 + search_margin
            if not (inside_x and inside_y):
                continue

            # Edge-to-edge distance between blob and structure boxes
            dx = max(0, max(sx0 - bx1, bx0 - sx1))
            dy = max(0, max(sy0 - by1, by0 - sy1))
            dist = (dx * dx + dy * dy) ** 0.5
            if dist < best_dist:
                best_dist = dist
                best_idx = si

        if best_idx < 0:
            continue

        # OCR the blob; label detection is best-effort, so a failing OCR
        # call just leaves this blob unread.
        try:
            text = ocr_fn(bgr[by0:by1, bx0:bx1]).strip()
        except Exception:
            text = None

        if text:
            # Append to existing label (a structure can have multiple labels)
            existing = labels[best_idx]
            labels[best_idx] = f"{existing} {text}".strip() if existing else text

    return labels
1090
+
1091
+
1092
def _get_ocr_fn():
    """
    Pick an OCR backend and return it as a callable ``f(bgr_crop) -> str``.

    pytesseract (together with PIL) is tried first, easyocr second.  The
    result is None when neither can be imported.
    """
    # Backend 1: pytesseract
    try:
        import pytesseract
        from PIL import Image as _PILImage
    except ImportError:
        pass
    else:
        def _tesseract_ocr(bgr_crop: "np.ndarray") -> str:
            # The crop is converted from BGR to RGB before PIL sees it.
            pil_img = _PILImage.fromarray(cv2.cvtColor(bgr_crop, cv2.COLOR_BGR2RGB))
            recognised = pytesseract.image_to_string(pil_img, config="--psm 7")
            return recognised.strip()

        return _tesseract_ocr

    # Backend 2: easyocr (the reader is created once and captured below)
    try:
        import easyocr

        _reader = easyocr.Reader(["en"], gpu=False, verbose=False)
    except ImportError:
        return None

    def _easyocr_ocr(bgr_crop: "np.ndarray") -> str:
        pieces = _reader.readtext(bgr_crop, detail=0)
        return " ".join(pieces).strip()

    return _easyocr_ocr
1127
+
1128
+
1129
+ # ---------------------------------------------------------------------------
1130
+ # Public API: extract_structures_from_image
1131
+ # ---------------------------------------------------------------------------
1132
+
1133
def extract_structures_from_image(
    image_path: str,
    page: int = 0,
    segment: bool = True,
    hand_drawn: bool = False,
    verbose: bool = False,
    merge_gap: Optional[int] = None,
    detect_labels: bool = True,
) -> Dict:
    """
    Extract all chemical structures from an image using DECIMER.

    Runs _extract_structures_raw() on a PNG, JPG, or PDF and repackages the
    outcome as a JSON-friendly dict.  Failures are reported through the
    returned dict (``ok=False`` plus ``error``) rather than raised, except
    for non-ImportError exceptions from the initial DECIMER load.

    Parameters
    ----------
    image_path    : path to PNG/JPG/PDF image file
    page          : PDF page index (0-based); ignored for raster images
    segment       : if True (default), split the image into individual
                    structure regions before prediction; set False when the
                    whole image is a single structure
    hand_drawn    : use the DECIMER hand-drawn model instead of the default
    verbose       : print progress messages to stderr
    merge_gap     : pixel gap for merging nearby segmentation boxes.
                    None = adaptive (median-based).  0 = no merging.
    detect_labels : if True (default), look for text labels near each
                    structure.  Labels stay null unless pytesseract or
                    easyocr is installed.

    Returns
    -------
    dict with the following keys:

        ok          (bool)  True on success, False on error
        image_path  (str)   Absolute path of the input image
        structures  (list)  One entry per detected structure:
            smiles      (str)        DECIMER-predicted SMILES (may be "")
            confidence  (float|null) Geometric-mean per-token DECIMER score,
                                     or null if unavailable
            bbox        (list)       [x0, y0, x1, y1] pixel coords in the
                                     input image
            label       (str|null)   Nearby text label found by OCR, or null
        error       (str)   Only present on failure (ok=False)

    Examples
    --------
    >>> from cdxml_toolkit.image.structure_from_image import extract_structures_from_image
    >>> result = extract_structures_from_image("scheme.png")
    >>> if result["ok"]:
    ...     for s in result["structures"]:
    ...         print(s["smiles"], s["confidence"])

    Notes
    -----
    - DECIMER models are downloaded to ~/.data/DECIMER-V2/ on first run (~570 MB).
    - Scores above ~0.85 are reliable; below ~0.70 the SMILES should be
      verified manually.
    - Install an OCR backend with: pip install pytesseract  or  pip install easyocr
    - For low-level access (List[Dict] including atoms/bonds), use
      _extract_structures_raw() directly.
    """
    abs_path = os.path.abspath(image_path)

    def _failure(message: str) -> Dict:
        # Common shape of every error response.
        return {
            "ok": False,
            "image_path": abs_path,
            "structures": [],
            "error": message,
        }

    # Guard: DECIMER is required
    try:
        _load_decimer(hand_drawn=hand_drawn)
    except ImportError as exc:
        return _failure(str(exc))

    # Guard: OpenCV is required for segmentation and label detection
    if not HAS_CV2:
        return _failure(
            "opencv-python is required. "
            "Install with: pip install opencv-python"
        )

    try:
        raw = _extract_structures_raw(
            image_path=image_path,
            page=page,
            segment=segment,
            hand_drawn=hand_drawn,
            verbose=verbose,
            merge_gap=merge_gap,
        )
    except FileNotFoundError as exc:
        return _failure(str(exc))
    except Exception as exc:
        return _failure(f"Extraction failed: {exc}")

    # Detect nearby text labels (spatial proximity + optional OCR)
    labels: List[Optional[str]] = [None] * len(raw)
    if detect_labels and HAS_CV2 and raw:
        try:
            image = load_image(image_path, page=page)
            labels = _detect_nearby_labels(  # type: ignore[arg-type]
                image, [tuple(item["bbox"]) for item in raw]
            )
        except Exception:
            # Label detection is best-effort; never fail the whole extraction
            labels = [None] * len(raw)

    return {
        "ok": True,
        "image_path": abs_path,
        "structures": [
            {
                "smiles": item.get("smiles", ""),
                "confidence": item.get("confidence"),
                "bbox": item.get("bbox", []),
                "label": found,
            }
            for item, found in zip(raw, labels)
        ],
    }
1274
+
1275
+
1276
+ # ---------------------------------------------------------------------------
1277
+ # CDXML output (optional, wraps cdxml_builder)
1278
+ # ---------------------------------------------------------------------------
1279
+
1280
def _format_cdxml_header(bbox: str) -> str:
    """Fill the _CDXML_HEADER template with *bbox* and the ACS style constants."""
    style_fields = {
        "label_font": ACS_LABEL_FONT,
        "label_size": ACS_LABEL_SIZE,
        "label_face": ACS_LABEL_FACE,
        "caption_size": ACS_CAPTION_SIZE,
        "hash_spacing": ACS_HASH_SPACING,
        "margin_width": ACS_MARGIN_WIDTH,
        "line_width": ACS_LINE_WIDTH,
        "bold_width": ACS_BOLD_WIDTH,
        "bond_length": ACS_BOND_LENGTH_STR,
        "bond_spacing": ACS_BOND_SPACING,
        "chain_angle": ACS_CHAIN_ANGLE_STR,
    }
    return _CDXML_HEADER.format(bbox=bbox, **style_fields)
1296
+
1297
+
1298
def _best_smiles_component(smiles: str) -> str:
    """
    Pick the most structure-like component of a dot-separated SMILES.

    This is a purely textual heuristic; no RDKit validation is performed.
    Each component is scored as

        10 * heavy_atoms + 50 (has a non-carbon heavy atom) + 30 (ring/stereo marker)

    and the highest score wins (first one on ties).  Components with fewer
    than 3 heavy atoms are ignored.  Heavy atoms are counted from SMILES atom
    tokens, so aromatic lowercase atoms (``c``, ``n`` ...) count, two-letter
    halogens count once, and bracket hydrogens (``[C@H]``, ``[H+]``) are not
    counted as heavy atoms.

    Returns *smiles* unchanged when it has no ``.``; falls back to the first
    component when no component qualifies.
    """
    import re

    if "." not in smiles:
        return smiles

    # Bracket atom, two-letter halogen, organic-subset atom, aromatic atom.
    atom_token = re.compile(r"\[[^\]]*\]|Br|Cl|[BCNOPSFI]|[bcnops]")
    # Element symbol inside a bracket atom, after an optional isotope number.
    bracket_symbol = re.compile(r"\[\d*([A-Z][a-z]?|[a-z]{1,2})")

    best = ""
    best_score = -1

    for part in smiles.split("."):
        part = part.strip()
        if not part:
            continue

        symbols = []
        for token in atom_token.findall(part):
            if token.startswith("["):
                found = bracket_symbol.match(token)
                symbol = found.group(1) if found else ""
            else:
                symbol = token
            if symbol and symbol != "H":
                symbols.append(symbol)

        heavy = len(symbols)
        if heavy < 3:
            continue

        # Penalise pure hydrocarbon chains (no rings, no heteroatoms)
        has_heteroatom = any(symbol.lower() != "c" for symbol in symbols)
        has_ring = any(marker in part for marker in "123@")
        score = heavy * 10 + (50 if has_heteroatom else 0) + (30 if has_ring else 0)
        if score > best_score:
            best = part
            best_score = score

    return best if best else smiles.split(".")[0]
1329
+
1330
+
1331
def _translate_atoms_xml(frag_xml: str, dx: float, dy: float) -> str:
    """
    Return *frag_xml* with its coordinate attributes moved by (dx, dy).

    Two attribute forms are rewritten: ``p="x y"`` and
    ``BoundingBox="x1 y1 x2 y2"``.  Shifted values are written with three
    decimals; everything else in the string is left as it was.
    """
    import re

    def _moved_point(match: "re.Match") -> str:
        px = float(match.group(1))
        py = float(match.group(2))
        return f'p="{px + dx:.3f} {py + dy:.3f}"'

    def _moved_box(match: "re.Match") -> str:
        numbers = [float(token) for token in match.group(1).split()]
        # x values move by dx, y values by dy, in x1 y1 x2 y2 order.
        deltas = (dx, dy, dx, dy)
        moved = [f"{numbers[k] + deltas[k]:.3f}" for k in range(4)]
        return f'BoundingBox="{" ".join(moved)}"'

    with_points = re.sub(r'\bp="([-\d.]+)\s+([-\d.]+)"', _moved_point, frag_xml)
    return re.sub(r'\bBoundingBox="((?:[-\d.]+ ?){4})"', _moved_box, with_points)
1354
+
1355
+
1356
def results_to_cdxml(results: List[Dict]) -> str:
    """
    Convert extracted structures to a CDXML document (multiple molecules on one page).

    Each entry with atoms is placed left-to-right, spaced by its actual atom
    bounding box (plus LABEL_PAD on every side).  Every molecule is built at
    its normalised position and then translated so its bounding-box centre
    lands at (x_cursor + half_width, ROW_Y).

    Multi-component SMILES (dot-separated) are reduced to the component
    chosen by _best_smiles_component() and their coordinates re-derived
    before building; if that re-derivation fails the entry's original
    atoms/bonds are used.

    cdxml_builder is loaded from ``cdxml_builder.py`` next to this file
    when that works, otherwise from the ``cdxml_toolkit`` package.

    Returns the CDXML text, or "" when no entry produced a fragment.
    Raises ImportError when cdxml_builder cannot be loaded either way.
    """
    import importlib
    import importlib.util
    import xml.etree.ElementTree as ET

    _dir = os.path.dirname(os.path.abspath(__file__))
    try:
        spec = importlib.util.spec_from_file_location(
            "cdxml_builder", os.path.join(_dir, "cdxml_builder.py")
        )
        cdxml_builder = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(cdxml_builder)
    except Exception as exc:
        # NOTE(review): the package listing shows cdxml_builder.py at the
        # package root rather than beside this module, so fall back to the
        # installed package before giving up; confirm the intended layout.
        try:
            cdxml_builder = importlib.import_module("cdxml_toolkit.cdxml_builder")
        except Exception:
            raise ImportError(f"Could not import cdxml_builder.py: {exc}") from exc

    PAGE_MARGIN = 36.0  # pt from page left edge to first atom bbox left
    MOL_GAP = 40.0      # pt gap between adjacent molecule bounding boxes
    ROW_Y = 300.0       # y-centre for the row of molecules
    LABEL_PAD = 10.0    # extra pt added around atom bbox for labels

    # --- Build each molecule, measure its atom bbox, then place ---
    placed_fragments: List[str] = []
    half_heights: List[float] = []
    x_cursor = PAGE_MARGIN
    start_id = 1000

    for entry in results:
        atoms = entry.get("atoms", [])
        bonds = entry.get("bonds", [])
        if not atoms:
            continue

        # If this entry came from a multi-component SMILES, re-derive coords
        # from only the best component so we don't get a stacked mess.
        smiles = entry.get("smiles", "")
        if smiles and "." in smiles:
            best = _best_smiles_component(smiles)
            if best != smiles:
                mol_data = smiles_to_coords(best, offset_index=0)
                if mol_data:
                    atoms, bonds = normalize_for_cdxml(
                        mol_data["atoms"], mol_data["bonds"],
                        center_x=200.0, center_y=300.0,
                    )

        # Measure actual atom coordinate bounding box
        xs = [a["x"] for a in atoms]
        ys = [a["y"] for a in atoms]
        atom_xmin, atom_xmax = min(xs), max(xs)
        atom_ymin, atom_ymax = min(ys), max(ys)
        # Slot size: padded bbox, but never narrower/shorter than two bonds
        mol_w = max((atom_xmax - atom_xmin) + LABEL_PAD * 2, ACS_BOND_LENGTH_PT * 2)
        mol_h = max((atom_ymax - atom_ymin) + LABEL_PAD * 2, ACS_BOND_LENGTH_PT * 2)

        # Build fragment XML at the atoms' current coordinates
        cdxml_str = cdxml_builder.build_molecule_cdxml(atoms, bonds, start_id=start_id)
        root = ET.fromstring(cdxml_str)
        page_el = root.find("page")
        if page_el is None:
            continue
        frag_xmls = [ET.tostring(f, encoding="unicode") for f in page_el.findall("fragment")]
        if not frag_xmls:
            continue

        # Translation from the built bbox centre to the centre of this slot
        dx = (x_cursor + mol_w / 2.0) - (atom_xmin + atom_xmax) / 2.0
        dy = ROW_Y - (atom_ymin + atom_ymax) / 2.0

        for fxml in frag_xmls:
            placed_fragments.append(_translate_atoms_xml(fxml, dx, dy))

        half_heights.append(mol_h / 2.0)
        x_cursor += mol_w + MOL_GAP
        # Advance the id base before the next molecule is built
        start_id += len(atoms) * 3 + 200

    if not placed_fragments:
        return ""

    page_width = x_cursor - MOL_GAP + PAGE_MARGIN
    page_height = ROW_Y + max(half_heights) + PAGE_MARGIN
    page_bb = f"0 0 {page_width:.1f} {page_height:.1f}"
    page_content = "\n ".join(placed_fragments)

    return (
        _format_cdxml_header(page_bb) + "\n"
        f'<page BoundingBox="{page_bb}">\n'
        f' {page_content}\n'
        '</page>\n'
        + _CDXML_FOOTER + "\n"
    )
1466
+
1467
+
1468
def results_to_cdxml_chemscript(
    results: List[Dict],
    verbose: bool = False,
) -> str:
    """
    Convert extracted structures to CDXML using ChemScript for cleanup.

    For each entry with a non-empty SMILES (reduced to its best component
    when dot-separated), the bridge's smiles_to_cdxml() is called and every
    ``<fragment>`` in the reply is collected.  The fragments are then laid
    out left-to-right on a single page.

    A structure whose conversion fails, or whose reply is empty or not
    parseable as XML, is logged (when *verbose*) and skipped; the remaining
    structures are still processed.

    chemscript_bridge is loaded from ``chemscript_bridge.py`` next to this
    file when that works, otherwise from the ``cdxml_toolkit.chemdraw``
    package.

    Returns the CDXML text, or "" when no fragment was produced.
    Raises ImportError when chemscript_bridge cannot be loaded either way.
    """
    import importlib
    import importlib.util
    import xml.etree.ElementTree as ET

    def log(msg: str):
        if verbose:
            print(f"[structure_from_image] {msg}", file=sys.stderr)

    # Import chemscript_bridge
    _dir = os.path.dirname(os.path.abspath(__file__))
    try:
        spec = importlib.util.spec_from_file_location(
            "chemscript_bridge", os.path.join(_dir, "chemscript_bridge.py")
        )
        csb_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(csb_module)
    except Exception as exc:
        # NOTE(review): the package listing shows chemscript_bridge.py under
        # cdxml_toolkit/chemdraw/, so fall back to the installed package
        # before giving up; confirm the intended layout.
        try:
            csb_module = importlib.import_module(
                "cdxml_toolkit.chemdraw.chemscript_bridge"
            )
        except Exception:
            raise ImportError(
                f"Could not import chemscript_bridge.py: {exc}\n"
                "The --cleanup flag requires ChemDraw and chemscript_bridge."
            ) from exc

    PAGE_MARGIN = 36.0
    MOL_GAP = 40.0
    ROW_Y = 300.0
    LABEL_PAD = 10.0

    # --- Build each molecule via ChemScript, extract fragment, measure bbox ---
    log("Opening ChemScript bridge...")
    cs = csb_module.ChemScriptBridge()

    frag_data: List[Tuple[str, float, float, float, float]] = []
    # Each item: (fragment_xml, xmin, ymin, xmax, ymax)

    try:
        for entry in results:
            smiles = entry.get("smiles", "").strip()
            if not smiles:
                continue

            # For multi-component SMILES, pick the best fragment
            if "." in smiles:
                smiles = _best_smiles_component(smiles)

            log(f" ChemScript: {smiles[:60]}...")
            try:
                cdxml_str = cs.smiles_to_cdxml(smiles)
            except Exception as exc:
                log(f" ChemScript failed for {smiles[:40]}: {exc}")
                continue

            if not cdxml_str or "<CDXML" not in cdxml_str:
                log(" ChemScript returned empty CDXML")
                continue

            # Parse the CDXML; one unparseable reply must not discard the
            # fragments already collected for other structures.
            try:
                root = ET.fromstring(cdxml_str)
            except ET.ParseError as exc:
                log(f" Could not parse ChemScript CDXML for {smiles[:40]}: {exc}")
                continue
            page_el = root.find("page")
            if page_el is None:
                continue

            for frag in page_el.findall("fragment"):
                frag_xml = ET.tostring(frag, encoding="unicode")

                # Measure atom positions from <n> elements
                xs, ys = [], []
                for n in frag.findall("n"):
                    p = n.get("p")
                    if p:
                        parts = p.split()
                        if len(parts) >= 2:
                            xs.append(float(parts[0]))
                            ys.append(float(parts[1]))

                if not xs:
                    continue

                frag_data.append((
                    frag_xml,
                    min(xs), min(ys), max(xs), max(ys),
                ))
    finally:
        # Always release the bridge, even if a structure raised.
        cs.close()

    if not frag_data:
        return ""

    # --- Lay out fragments left-to-right ---
    placed_fragments: List[str] = []
    half_heights: List[float] = []
    x_cursor = PAGE_MARGIN

    for frag_xml, xmin, ymin, xmax, ymax in frag_data:
        # Slot size: padded bbox, but never narrower/shorter than two bonds
        mol_w = max((xmax - xmin) + LABEL_PAD * 2, ACS_BOND_LENGTH_PT * 2)
        mol_h = max((ymax - ymin) + LABEL_PAD * 2, ACS_BOND_LENGTH_PT * 2)

        # Translation from the fragment's bbox centre to the slot centre
        dx = (x_cursor + mol_w / 2.0) - (xmin + xmax) / 2.0
        dy = ROW_Y - (ymin + ymax) / 2.0

        placed_fragments.append(_translate_atoms_xml(frag_xml, dx, dy))
        half_heights.append(mol_h / 2.0)
        x_cursor += mol_w + MOL_GAP

    page_width = x_cursor - MOL_GAP + PAGE_MARGIN
    page_height = ROW_Y + max(half_heights) + PAGE_MARGIN
    page_bb = f"0 0 {page_width:.1f} {page_height:.1f}"
    page_content = "\n ".join(placed_fragments)

    return (
        _format_cdxml_header(page_bb) + "\n"
        f'<page BoundingBox="{page_bb}">\n'
        f' {page_content}\n'
        '</page>\n'
        + _CDXML_FOOTER + "\n"
    )
1610
+
1611
+
1612
+ # ---------------------------------------------------------------------------
1613
+ # CLI
1614
+ # ---------------------------------------------------------------------------
1615
+
1616
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this script.

    The help epilog is the "Usage" section of the module docstring (the text
    between the ``Usage`` heading and the first occurrence of "Notes").
    When the module has no docstring, or the docstring has no such section,
    the parser is built without an epilog instead of failing.
    """
    # __doc__ is None when docstrings are stripped (python -OO).
    module_doc = __doc__ or ""
    sections = module_doc.split("Notes")[0].split("Usage\n-----")
    epilog = sections[1].strip() if len(sections) > 1 else None

    p = argparse.ArgumentParser(
        prog="structure_from_image.py",
        description="Extract chemical structures from images using DECIMER.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )
    p.add_argument("--input", "-i", required=True,
                   help="Input image (PNG/JPG) or PDF file")
    p.add_argument("--output", "-o", default="-",
                   help="Output file path; '-' writes JSON to stdout (default)")
    p.add_argument("--page", type=int, default=0,
                   help="PDF page to process, 0-indexed (default: 0)")
    p.add_argument("--format", choices=["json", "cdxml"], default="json",
                   help="Output format (default: json)")
    p.add_argument("--no-segment", dest="segment", action="store_false",
                   help="Treat whole image as a single structure (skip segmentation)")
    p.add_argument("--hand-drawn", action="store_true",
                   help="Use DECIMER hand-drawn model")
    p.add_argument("--cleanup", action="store_true",
                   help="Use ChemScript to clean up structures — produces "
                        "ChemDraw-native coordinates, proper aromaticity, "
                        "and ACS 1996 style (requires ChemDraw + chemscript_bridge)")
    p.add_argument("--gap", type=int, default=None,
                   help="Merge gap in pixels for segmentation box merging. "
                        "Default: adaptive (based on image density). "
                        "Use 0 to disable merging entirely.")
    p.add_argument("--verbose", "-v", action="store_true",
                   help="Print progress messages to stderr")
    return p
1646
+
1647
+
1648
def main(argv: Optional[List[str]] = None) -> int:
    """Run the CLI: extract structures from the input and emit JSON or CDXML.

    Args:
        argv: Argument list to parse.  ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        Exit status: ``0`` on success, ``1`` on any failure (input file
        missing, OpenCV unavailable, extraction or CDXML construction error,
        no structures to write as CDXML, or the output file cannot be
        written).
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    def _report_exception(message: str) -> int:
        """Print *message* to stderr, plus the active traceback if verbose; return 1."""
        print(message, file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc(file=sys.stderr)
        return 1

    if not os.path.isfile(args.input):
        print(f"ERROR: Input file not found: {args.input}", file=sys.stderr)
        return 1

    if not HAS_CV2:
        print("ERROR: opencv-python not installed. Run: pip install opencv-python",
              file=sys.stderr)
        return 1

    try:
        results = extract_structures_from_image(
            image_path=args.input,
            page=args.page,
            segment=args.segment,
            hand_drawn=args.hand_drawn,
            verbose=args.verbose,
            merge_gap=args.gap,
        )
    except Exception as exc:
        # Top-level CLI boundary: report any extraction failure and exit non-zero.
        return _report_exception(f"ERROR: {exc}")

    # Format output
    if args.format == "cdxml":
        try:
            if args.cleanup:
                output_str = results_to_cdxml_chemscript(
                    results, verbose=args.verbose,
                )
            else:
                output_str = results_to_cdxml(results)
        except Exception as exc:
            return _report_exception(f"ERROR building CDXML: {exc}")
        # An empty string from the CDXML builders means nothing could be placed.
        if not output_str:
            print("WARNING: No valid structures to write to CDXML.", file=sys.stderr)
            return 1
    else:
        output_str = json.dumps(results, indent=2)

    # Write output
    if args.output == "-":
        print(output_str)
        return 0

    try:
        with open(args.output, "w", encoding="utf-8") as fh:
            fh.write(output_str)
    except OSError as exc:
        # Report unwritable paths (missing directory, permissions, ...) the
        # same way as every other failure instead of leaking a raw traceback.
        return _report_exception(
            f"ERROR: Cannot write output file {args.output}: {exc}"
        )

    if args.verbose:
        print(f"Wrote {args.output}", file=sys.stderr)

    return 0
1708
+
1709
+
1710
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())