cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1711 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
structure_from_image.py — Extract chemical structures from images using DECIMER.
|
|
4
|
+
|
|
5
|
+
Takes a PNG/JPG image or a PDF page and returns SMILES + 2D atom coordinates
|
|
6
|
+
for every detected chemical structure.
|
|
7
|
+
|
|
8
|
+
Pipeline
|
|
9
|
+
--------
|
|
10
|
+
1. Input : image file (PNG/JPG) or PDF (one page extracted per run)
|
|
11
|
+
2. Segment: detect and crop individual structure regions using OpenCV
|
|
12
|
+
(white-background connected-component / contour approach)
|
|
13
|
+
3. DECIMER: convert each cropped image to SMILES
|
|
14
|
+
(DECIMER Image Transformer v2, ~285 MB model, downloaded on first run)
|
|
15
|
+
4. RDKit : SMILES → 2D coordinates (Compute2DCoords)
|
|
16
|
+
5. Output : JSON with SMILES + normalised atom/bond data per structure,
|
|
17
|
+
ready for cdxml_builder.py; optionally write CDXML directly.
|
|
18
|
+
|
|
19
|
+
Usage
|
|
20
|
+
-----
|
|
21
|
+
Single image, JSON output:
|
|
22
|
+
python -m cdxml_toolkit.image.structure_from_image --input image.png --output structures.json
|
|
23
|
+
|
|
24
|
+
PDF page (0-indexed):
|
|
25
|
+
python -m cdxml_toolkit.image.structure_from_image --input paper.pdf --page 0 --output out.json
|
|
26
|
+
|
|
27
|
+
Pipe straight to CDXML builder (multi-molecule page):
|
|
28
|
+
python -m cdxml_toolkit.image.structure_from_image --input image.png | python cdxml_builder.py --mode multi
|
|
29
|
+
|
|
30
|
+
Hand-drawn structures (uses DECIMER hand-drawn model):
|
|
31
|
+
python -m cdxml_toolkit.image.structure_from_image --input sketch.png --hand-drawn
|
|
32
|
+
|
|
33
|
+
Skip segmentation (whole image is one structure):
|
|
34
|
+
python -m cdxml_toolkit.image.structure_from_image --input single_structure.png --no-segment
|
|
35
|
+
|
|
36
|
+
Output JSON format
|
|
37
|
+
------------------
|
|
38
|
+
[
|
|
39
|
+
{
|
|
40
|
+
"index": 0,
|
|
41
|
+
"smiles": "c1ccccc1",
|
|
42
|
+
"bbox": [x0, y0, x1, y1], # pixel coords in the input image
|
|
43
|
+
"atoms": [
|
|
44
|
+
{"index": 1, "symbol": "C", "x": 200.0, "y": 300.0},
|
|
45
|
+
...
|
|
46
|
+
],
|
|
47
|
+
"bonds": [
|
|
48
|
+
{"index": 1, "order": 1, "atom1": 1, "atom2": 2},
|
|
49
|
+
...
|
|
50
|
+
]
|
|
51
|
+
},
|
|
52
|
+
...
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
Notes
|
|
56
|
+
-----
|
|
57
|
+
- DECIMER models download to ~/.data/DECIMER-V2/ on first run (~570 MB total).
|
|
58
|
+
- TensorFlow 2.20 prints hardware-capability warnings to stderr; these are harmless.
|
|
59
|
+
- Segmentation uses an OpenCV contour approach tuned for white-background publication
|
|
60
|
+
figures. For densely packed figures (multiple overlapping structures) it works best
|
|
61
|
+
on clean, high-resolution images (≥150 DPI equivalent).
|
|
62
|
+
- Coordinates are normalised to ACS 1996 style (bond length 14.40 pt) by
|
|
63
|
+
coord_normalizer.normalize_coords().
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
import argparse
|
|
67
|
+
import json
|
|
68
|
+
import math
|
|
69
|
+
import os
|
|
70
|
+
import sys
|
|
71
|
+
import tempfile
|
|
72
|
+
from copy import deepcopy
|
|
73
|
+
from typing import Dict, List, Optional, Tuple
|
|
74
|
+
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
# Optional heavy imports (warn gracefully if missing)
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
|
|
79
|
+
try:
|
|
80
|
+
import cv2
|
|
81
|
+
import numpy as np
|
|
82
|
+
HAS_CV2 = True
|
|
83
|
+
except ImportError:
|
|
84
|
+
HAS_CV2 = False
|
|
85
|
+
|
|
86
|
+
try:
|
|
87
|
+
from PIL import Image
|
|
88
|
+
HAS_PIL = True
|
|
89
|
+
except ImportError:
|
|
90
|
+
HAS_PIL = False
|
|
91
|
+
|
|
92
|
+
try:
|
|
93
|
+
import pymupdf # PyMuPDF — PDF rendering
|
|
94
|
+
HAS_PYMUPDF = True
|
|
95
|
+
except ImportError:
|
|
96
|
+
HAS_PYMUPDF = False
|
|
97
|
+
|
|
98
|
+
try:
|
|
99
|
+
from rdkit import Chem
|
|
100
|
+
from rdkit.Chem import AllChem
|
|
101
|
+
HAS_RDKIT = True
|
|
102
|
+
except ImportError:
|
|
103
|
+
HAS_RDKIT = False
|
|
104
|
+
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
106
|
+
# DECIMER lazy loader — loads ONE model on demand, not both at import time.
|
|
107
|
+
#
|
|
108
|
+
# Upstream DECIMER eagerly loads both the standard AND hand-drawn models
|
|
109
|
+
# (~332 MB each) at import time via module-level get_models(). This takes
|
|
110
|
+
# ~50 s on CPU. We bypass that by loading only the model we need, only
|
|
111
|
+
# when predict is first called, cutting cold-start roughly in half.
|
|
112
|
+
# ---------------------------------------------------------------------------
|
|
113
|
+
import threading as _threading
|
|
114
|
+
|
|
115
|
+
# Module-level cache shared by every call to _load_decimer().
# Prediction callable built by _load_decimer(); None until a model is loaded.
_decimer_predict = None
# Which model the cached callable wraps: "standard" or "hand_drawn"
# (None until a model is loaded).
_decimer_mode = None
# Held by _load_decimer() while it loads a model and updates the two
# variables above.
_decimer_lock = _threading.Lock()
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _load_decimer(hand_drawn: bool = False):
    """Lazy-load DECIMER, loading only the requested model (not both).

    On first call, loads TensorFlow + one DECIMER SavedModel (~25 s instead
    of ~50 s).  Subsequent calls with the same ``hand_drawn`` flag return
    the cached callable.  If the flag changes, the other model is loaded on
    demand and replaces the cached one.

    Parameters
    ----------
    hand_drawn : load the hand-drawn model instead of the standard one.

    Returns
    -------
    A callable ``predict(image_input, confidence=False, hand_drawn=False)``
    that returns a SMILES string, or ``(smiles, [(token, confidence), ...])``
    when ``confidence`` is true.  The callable's own ``hand_drawn`` argument
    is accepted but ignored: the model is fixed when the callable is built.

    Raises
    ------
    ImportError : TensorFlow, pystow or the DECIMER package is missing, or
        a DECIMER helper submodule cannot be loaded.

    Thread-safe: a lock prevents duplicate loads when the MCP server's
    background preload thread and a tool call race.
    """
    global _decimer_predict, _decimer_mode
    requested = "hand_drawn" if hand_drawn else "standard"

    # Lock-free fast path.  The mode is read from an attribute stored on the
    # callable itself, so a single read of the global yields a consistent
    # (callable, mode) pair.  Comparing the two separate globals here could
    # pair a freshly published callable with the previous mode while another
    # thread is switching models.
    cached = _decimer_predict
    if cached is not None and getattr(cached, "loaded_mode", None) == requested:
        return cached

    with _decimer_lock:
        # Double-check after acquiring the lock (another thread may have
        # finished loading while we waited).  Both globals are only written
        # while the lock is held, so comparing them here is safe.
        if _decimer_predict is not None and _decimer_mode == requested:
            return _decimer_predict

        try:
            import tensorflow as tf
            import pystow
        except ImportError as exc:
            raise ImportError(
                "DECIMER is not installed. Run:\n"
                "  pip install cdxml-toolkit[decimer]\n"
                f"Original error: {exc}"
            ) from exc

        # Bypass DECIMER's __init__.py which eagerly loads BOTH models (~50 s).
        # Instead, load only the two helper submodules we need (utils,
        # pre_process) via importlib, then tf.saved_model.load for ONE model.
        import importlib.util
        import pickle
        import sys
        import types

        # Reuse the package directory of an already-registered DECIMER module
        # (the real package, or the stub registered by an earlier call).
        # importlib.util.find_spec() raises ValueError for a module in
        # sys.modules whose __spec__ is None, which is what a bare
        # types.ModuleType stub has, so it must not be called once the stub
        # is registered; otherwise switching models fails.
        registered = sys.modules.get("DECIMER")
        registered_paths = list(getattr(registered, "__path__", None) or [])
        if registered_paths:
            pkg_dir = registered_paths[0]
        else:
            spec = importlib.util.find_spec("DECIMER")
            if spec is None or spec.submodule_search_locations is None:
                raise ImportError("DECIMER package not found")
            pkg_dir = spec.submodule_search_locations[0]

            # Register a stub DECIMER package so submodule imports resolve
            if "DECIMER" not in sys.modules:
                stub = types.ModuleType("DECIMER")
                stub.__path__ = [pkg_dir]
                stub.__package__ = "DECIMER"
                # Keep the real spec so later find_spec("DECIMER") calls work.
                stub.__spec__ = spec
                sys.modules["DECIMER"] = stub

        def _load_submodule(name):
            """Import DECIMER/<name>.py as ``DECIMER.<name>`` without running the package __init__."""
            fqn = f"DECIMER.{name}"
            if fqn in sys.modules:
                return sys.modules[fqn]
            sub_spec = importlib.util.spec_from_file_location(
                fqn, os.path.join(pkg_dir, f"{name}.py"),
            )
            if sub_spec is None or sub_spec.loader is None:
                raise ImportError(
                    f"Cannot load DECIMER submodule {name!r} from {pkg_dir}"
                )
            mod = importlib.util.module_from_spec(sub_spec)
            # Registered before execution so imports of this module made
            # while it is executing resolve to the same object.
            sys.modules[fqn] = mod
            try:
                sub_spec.loader.exec_module(mod)
            except BaseException:
                # Do not leave a half-initialised module behind: a later call
                # would find it in sys.modules and return it as if it were good.
                sys.modules.pop(fqn, None)
                raise
            return mod

        utils = _load_submodule("utils")
        pre_process = _load_submodule("pre_process")

        # Locate models on disk (downloads ~570 MB on first run)
        default_path = pystow.join("DECIMER-V2")
        model_urls = {
            "DECIMER": "https://zenodo.org/record/8300489/files/models.zip",
            "DECIMER_HandDrawn": "https://zenodo.org/records/10781330/files/DECIMER_HandDrawn_model.zip",
        }
        model_paths = utils.ensure_models(
            default_path=default_path, model_urls=model_urls,
        )

        # Load tokenizer (fast, ~0 s)
        # NOTE(security): this unpickles a file shipped inside the downloaded
        # model archive.  Unpickling can execute arbitrary code, so this
        # trusts the download source and the local model cache.
        tokenizer_path = os.path.join(
            model_paths["DECIMER"], "assets", "tokenizer_SMILES.pkl"
        )
        try:
            with open(tokenizer_path, "rb") as f:
                tokenizer = pickle.load(f)
        except ModuleNotFoundError:
            # Keras 2→3 compat: redirect keras.preprocessing.text
            class _K2Unpickler(pickle.Unpickler):
                def find_class(self, module, name):
                    if module.startswith("keras."):
                        module = module.replace("keras.", "tensorflow.keras.", 1)
                    return super().find_class(module, name)

            with open(tokenizer_path, "rb") as f:
                tokenizer = _K2Unpickler(f).load()

        # Load only the requested model (~25 s instead of ~50 s for both)
        model_key = "DECIMER_HandDrawn" if hand_drawn else "DECIMER"
        model = tf.saved_model.load(model_paths[model_key])

        def _predict(image_input, confidence=False, hand_drawn=False):
            """Predict SMILES from an image (numpy array or file path)."""
            chemical_structure = pre_process.decode_image(image_input)
            predicted_tokens, confidence_values = model(
                tf.constant(chemical_structure)
            )
            outputs = [tokenizer.index_word[i] for i in predicted_tokens[0].numpy()]
            smiles = (
                "".join(str(t) for t in outputs)
                .replace("<start>", "")
                .replace("<end>", "")
            )
            smiles = utils.decoder(smiles)

            if confidence:
                conf_pairs = [
                    (
                        utils.decoder(
                            tokenizer.index_word[predicted_tokens[0].numpy()[i]]
                        ),
                        confidence_values[i].numpy(),
                    )
                    for i in range(len(confidence_values))
                ]
                # strip <start>/<end> tokens
                conf_pairs = conf_pairs[1:-1]
                return smiles, conf_pairs

            return smiles

        # Tag the callable with its mode before publishing it, so the
        # lock-free fast path above never sees an untagged callable.
        _predict.loaded_mode = requested
        _decimer_predict = _predict
        _decimer_mode = requested
        return _decimer_predict
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# ---------------------------------------------------------------------------
|
|
251
|
+
# Image I/O
|
|
252
|
+
# ---------------------------------------------------------------------------
|
|
253
|
+
|
|
254
|
+
def load_image(path: str, page: int = 0) -> "np.ndarray":
    """
    Load an image from a PNG/JPG file or from a specific page of a PDF.

    Parameters
    ----------
    path : image or PDF file.  Anything whose extension is not ``.pdf``
           (case-insensitive) is read with ``cv2.imread``.
    page : 0-indexed PDF page to render; ignored for image files.

    Returns a BGR numpy array (H x W x 3), i.e. OpenCV channel order, which
    is what the segmentation helpers in this module expect.

    Raises
    ------
    RuntimeError      : opencv-python (or PyMuPDF, for PDF input) is missing.
    ValueError        : ``page`` is past the end of the PDF.
    FileNotFoundError : OpenCV could not read the image file.
    """
    if not HAS_CV2:
        raise RuntimeError("opencv-python is required. Run: pip install opencv-python")

    ext = os.path.splitext(path)[1].lower()

    if ext == ".pdf":
        if not HAS_PYMUPDF:
            raise RuntimeError("PyMuPDF is required for PDF input. Run: pip install pymupdf")
        doc = pymupdf.open(path)
        # try/finally so the document is closed even when the page index is
        # invalid or rendering raises.
        try:
            if page >= len(doc):
                raise ValueError(f"PDF has {len(doc)} pages; requested page {page} (0-indexed)")
            pg = doc[page]
            # Render at 150 DPI (matrix scale = 150/72)
            matrix = pymupdf.Matrix(150 / 72, 150 / 72)
            pix = pg.get_pixmap(matrix=matrix, alpha=False)
            arr = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
            # PyMuPDF returns RGB; convert to BGR for OpenCV
            return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
        finally:
            doc.close()

    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"Cannot read image: {path}")
    return img
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
# ---------------------------------------------------------------------------
|
|
287
|
+
# Segmentation
|
|
288
|
+
# ---------------------------------------------------------------------------
|
|
289
|
+
|
|
290
|
+
# Tunables for segment_structures(); pixel values are in input-image pixels.
_MIN_STRUCTURE_AREA_PX = 1500  # drop bounding boxes whose area (w * h) is below this (noise)
_MIN_SIDE_PX = 40  # drop bounding boxes whose width or height is below this
_PADDING_PX = 12  # margin added on each side of a box before cropping
_MAX_AREA_FRACTION = 0.90  # drop boxes covering >90% of the image (whole-page)
_MIN_ASPECT_RATIO = 0.15  # drop boxes whose short/long side ratio is below this (text lines, arrows)
# Length above which a DECIMER SMILES is considered truncated/garbage.
# NOTE(review): not referenced in this part of the file; confirm where it is enforced.
_MAX_SMILES_LEN = 500
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _to_gray_binary(bgr: "np.ndarray") -> "np.ndarray":
    """Return an inverted Otsu mask of a BGR image: ink is 255, background is 0."""
    grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    # THRESH_BINARY_INV makes the dark side of the threshold the foreground;
    # THRESH_OTSU lets OpenCV choose the threshold, so the 0 passed below is
    # only a placeholder.
    threshold_mode = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    mask = cv2.threshold(grayscale, 0, 255, threshold_mode)[1]
    return mask
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def _merge_nearby_boxes(
|
|
307
|
+
boxes: List[Tuple[int, int, int, int]],
|
|
308
|
+
gap: int = 30,
|
|
309
|
+
) -> List[Tuple[int, int, int, int]]:
|
|
310
|
+
"""
|
|
311
|
+
Iteratively merge bounding boxes that are close together (within `gap` pixels).
|
|
312
|
+
This groups fragmented bond lines and labels that belong to one structure.
|
|
313
|
+
"""
|
|
314
|
+
if not boxes:
|
|
315
|
+
return []
|
|
316
|
+
|
|
317
|
+
changed = True
|
|
318
|
+
while changed:
|
|
319
|
+
changed = False
|
|
320
|
+
merged: List[Tuple[int, int, int, int]] = []
|
|
321
|
+
used = [False] * len(boxes)
|
|
322
|
+
for i, (x0, y0, x1, y1) in enumerate(boxes):
|
|
323
|
+
if used[i]:
|
|
324
|
+
continue
|
|
325
|
+
# Expand by gap for proximity test
|
|
326
|
+
ex0, ey0, ex1, ey1 = x0 - gap, y0 - gap, x1 + gap, y1 + gap
|
|
327
|
+
for j, (ax0, ay0, ax1, ay1) in enumerate(boxes):
|
|
328
|
+
if used[j] or j == i:
|
|
329
|
+
continue
|
|
330
|
+
# Overlap test on expanded box
|
|
331
|
+
if ax0 <= ex1 and ax1 >= ex0 and ay0 <= ey1 and ay1 >= ey0:
|
|
332
|
+
x0 = min(x0, ax0)
|
|
333
|
+
y0 = min(y0, ay0)
|
|
334
|
+
x1 = max(x1, ax1)
|
|
335
|
+
y1 = max(y1, ay1)
|
|
336
|
+
ex0, ey0, ex1, ey1 = x0 - gap, y0 - gap, x1 + gap, y1 + gap
|
|
337
|
+
used[j] = True
|
|
338
|
+
changed = True
|
|
339
|
+
merged.append((x0, y0, x1, y1))
|
|
340
|
+
used[i] = True
|
|
341
|
+
boxes = merged
|
|
342
|
+
return boxes
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _adaptive_gap(boxes: List[Tuple[int, int, int, int]]) -> int:
|
|
346
|
+
"""
|
|
347
|
+
Compute an adaptive merge gap based on inter-box distances.
|
|
348
|
+
|
|
349
|
+
Finds the minimum edge-to-edge distance between each pair of boxes,
|
|
350
|
+
then uses the median of these nearest-neighbour distances. The gap is
|
|
351
|
+
set to 50% of that median, clamped to [8, 40]. This prevents merging
|
|
352
|
+
truly distinct structures in dense figures while still grouping
|
|
353
|
+
fragments that belong to one molecule.
|
|
354
|
+
|
|
355
|
+
Falls back to 25 if there are fewer than 3 boxes.
|
|
356
|
+
"""
|
|
357
|
+
n = len(boxes)
|
|
358
|
+
if n < 3:
|
|
359
|
+
return 25 # reasonable default for sparse images
|
|
360
|
+
|
|
361
|
+
# Compute minimum edge-to-edge distance for each box to its nearest neighbour
|
|
362
|
+
def _edge_dist(a: Tuple[int, int, int, int], b: Tuple[int, int, int, int]) -> float:
|
|
363
|
+
ax0, ay0, ax1, ay1 = a
|
|
364
|
+
bx0, by0, bx1, by1 = b
|
|
365
|
+
dx = max(0, max(ax0 - bx1, bx0 - ax1))
|
|
366
|
+
dy = max(0, max(ay0 - by1, by0 - ay1))
|
|
367
|
+
return (dx**2 + dy**2) ** 0.5
|
|
368
|
+
|
|
369
|
+
nn_dists = []
|
|
370
|
+
for i in range(n):
|
|
371
|
+
min_d = float("inf")
|
|
372
|
+
for j in range(n):
|
|
373
|
+
if i == j:
|
|
374
|
+
continue
|
|
375
|
+
d = _edge_dist(boxes[i], boxes[j])
|
|
376
|
+
if d < min_d:
|
|
377
|
+
min_d = d
|
|
378
|
+
nn_dists.append(min_d)
|
|
379
|
+
|
|
380
|
+
nn_dists.sort()
|
|
381
|
+
median_nn = nn_dists[len(nn_dists) // 2]
|
|
382
|
+
|
|
383
|
+
# Gap = 50% of median nearest-neighbour distance
|
|
384
|
+
gap = int(median_nn * 0.50)
|
|
385
|
+
return max(8, min(40, gap))
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def segment_structures(
    bgr: "np.ndarray",
    merge_gap: Optional[int] = None,
) -> List[Tuple["np.ndarray", Tuple[int, int, int, int]]]:
    """
    Detect chemical structure regions in a BGR image.

    Returns a list of (cropped_bgr, (x0, y0, x1, y1)) tuples, one per detected
    structure.  Regions are ordered by the 100-pixel row band of their top
    edge, then by left edge within a band.  The returned box is the padded
    crop rectangle, clamped to the image.  Crops are slices of `bgr`; when
    nothing passes the filters the whole image is returned as a single
    region (as a copy).

    Parameters
    ----------
    bgr       : BGR image array (from OpenCV)
    merge_gap : pixel gap for merging nearby boxes.  None = adaptive
                (chosen by _adaptive_gap from nearest-neighbour distances
                between the candidate boxes).  With 0, only boxes that
                overlap or touch are merged.

    Strategy: threshold → morphological close to fill small gaps → find external
    contours → filter by area/size → merge nearby boxes → crop with padding.
    """
    if not HAS_CV2:
        raise RuntimeError("opencv-python is required.")

    height, width = bgr.shape[:2]
    max_box_area = _MAX_AREA_FRACTION * (height * width)

    ink_mask = _to_gray_binary(bgr)

    # Morphological close: connect nearby ink pixels (bond lines, letters)
    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    closed_mask = cv2.morphologyEx(ink_mask, cv2.MORPH_CLOSE, close_kernel, iterations=3)

    contours, _ = cv2.findContours(closed_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    def _plausible(box_w: int, box_h: int) -> bool:
        """True when a bounding box of this size could hold a structure."""
        box_area = box_w * box_h
        if box_area < _MIN_STRUCTURE_AREA_PX:
            return False
        if box_w < _MIN_SIDE_PX or box_h < _MIN_SIDE_PX:
            return False
        if box_area > max_box_area:
            return False
        # Reject very thin horizontal/vertical strips (text lines, arrows)
        return not (min(box_w, box_h) / max(box_w, box_h) < _MIN_ASPECT_RATIO)

    candidates: List[Tuple[int, int, int, int]] = []
    for contour in contours:
        left, top, box_w, box_h = cv2.boundingRect(contour)
        if _plausible(box_w, box_h):
            candidates.append((left, top, left + box_w, top + box_h))

    if not candidates:
        # Fall back to the whole image as one region
        return [(bgr.copy(), (0, 0, width, height))]

    if merge_gap is None:
        effective_gap = _adaptive_gap(candidates)
    else:
        effective_gap = merge_gap
    regions = _merge_nearby_boxes(candidates, gap=effective_gap)

    # Row band first (top edge // 100), then column
    regions.sort(key=lambda box: (box[1] // 100, box[0]))

    crops = []
    for left, top, right, bottom in regions:
        # Add padding, clamp to image bounds
        crop_box = (
            max(0, left - _PADDING_PX),
            max(0, top - _PADDING_PX),
            min(width, right + _PADDING_PX),
            min(height, bottom + _PADDING_PX),
        )
        c_left, c_top, c_right, c_bottom = crop_box
        crops.append((bgr[c_top:c_bottom, c_left:c_right], crop_box))

    return crops
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
# ---------------------------------------------------------------------------
|
|
461
|
+
# SMILES → atom/bond data via RDKit
|
|
462
|
+
# ---------------------------------------------------------------------------
|
|
463
|
+
|
|
464
|
+
def _ring_double_bond_side(
    mol: "Chem.Mol",
    bond: "Chem.Bond",
) -> Optional[str]:
    """
    Choose the side ("Right" or "Left", relative to the begin→end direction)
    on which the second line of an in-ring double bond is drawn.

    Returns None when the bond is not in a ring, when no ring contains both
    of its atoms, or when the begin atom has no other neighbour in the
    chosen ring.

    Strategy: take the smallest ring containing the bond, find a neighbour
    of the begin atom in that ring other than the end atom, and look at the
    sign of the z-component of (end - begin) × (neighbour - begin) computed
    from the mol's conformer coordinates.  Positive → "Right", otherwise
    "Left".

    NOTE(review): the sign is evaluated in the conformer's coordinate frame.
    normalize_for_cdxml() later negates y, which mirrors left/right, so
    "Right" presumably ends up on the ring-interior side in the CDXML
    output; confirm against ChemDraw's DoublePosition convention.
    """
    if not bond.IsInRing():
        return None

    conformer = mol.GetConformer()
    begin_idx = bond.GetBeginAtomIdx()
    end_idx = bond.GetEndAtomIdx()
    begin_pos = conformer.GetAtomPosition(begin_idx)
    end_pos = conformer.GetAtomPosition(end_idx)
    begin_x, begin_y = begin_pos.x, begin_pos.y
    bond_dx = end_pos.x - begin_x
    bond_dy = end_pos.y - begin_y

    # Use only the smallest ring containing this bond.  For fused ring
    # systems (e.g. thienopyrimidine) this makes the reference neighbour a
    # member of the ring the bond visually belongs to, so the offset does
    # not point outward (which would look exocyclic).
    rings_with_bond = [
        ring for ring in mol.GetRingInfo().AtomRings()
        if begin_idx in ring and end_idx in ring
    ]
    if not rings_with_bond:
        return None
    ring_members = set(min(rings_with_bond, key=len))

    for neighbour in mol.GetAtomWithIdx(begin_idx).GetNeighbors():
        nbr_idx = neighbour.GetIdx()
        if nbr_idx != end_idx and nbr_idx in ring_members:
            nbr_pos = conformer.GetAtomPosition(nbr_idx)
            # z-component of (bond vector) × (begin → neighbour vector)
            cross_z = bond_dx * (nbr_pos.y - begin_y) - bond_dy * (nbr_pos.x - begin_x)
            if cross_z > 0:
                return "Right"
            return "Left"

    return None
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
def _rdkit_mol_to_atom_bond_dicts(
    mol: "Chem.Mol",
    offset_index: int = 0,
) -> Tuple[List[Dict], List[Dict]]:
    """
    Convert an RDKit Mol (with 2D conformer, already Kekulized) to atom/bond
    dicts matching the format expected by coord_normalizer / cdxml_builder.

    Atom and bond indices are 1-based and both shifted by `offset_index`, to
    allow unique numbering across multiple molecules.

    Each atom dict has "index", "symbol", "x", "y" (conformer coordinates
    rounded to 4 decimals) plus, when applicable, "charge" (non-zero formal
    charge), "num_hydrogens" (every non-carbon atom) and "isotope".
    Each bond dict has "index", "order", "atom1", "atom2", "cfg" (1 = wedge
    from the begin atom, 6 = dash from the begin atom, 0 = plain) and, for
    in-ring double bonds, "double_pos" ("Right"/"Left").

    The mol MUST have been Kekulized with clearAromaticFlags=True before
    calling this function, so that all bonds have explicit SINGLE/DOUBLE/TRIPLE
    types (no AROMATIC).  This is required for correct ChemDraw rendering —
    ChemDraw 16 does not recognise Order="1.5" as an aromatic bond.

    NOTE(review): "cfg" is taken from bond.GetBondDir() as-is; this function
    does not assign wedge/dash directions itself, so the caller must have set
    them beforehand if stereo bonds are wanted — confirm callers do.
    NOTE(review): bond indices use the same offset as atom indices; if the
    caller advances the offset by atom count, a molecule with more bonds than
    atoms could share a bond index with the next molecule — confirm whether
    bond indices must be unique downstream.
    """
    conf = mol.GetConformer()

    # Lookup tables are loop-invariant, so build them once per call.
    order_map = {
        Chem.BondType.SINGLE: 1,
        Chem.BondType.DOUBLE: 2,
        Chem.BondType.TRIPLE: 3,
        # AROMATIC should not appear after Kekulize, but keep as fallback
        Chem.BondType.AROMATIC: 2,
    }
    # Bond direction → cfg code for wedge/dash stereo; anything else is 0.
    cfg_map = {
        Chem.BondDir.BEGINWEDGE: 1,
        Chem.BondDir.BEGINDASH: 6,
    }

    atoms: List[Dict] = []
    rdkit_to_local: Dict[int, int] = {}  # rdkit 0-based → output 1-based
    for i, atom in enumerate(mol.GetAtoms()):
        pos = conf.GetAtomPosition(i)
        local_idx = i + 1 + offset_index
        rdkit_to_local[i] = local_idx
        symbol = atom.GetSymbol()
        a: Dict = {
            "index": local_idx,
            "symbol": symbol,
            "x": round(float(pos.x), 4),
            "y": round(float(pos.y), 4),
        }
        charge = atom.GetFormalCharge()
        if charge != 0:
            a["charge"] = charge
        # Hydrogen counts are only emitted for heteroatoms, so only query
        # them there.  GetTotalNumHs works even after Kekulize.
        if symbol != "C":
            a["num_hydrogens"] = atom.GetTotalNumHs(includeNeighbors=False)
        isotope = atom.GetIsotope()
        if isotope:
            a["isotope"] = isotope
        atoms.append(a)

    bonds: List[Dict] = []
    for bi, bond in enumerate(mol.GetBonds()):
        # Unknown bond types fall back to a single bond.
        order = order_map.get(bond.GetBondType(), 1)

        bond_dict: Dict = {
            "index": bi + 1 + offset_index,
            "order": order,
            "atom1": rdkit_to_local[bond.GetBeginAtomIdx()],
            "atom2": rdkit_to_local[bond.GetEndAtomIdx()],
            "cfg": cfg_map.get(bond.GetBondDir(), 0),
        }

        # For in-ring double bonds, add DoublePosition so ChemDraw draws the
        # second line on the correct (inside-ring) side.
        if order == 2:
            side = _ring_double_bond_side(mol, bond)
            if side:
                bond_dict["double_pos"] = side

        bonds.append(bond_dict)

    return atoms, bonds
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
def smiles_to_coords(smiles: str, offset_index: int = 0) -> Optional[Dict]:
    """
    Build 2D atom/bond data for a SMILES string with RDKit.

    Returns ``{"atoms": [...], "bonds": [...]}`` in raw RDKit coordinates
    (not yet scaled or flipped for CDXML), with indices shifted by
    `offset_index`.  Returns None when the input is empty or one of the
    placeholders "FAILED" / "N/A", when RDKit cannot parse it, or when the
    Compute2DCoords return-value check below fails.

    Raises RuntimeError when RDKit is not installed.
    """
    if not HAS_RDKIT:
        raise RuntimeError("RDKit is required. Activate the LLMChem conda environment.")

    is_placeholder = not smiles or smiles.strip() in ("", "FAILED", "N/A")
    if is_placeholder:
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Lay out the heavy-atom graph directly.  An earlier AddHs →
    # Compute2DCoords → RemoveHs sequence made RDKit draw alkyl chains as a
    # straight line instead of a zigzag, because the explicit hydrogens were
    # spaced out and the heavy-atom backbone ended up collinear.
    # NOTE(review): RDKit documents this return value as the ID of the new
    # conformer rather than a status code — confirm the != 0 test is intended.
    if AllChem.Compute2DCoords(mol) != 0:
        return None

    # Kekulize only after the layout so every bond reports SINGLE/DOUBLE
    # instead of AROMATIC (ChemDraw output must not contain Order="1.5").
    try:
        Chem.Kekulize(mol, clearAromaticFlags=True)
    except Exception:
        # Best effort: on failure the remaining aromatic bonds are mapped to
        # order 2 by _rdkit_mol_to_atom_bond_dicts.
        pass

    atom_dicts, bond_dicts = _rdkit_mol_to_atom_bond_dicts(
        mol, offset_index=offset_index
    )
    return {"atoms": atom_dicts, "bonds": bond_dicts}
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
# ---------------------------------------------------------------------------
|
|
639
|
+
# Coordinate normalisation (inline, no import dependency on coord_normalizer)
|
|
640
|
+
# ---------------------------------------------------------------------------
|
|
641
|
+
|
|
642
|
+
from ..constants import (
|
|
643
|
+
ACS_BOND_LENGTH as ACS_BOND_LENGTH_PT,
|
|
644
|
+
CDXML_HEADER as _CDXML_HEADER,
|
|
645
|
+
CDXML_FOOTER as _CDXML_FOOTER,
|
|
646
|
+
ACS_LABEL_FONT, ACS_LABEL_SIZE, ACS_LABEL_FACE,
|
|
647
|
+
ACS_CAPTION_SIZE, ACS_HASH_SPACING, ACS_MARGIN_WIDTH,
|
|
648
|
+
ACS_LINE_WIDTH, ACS_BOLD_WIDTH, ACS_BOND_LENGTH_STR,
|
|
649
|
+
ACS_BOND_SPACING, ACS_CHAIN_ANGLE_STR,
|
|
650
|
+
)
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
def _average_bond_length(atoms: List[Dict], bonds: List[Dict]) -> float:
|
|
654
|
+
if not bonds:
|
|
655
|
+
return 1.0
|
|
656
|
+
xy = {a["index"]: (a["x"], a["y"]) for a in atoms}
|
|
657
|
+
lengths = [
|
|
658
|
+
math.hypot(
|
|
659
|
+
xy.get(b["atom1"], (0, 0))[0] - xy.get(b["atom2"], (0, 0))[0],
|
|
660
|
+
xy.get(b["atom1"], (0, 0))[1] - xy.get(b["atom2"], (0, 0))[1],
|
|
661
|
+
)
|
|
662
|
+
for b in bonds
|
|
663
|
+
if math.hypot(
|
|
664
|
+
xy.get(b["atom1"], (0, 0))[0] - xy.get(b["atom2"], (0, 0))[0],
|
|
665
|
+
xy.get(b["atom1"], (0, 0))[1] - xy.get(b["atom2"], (0, 0))[1],
|
|
666
|
+
) > 1e-6
|
|
667
|
+
]
|
|
668
|
+
return sum(lengths) / len(lengths) if lengths else 1.0
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def normalize_for_cdxml(
    atoms: List[Dict],
    bonds: List[Dict],
    center_x: float = 200.0,
    center_y: float = 300.0,
) -> Tuple[List[Dict], List[Dict]]:
    """
    Convert atom coordinates into CDXML page space.

    Works on deep copies of both inputs (the caller's lists are left
    untouched) and applies three steps in order:

    1. negate every atom's ``y`` (input is y-up, CDXML is y-down);
    2. scale uniformly so the average bond length equals
       ``ACS_BOND_LENGTH_PT``;
    3. translate so the centre of the atom bounding box lands on
       ``(center_x, center_y)``, rounding each coordinate to 3 decimals.

    Returns the ``(atoms, bonds)`` copies.  With an empty atom list the
    copies are returned unchanged.
    """
    out_atoms = deepcopy(atoms)
    out_bonds = deepcopy(bonds)
    if not out_atoms:
        return out_atoms, out_bonds

    # Step 1: mirror vertically.
    for atom in out_atoms:
        atom["y"] = -atom["y"]

    # Step 2: rescale to the target bond length (skipped for a degenerate mean).
    mean_length = _average_bond_length(out_atoms, out_bonds)
    if mean_length > 1e-6:
        factor = ACS_BOND_LENGTH_PT / mean_length
        for atom in out_atoms:
            atom["x"] = atom["x"] * factor
            atom["y"] = atom["y"] * factor

    # Step 3: move the bounding-box midpoint onto the requested centre.
    x_values = [atom["x"] for atom in out_atoms]
    y_values = [atom["y"] for atom in out_atoms]
    mid_x = (min(x_values) + max(x_values)) / 2.0
    mid_y = (min(y_values) + max(y_values)) / 2.0
    for atom in out_atoms:
        atom.update(
            x=round(atom["x"] - mid_x + center_x, 3),
            y=round(atom["y"] - mid_y + center_y, 3),
        )

    return out_atoms, out_bonds
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
# ---------------------------------------------------------------------------
|
|
712
|
+
# Mass data enrichment
|
|
713
|
+
# ---------------------------------------------------------------------------
|
|
714
|
+
|
|
715
|
+
def enrich_with_mass_data(results: List[Dict]) -> None:
    """Attach formula, molecular weight, exact masses and adduct m/z values.

    Every entry of *results* whose ``"smiles"`` parses with RDKit gains the
    keys ``formula``, ``mw``, ``exact_mass``, ``exact_mass_full`` and
    ``adducts``.  ``exact_mass_full`` covers the whole SMILES; ``exact_mass``
    (and the adducts derived from it) uses only the fragment with the most
    heavy atoms when the SMILES has several disconnected fragments.

    The list is modified in place and nothing is returned.  Entries with an
    empty or unparseable SMILES are left untouched, and the function does
    nothing at all when ``HAS_RDKIT`` is false.
    """
    if not HAS_RDKIT:
        return

    from rdkit.Chem import Descriptors, rdMolDescriptors

    for record in results:
        smi = record.get("smiles", "").strip()
        if not smi:
            continue

        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue

        full_mass = Descriptors.ExactMolWt(mol)
        avg_weight = Descriptors.MolWt(mol)
        formula = rdMolDescriptors.CalcMolFormula(mol)

        # For salts / mixtures, report the mass of the largest fragment.
        pieces = Chem.GetMolFrags(mol, asMols=True)
        if len(pieces) > 1:
            largest = max(pieces, key=lambda m: m.GetNumHeavyAtoms())
            parent_mass = Descriptors.ExactMolWt(largest)
        else:
            parent_mass = full_mass

        record.update({
            "formula": formula,
            "mw": round(avg_weight, 4),
            "exact_mass": round(parent_mass, 5),
            "exact_mass_full": round(full_mass, 5),
            "adducts": {
                "[M+H]+": round(parent_mass + 1.00728, 5),
                "[M-H]-": round(parent_mass - 1.00728, 5),
                "[M+Na]+": round(parent_mass + 22.98922, 5),
                "[M+formate]-": round(parent_mass + 44.99820, 5),
            },
        })
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
# ---------------------------------------------------------------------------
|
|
758
|
+
# Main extraction pipeline
|
|
759
|
+
# ---------------------------------------------------------------------------
|
|
760
|
+
|
|
761
|
+
def _unpack_decimer_result(result):
    """Split a DECIMER return value into ``(smiles, raw_confidence)``."""
    if isinstance(result, tuple):
        return result[0], result[1]
    return result, None


def _predict_region_smiles(predict_fn, crop, hand_drawn: bool):
    """Run DECIMER on one image crop and return ``(smiles, raw_confidence)``.

    Tries the in-memory call first.  On ``TypeError`` it writes the crop to
    a temporary PNG and retries with a file path, first with
    ``confidence=True`` and then with no keyword arguments.  Any other
    exception is left to the caller.
    """
    try:
        return _unpack_decimer_result(
            predict_fn(crop, confidence=True, hand_drawn=hand_drawn)
        )
    except TypeError:
        # Older DECIMER versions don't accept numpy; fall back to temp file
        pass

    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tf:
        tmp_path = tf.name
    try:
        cv2.imwrite(tmp_path, crop)
        try:
            return _unpack_decimer_result(predict_fn(tmp_path, confidence=True))
        except TypeError:
            return predict_fn(tmp_path), None
    finally:
        # Cleanup is best-effort: a leftover temp file must not discard a
        # prediction that has already succeeded.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass


def _extract_structures_raw(
    image_path: str,
    page: int = 0,
    segment: bool = True,
    hand_drawn: bool = False,
    verbose: bool = False,
    merge_gap: Optional[int] = None,
) -> List[Dict]:
    """
    Full pipeline: image → segmented crops → SMILES → 2D coords.

    This is the internal low-level function.  Call extract_structures_from_image()
    for the public API that returns structured JSON.

    Parameters
    ----------
    image_path : path to PNG/JPG/PDF
    page       : PDF page number (0-indexed); ignored for image files
    segment    : if False, treat whole image as one structure
    hand_drawn : use DECIMER hand-drawn model
    verbose    : print progress to stderr
    merge_gap  : pixel gap for merging nearby boxes during segmentation.
                 None = adaptive (based on median box size).  0 = no merging.

    Returns
    -------
    List of dicts, one per candidate region (including regions for which
    no usable SMILES was obtained):
      {
        "index": int,
        "smiles": str,               # "" when prediction failed or was discarded
        "confidence": float or None, # see _compute_confidence_score
        "bbox": [x0, y0, x1, y1],
        "atoms": [...],              # [] when the SMILES gave no coordinates
        "bonds": [...]
      }
    enrich_with_mass_data() is applied to the list before it is returned.

    Notes
    -----
    A DECIMER failure on one region, in either the in-memory call or the
    temp-file fallback, is logged and recorded as an empty SMILES for that
    region; it never aborts the remaining regions.
    """
    def log(msg: str):
        if verbose:
            print(f"[structure_from_image] {msg}", file=sys.stderr)

    # 1. Load image
    log(f"Loading {image_path}" + (f" page {page}" if image_path.lower().endswith(".pdf") else ""))
    bgr = load_image(image_path, page=page)
    h, w = bgr.shape[:2]
    log(f"Image size: {w}x{h} px")

    # 2. Segment
    if segment:
        log("Segmenting structures...")
        regions = segment_structures(bgr, merge_gap=merge_gap)
        log(f"Found {len(regions)} candidate region(s)")
    else:
        regions = [(bgr.copy(), (0, 0, w, h))]
        log("Skipping segmentation (--no-segment)")

    # 3. Load DECIMER (deferred)
    log("Loading DECIMER model (may take a moment on first call)...")
    predict_fn = _load_decimer(hand_drawn=hand_drawn)

    # 4. Process each region
    results: List[Dict] = []
    atom_offset = 0

    for i, (crop, bbox) in enumerate(regions):
        log(f"Processing region {i+1}/{len(regions)} — bbox {bbox}")

        # Ask DECIMER for per-token scores where supported; the helper
        # degrades to older call signatures on TypeError.
        try:
            smiles, raw_confidence = _predict_region_smiles(
                predict_fn, crop, hand_drawn
            )
        except Exception as exc:
            log(f" DECIMER failed: {exc}")
            smiles, raw_confidence = "", None

        smiles = smiles.strip() if smiles else ""

        # Sanity check: abnormally long SMILES usually means DECIMER is reading
        # text/arrows/noise rather than a chemical structure.
        if len(smiles) > _MAX_SMILES_LEN:
            log(f" SMILES too long ({len(smiles)} chars) — discarding as noise")
            smiles = ""
            raw_confidence = None

        # Compute a single confidence scalar from per-token scores
        confidence = _compute_confidence_score(raw_confidence)

        log(f" SMILES: {smiles or '(none)'}"
            + (f" confidence: {confidence:.3f}" if confidence is not None else ""))

        # 5. SMILES → 2D coordinates
        mol_data = None
        if smiles:
            mol_data = smiles_to_coords(smiles, offset_index=atom_offset)
            if mol_data is None:
                log(f" RDKit could not parse SMILES: {smiles}")
            else:
                # Normalise to ACS 1996 CDXML coords.
                # Use a fixed origin here; final placement is done in
                # results_to_cdxml() based on actual bounding boxes.
                atoms_norm, bonds_norm = normalize_for_cdxml(
                    mol_data["atoms"],
                    mol_data["bonds"],
                    center_x=200.0,
                    center_y=300.0,
                )
                mol_data["atoms"] = atoms_norm
                mol_data["bonds"] = bonds_norm
                # Only successfully laid-out structures consume atom indices.
                atom_offset += len(mol_data["atoms"])

        entry: Dict = {
            "index": i,
            "smiles": smiles,
            "confidence": confidence,
            "bbox": list(bbox),
        }
        if mol_data:
            entry["atoms"] = mol_data["atoms"]
            entry["bonds"] = mol_data["bonds"]
        else:
            entry["atoms"] = []
            entry["bonds"] = []

        results.append(entry)

    # Enrich with mass data (formula, MW, exact_mass, adducts)
    enrich_with_mass_data(results)

    log(f"Done. {len(results)} structure(s) extracted.")
    return results
|
|
911
|
+
|
|
912
|
+
|
|
913
|
+
# ---------------------------------------------------------------------------
|
|
914
|
+
# Confidence scoring
|
|
915
|
+
# ---------------------------------------------------------------------------
|
|
916
|
+
|
|
917
|
+
def _compute_confidence_score(
|
|
918
|
+
raw_confidence: Optional[list],
|
|
919
|
+
) -> Optional[float]:
|
|
920
|
+
"""
|
|
921
|
+
Reduce DECIMER's per-token confidence list to a single scalar in [0, 1].
|
|
922
|
+
|
|
923
|
+
DECIMER returns a list of (token, score) tuples when called with
|
|
924
|
+
confidence=True. This function computes the geometric mean of the
|
|
925
|
+
scores, which is more sensitive to low-confidence tokens than the
|
|
926
|
+
arithmetic mean and better reflects overall prediction reliability.
|
|
927
|
+
|
|
928
|
+
Returns None if no confidence data is available.
|
|
929
|
+
"""
|
|
930
|
+
if not raw_confidence:
|
|
931
|
+
return None
|
|
932
|
+
|
|
933
|
+
scores = []
|
|
934
|
+
for item in raw_confidence:
|
|
935
|
+
if isinstance(item, (tuple, list)) and len(item) >= 2:
|
|
936
|
+
try:
|
|
937
|
+
scores.append(float(item[1]))
|
|
938
|
+
except (TypeError, ValueError):
|
|
939
|
+
pass
|
|
940
|
+
else:
|
|
941
|
+
try:
|
|
942
|
+
scores.append(float(item))
|
|
943
|
+
except (TypeError, ValueError):
|
|
944
|
+
pass
|
|
945
|
+
|
|
946
|
+
if not scores:
|
|
947
|
+
return None
|
|
948
|
+
|
|
949
|
+
# Geometric mean (log-space to avoid underflow)
|
|
950
|
+
import math as _math
|
|
951
|
+
log_sum = sum(_math.log(max(s, 1e-9)) for s in scores)
|
|
952
|
+
return round(_math.exp(log_sum / len(scores)), 4)
|
|
953
|
+
|
|
954
|
+
|
|
955
|
+
# ---------------------------------------------------------------------------
|
|
956
|
+
# Nearby text label detection
|
|
957
|
+
# ---------------------------------------------------------------------------
|
|
958
|
+
|
|
959
|
+
def _detect_nearby_labels(
    bgr: "np.ndarray",
    structure_bboxes: List[Tuple[int, int, int, int]],
    search_margin: int = 80,
) -> List[Optional[str]]:
    """
    Detect text labels near each structure bounding box in the image.

    Uses a two-phase strategy:
    1. Find candidate text regions via OpenCV contours: blobs whose bounding
       rectangle is neither tiny nor huge, is elongated (aspect >= 1.5) and
       whose centre lies outside every structure bbox.
    2. OCR each candidate with the callable returned by _get_ocr_fn() and
       attach the text to the nearest structure (edge-to-edge distance)
       among those whose bbox, grown by *search_margin*, contains the blob
       centre.  Several blobs assigned to one structure are joined with a
       single space in contour order.

    Parameters
    ----------
    bgr              : BGR image array
    structure_bboxes : list of (x0, y0, x1, y1) for each detected structure
    search_margin    : how many pixels outside the structure bbox to search
                       for associated text labels

    Returns
    -------
    List of str|None, one per structure.  Each entry is the detected label
    text or None.  Every entry is None when OpenCV is unavailable, when no
    candidate text blob is found, or when _get_ocr_fn() returns None.
    """
    if not HAS_CV2 or not structure_bboxes:
        return [None] * len(structure_bboxes)

    h, w = bgr.shape[:2]

    # --- Phase 1: Find candidate text regions ---
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Use a smaller morphological kernel to preserve text character separations
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
    contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    max_area = 0.05 * h * w
    text_blobs: List[Tuple[int, int, int, int]] = []
    for cnt in contours:
        cx, cy, cw, ch = cv2.boundingRect(cnt)
        area = cw * ch

        # Skip tiny noise and huge blobs
        if area < 50 or area > max_area:
            continue

        # Text lines are wider than they are tall (aspect > 1.5), or are
        # narrow vertical labels.  Very square blobs are likely structure parts.
        aspect = max(cw, ch) / max(min(cw, ch), 1)
        if aspect < 1.5:
            continue

        # Skip blobs whose centre falls inside any structure bbox
        blob_cx = cx + cw // 2
        blob_cy = cy + ch // 2
        if any(
            sx0 <= blob_cx <= sx1 and sy0 <= blob_cy <= sy1
            for (sx0, sy0, sx1, sy1) in structure_bboxes
        ):
            continue

        text_blobs.append((cx, cy, cx + cw, cy + ch))

    if not text_blobs:
        return [None] * len(structure_bboxes)

    # --- Phase 2: Try OCR on the candidates ---
    # The OCR backend is resolved only once there is something to read.
    ocr_fn = _get_ocr_fn()
    if ocr_fn is None:
        return [None] * len(structure_bboxes)

    labels: List[Optional[str]] = [None] * len(structure_bboxes)

    for (bx0, by0, bx1, by1) in text_blobs:
        # The blob centre does not depend on the structure being tested.
        bcx, bcy = (bx0 + bx1) // 2, (by0 + by1) // 2

        best_dist = float("inf")
        best_idx = -1
        for si, (sx0, sy0, sx1, sy1) in enumerate(structure_bboxes):
            # Candidate only if the centre lies within the expanded bbox
            inside_x = sx0 - search_margin <= bcx <= sx1 + search_margin
            inside_y = sy0 - search_margin <= bcy <= sy1 + search_margin
            if not (inside_x and inside_y):
                continue

            # Edge-to-edge distance (0 when the rectangles touch or overlap)
            dx = max(0, max(sx0 - bx1, bx0 - sx1))
            dy = max(0, max(sy0 - by1, by0 - sy1))
            dist = (dx * dx + dy * dy) ** 0.5
            if dist < best_dist:
                best_dist = dist
                best_idx = si

        if best_idx < 0:
            continue

        # OCR the blob; a failing OCR call simply yields no label for it
        try:
            crop = bgr[by0:by1, bx0:bx1]
            text = ocr_fn(crop).strip()
        except Exception:
            text = None

        if text:
            # Append to existing label (a structure can have multiple labels)
            existing = labels[best_idx]
            labels[best_idx] = f"{existing} {text}".strip() if existing else text

    return labels
|
|
1090
|
+
|
|
1091
|
+
|
|
1092
|
+
def _get_ocr_fn():
|
|
1093
|
+
"""
|
|
1094
|
+
Return a callable f(bgr_crop) -> str that performs OCR on a BGR image crop.
|
|
1095
|
+
|
|
1096
|
+
Tries pytesseract first, then easyocr. Returns None if neither is available.
|
|
1097
|
+
"""
|
|
1098
|
+
# Try pytesseract (fastest, most common)
|
|
1099
|
+
try:
|
|
1100
|
+
import pytesseract
|
|
1101
|
+
from PIL import Image as _PILImage
|
|
1102
|
+
|
|
1103
|
+
def _tesseract_ocr(bgr_crop: "np.ndarray") -> str:
|
|
1104
|
+
rgb = cv2.cvtColor(bgr_crop, cv2.COLOR_BGR2RGB)
|
|
1105
|
+
pil_img = _PILImage.fromarray(rgb)
|
|
1106
|
+
return pytesseract.image_to_string(pil_img, config="--psm 7").strip()
|
|
1107
|
+
|
|
1108
|
+
return _tesseract_ocr
|
|
1109
|
+
except ImportError:
|
|
1110
|
+
pass
|
|
1111
|
+
|
|
1112
|
+
# Try easyocr (slower startup, but no external binary required)
|
|
1113
|
+
try:
|
|
1114
|
+
import easyocr
|
|
1115
|
+
|
|
1116
|
+
_reader = easyocr.Reader(["en"], gpu=False, verbose=False)
|
|
1117
|
+
|
|
1118
|
+
def _easyocr_ocr(bgr_crop: "np.ndarray") -> str:
|
|
1119
|
+
results = _reader.readtext(bgr_crop, detail=0)
|
|
1120
|
+
return " ".join(results).strip()
|
|
1121
|
+
|
|
1122
|
+
return _easyocr_ocr
|
|
1123
|
+
except ImportError:
|
|
1124
|
+
pass
|
|
1125
|
+
|
|
1126
|
+
return None
|
|
1127
|
+
|
|
1128
|
+
|
|
1129
|
+
# ---------------------------------------------------------------------------
|
|
1130
|
+
# Public API: extract_structures_from_image
|
|
1131
|
+
# ---------------------------------------------------------------------------
|
|
1132
|
+
|
|
1133
|
+
def extract_structures_from_image(
    image_path: str,
    page: int = 0,
    segment: bool = True,
    hand_drawn: bool = False,
    verbose: bool = False,
    merge_gap: Optional[int] = None,
    detect_labels: bool = True,
) -> Dict:
    """
    Public entry point: find chemical structures in an image via DECIMER.

    Accepts a PNG, JPG or PDF path and reports, for every detected region,
    the predicted SMILES, a confidence score, the bounding box in image
    pixels and (when an OCR backend is available) a nearby text label.
    Failures are reported in the returned dict rather than raised.

    Parameters
    ----------
    image_path    : path to PNG/JPG/PDF image file
    page          : PDF page index (0-based); ignored for raster images
    segment       : if True (default), split the image into individual
                    structure regions before running DECIMER on each.
                    Set False when the whole image is a single structure.
    hand_drawn    : use the DECIMER hand-drawn model instead of the default
                    printed-structure model
    verbose       : print progress messages to stderr
    merge_gap     : pixel gap for merging nearby segmentation boxes.
                    None = adaptive (median-based).  0 = no merging.
    detect_labels : if True (default), look for text labels near each
                    structure.  Without pytesseract or easyocr the label
                    field is always null.

    Returns
    -------
    dict with the following keys:

      ok          (bool)  True on success, False on error
      image_path  (str)   Absolute path of the input image
      structures  (list)  One entry per detected structure:
          smiles      (str)         DECIMER-predicted SMILES (may be "")
          confidence  (float|null)  Geometric-mean per-token DECIMER score
                                    in [0, 1], or null if unavailable
          bbox        (list)        [x0, y0, x1, y1] pixel coords (top-left,
                                    bottom-right) in the input image
          label       (str|null)    Nearby text label detected by OCR, or null
      error       (str)   Only present on failure (ok=False)

    Examples
    --------
    >>> from cdxml_toolkit.image.structure_from_image import extract_structures_from_image
    >>> result = extract_structures_from_image("scheme.png")
    >>> if result["ok"]:
    ...     for s in result["structures"]:
    ...         print(s["smiles"], s["confidence"])

    Notes
    -----
    - DECIMER models are downloaded to ~/.data/DECIMER-V2/ on first run (~570 MB).
    - Confidence uses geometric mean of per-character DECIMER scores, making it
      sensitive to low-confidence characters.  Scores above ~0.85 are reliable;
      below ~0.70 the SMILES should be verified manually.
    - Labels are detected only when pytesseract or easyocr is installed.
      Install either with:  pip install pytesseract  or  pip install easyocr
    - Label detection is best-effort: any error there leaves all labels null
      and the result is still ok=True.
    - For backward-compatible low-level access (returns List[Dict] with atoms/bonds),
      use _extract_structures_raw() directly.
    """
    abs_path = os.path.abspath(image_path)

    def _failure(message: str) -> Dict:
        # Uniform shape for every error return.
        return {
            "ok": False,
            "image_path": abs_path,
            "structures": [],
            "error": message,
        }

    # DECIMER must be importable before anything else is attempted.
    try:
        _load_decimer(hand_drawn=hand_drawn)
    except ImportError as exc:
        return _failure(str(exc))

    # OpenCV is needed for segmentation and label detection.
    if not HAS_CV2:
        return _failure(
            "opencv-python is required. "
            "Install with: pip install opencv-python"
        )

    try:
        raw = _extract_structures_raw(
            image_path=image_path,
            page=page,
            segment=segment,
            hand_drawn=hand_drawn,
            verbose=verbose,
            merge_gap=merge_gap,
        )
    except FileNotFoundError as exc:
        return _failure(str(exc))
    except Exception as exc:
        return _failure(f"Extraction failed: {exc}")

    # Optional label pass; default to "no label" for every structure.
    labels: List[Optional[str]] = [None] * len(raw)
    if detect_labels and HAS_CV2 and raw:
        try:
            bgr = load_image(image_path, page=page)
            labels = _detect_nearby_labels(  # type: ignore[arg-type]
                bgr, [tuple(entry["bbox"]) for entry in raw]
            )
        except Exception:
            labels = [None] * len(raw)

    structures = [
        {
            "smiles": entry.get("smiles", ""),
            "confidence": entry.get("confidence"),
            "bbox": entry.get("bbox", []),
            "label": label,
        }
        for entry, label in zip(raw, labels)
    ]

    return {
        "ok": True,
        "image_path": abs_path,
        "structures": structures,
    }
|
|
1274
|
+
|
|
1275
|
+
|
|
1276
|
+
# ---------------------------------------------------------------------------
|
|
1277
|
+
# CDXML output (optional, wraps cdxml_builder)
|
|
1278
|
+
# ---------------------------------------------------------------------------
|
|
1279
|
+
|
|
1280
|
+
def _format_cdxml_header(bbox: str) -> str:
    """Fill the CDXML header template for a given document bounding box.

    *bbox* is substituted as-is; every other placeholder is filled from the
    ACS_* style constants imported from the package's constants module.
    """
    acs_style = {
        "label_font": ACS_LABEL_FONT,
        "label_size": ACS_LABEL_SIZE,
        "label_face": ACS_LABEL_FACE,
        "caption_size": ACS_CAPTION_SIZE,
        "hash_spacing": ACS_HASH_SPACING,
        "margin_width": ACS_MARGIN_WIDTH,
        "line_width": ACS_LINE_WIDTH,
        "bold_width": ACS_BOLD_WIDTH,
        "bond_length": ACS_BOND_LENGTH_STR,
        "bond_spacing": ACS_BOND_SPACING,
        "chain_angle": ACS_CHAIN_ANGLE_STR,
    }
    return _CDXML_HEADER.format(bbox=bbox, **acs_style)
|
|
1296
|
+
|
|
1297
|
+
|
|
1298
|
+
def _best_smiles_component(smiles: str) -> str:
|
|
1299
|
+
"""
|
|
1300
|
+
For a dot-separated multi-component SMILES, return the single component
|
|
1301
|
+
that is most likely to be the real chemical structure (largest heavy-atom
|
|
1302
|
+
count that is also a valid RDKit molecule). Filters out junk fragments
|
|
1303
|
+
like lone alkyne chains, single atoms, very short chains, etc.
|
|
1304
|
+
"""
|
|
1305
|
+
if "." not in smiles:
|
|
1306
|
+
return smiles
|
|
1307
|
+
|
|
1308
|
+
parts = smiles.split(".")
|
|
1309
|
+
best = ""
|
|
1310
|
+
best_score = -1
|
|
1311
|
+
|
|
1312
|
+
for part in parts:
|
|
1313
|
+
part = part.strip()
|
|
1314
|
+
if not part:
|
|
1315
|
+
continue
|
|
1316
|
+
# Quick atom-count heuristic before RDKit parse
|
|
1317
|
+
heavy = sum(1 for c in part if c.isupper())
|
|
1318
|
+
if heavy < 3:
|
|
1319
|
+
continue
|
|
1320
|
+
# Penalise pure alkyne/alkene chains (no rings, no heteroatoms)
|
|
1321
|
+
has_heteroatom = any(c in part for c in "NOSFPClBrI")
|
|
1322
|
+
has_ring = "1" in part or "2" in part or "3" in part or "@" in part
|
|
1323
|
+
score = heavy * 10 + (50 if has_heteroatom else 0) + (30 if has_ring else 0)
|
|
1324
|
+
if score > best_score:
|
|
1325
|
+
best = part
|
|
1326
|
+
best_score = score
|
|
1327
|
+
|
|
1328
|
+
return best if best else smiles.split(".")[0]
|
|
1329
|
+
|
|
1330
|
+
|
|
1331
|
+
def _translate_atoms_xml(frag_xml: str, dx: float, dy: float) -> str:
|
|
1332
|
+
"""
|
|
1333
|
+
Shift all coordinate attributes in a fragment XML string by (dx, dy).
|
|
1334
|
+
Handles: p="x y" and BoundingBox="x1 y1 x2 y2".
|
|
1335
|
+
Both patterns appear in <fragment>, <n>, and <t> elements.
|
|
1336
|
+
"""
|
|
1337
|
+
import re
|
|
1338
|
+
|
|
1339
|
+
def shift_p(m: "re.Match") -> str:
|
|
1340
|
+
x, y = float(m.group(1)), float(m.group(2))
|
|
1341
|
+
return f'p="{x + dx:.3f} {y + dy:.3f}"'
|
|
1342
|
+
|
|
1343
|
+
def shift_bb(m: "re.Match") -> str:
|
|
1344
|
+
vals = [float(v) for v in m.group(1).split()]
|
|
1345
|
+
shifted = [
|
|
1346
|
+
f"{vals[0] + dx:.3f}", f"{vals[1] + dy:.3f}",
|
|
1347
|
+
f"{vals[2] + dx:.3f}", f"{vals[3] + dy:.3f}",
|
|
1348
|
+
]
|
|
1349
|
+
return f'BoundingBox="{" ".join(shifted)}"'
|
|
1350
|
+
|
|
1351
|
+
frag_xml = re.sub(r'\bp="([-\d.]+)\s+([-\d.]+)"', shift_p, frag_xml)
|
|
1352
|
+
frag_xml = re.sub(r'\bBoundingBox="((?:[-\d.]+ ?){4})"', shift_bb, frag_xml)
|
|
1353
|
+
return frag_xml
|
|
1354
|
+
|
|
1355
|
+
|
|
1356
|
+
def results_to_cdxml(results: List[Dict]) -> str:
    """
    Build a single-page CDXML document from extracted structure entries.

    Entries are processed in order.  An entry with no "atoms" is skipped.
    When an entry's "smiles" contains a '.', the component chosen by
    ``_best_smiles_component`` is re-laid-out with ``smiles_to_coords`` and
    ``normalize_for_cdxml`` (centre 200, 300); if that yields nothing the
    entry's own atoms/bonds are used.

    Each molecule is built with ``cdxml_builder.build_molecule_cdxml``, its
    <fragment> elements are serialized, and they are translated so that the
    centre of the atom bounding box lands in the middle of the next free
    horizontal slot on a row at y = 300.  Slot size is the atom bounding box
    plus a label pad on every side, never smaller than two ACS bond lengths.

    ``cdxml_builder.py`` is loaded from the directory containing this file.

    Args:
        results: Entry dicts; the keys read are "atoms", "bonds" and "smiles".

    Returns:
        The CDXML document text, or "" when no fragment could be placed.

    Raises:
        ImportError: if ``cdxml_builder.py`` cannot be loaded.
    """
    import importlib.util
    import xml.etree.ElementTree as ET

    here = os.path.dirname(os.path.abspath(__file__))
    builder_path = os.path.join(here, "cdxml_builder.py")
    try:
        spec = importlib.util.spec_from_file_location("cdxml_builder", builder_path)
        cdxml_builder = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(cdxml_builder)
    except Exception as exc:
        raise ImportError(f"Could not import cdxml_builder.py: {exc}") from exc

    # Layout constants, all in points.
    PAGE_MARGIN = 36.0   # page left edge to the first slot
    MOL_GAP = 40.0       # horizontal gap between neighbouring slots
    ROW_Y = 300.0        # y coordinate every molecule is centred on
    LABEL_PAD = 10.0     # padding added on each side of the atom bbox

    placed: List[str] = []
    half_heights: List[float] = []
    cursor_x = PAGE_MARGIN
    next_id = 1000

    for entry in results:
        atoms = entry.get("atoms", [])
        bonds = entry.get("bonds", [])
        if not atoms:
            continue

        # Multi-component SMILES: redraw only the preferred component so the
        # components are not drawn on top of one another.
        smiles = entry.get("smiles", "")
        if smiles and "." in smiles:
            best = _best_smiles_component(smiles)
            mol_data = (
                smiles_to_coords(best, offset_index=0) if best != smiles else None
            )
            if mol_data:
                atoms, bonds = normalize_for_cdxml(
                    mol_data["atoms"], mol_data["bonds"],
                    center_x=200.0, center_y=300.0,
                )

        # Atom-coordinate bounding box of whatever will be drawn.
        xs = [atom["x"] for atom in atoms]
        ys = [atom["y"] for atom in atoms]
        atom_xmin, atom_xmax = min(xs), max(xs)
        atom_ymin, atom_ymax = min(ys), max(ys)

        min_extent = ACS_BOND_LENGTH_PT * 2
        mol_w = max((atom_xmax - atom_xmin) + LABEL_PAD * 2, min_extent)
        mol_h = max((atom_ymax - atom_ymin) + LABEL_PAD * 2, min_extent)

        built = cdxml_builder.build_molecule_cdxml(atoms, bonds, start_id=next_id)
        page_el = ET.fromstring(built).find("page")
        if page_el is None:
            continue
        fragments = [
            ET.tostring(frag, encoding="unicode")
            for frag in page_el.findall("fragment")
        ]
        if not fragments:
            continue

        # Offset from the current bbox centre to the centre of the slot.
        dx = (cursor_x + mol_w / 2.0) - (atom_xmin + atom_xmax) / 2.0
        dy = ROW_Y - (atom_ymin + atom_ymax) / 2.0
        placed.extend(_translate_atoms_xml(frag, dx, dy) for frag in fragments)

        half_heights.append(mol_h / 2.0)
        cursor_x += mol_w + MOL_GAP
        next_id += len(atoms) * 3 + 200

    if not placed:
        return ""

    page_width = cursor_x - MOL_GAP + PAGE_MARGIN
    page_height = ROW_Y + max(half_heights) + PAGE_MARGIN
    page_bb = f"0 0 {page_width:.1f} {page_height:.1f}"
    page_content = "\n ".join(placed)

    pieces = [
        _format_cdxml_header(page_bb), "\n",
        f'<page BoundingBox="{page_bb}">\n',
        f' {page_content}\n',
        '</page>\n',
        _CDXML_FOOTER, "\n",
    ]
    return "".join(pieces)
|
|
1466
|
+
|
|
1467
|
+
|
|
1468
|
+
def results_to_cdxml_chemscript(
    results: List[Dict],
    verbose: bool = False,
) -> str:
    """
    Convert extracted structures to CDXML using ChemScript for cleanup.

    For every entry with a non-empty "smiles", the bridge's
    ``smiles_to_cdxml()`` is called and each <fragment> found under <page>
    in the returned document is collected together with the bounding box of
    its <n> atom positions.  The collected fragments are then laid out
    left-to-right on a single page, centred on a row at y = 300.

    Entries are skipped (not fatal) when the SMILES is missing, empty or
    None, when the bridge call raises, or when the returned CDXML is empty
    or cannot be parsed.  Multi-component SMILES are reduced with
    ``_best_smiles_component`` first.

    ``chemscript_bridge.py`` is loaded from the directory containing this
    file.

    Args:
        results: Entry dicts; only the "smiles" key is read.
        verbose: When true, progress messages are printed to stderr.

    Returns:
        The CDXML document text, or "" when no fragment could be placed.

    Raises:
        ImportError: if ``chemscript_bridge.py`` cannot be loaded.
    """
    import importlib.util
    import xml.etree.ElementTree as ET

    def log(msg: str):
        if verbose:
            print(f"[structure_from_image] {msg}", file=sys.stderr)

    # Import chemscript_bridge
    _dir = os.path.dirname(os.path.abspath(__file__))
    try:
        spec = importlib.util.spec_from_file_location(
            "chemscript_bridge", os.path.join(_dir, "chemscript_bridge.py")
        )
        csb_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(csb_module)
    except Exception as exc:
        raise ImportError(
            f"Could not import chemscript_bridge.py: {exc}\n"
            "The --cleanup flag requires ChemDraw and chemscript_bridge."
        ) from exc

    PAGE_MARGIN = 36.0   # pt from page left edge to the first slot
    MOL_GAP = 40.0       # pt gap between adjacent molecule slots
    ROW_Y = 300.0        # y-centre for the row of molecules
    LABEL_PAD = 10.0     # extra pt added around the atom bbox for labels

    # --- Build each molecule via ChemScript, extract fragment, measure bbox ---
    log("Opening ChemScript bridge...")
    cs = csb_module.ChemScriptBridge()

    frag_data: List[Tuple[str, float, float, float, float]] = []
    # Each item: (fragment_xml, xmin, ymin, xmax, ymax)

    try:
        for entry in results:
            # "smiles" may be present but None; treat that like a missing
            # key instead of crashing on None.strip().
            smiles = (entry.get("smiles") or "").strip()
            if not smiles:
                continue

            # For multi-component SMILES, pick the best fragment
            if "." in smiles:
                smiles = _best_smiles_component(smiles)

            log(f" ChemScript: {smiles[:60]}...")
            try:
                cdxml_str = cs.smiles_to_cdxml(smiles)
            except Exception as exc:
                log(f" ChemScript failed for {smiles[:40]}: {exc}")
                continue

            if not cdxml_str or "<CDXML" not in cdxml_str:
                log(" ChemScript returned empty CDXML")
                continue

            # Parse the CDXML and extract all <fragment> elements + measure
            # coords.  One unparseable document must not abort the batch,
            # matching how a failed conversion is handled above.
            try:
                root = ET.fromstring(cdxml_str)
            except ET.ParseError as exc:
                log(f" ChemScript returned unparseable CDXML for {smiles[:40]}: {exc}")
                continue

            page_el = root.find("page")
            if page_el is None:
                continue

            for frag in page_el.findall("fragment"):
                frag_xml = ET.tostring(frag, encoding="unicode")

                # Measure atom positions from <n> elements
                xs, ys = [], []
                for n in frag.findall("n"):
                    p = n.get("p")
                    if p:
                        parts = p.split()
                        if len(parts) >= 2:
                            xs.append(float(parts[0]))
                            ys.append(float(parts[1]))

                if not xs:
                    continue

                frag_data.append((
                    frag_xml,
                    min(xs), min(ys), max(xs), max(ys),
                ))
    finally:
        # Always release the bridge, even if an entry raised.
        cs.close()

    if not frag_data:
        return ""

    # --- Lay out fragments left-to-right ---
    placed_fragments: List[str] = []
    half_heights: List[float] = []
    x_cursor = PAGE_MARGIN

    for frag_xml, xmin, ymin, xmax, ymax in frag_data:
        mol_w = (xmax - xmin) + LABEL_PAD * 2
        mol_h = (ymax - ymin) + LABEL_PAD * 2
        mol_w = max(mol_w, ACS_BOND_LENGTH_PT * 2)
        mol_h = max(mol_h, ACS_BOND_LENGTH_PT * 2)

        origin_cx = (xmin + xmax) / 2.0
        origin_cy = (ymin + ymax) / 2.0

        target_cx = x_cursor + mol_w / 2.0
        target_cy = ROW_Y

        dx = target_cx - origin_cx
        dy = target_cy - origin_cy

        placed_fragments.append(_translate_atoms_xml(frag_xml, dx, dy))
        half_heights.append(mol_h / 2.0)
        x_cursor += mol_w + MOL_GAP

    if not placed_fragments:
        return ""

    page_width = x_cursor - MOL_GAP + PAGE_MARGIN
    page_height = ROW_Y + max(half_heights) + PAGE_MARGIN
    page_bb = f"0 0 {page_width:.1f} {page_height:.1f}"
    page_content = "\n ".join(placed_fragments)

    return (
        _format_cdxml_header(page_bb) + "\n"
        f'<page BoundingBox="{page_bb}">\n'
        f' {page_content}\n'
        '</page>\n'
        + _CDXML_FOOTER + "\n"
    )
|
|
1610
|
+
|
|
1611
|
+
|
|
1612
|
+
# ---------------------------------------------------------------------------
|
|
1613
|
+
# CLI
|
|
1614
|
+
# ---------------------------------------------------------------------------
|
|
1615
|
+
|
|
1616
|
+
def _build_parser() -> argparse.ArgumentParser:
|
|
1617
|
+
p = argparse.ArgumentParser(
|
|
1618
|
+
prog="structure_from_image.py",
|
|
1619
|
+
description="Extract chemical structures from images using DECIMER.",
|
|
1620
|
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
1621
|
+
epilog=__doc__.split("Notes")[0].split("Usage\n-----")[1].strip(),
|
|
1622
|
+
)
|
|
1623
|
+
p.add_argument("--input", "-i", required=True,
|
|
1624
|
+
help="Input image (PNG/JPG) or PDF file")
|
|
1625
|
+
p.add_argument("--output", "-o", default="-",
|
|
1626
|
+
help="Output file path; '-' writes JSON to stdout (default)")
|
|
1627
|
+
p.add_argument("--page", type=int, default=0,
|
|
1628
|
+
help="PDF page to process, 0-indexed (default: 0)")
|
|
1629
|
+
p.add_argument("--format", choices=["json", "cdxml"], default="json",
|
|
1630
|
+
help="Output format (default: json)")
|
|
1631
|
+
p.add_argument("--no-segment", dest="segment", action="store_false",
|
|
1632
|
+
help="Treat whole image as a single structure (skip segmentation)")
|
|
1633
|
+
p.add_argument("--hand-drawn", action="store_true",
|
|
1634
|
+
help="Use DECIMER hand-drawn model")
|
|
1635
|
+
p.add_argument("--cleanup", action="store_true",
|
|
1636
|
+
help="Use ChemScript to clean up structures — produces "
|
|
1637
|
+
"ChemDraw-native coordinates, proper aromaticity, "
|
|
1638
|
+
"and ACS 1996 style (requires ChemDraw + chemscript_bridge)")
|
|
1639
|
+
p.add_argument("--gap", type=int, default=None,
|
|
1640
|
+
help="Merge gap in pixels for segmentation box merging. "
|
|
1641
|
+
"Default: adaptive (based on image density). "
|
|
1642
|
+
"Use 0 to disable merging entirely.")
|
|
1643
|
+
p.add_argument("--verbose", "-v", action="store_true",
|
|
1644
|
+
help="Print progress messages to stderr")
|
|
1645
|
+
return p
|
|
1646
|
+
|
|
1647
|
+
|
|
1648
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """
    Command-line entry point: extract structures and emit JSON or CDXML.

    Args:
        argv: Argument list handed to the parser; None lets argparse read
            the process arguments.

    Returns:
        0 on success.  1 when the input file does not exist, OpenCV is
        unavailable, extraction or CDXML building raises, or CDXML output
        was requested but came back empty.
    """
    args = _build_parser().parse_args(argv)

    def fail(message: str) -> int:
        # Called from inside an ``except`` block, so print_exc() still sees
        # the exception currently being handled.
        print(message, file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc(file=sys.stderr)
        return 1

    # --- Preconditions ---
    if not os.path.isfile(args.input):
        print(f"ERROR: Input file not found: {args.input}", file=sys.stderr)
        return 1

    if not HAS_CV2:
        print("ERROR: opencv-python not installed. Run: pip install opencv-python",
              file=sys.stderr)
        return 1

    # --- Extraction ---
    try:
        results = extract_structures_from_image(
            image_path=args.input,
            page=args.page,
            segment=args.segment,
            hand_drawn=args.hand_drawn,
            verbose=args.verbose,
            merge_gap=args.gap,
        )
    except Exception as exc:
        return fail(f"ERROR: {exc}")

    # --- Serialization ---
    if args.format != "cdxml":
        output_str = json.dumps(results, indent=2)
    else:
        try:
            if args.cleanup:
                output_str = results_to_cdxml_chemscript(
                    results, verbose=args.verbose,
                )
            else:
                output_str = results_to_cdxml(results)
        except Exception as exc:
            return fail(f"ERROR building CDXML: {exc}")
        if not output_str:
            print("WARNING: No valid structures to write to CDXML.", file=sys.stderr)
            return 1

    # --- Delivery ---
    if args.output == "-":
        print(output_str)
        return 0

    with open(args.output, "w", encoding="utf-8") as fh:
        fh.write(output_str)
    if args.verbose:
        print(f"Wrote {args.output}", file=sys.stderr)
    return 0
|
|
1708
|
+
|
|
1709
|
+
|
|
1710
|
+
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|