cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
OLE Extractor — Extract embedded ChemDraw objects from .pptx and .docx files.
|
|
4
|
+
|
|
5
|
+
Office files (PPTX/DOCX) are ZIP archives containing OLE compound documents
|
|
6
|
+
as binary blobs. ChemDraw objects are stored as CDX data inside the OLE
|
|
7
|
+
"CONTENTS" stream. This tool extracts and optionally converts them to CDXML.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python ole_extractor.py input.pptx [-o output_dir/] [--format cdxml|cdx|both]
|
|
11
|
+
python ole_extractor.py input.docx [-o output_dir/] [--format cdxml|cdx|both]
|
|
12
|
+
|
|
13
|
+
Requires: olefile, cdx_converter (for CDXML conversion)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import io
|
|
18
|
+
import os
|
|
19
|
+
import sys
|
|
20
|
+
import zipfile
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from typing import List, Optional
|
|
23
|
+
|
|
24
|
+
import olefile
|
|
25
|
+
|
|
26
|
+
# Root-storage CLSIDs that mark an OLE container as a ChemDraw object.
# Compared against the upper-cased CLSID of the container's root entry.
CHEMDRAW_CLSIDS = {
    "41BA6D21-A02E-11CE-8FD9-0020AFD1F20C",  # ChemDraw Drawing
}

# Four-byte signature expected at the start of a binary CDX payload.
CDX_MAGIC = b"VjCD"

# Directory inside the Office ZIP that holds OLE embeddings, keyed by the
# (lower-cased) file extension of the Office document.
EMBEDDING_PATTERNS = {
    ".pptx": "ppt/embeddings/",
    ".docx": "word/embeddings/",
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class ExtractedObject:
    """Result record for one ChemDraw object extracted from an Office file.

    Attributes:
        source_path: Path of the OLE entry inside the ZIP
            (e.g. ``ppt/embeddings/oleObject1.bin``).
        cdx_data: Raw CDX bytes of the object.
        cdx_output: Path where the CDX file was saved, or None.
        cdxml_output: Path where the CDXML file was saved, or None.
        error: Human-readable note about a problem, or None.
    """

    source_path: str
    cdx_data: bytes
    cdx_output: Optional[str] = None
    cdxml_output: Optional[str] = None
    error: Optional[str] = None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def find_ole_entries(zip_path: str) -> List[str]:
    """Return the names of OLE embedding entries inside a PPTX/DOCX archive.

    Args:
        zip_path: Path to the Office file. Its extension (case-insensitive)
            selects which embedding directory is searched.

    Returns:
        ZIP member names located under that directory whose name ends in
        ``.bin`` (case-insensitive), in the order the archive lists them.

    Raises:
        ValueError: If the extension has no entry in ``EMBEDDING_PATTERNS``.
    """
    extension = os.path.splitext(zip_path)[1].lower()
    embed_dir = EMBEDDING_PATTERNS.get(extension)
    if embed_dir is None:
        raise ValueError(
            f"Unsupported file type: {extension}. Use .pptx or .docx."
        )

    matches: List[str] = []
    with zipfile.ZipFile(zip_path, "r") as archive:
        for member in archive.namelist():
            if member.startswith(embed_dir) and member.lower().endswith(".bin"):
                matches.append(member)
    return matches
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def is_chemdraw_ole(ole: olefile.OleFileIO) -> bool:
    """Decide whether an open OLE container holds a ChemDraw object.

    The container qualifies when either test succeeds:

    1. its root CLSID, upper-cased, is listed in ``CHEMDRAW_CLSIDS``; or
    2. it has a ``CONTENTS`` stream whose first four bytes equal
       ``CDX_MAGIC``.

    Args:
        ole: An already-opened ``olefile.OleFileIO`` container.

    Returns:
        True if the container looks like a ChemDraw object, else False.
    """
    # Test 1: class identifier on the root storage (may be empty/None).
    root_clsid = ole.root.clsid
    normalized = root_clsid.upper() if root_clsid else ""
    if normalized in CHEMDRAW_CLSIDS:
        return True

    # Test 2: sniff the payload signature.
    if not ole.exists("CONTENTS"):
        return False
    return ole.openstream("CONTENTS").read(4) == CDX_MAGIC
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def extract_cdx_from_ole(ole_data: bytes) -> Optional[bytes]:
    """Pull the raw CDX payload out of an OLE compound document.

    Args:
        ole_data: Complete bytes of one embedded OLE object.

    Returns:
        The CDX bytes (starting with ``CDX_MAGIC``), or None when the data is
        not an OLE file, is not recognised as ChemDraw by
        ``is_chemdraw_ole``, or contains no stream starting with the CDX
        signature.
    """
    if not olefile.isOleFile(io.BytesIO(ole_data)):
        return None

    container = olefile.OleFileIO(io.BytesIO(ole_data))
    try:
        if not is_chemdraw_ole(container):
            return None

        # Preferred location: the CONTENTS stream holds the CDX verbatim.
        if container.exists("CONTENTS"):
            payload = container.openstream("CONTENTS").read()
            if payload.startswith(CDX_MAGIC):
                return payload

        # Fallback location: \x01Ole10Native, where the CDX is looked for
        # right after the first four bytes (treated as a length prefix).
        if container.exists("\x01Ole10Native"):
            native = container.openstream("\x01Ole10Native").read()
            body = native[4:]
            if len(native) > 4 and body.startswith(CDX_MAGIC):
                return body

        return None
    finally:
        # Always release the container, whichever branch returned.
        container.close()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _save_cdx(output_dir: str, entry_name: str, cdx_data: bytes) -> str:
    """Write *cdx_data* to ``<output_dir>/<entry_name>.cdx`` and return that path."""
    cdx_path = os.path.join(output_dir, f"{entry_name}.cdx")
    with open(cdx_path, "wb") as f:
        f.write(cdx_data)
    return cdx_path


def extract_from_office(
    input_path: str,
    output_dir: Optional[str] = None,
    output_format: str = "cdxml",
    convert_method: str = "auto",
) -> List[ExtractedObject]:
    """Extract all ChemDraw objects from a PPTX/DOCX file.

    Args:
        input_path: Path to .pptx or .docx file.
        output_dir: Directory for extracted files. Default: a
            ``<basename>_chemdraw`` directory next to the input file.
            Created if missing.
        output_format: "cdx", "cdxml", or "both".
        convert_method: Backend for CDX→CDXML conversion (passed to
            cdx_converter as ``method``).

    Returns:
        One ExtractedObject per embedding that yielded CDX data. Embeddings
        that are not ChemDraw objects are skipped. When CDXML was requested
        but could not be produced (converter missing, or conversion raised),
        ``error`` is set and the CDX is saved instead so the data is not lost.

    Raises:
        ValueError: From ``find_ole_entries`` for unsupported extensions.
    """
    if output_dir is None:
        basename = os.path.splitext(os.path.basename(input_path))[0]
        output_dir = os.path.join(
            os.path.dirname(input_path) or ".", f"{basename}_chemdraw"
        )

    os.makedirs(output_dir, exist_ok=True)

    want_cdx = output_format in ("cdx", "both")
    want_cdxml = output_format in ("cdxml", "both")

    # Lazy import — only needed if converting to CDXML
    _converter = None
    if want_cdxml:
        try:
            from ..chemdraw import cdx_converter
            _converter = cdx_converter
        except ImportError:
            print(
                "Warning: cdx_converter not found. CDX files will be saved "
                "but CDXML conversion is unavailable.",
                file=sys.stderr,
            )

    ole_entries = find_ole_entries(input_path)
    results = []

    with zipfile.ZipFile(input_path, "r") as zf:
        for entry in ole_entries:
            cdx_data = extract_cdx_from_ole(zf.read(entry))
            if cdx_data is None:
                # Not a ChemDraw object — skip silently
                continue

            # Derive output filename from ZIP entry
            entry_name = os.path.splitext(os.path.basename(entry))[0]
            obj = ExtractedObject(source_path=entry, cdx_data=cdx_data)

            if want_cdx:
                obj.cdx_output = _save_cdx(output_dir, entry_name, cdx_data)

            if want_cdxml:
                if _converter is None:
                    obj.error = "cdx_converter unavailable; saved CDX only"
                else:
                    cdxml_path = os.path.join(output_dir, f"{entry_name}.cdxml")
                    try:
                        cdxml_str = _converter.convert_cdx_to_cdxml(
                            cdx_data, method=convert_method
                        )
                        with open(cdxml_path, "w", encoding="utf-8") as f:
                            f.write(cdxml_str)
                        obj.cdxml_output = cdxml_path
                    except Exception as e:
                        # Best effort: record the failure and keep going so
                        # one bad object does not abort the whole file.
                        obj.error = f"CDXML conversion failed: {e}"

                # Single fallback site: if no CDXML was produced and the CDX
                # has not been written yet, save the CDX so nothing is lost.
                if obj.cdxml_output is None and obj.cdx_output is None:
                    obj.cdx_output = _save_cdx(output_dir, entry_name, cdx_data)

            results.append(obj)

    return results
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def print_summary(results: List[ExtractedObject], input_path: str) -> None:
    """Write a human-readable extraction report to stdout.

    Args:
        results: Objects returned by the extraction step.
        input_path: Path of the Office file; only its basename is shown.

    For each object the report lists its source path and CDX size, any saved
    CDX/CDXML paths (the CDXML size is read from disk), and any error note.
    """
    rule = "=" * 60
    print(f"{rule}")
    print(f"OLE Extractor - {os.path.basename(input_path)}")
    print(f"{rule}")

    if not results:
        print("No ChemDraw objects found.")
        return

    print(f"Found {len(results)} ChemDraw object(s):\n")
    for index, item in enumerate(results, start=1):
        print(f" [{index}] {item.source_path}")
        print(f" CDX size: {len(item.cdx_data):,} bytes")
        if item.cdx_output:
            print(f" CDX: {item.cdx_output}")
        if item.cdxml_output:
            cdxml_bytes = os.path.getsize(item.cdxml_output)
            print(f" CDXML: {item.cdxml_output} ({cdxml_bytes:,} bytes)")
        if item.error:
            print(f" Note: {item.error}")
        print()
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
# ---------------------------------------------------------------------------
|
|
227
|
+
# CLI
|
|
228
|
+
# ---------------------------------------------------------------------------
|
|
229
|
+
|
|
230
|
+
def main(argv=None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; None lets argparse read ``sys.argv``.

    Returns:
        0 on success; 1 if the input file does not exist or extraction
        raised (the message is printed to stderr).
    """
    cli = argparse.ArgumentParser(
        description="Extract embedded ChemDraw objects from .pptx/.docx files."
    )
    cli.add_argument("input", help="Input file (.pptx or .docx)")

    # Optional flags, registered in the same order as they appear in --help.
    option_specs = (
        (
            ("-o", "--output-dir"),
            {"help": "Output directory (default: <input_basename>_chemdraw/)"},
        ),
        (
            ("--format",),
            {
                "choices": ["cdxml", "cdx", "both"],
                "default": "cdxml",
                "help": "Output format (default: cdxml)",
            },
        ),
        (
            ("--method",),
            {
                "choices": ["auto", "com", "pycdxml", "obabel"],
                "default": "auto",
                "help": "CDX→CDXML conversion backend (default: auto)",
            },
        ),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    args = cli.parse_args(argv)

    if not os.path.isfile(args.input):
        print(f"Error: file not found: {args.input}", file=sys.stderr)
        return 1

    # Top-level boundary: report any failure as a message plus exit code 1.
    try:
        extracted = extract_from_office(
            args.input,
            output_dir=args.output_dir,
            output_format=args.format,
            convert_method=args.method,
        )
        print_summary(extracted, args.input)
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Perception — reading and understanding reaction schemes.
|
|
2
|
+
|
|
3
|
+
Everything about extracting semantic meaning from CDXML: which fragments are
|
|
4
|
+
reactants vs products vs reagents, what the arrows connect, what the text
|
|
5
|
+
labels mean.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .scheme_reader import read_scheme, SchemeDescription
|
|
9
|
+
from .reaction_parser import parse_reaction, ReactionDescriptor, reaction_summary
|
|
10
|
+
from .scheme_segmenter import segment_scheme, classify_scheme_complexity
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
compound_search.py — Search for a molecule across a directory of experiments.
|
|
4
|
+
|
|
5
|
+
Given a query SMILES and a directory of experiment subdirectories (each
|
|
6
|
+
containing ELN exports: .cdxml, .csv, .rxn), parses every experiment and
|
|
7
|
+
compares the query against all species using RDKit exact-match and Tanimoto
|
|
8
|
+
fingerprint similarity.
|
|
9
|
+
|
|
10
|
+
Python API:
|
|
11
|
+
from cdxml_toolkit.perception.compound_search import search_compound
|
|
12
|
+
results = search_compound(
|
|
13
|
+
smiles="NCCCCCOc1cccc2c1C(=O)N(C1CCC(=O)NC1=O)C2=O",
|
|
14
|
+
experiment_dir="/path/to/KL-7001",
|
|
15
|
+
)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import traceback
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any, Dict, List, Optional
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Helpers
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
def _canonical(smiles: str) -> Optional[str]:
|
|
30
|
+
"""Return canonical SMILES, or None if invalid."""
|
|
31
|
+
try:
|
|
32
|
+
from rdkit import Chem
|
|
33
|
+
mol = Chem.MolFromSmiles(smiles)
|
|
34
|
+
if mol is None:
|
|
35
|
+
return None
|
|
36
|
+
return Chem.MolToSmiles(mol)
|
|
37
|
+
except Exception:
|
|
38
|
+
return None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _morgan_fp(smiles: str):
|
|
42
|
+
"""Return Morgan fingerprint (radius=2, 2048 bits), or None."""
|
|
43
|
+
try:
|
|
44
|
+
from rdkit import Chem
|
|
45
|
+
from rdkit.Chem import AllChem
|
|
46
|
+
mol = Chem.MolFromSmiles(smiles)
|
|
47
|
+
if mol is None:
|
|
48
|
+
return None
|
|
49
|
+
return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
|
|
50
|
+
except Exception:
|
|
51
|
+
return None
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _tanimoto(fp1, fp2) -> float:
|
|
55
|
+
"""Tanimoto similarity between two RDKit fingerprints."""
|
|
56
|
+
from rdkit import DataStructs
|
|
57
|
+
return DataStructs.TanimotoSimilarity(fp1, fp2)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _discover_files(exp_dir: Path):
|
|
61
|
+
"""Return (cdxml_path, csv_path) for a single experiment subdirectory."""
|
|
62
|
+
cdxml_files = list(exp_dir.glob("*.cdxml"))
|
|
63
|
+
csv_files = list(exp_dir.glob("*.csv"))
|
|
64
|
+
cdxml = str(cdxml_files[0]) if cdxml_files else None
|
|
65
|
+
csv = str(csv_files[0]) if csv_files else None
|
|
66
|
+
return cdxml, csv
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# Public API
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
def search_compound(
    smiles: str,
    experiment_dir: str,
    similarity_threshold: float = 0.85,
) -> Dict[str, Any]:
    """Look for a molecule (given as SMILES) in every experiment of a directory.

    Each immediate subdirectory of *experiment_dir* is treated as one
    experiment. Subdirectories without a .cdxml file are skipped; the rest
    are parsed with ``parse_reaction`` (offline, non-verbose) and every
    species is compared with the query.

    Args:
        smiles: Query molecule as a SMILES string.
        experiment_dir: Directory whose immediate subdirectories are the
            individual experiments.
        similarity_threshold: Minimum Tanimoto similarity for a species to be
            listed in ``similar_matches``. Exact matches (same canonical
            SMILES, for either the full or the neutral form of the species)
            are reported regardless of this value and are not repeated as
            similar matches.

    Returns:
        On success, a dict with keys ``ok`` (True), ``query_smiles``,
        ``query_canonical``, ``exact_matches``, ``similar_matches`` (sorted by
        descending similarity), ``experiments_searched`` (number of
        subdirectories), ``experiments_parsed_ok`` and ``parse_errors`` (one
        dict per failed experiment with ``experiment``, ``error`` and
        ``traceback``).

        If the query cannot be canonicalised or *experiment_dir* is not a
        directory, a dict with only ``ok`` (False), ``error`` and
        ``query_smiles``.
    """
    from cdxml_toolkit.perception.reaction_parser import parse_reaction

    # --- Validate inputs -------------------------------------------------
    query_canonical = _canonical(smiles)
    if query_canonical is None:
        return {
            "ok": False,
            "error": f"Invalid query SMILES: {smiles!r}",
            "query_smiles": smiles,
        }

    query_fp = _morgan_fp(query_canonical)

    root = Path(experiment_dir)
    if not root.is_dir():
        return {
            "ok": False,
            "error": f"experiment_dir does not exist or is not a directory: {experiment_dir!r}",
            "query_smiles": smiles,
        }

    # Experiments are the immediate child directories, in sorted order.
    exp_dirs = [child for child in root.iterdir() if child.is_dir()]
    exp_dirs.sort()

    exact_matches: List[Dict[str, Any]] = []
    similar_matches: List[Dict[str, Any]] = []
    parse_errors: List[Dict[str, str]] = []
    experiments_parsed_ok = 0

    for exp_dir in exp_dirs:
        exp_name = exp_dir.name
        cdxml, csv = _discover_files(exp_dir)
        if cdxml is None:
            # Nothing to parse in this experiment.
            continue

        try:
            desc = parse_reaction(
                cdxml=cdxml,
                csv=csv,
                use_network=False,  # keep offline for batch search
                verbose=False,
            )
        except Exception as exc:
            # One unreadable experiment must not abort the whole search.
            parse_errors.append({
                "experiment": exp_name,
                "error": f"{type(exc).__name__}: {exc}",
                "traceback": traceback.format_exc(),
            })
            continue

        experiments_parsed_ok += 1

        # Optional ELN extras, read once per experiment; only truthy values
        # are copied onto exact-match records, in this key order.
        eln = desc.eln_data or {}
        eln_extras = {
            "yield": eln.get("product_yield"),
            "amount_obtained": eln.get("product_obtained"),
            "sm_mass": eln.get("sm_mass"),
        }

        for sp in desc.species:
            # Prefer the full SMILES and fall back to the neutral form.
            raw_smiles = sp.smiles or sp.smiles_neutral
            if not raw_smiles:
                continue

            full_canonical = _canonical(raw_smiles)
            if full_canonical is None:
                continue

            # Canonical form of the species' neutral SMILES, when it has one.
            neutral_canonical = None
            if sp.smiles_neutral:
                neutral_canonical = _canonical(sp.smiles_neutral)

            # --- Exact match: full form or neutral form equals the query,
            # so e.g. "NCCCCCOc1..." also matches "Cl.NCCCCCOc1...".
            neutral_hit = bool(neutral_canonical) and (
                neutral_canonical == query_canonical
            )
            if full_canonical == query_canonical or neutral_hit:
                if neutral_canonical == query_canonical:
                    reported = neutral_canonical
                else:
                    reported = full_canonical

                hit: Dict[str, Any] = {
                    "experiment": exp_name,
                    "species_name": sp.name or sp.id,
                    "role": sp.role,
                    "smiles": reported,
                }
                if full_canonical != reported:
                    hit["smiles_full"] = full_canonical  # show the salt form too
                for key, value in eln_extras.items():
                    if value:
                        hit[key] = value
                exact_matches.append(hit)
                continue  # an exact hit is never also listed as similar

            # --- Similarity match: fingerprint the neutral form when
            # available (avoids low similarity for salt vs free base).
            if query_fp is None:
                continue
            species_fp = _morgan_fp(neutral_canonical or full_canonical)
            if species_fp is None:
                continue
            score = _tanimoto(query_fp, species_fp)
            if score >= similarity_threshold:
                similar_matches.append({
                    "experiment": exp_name,
                    "species_name": sp.name or sp.id,
                    "role": sp.role,
                    "similarity": round(score, 4),
                    "smiles": full_canonical,
                })

    # Best matches first.
    similar_matches.sort(key=lambda match: match["similarity"], reverse=True)

    return {
        "ok": True,
        "query_smiles": smiles,
        "query_canonical": query_canonical,
        "exact_matches": exact_matches,
        "similar_matches": similar_matches,
        "experiments_searched": len(exp_dirs),
        "experiments_parsed_ok": experiments_parsed_ok,
        "parse_errors": parse_errors,
    }
|