cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,272 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OLE Extractor — Extract embedded ChemDraw objects from .pptx and .docx files.
4
+
5
+ Office files (PPTX/DOCX) are ZIP archives containing OLE compound documents
6
+ as binary blobs. ChemDraw objects are stored as CDX data inside the OLE
7
+ "CONTENTS" stream. This tool extracts and optionally converts them to CDXML.
8
+
9
+ Usage:
10
+ python ole_extractor.py input.pptx [-o output_dir/] [--format cdxml|cdx|both]
11
+ python ole_extractor.py input.docx [-o output_dir/] [--format cdxml|cdx|both]
12
+
13
+ Requires: olefile, cdx_converter (for CDXML conversion)
14
+ """
15
+
16
+ import argparse
17
+ import io
18
+ import os
19
+ import sys
20
+ import zipfile
21
+ from dataclasses import dataclass, field
22
+ from typing import List, Optional
23
+
24
+ import olefile
25
+
26
# OLE class IDs (CLSIDs) treated as ChemDraw objects. Only the
# "ChemDraw Drawing" CLSID is listed; values are upper-case because the
# root CLSID is upper-cased before the membership test.
CHEMDRAW_CLSIDS = {
    "41BA6D21-A02E-11CE-8FD9-0020AFD1F20C",  # ChemDraw Drawing
}

# CDX binary magic bytes (first four bytes of a CDX payload)
CDX_MAGIC = b"VjCD"

# Where Office stores OLE embeddings, keyed by lower-case file extension
EMBEDDING_PATTERNS = {
    ".pptx": "ppt/embeddings/",
    ".docx": "word/embeddings/",
}
39
+
40
+
41
@dataclass
class ExtractedObject:
    """A single ChemDraw object extracted from an Office file.

    ``cdx_output`` and ``cdxml_output`` stay ``None`` until the matching
    file has been written. ``error`` holds a human-readable note when the
    CDXML conversion failed or no converter was available.
    """
    source_path: str  # path inside ZIP (e.g. ppt/embeddings/oleObject1.bin)
    cdx_data: bytes  # raw CDX bytes pulled out of the OLE container
    cdx_output: Optional[str] = None  # path where CDX was saved
    cdxml_output: Optional[str] = None  # path where CDXML was saved
    error: Optional[str] = None  # conversion failure / unavailability note
49
+
50
+
51
def find_ole_entries(zip_path: str) -> List[str]:
    """Return the archive member names of OLE embeddings in a PPTX/DOCX.

    The embedding folder is chosen from the file extension
    (case-insensitive). Only members under that folder whose name ends in
    ``.bin`` are returned, in archive order.

    Raises:
        ValueError: If the extension is neither ``.pptx`` nor ``.docx``.
    """
    _, ext = os.path.splitext(zip_path)
    ext = ext.lower()
    if ext not in EMBEDDING_PATTERNS:
        raise ValueError(
            f"Unsupported file type: {ext}. Use .pptx or .docx."
        )
    prefix = EMBEDDING_PATTERNS[ext]

    entries = []
    with zipfile.ZipFile(zip_path, "r") as archive:
        for member in archive.namelist():
            if member.startswith(prefix) and member.lower().endswith(".bin"):
                entries.append(member)
    return entries
65
+
66
+
67
def is_chemdraw_ole(ole: olefile.OleFileIO) -> bool:
    """Tell whether an open OLE container holds a ChemDraw object.

    The container qualifies when its root CLSID is a known ChemDraw CLSID,
    or when it has a ``CONTENTS`` stream that begins with the CDX magic
    bytes.
    """
    root_clsid = ole.root.clsid
    normalized = root_clsid.upper() if root_clsid else ""
    if normalized in CHEMDRAW_CLSIDS:
        return True

    # No recognised CLSID: fall back to sniffing the CONTENTS stream.
    if not ole.exists("CONTENTS"):
        return False
    return ole.openstream("CONTENTS").read(4) == CDX_MAGIC
81
+
82
+
83
def extract_cdx_from_ole(ole_data: bytes) -> Optional[bytes]:
    """Pull the raw CDX payload out of an OLE compound document.

    Args:
        ole_data: Complete bytes of one OLE embedding.

    Returns:
        The CDX bytes (starting with the CDX magic), or ``None`` when the
        data is not an OLE file, is not a ChemDraw object, or contains no
        stream that starts with the CDX magic.
    """
    if not olefile.isOleFile(io.BytesIO(ole_data)):
        return None

    container = olefile.OleFileIO(io.BytesIO(ole_data))
    try:
        if not is_chemdraw_ole(container):
            return None

        # Primary location: the CONTENTS stream holds the CDX directly.
        if container.exists("CONTENTS"):
            contents = container.openstream("CONTENTS").read()
            if contents.startswith(CDX_MAGIC):
                return contents

        # Fallback: \x01Ole10Native stores a 4-byte length prefix
        # followed by the payload.
        native_stream = "\x01Ole10Native"
        if container.exists(native_stream):
            payload = container.openstream(native_stream).read()[4:]
            if payload.startswith(CDX_MAGIC):
                return payload

        return None
    finally:
        container.close()
108
+
109
+
110
def extract_from_office(
    input_path: str,
    output_dir: Optional[str] = None,
    output_format: str = "cdxml",
    convert_method: str = "auto",
) -> List[ExtractedObject]:
    """Extract all ChemDraw objects from a PPTX/DOCX file.

    Embeddings that are not ChemDraw objects are skipped silently. When
    CDXML output is requested but conversion fails or the converter cannot
    be imported, the raw CDX is saved instead and the reason is recorded in
    ``ExtractedObject.error``.

    Args:
        input_path: Path to .pptx or .docx file.
        output_dir: Directory for extracted files. Default: <basename>_chemdraw/
            next to the input file.
        output_format: "cdx", "cdxml", or "both".
        convert_method: Backend for CDX→CDXML conversion (passed to cdx_converter).

    Returns:
        List of ExtractedObject with extraction results, one per ChemDraw
        embedding found.

    Raises:
        ValueError: If ``input_path`` is not a .pptx/.docx file (raised by
            ``find_ole_entries``).
    """
    # Validate the input (extension, readable ZIP) before touching the
    # filesystem, so a bad input does not leave an empty output directory.
    ole_entries = find_ole_entries(input_path)

    if output_dir is None:
        basename = os.path.splitext(os.path.basename(input_path))[0]
        output_dir = os.path.join(
            os.path.dirname(input_path) or ".", f"{basename}_chemdraw"
        )
    os.makedirs(output_dir, exist_ok=True)

    want_cdx = output_format in ("cdx", "both")
    want_cdxml = output_format in ("cdxml", "both")

    # Lazy import — only needed if converting to CDXML
    _converter = None
    if want_cdxml:
        try:
            from ..chemdraw import cdx_converter
            _converter = cdx_converter
        except ImportError:
            print(
                "Warning: cdx_converter not found. CDX files will be saved "
                "but CDXML conversion is unavailable.",
                file=sys.stderr,
            )

    def _save_cdx(obj: ExtractedObject, stem: str) -> None:
        """Write the raw CDX for *obj* once and record where it went."""
        if obj.cdx_output is not None:
            return
        cdx_path = os.path.join(output_dir, f"{stem}.cdx")
        with open(cdx_path, "wb") as f:
            f.write(obj.cdx_data)
        obj.cdx_output = cdx_path

    results: List[ExtractedObject] = []

    with zipfile.ZipFile(input_path, "r") as zf:
        for entry in ole_entries:
            cdx_data = extract_cdx_from_ole(zf.read(entry))
            if cdx_data is None:
                # Not a ChemDraw object — skip silently
                continue

            # Derive output filename from ZIP entry
            entry_name = os.path.splitext(os.path.basename(entry))[0]
            obj = ExtractedObject(source_path=entry, cdx_data=cdx_data)

            if want_cdx:
                _save_cdx(obj, entry_name)

            if want_cdxml:
                if _converter is None:
                    # No converter — save CDX instead
                    _save_cdx(obj, entry_name)
                    obj.error = "cdx_converter unavailable; saved CDX only"
                else:
                    cdxml_path = os.path.join(output_dir, f"{entry_name}.cdxml")
                    try:
                        cdxml_str = _converter.convert_cdx_to_cdxml(
                            cdx_data, method=convert_method
                        )
                        with open(cdxml_path, "w", encoding="utf-8") as f:
                            f.write(cdxml_str)
                        obj.cdxml_output = cdxml_path
                    except Exception as e:
                        # Best effort: keep the object, note the failure,
                        # and still save CDX as fallback.
                        obj.error = f"CDXML conversion failed: {e}"
                        _save_cdx(obj, entry_name)

            results.append(obj)

    return results
200
+
201
+
202
def print_summary(results: List[ExtractedObject], input_path: str) -> None:
    """Write a human-readable extraction report to stdout.

    Prints a banner with the input file name, then one entry per extracted
    object: its archive path, CDX size, any saved file paths, and any
    error note.
    """
    banner = "=" * 60
    print(banner)
    print(f"OLE Extractor - {os.path.basename(input_path)}")
    print(banner)

    if not results:
        print("No ChemDraw objects found.")
        return

    print(f"Found {len(results)} ChemDraw object(s):\n")
    for number, extracted in enumerate(results, start=1):
        print(f" [{number}] {extracted.source_path}")
        print(f" CDX size: {len(extracted.cdx_data):,} bytes")
        if extracted.cdx_output:
            print(f" CDX: {extracted.cdx_output}")
        if extracted.cdxml_output:
            # Size is read from disk, so the file must still exist.
            cdxml_size = os.path.getsize(extracted.cdxml_output)
            print(f" CDXML: {extracted.cdxml_output} ({cdxml_size:,} bytes)")
        if extracted.error:
            print(f" Note: {extracted.error}")
        print()
224
+
225
+
226
+ # ---------------------------------------------------------------------------
227
+ # CLI
228
+ # ---------------------------------------------------------------------------
229
+
230
def main(argv=None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use
            ``sys.argv``.

    Returns:
        0 on success, 1 if the input file is missing or extraction raised.
    """
    cli = argparse.ArgumentParser(
        description="Extract embedded ChemDraw objects from .pptx/.docx files."
    )
    cli.add_argument("input", help="Input file (.pptx or .docx)")
    cli.add_argument(
        "-o", "--output-dir",
        help="Output directory (default: <input_basename>_chemdraw/)",
    )
    cli.add_argument(
        "--format",
        choices=["cdxml", "cdx", "both"],
        default="cdxml",
        help="Output format (default: cdxml)",
    )
    cli.add_argument(
        "--method",
        choices=["auto", "com", "pycdxml", "obabel"],
        default="auto",
        help="CDX→CDXML conversion backend (default: auto)",
    )
    options = cli.parse_args(argv)

    if not os.path.isfile(options.input):
        print(f"Error: file not found: {options.input}", file=sys.stderr)
        return 1

    # Top-level boundary: report any failure on stderr and turn it into
    # a non-zero exit status.
    try:
        extracted = extract_from_office(
            options.input,
            output_dir=options.output_dir,
            output_format=options.format,
            convert_method=options.method,
        )
        print_summary(extracted, options.input)
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,10 @@
1
+ """Perception — reading and understanding reaction schemes.
2
+
3
+ Everything about extracting semantic meaning from CDXML: which fragments are
4
+ reactants vs products vs reagents, what the arrows connect, what the text
5
+ labels mean.
6
+ """
7
+
8
+ from .scheme_reader import read_scheme, SchemeDescription
9
+ from .reaction_parser import parse_reaction, ReactionDescriptor, reaction_summary
10
+ from .scheme_segmenter import segment_scheme, classify_scheme_complexity
@@ -0,0 +1,229 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ compound_search.py — Search for a molecule across a directory of experiments.
4
+
5
+ Given a query SMILES and a directory of experiment subdirectories (each
6
+ containing ELN exports: .cdxml, .csv, .rxn), parses every experiment and
7
+ compares the query against all species using RDKit exact-match and Tanimoto
8
+ fingerprint similarity.
9
+
10
+ Python API:
11
+ from cdxml_toolkit.perception.compound_search import search_compound
12
+ results = search_compound(
13
+ smiles="NCCCCCOc1cccc2c1C(=O)N(C1CCC(=O)NC1=O)C2=O",
14
+ experiment_dir="/path/to/KL-7001",
15
+ )
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import traceback
21
+ from pathlib import Path
22
+ from typing import Any, Dict, List, Optional
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Helpers
27
+ # ---------------------------------------------------------------------------
28
+
29
def _canonical(smiles: str) -> Optional[str]:
    """Canonicalise *smiles* with RDKit.

    Returns the canonical SMILES string, or ``None`` when RDKit cannot
    parse the input or any exception (including a failed RDKit import)
    occurs.
    """
    try:
        from rdkit import Chem
        parsed = Chem.MolFromSmiles(smiles)
        return Chem.MolToSmiles(parsed) if parsed is not None else None
    except Exception:
        return None
39
+
40
+
41
def _morgan_fp(smiles: str):
    """Build a Morgan bit-vector fingerprint (radius=2, 2048 bits).

    Returns ``None`` when the SMILES cannot be parsed or any exception
    (including a failed RDKit import) occurs.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem
        parsed = Chem.MolFromSmiles(smiles)
        return (
            AllChem.GetMorganFingerprintAsBitVect(parsed, radius=2, nBits=2048)
            if parsed is not None
            else None
        )
    except Exception:
        return None
52
+
53
+
54
def _tanimoto(fp1, fp2) -> float:
    """Return the Tanimoto similarity between two RDKit fingerprints.

    Thin wrapper around ``DataStructs.TanimotoSimilarity``; RDKit is
    imported at call time rather than at module import.
    """
    from rdkit import DataStructs
    return DataStructs.TanimotoSimilarity(fp1, fp2)
58
+
59
+
60
def _discover_files(exp_dir: Path):
    """Return ``(cdxml_path, csv_path)`` for a single experiment directory.

    Each element is the path (as ``str``) of the first matching regular
    file in sorted order, or ``None`` when no such file exists. Sorting
    makes the choice deterministic when several files match, since glob
    results otherwise follow filesystem order. Directories whose name
    happens to match the pattern are ignored.
    """
    def _first(pattern: str) -> Optional[str]:
        """Pick the lexicographically first regular file matching *pattern*."""
        matches = sorted(p for p in exp_dir.glob(pattern) if p.is_file())
        return str(matches[0]) if matches else None

    return _first("*.cdxml"), _first("*.csv")
67
+
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # Public API
71
+ # ---------------------------------------------------------------------------
72
+
73
def search_compound(
    smiles: str,
    experiment_dir: str,
    similarity_threshold: float = 0.85,
) -> Dict[str, Any]:
    """Search for a molecule (by SMILES) across all experiments in a directory.

    Each immediate subdirectory that contains a ``.cdxml`` file is parsed
    with ``parse_reaction`` (offline). Every species is compared with the
    query, first by canonical SMILES (full form and neutral form), then by
    Tanimoto similarity of Morgan fingerprints.

    Args:
        smiles: Query molecule as a SMILES string.
        experiment_dir: Path to a directory whose immediate subdirectories are
            individual experiments (each containing .cdxml / .csv files).
        similarity_threshold: Minimum Tanimoto similarity (0–1) for a species
            to appear in ``similar_matches``. Exact matches (same canonical
            SMILES) are always reported regardless of this threshold.

    Returns:
        On success, a dict with keys:
            ok (bool), query_smiles (str), query_canonical (str),
            exact_matches (list), similar_matches (list, sorted by
            descending similarity),
            experiments_searched (int, number of subdirectories seen,
            including those skipped for lacking a .cdxml),
            experiments_parsed_ok (int),
            parse_errors (list of
            {"experiment": str, "error": str, "traceback": str}).
        On an invalid query SMILES or a missing directory, a dict with
        ``ok`` False plus ``error`` and ``query_smiles``.
    """
    from cdxml_toolkit.perception.reaction_parser import parse_reaction

    # Validate query
    query_canonical = _canonical(smiles)
    if query_canonical is None:
        return {
            "ok": False,
            "error": f"Invalid query SMILES: {smiles!r}",
            "query_smiles": smiles,
        }

    query_fp = _morgan_fp(query_canonical)

    root = Path(experiment_dir)
    if not root.is_dir():
        return {
            "ok": False,
            "error": f"experiment_dir does not exist or is not a directory: {experiment_dir!r}",
            "query_smiles": smiles,
        }

    # Collect experiment subdirectories (immediate children only)
    exp_dirs = sorted(p for p in root.iterdir() if p.is_dir())

    exact_matches: List[Dict[str, Any]] = []
    similar_matches: List[Dict[str, Any]] = []
    parse_errors: List[Dict[str, str]] = []
    experiments_parsed_ok = 0

    for exp_dir in exp_dirs:
        exp_name = exp_dir.name
        cdxml, csv = _discover_files(exp_dir)

        if cdxml is None:
            # No CDXML → skip silently (no reaction to parse)
            continue

        try:
            desc = parse_reaction(
                cdxml=cdxml,
                csv=csv,
                use_network=False,  # keep offline for batch search
                verbose=False,
            )
        except Exception as exc:
            # One broken experiment must not abort the whole batch.
            parse_errors.append({
                "experiment": exp_name,
                "error": f"{type(exc).__name__}: {exc}",
                "traceback": traceback.format_exc(),
            })
            continue

        experiments_parsed_ok += 1

        # Pull eln_data fields once per experiment
        eln = desc.eln_data or {}
        product_yield = eln.get("product_yield")
        product_obtained = eln.get("product_obtained")
        sm_mass = eln.get("sm_mass")

        for sp in desc.species:
            # Canonicalise both forms independently so that an unparseable
            # full SMILES does not hide a valid neutral (salt-free) form.
            sp_full_canonical = _canonical(sp.smiles) if sp.smiles else None
            sp_neutral_canonical = (
                _canonical(sp.smiles_neutral) if sp.smiles_neutral else None
            )

            # Prefer the full form; fall back to the neutral one.
            sp_canonical = sp_full_canonical or sp_neutral_canonical
            if sp_canonical is None:
                continue

            # --- Exact match ---
            # Check both the full canonical and the neutral (salt-free) form.
            # This lets "NCCCCCOc1..." match "Cl.NCCCCCOc1..." (HCl salt).
            neutral_is_query = sp_neutral_canonical == query_canonical
            if sp_canonical == query_canonical or neutral_is_query:
                reported_smiles = (
                    sp_neutral_canonical if neutral_is_query else sp_canonical
                )
                record: Dict[str, Any] = {
                    "experiment": exp_name,
                    "species_name": sp.name or sp.id,
                    "role": sp.role,
                    "smiles": reported_smiles,
                }
                if sp_canonical != reported_smiles:
                    record["smiles_full"] = sp_canonical  # show the salt form too
                if product_yield:
                    record["yield"] = product_yield
                if product_obtained:
                    record["amount_obtained"] = product_obtained
                if sm_mass:
                    record["sm_mass"] = sm_mass
                exact_matches.append(record)
                continue  # don't also report as similar

            # --- Similarity match ---
            # Use the neutral form for fingerprint comparison when available
            # (avoids artificially low similarity for salt vs free base).
            if query_fp is None:
                continue
            sp_fp = _morgan_fp(sp_neutral_canonical or sp_canonical)
            if sp_fp is None:
                continue
            sim = _tanimoto(query_fp, sp_fp)
            if sim >= similarity_threshold:
                similar_matches.append({
                    "experiment": exp_name,
                    "species_name": sp.name or sp.id,
                    "role": sp.role,
                    "similarity": round(sim, 4),
                    "smiles": sp_canonical,
                })

    # Sort similar matches by descending similarity
    similar_matches.sort(key=lambda x: x["similarity"], reverse=True)

    return {
        "ok": True,
        "query_smiles": smiles,
        "query_canonical": query_canonical,
        "exact_matches": exact_matches,
        "similar_matches": similar_matches,
        "experiments_searched": len(exp_dirs),
        "experiments_parsed_ok": experiments_parsed_ok,
        "parse_errors": parse_errors,
    }