cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ELN CSV Parser — Findmolecule ELN export file parser.
|
|
3
|
+
|
|
4
|
+
Parses semicolon-delimited CSV exports from Findmolecule ELN into structured
|
|
5
|
+
dataclasses. The CSV format uses @TYPE rows to delimit sections (REACTANT,
|
|
6
|
+
SOLVENT, PRODUCT, ANALYSIS).
|
|
7
|
+
|
|
8
|
+
This module is pure stdlib — no external dependencies.
|
|
9
|
+
|
|
10
|
+
Originally part of procedure_writer.py; extracted into the package so that
|
|
11
|
+
eln_enrichment.py and reaction_parser.py can import it without depending on
|
|
12
|
+
private root-level scripts.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import csv
|
|
16
|
+
import html as html_mod
|
|
17
|
+
import re
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from typing import Dict, List, Optional
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Data structures
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
@dataclass
class ReagentInfo:
    """One REACTANT row of a Findmolecule ELN export.

    The per-field notes describe how ``parse_eln_csv`` fills an instance.
    Every string field holds the CSV cell text as exported; only ``mw`` is
    converted to a number.
    """
    name: str  # 'REACTANT' column, surrounding whitespace stripped
    mass: str  # 'MASS' column, unparsed cell text
    mmol: str  # 'MMOL' column, unparsed cell text
    equiv: str  # 'EQUIV' column, unparsed cell text
    mw: float  # 'MOL_WEIGHT' column; 0.0 when the cell is empty or non-numeric
    is_substrate: bool  # True when the 'SUBSTRATE' cell is 'true' (any letter case)
    supplier: str  # 'SOURCE_SUPPLIER' column
    volume: str  # 'VOLUME' column, unparsed cell text
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class SolventInfo:
    """One SOLVENT row of a Findmolecule ELN export.

    All fields hold the CSV cell text unchanged, as stored by
    ``parse_eln_csv``.
    """
    name: str  # 'SOLVENT' column
    volume: str  # 'VOLUME' column, unparsed cell text
    concentration: str  # 'CONCENTRATION' column, unparsed cell text
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class ProductInfo:
    """The PRODUCT row of a Findmolecule ELN export.

    The per-field notes describe how ``parse_eln_csv`` fills an instance.
    Mass and yield values stay as the exported cell text; only ``mw`` is
    converted to a number.
    """
    name: str  # 'PRODUCT_NAME' column
    mw: float  # 'MOL_WEIGHT' column; 0.0 when the cell is empty or non-numeric
    theoretical_mass: str  # 'MASS' column, unparsed cell text
    obtained_mass: str  # 'MASS OBTAINED' column, unparsed cell text
    yield_pct: str  # 'YIELD' column, unparsed cell text
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class LCMSFileInfo:
    """Describe one LCMS data file attached to an experiment.

    ``parse_eln_csv`` does not create instances of this class; it leaves
    ``ExperimentData.lcms_files`` empty.
    NOTE(review): presumably filled in by file-discovery / LCMS
    categorisation code elsewhere in the package — confirm.
    """
    path: str
    filename: str
    category: str  # "tracking", "workup", "purification", "final"
    sort_key: float  # numeric key for chronological sorting
    # Opaque payload, typed only as ``object`` here.
    # NOTE(review): presumably a parsed LCMS report — confirm against callers.
    report: Optional[object] = None
    group_prefix: Optional[str] = None  # tracking group prefix
    method_variant: Optional[str] = None  # filename-derived method hint
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class ExperimentData:
    """Everything known about one ELN experiment.

    ``parse_eln_csv`` fills the metadata strings, ``reactants``,
    ``solvents``, ``product``, ``sm_mass`` and ``product_mass``. It leaves
    ``lcms_files``, ``nmr_pdfs``, ``nmr_data``, ``cdx_path`` and
    ``rxn_path`` at their defaults.
    NOTE(review): those remaining fields are presumably populated by other
    modules — confirm against callers.
    """
    experiment_name: str  # metadata column 'EXPERIENCE_NAME'
    labbook_name: str  # metadata column 'LABBOOK_NAME'
    procedure_html: str  # metadata column 'PROCEDURE', raw HTML as exported
    procedure_text: str  # procedure_html after strip_html + extract_procedure_body
    reaction_type: str  # metadata column 'EXPERIENCE_TYPE_NAME'
    start_date: str  # metadata column 'STARTED_ON', unparsed text
    reactants: List[ReagentInfo] = field(default_factory=list)
    solvents: List[SolventInfo] = field(default_factory=list)
    product: Optional[ProductInfo] = None
    lcms_files: List[LCMSFileInfo] = field(default_factory=list)
    nmr_pdfs: List[str] = field(default_factory=list)
    nmr_data: List[str] = field(default_factory=list)
    # Largest MW among reactants flagged as substrate with MW > 50.
    sm_mass: Optional[float] = None  # CSV-derived MW (fallback)
    # Product MW, set only when it is > 0.
    product_mass: Optional[float] = None  # CSV-derived MW (fallback)
    cdx_path: Optional[str] = None
    rxn_path: Optional[str] = None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# HTML / text utilities
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
def strip_html(html_str: str) -> str:
    """Strip HTML tags and convert to plain text.

    ``<br>`` becomes a newline, a ``</p><p>`` boundary becomes a blank
    line, every other tag is dropped, HTML entities are decoded and runs
    of spaces/tabs are collapsed.

    Tag matching is case-insensitive, so ``<BR>`` / ``<P>`` produce the
    same line breaks as their lowercase forms instead of being silently
    removed and gluing adjacent lines together.

    Parameters
    ----------
    html_str : str
        HTML fragment (e.g. the PROCEDURE cell of an ELN export).

    Returns
    -------
    str
        Plain text with leading/trailing whitespace removed.
    """
    ci = re.IGNORECASE
    # Structural tags first, so the line/paragraph breaks they imply survive.
    text = re.sub(r'<br\s*/?>', '\n', html_str, flags=ci)
    text = re.sub(r'</p>\s*<p[^>]*>', '\n\n', text, flags=ci)
    text = re.sub(r'<p[^>]*>', '', text, flags=ci)
    text = re.sub(r'</p>', '\n', text, flags=ci)
    text = re.sub(r'<img[^>]*>', '', text, flags=ci)
    # Remove all remaining tags
    text = re.sub(r'<[^>]+>', '', text)
    # Decode HTML entities (covers &lt; &gt; &amp; &deg; &#nnn; etc.).
    # Done after tag removal so a decoded '<' is never mistaken for a tag.
    text = html_mod.unescape(text)
    # Clean up whitespace
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n[ \t]+', '\n', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def extract_procedure_body(full_text: str) -> str:
    """Return the procedure part of *full_text* without trailing references.

    Two cuts are applied in order:

    1. Everything from the first ``Reference:`` marker onward is dropped.
       A marker sitting at the very start of the text is ignored, so the
       text is kept whole in that case.
    2. Everything from the first CJK ideograph onward is dropped, but only
       when that character appears after position 50 of the remaining
       text (Chinese blocks are common in patent references).

    The result is stripped of surrounding whitespace.
    """
    head, marker, _tail = full_text.partition('Reference:')
    # Cut only when the marker exists and something precedes it.
    kept = head if (marker and head) else full_text
    kept = kept.strip()

    first_cjk = re.search(r'[\u4e00-\u9fff]', kept)
    if first_cjk is None or first_cjk.start() <= 50:
        return kept
    return kept[:first_cjk.start()].strip()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
# CSV parser
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
def parse_eln_csv(csv_path: str) -> Optional[ExperimentData]:
    """Read a Findmolecule ELN CSV export into an ``ExperimentData``.

    The file is semicolon-delimited. Its first two rows form a
    header/value pair of experiment metadata. The remaining rows are
    grouped into sections: an ``@TYPE`` row supplies the column names for
    the data rows that follow, and each data row starts with its section
    name (REACTANT, SOLVENT, PRODUCT or ANALYSIS). Rows of any other kind
    are ignored.

    Parameters
    ----------
    csv_path : str
        Path to the semicolon-delimited CSV file.

    Returns
    -------
    ExperimentData or None if the file has fewer than 2 rows.
    """
    def _molecular_weight(record: Dict[str, str]) -> float:
        # Empty or non-numeric MOL_WEIGHT cells fall back to 0.0.
        raw = record.get('MOL_WEIGHT', '0')
        if not raw:
            return 0.0
        try:
            return float(raw)
        except ValueError:
            return 0.0

    # 'utf-8-sig' transparently drops a leading BOM when one is present.
    with open(csv_path, 'r', encoding='utf-8-sig') as handle:
        rows = list(csv.reader(handle, delimiter=';', quotechar='"'))

    if len(rows) < 2:
        return None

    # Row 0 holds the metadata headers, row 1 the matching values.
    metadata: Dict[str, str] = dict(zip(rows[0], rows[1]))

    # Collect the data rows of every known section as header -> cell dicts.
    sections: Dict[str, list] = {
        'REACTANT': [], 'SOLVENT': [], 'PRODUCT': [], 'ANALYSIS': []
    }
    columns: List[str] = []
    for row in rows[2:]:
        if not row:
            continue
        tag, cells = row[0], row[1:]
        if tag == '@TYPE':
            columns = cells
            continue
        if tag not in sections:
            continue
        # Short rows get '' for the missing cells; surplus cells are dropped.
        padded = cells + [''] * (len(columns) - len(cells))
        sections[tag].append(dict(zip(columns, padded)))

    procedure_html = metadata.get('PROCEDURE', '')

    reactants = [
        ReagentInfo(
            name=rec.get('REACTANT', '').strip(),
            mass=rec.get('MASS', ''),
            mmol=rec.get('MMOL', ''),
            equiv=rec.get('EQUIV', ''),
            mw=_molecular_weight(rec),
            is_substrate=rec.get('SUBSTRATE', '').lower() == 'true',
            supplier=rec.get('SOURCE_SUPPLIER', ''),
            volume=rec.get('VOLUME', ''),
        )
        for rec in sections['REACTANT']
    ]
    solvents = [
        SolventInfo(
            name=rec.get('SOLVENT', ''),
            volume=rec.get('VOLUME', ''),
            concentration=rec.get('CONCENTRATION', ''),
        )
        for rec in sections['SOLVENT']
    ]

    exp = ExperimentData(
        experiment_name=metadata.get('EXPERIENCE_NAME', ''),
        labbook_name=metadata.get('LABBOOK_NAME', ''),
        procedure_html=procedure_html,
        procedure_text=extract_procedure_body(strip_html(procedure_html)),
        reaction_type=metadata.get('EXPERIENCE_TYPE_NAME', ''),
        start_date=metadata.get('STARTED_ON', ''),
        reactants=reactants,
        solvents=solvents,
    )

    # When several PRODUCT rows exist, the last one wins.
    product_rows = sections['PRODUCT']
    if product_rows:
        last = product_rows[-1]
        exp.product = ProductInfo(
            name=last.get('PRODUCT_NAME', ''),
            mw=_molecular_weight(last),
            theoretical_mass=last.get('MASS', ''),
            obtained_mass=last.get('MASS OBTAINED', ''),
            yield_pct=last.get('YIELD', ''),
        )

    # Starting-material mass: take the heaviest substrate, because
    # small-MW reagents such as HCl can also be flagged as substrate.
    exp.sm_mass = max(
        (r.mw for r in exp.reactants if r.is_substrate and r.mw > 50),
        default=None,
    )

    # Product mass comes straight from the CSV molecular weight.
    if exp.product is not None and exp.product.mw > 0:
        exp.product_mass = exp.product.mw

    return exp
|