cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,240 @@
1
+ """
2
+ ELN CSV Parser — Findmolecule ELN export file parser.
3
+
4
+ Parses semicolon-delimited CSV exports from Findmolecule ELN into structured
5
+ dataclasses. The CSV format uses @TYPE rows to delimit sections (REACTANT,
6
+ SOLVENT, PRODUCT, ANALYSIS).
7
+
8
+ This module is pure stdlib — no external dependencies.
9
+
10
+ Originally part of procedure_writer.py; extracted into the package so that
11
+ eln_enrichment.py and reaction_parser.py can import it without depending on
12
+ private root-level scripts.
13
+ """
14
+
15
+ import csv
16
+ import html as html_mod
17
+ import re
18
+ from dataclasses import dataclass, field
19
+ from typing import Dict, List, Optional
20
+
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Data structures
24
+ # ---------------------------------------------------------------------------
25
+
26
@dataclass
class ReagentInfo:
    """One REACTANT row from the ELN export.

    Every quantity column is kept as the raw CSV string; only the molecular
    weight is stored as a number.  All fields are required and positional,
    in the order declared below.
    """

    name: str            # reagent name as written in the export
    mass: str            # raw MASS cell
    mmol: str            # raw MMOL cell
    equiv: str           # raw EQUIV cell
    mw: float            # molecular weight as a float
    is_substrate: bool   # whether the row is flagged as a substrate
    supplier: str        # raw supplier cell
    volume: str          # raw VOLUME cell
36
+
37
+
38
@dataclass
class SolventInfo:
    """One SOLVENT row from the ELN export; all values are raw CSV strings."""

    name: str            # solvent name
    volume: str          # raw VOLUME cell
    concentration: str   # raw CONCENTRATION cell
43
+
44
+
45
@dataclass
class ProductInfo:
    """The PRODUCT row from the ELN export.

    Masses and yield stay as the raw CSV strings; only the molecular weight
    is numeric.
    """

    name: str               # product name
    mw: float               # molecular weight as a float
    theoretical_mass: str   # raw theoretical-mass cell
    obtained_mass: str      # raw obtained-mass cell
    yield_pct: str          # raw yield cell
52
+
53
+
54
@dataclass
class LCMSFileInfo:
    """Description of a single LCMS data file attached to an experiment.

    The first four fields are required; the remaining three are optional
    and default to ``None``.
    """

    path: str
    filename: str
    # One of "tracking", "workup", "purification", "final".
    category: str
    # Numeric key used to order files chronologically.
    sort_key: float
    # Parsed report object, when one is available.
    report: Optional[object] = field(default=None)
    # Prefix identifying the tracking group.
    group_prefix: Optional[str] = field(default=None)
    # Method hint derived from the filename.
    method_variant: Optional[str] = field(default=None)
63
+
64
+
65
@dataclass
class ExperimentData:
    """Everything collected for one ELN experiment.

    The six leading string fields are required.  All collection fields start
    as fresh empty lists and all optional fields start as ``None``.
    """

    # --- metadata (required) ------------------------------------------------
    experiment_name: str
    labbook_name: str
    procedure_html: str   # procedure as exported (HTML)
    procedure_text: str   # plain-text version of the procedure
    reaction_type: str
    start_date: str

    # --- tabular sections ---------------------------------------------------
    reactants: List[ReagentInfo] = field(default_factory=list)
    solvents: List[SolventInfo] = field(default_factory=list)
    product: Optional[ProductInfo] = field(default=None)

    # --- analytical attachments ---------------------------------------------
    lcms_files: List[LCMSFileInfo] = field(default_factory=list)
    nmr_pdfs: List[str] = field(default_factory=list)
    nmr_data: List[str] = field(default_factory=list)

    # --- derived masses -----------------------------------------------------
    sm_mass: Optional[float] = field(default=None)       # CSV-derived MW (fallback)
    product_mass: Optional[float] = field(default=None)  # CSV-derived MW (fallback)

    # --- structure files ----------------------------------------------------
    cdx_path: Optional[str] = field(default=None)
    rxn_path: Optional[str] = field(default=None)
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # HTML / text utilities
87
+ # ---------------------------------------------------------------------------
88
+
89
def strip_html(html_str: str) -> str:
    """Strip HTML tags and convert to plain text.

    ``<br>`` becomes a newline, paragraph boundaries become blank lines,
    images and every other tag are dropped, HTML entities are decoded, and
    whitespace is normalised.

    Parameters
    ----------
    html_str : str
        HTML fragment.

    Returns
    -------
    str
        Plain text with leading/trailing whitespace removed, runs of
        spaces/tabs collapsed to one space, and at most one blank line
        between paragraphs.
    """
    # Tag names are case-insensitive in HTML; without this flag <BR> or
    # </P><P> would fall through to the generic tag stripper and lines
    # would be fused together with no break between them.
    ci = re.IGNORECASE
    text = re.sub(r'<br\s*/?>', '\n', html_str, flags=ci)
    text = re.sub(r'</p>\s*<p[^>]*>', '\n\n', text, flags=ci)
    text = re.sub(r'<p[^>]*>', '', text, flags=ci)
    text = re.sub(r'</p>', '\n', text, flags=ci)
    text = re.sub(r'<img[^>]*>', '', text, flags=ci)
    # Remove all remaining tags
    text = re.sub(r'<[^>]+>', '', text)
    # Decode HTML entities only after the tags are gone, so that an escaped
    # "&lt;" in the text is never mistaken for the start of a tag.
    text = html_mod.unescape(text)
    # &nbsp; decodes to U+00A0, which the [ \t] patterns below do not match;
    # turn it into a regular space so it is collapsed like any other.
    text = text.replace('\xa0', ' ')
    # Clean up whitespace
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n[ \t]+', '\n', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
105
+
106
+
107
def extract_procedure_body(full_text: str) -> str:
    """Return the procedure part of *full_text* without trailing references.

    Text from the first ``Reference:`` marker onwards is dropped, unless the
    marker sits at the very start of the text.  After that, if a CJK
    ideograph occurs more than 50 characters into the remaining text, the
    text is truncated just before it.  The result is whitespace-stripped.
    """
    # A marker at position 0 is deliberately ignored (as is "not found").
    ref_pos = full_text.find('Reference:')
    kept = full_text[:ref_pos] if ref_pos > 0 else full_text
    body = kept.strip()

    # Chinese text blocks are common in patent references; only cut when the
    # first ideograph appears well into the text.
    cjk = re.search(r'[\u4e00-\u9fff]', body)
    if cjk is None or cjk.start() <= 50:
        return body
    return body[:cjk.start()].strip()
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # CSV parser
124
+ # ---------------------------------------------------------------------------
125
+
126
def _parse_mol_weight(raw: str) -> float:
    """Convert a MOL_WEIGHT cell to float; blank or non-numeric cells give 0.0."""
    if not raw:
        return 0.0
    try:
        return float(raw)
    except ValueError:
        return 0.0


def _collect_sections(rows: List[List[str]]) -> Dict[str, list]:
    """Group the data rows following the metadata by their leading type tag.

    An ``@TYPE`` row supplies the column headers for the rows after it.  Rows
    whose first cell is not one of the known section names are ignored.
    """
    sections: Dict[str, list] = {
        'REACTANT': [], 'SOLVENT': [], 'PRODUCT': [], 'ANALYSIS': []
    }
    current_headers: List[str] = []

    for row in rows:
        if not row:
            continue
        if row[0] == '@TYPE':
            current_headers = row[1:]
            continue
        bucket = sections.get(row[0])
        if bucket is None:
            continue
        cells = row[1:]
        # Short rows are padded so every header gets a value ('' if missing);
        # extra cells beyond the header count are dropped by zip().
        padded = cells + [''] * (len(current_headers) - len(cells))
        bucket.append(dict(zip(current_headers, padded)))

    return sections


def parse_eln_csv(csv_path: str) -> Optional[ExperimentData]:
    """Parse a Findmolecule ELN CSV export.

    The first row holds metadata headers and the second row the matching
    values.  Remaining rows are grouped into REACTANT / SOLVENT / PRODUCT /
    ANALYSIS sections, each introduced by an ``@TYPE`` header row.

    Parameters
    ----------
    csv_path : str
        Path to the semicolon-delimited CSV file.

    Returns
    -------
    ExperimentData or None if the file has fewer than 2 rows.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # utf-8-sig transparently drops a leading BOM if the export has one.
    with open(csv_path, 'r', encoding='utf-8-sig') as f:
        rows = list(csv.reader(f, delimiter=';', quotechar='"'))

    if len(rows) < 2:
        return None

    # Row 0: metadata headers, Row 1: metadata values
    metadata: Dict[str, str] = dict(zip(rows[0], rows[1]))
    sections = _collect_sections(rows[2:])

    # Build ExperimentData
    procedure_html = metadata.get('PROCEDURE', '')
    exp = ExperimentData(
        experiment_name=metadata.get('EXPERIENCE_NAME', ''),
        labbook_name=metadata.get('LABBOOK_NAME', ''),
        procedure_html=procedure_html,
        procedure_text=extract_procedure_body(strip_html(procedure_html)),
        reaction_type=metadata.get('EXPERIENCE_TYPE_NAME', ''),
        start_date=metadata.get('STARTED_ON', ''),
    )

    # Reactants
    for r in sections['REACTANT']:
        exp.reactants.append(ReagentInfo(
            name=r.get('REACTANT', '').strip(),
            mass=r.get('MASS', ''),
            mmol=r.get('MMOL', ''),
            equiv=r.get('EQUIV', ''),
            mw=_parse_mol_weight(r.get('MOL_WEIGHT', '0')),
            is_substrate=r.get('SUBSTRATE', '').lower() == 'true',
            supplier=r.get('SOURCE_SUPPLIER', ''),
            volume=r.get('VOLUME', ''),
        ))

    # Solvents
    for s in sections['SOLVENT']:
        exp.solvents.append(SolventInfo(
            name=s.get('SOLVENT', ''),
            volume=s.get('VOLUME', ''),
            concentration=s.get('CONCENTRATION', ''),
        ))

    # Product: if several PRODUCT rows are present, the last one wins.
    for p in sections['PRODUCT']:
        exp.product = ProductInfo(
            name=p.get('PRODUCT_NAME', ''),
            mw=_parse_mol_weight(p.get('MOL_WEIGHT', '0')),
            theoretical_mass=p.get('MASS', ''),
            obtained_mass=p.get('MASS OBTAINED', ''),
            yield_pct=p.get('YIELD', ''),
        )

    # SM mass from substrate row (pick the largest-MW substrate,
    # since small-MW reagents like HCl can also be marked as substrate)
    substrate_mws = [r.mw for r in exp.reactants if r.is_substrate and r.mw > 50]
    if substrate_mws:
        exp.sm_mass = max(substrate_mws)

    # Product mass — use CSV MW directly
    if exp.product and exp.product.mw > 0:
        exp.product_mass = exp.product.mw

    return exp