cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,446 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Procedure Writer — Lab Book Entry Assembler
|
|
4
|
+
|
|
5
|
+
Takes LCMS PDFs, NMR PDFs, ELN CSV exports, and CDX/RXN structure files for
|
|
6
|
+
a single reaction and outputs a polished, copy-paste-ready lab book entry with
|
|
7
|
+
three sections:
|
|
8
|
+
PROCEDURE — concise, publication-quality procedure text
|
|
9
|
+
CHARACTERIZATION — LCMS annotations + NMR data
|
|
10
|
+
NOTES — rough observations and inferences
|
|
11
|
+
|
|
12
|
+
Expected masses for LCMS identification are derived from CDX/RXN structure
|
|
13
|
+
files (via ChemScript + RDKit), with fallback to CSV MW values. Tracking
|
|
14
|
+
LCMS analysis is delegated to multi_lcms_analyzer for cross-file compound
|
|
15
|
+
matching, trend detection, and area% timelines.
|
|
16
|
+
|
|
17
|
+
Usage:
|
|
18
|
+
python procedure_writer.py --input-dir path/to/experiment/ --output result.txt
|
|
19
|
+
python procedure_writer.py --input-dir path/to/parent/ --experiment KL-7001-004
|
|
20
|
+
python procedure_writer.py --input-dir path/to/parent/ --experiment KL-7001-004 \\
|
|
21
|
+
--sm-mass 274 --product-mass 459 --output result.txt
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import argparse
|
|
25
|
+
import json
|
|
26
|
+
import os
|
|
27
|
+
import re
|
|
28
|
+
import sys
|
|
29
|
+
from typing import List, Optional, Dict
|
|
30
|
+
|
|
31
|
+
# --- LCMS tools ---
|
|
32
|
+
from ..lcms_analyzer import extract_all_text
|
|
33
|
+
from cdxml_toolkit.constants import MIN_SIGNIFICANT_AREA
|
|
34
|
+
|
|
35
|
+
# --- Mass resolution (split out to mass_resolver.py) ---
|
|
36
|
+
from .mass_resolver import (
|
|
37
|
+
ExpectedSpecies,
|
|
38
|
+
extract_expected_masses,
|
|
39
|
+
ADDUCTS, ADDUCT_PRIORITY, MODE_PREFERENCE,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
# --- LCMS identification (split out to lcms_identifier.py) ---
|
|
43
|
+
from .lcms_identifier import (
|
|
44
|
+
IdentifiedCompound, TrackingAnalysis, IdentifiedPeak, PurifiedAnalysis,
|
|
45
|
+
match_ions_to_species, run_tracking_analysis, run_purified_analysis,
|
|
46
|
+
run_tracking_from_result,
|
|
47
|
+
)
|
|
48
|
+
from .multi_lcms_analyzer import load_analysis_from_json
|
|
49
|
+
|
|
50
|
+
# --- Output formatting (split out to lab_book_formatter.py) ---
|
|
51
|
+
from .lab_book_formatter import (
|
|
52
|
+
SECTION_SEP,
|
|
53
|
+
format_method_name,
|
|
54
|
+
build_procedure_section, build_tracking_narrative,
|
|
55
|
+
build_characterization_section, build_notes_section,
|
|
56
|
+
assemble_output,
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
from .discover_experiment_files import (
|
|
60
|
+
discover_experiment_files,
|
|
61
|
+
DiscoveryResult,
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# Data structures & CSV parser — imported from package
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
from cdxml_toolkit.perception.eln_csv_parser import (
|
|
69
|
+
ReagentInfo, SolventInfo, ProductInfo, LCMSFileInfo, ExperimentData,
|
|
70
|
+
strip_html, extract_procedure_body, parse_eln_csv,
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# File discovery (delegates to discover_experiment_files.py)
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
def discover_files(input_dir: str,
                   experiment_name: Optional[str] = None) -> ExperimentData:
    """
    Collect every file belonging to one experiment into an ExperimentData.

    File discovery itself is done by discover_experiment_files(); this
    function parses the first discovered CSV (if any) and copies the
    discovered structure, LCMS and NMR paths onto the experiment record.
    """
    found = discover_experiment_files(input_dir, experiment_name)

    # Only the first CSV is parsed; without a usable parse result an
    # empty record carrying just the experiment name is created instead.
    experiment = parse_eln_csv(found.csv_files[0]) if found.csv_files else None
    if not experiment:
        experiment = ExperimentData(
            experiment_name=found.experiment,
            labbook_name='', procedure_html='', procedure_text='',
            reaction_type='', start_date='',
        )

    # Structure files: the first CDX and the first RXN win.
    if found.cdx_files:
        experiment.cdx_path = found.cdx_files[0]
    if found.rxn_files:
        experiment.rxn_path = found.rxn_files[0]

    # Re-wrap each discovered LCMS entry as an LCMSFileInfo.
    experiment.lcms_files.extend(
        LCMSFileInfo(
            path=entry.path,
            filename=os.path.basename(entry.path),
            category=entry.category,
            sort_key=entry.sort_key,
            group_prefix=entry.group_prefix,
            method_variant=entry.method_variant,
        )
        for entry in found.lcms_files
    )

    # Copy so later changes to the record do not touch the discovery result.
    experiment.nmr_pdfs = list(found.nmr_files)

    return experiment
|
|
121
|
+
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
# NMR extraction
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
def extract_nmr_data(pdf_path: str, text: Optional[str] = None) -> List[str]:
    """
    Extract reported NMR data strings from an NMR PDF.

    Searches for patterns like:
        1H NMR (400 MHz, DMSO-d6) delta ...
        13C NMR (101 MHz, DMSO-d6) delta ...
        19F NMR (376 MHz, DMSO-d6) delta ...

    Args:
        pdf_path: Path of the NMR PDF. Read with extract_all_text() unless
            *text* is supplied; also used in the warning message.
        text: Optional text to parse instead of reading *pdf_path*.

    Returns:
        Unique data strings in order of first appearance, with runs of
        whitespace collapsed to single spaces and a trailing period.
        An empty list when the PDF cannot be read or nothing matches.
    """
    if text is None:
        try:
            text = extract_all_text(pdf_path)
        except Exception as e:
            # Best effort: an unreadable PDF must not abort the whole run.
            print(f"Warning: Could not read NMR PDF {pdf_path}: {e}",
                  file=sys.stderr)
            return []

    if not text:
        return []

    # Pattern for NMR data strings
    # Match: "1H NMR" or "13C NMR" or "19F NMR" etc., followed by the data
    # The data string continues until a period followed by end of line, a
    # period followed by the next NMR header, or the end of the text.
    nmr_pattern = re.compile(
        r'(\d+[A-Z]\s+NMR\s*'          # nucleus: 1H, 13C, 19F, etc.
        r'\([^)]+\)\s*'                # (400 MHz, solvent)
        r'[\u03b4\u00b4d]\s*'          # delta symbol, acute accent, or "d"
        r'.+?)(?=\.\s*$|\.\s*\d+[A-Z]\s+NMR|\Z)',  # capture until end
        re.MULTILINE | re.DOTALL
    )

    results: List[str] = []
    seen = set()
    for m in nmr_pattern.finditer(text):
        # Clean up: normalize whitespace, remove line breaks within data
        data_str = re.sub(r'\s+', ' ', m.group(1).strip())

        if not data_str.endswith('.'):
            # The lookahead never consumes the closing period, so check
            # the character right after the match to see how it ended.
            ended_on_period = text.startswith('.', m.end(1))
            if not ended_on_period:
                # The match ran to the end of the text and may carry
                # trailing junk: cut after the last parenthesis, but only
                # one that belongs to the data. The first ')' always
                # closes the "(... MHz, solvent)" header; cutting there
                # would discard every shift of a 13C/19F-style listing.
                header_end = data_str.find(')')
                last_paren = data_str.rfind(')')
                if last_paren > header_end:
                    data_str = data_str[:last_paren + 1]
            data_str += '.'

        # Deduplicate — NMR PDFs often repeat data on each page
        if data_str not in seen:
            seen.add(data_str)
            results.append(data_str)

    return results
|
|
173
|
+
|
|
174
|
+
# ---------------------------------------------------------------------------
|
|
175
|
+
# NMR batch parsing
|
|
176
|
+
# ---------------------------------------------------------------------------
|
|
177
|
+
|
|
178
|
+
def parse_all_nmr(exp: ExperimentData) -> None:
    """Collect NMR data strings from every PDF in exp.nmr_pdfs.

    Strings already produced by an earlier PDF in this call are skipped,
    new ones are appended to exp.nmr_data, and a one-line status per PDF
    is written to stderr.
    """
    already_recorded = set()
    for pdf_path in exp.nmr_pdfs:
        entries = extract_nmr_data(pdf_path)

        # Keep only strings not seen in a previous PDF, preserving order.
        fresh = []
        for entry in entries:
            if entry in already_recorded:
                continue
            already_recorded.add(entry)
            fresh.append(entry)
        exp.nmr_data.extend(fresh)

        pdf_name = os.path.basename(pdf_path)
        if fresh:
            status = f" Found NMR data in {pdf_name}: {len(fresh)} entries"
        elif entries:
            # Something was extracted, but all of it was a repeat.
            status = f" NMR PDF {pdf_name}: data already seen (duplicate)"
        else:
            status = f" NMR PDF {pdf_name}: no reported data string found"
        print(status, file=sys.stderr)
|
|
198
|
+
|
|
199
|
+
# ---------------------------------------------------------------------------
|
|
200
|
+
# CLI
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
|
|
203
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
|
|
204
|
+
p = argparse.ArgumentParser(
|
|
205
|
+
description="Procedure Writer — Lab Book Entry Assembler",
|
|
206
|
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
207
|
+
epilog=__doc__,
|
|
208
|
+
)
|
|
209
|
+
p.add_argument("--input-dir", "-i", required=True,
|
|
210
|
+
help="Directory containing experiment files "
|
|
211
|
+
"(experiment dir or parent dir)")
|
|
212
|
+
p.add_argument("--experiment", "-e", default=None,
|
|
213
|
+
help="Experiment name (e.g., KL-7001-004). "
|
|
214
|
+
"Required if input-dir is the parent directory.")
|
|
215
|
+
p.add_argument("--sm-mass", type=float, default=None,
|
|
216
|
+
help="Exact mass (MW) of starting material. "
|
|
217
|
+
"Auto-detected from CSV if not provided.")
|
|
218
|
+
p.add_argument("--product-mass", type=float, default=None,
|
|
219
|
+
help="Exact mass (MW) of desired product. "
|
|
220
|
+
"Auto-detected from CSV if not provided.")
|
|
221
|
+
p.add_argument("--predict-byproducts", action="store_true",
|
|
222
|
+
help="Predict reaction byproducts via FlowER for LCMS "
|
|
223
|
+
"matching (requires 'flower' conda env; results "
|
|
224
|
+
"are cached)")
|
|
225
|
+
p.add_argument("--flower-json", default=None,
|
|
226
|
+
help="Pre-computed FlowER byproduct predictions JSON "
|
|
227
|
+
"(from run_pipeline Phase 3.15). Adds predicted "
|
|
228
|
+
"byproducts to expected species for LCMS matching.")
|
|
229
|
+
p.add_argument("--tracking-json", default=None,
|
|
230
|
+
help="Pre-computed multi-LCMS tracking analysis JSON "
|
|
231
|
+
"(from multi_lcms_analyzer --json). Skips re-parsing "
|
|
232
|
+
"tracking PDFs.")
|
|
233
|
+
p.add_argument("--output", "-o", default=None,
|
|
234
|
+
help="Output file path (default: stdout)")
|
|
235
|
+
p.add_argument("--json-errors", action="store_true",
|
|
236
|
+
help="Output structured JSON error objects to stderr on "
|
|
237
|
+
"failure (for agent orchestration)")
|
|
238
|
+
return p
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _emit_json_error(error_code: str, detail: str,
|
|
242
|
+
file: str = None, *, stream=sys.stderr) -> None:
|
|
243
|
+
"""Write a structured JSON error to stderr."""
|
|
244
|
+
obj = {"error": error_code, "detail": detail}
|
|
245
|
+
if file:
|
|
246
|
+
obj["file"] = file
|
|
247
|
+
print(json.dumps(obj), file=stream)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def main(argv=None) -> int:
    """CLI entry point: parse arguments, run the pipeline, report failures.

    Returns the pipeline's own return value on success and 1 when it
    raised. With --json-errors the failure is reported as a JSON object
    whose error code is picked from keywords found in the exception
    message; otherwise a plain "ERROR: ..." line is printed to stderr.
    """
    args = _build_arg_parser().parse_args(argv)

    try:
        return _main_inner(args)
    except Exception as exc:
        if not args.json_errors:
            print(f"ERROR: {exc}", file=sys.stderr)
            return 1

        lowered = str(exc).lower()
        # Ordered rules: the first rule with a keyword present in the
        # message decides the code, so the order here is significant.
        keyword_rules = (
            (("csv", "parse"), "csv_parse_failed"),
            (("lcms", "pdf"), "lcms_analysis_failed"),
            (("nmr",), "nmr_extraction_failed"),
            (("mass", "structure"), "mass_resolution_failed"),
        )
        code = next(
            (rule_code for keywords, rule_code in keyword_rules
             if any(word in lowered for word in keywords)),
            "procedure_failed",
        )
        _emit_json_error(code, str(exc))
        return 1
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _main_inner(args) -> int:
    """
    Run the procedure-writer pipeline for already-parsed CLI arguments.

    Steps, in order: discover files, resolve expected species masses
    (optionally extended with FlowER predictions from --flower-json), run
    tracking and purified-product LCMS analysis, extract NMR data, build
    the three output sections and write the result to --output or stdout.
    Progress messages go to stderr.

    Args:
        args: Namespace produced by the parser from _build_arg_parser().

    Returns:
        0 on completion. Errors are not handled here; they propagate to
        the caller (main() turns them into an exit status of 1).
    """
    print("Procedure Writer — discovering files...", file=sys.stderr)

    # Discover files
    exp = discover_files(args.input_dir, args.experiment)

    print(f"Experiment: {exp.experiment_name}", file=sys.stderr)
    print(f" CSV procedure: {'yes' if exp.procedure_text else 'no'}",
          file=sys.stderr)
    print(f" Reactants: {len(exp.reactants)}", file=sys.stderr)
    print(f" LCMS files: {len(exp.lcms_files)}", file=sys.stderr)
    print(f" NMR PDFs: {len(exp.nmr_pdfs)}", file=sys.stderr)
    print(f" CDX: {os.path.basename(exp.cdx_path) if exp.cdx_path else 'none'}",
          file=sys.stderr)
    print(f" RXN: {os.path.basename(exp.rxn_path) if exp.rxn_path else 'none'}",
          file=sys.stderr)

    # Override masses from CLI if provided, remembering where each value
    # came from so the log line below does not mislabel a CLI override.
    sm_source = product_source = "CSV"
    if args.sm_mass is not None:
        exp.sm_mass = args.sm_mass
        sm_source = "CLI"
    if args.product_mass is not None:
        exp.product_mass = args.product_mass
        product_source = "CLI"

    if exp.sm_mass:
        print(f" SM mass ({sm_source}): {exp.sm_mass:.3f}", file=sys.stderr)
    if exp.product_mass:
        print(f" Product mass ({product_source}): {exp.product_mass:.3f}",
              file=sys.stderr)

    # Extract expected masses from CDX/RXN (or CSV fallback)
    print("\nDetermining expected species masses...", file=sys.stderr)
    expected = extract_expected_masses(
        exp, predict_byproducts=args.predict_byproducts)

    # Load pre-computed FlowER byproduct predictions if provided
    if args.flower_json:
        if os.path.isfile(args.flower_json):
            print(f"\nLoading FlowER predictions from "
                  f"{os.path.basename(args.flower_json)}...", file=sys.stderr)
            try:
                n_loaded = _load_flower_species(args.flower_json, expected)
                print(f" Loaded {n_loaded} byproduct(s) from FlowER JSON",
                      file=sys.stderr)
            except Exception as e:
                # Best effort: predictions are an optional extra input.
                print(f" Warning: Could not load FlowER JSON: {e}",
                      file=sys.stderr)
        else:
            print(f" Warning: FlowER JSON not found: {args.flower_json}",
                  file=sys.stderr)

    for sp in expected:
        mh = sp.adducts.get("[M+H]+", 0)
        mh_neg = sp.adducts.get("[M-H]-", 0)
        print(f" {sp.name} ({sp.role}): {sp.exact_mass:.3f} Da"
              f" [M+H]+ {mh:.1f} [M-H]- {mh_neg:.1f}",
              file=sys.stderr)

    # Run tracking LCMS analysis (multi-LCMS)
    tracking = TrackingAnalysis()
    use_precomputed = bool(args.tracking_json) and os.path.isfile(
        args.tracking_json)
    if args.tracking_json and not use_precomputed:
        print(f" Warning: Tracking JSON not found: {args.tracking_json}; "
              f"analysing tracking PDFs instead", file=sys.stderr)

    if use_precomputed:
        # Use pre-computed tracking analysis (avoids re-parsing PDFs)
        print(f"\nLoading pre-computed tracking analysis from "
              f"{os.path.basename(args.tracking_json)}...", file=sys.stderr)
        analysis = load_analysis_from_json(args.tracking_json)
        print(f" {len(analysis.compounds)} compounds, "
              f"{len(analysis.files)} files", file=sys.stderr)
        tracking = run_tracking_from_result(analysis, expected)
        _report_tracking(tracking)
    else:
        tracking_files = [lf for lf in exp.lcms_files
                          if lf.category == "tracking"]
        if tracking_files:
            print(f"\nRunning tracking analysis "
                  f"({len(tracking_files)} files)...", file=sys.stderr)
            tracking = run_tracking_analysis(exp, expected)
            _report_tracking(tracking)

    # Parse purified product LCMS (final files preferred, workup fallback)
    purified = PurifiedAnalysis()
    final_files = [lf for lf in exp.lcms_files if lf.category == "final"]
    workup_files = [lf for lf in exp.lcms_files if lf.category == "workup"]
    if final_files or workup_files:
        print("\nAnalyzing purified product LCMS...", file=sys.stderr)
        purified = run_purified_analysis(exp, expected)
        purity_parts = []
        if purified.purity_tac is not None:
            purity_parts.append(f"TAC {purified.purity_tac:.0f}%")
        if purified.purity_220nm is not None:
            purity_parts.append(f"220nm {purified.purity_220nm:.0f}%")
        if purified.purity_254nm is not None:
            purity_parts.append(f"254nm {purified.purity_254nm:.0f}%")
        if purity_parts:
            print(f" Product purity: {', '.join(purity_parts)}",
                  file=sys.stderr)

    # Extract NMR data
    if exp.nmr_pdfs:
        print("\nExtracting NMR data...", file=sys.stderr)
        parse_all_nmr(exp)

    # Build output sections
    print("\nAssembling lab book entry...", file=sys.stderr)
    procedure = build_procedure_section(exp, tracking)
    characterization = build_characterization_section(
        exp, expected, tracking, purified)
    notes = build_notes_section(exp, expected, tracking, purified)

    result = assemble_output(procedure, characterization, notes)

    # FlowER byproduct reference CDXML (if predictions were made via
    # --predict-byproducts inline mode)
    if args.predict_byproducts and args.output:
        _write_flower_cdxml(args.output)

    # Output
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(result)
        print(f"\nOutput written to {args.output}", file=sys.stderr)
    else:
        # Write bytes directly so the output is UTF-8 regardless of the
        # console encoding.
        sys.stdout.buffer.write(result.encode('utf-8'))
        sys.stdout.buffer.write(b'\n')

    return 0


def _load_flower_species(flower_json_path: str, expected: list) -> int:
    """Append species from a FlowER predictions JSON to *expected*; return the number added.

    Entries whose exact mass lies within MASS_TOLERANCE of a species that
    is already present (including ones added earlier in this call) are
    skipped, as are entries without a positive numeric "exact_mass".
    Raises whatever open()/json.load() or a malformed entry raises; the
    caller decides how to report that.
    """
    from cdxml_toolkit.constants import MASS_TOLERANCE

    with open(flower_json_path, "r", encoding="utf-8") as f:
        flower_data = json.load(f)

    known_masses = [s.exact_mass for s in expected]
    n_loaded = 0
    for entry in flower_data:
        em = entry.get("exact_mass", 0)
        # A missing or non-numeric mass cannot be matched against LCMS
        # ions; adding it would only create a bogus 0 Da species.
        if isinstance(em, bool) or not isinstance(em, (int, float)) or em <= 0:
            continue
        # Skip duplicates of existing species
        if any(abs(em - m) < MASS_TOLERANCE for m in known_masses):
            continue
        expected.append(ExpectedSpecies(
            name=entry.get("name", "BP-?"),
            role=entry.get("role", "byproduct"),
            exact_mass=em,
            smiles=entry.get("smiles", ""),
            adducts=entry.get("adducts", {}),
            source_file=flower_json_path,
        ))
        known_masses.append(em)
        n_loaded += 1
    return n_loaded


def _report_tracking(tracking) -> None:
    """Print identified compounds and an unidentified-compound summary to stderr."""
    for ic in tracking.identified:
        print(f" Compound RT {ic.compound.canonical_rt:.2f} -> "
              f"{ic.species.name} ({ic.adduct} {ic.matched_mz:.1f})",
              file=sys.stderr)
    if tracking.unidentified:
        n_sig = sum(1 for c in tracking.unidentified
                    if c.max_area > MIN_SIGNIFICANT_AREA)
        # NOTE(review): the "2%" in the message is hard-coded while the
        # count uses MIN_SIGNIFICANT_AREA — confirm the two agree.
        print(f" {len(tracking.unidentified)} unidentified compounds "
              f"({n_sig} with area > 2%)", file=sys.stderr)


def _write_flower_cdxml(output_path: str) -> None:
    """Write the last inline FlowER predictions next to *output_path* as CDXML, if available."""
    try:
        # Relative import: mass_resolver is a sibling module of this
        # package, so a bare "from mass_resolver import ..." cannot
        # resolve and would silently disable this export.
        from .mass_resolver import get_last_flower_predictions
        from experiments.byproduct_prediction.flower_predictor import (
            write_byproducts_cdxml,
        )
    except ImportError as e:
        # The exporter is optional; say so instead of skipping silently.
        print(f" Note: FlowER CDXML export skipped ({e})", file=sys.stderr)
        return

    try:
        flower_all = get_last_flower_predictions()
        if flower_all:
            base, _ = os.path.splitext(output_path)
            cdxml_path = f"{base}-flower-predictions.cdxml"
            write_byproducts_cdxml(flower_all, cdxml_path)
    except Exception as e:
        print(f" Warning: Could not write FlowER CDXML: {e}",
              file=sys.stderr)
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
NMR Data Extractor — standalone CLI wrapper.
|
|
4
|
+
|
|
5
|
+
Extracts reported NMR data strings (1H, 13C, 19F, etc.) from MestReNova
|
|
6
|
+
PDF exports. Delegates to procedure_writer.extract_nmr_data().
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python extract_nmr.py path/to/nmr.pdf [path/to/nmr2.pdf ...]
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import sys
|
|
14
|
+
|
|
15
|
+
from .procedure_writer import extract_nmr_data
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def main(argv=None) -> int:
    """Extract NMR data strings from the given PDFs and emit them one per line.

    Strings repeated across (or within) the input files are emitted once,
    in order of first appearance. With --output the lines are written to
    that file and a summary goes to stderr; otherwise they are printed to
    stdout (nothing is printed when no data was found). Always returns 0.
    """
    cli = argparse.ArgumentParser(description="Extract NMR data from PDFs")
    cli.add_argument('files', nargs='+', help='NMR PDF files')
    cli.add_argument('--output', '-o', type=str, default=None,
                     help='Output file (default: stdout)')
    args = cli.parse_args(argv)

    # dict.fromkeys keeps the first occurrence of each string, in order.
    unique_entries = list(dict.fromkeys(
        entry
        for pdf in args.files
        for entry in extract_nmr_data(pdf)
    ))
    output = "\n".join(unique_entries)

    if not args.output:
        if output:
            print(output)
        return 0

    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(output + "\n")
    print(f"Wrote {len(unique_entries)} NMR entries to {args.output}",
          file=sys.stderr)
    return 0
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())
|