cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
parse_analysis_file.py — Unified LCMS / NMR PDF parser.
|
|
4
|
+
|
|
5
|
+
Detects whether a PDF is an LCMS report (Waters MassLynx standard or manual
|
|
6
|
+
integration) or an NMR report, then delegates to the appropriate parser and
|
|
7
|
+
returns a normalised dict.
|
|
8
|
+
|
|
9
|
+
Python API:
|
|
10
|
+
from cdxml_toolkit.analysis.parse_analysis_file import parse_analysis_file
|
|
11
|
+
result = parse_analysis_file("KL-7001-011-purified.pdf")
|
|
12
|
+
# result["file_type"] -> "lcms" or "nmr"
|
|
13
|
+
# result["data"] -> parsed data dict
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import dataclasses
|
|
19
|
+
import traceback
|
|
20
|
+
from typing import Any, Dict
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _dataclass_to_dict(obj: Any) -> Any:
    """Return *obj* with dataclass instances replaced by plain dicts.

    Dataclass instances are expanded with ``dataclasses.asdict``; lists and
    dicts are rebuilt with each element / value converted recursively.
    Anything else — including dataclass *classes* (as opposed to instances)
    and tuples — is returned unchanged.
    """
    # is_dataclass() is also true for the class object itself, so exclude
    # types: only instances can be passed to asdict().
    if not isinstance(obj, type) and dataclasses.is_dataclass(obj):
        obj = dataclasses.asdict(obj)

    if isinstance(obj, dict):
        converted = {}
        for key, value in obj.items():
            converted[key] = _dataclass_to_dict(value)
        return converted

    if isinstance(obj, list):
        return [_dataclass_to_dict(element) for element in obj]

    return obj
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def parse_analysis_file(pdf_path: str) -> Dict[str, Any]:
    """Detect and parse an LCMS or NMR PDF report.

    Detection order:
      1. Waters MassLynx standard report  → ``parse_report()``
      2. Waters MassLynx manual integration → ``parse_manual_report()``
      3. NMR PDF (MestReNova) → ``extract_nmr_data()``
      4. None of the above → error dict

    The function never raises for failures inside the detectors or parsers:
    the first stage that raises ends the search and is reported through an
    ``ok=False`` result. The error message distinguishes a failure of the
    detector itself from a failure of the parser after a positive detection.

    Args:
        pdf_path: Absolute or relative path to the PDF file.

    Returns:
        Dict with keys:
          ok        (bool)
          file_type ("lcms" | "nmr") — present when ok=True; also present
                    (as "lcms") when a report was positively detected as
                    LCMS but parsing it raised
          file_path (str)
          data      (dict) — only present when ok=True; structure depends
                    on file_type:
                      lcms (standard):  LCMSReport as dict
                      lcms (manual):    ManualLCMSReport as dict, plus
                                        "variant": "manual_integration"
                      nmr:              {"nmr_strings": ["1H NMR ..."]}
          error     (str) — only present when ok=False
          traceback (str) — formatted traceback; only present when ok=False
                    because an exception was raised
    """
    # Imported lazily, inside the function, as in the original implementation.
    from cdxml_toolkit.analysis.lcms_analyzer import (
        is_waters_report,
        parse_report,
        is_manual_integration,
        parse_manual_report,
    )
    from cdxml_toolkit.analysis.deterministic.procedure_writer import extract_nmr_data

    base_result: Dict[str, Any] = {"file_path": pdf_path}

    def _failure(message: str, file_type: Any = None) -> Dict[str, Any]:
        # Must be called from inside an ``except`` block so that
        # traceback.format_exc() sees the active exception.
        result: Dict[str, Any] = {**base_result, "ok": False}
        if file_type is not None:
            result["file_type"] = file_type
        result["error"] = message
        result["traceback"] = traceback.format_exc()
        return result

    # --- Attempts 1 and 2: Waters standard report, then manual integration ---
    # Each entry: (label used in error messages, detector, parser, variant tag).
    lcms_stages = (
        ("Waters report", is_waters_report, parse_report, None),
        (
            "Manual integration report",
            is_manual_integration,
            parse_manual_report,
            "manual_integration",
        ),
    )
    for label, detect, parse, variant in lcms_stages:
        # Tracks whether the detector returned before anything raised, so the
        # error message does not claim a detection that never happened.
        detected = False
        try:
            detected = bool(detect(pdf_path))
            if detected:
                data = _dataclass_to_dict(parse(pdf_path))
                if variant is not None:
                    data["variant"] = variant
                return {
                    **base_result,
                    "ok": True,
                    "file_type": "lcms",
                    "data": data,
                }
        except Exception as exc:
            if detected:
                return _failure(
                    f"{label} detected but parsing failed: {exc}",
                    file_type="lcms",
                )
            # The detector itself raised: the file type is unknown, so no
            # file_type is reported.
            return _failure(f"{label} detection failed: {exc}")

    # --- Attempt 3: NMR PDF ---
    try:
        nmr_strings = extract_nmr_data(pdf_path)
        if nmr_strings:
            return {
                **base_result,
                "ok": True,
                "file_type": "nmr",
                "data": {"nmr_strings": nmr_strings},
            }
    except Exception as exc:
        return _failure(f"NMR extraction failed: {exc}")

    # --- Nothing matched ---
    return {
        **base_result,
        "ok": False,
        "error": (
            "Could not detect file type: not a Waters standard report, "
            "not a manual integration export, and no NMR data strings found."
        ),
    }
|