cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,134 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ parse_analysis_file.py — Unified LCMS / NMR PDF parser.
4
+
5
+ Detects whether a PDF is an LCMS report (Waters MassLynx standard or manual
6
+ integration) or an NMR report, then delegates to the appropriate parser and
7
+ returns a normalised dict.
8
+
9
+ Python API:
10
+ from cdxml_toolkit.analysis.parse_analysis_file import parse_analysis_file
11
+ result = parse_analysis_file("KL-7001-011-purified.pdf")
12
+ # result["file_type"] -> "lcms" or "nmr"
13
+ # result["data"] -> parsed data dict
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import dataclasses
19
+ import traceback
20
+ from typing import Any, Dict
21
+
22
+
23
+ def _dataclass_to_dict(obj: Any) -> Any:
24
+ """Recursively convert dataclasses (and lists/dicts thereof) to plain dicts."""
25
+ if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
26
+ return {k: _dataclass_to_dict(v) for k, v in dataclasses.asdict(obj).items()}
27
+ if isinstance(obj, list):
28
+ return [_dataclass_to_dict(i) for i in obj]
29
+ if isinstance(obj, dict):
30
+ return {k: _dataclass_to_dict(v) for k, v in obj.items()}
31
+ return obj
32
+
33
+
34
def parse_analysis_file(pdf_path: str) -> Dict[str, Any]:
    """Detect and parse an LCMS or NMR PDF report.

    Detection order:
        1. Waters MassLynx standard report  → ``parse_report()``
        2. Waters MassLynx manual integration → ``parse_manual_report()``
        3. NMR PDF → ``extract_nmr_data()``
        4. None of the above → error dict

    The first step that raises ends the attempt: an error dict is returned
    and the remaining steps are not tried.

    Args:
        pdf_path: Absolute or relative path to the PDF file.

    Returns:
        Dict with keys:
            file_path (str)  — ``pdf_path`` as given; always present
            ok (bool)        — always present
            file_type ("lcms" | "nmr") — present when ok=True, and also
                             ("lcms") when an LCMS report was detected but
                             could not be parsed
            data (dict)      — only present when ok=True; structure depends
                             on file_type:
                               lcms (standard): report converted to a dict
                               lcms (manual):   report converted to a dict,
                                                plus
                                                "variant": "manual_integration"
                               nmr:             {"nmr_strings": [...]}
            error (str)      — only present when ok=False
            traceback (str)  — formatted traceback; only present when
                             ok=False because an exception was raised
    """
    # Parsers are imported at call time, not at module import.
    from cdxml_toolkit.analysis.lcms_analyzer import (
        is_waters_report,
        parse_report,
        is_manual_integration,
        parse_manual_report,
    )
    from cdxml_toolkit.analysis.deterministic.procedure_writer import extract_nmr_data

    base_result: Dict[str, Any] = {"file_path": pdf_path}

    def success(file_type: str, data: Any) -> Dict[str, Any]:
        """Build the ok=True result."""
        return {**base_result, "ok": True, "file_type": file_type, "data": data}

    def failure(message: str, file_type: str = "") -> Dict[str, Any]:
        """Build the ok=False result for the exception currently being handled."""
        result: Dict[str, Any] = {**base_result, "ok": False}
        if file_type:
            result["file_type"] = file_type
        result["error"] = message
        # Must be called from inside an ``except`` block so that
        # format_exc() sees the active exception.
        result["traceback"] = traceback.format_exc()
        return result

    # (label used in messages, detector, parser, optional "variant" tag)
    lcms_variants = (
        ("Waters report", is_waters_report, parse_report, None),
        (
            "Manual integration report",
            is_manual_integration,
            parse_manual_report,
            "manual_integration",
        ),
    )

    # --- Attempts 1 and 2: standard, then manual-integration LCMS report ---
    for label, detect, parse, variant in lcms_variants:
        # Detection is guarded separately from parsing: if the detector
        # itself raises (e.g. the file cannot be read), nothing has been
        # detected, so the result must not claim an LCMS report was found.
        try:
            detected = detect(pdf_path)
        except Exception as exc:
            return failure(f"{label} detection failed: {exc}")
        if not detected:
            continue

        try:
            data = _dataclass_to_dict(parse(pdf_path))
            if variant is not None:
                data["variant"] = variant
        except Exception as exc:
            return failure(
                f"{label} detected but parsing failed: {exc}",
                file_type="lcms",
            )
        return success("lcms", data)

    # --- Attempt 3: NMR PDF ---
    try:
        nmr_strings = extract_nmr_data(pdf_path)
        if nmr_strings:
            return success("nmr", {"nmr_strings": nmr_strings})
    except Exception as exc:
        return failure(f"NMR extraction failed: {exc}")

    # --- Nothing matched ---
    return {
        **base_result,
        "ok": False,
        "error": (
            "Could not detect file type: not a Waters standard report, "
            "not a manual integration export, and no NMR data strings found."
        ),
    }