cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,446 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Procedure Writer — Lab Book Entry Assembler
4
+
5
+ Takes LCMS PDFs, NMR PDFs, ELN CSV exports, and CDX/RXN structure files for
6
+ a single reaction and outputs a polished, copy-paste-ready lab book entry with
7
+ three sections:
8
+ PROCEDURE — concise, publication-quality procedure text
9
+ CHARACTERIZATION — LCMS annotations + NMR data
10
+ NOTES — rough observations and inferences
11
+
12
+ Expected masses for LCMS identification are derived from CDX/RXN structure
13
+ files (via ChemScript + RDKit), with fallback to CSV MW values. Tracking
14
+ LCMS analysis is delegated to multi_lcms_analyzer for cross-file compound
15
+ matching, trend detection, and area% timelines.
16
+
17
+ Usage:
18
+ python procedure_writer.py --input-dir path/to/experiment/ --output result.txt
19
+ python procedure_writer.py --input-dir path/to/parent/ --experiment KL-7001-004
20
+ python procedure_writer.py --input-dir path/to/parent/ --experiment KL-7001-004 \\
21
+ --sm-mass 274 --product-mass 459 --output result.txt
22
+ """
23
+
24
+ import argparse
25
+ import json
26
+ import os
27
+ import re
28
+ import sys
29
+ from typing import List, Optional, Dict
30
+
31
+ # --- LCMS tools ---
32
+ from ..lcms_analyzer import extract_all_text
33
+ from cdxml_toolkit.constants import MIN_SIGNIFICANT_AREA
34
+
35
+ # --- Mass resolution (split out to mass_resolver.py) ---
36
+ from .mass_resolver import (
37
+ ExpectedSpecies,
38
+ extract_expected_masses,
39
+ ADDUCTS, ADDUCT_PRIORITY, MODE_PREFERENCE,
40
+ )
41
+
42
+ # --- LCMS identification (split out to lcms_identifier.py) ---
43
+ from .lcms_identifier import (
44
+ IdentifiedCompound, TrackingAnalysis, IdentifiedPeak, PurifiedAnalysis,
45
+ match_ions_to_species, run_tracking_analysis, run_purified_analysis,
46
+ run_tracking_from_result,
47
+ )
48
+ from .multi_lcms_analyzer import load_analysis_from_json
49
+
50
+ # --- Output formatting (split out to lab_book_formatter.py) ---
51
+ from .lab_book_formatter import (
52
+ SECTION_SEP,
53
+ format_method_name,
54
+ build_procedure_section, build_tracking_narrative,
55
+ build_characterization_section, build_notes_section,
56
+ assemble_output,
57
+ )
58
+
59
+ from .discover_experiment_files import (
60
+ discover_experiment_files,
61
+ DiscoveryResult,
62
+ )
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Data structures & CSV parser — imported from package
66
+ # ---------------------------------------------------------------------------
67
+
68
+ from cdxml_toolkit.perception.eln_csv_parser import (
69
+ ReagentInfo, SolventInfo, ProductInfo, LCMSFileInfo, ExperimentData,
70
+ strip_html, extract_procedure_body, parse_eln_csv,
71
+ )
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # File discovery (delegates to discover_experiment_files.py)
75
+ # ---------------------------------------------------------------------------
76
+
77
def discover_files(input_dir: str,
                   experiment_name: Optional[str] = None) -> ExperimentData:
    """
    Locate every file belonging to one experiment and bundle it up.

    The directory scan itself is done by ``discover_experiment_files``.
    The first CSV found (if any) is parsed with ``parse_eln_csv``; when
    there is no CSV, or parsing yields nothing, an otherwise empty
    ``ExperimentData`` named after the discovered experiment is used.
    The first CDX and RXN paths, all LCMS files, and all NMR PDFs are
    then attached to that object, which is returned.
    """
    found = discover_experiment_files(input_dir, experiment_name)

    # Only the first CSV is consulted.
    record = parse_eln_csv(found.csv_files[0]) if found.csv_files else None
    if not record:
        record = ExperimentData(
            experiment_name=found.experiment,
            labbook_name='',
            procedure_html='',
            procedure_text='',
            reaction_type='',
            start_date='',
        )

    # Structure files: first CDX and first RXN only.
    if found.cdx_files:
        record.cdx_path = found.cdx_files[0]
    if found.rxn_files:
        record.rxn_path = found.rxn_files[0]

    # Re-wrap each discovered LCMS entry in the ELN-side record type.
    record.lcms_files.extend(
        LCMSFileInfo(
            path=entry.path,
            filename=os.path.basename(entry.path),
            category=entry.category,
            sort_key=entry.sort_key,
            group_prefix=entry.group_prefix,
            method_variant=entry.method_variant,
        )
        for entry in found.lcms_files
    )

    record.nmr_pdfs = list(found.nmr_files)
    return record
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # NMR extraction
124
+ # ---------------------------------------------------------------------------
125
+
126
def extract_nmr_data(pdf_path: str) -> List[str]:
    """
    Pull the reported NMR data strings out of an NMR PDF.

    The PDF text is obtained via ``extract_all_text`` and scanned for
    entries shaped like::

        1H NMR (400 MHz, DMSO-d6) delta ...
        13C NMR (101 MHz, DMSO-d6) delta ...
        19F NMR (376 MHz, DMSO-d6) delta ...

    Returns the distinct entries in order of first appearance. If the
    PDF cannot be read, a warning is printed to stderr and an empty list
    is returned.
    """
    try:
        raw_text = extract_all_text(pdf_path)
    except Exception as e:
        print(f"Warning: Could not read NMR PDF {pdf_path}: {e}",
              file=sys.stderr)
        return []

    # Nucleus label, parenthesised conditions, a delta marker, then the
    # shortest run of text ending just before a period that closes a
    # line, a period that precedes the next NMR header, or end of text.
    nmr_pattern = re.compile(
        r'(\d+[A-Z]\s+NMR\s*'                          # nucleus: 1H, 13C, 19F, etc.
        r'\([^)]+\)\s*'                               # (400 MHz, solvent)
        r'[\u03b4\u00b4d]\s*'                         # delta or delta symbol
        r'.+?)(?=\.\s*$|\.\s*\d+[A-Z]\s+NMR|\Z)',     # capture until end
        re.MULTILINE | re.DOTALL
    )

    # A dict doubles as an insertion-ordered set; the same data is
    # often repeated on every page of the PDF.
    unique = {}
    for match in nmr_pattern.finditer(raw_text):
        # Collapse line breaks and whitespace runs into single spaces.
        entry = re.sub(r'\s+', ' ', match.group(1).strip())
        if not entry.endswith('.'):
            # Cut after the last closing parenthesis and terminate
            # the entry with a period.
            close_idx = entry.rfind(')')
            if close_idx > 0:
                entry = f"{entry[:close_idx + 1]}."
        unique.setdefault(entry, None)

    return list(unique)
173
+
174
+ # ---------------------------------------------------------------------------
175
+ # NMR batch parsing
176
+ # ---------------------------------------------------------------------------
177
+
178
def parse_all_nmr(exp: ExperimentData) -> None:
    """Collect NMR data strings from every PDF in ``exp.nmr_pdfs``.

    Entries are appended to ``exp.nmr_data``; a string already produced
    by an earlier PDF in this call is not added again. One status line
    per PDF is printed to stderr.
    """
    already = set()
    for pdf_path in exp.nmr_pdfs:
        entries = extract_nmr_data(pdf_path)

        added = 0
        for entry in entries:
            if entry in already:
                continue
            already.add(entry)
            exp.nmr_data.append(entry)
            added += 1

        label = os.path.basename(pdf_path)
        if added:
            status = f" Found NMR data in {label}: {added} entries"
        elif entries:
            # Everything in this PDF was already seen in a previous one.
            status = f" NMR PDF {label}: data already seen (duplicate)"
        else:
            status = f" NMR PDF {label}: no reported data string found"
        print(status, file=sys.stderr)
198
+
199
+ # ---------------------------------------------------------------------------
200
+ # CLI
201
+ # ---------------------------------------------------------------------------
202
+
203
+ def _build_arg_parser() -> argparse.ArgumentParser:
204
+ p = argparse.ArgumentParser(
205
+ description="Procedure Writer — Lab Book Entry Assembler",
206
+ formatter_class=argparse.RawDescriptionHelpFormatter,
207
+ epilog=__doc__,
208
+ )
209
+ p.add_argument("--input-dir", "-i", required=True,
210
+ help="Directory containing experiment files "
211
+ "(experiment dir or parent dir)")
212
+ p.add_argument("--experiment", "-e", default=None,
213
+ help="Experiment name (e.g., KL-7001-004). "
214
+ "Required if input-dir is the parent directory.")
215
+ p.add_argument("--sm-mass", type=float, default=None,
216
+ help="Exact mass (MW) of starting material. "
217
+ "Auto-detected from CSV if not provided.")
218
+ p.add_argument("--product-mass", type=float, default=None,
219
+ help="Exact mass (MW) of desired product. "
220
+ "Auto-detected from CSV if not provided.")
221
+ p.add_argument("--predict-byproducts", action="store_true",
222
+ help="Predict reaction byproducts via FlowER for LCMS "
223
+ "matching (requires 'flower' conda env; results "
224
+ "are cached)")
225
+ p.add_argument("--flower-json", default=None,
226
+ help="Pre-computed FlowER byproduct predictions JSON "
227
+ "(from run_pipeline Phase 3.15). Adds predicted "
228
+ "byproducts to expected species for LCMS matching.")
229
+ p.add_argument("--tracking-json", default=None,
230
+ help="Pre-computed multi-LCMS tracking analysis JSON "
231
+ "(from multi_lcms_analyzer --json). Skips re-parsing "
232
+ "tracking PDFs.")
233
+ p.add_argument("--output", "-o", default=None,
234
+ help="Output file path (default: stdout)")
235
+ p.add_argument("--json-errors", action="store_true",
236
+ help="Output structured JSON error objects to stderr on "
237
+ "failure (for agent orchestration)")
238
+ return p
239
+
240
+
241
+ def _emit_json_error(error_code: str, detail: str,
242
+ file: str = None, *, stream=sys.stderr) -> None:
243
+ """Write a structured JSON error to stderr."""
244
+ obj = {"error": error_code, "detail": detail}
245
+ if file:
246
+ obj["file"] = file
247
+ print(json.dumps(obj), file=stream)
248
+
249
+
250
def main(argv=None) -> int:
    """CLI entry point: parse arguments, run the pipeline, report errors.

    Returns whatever ``_main_inner`` returns on success, or 1 if it
    raises. With ``--json-errors`` the failure is written as a JSON
    object whose error code is chosen from keywords in the exception
    message; otherwise a plain ``ERROR:`` line goes to stderr.
    """
    args = _build_arg_parser().parse_args(argv)

    try:
        return _main_inner(args)
    except Exception as e:
        if not args.json_errors:
            print(f"ERROR: {e}", file=sys.stderr)
            return 1

        # Ordered keyword rules; the first rule with a hit wins.
        lowered = str(e).lower()
        rules = (
            (("csv", "parse"), "csv_parse_failed"),
            (("lcms", "pdf"), "lcms_analysis_failed"),
            (("nmr",), "nmr_extraction_failed"),
            (("mass", "structure"), "mass_resolution_failed"),
        )
        code = next(
            (label for keywords, label in rules
             if any(word in lowered for word in keywords)),
            "procedure_failed",
        )
        _emit_json_error(code, str(e))
        return 1
273
+
274
+
275
def _merge_flower_json(flower_path: str, expected: list) -> None:
    """Append byproducts from a FlowER predictions JSON to ``expected``.

    Entries whose ``exact_mass`` lies within ``MASS_TOLERANCE`` of a
    species already in ``expected`` (or of an entry added earlier in the
    same file) are skipped. Any failure is reported as a warning on
    stderr and leaves the run going (best-effort).
    """
    print(f"\nLoading FlowER predictions from "
          f"{os.path.basename(flower_path)}...", file=sys.stderr)
    try:
        with open(flower_path, "r", encoding="utf-8") as f:
            flower_data = json.load(f)
        existing_masses = [s.exact_mass for s in expected]
        from cdxml_toolkit.constants import MASS_TOLERANCE
        n_loaded = 0
        for entry in flower_data:
            em = entry.get("exact_mass", 0)
            # Skip duplicates of existing species
            if any(abs(em - m) < MASS_TOLERANCE for m in existing_masses):
                continue
            expected.append(ExpectedSpecies(
                name=entry.get("name", "BP-?"),
                role=entry.get("role", "byproduct"),
                exact_mass=em,
                smiles=entry.get("smiles", ""),
                adducts=entry.get("adducts", {}),
                source_file=flower_path,
            ))
            existing_masses.append(em)
            n_loaded += 1
        print(f" Loaded {n_loaded} byproduct(s) from FlowER JSON",
              file=sys.stderr)
    except Exception as e:
        print(f" Warning: Could not load FlowER JSON: {e}",
              file=sys.stderr)


def _report_tracking(tracking) -> None:
    """Print the identified and unidentified tracking compounds to stderr."""
    for ic in tracking.identified:
        print(f" Compound RT {ic.compound.canonical_rt:.2f} -> "
              f"{ic.species.name} ({ic.adduct} {ic.matched_mz:.1f})",
              file=sys.stderr)
    if tracking.unidentified:
        n_sig = sum(1 for c in tracking.unidentified
                    if c.max_area > MIN_SIGNIFICANT_AREA)
        print(f" {len(tracking.unidentified)} unidentified compounds "
              f"({n_sig} with area > 2%)", file=sys.stderr)


def _write_flower_cdxml(output_path: str) -> None:
    """Write the inline FlowER predictions next to ``output_path``.

    The CDXML lands at ``<output stem>-flower-predictions.cdxml``. This
    is best-effort: missing optional tooling or a write failure only
    produces a message on stderr.
    """
    try:
        # Package-relative import: a bare ``from mass_resolver import``
        # cannot resolve inside the installed package, and the resulting
        # ImportError used to be swallowed, silently disabling this step.
        from .mass_resolver import get_last_flower_predictions
        from experiments.byproduct_prediction.flower_predictor import (
            write_byproducts_cdxml,
        )
        flower_all = get_last_flower_predictions()
        if flower_all:
            base, _ = os.path.splitext(output_path)
            cdxml_path = f"{base}-flower-predictions.cdxml"
            write_byproducts_cdxml(flower_all, cdxml_path)
    except ImportError as e:
        # Optional FlowER tooling is absent; say so rather than pass silently.
        print(f" Note: FlowER CDXML export skipped: {e}", file=sys.stderr)
    except Exception as e:
        print(f" Warning: Could not write FlowER CDXML: {e}",
              file=sys.stderr)


def _main_inner(args) -> int:
    """Run the whole procedure-writer pipeline for parsed CLI ``args``.

    Steps, in order: discover files, apply CLI mass overrides, resolve
    expected species (optionally extended from ``--flower-json``), run
    or load the tracking LCMS analysis, analyse final/workup LCMS,
    extract NMR strings, build the three output sections, and write the
    assembled text to ``--output`` or stdout. Progress goes to stderr.

    Returns 0 on completion; exceptions propagate to the caller.
    """
    print("Procedure Writer — discovering files...", file=sys.stderr)

    # Discover files
    exp = discover_files(args.input_dir, args.experiment)

    print(f"Experiment: {exp.experiment_name}", file=sys.stderr)
    print(f" CSV procedure: {'yes' if exp.procedure_text else 'no'}",
          file=sys.stderr)
    print(f" Reactants: {len(exp.reactants)}", file=sys.stderr)
    print(f" LCMS files: {len(exp.lcms_files)}", file=sys.stderr)
    print(f" NMR PDFs: {len(exp.nmr_pdfs)}", file=sys.stderr)
    print(f" CDX: {os.path.basename(exp.cdx_path) if exp.cdx_path else 'none'}",
          file=sys.stderr)
    print(f" RXN: {os.path.basename(exp.rxn_path) if exp.rxn_path else 'none'}",
          file=sys.stderr)

    # Override masses from CLI if provided
    if args.sm_mass is not None:
        exp.sm_mass = args.sm_mass
    if args.product_mass is not None:
        exp.product_mass = args.product_mass

    if exp.sm_mass:
        print(f" SM mass (CSV): {exp.sm_mass:.3f}", file=sys.stderr)
    if exp.product_mass:
        print(f" Product mass (CSV): {exp.product_mass:.3f}", file=sys.stderr)

    # Extract expected masses from CDX/RXN (or CSV fallback)
    print("\nDetermining expected species masses...", file=sys.stderr)
    expected = extract_expected_masses(
        exp, predict_byproducts=args.predict_byproducts)

    # Load pre-computed FlowER byproduct predictions if provided
    if args.flower_json:
        if os.path.isfile(args.flower_json):
            _merge_flower_json(args.flower_json, expected)
        else:
            # A mistyped path used to be ignored without any message.
            print(f"\n Warning: FlowER JSON not found: {args.flower_json}",
                  file=sys.stderr)

    for sp in expected:
        mh = sp.adducts.get("[M+H]+", 0)
        mh_neg = sp.adducts.get("[M-H]-", 0)
        print(f" {sp.name} ({sp.role}): {sp.exact_mass:.3f} Da"
              f" [M+H]+ {mh:.1f} [M-H]- {mh_neg:.1f}",
              file=sys.stderr)

    # Run tracking LCMS analysis (multi-LCMS)
    tracking = TrackingAnalysis()
    have_tracking_json = bool(args.tracking_json) and os.path.isfile(
        args.tracking_json)
    if args.tracking_json and not have_tracking_json:
        print(f"\n Warning: tracking JSON not found: {args.tracking_json}; "
              f"falling back to parsing tracking PDFs", file=sys.stderr)

    if have_tracking_json:
        # Use pre-computed tracking analysis (avoids re-parsing PDFs)
        print(f"\nLoading pre-computed tracking analysis from "
              f"{os.path.basename(args.tracking_json)}...", file=sys.stderr)
        analysis = load_analysis_from_json(args.tracking_json)
        print(f" {len(analysis.compounds)} compounds, "
              f"{len(analysis.files)} files", file=sys.stderr)
        tracking = run_tracking_from_result(analysis, expected)
        _report_tracking(tracking)
    else:
        tracking_files = [lf for lf in exp.lcms_files
                          if lf.category == "tracking"]
        if tracking_files:
            print(f"\nRunning tracking analysis "
                  f"({len(tracking_files)} files)...", file=sys.stderr)
            tracking = run_tracking_analysis(exp, expected)
            _report_tracking(tracking)

    # Parse purified product LCMS (final files preferred, workup fallback)
    purified = PurifiedAnalysis()
    final_files = [lf for lf in exp.lcms_files if lf.category == "final"]
    workup_files = [lf for lf in exp.lcms_files if lf.category == "workup"]
    if final_files or workup_files:
        print("\nAnalyzing purified product LCMS...", file=sys.stderr)
        purified = run_purified_analysis(exp, expected)
        purity_parts = []
        if purified.purity_tac is not None:
            purity_parts.append(f"TAC {purified.purity_tac:.0f}%")
        if purified.purity_220nm is not None:
            purity_parts.append(f"220nm {purified.purity_220nm:.0f}%")
        if purified.purity_254nm is not None:
            purity_parts.append(f"254nm {purified.purity_254nm:.0f}%")
        if purity_parts:
            print(f" Product purity: {', '.join(purity_parts)}",
                  file=sys.stderr)

    # Extract NMR data
    if exp.nmr_pdfs:
        print("\nExtracting NMR data...", file=sys.stderr)
        parse_all_nmr(exp)

    # Build output sections
    print("\nAssembling lab book entry...", file=sys.stderr)
    procedure = build_procedure_section(exp, tracking)
    characterization = build_characterization_section(
        exp, expected, tracking, purified)
    notes = build_notes_section(exp, expected, tracking, purified)

    result = assemble_output(procedure, characterization, notes)

    # FlowER byproduct reference CDXML (if predictions were made via
    # --predict-byproducts inline mode)
    if args.predict_byproducts and args.output:
        _write_flower_cdxml(args.output)

    # Output
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(result)
        print(f"\nOutput written to {args.output}", file=sys.stderr)
    else:
        # Write bytes directly so the text is UTF-8 regardless of the
        # console's configured encoding.
        sys.stdout.buffer.write(result.encode('utf-8'))
        sys.stdout.buffer.write(b'\n')

    return 0
443
+
444
+
445
# Script entry point: the exit status is whatever main() returns.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ NMR Data Extractor — standalone CLI wrapper.
4
+
5
+ Extracts reported NMR data strings (1H, 13C, 19F, etc.) from MestReNova
6
+ PDF exports. Delegates to procedure_writer.extract_nmr_data().
7
+
8
+ Usage:
9
+ python extract_nmr.py path/to/nmr.pdf [path/to/nmr2.pdf ...]
10
+ """
11
+
12
+ import argparse
13
+ import sys
14
+
15
+ from .procedure_writer import extract_nmr_data
16
+
17
+
18
def main(argv=None) -> int:
    """Extract NMR data strings from the given PDFs and emit them.

    Each distinct string is emitted once, in order of first appearance
    across all input files. Output goes to ``--output`` (one entry per
    line, with a summary on stderr) or to stdout. Always returns 0.
    """
    cli = argparse.ArgumentParser(description="Extract NMR data from PDFs")
    cli.add_argument('files', nargs='+', help='NMR PDF files')
    cli.add_argument('--output', '-o', type=str, default=None,
                     help='Output file (default: stdout)')
    opts = cli.parse_args(argv)

    # A dict serves as an insertion-ordered set for de-duplication.
    ordered = {}
    for pdf in opts.files:
        for line in extract_nmr_data(pdf):
            ordered.setdefault(line, None)
    results = list(ordered)
    joined = "\n".join(results)

    if not opts.output:
        # Print nothing at all when no entries were found.
        if joined:
            print(joined)
        return 0

    with open(opts.output, 'w', encoding='utf-8') as f:
        f.write(joined + "\n")
    print(f"Wrote {len(results)} NMR entries to {opts.output}",
          file=sys.stderr)
    return 0
44
+
45
+
46
# Script entry point: the exit status is whatever main() returns.
if __name__ == "__main__":
    raise SystemExit(main())