cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,701 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Lab Book Formatter — Output Section Generation for Procedure Writer
4
+
5
+ Builds the three sections of a lab book entry:
6
+ PROCEDURE — setup text, tracking narrative, workup, purification, isolation
7
+ CHARACTERIZATION — LCMS annotations (tracking + purified) + NMR data
8
+ NOTES — conversion timeline, unidentified compounds, diagnostics
9
+
10
+ Also contains formatting helpers for LCMS method names, purity reporting,
11
+ UV lambda-max values, and narrative inference from LCMS filenames.
12
+
13
+ Usage:
14
+ from lab_book_formatter import (
15
+ build_procedure_section, build_characterization_section,
16
+ build_notes_section, assemble_output,
17
+ )
18
+ """
19
+
20
+ import math
21
+ import os
22
+ import re
23
+ import sys
24
+ from typing import List, Optional, Dict
25
+
26
+ from cdxml_toolkit.constants import MIN_REPORT_AREA_PCT
27
+ from .mass_resolver import ExpectedSpecies
28
+ from .lcms_identifier import (
29
+ IdentifiedCompound, IdentifiedPeak,
30
+ TrackingAnalysis, PurifiedAnalysis,
31
+ )
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Constants
35
+ # ---------------------------------------------------------------------------
36
+
37
# Horizontal rule printed above/below each lab-book section title (60 chars).
SECTION_SEP = 60 * "="
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Method formatting
41
+ # ---------------------------------------------------------------------------
42
+
43
def format_method_name(method_path: str) -> str:
    """Derive a short, human-readable LCMS method label, e.g. 'AmF 2 min'.

    Scans the underscore-separated tokens of the method file's basename
    (with any '.olp' extension removed) for a buffer-type keyword and a
    runtime token containing 'min'.  Later tokens overwrite earlier
    matches.  Falls back to the raw basename when nothing matches.
    """
    stem = os.path.basename(method_path).replace('.olp', '')

    buffer_label = ""
    runtime_label = ""

    for token in stem.split('_'):
        low = token.lower()

        # Buffer type keywords (checked in fixed priority per token).
        if 'amf' in low:
            buffer_label = "AmF"
        elif 'amb' in low or 'ambic' in low:
            buffer_label = "AmB"
        elif low == 'fa':
            buffer_label = "FA"
        elif 'tfa' in low:
            buffer_label = "TFA"

        # Runtime token: '2min', '5p5min' ('p' is a decimal point stand-in).
        if 'min' in low:
            numeric = low.replace('min', '').replace('p', '.')
            try:
                runtime_label = f"{round(float(numeric))} min"
            except ValueError:
                # Not parseable as a number — keep the raw token.
                runtime_label = token

    label = " ".join(filter(None, (buffer_label, runtime_label)))
    return label if label else stem
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Tracking narrative helpers
75
+ # ---------------------------------------------------------------------------
76
+
77
+ def _infer_event_from_filename(filename: str) -> Optional[str]:
78
+ """Infer a reaction event from an LCMS filename suffix."""
79
+ name = filename.lower()
80
+
81
+ # Additional reagent addition
82
+ m = re.search(r'add\s*(\d+)\s*mg\s*(\w+)', name)
83
+ if m:
84
+ amount = m.group(1)
85
+ reagent = m.group(2).upper()
86
+ return f"Additional {reagent} ({amount} mg) was added"
87
+
88
+ if 'addmore' in name:
89
+ return "Additional reagent was added"
90
+
91
+ # Quenching
92
+ m = re.search(r'(\d+)\s*ml\s*me', name)
93
+ if m:
94
+ return f"The reaction was quenched with MeOH ({m.group(1)} mL)"
95
+
96
+ return None
97
+
98
+
99
+ def _infer_timepoint_desc(filename: str) -> str:
100
+ """Extract a timepoint description from filename for narrative use."""
101
+ name = filename.lower()
102
+
103
+ # Remove experiment prefix and extension
104
+ base = os.path.splitext(os.path.basename(name))[0]
105
+
106
+ # Extract time patterns
107
+ m = re.search(r'(\d+)\s*min', base)
108
+ if m:
109
+ if 'premix' in base:
110
+ return f"after {m.group(1)} min premixing"
111
+ return f"after {m.group(1)} min"
112
+
113
+ m = re.search(r'(\d+)\s*h\b', base)
114
+ if m:
115
+ return f"after {m.group(1)} h"
116
+
117
+ if re.search(r'\bON\b', os.path.basename(filename)) or 'overnight' in name:
118
+ return "after overnight stirring"
119
+
120
+ if 'beforeadd' in name:
121
+ return "before adding more reagent"
122
+
123
+ if 'premix' in name:
124
+ return "after premixing"
125
+
126
+ return ""
127
+
128
+
129
+ def _timepoint_for_file(files, file_index: int) -> str:
130
+ """Get timepoint description for a file by index."""
131
+ if file_index < len(files):
132
+ return _infer_timepoint_desc(files[file_index].filename)
133
+ return ""
134
+
135
+
136
def build_tracking_narrative(exp, tracking: TrackingAnalysis) -> str:
    """
    Build a concise reaction monitoring narrative from multi-LCMS data.

    Walks the tracking files in order, computing a conversion estimate at
    each timepoint from substrate (SM) and product (DP) area% values, and
    emits a sentence only when the story changes: the first usable
    datapoint, completion (>= 95%), a stall (|delta| < 5%), or a jump
    (delta > 10%).  A one-off reagent-addition/quench event inferred from
    the first tracking filename is prepended when present.

    Reads only ``exp.lcms_files`` (category, filename) from *exp*, and
    ``tracking.result`` / ``tracking.identified`` from *tracking*.

    Returns the narrative paragraph, a trend-only fallback sentence, or
    "" when there is nothing to report.
    """
    # Both the aligned file list and at least one identified compound are
    # required to say anything.
    if not tracking.result or not tracking.identified:
        return ""

    # Find SM (role "substrate") and product (role "product") compounds;
    # if several share a role, the last one in the list wins.
    sm_ic = None
    dp_ic = None
    for ic in tracking.identified:
        if ic.species.role == "substrate":
            sm_ic = ic
        elif ic.species.role == "product":
            dp_ic = ic

    if not sm_ic and not dp_ic:
        return ""

    events = []
    files = tracking.result.files

    # Check for reagent addition / quench events encoded in the filename
    # of a tracking file; only the first such event is reported.
    for lf in exp.lcms_files:
        if lf.category == "tracking":
            event = _infer_event_from_filename(lf.filename)
            if event:
                events.append(event + ".")
                break  # Only report first addition

    # Per-file area% maps keyed by file index (empty when role not found).
    sm_areas = sm_ic.compound.area_pct_by_file if sm_ic else {}
    dp_areas = dp_ic.compound.area_pct_by_file if dp_ic else {}

    prev_conversion = None
    reported_complete = False  # stop narrating once completion is stated

    for fi in range(len(files)):
        fe = files[fi]
        is_premix = 'premix' in fe.filename.lower()
        tp = _infer_timepoint_desc(fe.filename)

        sm_area = sm_areas.get(fi)
        dp_area = dp_areas.get(fi)

        # Conversion estimate: DP / (SM + DP) when both are tracked.
        if sm_ic and dp_ic:
            sa = sm_area if sm_area is not None else 0
            da = dp_area if dp_area is not None else 0
            total = sa + da
            # total <= 1.0 area% is treated as noise -> no estimate.
            conversion = (da / total * 100) if total > 1.0 else None
        elif sm_ic and sm_area is not None:
            # No product identified — track SM consumption instead;
            # SM >= 95% means effectively no conversion to report.
            conversion = 100 - sm_area if sm_area < 95 else None
        else:
            conversion = None

        # Premix injections and files without an estimate are skipped.
        if is_premix or conversion is None:
            continue

        if prev_conversion is None:
            # First usable datapoint: needs a timepoint phrase to be
            # readable as a sentence.
            if conversion > 0 and tp:
                if conversion >= 95:
                    events.append(
                        f"LCMS {tp} indicated complete consumption "
                        f"of starting material.")
                    reported_complete = True
                else:
                    events.append(
                        f"LCMS {tp} indicated ~{conversion:.0f}% "
                        f"conversion.")
        else:
            delta = conversion - prev_conversion
            if conversion >= 95 and not reported_complete:
                events.append(
                    f"LCMS {tp} indicated complete consumption "
                    f"of starting material.")
                reported_complete = True
            elif reported_complete:
                # Nothing more to narrate after completion.
                pass
            elif abs(delta) < 5:
                # Stalled conversion — report at most once.
                if not any('did not improve' in e for e in events):
                    events.append(
                        f"Conversion did not improve {tp} "
                        f"(~{conversion:.0f}%).")
            elif delta > 10:
                events.append(
                    f"LCMS {tp} showed ~{conversion:.0f}% conversion.")

        prev_conversion = conversion

    if not events:
        # Fallback: no sentence-worthy datapoints — report raw trends.
        parts = []
        if sm_ic:
            parts.append(f"SM {sm_ic.compound.trend}")
        if dp_ic:
            parts.append(f"DP {dp_ic.compound.trend}")
        if parts:
            return "LCMS tracking: " + ", ".join(parts) + "."
        return ""

    return " ".join(events)
242
+
243
+
244
+ # ---------------------------------------------------------------------------
245
+ # Workup / purification inference
246
+ # ---------------------------------------------------------------------------
247
+
248
+ def _infer_workup_steps(exp) -> List[str]:
249
+ """Infer workup steps from LCMS filenames."""
250
+ steps = []
251
+ seen = set()
252
+
253
+ for lf in exp.lcms_files:
254
+ name = lf.filename.lower()
255
+
256
+ if 'eawash' in name and 'ea_wash' not in seen:
257
+ steps.append("washed with EtOAc")
258
+ seen.add('ea_wash')
259
+ elif 'dcmwash' in name and 'dcm_wash' not in seen:
260
+ steps.append("washed with DCM")
261
+ seen.add('dcm_wash')
262
+ elif 'wash' in name and 'rewash' not in name and 'wash' not in seen:
263
+ steps.append("washed")
264
+ seen.add('wash')
265
+ elif 'rewash' in name and 'rewash' not in seen:
266
+ steps.append("re-washed")
267
+ seen.add('rewash')
268
+ elif 'pellet' in name and 'pellet' not in seen:
269
+ steps.append("precipitate collected")
270
+ seen.add('pellet')
271
+ elif 'super' in name and 'super' not in seen:
272
+ steps.append("supernatant separated")
273
+ seen.add('super')
274
+
275
+ return steps
276
+
277
+
278
+ def _infer_purification(exp) -> Optional[str]:
279
+ """Infer purification method from LCMS filenames."""
280
+ names = [lf.filename.lower() for lf in exp.lcms_files]
281
+ all_names = " ".join(names)
282
+
283
+ methods = []
284
+ if 'nppurif' in all_names or 'np-' in all_names:
285
+ methods.append("normal phase column chromatography")
286
+ if 'c18' in all_names:
287
+ methods.append("reversed phase (C18) column chromatography")
288
+ if 'peakcomb' in all_names:
289
+ methods.append("fractions combined based on LCMS")
290
+
291
+ has_purified = any('purified' in n or 'pure' in n for n in names)
292
+
293
+ if methods:
294
+ return "Purified by " + " followed by ".join(methods) + "."
295
+ elif has_purified:
296
+ return "Purified (method not specified in LCMS filenames)."
297
+
298
+ return None
299
+
300
+ # ---------------------------------------------------------------------------
301
+ # Procedure section
302
+ # ---------------------------------------------------------------------------
303
+
304
+ def _strip_nmr_from_procedure(text: str) -> str:
305
+ """Remove NMR data strings from procedure text.
306
+
307
+ NMR data (e.g. '1H NMR (400 MHz, DMSO-d6) δ ...') belongs in
308
+ CHARACTERIZATION, not PROCEDURE. If the ELN CSV procedure text
309
+ contains NMR data, strip it out to avoid duplication.
310
+ """
311
+ # Pattern matches "1H NMR (...) δ ..." through to the end of the data
312
+ # (terminated by a period + whitespace/newline, or end of string)
313
+ nmr_pattern = re.compile(
314
+ r'\d+[A-Z]\s+NMR\s*\([^)]+\)\s*[\u03b4\u00b4d]\s*.+?'
315
+ r'(?=\.\s*(?:\d+[A-Z]\s+NMR|\n|$)|\Z)',
316
+ re.DOTALL
317
+ )
318
+ cleaned = nmr_pattern.sub('', text)
319
+ # Clean up leftover whitespace / blank lines from removal
320
+ cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
321
+ return cleaned.strip()
322
+
323
+
324
+ def _procedure_has_tracking(text: str) -> bool:
325
+ keywords = ['lcms', 'tracking', 'conversion', 'consumption of sm',
326
+ 'reaction complete', 'indicated', 'lc-ms']
327
+ tl = text.lower()
328
+ return any(kw in tl for kw in keywords)
329
+
330
+
331
+ def _procedure_has_workup(text: str) -> bool:
332
+ keywords = ['was concentrated', 'was quenched', 'was washed',
333
+ 'was extracted', 'aqueous', 'was dried', 'workup']
334
+ tl = text.lower()
335
+ return any(kw in tl for kw in keywords)
336
+
337
+
338
+ def _procedure_has_isolation(text: str) -> bool:
339
+ keywords = ['was purified', 'to give', 'to afford',
340
+ 'column chromatography', 'chromatography', 'purified by']
341
+ tl = text.lower()
342
+ return any(kw in tl for kw in keywords)
343
+
344
+
345
def build_procedure_section(exp, tracking: TrackingAnalysis) -> str:
    """Build the complete PROCEDURE section.

    Starts from the ELN procedure text (with NMR data stripped out), then
    appends monitoring / workup / purification / isolation sentences
    inferred from LCMS filenames — but only for aspects the original text
    does not already cover.  Paragraphs are joined with blank lines.
    """
    # 1. Setup text from the CSV; NMR data belongs in CHARACTERIZATION.
    raw = exp.procedure_text or ""
    setup = _strip_nmr_from_procedure(raw) if raw else ""

    parts = [setup if setup else "[Procedure text not available in CSV.]"]

    # 2. Reaction monitoring narrative, unless already described.
    if not _procedure_has_tracking(setup):
        narrative = build_tracking_narrative(exp, tracking)
        if narrative:
            parts.append(narrative)

    # 3. Workup steps, unless already described.
    if not _procedure_has_workup(setup):
        workup = _infer_workup_steps(exp)
        if workup:
            parts.append("The reaction mixture was " + ", ".join(workup) + ".")

    # 4./5. Purification and product isolation share the same guard.
    has_isolation = _procedure_has_isolation(setup)
    if not has_isolation:
        purification = _infer_purification(exp)
        if purification:
            parts.append(purification)

    if exp.product and not has_isolation:
        product = exp.product
        if product.obtained_mass and product.yield_pct:
            parts.append(
                f"Obtained {product.name} "
                f"({product.obtained_mass}, {product.yield_pct}).")
        elif product.obtained_mass:
            parts.append(
                f"Obtained {product.name} "
                f"({product.obtained_mass}).")

    return "\n\n".join(parts)
390
+
391
+ # ---------------------------------------------------------------------------
392
+ # Characterization section
393
+ # ---------------------------------------------------------------------------
394
+
395
+ def _format_lambda_max(wavelengths: List[float]) -> str:
396
+ """Format UV lambda max values, e.g. 'λmax 218, 254 nm'.
397
+
398
+ Uses arithmetic rounding (half-up) rather than Python's default
399
+ banker's rounding, so 222.5 -> 223 not 222.
400
+ """
401
+ if not wavelengths:
402
+ return ""
403
+ wl_strs = [str(math.floor(wl + 0.5)) for wl in sorted(wavelengths)]
404
+ return f"\u03bbmax {', '.join(wl_strs)} nm"
405
+
406
+
407
def _format_purity(purified: PurifiedAnalysis) -> str:
    """Render '; purity TAC x%, 220nm y%, 254nm z%' from the detectors
    that reported a value; '' when none did."""
    detectors = (
        ("TAC", purified.purity_tac),
        ("220nm", purified.purity_220nm),
        ("254nm", purified.purity_254nm),
    )
    parts = [f"{label} {value:.0f}%"
             for label, value in detectors if value is not None]
    if not parts:
        return ""
    return "; purity " + ", ".join(parts)
419
+
420
+
421
def build_characterization_section(
    exp,
    expected: List[ExpectedSpecies],
    tracking: TrackingAnalysis,
    purified: PurifiedAnalysis,
) -> str:
    """Build the CHARACTERIZATION section with LCMS and NMR data.

    Emits up to three kinds of paragraphs, joined by blank lines:
    a tracking-LCMS summary (identified species, SM first, then product,
    then other reactants), a purified/crude product-LCMS summary
    (deduplicated by species, with purity), and verbatim NMR data strings
    from ``exp.nmr_data``.  Species/peaks below MIN_REPORT_AREA_PCT are
    omitted.  Returns a bracketed placeholder when *expected* is empty or
    nothing could be reported.
    """
    # Without expected masses no LCMS peak can be annotated at all.
    if not expected:
        return "[Expected species masses not available — cannot annotate LCMS.]"

    lines = []

    # Tracking LCMS — report identified species from multi-LCMS analysis
    # Only include compounds with max area% >= MIN_REPORT_AREA_PCT
    if tracking.result and tracking.identified:
        instrument = tracking.result.instrument
        method = tracking.result.method_short

        # Build annotation from identified compounds.
        # Order: SM first, then product, then other reactants; unknown
        # roles sort last (key 3), ties broken by retention time.
        role_order = {"substrate": 0, "product": 1, "reactant": 2}
        sorted_ic = sorted(
            tracking.identified,
            key=lambda ic: (role_order.get(ic.species.role, 3),
                            ic.compound.canonical_rt))

        parts = []
        for ic in sorted_ic:
            if ic.compound.max_area < MIN_REPORT_AREA_PCT:
                continue
            # "name RT x.xx, adduct m/z" plus optional UV lambda-max.
            entry = (f"{ic.species.name} RT {ic.compound.canonical_rt:.2f}, "
                     f"{ic.adduct} {ic.matched_mz:.1f}")
            lm = _format_lambda_max(ic.compound.uv_lambda_max)
            if lm:
                entry += f", {lm}"
            parts.append(entry)

        if parts:
            lines.append(
                f"Reaction tracking LCMS ({instrument}, {method}): "
                + "; ".join(parts))

    # Purified/Crude product LCMS — deduplicate by species (keep highest area)
    # Only include peaks with area% >= MIN_REPORT_AREA_PCT
    _product_label = ("Crude product LCMS" if purified.is_crude_fallback
                      else "Purified product LCMS")
    if purified.report and purified.identified:
        instrument = purified.report.instrument
        method = purified.report.method_short

        # Keep only the highest-area peak per species name.
        best_by_species: Dict[str, IdentifiedPeak] = {}
        for ip in purified.identified:
            area = ip.peak.area_pct or 0
            if area < MIN_REPORT_AREA_PCT:
                continue
            key = ip.species.name
            if key not in best_by_species or (
                    area > (best_by_species[key].peak.area_pct or 0)):
                best_by_species[key] = ip

        parts = []
        for ip in best_by_species.values():
            entry = (f"{ip.species.name} RT {ip.peak.rt:.2f}, "
                     f"{ip.adduct} {ip.matched_mz:.1f}")
            lm = _format_lambda_max(ip.peak.uv_lambda_max)
            if lm:
                entry += f", {lm}"
            parts.append(entry)

        purity_str = _format_purity(purified)

        if parts:
            lines.append(
                f"{_product_label} ({instrument}, {method}): "
                + "; ".join(parts) + purity_str)

    elif purified.report and not purified.identified:
        # No species identified by mass spec — report dominant peak's purity
        # and UV data if the main peak is large enough (likely purified product)
        instrument = purified.report.instrument
        method = purified.report.method_short

        # Find the dominant peak (highest TAC area%)
        best_peak = None
        best_area = 0.0
        for peak in purified.report.peaks:
            area = peak.area_pct or 0
            if area > best_area:
                best_area = area
                best_peak = peak

        if best_peak and best_area >= MIN_REPORT_AREA_PCT:
            entry = f"RT {best_peak.rt:.2f}"
            lm = _format_lambda_max(best_peak.uv_lambda_max)
            if lm:
                entry += f", {lm}"

            # Build purity from the dominant peak directly (cannot use
            # _format_purity here — that reads whole-analysis fields).
            purity_parts = []
            if best_peak.area_pct is not None:
                purity_parts.append(f"TAC {best_peak.area_pct:.0f}%")
            if best_peak.area_pct_220nm is not None:
                purity_parts.append(f"220nm {best_peak.area_pct_220nm:.0f}%")
            if best_peak.area_pct_254nm is not None:
                purity_parts.append(f"254nm {best_peak.area_pct_254nm:.0f}%")
            purity_str = ("; purity " + ", ".join(purity_parts)
                          if purity_parts else "")

            lines.append(
                f"{_product_label} ({instrument}, {method}): "
                f"{entry}{purity_str} [no MS data]")

    # NMR data strings are passed through verbatim.
    for nmr_str in exp.nmr_data:
        lines.append(nmr_str)

    if not lines:
        return "[No characterization data available.]"

    return "\n\n".join(lines)
541
+
542
+ # ---------------------------------------------------------------------------
543
+ # Notes section
544
+ # ---------------------------------------------------------------------------
545
+
546
def build_notes_section(
    exp,
    expected: List[ExpectedSpecies],
    tracking: TrackingAnalysis,
    purified: PurifiedAnalysis,
) -> str:
    """Build the NOTES section with observations and inferences.

    Emits a bullet list covering, in order: per-file conversion timeline,
    significant unidentified tracking compounds, unknown peaks in the
    purified/crude product run, the product LCMS source file, the origin
    of the expected masses, analysis warnings and discarded files, an
    LCMS file-category summary, and missing-data flags.  Returns
    "No notes." when nothing applies.
    """
    notes = []

    # Conversion timeline from multi-LCMS tracking data
    if tracking.result and tracking.identified:
        # First compound per role (unlike build_tracking_narrative, which
        # keeps the last one).
        sm_ic = next((ic for ic in tracking.identified
                      if ic.species.role == "substrate"), None)
        dp_ic = next((ic for ic in tracking.identified
                      if ic.species.role == "product"), None)

        if sm_ic or dp_ic:
            for fi, fe in enumerate(tracking.result.files):
                sm_area = (sm_ic.compound.area_pct_by_file.get(fi)
                           if sm_ic else None)
                dp_area = (dp_ic.compound.area_pct_by_file.get(fi)
                           if dp_ic else None)

                # Compute conversion%: 1 - SM/(SM+DP)
                conv_str = None
                if sm_area is not None and dp_area is not None:
                    total = sm_area + dp_area
                    if total > 0:
                        conv = (1.0 - sm_area / total) * 100
                        conv_str = f"conversion {conv:.0f}%"
                elif sm_area is not None and dp_area is None:
                    # No product detected yet
                    conv_str = "conversion 0%"
                elif dp_area is not None and sm_area is None:
                    # No SM detected — full conversion
                    conv_str = "conversion 100%"
                # If neither area is known, no timeline note for this file.

                tp = _infer_timepoint_desc(fe.filename)
                if conv_str is not None:
                    notes.append(f"- {fe.filename}: {conv_str}"
                                 f"{' ' + tp if tp else ''}")

    # Unidentified compounds in tracking (only significant ones)
    if tracking.unidentified:
        for c in tracking.unidentified:
            if c.max_area >= MIN_REPORT_AREA_PCT:
                # Report up to 3 ions, preferring recurring over one-off.
                ions_strs = []
                for ic in (c.recurring_ions or c.other_ions)[:3]:
                    mode_str = "ESI+" if ic.mode == "ES+" else "ESI-"
                    ions_strs.append(f"{mode_str} {ic.mean_mz:.1f}")
                ions_str = ", ".join(ions_strs) if ions_strs else "no ions"
                notes.append(
                    f"- Unidentified compound RT {c.canonical_rt:.2f} "
                    f"({c.trend}, max {c.max_area:.1f}%): {ions_str}")

    # Unknown peaks in purified product (area% above 1, RT not matched to
    # any identified species)
    if purified.report and purified.identified:
        identified_rts = {ip.peak.rt for ip in purified.identified}
        for peak in purified.report.peaks:
            if peak.rt not in identified_rts and (peak.area_pct or 0) > 1.0:
                ions_strs = []
                for spec in peak.ms_spectra:
                    if spec.top_ions:
                        mode_str = "ESI+" if spec.mode == "ES+" else "ESI-"
                        ions_strs.append(f"{mode_str} {spec.top_ions[0]:.1f}")
                ions_str = ", ".join(ions_strs) if ions_strs else "no MS"
                _product_type = "crude product" if purified.is_crude_fallback else "purified product"
                notes.append(
                    f"- Unknown peak in {_product_type}: RT {peak.rt:.2f}, "
                    f"{peak.area_pct:.1f}% ({ions_str})")

    # Purified product LCMS source file
    if purified.file_info:
        _product_label = ("Crude product LCMS" if purified.is_crude_fallback
                          else "Purified product LCMS")
        notes.append(f"- {_product_label}: {purified.file_info.filename}")

    # Species source — check if masses came from structure or CSV
    source_files = {sp.source_file for sp in expected if sp.source_file}
    if source_files:
        # Arbitrary member of the set; typically there is only one file.
        source = next(iter(source_files))
        notes.append(f"- Expected masses from structure file "
                     f"({os.path.basename(source)})")
    elif expected:
        notes.append("- Expected masses from CSV MW (no CDX/RXN available)")

    # Analysis warnings (method mismatches, outliers, discarded files)
    if tracking.result and tracking.result.warnings:
        for w in tracking.result.warnings:
            notes.append(f"- LCMS analysis: {w}")
    if tracking.result and tracking.result.discarded_files:
        disc_names = [f.filename for f in tracking.result.discarded_files]
        notes.append(
            f"- {len(disc_names)} LCMS file(s) from different instrument/method "
            f"excluded from tracking: {', '.join(disc_names)}")

    # File categorization summary, e.g. "5 tracking, 1 purified"
    categories = {}
    for lf in exp.lcms_files:
        categories.setdefault(lf.category, []).append(lf.filename)
    if categories:
        notes.append(f"- LCMS files: {len(exp.lcms_files)} total "
                     f"({', '.join(f'{len(v)} {k}' for k, v in categories.items())})")

    # Missing data flags
    if not exp.procedure_text:
        notes.append("- Procedure text was empty")
    if not exp.lcms_files:
        notes.append("- No LCMS files found")
    if not exp.nmr_pdfs:
        notes.append("- No NMR PDFs found")
    elif not exp.nmr_data:
        notes.append("- NMR PDFs present but no reported data string found "
                     "— needs manual extraction")
    if exp.product and not exp.product.obtained_mass:
        notes.append("- Yield/mass obtained not recorded in CSV")
    if not expected:
        notes.append("- No expected species masses available")

    return "\n".join(notes) if notes else "No notes."
666
+
667
+ # ---------------------------------------------------------------------------
668
+ # Output assembly
669
+ # ---------------------------------------------------------------------------
670
+
671
def assemble_output(procedure: str, characterization: str,
                    notes: str) -> str:
    """Assemble the PROCEDURE / CHARACTERIZATION / NOTES sections.

    Each section is rendered as a banner (separator, title, separator)
    followed by a blank line and the section body, with one blank line
    between sections and no trailing blank line after NOTES.
    """
    rendered = []
    for title, body in (("PROCEDURE", procedure),
                        ("CHARACTERIZATION", characterization),
                        ("NOTES", notes)):
        rendered.extend([SECTION_SEP, title, SECTION_SEP, "", body, ""])
    # Drop the trailing blank line after the final section.
    return "\n".join(rendered[:-1])
694
+
695
+ # ---------------------------------------------------------------------------
696
+ # CLI placeholder
697
+ # ---------------------------------------------------------------------------
698
+
699
if __name__ == "__main__":
    # This module is a library; direct invocation only points users at
    # the real entry point.
    print("lab_book_formatter: no standalone CLI — "
          "import from procedure_writer.py")