cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,598 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LCMS Identifier — Species Identification from Ion m/z Values
4
+
5
+ Matches observed LCMS ions against expected species adducts to identify
6
+ compounds in tracking and purified-product chromatograms. Handles both
7
+ multi-file tracking analysis (via multi_lcms_analyzer) and single-file
8
+ purified product analysis.
9
+
10
+ Key types:
11
+ - IdentifiedCompound: a multi-LCMS compound matched to an expected species
12
+ - IdentifiedPeak: a single-report peak matched to an expected species
13
+ - TrackingAnalysis: wrapper for multi-LCMS tracking results
14
+ - PurifiedAnalysis: wrapper for purified product LCMS results
15
+
16
+ Usage:
17
+ from cdxml_toolkit.analysis.deterministic.lcms_identifier import (
18
+ match_ions_to_species, run_tracking_analysis, run_purified_analysis,
19
+ )
20
+ """
21
+
22
+ import os
23
+ import sys
24
+ from dataclasses import dataclass, field
25
+ from typing import List, Optional, Tuple
26
+
27
+ from ..lcms_analyzer import parse_report, LCMSReport, ChromPeak
28
+ from .lcms_file_categorizer import (
29
+ categorize_lcms_files_batch,
30
+ calibrate_sort_keys_hybrid,
31
+ )
32
+ from .multi_lcms_analyzer import (
33
+ FileEntry as MultiFileEntry,
34
+ analyze as multi_analyze,
35
+ AnalysisResult,
36
+ Compound as MultiCompound,
37
+ IonCluster,
38
+ extract_run_datetime,
39
+ load_analysis_from_json,
40
+ )
41
+ from cdxml_toolkit.constants import MASS_TOLERANCE
42
+ from .mass_resolver import (
43
+ ExpectedSpecies,
44
+ ADDUCTS, ADDUCT_PRIORITY, MODE_PREFERENCE,
45
+ )
46
+
47
# Ranking of species roles, consulted when several expected species match
# the same ion: substrate/product (SM/DP) win over reactants and reagents,
# which in turn win over byproducts.  Smaller number = more preferred.
ROLE_PRIORITY = dict(
    substrate=0,
    product=0,
    reactant=1,
    reagent=1,
    byproduct=2,
)
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Data structures
59
+ # ---------------------------------------------------------------------------
60
+
61
@dataclass
class IdentifiedCompound:
    """Pairing of a multi-LCMS compound with the expected species it matched."""

    # Compound record (MultiCompound) that was identified
    compound: MultiCompound
    # Expected species assigned to the compound
    species: ExpectedSpecies
    # Name of the adduct that matched, e.g. "[M+H]+"
    adduct: str
    # Observed m/z value that produced the match
    matched_mz: float
68
+
69
@dataclass
class TrackingAnalysis:
    """Outcome of a tracking-LCMS analysis including species identification.

    Every field has a default, so a bare ``TrackingAnalysis()`` stands for
    "nothing was analysed".
    """

    # Underlying analysis result; None when no analysis was produced
    result: Optional[AnalysisResult] = None
    # Compounds that were assigned to an expected species
    identified: List[IdentifiedCompound] = field(default_factory=list)
    # Compounds that could not be assigned to any expected species
    unidentified: List[MultiCompound] = field(default_factory=list)
    # File entries associated with the analysis
    files: List[MultiFileEntry] = field(default_factory=list)
76
+
77
@dataclass
class IdentifiedPeak:
    """Pairing of one chromatographic peak from a single report with a species."""

    # Peak (ChromPeak) taken from the parsed report
    peak: ChromPeak
    # Expected species assigned to the peak
    species: ExpectedSpecies
    # Name of the adduct that matched
    adduct: str
    # Observed m/z value that produced the match
    matched_mz: float
84
+
85
@dataclass
class PurifiedAnalysis:
    """Outcome of analysing the purified-product LCMS file.

    Every field has a default, so a bare ``PurifiedAnalysis()`` stands for
    "no suitable file was available".
    """

    # Parsed report; None when nothing was parsed
    report: Optional[LCMSReport] = None
    # The LCMS file object the analysis was based on (or attempted on)
    file_info: Optional[object] = None
    # Peaks that matched an expected species
    identified: List[IdentifiedPeak] = field(default_factory=list)
    # Area-% purity of the product on each detector (None = not detected)
    purity_tac: Optional[float] = None
    purity_220nm: Optional[float] = None
    purity_254nm: Optional[float] = None
    # Set when no "final" file existed and a workup file was analysed instead
    is_crude_fallback: bool = False
97
+
98
+ # ---------------------------------------------------------------------------
99
+ # Ion matching
100
+ # ---------------------------------------------------------------------------
101
+
102
def match_ions_to_species(
    ions: List[Tuple[str, float, int]],
    expected: List[ExpectedSpecies],
    tolerance: float = MASS_TOLERANCE,
) -> Optional[Tuple[ExpectedSpecies, str, float]]:
    """
    Choose the best expected-species adduct for a set of observed ions.

    Every (ion, species, adduct) combination whose ESI mode agrees and whose
    m/z difference is strictly below ``tolerance`` is a candidate.  The
    winning candidate is the one with the smallest ranking tuple:

      1. Adduct priority — [M+H]+/[M-H]- preferred over [M+Na]+/[M+formate]-
      2. Role priority   — SM/DP before reactants/reagents before byproducts
      3. Ion rank        — lower rank = more intense = preferred
      4. ESI mode        — ESI+ preferred over ESI-
      5. Mass accuracy   — smaller |observed - expected| preferred

    Candidates that tie on all five criteria are resolved in favour of the
    one encountered first (ions outermost, then species, then adducts).

    Args:
        ions: list of (mode, m/z, rank) tuples; rank 0 = base peak.
        expected: list of ExpectedSpecies with computed adducts.
        tolerance: matching tolerance in Da.

    Returns:
        (species, adduct_name, matched_mz) or None when nothing matches.
    """
    hits = []

    for mode, mz, rank in ions:
        for sp in expected:
            for adduct, theoretical_mz in sp.adducts.items():
                # An adduct can only explain an ion acquired in its own mode.
                if ADDUCTS[adduct][0] != mode:
                    continue
                error = abs(mz - theoretical_mz)
                if not error < tolerance:
                    continue
                ranking = (
                    ADDUCT_PRIORITY[adduct],         # 0=primary, 1=secondary
                    ROLE_PRIORITY.get(sp.role, 1),   # 0=SM/DP, 2=byproduct
                    rank,                            # 0=base peak
                    MODE_PREFERENCE.get(mode, 1),    # 0=ES+, 1=ES-
                    error,                           # mass accuracy
                )
                hits.append((ranking, (sp, adduct, mz)))

    if not hits:
        return None

    # min() keeps the first of equally-ranked hits, and only the ranking
    # tuple is compared, so species objects never need to be orderable.
    _, winner = min(hits, key=lambda hit: hit[0])
    return winner
151
+
152
+
153
def _try_assign_species(match, used_species):
    """Claim a matched species name, falling back to an isomer for products.

    Each species name may be claimed once.  When the name is already taken
    and the species is a product, a copy named "{name}-isomer" is created and
    claimed instead, so a second peak with the same mass (e.g. a regio- or
    diastereoisomer) is not pushed into the unidentified list.  The isomer
    name itself can also be claimed only once.

    Args:
        match: result from match_ions_to_species(), or None
        used_species: set of already-claimed species names (updated in place)

    Returns:
        (species, adduct, mz) if a name could be claimed, otherwise None
    """
    if match is None:
        return None

    species, adduct, mz = match
    claimed_name = species.name

    if claimed_name in used_species:
        # Only products get the isomer second chance.
        if species.role != "product":
            return None
        claimed_name = f"{species.name}-isomer"
        if claimed_name in used_species:
            return None
        species = ExpectedSpecies(
            name=claimed_name,
            role=species.role,
            exact_mass=species.exact_mass,
            smiles=species.smiles,
            adducts=dict(species.adducts),
            source_file=species.source_file,
        )

    used_species.add(claimed_name)
    return (species, adduct, mz)
193
+
194
+
195
+ # ---------------------------------------------------------------------------
196
+ # Tracking LCMS analysis (via multi_lcms_analyzer)
197
+ # ---------------------------------------------------------------------------
198
+
199
def _run_single_file_tracking(
    lf,
    expected: List[ExpectedSpecies],
) -> TrackingAnalysis:
    """
    Analyse one tracking LCMS file directly, bypassing multi_lcms_analyzer.

    The report is parsed (and stored on ``lf.report``), every peak is wrapped
    in a MultiCompound and matched against the expected species, and the
    outcome is packaged in the same structures the multi-file path produces.
    There is no cross-file trending or ion-recurrence filtering here; each
    compound is marked "stable" / "single file".

    Returns an empty TrackingAnalysis when the file cannot be parsed.
    """
    try:
        report = parse_report(lf.path)
        lf.report = report
        print(f" Parsed tracking: {lf.filename} "
              f"({len(report.peaks)} peaks)", file=sys.stderr)
    except Exception as e:
        print(f" Warning: Could not parse {lf.filename}: {e}",
              file=sys.stderr)
        return TrackingAnalysis()

    # FileEntry wrapper so downstream builders see the usual shape.
    entry = MultiFileEntry(
        path=os.path.abspath(lf.path),
        filename=lf.filename,
        category=lf.category,
        sort_key=lf.sort_key,
        report=report,
    )

    matched = []
    unmatched = []
    claimed_names = set()

    # Largest peaks are matched first, so they get first pick of species
    # names; compound ids follow the same order, starting at 1.
    peaks_by_area = sorted(
        report.peaks, key=lambda p: p.area_pct or 0, reverse=True)

    for compound_id, peak in enumerate(peaks_by_area, start=1):
        # (mode, m/z, rank) for every top ion of every spectrum of the peak
        ions = [
            (spec.mode, mz, rank)
            for spec in peak.ms_spectra
            for rank, mz in enumerate(spec.top_ions)
        ]

        # Wrap the peak so it looks like a multi-file compound.
        mc = MultiCompound(compound_id=compound_id, canonical_rt=peak.rt)
        mc.max_area = peak.area_pct or 0.0
        mc.uv_lambda_max = list(peak.uv_lambda_max) if peak.uv_lambda_max else []
        mc.trend = "stable"
        mc.trend_detail = "single file"
        # Area maps are keyed by file index; a single file is always index 0.
        if peak.area_pct is not None:
            mc.area_pct_by_file[0] = peak.area_pct
        if peak.area_pct_220nm is not None:
            mc.area_pct_220_by_file[0] = peak.area_pct_220nm
        if peak.area_pct_254nm is not None:
            mc.area_pct_254_by_file[0] = peak.area_pct_254nm

        assigned = _try_assign_species(
            match_ions_to_species(ions, expected), claimed_names)
        if not assigned:
            unmatched.append(mc)
            continue
        species, adduct, mz = assigned
        matched.append(IdentifiedCompound(
            compound=mc, species=species, adduct=adduct, matched_mz=mz,
        ))

    # AnalysisResult lists identified compounds first, then the rest.
    all_compounds = [hit.compound for hit in matched]
    all_compounds.extend(unmatched)
    result = AnalysisResult(
        instrument=report.instrument or "Unknown",
        method_short=report.method_short or "Unknown",
        files=[entry],
        compounds=all_compounds,
    )

    return TrackingAnalysis(
        result=result,
        identified=matched,
        unidentified=unmatched,
        files=[entry],
    )
286
+
287
+
288
def _cross_validate_method(file_entries: List[MultiFileEntry]) -> List[str]:
    """
    Compare each file's filename method modifier (e.g. -AmB) with the method
    path recorded in its parsed report.

    Entries without a modifier or without a report are skipped.  The check
    is a case-insensitive substring test, performed after every "foc"
    occurrence has been removed from the modifier ("-AmBfoc" still means the
    buffer is AmB).

    Returns:
        One warning string per mismatching entry (empty list if none).
    """
    mismatches = []
    for entry in file_entries:
        if not (entry.method_variant and entry.report):
            continue
        method_path_lc = entry.report.method_path.lower()
        modifier = entry.method_variant.lower().replace('foc', '')
        # A modifier that is empty after stripping cannot be checked.
        if not modifier or modifier in method_path_lc:
            continue
        mismatches.append(
            f"Method mismatch: {entry.filename} has filename modifier "
            f"'-{entry.method_variant}' but PDF method is "
            f"'{os.path.basename(entry.report.method_path)}'"
        )
    return mismatches
309
+
310
+
311
def run_tracking_analysis(
    exp,
    expected: List[ExpectedSpecies],
) -> TrackingAnalysis:
    """
    Analyze tracking LCMS files and identify compounds.

    Single file    → direct parse + ion matching (no multi_lcms_analyzer).
    Multiple files → multi_lcms_analyzer for cross-file compound tracking.

    In the multi-file path the analyzer is asked to group files by
    (instrument, method) and keep only the largest group.  Sort keys are
    hybrid: filename tokens for group 1, PDF acquisition timestamps for
    groups 2+.

    Files that fail to parse are reported on stderr and skipped.  If only
    one file survives parsing, that file (not necessarily the first
    tracking file) is analysed through the single-file path.

    Args:
        exp: experiment object exposing ``lcms_files`` and, optionally,
            ``experiment_name``.
        expected: expected species to match compounds against.

    Returns:
        TrackingAnalysis; empty when there are no tracking files, and
        carrying only ``files`` when nothing parsed or the analyzer
        returned no result.
    """
    tracking_files = [lf for lf in exp.lcms_files if lf.category == "tracking"]

    if not tracking_files:
        return TrackingAnalysis()

    if len(tracking_files) == 1:
        return _run_single_file_tracking(tracking_files[0], expected)

    # --- Multiple tracking files: use multi_lcms_analyzer ---
    file_entries = []
    # Source file objects, index-aligned with file_entries, so the
    # single-survivor fallback below can find the file that actually parsed.
    parsed_sources = []
    for lf in tracking_files:
        try:
            report = parse_report(lf.path)
            run_dt = extract_run_datetime(lf.path)
            fe = MultiFileEntry(
                path=os.path.abspath(lf.path),
                filename=lf.filename,
                category=lf.category,
                sort_key=lf.sort_key,
                report=report,
                run_datetime=run_dt,
                group_prefix=getattr(lf, 'group_prefix', None),
                method_variant=getattr(lf, 'method_variant', None),
            )
            file_entries.append(fe)
            parsed_sources.append(lf)
            print(f" Parsed tracking: {lf.filename} "
                  f"({len(report.peaks)} peaks)", file=sys.stderr)
        except Exception as e:
            print(f" Warning: Could not parse {lf.filename}: {e}",
                  file=sys.stderr)

    if not file_entries:
        return TrackingAnalysis(files=file_entries)

    if len(file_entries) == 1:
        # Only one file parsed successfully — fall back to single-file
        # analysis of THAT file.  Using tracking_files[0] here would pick
        # the wrong (unparseable) file whenever the first one failed.
        lf = parsed_sources[0]
        lf.report = file_entries[0].report
        return _run_single_file_tracking(lf, expected)

    # --- Hybrid sort key recalibration ---
    # Recalibrate groups 2+ using real PDF acquisition timestamps.
    # Group 1 keeps filename-derived sort keys (chemist controls submission
    # order at the start of a reaction).
    #
    # Recover the tracking group info from the batch categorizer.
    # We need it to know which files belong to which prefix-group.
    tracking_filenames = [lf.filename for lf in tracking_files]
    batch = categorize_lcms_files_batch(
        tracking_filenames,
        getattr(exp, 'experiment_name', ""))

    if batch.tracking_groups and len(batch.tracking_groups) > 1:
        run_dts = {fe.filename: fe.run_datetime
                   for fe in file_entries if fe.run_datetime}
        if run_dts:
            calibrate_sort_keys_hybrid(
                batch.tracking_groups, batch, run_dts)
            # Update FileEntry sort_keys from recalibrated batch result
            for fe in file_entries:
                fc = batch.files.get(fe.filename)
                if fc is not None:
                    fe.sort_key = fc.sort_key

    # --- Method cross-validation ---
    method_warnings = _cross_validate_method(file_entries)

    # --- Run multi-LCMS analysis ---
    # Group by (instrument, method); pick only the biggest group.
    results = multi_analyze(
        files=file_entries,
        rt_tol=0.02,
        mz_tol=0.5,
        trend_threshold=0.2,
        ignore_instrument=False,
        use_run_time=False,  # sort_key is now the single source of truth
        pick_biggest_group=True,
    )

    if not results:
        return TrackingAnalysis(files=file_entries)

    # Take the (now single) result from the biggest group
    analysis = results[0]

    # Append method cross-validation warnings
    if method_warnings:
        analysis.warnings.extend(method_warnings)

    # Match compounds to expected species
    identified = []
    unidentified = []
    used_species = set()

    # Sort compounds by max_area descending (match larger compounds first)
    sorted_compounds = sorted(
        analysis.compounds, key=lambda c: c.max_area, reverse=True)

    for compound in sorted_compounds:
        # Collect all ions as (mode, mz, rank) tuples — recurring first
        ions = [
            (ic.mode, ic.mean_mz, ic.best_rank)
            for ion_group in (compound.recurring_ions, compound.other_ions)
            for ic in ion_group
        ]

        match = match_ions_to_species(ions, expected)
        assigned = _try_assign_species(match, used_species)
        if assigned:
            species, adduct, mz = assigned
            identified.append(IdentifiedCompound(
                compound=compound,
                species=species,
                adduct=adduct,
                matched_mz=mz,
            ))
        else:
            unidentified.append(compound)

    return TrackingAnalysis(
        result=analysis,
        identified=identified,
        unidentified=unidentified,
        files=file_entries,
    )
450
+
451
+
452
def run_tracking_from_result(
    analysis: AnalysisResult,
    expected: List[ExpectedSpecies],
) -> TrackingAnalysis:
    """
    Run species identification on an AnalysisResult that already exists.

    Applies the same matching as run_tracking_analysis() but performs no PDF
    parsing and no multi_analyze() call, so it suits results obtained
    elsewhere (e.g. loaded from JSON via load_analysis_from_json()).

    Compounds are visited from largest to smallest ``max_area`` so that the
    biggest compound gets first claim on a species name.
    """
    matched = []
    unmatched = []
    claimed_names = set()

    by_area_desc = sorted(
        analysis.compounds, key=lambda c: c.max_area, reverse=True)

    for compound in by_area_desc:
        # (mode, m/z, rank) tuples: recurring ions first, then the others
        ions = [
            (cluster.mode, cluster.mean_mz, cluster.best_rank)
            for ion_group in (compound.recurring_ions, compound.other_ions)
            for cluster in ion_group
        ]

        assigned = _try_assign_species(
            match_ions_to_species(ions, expected), claimed_names)
        if not assigned:
            unmatched.append(compound)
            continue

        species, adduct, mz = assigned
        matched.append(IdentifiedCompound(
            compound=compound,
            species=species,
            adduct=adduct,
            matched_mz=mz,
        ))

    return TrackingAnalysis(
        result=analysis,
        identified=matched,
        unidentified=unmatched,
        files=analysis.files,
    )
496
+
497
+
498
+ # ---------------------------------------------------------------------------
499
+ # Purified product LCMS analysis
500
+ # ---------------------------------------------------------------------------
501
+
502
def run_purified_analysis(
    exp,
    expected: List[ExpectedSpecies],
) -> PurifiedAnalysis:
    """Parse the purified-product LCMS file and identify its peaks.

    File selection:
      1. The last file categorized as "final" (e.g. NPpurified, C18-purified)
      2. Otherwise the chronologically last "workup" file (e.g. crude, wash);
         the result is then flagged with ``is_crude_fallback=True``.

    Returns an empty PurifiedAnalysis when neither kind of file exists, and
    one carrying only ``file_info`` when the chosen file cannot be parsed.

    Purity is reported per detector as the largest area-% among the peaks
    identified as a product; each detector's maximum is taken independently.
    """
    crude_fallback = False
    final_files = [lf for lf in exp.lcms_files if lf.category == "final"]

    if final_files:
        # Use the last final file (most relevant)
        lf = final_files[-1]
    else:
        workup_files = [lf for lf in exp.lcms_files
                        if lf.category == "workup"]
        if not workup_files:
            return PurifiedAnalysis()

        crude_fallback = True
        # The run datetime is cached on each file object for sorting/logging.
        for wf in workup_files:
            wf._run_dt = extract_run_datetime(wf.path)
        # Files lacking a run datetime sort first (empty string); among the
        # rest the latest ends up last; ties are broken by sort_key.
        workup_files.sort(
            key=lambda f: (f._run_dt or "", f.sort_key))
        lf = workup_files[-1]
        run_note = ' (run ' + lf._run_dt + ')' if lf._run_dt else ''
        print(f" No purified-product LCMS file found — "
              f"using last workup file: {lf.filename}"
              f"{run_note}",
              file=sys.stderr)

    try:
        report = parse_report(lf.path)
        lf.report = report
        print(f" Parsed purified: {lf.filename} "
              f"({len(report.peaks)} peaks)", file=sys.stderr)
    except Exception as e:
        print(f" Warning: Could not parse {lf.filename}: {e}",
              file=sys.stderr)
        return PurifiedAnalysis(file_info=lf)

    # Match every peak against the expected species.  Unlike the tracking
    # path, a species may be assigned to more than one peak here.
    identified = []
    for peak in report.peaks:
        ions = [
            (spec.mode, mz, rank)
            for spec in peak.ms_spectra
            for rank, mz in enumerate(spec.top_ions)
        ]
        hit = match_ions_to_species(ions, expected)
        if hit:
            species, adduct, mz = hit
            identified.append(IdentifiedPeak(
                peak=peak, species=species, adduct=adduct, matched_mz=mz,
            ))

    # Product purity: highest area% among product peaks, per detector.
    product_peaks = [ip.peak for ip in identified
                     if ip.species.role == "product"]
    purity_tac = max(
        (p.area_pct for p in product_peaks if p.area_pct is not None),
        default=None)
    purity_220 = max(
        (p.area_pct_220nm for p in product_peaks
         if p.area_pct_220nm is not None),
        default=None)
    purity_254 = max(
        (p.area_pct_254nm for p in product_peaks
         if p.area_pct_254nm is not None),
        default=None)

    return PurifiedAnalysis(
        report=report,
        file_info=lf,
        identified=identified,
        purity_tac=purity_tac,
        purity_220nm=purity_220,
        purity_254nm=purity_254,
        is_crude_fallback=crude_fallback,
    )
590
+
591
+
592
+ # ---------------------------------------------------------------------------
593
+ # CLI placeholder
594
+ # ---------------------------------------------------------------------------
595
+
596
if __name__ == "__main__":
    # This module offers no command-line interface; running it directly
    # only prints a pointer to the intended entry points.
    notice = (
        "lcms_identifier: no standalone CLI — "
        "import from procedure_writer.py or use directly"
    )
    print(notice)