cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1412 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Multi-LCMS Analyzer
4
+ Collates peaks across multiple LCMS files from the same reaction to:
5
+ 1. Match peaks across files (same compound identification by RT + UV ratio)
6
+ 2. Merge mass spectrum ions into recurring vs one-off lists
7
+ 3. Track area% trends over time (increasing / decreasing / stable)
8
+
9
+ Input: MassLynx PDF files (parsed internally via lcms_analyzer.parse_report).
10
+ Output: Text report (default) or structured JSON (--json).
11
+
12
+ Usage:
13
+ python multi_lcms_analyzer.py \\
14
+ file1.pdf file2.pdf file3.pdf ... \\
15
+ --rt-tolerance 0.02 \\
16
+ --mz-tolerance 0.5 \\
17
+ --trend-threshold 0.2
18
+ """
19
+
20
+ import argparse
21
+ import json
22
+ import os
23
+ import re
24
+ import sys
25
+ from collections import Counter, defaultdict
26
+ from dataclasses import dataclass, field
27
+ from statistics import median
28
+ from typing import List, Optional, Dict, Tuple
29
+
30
+ from cdxml_toolkit.constants import (
31
+ LCMS_RT_TOLERANCE,
32
+ LCMS_MZ_TOLERANCE,
33
+ LCMS_TREND_THRESHOLD,
34
+ LCMS_MIN_SUMMARY_AREA,
35
+ )
36
+ from ..lcms_analyzer import (
37
+ parse_report, LCMSReport, ChromPeak, MassSpectrum, format_basic_report,
38
+ is_waters_report, method_basename,
39
+ )
40
+ from .lcms_file_categorizer import categorize_lcms_file, _AMBIGUOUS_SORT_KEYS
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # Data structures
44
+ # ---------------------------------------------------------------------------
45
+
46
@dataclass
class FileEntry:
    """Metadata for one LCMS file in the analysis.

    Attributes:
        path: Path of the file.
        filename: File name of the file.
        category: One of "tracking", "workup", "purification", "final".
        sort_key: Numeric key used to order files.
        report: Parsed report for the file, or None when there is none.
        run_datetime: "YYYY-MM-DD HH:MM:SS" taken from the PDF, if known.
        ambiguous_time: True for "beforeadd" etc.
        group_prefix: Tracking group prefix (from batch categorizer).
        method_variant: Filename-derived method hint (AmB, AmF, etc.).
    """
    # Required fields.
    path: str
    filename: str
    category: str
    sort_key: float

    # Optional fields.
    report: Optional[LCMSReport] = None
    run_datetime: Optional[str] = None
    ambiguous_time: bool = False
    group_prefix: Optional[str] = None
    method_variant: Optional[str] = None
58
+
59
@dataclass
class IonCluster:
    """Several m/z readings, gathered across files, treated as one ion.

    Attributes:
        mean_mz: Mean m/z of the cluster.
        mode: "ES+" or "ES-".
        occurrences: Number of files this ion appeared in.
        best_rank: Best (lowest) rank seen across files (0 = base peak).
        mz_values: The individual m/z readings in the cluster.
    """
    mean_mz: float
    mode: str
    occurrences: int
    best_rank: int
    mz_values: List[float] = field(default_factory=list)
67
+
68
@dataclass
class Compound:
    """One compound followed across multiple LCMS files.

    Every ``*_by_file`` dictionary is keyed by file index (files are in
    chronological order).
    """
    compound_id: int
    canonical_rt: float = 0.0
    # area_220nm / area_254nm
    uv_ratio: Optional[float] = None

    # --- per-file observations -------------------------------------------
    rt_by_file: Dict[int, float] = field(default_factory=dict)
    area_pct_by_file: Dict[int, Optional[float]] = field(default_factory=dict)
    area_220_by_file: Dict[int, Optional[float]] = field(default_factory=dict)
    area_254_by_file: Dict[int, Optional[float]] = field(default_factory=dict)
    area_pct_220_by_file: Dict[int, Optional[float]] = field(default_factory=dict)
    area_pct_254_by_file: Dict[int, Optional[float]] = field(default_factory=dict)

    # --- mass-spectrum ions -----------------------------------------------
    # Raw tuples of (mode, mz, rank_in_top_ions, file_index).
    all_ions: List[Tuple[str, float, int, int]] = field(default_factory=list)
    # Merged clusters; filled in once matching has finished.
    recurring_ions: List[IonCluster] = field(default_factory=list)
    other_ions: List[IonCluster] = field(default_factory=list)

    # --- UV lambda-max consensus ------------------------------------------
    uv_lambda_max: List[float] = field(default_factory=list)

    # --- trend ------------------------------------------------------------
    trend: str = "stable"
    trend_detail: str = ""
    # Max observed area% (excluding outlier files).
    max_area: float = 0.0
97
+
98
@dataclass
class AnalysisResult:
    """Everything produced by one multi-file LCMS analysis.

    Attributes:
        instrument: Instrument name.
        method_short: Short method name.
        method_key: Method basename for grouping (lowercased).
        files: The files that were analysed.
        compounds: Compounds matched across those files.
        warnings: Warning messages.
        excluded_files: Indices of outlier files.
        ambiguous_files: Indices with uncertain timing.
        discarded_files: Files from other groups.
    """
    instrument: str
    method_short: str
    method_key: str = ""

    files: List[FileEntry] = field(default_factory=list)
    compounds: List[Compound] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)

    excluded_files: set = field(default_factory=set)
    ambiguous_files: set = field(default_factory=set)
    discarded_files: List[FileEntry] = field(default_factory=list)
110
+
111
+ # ---------------------------------------------------------------------------
112
+ # Note: File categorization code has been extracted to lcms_file_categorizer.py
113
+ # ---------------------------------------------------------------------------
114
+
115
def extract_run_datetime(pdf_path: str) -> Optional[str]:
    """
    Read the acquisition timestamp out of a MassLynx PDF.

    The extracted text is searched for 'Date:DD-Mon-YYYY' and
    'Time:HH:MM:SS'.

    Returns:
        The timestamp as 'YYYY-MM-DD HH:MM:SS', or None when the text cannot
        be extracted, either marker is missing, or the values do not parse.
    """
    from ..lcms_analyzer import extract_all_text

    # Best effort: any extraction failure simply means "no timestamp".
    try:
        text = extract_all_text(pdf_path)
    except Exception:
        return None

    date_hit = re.search(r'Date:(\d{1,2}-\w{3}-\d{4})', text)
    time_hit = re.search(r'Time:(\d{1,2}:\d{2}:\d{2})', text)
    if not (date_hit and time_hit):
        return None

    try:
        from datetime import datetime as _dt
        parsed = _dt.strptime(f"{date_hit.group(1)} {time_hit.group(1)}",
                              "%d-%b-%Y %H:%M:%S")
        return parsed.strftime("%Y-%m-%d %H:%M:%S")
    except ValueError:
        # Matched the pattern but is not a real date/time.
        return None
141
+
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # UV ratio helpers
145
+ # ---------------------------------------------------------------------------
146
+
147
def compute_uv_ratio(peak: ChromPeak) -> Optional[float]:
    """
    Return the peak's 220 nm area divided by its 254 nm area.

    A ratio is only produced when both areas are present and non-zero;
    a missing or zero area on either channel is treated as inconclusive
    and yields None.
    """
    area_220, area_254 = peak.area_220nm, peak.area_254nm
    both_present = area_220 is not None and area_254 is not None
    if both_present and area_220 != 0 and area_254 != 0:
        return area_220 / area_254
    return None
160
+
161
+
162
def check_uv_compatibility(ratio_a: Optional[float],
                           ratio_b: Optional[float]) -> Optional[bool]:
    """
    Decide whether two UV ratios could belong to the same compound.

    Returns:
        True when ratio_a / ratio_b lies within [0.5, 2.0] (i.e. the two
        ratios are within a factor of two of each other), False when it
        does not, and None (inconclusive) when either ratio is None or
        ratio_b is zero.
    """
    # A missing ratio is not evidence against a match, and a zero divisor
    # cannot be compared, so both cases are reported as inconclusive.
    if ratio_a is None or ratio_b is None or ratio_b == 0:
        return None

    return bool(0.5 <= ratio_a / ratio_b <= 2.0)
182
+
183
+ # ---------------------------------------------------------------------------
184
+ # Peak matching
185
+ # ---------------------------------------------------------------------------
186
+
187
+ def _update_compound_uv_ratio(compound: Compound):
188
+ """Recompute compound's UV ratio as median of all finite observed ratios."""
189
+ ratios = []
190
+ for fi in compound.area_220_by_file:
191
+ a220 = compound.area_220_by_file.get(fi)
192
+ a254 = compound.area_254_by_file.get(fi)
193
+ if a220 is not None and a254 is not None and a254 > 0:
194
+ ratios.append(a220 / a254)
195
+ if ratios:
196
+ compound.uv_ratio = median(ratios)
197
+
198
+
199
def create_compound(cid: int, peak: ChromPeak, ratio: Optional[float],
                    file_idx: int) -> Compound:
    """Build a new Compound seeded from a single peak.

    Args:
        cid: Identifier for the new compound.
        peak: Seed peak; its RT becomes the compound's canonical RT.
        ratio: UV ratio to store on the compound (may be None).
        file_idx: Index of the file the peak came from; used as the key
            for all per-file dictionaries and recorded with each ion.

    Returns:
        The new Compound holding this one observation.
    """
    # Each ion is stored with its position in the spectrum's top_ions list.
    seed_ions = [
        (spectrum.mode, mz, rank, file_idx)
        for spectrum in peak.ms_spectra
        for rank, mz in enumerate(spectrum.top_ions)
    ]
    return Compound(
        compound_id=cid,
        canonical_rt=peak.rt,
        uv_ratio=ratio,
        rt_by_file={file_idx: peak.rt},
        area_pct_by_file={file_idx: peak.area_pct},
        area_220_by_file={file_idx: peak.area_220nm},
        area_254_by_file={file_idx: peak.area_254nm},
        area_pct_220_by_file={file_idx: peak.area_pct_220nm},
        area_pct_254_by_file={file_idx: peak.area_pct_254nm},
        all_ions=seed_ions,
    )
214
+
215
+
216
def attach_peak_to_compound(compound: Compound, peak: ChromPeak,
                            file_idx: int):
    """Record a peak from file ``file_idx`` on an already-existing compound.

    Stores the peak's RT and area values under ``file_idx``, appends its
    ions, and then refreshes the compound's UV ratio.  The canonical RT is
    not touched here.
    """
    per_file_values = (
        (compound.rt_by_file, peak.rt),
        (compound.area_pct_by_file, peak.area_pct),
        (compound.area_220_by_file, peak.area_220nm),
        (compound.area_254_by_file, peak.area_254nm),
        (compound.area_pct_220_by_file, peak.area_pct_220nm),
        (compound.area_pct_254_by_file, peak.area_pct_254nm),
    )
    for table, value in per_file_values:
        table[file_idx] = value

    for spectrum in peak.ms_spectra:
        compound.all_ions.extend(
            (spectrum.mode, mz, rank, file_idx)
            for rank, mz in enumerate(spectrum.top_ions)
        )

    # The new areas may shift the median UV ratio, so recompute it now.
    _update_compound_uv_ratio(compound)
230
+
231
+
232
def find_and_match(peak: ChromPeak, ratio: Optional[float],
                   compounds: List[Compound], rt_tol: float,
                   used_ids: set) -> Optional[Compound]:
    """
    Pick the existing compound that best matches a peak.

    A compound is a candidate when its id is not in ``used_ids``, its
    canonical RT is within ``rt_tol`` of the peak's RT, and the UV ratios
    are not explicitly incompatible.

    Returns:
        The candidate with the lowest score (the earliest one in
        ``compounds`` on a tie), or None if there is no candidate.
    """
    def _score(compound: Compound) -> Optional[float]:
        """Return the match score (lower is better), or None to reject."""
        if compound.compound_id in used_ids:
            return None
        rt_delta = abs(peak.rt - compound.canonical_rt)
        if rt_delta > rt_tol:
            return None
        uv_ok = check_uv_compatibility(ratio, compound.uv_ratio)
        if uv_ok is False:
            # UV data exists on both sides and disagrees.
            return None
        # A confirmed UV match halves the score.
        return rt_delta * 0.5 if uv_ok is True else rt_delta

    scored = []
    for compound in compounds:
        score = _score(compound)
        if score is not None:
            scored.append((compound, score))

    if not scored:
        return None
    return min(scored, key=lambda pair: pair[1])[0]
265
+
266
+
267
def match_peaks_across_files(files: List[FileEntry],
                             rt_tol: float) -> List[Compound]:
    """
    Group the peaks of all files into Compounds.

    ``files`` must already be in chronological order; the position of a
    file in the list is used as its file index.  Files without a report
    or without peaks are skipped.

    Returns:
        The list of compounds, with ids assigned sequentially from 1.
    """
    compounds: List[Compound] = []

    def _add_compound(peak, ratio, file_idx):
        # Ids are handed out sequentially, so the next id is always one
        # more than the number of compounds created so far.
        compounds.append(
            create_compound(len(compounds) + 1, peak, ratio, file_idx))

    for file_idx, fe in enumerate(files):
        report = fe.report
        if report is None or not report.peaks:
            continue

        scored_peaks = [(p, compute_uv_ratio(p)) for p in report.peaks]

        if not compounds:
            # First file that has peaks: every peak seeds a compound.
            for peak, ratio in scored_peaks:
                _add_compound(peak, ratio, file_idx)
            continue

        # Largest peaks get first pick, which keeps the matching stable.
        by_area_desc = sorted(scored_peaks,
                              key=lambda x: x[0].area_pct or 0,
                              reverse=True)
        claimed_ids: set = set()
        leftovers = []
        for peak, ratio in by_area_desc:
            target = find_and_match(peak, ratio, compounds, rt_tol,
                                    claimed_ids)
            if not target:
                leftovers.append((peak, ratio))
                continue
            attach_peak_to_compound(target, peak, file_idx)
            claimed_ids.add(target.compound_id)

        # Peaks that matched nothing become new compounds.
        for peak, ratio in leftovers:
            _add_compound(peak, ratio, file_idx)

    return compounds
316
+
317
+ # ---------------------------------------------------------------------------
318
+ # Post-processing
319
+ # ---------------------------------------------------------------------------
320
+
321
def compute_canonical_rt(compound: Compound):
    """Set ``compound.canonical_rt`` to the most frequent observed RT.

    RTs are rounded to two decimals before counting.  When several rounded
    values tie for most frequent, the median of the tied values (rounded
    to two decimals) is used.  A compound with no RT observations is left
    unchanged.
    """
    if not compound.rt_by_file:
        return

    tally = Counter(round(rt, 2) for rt in compound.rt_by_file.values())
    top_count = max(tally.values())
    winners = sorted(rt for rt, seen in tally.items() if seen == top_count)

    if len(winners) == 1:
        compound.canonical_rt = winners[0]
    else:
        compound.canonical_rt = round(median(winners), 2)
332
+
333
+
334
def cluster_ions(compound: Compound, mz_tol: float, total_files: int,
                 max_ion_rank: Optional[int] = None):
    """Cluster the compound's ions by m/z and sort them into two buckets.

    Ions are clustered separately per ionisation mode.  Within a mode they
    are visited in ascending m/z order; an ion joins the first existing
    cluster whose running mean is within ``mz_tol``, otherwise it starts a
    new cluster.  The results are written to ``compound.recurring_ions``
    and ``compound.other_ions``.

    Args:
        compound: Compound whose ``all_ions`` are clustered.
        mz_tol: Maximum distance from a cluster's mean to join it.
        total_files: Not used by this function.
        max_ion_rank: If set, drop "other ions" whose best rank >= this
            value (0-based).  E.g. max_ion_rank=5 keeps ranks 0-4.
    """
    grouped: Dict[str, List[Tuple[float, int, int]]] = defaultdict(list)
    for mode, mz, rank, file_idx in compound.all_ions:
        grouped[mode].append((mz, rank, file_idx))

    all_clusters: List[IonCluster] = []
    for mode, entries in grouped.items():
        entries.sort(key=lambda entry: entry[0])

        bins: list = []
        for mz, rank, file_idx in entries:
            home = next(
                (b for b in bins if abs(mz - b['center']) <= mz_tol), None)
            if home is None:
                bins.append({
                    'center': mz,
                    'values': [mz],
                    'files': {file_idx},
                    'mode': mode,
                    'best_rank': rank,
                })
                continue
            home['values'].append(mz)
            home['files'].add(file_idx)
            home['best_rank'] = min(home['best_rank'], rank)
            # Keep the centre at the running mean of the members.
            home['center'] = sum(home['values']) / len(home['values'])

        all_clusters.extend(
            IonCluster(
                mean_mz=round(sum(b['values']) / len(b['values']), 1),
                mode=b['mode'],
                occurrences=len(b['files']),
                best_rank=b['best_rank'],
                mz_values=b['values'],
            )
            for b in bins
        )

    # Number of distinct files that contributed at least one ion.
    n_files_observed = len({fi for _, _, _, fi in compound.all_ions})

    if n_files_observed <= 1:
        # With a single observation there is nothing to recur against, so
        # every ion is listed as canonical and the "other" bucket is empty.
        compound.recurring_ions = sorted(
            all_clusters, key=lambda c: (c.best_rank, c.mean_mz))
        compound.other_ions = []
        return

    # Recurring = seen in at least two files; everything else is "other".
    recurring = [c for c in all_clusters if c.occurrences >= 2]
    one_off = [c for c in all_clusters if c.occurrences < 2]
    if max_ion_rank is not None:
        one_off = [c for c in one_off if c.best_rank < max_ion_rank]

    compound.recurring_ions = sorted(
        recurring, key=lambda c: (-c.occurrences, c.best_rank, c.mean_mz))
    compound.other_ions = sorted(
        one_off, key=lambda c: (c.best_rank, c.mean_mz))
411
+
412
+
413
+ def _best_area_pct(compound: Compound, fi: int) -> Optional[float]:
414
+ """Return the best available area% for a compound in a given file.
415
+ Prefers TAC, falls back to 220nm, then 254nm."""
416
+ area = compound.area_pct_by_file.get(fi)
417
+ if area is not None:
418
+ return area
419
+ area = compound.area_pct_220_by_file.get(fi)
420
+ if area is not None:
421
+ return area
422
+ return compound.area_pct_254_by_file.get(fi)
423
+
424
+
425
def compute_trend(compound: Compound, total_files: int,
                  threshold: float, excluded_files: Optional[set] = None):
    """
    Determine area% trend: increasing / decreasing / stable.

    The timeline covers every non-excluded file index in
    ``range(total_files)``.  Any such file in which the compound has no
    entry - before its first observation, between observations, or after
    the last one - counts as 0%.  This marks consumed compounds as
    "decreasing" and newly formed compounds as "increasing".

    The outcome is written to ``compound.trend``, ``compound.trend_detail``
    and ``compound.max_area``; nothing is returned.

    Args:
        compound: Compound to classify (modified in place).
        total_files: Number of files in the analysis.
        threshold: Minimum |last - first| / max needed to call the trend
            increasing or decreasing.
        excluded_files: File indices to ignore (outliers); defaults to none.
    """
    if excluded_files is None:
        excluded_files = set()

    seen_files = set(compound.area_pct_by_file.keys()) - excluded_files
    if not seen_files:
        compound.trend = "stable"
        compound.trend_detail = "no area data"
        return

    # Area values actually measured in the non-excluded files.
    observed_areas = []
    for fi in sorted(seen_files):
        area = _best_area_pct(compound, fi)
        if area is not None:
            observed_areas.append(area)
    if not observed_areas:
        compound.trend = "stable"
        compound.trend_detail = "no area data"
        return

    # Always set max_area from observed data (even with a single file,
    # so that downstream consumers like procedure_writer can filter on it)
    compound.max_area = max(observed_areas)

    # Seen in only one file: a trend cannot be determined.
    if len(seen_files) == 1:
        compound.trend = "stable"
        compound.trend_detail = "single observation"
        return

    # Full timeline over the non-excluded files, with 0% wherever the
    # compound is absent or has no usable area value.
    timeline = []
    for fi in range(total_files):
        if fi in excluded_files:
            continue
        area = None
        if fi in compound.area_pct_by_file:
            area = _best_area_pct(compound, fi)
        timeline.append(area if area is not None else 0.0)

    if len(timeline) < 2:
        compound.trend = "stable"
        compound.trend_detail = "single observation"
        return

    first_area = timeline[0]
    last_area = timeline[-1]
    max_area = max(timeline)

    compound.max_area = max_area

    if max_area < 0.5:
        compound.trend = "stable"
        compound.trend_detail = "trace levels throughout"
        return

    # Net change from first to last file, relative to the largest value.
    change = (last_area - first_area) / max_area

    if change > threshold:
        compound.trend = "increasing"
    elif change < -threshold:
        compound.trend = "decreasing"
    else:
        compound.trend = "stable"

    # Determine which detector was used
    det_label = _trend_detector_label(compound)

    compound.trend_detail = (
        f"{first_area:.1f}% \u2192 {last_area:.1f}% "
        f"({det_label}, max {max_area:.1f}%, change {change:+.0%})"
    )
518
+
519
+
520
+ def _trend_detector_label(compound: Compound) -> str:
521
+ """Return which detector is primarily used for this compound's trend."""
522
+ has_tac = any(v is not None
523
+ for v in compound.area_pct_by_file.values())
524
+ if has_tac:
525
+ return "TAC"
526
+ has_220 = any(v is not None
527
+ for v in compound.area_pct_220_by_file.values())
528
+ if has_220:
529
+ return "220nm"
530
+ return "254nm"
531
+
532
+
533
def compute_uv_consensus(compound: Compound):
    """Placeholder for UV lambda-max deduplication; currently does nothing.

    NOTE(review): the original comment says UV lambda-max is populated via
    _collect_uv_lambda_max, which is not visible from here - confirm.
    """
    return None
536
+
537
+
538
def detect_outlier_files(files: List[FileEntry]) -> Tuple[set, set]:
    """
    Find files that look like blanks/outliers and files with unclear timing.

    Returns:
        ``(outlier_set, ambiguous_set)`` of file indices:
        - outlier_set: blank/failed files (excluded from everything)
        - ambiguous_set: files flagged ``ambiguous_time`` (excluded from
          trend analysis only)

    No outliers are reported when there are fewer than three files or when
    the median peak count is below three.
    """
    ambiguous = {idx for idx, fe in enumerate(files) if fe.ambiguous_time}

    if len(files) < 3:
        return set(), ambiguous

    # Files without a report count as having zero peaks.
    counts = [len(fe.report.peaks) if fe.report else 0 for fe in files]
    median_count = sorted(counts)[len(counts) // 2]
    if median_count < 3:
        return set(), ambiguous

    excluded = set()
    for idx, fe in enumerate(files):
        if fe.report is None:
            excluded.add(idx)
            continue

        n_peaks = len(fe.report.peaks)

        # Heuristic 1: far fewer peaks than the median file.
        if n_peaks < median_count * 0.4:
            excluded.add(idx)
            continue

        # Heuristic 2: one peak above 99% TAC area, at most three peaks,
        # and the dominant peak at very low RT (likely solvent/void).
        # An earlier, looser rule (>95%, <=5 peaks) wrongly flagged
        # legitimate t=0 and near-complete reaction files where SM or DP
        # dominates.
        with_tac = [(p.rt, p.area_pct) for p in fe.report.peaks
                    if p.area_pct is not None]
        if not with_tac:
            continue
        dom_rt, dom_area = max(with_tac, key=lambda pair: pair[1])
        if dom_area > 99.0 and n_peaks <= 3 and dom_rt < 0.4:
            excluded.add(idx)

    return excluded, ambiguous
590
+
591
+
592
def detect_outlier_files_conservative(
    files: List[FileEntry],
    compounds: List[Compound],
    excluded_files: set,
    significance_floor: float = 5.0,
    threshold: float = 0.5,
) -> set:
    """
    Second-pass outlier detection that looks at several compounds at once.

    A file is flagged only when more than ``threshold`` of the significant
    compounds have an anomalous area% in it.  A single odd compound is
    likely real chemistry (e.g. an intermediate consumed faster than
    expected); everything being off together suggests a bad injection.

    Nothing is flagged when there are fewer than four files or fewer than
    two significant compounds.

    Args:
        files: All FileEntry objects (ordered chronologically).
        compounds: Matched compounds from match_peaks_across_files().
        excluded_files: Files already excluded by first-pass heuristics.
        significance_floor: Only consider compounds with max_area >= this value.
        threshold: Fraction of significant compounds that must be
            anomalous to flag a file (default 0.5 = majority).

    Returns:
        Set of additional file indices to exclude.
    """
    n_files = len(files)
    if n_files < 4:
        return set()

    # Only "significant" compounds (visible in the chromatogram) vote.
    significant = [c for c in compounds if c.max_area >= significance_floor]
    if len(significant) < 2:
        return set()

    def _area_or_gap(compound, idx, first_seen, last_seen):
        """Area% in file idx; 0.0 inside the observed span; else None."""
        area = _best_area_pct(compound, idx)
        if area is not None:
            return area
        if first_seen <= idx <= last_seen:
            return 0.0
        return None

    additional_outliers = set()

    for fi in range(n_files):
        if fi in excluded_files:
            continue

        anomalous_count = 0
        evaluated_count = 0

        for compound in significant:
            # Span of file indices in which the compound has an entry;
            # falls back to fi when it has none.
            first_seen = min(compound.area_pct_by_file.keys(), default=fi)
            last_seen = max(compound.area_pct_by_file.keys(), default=fi)

            # Values from every non-excluded file other than fi.
            other_areas = []
            for other_fi in range(n_files):
                if other_fi == fi or other_fi in excluded_files:
                    continue
                value = _area_or_gap(compound, other_fi,
                                     first_seen, last_seen)
                if value is not None:
                    other_areas.append(value)

            if len(other_areas) < 2:
                continue

            # Counted as soon as enough comparison data exists, i.e.
            # before the two skip conditions below.
            evaluated_count += 1

            area_in_file = _area_or_gap(compound, fi, first_seen, last_seen)
            if area_in_file is None:
                continue  # compound not expected in this file

            mean_other = sum(other_areas) / len(other_areas)
            max_other = max(other_areas) if other_areas else 1.0
            if max_other < 1.0:
                continue  # trace compound, skip

            # Anomalous = further from the mean of the other files than
            # 80% of the largest value seen in those files.
            if abs(area_in_file - mean_other) > max_other * 0.8:
                anomalous_count += 1

        if evaluated_count >= 2 and anomalous_count / evaluated_count > threshold:
            additional_outliers.add(fi)

    return additional_outliers
680
+
681
+
682
+ # ---------------------------------------------------------------------------
683
+ # Output formatters
684
+ # ---------------------------------------------------------------------------
685
+
686
+ def _file_label(fe: FileEntry) -> str:
687
+ """Short display name for a file (strip .pdf, strip common prefix)."""
688
+ return os.path.splitext(fe.filename)[0]
689
+
690
+
691
def format_text_report(result: AnalysisResult,
                       min_summary_area: float = 2.0,
                       hide_other_ions: bool = False) -> str:
    """Render the full plain-text report for one analysis result.

    The report consists of a header with the analyzed files, a
    "REACTION SUMMARY" grouping compounds by trend, the collected warnings,
    and a "COMPOUND DETAILS" section with one entry per compound.

    Args:
        result: Analysis result to render.
        min_summary_area: Compounds whose ``max_area`` is below this value
            are left out of the reaction summary (they still appear in the
            details section) and are counted in a one-line note.
        hide_other_ions: When True, the "Other ions" list is omitted from
            each compound's details.

    Returns:
        The report as a single newline-joined string.
    """
    # Function-scope import: only the lambda-max rounding needs it, and it
    # no longer sits inside the per-compound loop.
    import math

    lines = []
    sep = "=" * 62
    rule = "-" * 62
    n_files = len(result.files)

    def _file_flags(index):
        """Return the status flags that apply to the file at *index*."""
        flags = []
        if index in result.excluded_files:
            flags.append("EXCLUDED")
        if index in result.ambiguous_files:
            flags.append("ambiguous timing")
        return flags

    def _rank_label(ic):
        """Describe an ion's best rank (stored 0-based, shown 1-based)."""
        if ic.best_rank == 0:
            return "base peak"
        return f"rank {ic.best_rank + 1}"

    # Header
    lines.extend([sep, "MULTI-LCMS ANALYSIS", sep, ""])
    lines.append(f"Instrument: {result.instrument}")
    lines.append(f"Method: {result.method_short}")
    lines.append("")
    lines.append(f"Files analyzed ({n_files}):")
    for i, fe in enumerate(result.files):
        flags = _file_flags(i)
        flag_str = f" ** {' | '.join(flags)} **" if flags else ""
        dt_str = f" ({fe.run_datetime})" if fe.run_datetime else ""
        lines.append(
            f" [{i + 1}] {fe.filename:<45s} {fe.category:<14s}"
            f"{dt_str}{flag_str}"
        )
    lines.append("")

    # ---- REACTION SUMMARY (at top) ----
    lines.extend([sep, "REACTION SUMMARY", sep, ""])

    def _above_threshold(c):
        return c.max_area >= min_summary_area

    def _summary_group(trend):
        """Compounds with this trend above the threshold, largest first."""
        return sorted(
            [c for c in result.compounds
             if c.trend == trend and _above_threshold(c)],
            key=lambda c: c.max_area, reverse=True)

    def _summary_line(c):
        """Format one compound entry (one or two lines) for the summary."""
        parts = [f" Compound {c.compound_id} \u2014 "
                 f"RT {c.canonical_rt:.2f} \u2014 {c.trend_detail}"]
        if c.recurring_ions:
            # Keep the summary compact: show at most four recurring ions.
            ions_str = ", ".join(
                f"{ic.mode} {ic.mean_mz:.1f}"
                for ic in c.recurring_ions[:4]
            )
            if len(c.recurring_ions) > 4:
                ions_str += f" (+{len(c.recurring_ions) - 4} more)"
            parts.append(f" Ions: {ions_str}")
        return "\n".join(parts)

    summary_sections = (
        ("Increasing (likely product / intermediate):", "increasing"),
        ("Decreasing (likely starting material / consumed):", "decreasing"),
        ("Stable:", "stable"),
    )
    for heading, trend in summary_sections:
        members = _summary_group(trend)
        if not members:
            continue
        lines.append(heading)
        lines.extend(_summary_line(c) for c in members)
        lines.append("")

    hidden = [c for c in result.compounds if not _above_threshold(c)]
    if hidden:
        # ":g" keeps fractional thresholds readable (0.5 -> "0.5"); the old
        # ":.0f" rendered 0.5 as "0" and 2.5 as "2". Whole numbers such as
        # the default 2.0 still print as "2".
        lines.append(f"({len(hidden)} minor compound(s) below "
                     f"{min_summary_area:g}% max area not shown "
                     f"in summary — see details below)")
        lines.append("")

    # Warnings
    if result.warnings:
        lines.append("Warnings:")
        for w in result.warnings:
            lines.append(f" - {w}")
        lines.append("")

    # ---- DETAILED COMPOUND SECTIONS ----
    lines.extend([sep, "COMPOUND DETAILS", sep])

    for compound in result.compounds:
        lines.append("")
        lines.append(rule)
        lines.append(
            f"Compound {compound.compound_id} \u2014 "
            f"RT {compound.canonical_rt:.2f} min ({compound.trend})"
        )
        lines.append(rule)

        # Trend detail
        lines.append(f" Trend: {compound.trend_detail}")

        # UV ratio (infinity is reported as "220nm only")
        if compound.uv_ratio is not None:
            if compound.uv_ratio == float('inf'):
                lines.append(" 220:254: 220nm only (no 254nm absorption)")
            else:
                lines.append(f" 220:254: {compound.uv_ratio:.2f}")

        # UV lambda-max, rounded half-up to whole nanometres
        if compound.uv_lambda_max:
            wl_str = ", ".join(
                str(math.floor(w + 0.5)) for w in compound.uv_lambda_max)
            lines.append(f" UV \u03bbmax: {wl_str} nm")

        # Recurring ions
        if compound.recurring_ions:
            lines.append("")
            lines.append(" Recurring ions:")
            for ic in compound.recurring_ions:
                lines.append(
                    f" {ic.mode} {ic.mean_mz:.1f} "
                    f"(seen in {ic.occurrences}/{n_files} files, "
                    f"{_rank_label(ic)})"
                )

        # Other ions (single-observation; hidden with --hide-other-ions)
        if not hide_other_ions and compound.other_ions:
            lines.append("")
            lines.append(" Other ions:")
            for ic in compound.other_ions:
                lines.append(
                    f" {ic.mode} {ic.mean_mz:.1f} "
                    f"(seen in {ic.occurrences} file, {_rank_label(ic)})"
                )

        # Area% timeline: one row per file, detected or not
        lines.append("")
        lines.append(" Area% timeline:")
        for fi, fe in enumerate(result.files):
            flags = _file_flags(fi)
            excl = f" **{'|'.join(flags)}**" if flags else ""
            if fi not in compound.area_pct_by_file:
                lines.append(
                    f" [{fi + 1}] {_file_label(fe):<40s} "
                    f" (not detected){excl}"
                )
                continue
            tac = compound.area_pct_by_file[fi]
            rt = compound.rt_by_file.get(fi)
            a220 = compound.area_pct_220_by_file.get(fi)
            a254 = compound.area_pct_254_by_file.get(fi)
            det_parts = []
            if tac is not None:
                det_parts.append(f"TAC {tac:.1f}%")
            if a220 is not None:
                det_parts.append(f"220nm {a220:.1f}%")
            if a254 is not None:
                det_parts.append(f"254nm {a254:.1f}%")
            det_str = ", ".join(det_parts) if det_parts else "no data"
            rt_str = f"RT {rt:.2f}" if rt is not None else ""
            lines.append(
                f" [{fi + 1}] {_file_label(fe):<40s} "
                f"{rt_str} {det_str}{excl}"
            )
        lines.append("")

    return "\n".join(lines)
887
+
888
+
889
def format_json_report(result: AnalysisResult) -> str:
    """Render one analysis result as a structured JSON document.

    The document holds the instrument and method labels, one entry per file,
    one entry per compound (with its ions and per-file timeline), a
    ``summary`` mapping each trend label to the compound ids carrying it,
    and the warnings.

    Args:
        result: Analysis result to serialise.

    Returns:
        Indented JSON text; non-ASCII characters are written unescaped.
    """

    def _ions_payload(ions):
        """Serialise a list of ion clusters to plain dictionaries."""
        return [
            {"mean_mz": ic.mean_mz, "mode": ic.mode,
             "occurrences": ic.occurrences, "best_rank": ic.best_rank}
            for ic in ions
        ]

    data = {
        "instrument": result.instrument,
        "method_short": result.method_short,
        "files": [],
        "compounds": [],
        "summary": {"increasing": [], "decreasing": [], "stable": []},
        "warnings": result.warnings,
    }

    for i, fe in enumerate(result.files):
        fd = {
            "index": i,
            "filename": fe.filename,
            "category": fe.category,
            "sort_key": fe.sort_key,
        }
        if fe.report:
            fd["sample_name"] = fe.report.sample_name
            fd["date"] = fe.report.date
        data["files"].append(fd)

    for c in result.compounds:
        cd = {
            "compound_id": c.compound_id,
            "canonical_rt": c.canonical_rt,
            # Infinity has no strict-JSON spelling, so it is written as the
            # string "inf".
            "uv_ratio": c.uv_ratio if c.uv_ratio != float('inf') else "inf",
            "trend": c.trend,
            "trend_detail": c.trend_detail,
            "max_area": c.max_area,
            "uv_lambda_max": c.uv_lambda_max,
            "recurring_ions": _ions_payload(c.recurring_ions),
            "other_ions": _ions_payload(c.other_ions),
            "timeline": [],
        }
        # Only files in which the compound has an area% entry get a row.
        for fi in sorted(c.area_pct_by_file.keys()):
            cd["timeline"].append({
                "file_index": fi,
                "rt": c.rt_by_file.get(fi),
                "area_pct": c.area_pct_by_file.get(fi),
                "area_220": c.area_220_by_file.get(fi),
                "area_254": c.area_254_by_file.get(fi),
                "area_pct_220": c.area_pct_220_by_file.get(fi),
                "area_pct_254": c.area_pct_254_by_file.get(fi),
            })
        data["compounds"].append(cd)

        # setdefault: a trend label outside the three pre-seeded buckets
        # gets its own bucket instead of raising KeyError.
        data["summary"].setdefault(c.trend, []).append(c.compound_id)

    return json.dumps(data, indent=2, ensure_ascii=False)
950
+
951
+
952
def load_analysis_from_json(json_path: str) -> AnalysisResult:
    """Reconstruct an AnalysisResult from JSON written by format_json_report().

    This allows downstream tools (e.g. procedure_writer) to reuse pre-computed
    multi-LCMS analysis without re-parsing the original PDFs.

    The file may hold either a single report object or a JSON array of
    report objects (the CLI's ``--json-output`` writes an array when the
    analysis produced several groups). For an array, the group listing the
    most files is loaded; on a tie the first such group wins.

    Only the fields present in the JSON are restored: file entries get an
    empty ``path`` and no parsed report, and any result field not passed to
    the ``AnalysisResult`` constructor below keeps its default.

    Args:
        json_path: Path to the JSON file.

    Returns:
        The reconstructed analysis result.

    Raises:
        ValueError: If the JSON is neither a report object nor an array
            containing at least one report object.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, list):
        # Multi-group sidecar: previously this crashed on ``data.get``.
        candidates = [d for d in data if isinstance(d, dict)]
        if not candidates:
            raise ValueError(
                f"{json_path}: JSON array contains no analysis objects")
        data = max(candidates, key=lambda d: len(d.get("files", [])))
    elif not isinstance(data, dict):
        raise ValueError(
            f"{json_path}: expected a JSON object or array of objects, "
            f"got {type(data).__name__}")

    files = []
    for fd in data.get("files", []):
        files.append(FileEntry(
            path="",
            filename=fd["filename"],
            category=fd["category"],
            sort_key=fd.get("sort_key", 0),
        ))

    compounds = []
    for cd in data.get("compounds", []):
        c = Compound(
            compound_id=cd["compound_id"],
            canonical_rt=cd.get("canonical_rt", 0.0),
        )
        # "inf" is the serialised spelling of an infinite 220:254 ratio.
        uv_ratio = cd.get("uv_ratio")
        if uv_ratio == "inf":
            c.uv_ratio = float("inf")
        elif uv_ratio is not None:
            c.uv_ratio = uv_ratio
        c.trend = cd.get("trend", "stable")
        c.trend_detail = cd.get("trend_detail", "")
        c.max_area = cd.get("max_area", 0.0)
        c.uv_lambda_max = cd.get("uv_lambda_max", [])

        ion_targets = (
            ("recurring_ions", c.recurring_ions),
            ("other_ions", c.other_ions),
        )
        for key, target in ion_targets:
            for ic_data in cd.get(key, []):
                target.append(IonCluster(
                    mean_mz=ic_data["mean_mz"],
                    mode=ic_data["mode"],
                    occurrences=ic_data.get("occurrences", 1),
                    best_rank=ic_data.get("best_rank", 0),
                ))

        # Each timeline value is restored only when present and non-null.
        per_file_targets = (
            ("rt", c.rt_by_file),
            ("area_pct", c.area_pct_by_file),
            ("area_220", c.area_220_by_file),
            ("area_254", c.area_254_by_file),
            ("area_pct_220", c.area_pct_220_by_file),
            ("area_pct_254", c.area_pct_254_by_file),
        )
        for te in cd.get("timeline", []):
            fi = te["file_index"]
            for key, target in per_file_targets:
                value = te.get(key)
                if value is not None:
                    target[fi] = value

        compounds.append(c)

    return AnalysisResult(
        instrument=data.get("instrument", "Unknown"),
        method_short=data.get("method_short", "Unknown"),
        files=files,
        compounds=compounds,
        warnings=data.get("warnings", []),
    )
1026
+
1027
+
1028
+ # ---------------------------------------------------------------------------
1029
+ # Orchestration
1030
+ # ---------------------------------------------------------------------------
1031
+
1032
def analyze(files: List[FileEntry], rt_tol: float, mz_tol: float,
            trend_threshold: float,
            ignore_instrument: bool,
            use_run_time: bool = True,
            max_ion_rank: Optional[int] = None,
            pick_biggest_group: bool = False) -> List[AnalysisResult]:
    """
    Run the cross-file analysis, one (instrument, method) group at a time.

    Files without a parsed report are ignored. The remaining files are
    grouped, each group is ordered by ``sort_key``, peaks are matched across
    the group's files, and one AnalysisResult is built per group.

    Args:
        files: Candidate file entries.
        rt_tol: Tolerance handed to the cross-file peak matching.
        mz_tol: Tolerance handed to the ion clustering.
        trend_threshold: Threshold handed to the trend computation.
        ignore_instrument: When True, group by method only; if that leaves a
            single group its key is simplified to "all".
        use_run_time: Accepted for interface compatibility; this function
            does not read it.
        max_ion_rank: Forwarded to the ion clustering.
        pick_biggest_group: When True and several groups exist, keep only
            the group with the most files and record the rest as discarded.
            When False, every group is analyzed and returned.

    Returns:
        One AnalysisResult per analyzed group, or an empty list when no
        file has a parsed report.
    """
    parsed = [entry for entry in files if entry.report is not None]
    if not parsed:
        return []

    shared_warnings: List[str] = []
    discarded: List[FileEntry] = []

    # --- Group by (instrument, method) ---
    # With ignore_instrument the instrument part of the key is the fixed
    # word "all", so only the method separates groups.
    groups: Dict[str, List[FileEntry]] = defaultdict(list)
    for entry in parsed:
        prefix = "all" if ignore_instrument else entry.report.instrument
        meth = method_basename(entry.report.method_path)
        groups[f"{prefix}|{meth}"].append(entry)
    if ignore_instrument and len(groups) == 1:
        groups = {"all": next(iter(groups.values()))}

    if len(groups) > 1:
        # Describe every group, largest first, in one shared warning.
        descriptions = []
        by_size = sorted(groups.items(),
                         key=lambda item: len(item[1]), reverse=True)
        for key, members in by_size:
            inst_part, divider, meth_part = key.partition("|")
            if not divider:
                meth_part = "?"
            lead = members[0].report
            # Prefer the report's short method name for display.
            meth_display = lead.method_short if lead else meth_part
            descriptions.append(
                f"{inst_part}/{meth_display} ({len(members)} files)")
        shared_warnings.append(
            f"Files from {len(groups)} (instrument, method) groups: "
            + ", ".join(descriptions) + "."
        )

        if pick_biggest_group:
            biggest_key = max(groups, key=lambda k: len(groups[k]))
            for key, members in list(groups.items()):
                if key == biggest_key:
                    continue
                discarded.extend(members)
                disc_names = ", ".join(m.filename for m in members)
                shared_warnings.append(
                    f"Discarded {len(members)} file(s) from non-primary group "
                    f"({key}): {disc_names}."
                )
            groups = {biggest_key: groups[biggest_key]}

    results = []
    for inst_key, group_files in groups.items():
        group_warnings = []
        n_group = len(group_files)

        # sort_key alone decides the order of files within a group; callers
        # that want a different order must set sort_key accordingly.
        group_files.sort(key=lambda entry: entry.sort_key)

        # First pass: outlier/blank files and ambiguous-time files.
        outliers, ambiguous = detect_outlier_files(group_files)
        if outliers:
            names = ', '.join(
                group_files[i].filename for i in sorted(outliers))
            group_warnings.append(
                f"Excluded {len(outliers)} file(s) as likely "
                f"blank/outlier: {names}."
            )
        if ambiguous:
            names = ', '.join(
                group_files[i].filename for i in sorted(ambiguous))
            group_warnings.append(
                f"Excluded {len(ambiguous)} file(s) with ambiguous "
                f"timing from trend analysis: {names}."
            )
        # Trend analysis skips both kinds of flagged file.
        trend_excluded = outliers | ambiguous

        # Matching runs over every file in the group, flagged or not.
        compounds = match_peaks_across_files(group_files, rt_tol)

        # Canonical RT, UV ratio and ions must exist before the second
        # outlier pass looks at the compounds.
        for compound in compounds:
            compute_canonical_rt(compound)
            _update_compound_uv_ratio(compound)
            cluster_ions(compound, mz_tol, n_group,
                         max_ion_rank=max_ion_rank)

        # Second, post-match outlier pass; anything it flags joins the
        # exclusion sets.
        conservative = detect_outlier_files_conservative(
            group_files, compounds, outliers)
        if conservative:
            names = ', '.join(
                group_files[i].filename for i in sorted(conservative))
            group_warnings.append(
                f"Post-match outlier detection flagged "
                f"{len(conservative)} additional file(s) where majority "
                f"of compounds deviate: {names}."
            )
            outliers = outliers | conservative
            trend_excluded = outliers | ambiguous

        # Trends use the final exclusion set.
        for compound in compounds:
            compute_trend(compound, n_group, trend_threshold,
                          excluded_files=trend_excluded)
            _collect_uv_lambda_max(compound, group_files)

        # Present compounds in RT order with ids 1..N.
        compounds.sort(key=lambda c: c.canonical_rt)
        for number, compound in enumerate(compounds, 1):
            compound.compound_id = number

        first_report = group_files[0].report
        if first_report:
            instrument = first_report.instrument
            method_short = first_report.method_short
            meth_key = method_basename(first_report.method_path)
        else:
            instrument = "Unknown"
            method_short = "Unknown"
            meth_key = ""
        results.append(AnalysisResult(
            instrument=instrument,
            method_short=method_short,
            method_key=meth_key,
            files=group_files,
            compounds=compounds,
            warnings=shared_warnings + group_warnings,
            excluded_files=outliers,
            ambiguous_files=ambiguous,
            discarded_files=discarded,
        ))

    return results
1185
+
1186
+
1187
def _collect_uv_lambda_max(compound: Compound, files: List[FileEntry]):
    """Gather the compound's UV lambda-max values and merge near-duplicates.

    For every file in which the compound has a recorded RT, the first peak
    within 0.01 of that RT contributes its lambda-max list. The pooled
    wavelengths are sorted and chained into clusters (a value joins the
    current cluster when it is at most 10 above the cluster's last member),
    and ``compound.uv_lambda_max`` is set to the mean of each cluster.
    Nothing is returned; the compound is updated in place.
    """
    wavelengths: List[float] = []
    for index, entry in enumerate(files):
        if index not in compound.rt_by_file or entry.report is None:
            continue
        matched_rt = compound.rt_by_file[index]
        # Only the first peak close enough to the matched RT is used.
        hit = next(
            (peak for peak in entry.report.peaks
             if abs(peak.rt - matched_rt) < 0.01),
            None)
        if hit is not None:
            wavelengths.extend(hit.uv_lambda_max)

    if not wavelengths:
        compound.uv_lambda_max = []
        return

    wavelengths.sort()
    clusters: List[List[float]] = []
    for wl in wavelengths:
        # The gap is measured against the previous value, so a cluster can
        # span more than 10 overall.
        if clusters and wl - clusters[-1][-1] <= 10:
            clusters[-1].append(wl)
        else:
            clusters.append([wl])
    compound.uv_lambda_max = [sum(group) / len(group) for group in clusters]
1214
+
1215
+ # ---------------------------------------------------------------------------
1216
+ # CLI
1217
+ # ---------------------------------------------------------------------------
1218
+
1219
def _build_cli_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the multi-LCMS analyzer."""
    parser = argparse.ArgumentParser(
        description="Multi-LCMS Analyzer \u2014 collate peaks across "
                    "multiple LCMS files from the same reaction.\n"
                    "By default, files are sorted by their actual LCMS "
                    "acquisition time (extracted from PDF). Use "
                    "--out-of-order if samples were run non-chronologically.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('files', nargs='+',
                        help='MassLynx PDF report files')
    parser.add_argument('--rt-tolerance', type=float, default=LCMS_RT_TOLERANCE,
                        help='RT matching tolerance in minutes (default: %(default)s)')
    parser.add_argument('--mz-tolerance', type=float, default=LCMS_MZ_TOLERANCE,
                        help='m/z clustering tolerance in Da (default: %(default)s)')
    parser.add_argument('--trend-threshold', type=float, default=LCMS_TREND_THRESHOLD,
                        help='Trend change threshold as fraction '
                             '(default: %(default)s)')
    parser.add_argument('--ignore-instrument', action='store_true',
                        help='Analyze all files together regardless of '
                             'instrument')
    parser.add_argument('--out-of-order', action='store_true',
                        help='Use filename-heuristic sorting instead of '
                             'actual LCMS run time. Use this when samples '
                             'were not run in chronological order. Files '
                             'with ambiguous timing (e.g. "beforeadd") '
                             'will be excluded from trend analysis.')
    parser.add_argument('--min-summary-area', type=float, default=LCMS_MIN_SUMMARY_AREA,
                        help='Hide compounds below this max area%% from '
                             'the reaction summary (default: %(default)s)')
    parser.add_argument('--max-ion-rank', type=int, default=None,
                        help='Filter out "other ions" with rank >= this value '
                             '(0-based). E.g. --max-ion-rank 5 keeps only '
                             'ions ranked 1-5 in display. Default: no filter')
    parser.add_argument('--hide-other-ions', action='store_true',
                        help='Hide single-observation "other ions" from '
                             'compound details (shown by default)')
    parser.add_argument('--output', '-o', type=str, default=None,
                        help='Output file path (default: stdout)')
    parser.add_argument('--json', action='store_true',
                        help='Output as structured JSON')
    parser.add_argument('--json-output', type=str, default=None,
                        help='Save structured JSON to this file (in addition '
                             'to the normal text output)')
    parser.add_argument('--json-errors', action='store_true',
                        help='Output structured JSON error objects to stderr '
                             'on failure (for agent orchestration)')
    return parser


def _join_json_documents(documents: List[str]) -> str:
    """Return one valid JSON text: the lone document, or an array of them."""
    if len(documents) == 1:
        return documents[0]
    return "[" + ", ".join(documents) + "]"


def _emit_report(text: str, output_path: Optional[str]) -> None:
    """Write *text* to *output_path*, or to stdout as UTF-8 when unset."""
    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"Output written to {output_path}", file=sys.stderr)
    else:
        # Bytes go straight to the underlying buffer so the console
        # encoding cannot reject non-ASCII characters in the report.
        sys.stdout.buffer.write(text.encode('utf-8'))
        sys.stdout.buffer.write(b'\n')


def _report_failure(code: str, msg: str, as_json: bool) -> None:
    """Print a failure to stderr, as a JSON object when *as_json* is set."""
    if as_json:
        print(json.dumps({"error": code, "detail": msg}), file=sys.stderr)
    else:
        print(f"Error: {msg}", file=sys.stderr)


def main(argv=None) -> int:
    """Command-line entry point for the multi-LCMS analyzer.

    Parses the given PDF reports, orders them, runs analyze() and writes a
    text or JSON report to stdout or ``--output``. Progress and diagnostics
    go to stderr.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 on success, 1 when no file could be parsed or the analysis
        produced no result.
    """
    args = _build_cli_parser().parse_args(argv)

    use_run_time = not args.out_of_order

    # Filter out non-standard PDFs (manually integrated chromatograms etc.)
    valid_files = []
    for path in args.files:
        if not is_waters_report(path):
            print(f" Skipping non-standard PDF: {os.path.basename(path)}",
                  file=sys.stderr)
        else:
            valid_files.append(path)

    # Parse all reports; a file that fails to parse is reported and skipped
    # so one bad PDF does not abort the whole run.
    file_entries: List[FileEntry] = []
    for path in valid_files:
        filename = os.path.basename(path)
        category, sort_key = categorize_lcms_file(filename)
        ambiguous = sort_key in _AMBIGUOUS_SORT_KEYS
        try:
            report = parse_report(path)
            # Always try to extract run datetime (shown in output)
            run_dt = extract_run_datetime(path)
            fe = FileEntry(
                path=os.path.abspath(path),
                filename=filename,
                category=category,
                sort_key=sort_key,
                report=report,
                run_datetime=run_dt,
                # Only flag ambiguous when NOT using run time for sorting
                ambiguous_time=ambiguous and not use_run_time,
            )
            file_entries.append(fe)
            dt_info = f", run={run_dt}" if run_dt else ""
            amb_info = " [ambiguous]" if fe.ambiguous_time else ""
            print(f" Parsed: {filename} ({len(report.peaks)} peaks, "
                  f"{category}, sort={sort_key}{dt_info}{amb_info})",
                  file=sys.stderr)
        except Exception as e:
            print(f" Warning: Could not parse {filename}: {e}",
                  file=sys.stderr)

    if not file_entries:
        _report_failure("no_parseable_files", "No files could be parsed.",
                        args.json_errors)
        return 1

    # Single file — no cross-file analysis needed, output basic report
    if len(file_entries) == 1:
        print(" Single file — outputting basic report (no multi-file "
              "analysis).", file=sys.stderr)
        _emit_report(format_basic_report(file_entries[0].report), args.output)
        return 0

    # Sort chronologically — by run_datetime (default) or sort_key
    # (--out-of-order).
    # NOTE(review): analyze() re-sorts every group by sort_key, so the
    # run-time ordering applied here only survives between files whose
    # sort_key values are equal. Confirm whether sort_key should be
    # recalibrated from run_datetime before calling analyze().
    if use_run_time:
        # Run-time order is used only when every file has a run_datetime.
        has_dt = all(fe.run_datetime is not None for fe in file_entries)
        if has_dt:
            file_entries.sort(key=lambda f: f.run_datetime)
            print(" Sorting by LCMS run time (default).", file=sys.stderr)
        else:
            missing = [fe.filename for fe in file_entries
                       if fe.run_datetime is None]
            print(f" Warning: Could not extract run time from: "
                  f"{', '.join(missing)}. Falling back to filename sort.",
                  file=sys.stderr)
            file_entries.sort(key=lambda f: f.sort_key)
    else:
        file_entries.sort(key=lambda f: f.sort_key)
        print(" Sorting by filename heuristics (--out-of-order).",
              file=sys.stderr)

    # Analyze
    results = analyze(file_entries, args.rt_tolerance, args.mz_tolerance,
                      args.trend_threshold, args.ignore_instrument,
                      use_run_time=use_run_time,
                      max_ion_rank=args.max_ion_rank)

    if not results:
        _report_failure("analysis_empty", "Analysis produced no results.",
                        args.json_errors)
        return 1

    # Save JSON sidecar if requested (for downstream reuse by procedure_writer)
    if args.json_output:
        json_out = _join_json_documents(
            [format_json_report(r) for r in results])
        with open(args.json_output, 'w', encoding='utf-8') as f:
            f.write(json_out)
        print(f"JSON saved to {args.json_output}", file=sys.stderr)

    # Check if exclusions reduced any group to ≤1 effective file
    # If so, fall back to basic single-file reports for those files
    output_parts = []
    for result in results:
        effective_count = len(result.files) - len(result.excluded_files)
        if effective_count <= 1 and not args.json:
            # Exclusions left ≤1 file — output basic report(s) instead
            print(f" {len(result.excluded_files)} of {len(result.files)} "
                  f"files excluded — falling back to single-file report(s).",
                  file=sys.stderr)
            for i, fe in enumerate(result.files):
                if i not in result.excluded_files and fe.report:
                    output_parts.append(format_basic_report(fe.report))
        elif args.json:
            output_parts.append(format_json_report(result))
        else:
            output_parts.append(format_text_report(
                result,
                min_summary_area=args.min_summary_area,
                hide_other_ions=args.hide_other_ions,
            ))

    if args.json:
        # In JSON mode every part is a JSON document. Several groups are
        # wrapped in an array (as the sidecar does) so the output stays
        # parseable; joining them with blank lines was not valid JSON.
        output = _join_json_documents(output_parts)
    else:
        output = "\n\n".join(output_parts)

    _emit_report(output, args.output)
    return 0
1409
+
1410
+
1411
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())