cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1412 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Multi-LCMS Analyzer
|
|
4
|
+
Collates peaks across multiple LCMS files from the same reaction to:
|
|
5
|
+
1. Match peaks across files (same compound identification by RT + UV ratio)
|
|
6
|
+
2. Merge mass spectrum ions into recurring vs one-off lists
|
|
7
|
+
3. Track area% trends over time (increasing / decreasing / stable)
|
|
8
|
+
|
|
9
|
+
Input: MassLynx PDF files (parsed internally via lcms_analyzer.parse_report).
|
|
10
|
+
Output: Text report (default) or structured JSON (--json).
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
    python -m cdxml_toolkit.analysis.deterministic.multi_lcms_analyzer \\
|
|
14
|
+
file1.pdf file2.pdf file3.pdf ... \\
|
|
15
|
+
--rt-tolerance 0.02 \\
|
|
16
|
+
--mz-tolerance 0.5 \\
|
|
17
|
+
--trend-threshold 0.2
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import json
|
|
22
|
+
import os
|
|
23
|
+
import re
|
|
24
|
+
import sys
|
|
25
|
+
from collections import Counter, defaultdict
|
|
26
|
+
from dataclasses import dataclass, field
|
|
27
|
+
from statistics import median
|
|
28
|
+
from typing import List, Optional, Dict, Tuple
|
|
29
|
+
|
|
30
|
+
from cdxml_toolkit.constants import (
|
|
31
|
+
LCMS_RT_TOLERANCE,
|
|
32
|
+
LCMS_MZ_TOLERANCE,
|
|
33
|
+
LCMS_TREND_THRESHOLD,
|
|
34
|
+
LCMS_MIN_SUMMARY_AREA,
|
|
35
|
+
)
|
|
36
|
+
from ..lcms_analyzer import (
|
|
37
|
+
parse_report, LCMSReport, ChromPeak, MassSpectrum, format_basic_report,
|
|
38
|
+
is_waters_report, method_basename,
|
|
39
|
+
)
|
|
40
|
+
from .lcms_file_categorizer import categorize_lcms_file, _AMBIGUOUS_SORT_KEYS
|
|
41
|
+
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
# Data structures
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
@dataclass
class FileEntry:
    """Metadata for one LCMS file in the analysis.

    ``path``, ``filename``, ``category`` and ``sort_key`` are required; all
    other fields default to "not available".
    """
    path: str
    filename: str
    category: str  # "tracking", "workup", "purification", "final"
    sort_key: float
    # Parsed report; entries whose report is None are skipped by
    # match_peaks_across_files() and flagged by detect_outlier_files().
    report: Optional[LCMSReport] = None
    run_datetime: Optional[str] = None  # "YYYY-MM-DD HH:MM:SS" from PDF
    # True for "beforeadd" etc.; detect_outlier_files() reports these indices
    # in its "ambiguous" set.
    ambiguous_time: bool = False
    group_prefix: Optional[str] = None  # tracking group prefix (from batch categorizer)
    method_variant: Optional[str] = None  # filename-derived method hint (AmB, AmF, etc.)
|
|
58
|
+
|
|
59
|
+
@dataclass
class IonCluster:
    """A group of m/z values across files that represent the same ion.

    Built by cluster_ions(), which groups ions of one ionisation mode whose
    m/z values lie within the configured tolerance of the cluster centre.
    """
    mean_mz: float  # mean of mz_values (cluster_ions rounds it to 1 decimal)
    mode: str  # "ES+" or "ES-"
    occurrences: int  # number of files this ion appeared in
    best_rank: int  # best (lowest) rank seen across files (0 = base peak)
    mz_values: List[float] = field(default_factory=list)  # every member m/z
|
|
67
|
+
|
|
68
|
+
@dataclass
class Compound:
    """A matched compound tracked across multiple LCMS files.

    Instances are seeded by create_compound() and extended by
    attach_peak_to_compound(); the post-processing helpers
    (compute_canonical_rt, cluster_ions, compute_trend) fill in the derived
    fields.
    """
    compound_id: int
    # Seed peak RT at creation; compute_canonical_rt() later replaces it with
    # the majority-vote value.
    canonical_rt: float = 0.0
    uv_ratio: Optional[float] = None  # area_220nm / area_254nm

    # Per-file data: keyed by file index (chronological order)
    rt_by_file: Dict[int, float] = field(default_factory=dict)
    # Preferred area% for trends; _best_area_pct() falls back to the 220nm and
    # then the 254nm value when this one is None.
    area_pct_by_file: Dict[int, Optional[float]] = field(default_factory=dict)
    area_220_by_file: Dict[int, Optional[float]] = field(default_factory=dict)
    area_254_by_file: Dict[int, Optional[float]] = field(default_factory=dict)
    area_pct_220_by_file: Dict[int, Optional[float]] = field(default_factory=dict)
    area_pct_254_by_file: Dict[int, Optional[float]] = field(default_factory=dict)

    # Raw ion collection: (mode, mz, rank_in_top_ions, file_index)
    all_ions: List[Tuple[str, float, int, int]] = field(default_factory=list)

    # Merged ion clusters (populated after matching, by cluster_ions)
    recurring_ions: List[IonCluster] = field(default_factory=list)
    other_ions: List[IonCluster] = field(default_factory=list)

    # UV lambda-max consensus
    uv_lambda_max: List[float] = field(default_factory=list)

    # Trend (written by compute_trend)
    trend: str = "stable"  # "increasing", "decreasing" or "stable"
    trend_detail: str = ""  # human-readable explanation of the trend
    max_area: float = 0.0  # max observed area% (excluding outlier files)
|
|
97
|
+
|
|
98
|
+
@dataclass
class AnalysisResult:
    """Complete result of the multi-file LCMS analysis.

    Index-based sets (``excluded_files``, ``ambiguous_files``) hold file
    indices, matching the index keys used by the per-file dictionaries on
    Compound.
    """
    instrument: str
    method_short: str
    method_key: str = ""  # method basename for grouping (lowercased)
    files: List[FileEntry] = field(default_factory=list)
    compounds: List[Compound] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    excluded_files: set = field(default_factory=set)  # indices of outlier files
    ambiguous_files: set = field(default_factory=set)  # indices with uncertain timing
    discarded_files: List[FileEntry] = field(default_factory=list)  # files from other groups
|
|
110
|
+
|
|
111
|
+
# ---------------------------------------------------------------------------
|
|
112
|
+
# Note: File categorization code has been extracted to lcms_file_categorizer.py
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
|
|
115
|
+
def extract_run_datetime(pdf_path: str) -> Optional[str]:
    """
    Read the acquisition timestamp out of a MassLynx PDF.

    The PDF text is searched for a 'Date:DD-Mon-YYYY' field and a
    'Time:HH:MM:SS' field.

    Returns:
        The timestamp formatted as 'YYYY-MM-DD HH:MM:SS', or None when the
        text cannot be extracted, either field is missing, or the values do
        not parse as a date/time.
    """
    from datetime import datetime as _dt
    from ..lcms_analyzer import extract_all_text

    # Best effort: any extraction failure simply means "no timestamp".
    try:
        pdf_text = extract_all_text(pdf_path)
    except Exception:
        return None

    date_hit = re.search(r'Date:(\d{1,2}-\w{3}-\d{4})', pdf_text)
    time_hit = re.search(r'Time:(\d{1,2}:\d{2}:\d{2})', pdf_text)
    if not (date_hit and time_hit):
        return None

    stamp = f"{date_hit.group(1)} {time_hit.group(1)}"
    try:
        parsed = _dt.strptime(stamp, "%d-%b-%Y %H:%M:%S")
        return parsed.strftime("%Y-%m-%d %H:%M:%S")
    except ValueError:
        # Field matched the pattern but is not a real date/time.
        return None
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
# UV ratio helpers
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
def compute_uv_ratio(peak: ChromPeak) -> Optional[float]:
    """
    Return the 220 nm / 254 nm area ratio of a peak.

    The ratio is only meaningful when both channel areas are present and
    non-zero; in every other case the data is inconclusive and None is
    returned.
    """
    areas = (peak.area_220nm, peak.area_254nm)
    if any(area is None for area in areas):
        return None
    if any(area == 0 for area in areas):
        return None
    area_220, area_254 = areas
    return area_220 / area_254
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def check_uv_compatibility(ratio_a: Optional[float],
                           ratio_b: Optional[float]) -> Optional[bool]:
    """
    Check if two UV ratios are compatible.

    Args:
        ratio_a: 220/254 area ratio of the first observation, or None.
        ratio_b: 220/254 area ratio of the second observation, or None.

    Returns:
        True when the ratios are within a factor of 2 of each other
        (bounds inclusive), False when both are usable and further apart,
        and None when the comparison is inconclusive.

    The comparison is inconclusive when either ratio is None (the peak
    might just not have been detected on one channel in a particular run)
    or zero. Zero is handled the same way on both sides so that the result
    does not depend on argument order.
    """
    if ratio_a is None or ratio_b is None:
        return None

    # A zero ratio carries no usable information; it would also make the
    # quotient below either zero or undefined.
    if ratio_a == 0 or ratio_b == 0:
        return None

    factor = ratio_a / ratio_b
    return bool(0.5 <= factor <= 2.0)
|
|
182
|
+
|
|
183
|
+
# ---------------------------------------------------------------------------
|
|
184
|
+
# Peak matching
|
|
185
|
+
# ---------------------------------------------------------------------------
|
|
186
|
+
|
|
187
|
+
def _update_compound_uv_ratio(compound: Compound):
    """Recompute compound's UV ratio as median of all usable observed ratios.

    An observation is usable only when both the 220 nm and the 254 nm area
    are present and strictly positive, the same "inconclusive" rule that
    compute_uv_ratio() applies to a single peak. A zero 220 nm area is
    therefore skipped instead of dragging the median towards 0.

    ``compound.uv_ratio`` is left unchanged when no observation is usable.
    """
    ratios = []
    for fi, a220 in compound.area_220_by_file.items():
        a254 = compound.area_254_by_file.get(fi)
        if a220 is None or a254 is None:
            continue
        if a220 > 0 and a254 > 0:
            ratios.append(a220 / a254)
    if ratios:
        compound.uv_ratio = median(ratios)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def create_compound(cid: int, peak: ChromPeak, ratio: Optional[float],
                    file_idx: int) -> Compound:
    """Build a new Compound seeded from a single peak.

    Args:
        cid: Identifier assigned to the new compound.
        peak: Seed peak; its RT becomes the provisional canonical RT.
        ratio: UV ratio already computed for the peak (may be None).
        file_idx: Index of the file the peak came from.

    Returns:
        The new Compound with the peak's per-file values and ions recorded
        under ``file_idx``.
    """
    seeded = Compound(compound_id=cid, canonical_rt=peak.rt, uv_ratio=ratio)

    # One entry per per-file table, all keyed by the same file index.
    per_file_values = (
        (seeded.rt_by_file, peak.rt),
        (seeded.area_pct_by_file, peak.area_pct),
        (seeded.area_220_by_file, peak.area_220nm),
        (seeded.area_254_by_file, peak.area_254nm),
        (seeded.area_pct_220_by_file, peak.area_pct_220nm),
        (seeded.area_pct_254_by_file, peak.area_pct_254nm),
    )
    for table, value in per_file_values:
        table[file_idx] = value

    # Record every top ion together with its rank within its spectrum.
    seeded.all_ions.extend(
        (spectrum.mode, mz, rank, file_idx)
        for spectrum in peak.ms_spectra
        for rank, mz in enumerate(spectrum.top_ions)
    )
    return seeded
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def attach_peak_to_compound(compound: Compound, peak: ChromPeak,
                            file_idx: int):
    """Record a matched peak on an existing compound.

    The peak's RT, areas and ions are stored under ``file_idx`` and the
    compound's UV ratio is then recomputed from all recorded observations.
    The canonical RT is not touched here.
    """
    per_file_values = (
        (compound.rt_by_file, peak.rt),
        (compound.area_pct_by_file, peak.area_pct),
        (compound.area_220_by_file, peak.area_220nm),
        (compound.area_254_by_file, peak.area_254nm),
        (compound.area_pct_220_by_file, peak.area_pct_220nm),
        (compound.area_pct_254_by_file, peak.area_pct_254nm),
    )
    for table, value in per_file_values:
        table[file_idx] = value

    compound.all_ions.extend(
        (spectrum.mode, mz, rank, file_idx)
        for spectrum in peak.ms_spectra
        for rank, mz in enumerate(spectrum.top_ions)
    )

    # Refresh the running UV ratio now that a new observation is recorded.
    _update_compound_uv_ratio(compound)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def find_and_match(peak: ChromPeak, ratio: Optional[float],
                   compounds: List[Compound], rt_tol: float,
                   used_ids: set) -> Optional[Compound]:
    """
    Pick the compound that best matches a peak, if any.

    A compound qualifies when it has not already been claimed
    (``used_ids``), its canonical RT is within ``rt_tol`` of the peak, and
    the UV ratios are not incompatible. Among the qualifying compounds the
    one with the lowest score wins; the score is the RT difference, halved
    when the UV ratios positively agree. On equal scores the compound that
    comes first in ``compounds`` is returned.

    Returns:
        The best matching compound, or None if nothing qualifies.
    """
    scored = []
    for candidate in compounds:
        if candidate.compound_id in used_ids:
            continue

        rt_gap = abs(peak.rt - candidate.canonical_rt)
        if rt_gap > rt_tol:
            continue

        verdict = check_uv_compatibility(ratio, candidate.uv_ratio)
        if verdict is False:
            # UV data is available on both sides and disagrees.
            continue

        # Lower is better; a UV confirmation halves the score.
        scored.append((rt_gap * 0.5 if verdict is True else rt_gap,
                       candidate))

    if not scored:
        return None

    _, winner = min(scored, key=lambda pair: pair[0])
    return winner
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def match_peaks_across_files(files: List[FileEntry],
                             rt_tol: float) -> List[Compound]:
    """
    Group the peaks of all files into Compounds.

    Files must be pre-sorted chronologically; the position of a file in
    ``files`` is the file index stored on the compounds. Files without a
    report or without peaks are skipped.

    The first file that has peaks seeds one compound per peak, in peak
    order. In every later file the peaks are processed from largest to
    smallest area%, each compound can absorb at most one peak per file, and
    peaks that match nothing become new compounds once the whole file has
    been processed.
    """
    compounds: List[Compound] = []
    next_id = 1

    for file_idx, entry in enumerate(files):
        report = entry.report
        if report is None or not report.peaks:
            continue

        rated_peaks = [(pk, compute_uv_ratio(pk)) for pk in report.peaks]

        if compounds:
            # Largest peaks get first pick so matching stays stable.
            by_area = sorted(rated_peaks,
                             key=lambda pair: pair[0].area_pct or 0,
                             reverse=True)
            claimed: set = set()
            leftovers = []
            for pk, uv in by_area:
                hit = find_and_match(pk, uv, compounds, rt_tol, claimed)
                if hit:
                    attach_peak_to_compound(hit, pk, file_idx)
                    claimed.add(hit.compound_id)
                else:
                    leftovers.append((pk, uv))
        else:
            # Nothing to match against yet: every peak seeds a compound.
            leftovers = rated_peaks

        # New compounds are added only after matching, so peaks of the same
        # file are never matched against each other.
        for pk, uv in leftovers:
            compounds.append(create_compound(next_id, pk, uv, file_idx))
            next_id += 1

    return compounds
|
|
316
|
+
|
|
317
|
+
# ---------------------------------------------------------------------------
|
|
318
|
+
# Post-processing
|
|
319
|
+
# ---------------------------------------------------------------------------
|
|
320
|
+
|
|
321
|
+
def compute_canonical_rt(compound: Compound):
    """Set the canonical RT to the most frequent observed RT.

    Observed RTs are rounded to 2 decimals before counting. When several
    rounded values share the highest count, the median of those values
    (rounded to 2 decimals) is used. A compound without any recorded RT is
    left unchanged.
    """
    if not compound.rt_by_file:
        return

    tally = Counter(round(rt, 2) for rt in compound.rt_by_file.values())
    top_count = max(tally.values())
    winners = sorted(rt for rt, hits in tally.items() if hits == top_count)

    if len(winners) == 1:
        compound.canonical_rt = winners[0]
    else:
        compound.canonical_rt = round(median(winners), 2)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def cluster_ions(compound: Compound, mz_tol: float, total_files: int,
                 max_ion_rank: Optional[int] = None):
    """Cluster a compound's ions by m/z and split them into recurring/other.

    Ions are clustered separately per ionisation mode. They are visited in
    ascending m/z order and each one joins the first existing cluster whose
    running mean lies within ``mz_tol``; otherwise it starts a new cluster.

    Args:
        compound: Compound whose ``all_ions`` are clustered; the result is
            written to ``recurring_ions`` and ``other_ions``.
        mz_tol: Maximum distance between an ion and a cluster centre.
        total_files: Accepted for interface compatibility; not used by the
            current implementation.
        max_ion_rank: If set, exclude "other ions" whose best rank >= this
            value from the output (0-based). E.g. max_ion_rank=5 keeps only
            ions ranked 0-4. Recurring ions are never filtered.
    """
    # Bucket the raw ions by ionisation mode.
    by_mode: Dict[str, List[Tuple[float, int, int]]] = defaultdict(list)
    for mode, mz, rank, file_idx in compound.all_ions:
        by_mode[mode].append((mz, rank, file_idx))

    merged: List[IonCluster] = []
    for mode, entries in by_mode.items():
        entries.sort(key=lambda entry: entry[0])  # ascending m/z

        bins: list = []
        for mz, rank, file_idx in entries:
            home = next(
                (b for b in bins if abs(mz - b['center']) <= mz_tol), None)
            if home is None:
                bins.append({
                    'center': mz,
                    'values': [mz],
                    'files': {file_idx},
                    'mode': mode,
                    'best_rank': rank,
                })
                continue
            home['values'].append(mz)
            home['files'].add(file_idx)
            home['best_rank'] = min(home['best_rank'], rank)
            # Keep the centre at the running mean of the members.
            home['center'] = sum(home['values']) / len(home['values'])

        merged.extend(
            IonCluster(
                mean_mz=round(sum(b['values']) / len(b['values']), 1),
                mode=b['mode'],
                occurrences=len(b['files']),
                best_rank=b['best_rank'],
                mz_values=b['values'],
            )
            for b in bins
        )

    # Number of distinct files that contributed at least one ion.
    n_files_observed = len({fi for _, _, _, fi in compound.all_ions})

    if n_files_observed <= 1:
        # With a single observation "recurring vs other" is meaningless, so
        # every ion is reported as canonical and nothing is filtered out.
        compound.recurring_ions = sorted(
            merged, key=lambda c: (c.best_rank, c.mean_mz))
        compound.other_ions = []
        return

    # Recurring = seen in at least two files; everything else is "other".
    compound.recurring_ions = sorted(
        (c for c in merged if c.occurrences >= 2),
        key=lambda c: (-c.occurrences, c.best_rank, c.mean_mz),
    )
    one_offs = [c for c in merged if c.occurrences < 2]
    if max_ion_rank is not None:
        one_offs = [c for c in one_offs if c.best_rank < max_ion_rank]
    compound.other_ions = sorted(
        one_offs, key=lambda c: (c.best_rank, c.mean_mz))
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def _best_area_pct(compound: Compound, fi: int) -> Optional[float]:
    """Return the best available area% of a compound in file ``fi``.

    The TAC value is preferred; the 220nm value is the first fallback and
    the 254nm value the second. None is returned when none of the three
    tables holds a value for the file.
    """
    for table_name in ("area_pct_by_file", "area_pct_220_by_file"):
        value = getattr(compound, table_name).get(fi)
        if value is not None:
            return value
    return compound.area_pct_254_by_file.get(fi)
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def compute_trend(compound: Compound, total_files: int,
                  threshold: float, excluded_files: Optional[set] = None):
    """
    Determine area% trend: increasing / decreasing / stable.

    Every non-excluded file index in ``range(total_files)`` contributes one
    point to the timeline. Files in which the compound was NOT observed,
    whether before, between or after the files where it IS observed, count
    as 0%. This marks consumed compounds as "decreasing" and newly formed
    compounds as "increasing". Excluded (outlier) files are skipped.

    The trend compares the first and last timeline points relative to the
    maximum: ``(last - first) / max``. A value above ``threshold`` is
    "increasing", below ``-threshold`` "decreasing", anything else
    "stable". Compounds that never reach 0.5% are reported as stable trace
    components.

    Args:
        compound: Compound to evaluate. ``trend``, ``trend_detail`` and
            ``max_area`` are updated in place; nothing is returned.
        total_files: Number of files in the analysis (length of timeline
            before exclusions).
        threshold: Minimum relative change that counts as a trend.
        excluded_files: File indices to ignore; None means "none".
    """
    if excluded_files is None:
        excluded_files = set()

    seen_files = set(compound.area_pct_by_file) - excluded_files
    if not seen_files:
        compound.trend = "stable"
        compound.trend_detail = "no area data"
        return

    # Area values actually measured in the non-excluded files.
    measured = (_best_area_pct(compound, fi) for fi in sorted(seen_files))
    observed_areas = [area for area in measured if area is not None]
    if not observed_areas:
        compound.trend = "stable"
        compound.trend_detail = "no area data"
        return

    # Always set max_area from observed data (even with a single file,
    # so that downstream consumers like procedure_writer can filter on it)
    compound.max_area = max(observed_areas)

    # Only one observation -> can't determine a trend.
    if len(observed_areas) == 1 and len(seen_files) == 1:
        compound.trend = "stable"
        compound.trend_detail = "single observation"
        return

    # Complete timeline over all non-excluded files; a file in which the
    # compound is absent (or has no usable area) contributes 0%.
    timeline = []
    for fi in range(total_files):
        if fi in excluded_files:
            continue
        area = None
        if fi in compound.area_pct_by_file:
            area = _best_area_pct(compound, fi)
        timeline.append(area if area is not None else 0.0)

    if len(timeline) < 2:
        compound.trend = "stable"
        compound.trend_detail = "single observation"
        return

    first_area = timeline[0]
    last_area = timeline[-1]
    max_area = max(timeline)

    compound.max_area = max_area

    if max_area < 0.5:
        compound.trend = "stable"
        compound.trend_detail = "trace levels throughout"
        return

    # max_area >= 0.5 here, so the division is safe.
    change = (last_area - first_area) / max_area

    if change > threshold:
        compound.trend = "increasing"
    elif change < -threshold:
        compound.trend = "decreasing"
    else:
        compound.trend = "stable"

    # Determine which detector was used
    det_label = _trend_detector_label(compound)

    compound.trend_detail = (
        f"{first_area:.1f}% \u2192 {last_area:.1f}% "
        f"({det_label}, max {max_area:.1f}%, change {change:+.0%})"
    )
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
def _trend_detector_label(compound: Compound) -> str:
    """Name the detector whose area% drives this compound's trend.

    Returns "TAC" when any TAC area% is recorded, otherwise "220nm" when any
    220nm area% is recorded, otherwise "254nm".
    """
    def has_value(table) -> bool:
        return any(value is not None for value in table.values())

    if has_value(compound.area_pct_by_file):
        return "TAC"
    if has_value(compound.area_pct_220_by_file):
        return "220nm"
    return "254nm"
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
def compute_uv_consensus(compound: Compound):
    """Placeholder for UV lambda-max deduplication; currently a no-op.

    The compound is not modified. According to the original note, UV
    lambda-max is populated via ``_collect_uv_lambda_max`` instead.
    """
    return None
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
def detect_outlier_files(files: List[FileEntry]) -> Tuple[set, set]:
    """
    Flag files that look like blanks/outliers or have ambiguous timing.

    Returns (outlier_set, ambiguous_set):
      - outlier_set: indices of blank/failed files (excluded from everything)
      - ambiguous_set: indices of files with uncertain timeline position
        (excluded from trend analysis only)

    Outlier detection needs at least 3 files and a typical peak count of at
    least 3 (the middle element of the sorted per-file counts); otherwise
    the outlier set is empty. A file is then flagged when it has no report,
    when it has fewer than 40% of the typical peak count, or when it matches
    the "solvent front only" pattern described below.
    """
    ambiguous = {idx for idx, entry in enumerate(files)
                 if entry.ambiguous_time}

    if len(files) < 3:
        return set(), ambiguous

    counts = [len(entry.report.peaks) if entry.report else 0
              for entry in files]
    typical_count = sorted(counts)[len(counts) // 2]
    if typical_count < 3:
        return set(), ambiguous

    flagged = set()
    for idx, entry in enumerate(files):
        report = entry.report
        if report is None:
            flagged.add(idx)
            continue

        n_peaks = len(report.peaks)

        # Heuristic 1: far fewer peaks than is typical for this series.
        if n_peaks < typical_count * 0.4:
            flagged.add(idx)
            continue

        # Heuristic 2: one peak above 99% TAC area, at most 3 peaks in
        # total, and the dominant peak at very low RT (likely solvent/void).
        # A looser rule (>95%, <=5 peaks) used to flag legitimate t=0 and
        # near-complete reaction files where SM or DP dominates.
        tac_peaks = [(pk.rt, pk.area_pct) for pk in report.peaks
                     if pk.area_pct is not None]
        if not tac_peaks:
            continue
        dom_rt, dom_area = max(tac_peaks, key=lambda pair: pair[1])
        if dom_area > 99.0 and n_peaks <= 3 and dom_rt < 0.4:
            flagged.add(idx)

    return flagged, ambiguous
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def detect_outlier_files_conservative(
    files: List[FileEntry],
    compounds: List[Compound],
    excluded_files: set,
    significance_floor: float = 5.0,
    threshold: float = 0.5,
) -> set:
    """
    Second-pass outlier detection: flag files where most compounds look off.

    For every file not already excluded, each significant compound's area%
    in that file is compared with its area% in the other non-excluded files.
    A single odd compound is tolerated; a file is only flagged when more
    than ``threshold`` of the evaluated compounds are anomalous.

    Args:
        files: All FileEntry objects; only their count and positions are
            used here (indices are file positions in this list).
        compounds: Matched compounds to inspect.
        excluded_files: Indices already excluded by the first pass. They are
            neither tested nor used as reference points.
        significance_floor: Compounds with ``max_area`` below this value are
            ignored.
        threshold: Fraction of evaluated compounds that must be anomalous
            for a file to be flagged (strictly greater than).

    Returns:
        Set of additional file indices to exclude. Empty when there are
        fewer than 4 files or fewer than 2 significant compounds.
    """
    n_files = len(files)
    if n_files < 4:
        return set()

    # Only compounds that are clearly visible in the chromatogram take part.
    tracked = [c for c in compounds if c.max_area >= significance_floor]
    if len(tracked) < 2:
        return set()

    flagged = set()
    for fi in range(n_files):
        if fi in excluded_files:
            continue

        n_evaluated = 0
        n_anomalous = 0

        for compound in tracked:
            # Observation window of the compound; defaults to the current
            # file index when the compound has no area% entries at all.
            seen = compound.area_pct_by_file.keys()
            first_seen = min(seen, default=fi)
            last_seen = max(seen, default=fi)

            # Reference values: area% in every other non-excluded file.
            reference = []
            for other_fi in range(n_files):
                if other_fi == fi or other_fi in excluded_files:
                    continue
                other_area = _best_area_pct(compound, other_fi)
                if other_area is None:
                    # Missing inside the observation window counts as 0%;
                    # missing outside it contributes nothing.
                    if first_seen <= other_fi <= last_seen:
                        reference.append(0.0)
                else:
                    reference.append(other_area)

            if len(reference) < 2:
                continue

            # NOTE(review): the compound is counted as evaluated here, before
            # the two skip checks below, so skipped compounds still enlarge
            # the denominator of the anomaly fraction.
            n_evaluated += 1

            own_area = _best_area_pct(compound, fi)
            if own_area is None:
                if not (first_seen <= fi <= last_seen):
                    continue  # compound not expected in this file
                own_area = 0.0

            peak_other = max(reference)
            if peak_other < 1.0:
                continue  # trace compound, skip

            # Anomalous = deviates from the mean of the other files by more
            # than 80% of the largest area% seen in those files.
            mean_other = sum(reference) / len(reference)
            if abs(own_area - mean_other) > peak_other * 0.8:
                n_anomalous += 1

        if n_evaluated >= 2 and n_anomalous / n_evaluated > threshold:
            flagged.add(fi)

    return flagged
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
# ---------------------------------------------------------------------------
|
|
683
|
+
# Output formatters
|
|
684
|
+
# ---------------------------------------------------------------------------
|
|
685
|
+
|
|
686
|
+
def _file_label(fe: FileEntry) -> str:
    """Return the display label for *fe*: its filename without the extension.

    Only the final extension is removed (``os.path.splitext``); the rest of
    the filename is returned unchanged.
    """
    stem, _extension = os.path.splitext(fe.filename)
    return stem
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def format_text_report(result: AnalysisResult,
                       min_summary_area: float = 2.0,
                       hide_other_ions: bool = False) -> str:
    """Render the full plain-text report for one analysis result.

    Sections, in order: header with the file list, reaction summary
    (compounds grouped by trend, each group sorted by descending
    ``max_area``), warnings, and one detail section per compound (trend,
    UV data, ions, area% timeline).

    Args:
        result: Analysis result to render.
        min_summary_area: Compounds whose ``max_area`` is below this value
            are left out of the reaction summary and counted in a
            "not shown" note; they still appear in the detail section.
        hide_other_ions: When True, the "Other ions" list is omitted from
            every compound's detail section.

    Returns:
        The report as one newline-joined string.
    """
    import math  # imported once here instead of once per compound

    sep = "=" * 62
    rule = "-" * 62
    n_files = len(result.files)

    def _flags(index):
        """Status flags for the file at *index*."""
        flags = []
        if index in result.excluded_files:
            flags.append("EXCLUDED")
        if index in result.ambiguous_files:
            flags.append("ambiguous timing")
        return flags

    def _above_threshold(c):
        return c.max_area >= min_summary_area

    def _trend_group(trend):
        """Summary-worthy compounds with the given trend, largest first."""
        members = [c for c in result.compounds
                   if c.trend == trend and _above_threshold(c)]
        return sorted(members, key=lambda c: c.max_area, reverse=True)

    def _rank_label(ic):
        return "base peak" if ic.best_rank == 0 else f"rank {ic.best_rank + 1}"

    def _summary_line(c):
        """Format one compound entry (one or two lines) for the summary."""
        parts = [f" Compound {c.compound_id} \u2014 "
                 f"RT {c.canonical_rt:.2f} \u2014 {c.trend_detail}"]
        if c.recurring_ions:
            # Compact ion list, limited to the first 4 recurring ions.
            ions_str = ", ".join(
                f"{ic.mode} {ic.mean_mz:.1f}"
                for ic in c.recurring_ions[:4]
            )
            if len(c.recurring_ions) > 4:
                ions_str += f" (+{len(c.recurring_ions) - 4} more)"
            parts.append(f" Ions: {ions_str}")
        return "\n".join(parts)

    # ---- Header ----
    lines = [
        sep,
        "MULTI-LCMS ANALYSIS",
        sep,
        "",
        f"Instrument: {result.instrument}",
        f"Method: {result.method_short}",
        "",
        f"Files analyzed ({n_files}):",
    ]
    for i, fe in enumerate(result.files):
        flags = _flags(i)
        flag_str = f" ** {' | '.join(flags)} **" if flags else ""
        dt_str = f" ({fe.run_datetime})" if fe.run_datetime else ""
        lines.append(
            f" [{i + 1}] {fe.filename:<45s} {fe.category:<14s}"
            f"{dt_str}{flag_str}"
        )
    lines.append("")

    # ---- Reaction summary (at top) ----
    lines.extend([sep, "REACTION SUMMARY", sep, ""])

    summary_groups = (
        ("Increasing (likely product / intermediate):",
         _trend_group("increasing")),
        ("Decreasing (likely starting material / consumed):",
         _trend_group("decreasing")),
        ("Stable:", _trend_group("stable")),
    )
    for heading, members in summary_groups:
        if members:
            lines.append(heading)
            lines.extend(_summary_line(c) for c in members)
            lines.append("")

    hidden = [c for c in result.compounds if not _above_threshold(c)]
    if hidden:
        # ':g' keeps fractional thresholds visible (2.5 -> "2.5"); the
        # previous ':.0f' rounded them (2.5 -> "2", 0.5 -> "0").
        lines.append(f"({len(hidden)} minor compound(s) below "
                     f"{min_summary_area:g}% max area not shown "
                     f"in summary — see details below)")
        lines.append("")

    # ---- Warnings ----
    if result.warnings:
        lines.append("Warnings:")
        lines.extend(f" - {w}" for w in result.warnings)
        lines.append("")

    # ---- Detailed compound sections ----
    lines.extend([sep, "COMPOUND DETAILS", sep])

    for compound in result.compounds:
        lines.append("")
        lines.append(rule)
        lines.append(
            f"Compound {compound.compound_id} \u2014 "
            f"RT {compound.canonical_rt:.2f} min ({compound.trend})"
        )
        lines.append(rule)

        lines.append(f" Trend: {compound.trend_detail}")

        # UV ratio: an infinite ratio means nothing was seen at 254 nm.
        if compound.uv_ratio is not None:
            if compound.uv_ratio == float('inf'):
                lines.append(" 220:254: 220nm only (no 254nm absorption)")
            else:
                lines.append(f" 220:254: {compound.uv_ratio:.2f}")

        # UV lambda-max, rounded half-up to whole nanometres.
        if compound.uv_lambda_max:
            wl_str = ", ".join(
                str(math.floor(w + 0.5)) for w in compound.uv_lambda_max)
            lines.append(f" UV \u03bbmax: {wl_str} nm")

        if compound.recurring_ions:
            lines.append("")
            lines.append(" Recurring ions:")
            for ic in compound.recurring_ions:
                lines.append(
                    f" {ic.mode} {ic.mean_mz:.1f} "
                    f"(seen in {ic.occurrences}/{n_files} files, "
                    f"{_rank_label(ic)})"
                )

        # Other ions (hidden with --hide-other-ions)
        if not hide_other_ions and compound.other_ions:
            lines.append("")
            lines.append(" Other ions:")
            for ic in compound.other_ions:
                lines.append(
                    f" {ic.mode} {ic.mean_mz:.1f} "
                    f"(seen in {ic.occurrences} file, {_rank_label(ic)})"
                )

        # Area% timeline: one row per file, detected or not.
        lines.append("")
        lines.append(" Area% timeline:")
        for fi, fe in enumerate(result.files):
            flags = _flags(fi)
            excl = f" **{'|'.join(flags)}**" if flags else ""
            label = f" [{fi + 1}] {_file_label(fe):<40s} "
            if fi not in compound.area_pct_by_file:
                lines.append(f"{label} (not detected){excl}")
                continue

            tac = compound.area_pct_by_file[fi]
            rt = compound.rt_by_file.get(fi)
            a220 = compound.area_pct_220_by_file.get(fi)
            a254 = compound.area_pct_254_by_file.get(fi)
            det_parts = []
            if tac is not None:
                det_parts.append(f"TAC {tac:.1f}%")
            if a220 is not None:
                det_parts.append(f"220nm {a220:.1f}%")
            if a254 is not None:
                det_parts.append(f"254nm {a254:.1f}%")
            det_str = ", ".join(det_parts) if det_parts else "no data"
            rt_str = f"RT {rt:.2f}" if rt is not None else ""
            lines.append(f"{label}{rt_str} {det_str}{excl}")

    lines.append("")

    return "\n".join(lines)
|
|
887
|
+
|
|
888
|
+
|
|
889
|
+
def format_json_report(result: AnalysisResult) -> str:
    """Render the analysis result as an indented JSON document.

    Top-level keys: ``instrument``, ``method_short``, ``files``,
    ``compounds``, ``summary`` (compound ids grouped by trend),
    ``warnings``, and the file-index lists ``excluded_files`` and
    ``ambiguous_files``.

    Each compound carries a ``timeline`` with one entry per file index
    present in its ``area_pct_by_file`` mapping. An infinite ``uv_ratio`` is
    written as the string ``"inf"``.

    Args:
        result: Analysis result to serialise.

    Returns:
        JSON text (``indent=2``, non-ASCII characters kept as-is).
    """
    def _ion_dicts(ions):
        """Serialise a list of ion clusters."""
        return [
            {"mean_mz": ic.mean_mz, "mode": ic.mode,
             "occurrences": ic.occurrences, "best_rank": ic.best_rank}
            for ic in ions
        ]

    data = {
        "instrument": result.instrument,
        "method_short": result.method_short,
        "files": [],
        "compounds": [],
        "summary": {"increasing": [], "decreasing": [], "stable": []},
        "warnings": result.warnings,
        # Exclusion flags are emitted so a reloaded report can show them;
        # sets are not JSON-serialisable, hence the sorted lists.
        "excluded_files": sorted(
            getattr(result, "excluded_files", None) or ()),
        "ambiguous_files": sorted(
            getattr(result, "ambiguous_files", None) or ()),
    }

    for i, fe in enumerate(result.files):
        fd = {
            "index": i,
            "filename": fe.filename,
            "category": fe.category,
            "sort_key": fe.sort_key,
        }
        if fe.report:
            fd["sample_name"] = fe.report.sample_name
            fd["date"] = fe.report.date
        data["files"].append(fd)

    for c in result.compounds:
        cd = {
            "compound_id": c.compound_id,
            "canonical_rt": c.canonical_rt,
            "uv_ratio": c.uv_ratio if c.uv_ratio != float('inf') else "inf",
            "trend": c.trend,
            "trend_detail": c.trend_detail,
            "max_area": c.max_area,
            "uv_lambda_max": c.uv_lambda_max,
            "recurring_ions": _ion_dicts(c.recurring_ions),
            "other_ions": _ion_dicts(c.other_ions),
            "timeline": [
                {
                    "file_index": fi,
                    "rt": c.rt_by_file.get(fi),
                    "area_pct": c.area_pct_by_file.get(fi),
                    "area_220": c.area_220_by_file.get(fi),
                    "area_254": c.area_254_by_file.get(fi),
                    "area_pct_220": c.area_pct_220_by_file.get(fi),
                    "area_pct_254": c.area_pct_254_by_file.get(fi),
                }
                for fi in sorted(c.area_pct_by_file.keys())
            ],
        }
        data["compounds"].append(cd)

        # setdefault: a trend outside the three pre-seeded groups gets its
        # own list instead of raising KeyError.
        data["summary"].setdefault(c.trend, []).append(c.compound_id)

    return json.dumps(data, indent=2, ensure_ascii=False)
|
|
950
|
+
|
|
951
|
+
|
|
952
|
+
def load_analysis_from_json(json_path: str) -> AnalysisResult:
    """Reconstruct an AnalysisResult from a JSON file produced by format_json_report().

    This allows downstream tools (e.g. procedure_writer) to reuse pre-computed
    multi-LCMS analysis without re-parsing the original PDFs.

    The rebuilt FileEntry objects have an empty ``path`` and no parsed
    report; only filename, category and sort key are restored.

    Args:
        json_path: Path to the JSON file (read as UTF-8).

    Returns:
        An AnalysisResult with files, compounds, warnings and, when the
        document contains them, the excluded / ambiguous file index sets.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a required field (``filename``, ``category``,
            ``compound_id``, ``mean_mz``, ``mode``, ``file_index``) is missing.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    def _ion_clusters(items):
        """Rebuild IonCluster objects from their JSON dictionaries."""
        return [
            IonCluster(
                mean_mz=ic_data["mean_mz"],
                mode=ic_data["mode"],
                occurrences=ic_data.get("occurrences", 1),
                best_rank=ic_data.get("best_rank", 0),
            )
            for ic_data in items
        ]

    files = [
        FileEntry(
            path="",
            filename=fd["filename"],
            category=fd["category"],
            sort_key=fd.get("sort_key", 0),
        )
        for fd in data.get("files", [])
    ]

    compounds = []
    for cd in data.get("compounds", []):
        c = Compound(
            compound_id=cd["compound_id"],
            canonical_rt=cd.get("canonical_rt", 0.0),
        )
        # The writer stores an infinite ratio as the string "inf".
        uv_ratio = cd.get("uv_ratio")
        if uv_ratio == "inf":
            c.uv_ratio = float("inf")
        elif uv_ratio is not None:
            c.uv_ratio = uv_ratio
        c.trend = cd.get("trend", "stable")
        c.trend_detail = cd.get("trend_detail", "")
        c.max_area = cd.get("max_area", 0.0)
        c.uv_lambda_max = cd.get("uv_lambda_max", [])

        c.recurring_ions.extend(_ion_clusters(cd.get("recurring_ions", [])))
        c.other_ions.extend(_ion_clusters(cd.get("other_ions", [])))

        for te in cd.get("timeline", []):
            fi = te["file_index"]
            # The writer emits one timeline entry per key of
            # area_pct_by_file, so the key is restored even when the stored
            # value is null; dropping it would make the compound look
            # "not detected" in that file after a round trip.
            c.area_pct_by_file[fi] = te.get("area_pct")
            optional_fields = (
                ("rt", c.rt_by_file),
                ("area_220", c.area_220_by_file),
                ("area_254", c.area_254_by_file),
                ("area_pct_220", c.area_pct_220_by_file),
                ("area_pct_254", c.area_pct_254_by_file),
            )
            for json_key, target in optional_fields:
                value = te.get(json_key)
                if value is not None:
                    target[fi] = value

        compounds.append(c)

    return AnalysisResult(
        instrument=data.get("instrument", "Unknown"),
        method_short=data.get("method_short", "Unknown"),
        files=files,
        compounds=compounds,
        warnings=data.get("warnings", []),
        # Optional keys: documents that lack them load with empty sets.
        excluded_files=set(data.get("excluded_files", [])),
        ambiguous_files=set(data.get("ambiguous_files", [])),
    )
|
|
1026
|
+
|
|
1027
|
+
|
|
1028
|
+
# ---------------------------------------------------------------------------
|
|
1029
|
+
# Orchestration
|
|
1030
|
+
# ---------------------------------------------------------------------------
|
|
1031
|
+
|
|
1032
|
+
def analyze(files: List[FileEntry], rt_tol: float, mz_tol: float,
            trend_threshold: float,
            ignore_instrument: bool,
            use_run_time: bool = True,
            max_ion_rank: Optional[int] = None,
            pick_biggest_group: bool = False) -> List[AnalysisResult]:
    """
    Run the multi-file analysis, one pass per (instrument, method) group.

    Files without a parsed report are dropped. The remaining files are
    grouped, each group is sorted by ``sort_key``, screened for outliers,
    peak-matched, and summarised into one AnalysisResult.

    Args:
        files: Candidate files; only those with a non-None ``report`` are used.
        rt_tol: RT tolerance passed to peak matching.
        mz_tol: m/z tolerance passed to ion clustering.
        trend_threshold: Threshold passed to trend computation.
        ignore_instrument: When True, group by method only.
        use_run_time: Accepted for interface compatibility; not referenced
            in this function.
        max_ion_rank: Forwarded to ion clustering.
        pick_biggest_group: When True, keep only the largest group and
            record the files of every other group as discarded (with a
            warning). When False, every group yields its own result.

    Returns:
        One AnalysisResult per analysed group; an empty list when no file
        has a parsed report.
    """
    shared_warnings: List[str] = []
    discarded_all: List[FileEntry] = []

    parsed = [entry for entry in files if entry.report is not None]
    if not parsed:
        return []

    # --- Group by (instrument, method) ---
    groups: Dict[str, List[FileEntry]] = defaultdict(list)
    if ignore_instrument:
        # Legacy mode: instrument is ignored, method is still respected.
        for entry in parsed:
            meth = method_basename(entry.report.method_path)
            groups[f"all|{meth}"].append(entry)
        if len(groups) == 1:
            # Single method: collapse the key to plain "all".
            (only_group,) = groups.values()
            groups = {"all": only_group}
    else:
        for entry in parsed:
            inst = entry.report.instrument
            meth = method_basename(entry.report.method_path)
            groups[f"{inst}|{meth}"].append(entry)

    if len(groups) > 1:
        group_summaries = []
        largest_first = sorted(groups.items(), key=lambda item: -len(item[1]))
        for key, members in largest_first:
            inst_part, found, tail = key.partition("|")
            meth_part = tail if found else "?"
            lead_report = members[0].report
            # Prefer the human-readable method name when a report exists.
            meth_display = lead_report.method_short if lead_report else meth_part
            group_summaries.append(
                f"{inst_part}/{meth_display} ({len(members)} files)")
        shared_warnings.append(
            f"Files from {len(groups)} (instrument, method) groups: "
            + ", ".join(group_summaries) + "."
        )

    if pick_biggest_group:
        biggest_key = max(groups, key=lambda k: len(groups[k]))
        for key, members in groups.items():
            if key == biggest_key:
                continue
            discarded_all.extend(members)
            disc_names = ", ".join(entry.filename for entry in members)
            shared_warnings.append(
                f"Discarded {len(members)} file(s) from non-primary group "
                f"({key}): {disc_names}."
            )
        groups = {biggest_key: groups[biggest_key]}

    results = []
    for group_files in groups.values():
        group_warnings = []

        # sort_key is the single source of truth for file ordering.
        group_files.sort(key=lambda entry: entry.sort_key)
        n_group = len(group_files)

        # First pass: blank/outlier files and files with ambiguous timing.
        outliers, ambiguous = detect_outlier_files(group_files)
        if outliers:
            flagged = ", ".join(
                group_files[i].filename for i in sorted(outliers))
            group_warnings.append(
                f"Excluded {len(outliers)} file(s) as likely "
                f"blank/outlier: {flagged}."
            )
        if ambiguous:
            unclear = ", ".join(
                group_files[i].filename for i in sorted(ambiguous))
            group_warnings.append(
                f"Excluded {len(ambiguous)} file(s) with ambiguous "
                f"timing from trend analysis: {unclear}."
            )

        # Matching uses every file in the group, excluded ones included.
        compounds = match_peaks_across_files(group_files, rt_tol)

        # Canonical RT, UV ratio and ions are needed before the second pass.
        for compound in compounds:
            compute_canonical_rt(compound)
            _update_compound_uv_ratio(compound)
            cluster_ions(compound, mz_tol, n_group,
                         max_ion_rank=max_ion_rank)

        # Second pass: files where most significant compounds deviate.
        conservative = detect_outlier_files_conservative(
            group_files, compounds, outliers)
        if conservative:
            deviating = ", ".join(
                group_files[i].filename for i in sorted(conservative))
            group_warnings.append(
                f"Post-match outlier detection flagged "
                f"{len(conservative)} additional file(s) where majority "
                f"of compounds deviate: {deviating}."
            )
            outliers = outliers | conservative

        # Trends ignore both outliers and ambiguous-time files.
        trend_excluded = outliers | ambiguous
        for compound in compounds:
            compute_trend(compound, n_group, trend_threshold,
                          excluded_files=trend_excluded)
            _collect_uv_lambda_max(compound, group_files)

        # Order by canonical RT and renumber from 1 for display.
        compounds.sort(key=lambda c: c.canonical_rt)
        for new_id, compound in enumerate(compounds, 1):
            compound.compound_id = new_id

        first_report = group_files[0].report
        if first_report:
            instrument = first_report.instrument
            method_short = first_report.method_short
            meth_key = method_basename(first_report.method_path)
        else:
            instrument = "Unknown"
            method_short = "Unknown"
            meth_key = ""

        results.append(AnalysisResult(
            instrument=instrument,
            method_short=method_short,
            method_key=meth_key,
            files=group_files,
            compounds=compounds,
            warnings=shared_warnings + group_warnings,
            excluded_files=outliers,
            ambiguous_files=ambiguous,
            discarded_files=discarded_all,
        ))

    return results
|
|
1185
|
+
|
|
1186
|
+
|
|
1187
|
+
def _collect_uv_lambda_max(compound: Compound, files: List[FileEntry]):
    """Gather the compound's UV lambda-max values and merge near-duplicates.

    For each file in which the compound has a recorded RT (and the file has
    a report), the first peak within 0.01 of that RT contributes its
    ``uv_lambda_max`` values. The sorted values are then grouped so that
    each value within 10 of the previous one joins the same cluster, and
    the mean of every cluster is stored on ``compound.uv_lambda_max``
    (an empty list when nothing was collected).
    """
    wavelengths: List[float] = []
    for index, entry in enumerate(files):
        if index in compound.rt_by_file and entry.report is not None:
            expected_rt = compound.rt_by_file[index]
            # First peak in this file sitting at the matched RT, if any.
            matched = next(
                (pk for pk in entry.report.peaks
                 if abs(pk.rt - expected_rt) < 0.01),
                None,
            )
            if matched is not None:
                wavelengths.extend(matched.uv_lambda_max)

    # Chain sorted values: a gap of more than 10 starts a new cluster.
    clustered: List[List[float]] = []
    for wl in sorted(wavelengths):
        if clustered and wl - clustered[-1][-1] <= 10:
            clustered[-1].append(wl)
        else:
            clustered.append([wl])

    compound.uv_lambda_max = [sum(group) / len(group) for group in clustered]
|
|
1214
|
+
|
|
1215
|
+
# ---------------------------------------------------------------------------
|
|
1216
|
+
# CLI
|
|
1217
|
+
# ---------------------------------------------------------------------------
|
|
1218
|
+
|
|
1219
|
+
def _report_cli_error(code: str, detail: str, as_json: bool) -> None:
    """Print a fatal CLI error to stderr, as a JSON object when *as_json*."""
    if as_json:
        print(json.dumps({"error": code, "detail": detail}), file=sys.stderr)
    else:
        print(f"Error: {detail}", file=sys.stderr)


def _write_cli_output(text: str, output_path) -> None:
    """Write *text* to *output_path*, or to stdout as UTF-8 when unset."""
    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"Output written to {output_path}", file=sys.stderr)
    else:
        # Bytes are written directly so the report is always UTF-8 encoded.
        sys.stdout.buffer.write(text.encode('utf-8'))
        sys.stdout.buffer.write(b'\n')


def _join_json_reports(json_parts: List[str]) -> str:
    """Combine per-group JSON documents into a single valid JSON document.

    One document is returned unchanged; several are wrapped in a JSON array.
    """
    if not json_parts:
        return "[]"
    if len(json_parts) == 1:
        return json_parts[0]
    return "[" + ", ".join(json_parts) + "]"


def main(argv=None) -> int:
    """Command-line entry point for the multi-LCMS analyzer.

    Parses the given report files, orders them (by extracted run time by
    default, or by filename heuristics with ``--out-of-order`` or when a
    run time is missing), runs :func:`analyze`, and writes a text or JSON
    report to ``--output`` or stdout.  Progress and warnings go to stderr.

    When only one file could be parsed, a basic single-file report is
    written and the JSON options are not applied.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 on success, 1 when no file could be parsed or the analysis
        produced no results.
    """
    parser = argparse.ArgumentParser(
        description="Multi-LCMS Analyzer \u2014 collate peaks across "
                    "multiple LCMS files from the same reaction.\n"
                    "By default, files are sorted by their actual LCMS "
                    "acquisition time (extracted from PDF). Use "
                    "--out-of-order if samples were run non-chronologically.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('files', nargs='+',
                        help='MassLynx PDF report files')
    parser.add_argument('--rt-tolerance', type=float, default=LCMS_RT_TOLERANCE,
                        help='RT matching tolerance in minutes (default: %(default)s)')
    parser.add_argument('--mz-tolerance', type=float, default=LCMS_MZ_TOLERANCE,
                        help='m/z clustering tolerance in Da (default: %(default)s)')
    parser.add_argument('--trend-threshold', type=float, default=LCMS_TREND_THRESHOLD,
                        help='Trend change threshold as fraction '
                             '(default: %(default)s)')
    parser.add_argument('--ignore-instrument', action='store_true',
                        help='Analyze all files together regardless of '
                             'instrument')
    parser.add_argument('--out-of-order', action='store_true',
                        help='Use filename-heuristic sorting instead of '
                             'actual LCMS run time. Use this when samples '
                             'were not run in chronological order. Files '
                             'with ambiguous timing (e.g. "beforeadd") '
                             'will be excluded from trend analysis.')
    parser.add_argument('--min-summary-area', type=float, default=LCMS_MIN_SUMMARY_AREA,
                        help='Hide compounds below this max area%% from '
                             'the reaction summary (default: %(default)s)')
    parser.add_argument('--max-ion-rank', type=int, default=None,
                        help='Filter out "other ions" with rank >= this value '
                             '(0-based). E.g. --max-ion-rank 5 keeps only '
                             'ions ranked 1-5 in display. Default: no filter')
    parser.add_argument('--hide-other-ions', action='store_true',
                        help='Hide single-observation "other ions" from '
                             'compound details (shown by default)')
    parser.add_argument('--output', '-o', type=str, default=None,
                        help='Output file path (default: stdout)')
    parser.add_argument('--json', action='store_true',
                        help='Output as structured JSON')
    parser.add_argument('--json-output', type=str, default=None,
                        help='Save structured JSON to this file (in addition '
                             'to the normal text output)')
    parser.add_argument('--json-errors', action='store_true',
                        help='Output structured JSON error objects to stderr '
                             'on failure (for agent orchestration)')

    args = parser.parse_args(argv)

    use_run_time = not args.out_of_order

    # Filter out non-standard PDFs (manually integrated chromatograms etc.)
    valid_files = []
    for path in args.files:
        if is_waters_report(path):
            valid_files.append(path)
        else:
            print(f" Skipping non-standard PDF: {os.path.basename(path)}",
                  file=sys.stderr)

    # Parse all reports; a file that fails to parse is reported and skipped
    # so that one bad PDF does not abort the whole run.
    file_entries: List[FileEntry] = []
    for path in valid_files:
        filename = os.path.basename(path)
        category, sort_key = categorize_lcms_file(filename)
        ambiguous = sort_key in _AMBIGUOUS_SORT_KEYS
        try:
            report = parse_report(path)
            # Always try to extract run datetime (shown in output)
            run_dt = extract_run_datetime(path)
            fe = FileEntry(
                path=os.path.abspath(path),
                filename=filename,
                category=category,
                sort_key=sort_key,
                report=report,
                run_datetime=run_dt,
                # Only flag ambiguous when NOT using run time for sorting
                ambiguous_time=ambiguous and not use_run_time,
            )
            file_entries.append(fe)
            dt_info = f", run={run_dt}" if run_dt else ""
            amb_info = " [ambiguous]" if fe.ambiguous_time else ""
            print(f" Parsed: {filename} ({len(report.peaks)} peaks, "
                  f"{category}, sort={sort_key}{dt_info}{amb_info})",
                  file=sys.stderr)
        except Exception as e:
            print(f" Warning: Could not parse {filename}: {e}",
                  file=sys.stderr)

    if not file_entries:
        _report_cli_error("no_parseable_files", "No files could be parsed.",
                          args.json_errors)
        return 1

    # Single file — no cross-file analysis needed, output basic report
    if len(file_entries) == 1:
        print(" Single file — outputting basic report (no multi-file "
              "analysis).", file=sys.stderr)
        if args.json or args.json_output:
            # The basic report is text only; say so instead of silently
            # dropping the JSON request.
            print(" Note: --json/--json-output are ignored in single-file "
                  "mode.", file=sys.stderr)
        _write_cli_output(format_basic_report(file_entries[0].report),
                          args.output)
        return 0

    # Sort chronologically — by run_datetime (default) or sort_key (--out-of-order)
    if use_run_time:
        missing = [fe.filename for fe in file_entries
                   if fe.run_datetime is None]
        if not missing:
            file_entries.sort(key=lambda f: f.run_datetime)
            print(" Sorting by LCMS run time (default).", file=sys.stderr)
        else:
            print(f" Warning: Could not extract run time from: "
                  f"{', '.join(missing)}. Falling back to filename sort.",
                  file=sys.stderr)
            file_entries.sort(key=lambda f: f.sort_key)
            # The entries were built assuming run-time ordering, so their
            # ambiguity flags are all False.  Now that ordering relies on
            # filename heuristics, recompute the flags exactly as
            # --out-of-order would have set them.
            # NOTE(review): assumes FileEntry instances are mutable; also
            # confirm whether analyze() should receive use_run_time=False
            # in this fallback (it still receives the user's choice).
            flagged = []
            for fe in file_entries:
                fe.ambiguous_time = fe.sort_key in _AMBIGUOUS_SORT_KEYS
                if fe.ambiguous_time:
                    flagged.append(fe.filename)
            if flagged:
                print(f" Ambiguous timing (filename sort): "
                      f"{', '.join(flagged)}", file=sys.stderr)
    else:
        file_entries.sort(key=lambda f: f.sort_key)
        print(" Sorting by filename heuristics (--out-of-order).",
              file=sys.stderr)

    # Analyze
    results = analyze(file_entries, args.rt_tolerance, args.mz_tolerance,
                      args.trend_threshold, args.ignore_instrument,
                      use_run_time=use_run_time,
                      max_ion_rank=args.max_ion_rank)

    if not results:
        _report_cli_error("analysis_empty", "Analysis produced no results.",
                          args.json_errors)
        return 1

    # Format the JSON documents once; both the sidecar and --json use them.
    json_parts: List[str] = []
    if args.json or args.json_output:
        json_parts = [format_json_report(r) for r in results]

    # Save JSON sidecar if requested (for downstream reuse by procedure_writer)
    if args.json_output:
        with open(args.json_output, 'w', encoding='utf-8') as f:
            f.write(_join_json_reports(json_parts))
        print(f"JSON saved to {args.json_output}", file=sys.stderr)

    if args.json:
        # Several result groups are emitted as one JSON array, matching the
        # sidecar, so the output is always a single parseable document.
        output = _join_json_reports(json_parts)
    else:
        # Check if exclusions reduced any group to ≤1 effective file
        # If so, fall back to basic single-file reports for those files
        output_parts = []
        for result in results:
            effective_count = len(result.files) - len(result.excluded_files)
            if effective_count <= 1:
                # Exclusions left ≤1 file — output basic report(s) instead
                print(f" {len(result.excluded_files)} of {len(result.files)} "
                      f"files excluded — falling back to single-file report(s).",
                      file=sys.stderr)
                for i, fe in enumerate(result.files):
                    if i not in result.excluded_files and fe.report:
                        output_parts.append(format_basic_report(fe.report))
            else:
                output_parts.append(format_text_report(
                    result,
                    min_summary_area=args.min_summary_area,
                    hide_other_ions=args.hide_other_ions,
                ))
        output = "\n\n".join(output_parts)

    _write_cli_output(output, args.output)

    return 0
|
|
1409
|
+
|
|
1410
|
+
|
|
1411
|
+
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|