cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,928 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
LCMS File Categorizer
|
|
4
|
+
|
|
5
|
+
Categorizes LCMS PDF filenames into experiment phases: tracking, workup,
|
|
6
|
+
purification, final, reference. Two APIs:
|
|
7
|
+
|
|
8
|
+
- categorize_lcms_file(filename) — simple per-file categorization
|
|
9
|
+
- categorize_lcms_files_batch(filenames, experiment_id) — context-aware
|
|
10
|
+
batch categorization with prefix-based tracking groups, modifier
|
|
11
|
+
stripping, special file filtering, and hybrid sort key calibration
|
|
12
|
+
|
|
13
|
+
Pure string-processing engine — no PDF parsing, no external dependencies
|
|
14
|
+
beyond stdlib.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
import re
|
|
19
|
+
from collections import defaultdict
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from statistics import median
|
|
22
|
+
from typing import List, Optional, Dict, Tuple
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Simple file categorization (per-file, no cross-file context)
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
def categorize_lcms_file(filename: str) -> Tuple[str, float]:
    """
    Categorize a single LCMS file and return (category, sort_key).

    Categories: "tracking", "workup", "purification", "final", "reference"

    The filename is passed through _strip_modifiers (which drops the file
    extension and trailing analytical modifiers such as -re, -AmB, -W9),
    the experiment ID is removed with _extract_suffix, and the remaining
    suffix is handed to _categorize_suffix. No cross-file context is used.

    Args:
        filename: LCMS report filename.

    Returns:
        The (category, sort_key) tuple produced by _categorize_suffix.
    """
    # _strip_modifiers has already removed the file extension, so its result
    # is used as-is. Applying os.path.splitext a second time would treat any
    # dot still present in the name (e.g. "...-2.5eq") as an extension and
    # silently truncate the suffix before it is categorized.
    stripped_base, _mods = _strip_modifiers(filename)
    experiment_id = _extract_experiment_id(filename)
    suffix = _extract_suffix(stripped_base, experiment_id)

    # Without batch context, assume explicit time tokens exist elsewhere in
    # the experiment (conservative: bare tNN is treated as a purification
    # fraction).
    return _categorize_suffix(suffix, has_explicit_time=True)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Ambiguous sort keys: files whose position in the timeline can't be
|
|
52
|
+
# reliably determined from the filename alone.
|
|
53
|
+
_AMBIGUOUS_SORT_KEYS = {500}  # 500 is the sort key labelled "beforeadd"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
# Batch file categorization (v2 — prefix-based grouping)
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
@dataclass
class FileModifiers:
    """Analytical modifiers removed from a filename ahead of categorization.

    Attributes:
        rerun_count: Number of rerun markers (-re gives 1, -rere gives 2).
        duplicate_num: Trailing duplicate counter, e.g. "(2)" gives 2.
        method_variant: Buffer method tag (AmB, AmF, AmBfoc, AmFfoc).
        method_program: Method program tag (W1, W9, W13, W17, W19, ...).
        long_method: True when a -long suffix was present.
        concentrated: True when a -conc suffix was present.
        focused: True when a -focus / -foc suffix was present.
    """

    rerun_count: int = 0
    duplicate_num: Optional[int] = None
    method_variant: Optional[str] = None
    method_program: Optional[str] = None
    long_method: bool = False
    concentrated: bool = False
    focused: bool = False
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass
class TrackingGroup:
    """Tracking files that share one common prefix.

    Attributes:
        prefix: The shared prefix, e.g. "", "add50mgDEAD", "70C".
        files: (filename, time_in_minutes) pairs belonging to the group.
        offset: Calibrated offset for the group's sort keys.
    """

    prefix: str
    files: List[Tuple[str, float]]
    offset: float = 0.0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class FileClassification:
    """Outcome of classifying a single LCMS file.

    Attributes:
        category: One of tracking, workup, purification, final, reference.
        sort_key: Numeric key used to order the file.
        modifiers: Modifiers that were stripped from the filename.
        group_prefix: Tracking group prefix (tracking files only).
        temperature: Parsed temperature in Celsius, when one was found.
    """

    category: str
    sort_key: float
    modifiers: FileModifiers
    group_prefix: Optional[str] = None
    temperature: Optional[float] = None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass
class BatchResult:
    """Batch categorization output for a single experiment.

    Attributes:
        experiment_id: Identifier of the experiment the files belong to.
        files: Mapping of filename to its classification.
        tracking_groups: Tracking groups found in the batch.
        filtered_files: Special files (MS, LC, etc.) that were filtered out.
        has_final: Defaults to False.
    """

    experiment_id: str
    files: Dict[str, FileClassification]
    tracking_groups: List[TrackingGroup]
    filtered_files: List[str]
    has_final: bool = False
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# --- Modifier stripping ---
|
|
101
|
+
|
|
102
|
+
def _strip_modifiers(filename: str) -> Tuple[str, FileModifiers]:
    """Peel trailing analytical modifiers off a filename.

    The extension is removed first. Each modifier kind is then looked for
    exactly once at the end of what remains, in this fixed order: duplicate
    number, rerun marker, focus, concentration, long method, method program,
    buffer method.

    Returns:
        (name without extension and without the recognized trailing
        modifiers, FileModifiers describing what was removed)
    """
    stem = os.path.splitext(filename)[0]
    info = FileModifiers()

    # 1. Duplicate counter: " (2)" or "(2)" at the very end
    dup = re.search(r'\s*\((\d+)\)\s*$', stem)
    if dup:
        info.duplicate_num = int(dup.group(1))
        stem = stem[:dup.start()]

    # 2. Rerun marker; the first matching form wins, -rere is tried before
    #    -re. Each entry: (pattern, flags, rerun count, characters to drop).
    rerun_forms = (
        (r'-[Rr][Ee][Rr][Ee]$', 0, 2, 5),
        (r'-[Rr][Ee]$', 0, 1, 3),
        (r'-rerun$', re.IGNORECASE, 1, 6),
    )
    for pattern, flags, count, width in rerun_forms:
        if re.search(pattern, stem, flags):
            info.rerun_count = count
            stem = stem[:-width]
            break

    # 3. -focus / -foc
    foc = re.search(r'-foc(?:us)?$', stem, re.IGNORECASE)
    if foc:
        info.focused = True
        stem = stem[:foc.start()]

    # 4. -conc, then 5. -long (both are five characters wide)
    for pattern, flag_name in ((r'-conc$', 'concentrated'),
                               (r'-long$', 'long_method')):
        if re.search(pattern, stem, re.IGNORECASE):
            setattr(info, flag_name, True)
            stem = stem[:-5]

    # 6. Method program: -W1, -W9, -W13, ... (stored upper-cased)
    program = re.search(r'-(W\d+)$', stem, re.IGNORECASE)
    if program:
        info.method_program = program.group(1).upper()
        stem = stem[:program.start()]

    # 7. Buffer method: -AmB, -AmF, -AmBfoc, -AmFfoc (stored as written).
    #    The fused "...foc" forms are captured whole here; step 3 only
    #    matches a dash-separated -foc, so `focused` stays False for them.
    variant = re.search(r'-(Am[BF](?:foc)?)$', stem, re.IGNORECASE)
    if variant:
        info.method_variant = variant.group(1)
        stem = stem[:variant.start()]

    return stem, info
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# --- Special file detection ---
|
|
158
|
+
|
|
159
|
+
# Trailing tags that mark a file as something other than a standard report.
_SPECIAL_SUFFIXES_RE = re.compile(
    r'(?:' + '|'.join((
        r'-MS',
        r'-LC(?:-COPY)?',
        r'-LCtrace',
        r'-UV',
        r'-manint',
        r'-landscape',
        r'-int',  # integration screenshot
    )) + r')$',
    re.IGNORECASE,
)


def _is_special_file(cleaned_base: str) -> bool:
    """Tell whether the name ends in a non-standard report tag (MS-only, LC-only, etc.)."""
    return _SPECIAL_SUFFIXES_RE.search(cleaned_base) is not None
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# --- Experiment ID / suffix extraction ---
|
|
179
|
+
|
|
180
|
+
def _extract_experiment_id(filename: str) -> str:
    """Return the upper-cased KL-NNNN-NNN experiment ID that starts a filename.

    The extension and a trailing "(N)" duplicate marker are dropped first.
    When the name does not start with a KL-digits-digits ID, the whole
    remaining name is returned upper-cased instead.
    """
    stem, _ext = os.path.splitext(filename)
    stem = re.sub(r'\s*\(\d+\)\s*$', '', stem)
    found = re.match(r'(KL-\d+-\d+)', stem, re.IGNORECASE)
    return (found.group(1) if found else stem).upper()
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _extract_suffix(cleaned_base: str, experiment_id: str) -> str:
    """Return what follows the experiment ID in a cleaned filename.

    The ID is compared case-insensitively against the start of
    ``cleaned_base``. When it matches, the ID and every separator character
    directly after it (dash, space, underscore, in any order or mix) are
    removed. When it does not match, ``cleaned_base`` is returned unchanged.

    Args:
        cleaned_base: Filename without extension and trailing modifiers.
        experiment_id: Experiment ID expected at the start of the name.

    Returns:
        The suffix after the ID, or ``cleaned_base`` if the ID is absent.
    """
    id_len = len(experiment_id)
    if cleaned_base[:id_len].upper() != experiment_id.upper():
        return cleaned_base
    # Strip all three separators in a single pass. Chaining one lstrip per
    # character only handles them in one fixed order, so " -30min" or
    # "_-NP-t13" would otherwise keep a leading dash.
    return cleaned_base[id_len:].lstrip('- _')
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# --- Time token extraction ---
|
|
204
|
+
|
|
205
|
+
def _extract_time_token(suffix: str) -> Optional[Tuple[float, int, int]]:
    """
    Locate the most specific time token inside a suffix.

    Every known time notation is scanned for and each hit is recorded as
    (start, end, minutes). The hit with the longest span wins; ties go to
    the rightmost start, then to the notation that was scanned first.

    Returns:
        (time_in_minutes, token_start, token_end), or None when no time
        token is present. For temperature-then-time tokens (80C2h)
        token_start is where the time part begins, so the temperature stays
        in the text before token_start. For time-then-temperature tokens
        (30min80C) token_start is the start of the whole token.
    """
    hits = []  # (start, end, minutes), in scan order

    # Small factories turning a regex match into a minute count.
    def fixed(value):
        return lambda found: value

    def minutes_of(group):
        return lambda found: float(found.group(group))

    def negative_minutes_of(group):
        return lambda found: -float(found.group(group))

    def hours_of(group):
        return lambda found: float(found.group(group)) * 60

    def hours_minutes_of(hour_group, minute_group):
        return lambda found: (float(found.group(hour_group)) * 60
                              + float(found.group(minute_group)))

    def scan(pattern, to_minutes, flags=re.IGNORECASE, start_group=0,
             skip_after_temp=False):
        """Record every match of *pattern*; the span starts at *start_group*."""
        for found in re.finditer(pattern, suffix, flags):
            if skip_after_temp and _preceded_by_temp(suffix, found.start()):
                continue
            hits.append((found.start(start_group), found.end(),
                         to_minutes(found)))

    # --- Temperature followed by time (span starts at the time part) ---
    # ON / OWE are matched case-sensitively (flags=0).
    scan(r'(\d{2,3})C(ON)\b', fixed(960.0), flags=0, start_group=2)      # 40CON
    scan(r'(\d{2,3})C-(ON)\b', fixed(960.0), flags=0, start_group=2)     # 65C-ON
    scan(r'(\d{2,3})C-?(OWE)\b', fixed(2880.0), flags=0, start_group=2)  # 130C-OWE
    scan(r'(\d{2,3})C-?(\d+)h(\d+)\s*m(?:in)?', hours_minutes_of(2, 3),
         start_group=2)                                                  # 40C1h25min
    scan(r'(\d{2,3})C-?(\d+)h(?!\d)', hours_of(2), start_group=2)        # 80C8h, 80C-2h
    scan(r'(\d{2,3})C-?(\d+)\s*min', minutes_of(2), start_group=2)       # 100C30min
    scan(r'(\d{2,3})C-?(\d+)m\b', minutes_of(2), start_group=2)          # 50C12m

    # --- Time followed by temperature (span starts at the time) ---
    scan(r'(\d+)\s*min(\d{2,3})C', minutes_of(1))                        # 30min80C
    scan(r'(\d+)m(\d{2,3})C', minutes_of(1))                             # 90m70C
    scan(r'(\d+)h(\d{2,3})C', hours_of(1))                               # 9h125C
    scan(r'(\d+)h(\d+)m(\d{2,3})C', hours_minutes_of(1, 2))              # 1h30m80C

    # --- premix: recorded as a negative time ---
    scan(r'premix-?(\d+)\s*min', negative_minutes_of(1))                 # premix10min
    scan(r'premix-?(\d+)\s*m\b', negative_minutes_of(1))
    # Bare "premix" counts as -10 only when no premix<digits> form exists.
    bare_premix = re.search(r'\bpremix\b', suffix, re.IGNORECASE)
    if bare_premix and not re.search(r'premix-?\d+', suffix, re.IGNORECASE):
        hits.append((bare_premix.start(), bare_premix.end(), -10.0))

    # --- Standalone time tokens; skipped when an NNC temperature sits
    #     directly in front (those were handled by the combined forms) ---
    scan(r'(\d+)h(\d+)\s*min', hours_minutes_of(1, 2),
         skip_after_temp=True)                                           # 1h30min
    scan(r'(\d+)h(\d+)m(?![0-9])', hours_minutes_of(1, 2),
         skip_after_temp=True)                                           # 1h27m, 1h50mrerun
    # Upper-case ON / OWE not followed by another upper-case letter, so
    # airdryON and ONrecheck match while ONCE / ONLY do not.
    scan(r'ON(?![A-Z])', fixed(960.0), flags=0, skip_after_temp=True)
    scan(r'OWE(?![A-Z])', fixed(2880.0), flags=0, skip_after_temp=True)
    scan(r'(\d+)h(?!\d)', hours_of(1), skip_after_temp=True)             # 1h, 16h
    scan(r'(\d+)\s*min(?:s)?', minutes_of(1), skip_after_temp=True)      # 30min
    # Nm is rejected only before "in", "m" or "ol" (min, mm, mol).
    # NOTE(review): this still accepts e.g. "50mg" as 50 minutes — confirm
    # that is intended.
    scan(r'(\d+)m(?!in|m|ol)', minutes_of(1), skip_after_temp=True)      # 90m, 30mrt

    # --- Spelled-out durations ---
    scan(r'\bonehour\b', fixed(60.0))
    scan(r'\bovernight\b', fixed(960.0))

    if not hits:
        return None

    # Longest span first, then rightmost start; min() keeps the earliest
    # recorded hit among exact ties.
    start, end, minutes = min(hits, key=lambda hit: (hit[0] - hit[1], -hit[0]))
    return (minutes, start, end)
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _preceded_by_temp(suffix: str, pos: int) -> bool:
    """Tell whether the text ending at *pos* finishes with an NNC temperature.

    A temperature here is two or three digits followed by "C" (any case),
    optionally followed by a single dash.
    """
    return re.search(r'\d{2,3}C-?$', suffix[:pos], re.IGNORECASE) is not None
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _extract_temperature(suffix: str) -> Optional[float]:
    """Extract a temperature in Celsius from the suffix, if one is present.

    A temperature is the first run of two or three digits directly followed
    by an upper-case "C". The run must not be preceded by "t"/"T", nor by
    another digit.

    Returns:
        The temperature as a float, or None when no temperature is found.
    """
    # The digit in the lookbehind stops the regex from re-anchoring inside a
    # number whose first digit was rejected: with only [tT] excluded,
    # "t100C" matched "00C" (0.0) and "t180C" matched "80C" (80.0).
    m = re.search(r'(?<![tT\d])(\d{2,3})C', suffix)
    return float(m.group(1)) if m else None
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
# --- Categorization logic ---
|
|
359
|
+
|
|
360
|
+
# Final product patterns
|
|
361
|
+
# Matched anywhere in the suffix (case-insensitive).
_FINAL_RE = re.compile(
    '(?:' + '|'.join((
        r'purified',   # nppurified, rppurified, THFRPpurified, c18purified
        r'lyo',        # lyo, repurlyo, lyotwice, rerelyo
        r'verify',     # verify, AmBverify
        r'prodchk',    # product check
        # NMR / QC tags must not have a letter on either side
        r'(?:^|[^a-zA-Z])(?:NMRsample|NMRsamp|QC|NMR)(?:[^a-zA-Z]|$)',
    )) + ')',
    re.IGNORECASE,
)

# Matched only when the whole suffix is exactly one of these words.
_FINAL_STANDALONE_RE = re.compile(
    '^(?:' + '|'.join((
        r'NMR',
        r'final',
        r'finalNMR',
        r'finalvial',
        r'prod',
        r'final\d?',
    )) + ')$',
    re.IGNORECASE,
)
|
|
375
|
+
|
|
376
|
+
# Purification tube number: optional prefix + tNN or tNNtoNN
|
|
377
|
+
# Prefix that may precede a tube number; anchored at the start of the suffix
# and followed by an optional dash. The order of alternatives is significant
# (the regex takes the first one that matches).
_PURIF_PREFIX_RE = re.compile(
    '^(?:' + '|'.join((
        r'NP\d?', r'RP\d?', r'C18(?:-\d)?', r'prep\d?', r'col\d?', r'I\d',
        r'scout', r'THF(?:RP)?',
        r'THFrecov', r'EArecov', r'recol', r'scavNP', r'KADrecov',
        r'final\d?', r'recov(?:NP)?',
        r'MeCN(?:col)?', r'actual', r'firstinj', r'prevbatch',
        r'fchk\d?', r'meohtest',         # fraction check, MeOH test
        r'p\d', r'v\d', r'vial\d',       # p1-, v1-, vial1- column/vial prefixes
        r'first', r'second',             # first/second injection
        r'step\d',                       # step1, step2 purification steps
    )) + ')-?',
    re.IGNORECASE,
)

# Tube number tNN, optionally a range tNNtoNN.
_TUBE_NUM_RE = re.compile(r't(\d+)(?:to(\d+))?', flags=re.IGNORECASE)
|
|
390
|
+
|
|
391
|
+
# Purification keywords (not tube numbers)
|
|
392
|
+
_PURIF_KEYWORDS = {
    # *comb* words
    'comb', 'combed', 'peakcomb', 'npcomb', 'npcombed', 'rpcomb',
    'rprecomb', 'c18comb',
    # *load* words
    'colload', 'load', 'loading', 'tload', 'c18load', 'reload',
    # flush / tails / minor
    'flush', 'tflush', 'tail', 'tails', 'nptails', 'npminor',
    # repurification
    'repur', 'repurified',
    # NOTE(review): these two contain '-', so they cannot equal any piece
    # obtained by splitting a suffix on [-_\s] as _categorize_suffix does;
    # confirm whether another lookup relies on them.
    'thfcol-comb', 'meccol-i1to4comb',
    'impfrac', 'onetube',
    # tube end/last/blob
    'tend', 'tblob', 'tlast',
    # fraction check
    'fchk', 'fchk1', 'fchk2',
}

# Workup keywords
_WORKUP_KEYWORDS = {
    'crude', 'cr',
    'extract', 'ext',
    'wash', 'washed', 'washing', 'rewash', 'rewashed',
    'aq', 'org', 'brine',
    'dried', 'driedonce', 'redried', 'combdried',
    'filter', 'filtered', 'filtrate', 'fil', 'filtersolid', 'nofil',
    'silfil', 'cefil',
    'pellet', 'pel', 'super', 'ppt',
    'quench', 'quenched',
    'rotovap', 'rotatrap', 'rota',
    'slurry', 'recryst', 'workup',
    'or',
}
|
|
416
|
+
|
|
417
|
+
# Reference patterns (starting material checks, not reaction monitoring)
|
|
418
|
+
# Anchored at the start of the suffix; the keyword must be followed by the
# end of the suffix or by a dash. Alternative order is kept as-is.
_REFERENCE_RE = re.compile(
    '^(?:' + '|'.join((
        r'SM\d?', r'RAE', r'RAESM', r'ArI', r'ArBr', r'ArBrSM', r'ArISM',
        r'aniline', r'chloride', r'SMchloride',
        r'TP-SM', r'Clref', r'tolref', r'SMPDref', r'SMPDCT', r'DDQSM',
        r'chlorideSM', r'SManiline',
        r'X\d{3}', r'E\d{3}', r'INT\d+', r'KADDP', r'SMwith\w+',
        r'spiking', r'SMcheck', r'SMchk', r'SMconfirm', r'SMrecov',
        r'SM-verify',
        r'aminopySM', r'AmPySM', r'smmix', r'smix', r'smrtmix', r'SMS',
        r'byprod', r'ref\d?',
    )) + ')(?:$|-)',
    re.IGNORECASE,
)

# Additional reference patterns: NpNF fluorine equivalents (titration
# experiments), e.g. 1p55F, p28Fmol; the whole suffix must match.
_FLUORINE_EQUIV_RE = re.compile(r'^(?:\d*p?\d+F(?:mol)?|\d+F)$',
                                flags=re.IGNORECASE)
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def _categorize_suffix(suffix: str, has_explicit_time: bool) -> Tuple[str, float]:
|
|
437
|
+
"""
|
|
438
|
+
Categorize a single file's suffix.
|
|
439
|
+
|
|
440
|
+
Args:
|
|
441
|
+
suffix: The cleaned suffix (modifiers stripped, experiment ID removed).
|
|
442
|
+
has_explicit_time: True if other files in the experiment have
|
|
443
|
+
explicit time tokens (Nmin, Nh, ON, OWE — not tNN).
|
|
444
|
+
|
|
445
|
+
Returns:
|
|
446
|
+
(category, preliminary_sort_key)
|
|
447
|
+
"""
|
|
448
|
+
if not suffix:
|
|
449
|
+
return 'tracking', 100
|
|
450
|
+
|
|
451
|
+
lower = suffix.lower()
|
|
452
|
+
|
|
453
|
+
# --- Priority 0.5: "final-IN-tNN" purification tube fractions ---
|
|
454
|
+
# Must come before the final product check since "final" is a prefix here
|
|
455
|
+
if re.match(r'final\d?-(?:I\d|i\d)', suffix, re.IGNORECASE):
|
|
456
|
+
return 'purification', 3000
|
|
457
|
+
|
|
458
|
+
# --- Priority 1: Final product ---
|
|
459
|
+
if _FINAL_RE.search(suffix) or _FINAL_STANDALONE_RE.match(suffix):
|
|
460
|
+
# Exception: "crude-NMR" or "crude-NMRsample" is workup
|
|
461
|
+
if 'crude' in lower:
|
|
462
|
+
return 'workup', 2000
|
|
463
|
+
# Exception: method-prefix + "purified" = purification, not final
|
|
464
|
+
# NPpurified, RPpurified, C18purified, THFRPpurified, scavNPpurified
|
|
465
|
+
if re.match(r'^(?:NP|RP|C18|THFRP|THF|scavNP|col)\d?-?purified',
|
|
466
|
+
suffix, re.IGNORECASE):
|
|
467
|
+
return 'purification', 3000
|
|
468
|
+
return 'final', 9000
|
|
469
|
+
|
|
470
|
+
# --- Priority 2: Purification ---
|
|
471
|
+
# Check for tube numbers: [prefix]-tNN[toNN]
|
|
472
|
+
# First strip any purification prefix to find the tNN part
|
|
473
|
+
test_suffix = suffix
|
|
474
|
+
purif_prefix_match = _PURIF_PREFIX_RE.match(suffix)
|
|
475
|
+
if purif_prefix_match:
|
|
476
|
+
test_suffix = suffix[purif_prefix_match.end():]
|
|
477
|
+
|
|
478
|
+
# Recursively strip purification prefixes (p1-c18-t16, final-I1-t4, etc.)
|
|
479
|
+
for _ in range(3): # max 3 levels of nesting
|
|
480
|
+
new_match = _PURIF_PREFIX_RE.match(test_suffix)
|
|
481
|
+
if new_match:
|
|
482
|
+
test_suffix = test_suffix[new_match.end():]
|
|
483
|
+
else:
|
|
484
|
+
break
|
|
485
|
+
|
|
486
|
+
# Also strip inline injection number: I1-, I2-, I1t, etc.
|
|
487
|
+
inj_match = re.match(r'I\d+-?', test_suffix)
|
|
488
|
+
if inj_match:
|
|
489
|
+
test_suffix = test_suffix[inj_match.end():]
|
|
490
|
+
|
|
491
|
+
# Also handle leading dash: -t12 → strip dash
|
|
492
|
+
if test_suffix.startswith('-'):
|
|
493
|
+
test_suffix = test_suffix[1:]
|
|
494
|
+
|
|
495
|
+
tube_match = _TUBE_NUM_RE.match(test_suffix)
|
|
496
|
+
# Also try bare numbers after purification prefix (first-17, vial2-24)
|
|
497
|
+
if not tube_match and purif_prefix_match and re.match(r'^\d+(?:to\d+)?$', test_suffix):
|
|
498
|
+
tube_num = int(re.match(r'^(\d+)', test_suffix).group(1))
|
|
499
|
+
return 'purification', 3000 + tube_num
|
|
500
|
+
|
|
501
|
+
if tube_match:
|
|
502
|
+
has_purif_prefix = purif_prefix_match is not None and purif_prefix_match.end() > 0
|
|
503
|
+
tube_num = int(tube_match.group(1))
|
|
504
|
+
|
|
505
|
+
if has_purif_prefix:
|
|
506
|
+
# NP-t13, RP-t25, C18-t79 — always purification
|
|
507
|
+
return 'purification', 3000 + tube_num
|
|
508
|
+
elif has_explicit_time:
|
|
509
|
+
# Bare tNN in an experiment with explicit time tokens
|
|
510
|
+
# High tube numbers (>30) are almost certainly purification
|
|
511
|
+
# Low numbers are ambiguous but still likely purification if
|
|
512
|
+
# explicit times exist alongside
|
|
513
|
+
return 'purification', 3000 + tube_num
|
|
514
|
+
else:
|
|
515
|
+
# Bare tNN, no explicit time tokens — ambiguous
|
|
516
|
+
# Likely purification (tube numbers rarely used for tracking)
|
|
517
|
+
return 'purification', 3000 + tube_num
|
|
518
|
+
|
|
519
|
+
# Purification keywords
|
|
520
|
+
# Split on - and check each part
|
|
521
|
+
parts_lower = set(re.split(r'[-_\s]', lower))
|
|
522
|
+
if parts_lower & _PURIF_KEYWORDS:
|
|
523
|
+
return 'purification', 3000
|
|
524
|
+
|
|
525
|
+
# More flexible purification keyword match (substring)
|
|
526
|
+
if any(kw in lower for kw in ['peakcomb', 'peak-comb', 'colload',
|
|
527
|
+
'rpload', 'npcomb', 'npcombed',
|
|
528
|
+
'rpcomb', 'c18comb', 'c18load',
|
|
529
|
+
'tpeakcomb',
|
|
530
|
+
'c18repur', 'rprepur',
|
|
531
|
+
'meohwash', # column wash
|
|
532
|
+
'impfrac', 'reload',
|
|
533
|
+
'loading',
|
|
534
|
+
'mecncol', 'mecnrecov',
|
|
535
|
+
'purefracs', 'lesspure', 'morepure',
|
|
536
|
+
'combed',
|
|
537
|
+
]):
|
|
538
|
+
return 'purification', 3000
|
|
539
|
+
|
|
540
|
+
# Any suffix starting with a purification method prefix (C18, NP, RP,
|
|
541
|
+
# etc.) is purification — even if the remainder looks like workup
|
|
542
|
+
# (e.g. C18-DCMext = DCM extraction from C18 eluate, still purification;
|
|
543
|
+
# NP-dried = dried NP fractions, still purification).
|
|
544
|
+
if purif_prefix_match and purif_prefix_match.end() > 0:
|
|
545
|
+
# A known purification method prefix was matched
|
|
546
|
+
return 'purification', 3000
|
|
547
|
+
|
|
548
|
+
# RN-tNN patterns (R2t5 = round 2, tube 5)
|
|
549
|
+
if re.match(r'^R\d+t\d+', suffix, re.IGNORECASE):
|
|
550
|
+
return 'purification', 3000
|
|
551
|
+
|
|
552
|
+
# Bare tube range: NNtoNN (39to41, 46to49)
|
|
553
|
+
if re.match(r'^\d+to\d+$', suffix, re.IGNORECASE):
|
|
554
|
+
return 'purification', 3000
|
|
555
|
+
|
|
556
|
+
# peak1, peak2 — purification peak fractions
|
|
557
|
+
if re.match(r'^peak\d+$', lower):
|
|
558
|
+
return 'purification', 3000
|
|
559
|
+
|
|
560
|
+
# --- Priority 3: Workup ---
|
|
561
|
+
if parts_lower & _WORKUP_KEYWORDS:
|
|
562
|
+
return 'workup', 2000
|
|
563
|
+
|
|
564
|
+
# More flexible workup match (substring for compound words)
|
|
565
|
+
if any(kw in lower for kw in ['crude', 'washed', 'dried', 'quench',
|
|
566
|
+
'silfil', 'cefil', 'rotovap',
|
|
567
|
+
'rotatrap', 'workup', 'extraction',
|
|
568
|
+
'filtrate', 'pellet', 'super',
|
|
569
|
+
'slurry', 'recryst',
|
|
570
|
+
'dcmext', # DCM extraction
|
|
571
|
+
'ipawash', # IPA wash
|
|
572
|
+
'hclwash', # HCl wash
|
|
573
|
+
'washing', # washing steps
|
|
574
|
+
'orgph', # organic phase
|
|
575
|
+
'orgwash', # organic wash
|
|
576
|
+
'nahco3wu', 'naohwu', 'waterwu', # wu = workup
|
|
577
|
+
'bicarbwash', 'naohwash',
|
|
578
|
+
'syrfilter', # syringe filter
|
|
579
|
+
'eaext', # EA extraction
|
|
580
|
+
'rotaeaext', # rotavap + EA extraction
|
|
581
|
+
'wutest', # workup test
|
|
582
|
+
'mainpeak', # mainpeakhclwash etc.
|
|
583
|
+
'dmsodil', # DMSO dilution
|
|
584
|
+
'b4rota', # before rotavap
|
|
585
|
+
]):
|
|
586
|
+
return 'workup', 2000
|
|
587
|
+
|
|
588
|
+
# Standalone workup: aq1, aq2, org1, Naq, Ncr, Norgwash, etc.
|
|
589
|
+
if re.match(r'^(?:aq|org)\d*$', lower):
|
|
590
|
+
return 'workup', 2000
|
|
591
|
+
# Numbered workup: 1orgwash, 2aq, 3Cr
|
|
592
|
+
if re.match(r'^\d+(?:aq|org|cr|wash|ext)', lower):
|
|
593
|
+
return 'workup', 2000
|
|
594
|
+
# cent (centrifuge)
|
|
595
|
+
if lower == 'cent':
|
|
596
|
+
return 'workup', 2000
|
|
597
|
+
|
|
598
|
+
# --- Priority 4: Tracking (files with time tokens) ---
|
|
599
|
+
time_result = _extract_time_token(suffix)
|
|
600
|
+
if time_result:
|
|
601
|
+
time_min, _, _ = time_result
|
|
602
|
+
return 'tracking', time_min
|
|
603
|
+
|
|
604
|
+
# --- Priority 5: Reference ---
|
|
605
|
+
if _REFERENCE_RE.match(suffix):
|
|
606
|
+
return 'reference', 50
|
|
607
|
+
# Fluorine equivalents: 1p55F, 2p7F, 0p6F, p28Fmol, p8F, etc.
|
|
608
|
+
if _FLUORINE_EQUIV_RE.match(suffix):
|
|
609
|
+
return 'reference', 50
|
|
610
|
+
# N-ref patterns: 2-ref, 3-ref
|
|
611
|
+
if re.match(r'^\d+-ref', lower):
|
|
612
|
+
return 'reference', 50
|
|
613
|
+
# 4-br — aryl bromide reference
|
|
614
|
+
if re.match(r'^\d+-(?:br|cl|i)\b', lower):
|
|
615
|
+
return 'reference', 50
|
|
616
|
+
|
|
617
|
+
# --- Fallback ---
|
|
618
|
+
# Some patterns that are clearly a certain category but didn't match above
|
|
619
|
+
|
|
620
|
+
# Purification-related fallbacks
|
|
621
|
+
if 'flush' in lower or 'ipaflush' in lower:
|
|
622
|
+
return 'purification', 3000
|
|
623
|
+
if 'trap' in lower and lower != 'rotatrap':
|
|
624
|
+
return 'purification', 3000
|
|
625
|
+
if 'recov' in lower:
|
|
626
|
+
return 'purification', 3000
|
|
627
|
+
if lower.startswith('rp') and any(x in lower for x in ['peak', 'inj', 'chk', '-i']):
|
|
628
|
+
return 'purification', 3000
|
|
629
|
+
if 'kadmix' in lower or 'fracks' in lower:
|
|
630
|
+
return 'purification', 3000
|
|
631
|
+
if re.match(r'^rp\d?-', lower): # rp-sm, rp-peak, rp2-...
|
|
632
|
+
# rp-sm is reference, others are purification
|
|
633
|
+
if 'sm' in lower:
|
|
634
|
+
return 'reference', 50
|
|
635
|
+
return 'purification', 3000
|
|
636
|
+
# step1-meoh, step2-X — purification step procedures
|
|
637
|
+
if re.match(r'^step\d+', lower):
|
|
638
|
+
return 'purification', 3000
|
|
639
|
+
|
|
640
|
+
# Workup-related fallbacks
|
|
641
|
+
if 'solid' in lower and 'gold' not in lower:
|
|
642
|
+
return 'workup', 2000
|
|
643
|
+
if 'sludge' in lower or 'residue' in lower:
|
|
644
|
+
return 'workup', 2000
|
|
645
|
+
if lower.startswith('or') and len(lower) <= 3: # "or", "or1"
|
|
646
|
+
return 'workup', 2000
|
|
647
|
+
if lower == 'rext' or lower == 'res':
|
|
648
|
+
return 'workup', 2000
|
|
649
|
+
if lower == 'ea': # ethyl acetate workup
|
|
650
|
+
return 'workup', 2000
|
|
651
|
+
if 'trit' in lower and lower != 'et3n': # trituration
|
|
652
|
+
return 'workup', 2000
|
|
653
|
+
|
|
654
|
+
# Reference fallbacks
|
|
655
|
+
if 'smmix' in lower or 'smix' in lower or 'smrtmix' in lower:
|
|
656
|
+
return 'reference', 50
|
|
657
|
+
if lower.startswith('imp') and not lower.startswith('impfrac'):
|
|
658
|
+
return 'reference', 50
|
|
659
|
+
if 'arbrsm' in lower or 'arism' in lower:
|
|
660
|
+
return 'reference', 50
|
|
661
|
+
|
|
662
|
+
# Tracking qualitative timepoints
|
|
663
|
+
if 'morning' in lower or 'monmorn' in lower or 'beforeleave' in lower:
|
|
664
|
+
return 'tracking', 500 # qualitative timepoint, ambiguous ordering
|
|
665
|
+
if 'beforeadd' in lower or 'beforescav' in lower:
|
|
666
|
+
return 'tracking', 500
|
|
667
|
+
if 'afteradd' in lower:
|
|
668
|
+
return 'tracking', 600
|
|
669
|
+
if 'startmix' in lower or 'start' == lower or 'mix' == lower:
|
|
670
|
+
return 'tracking', 0
|
|
671
|
+
if 'check' in lower or 'chk' in lower:
|
|
672
|
+
return 'tracking', 100
|
|
673
|
+
if lower.startswith('step'):
|
|
674
|
+
return 'tracking', 100
|
|
675
|
+
if 'befvac' in lower or 'aftvac' in lower:
|
|
676
|
+
return 'tracking', 100
|
|
677
|
+
if lower.startswith('add') and not lower.startswith('adduct'):
|
|
678
|
+
return 'tracking', 600 # addDCM, addEt3N, etc. — after main reaction
|
|
679
|
+
if 'insert' in lower or 'lc' in lower.split('-'):
|
|
680
|
+
return 'tracking', 100
|
|
681
|
+
if '1moreh' in lower or 'moreh' in lower:
|
|
682
|
+
return 'tracking', 100
|
|
683
|
+
if 'rxnmix' in lower or 'crmix' in lower:
|
|
684
|
+
return 'tracking', 0
|
|
685
|
+
if 'onemorehour' in lower:
|
|
686
|
+
return 'tracking', 60
|
|
687
|
+
if 'longtime' in lower:
|
|
688
|
+
return 'tracking', 500
|
|
689
|
+
if 'aftersfc' in lower:
|
|
690
|
+
return 'tracking', 600 # after SFC purification (but tracking, not purif)
|
|
691
|
+
if 'moreconc' in lower:
|
|
692
|
+
return 'tracking', 100
|
|
693
|
+
# Volume patterns: NNNul (microliters)
|
|
694
|
+
if re.match(r'.*\d+ul$', lower):
|
|
695
|
+
return 'tracking', 100
|
|
696
|
+
if 'heatint' in lower or 'rtint' in lower:
|
|
697
|
+
return 'tracking', 100
|
|
698
|
+
# beforelyo is tracking (sample taken before lyophilization)
|
|
699
|
+
if 'beforelyo' in lower:
|
|
700
|
+
return 'tracking', 100
|
|
701
|
+
|
|
702
|
+
return 'tracking', 100
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
# --- Batch categorization main entry ---
|
|
706
|
+
|
|
707
|
+
def categorize_lcms_files_batch(
    filenames: List[str],
    experiment_id: Optional[str] = None,
) -> BatchResult:
    """
    Classify every LCMS file of one experiment in a single pass.

    In contrast to categorize_lcms_file(), which looks at one file at a
    time, the whole file list is inspected first so that cross-file
    context is available when each suffix is categorized:
      - tNN as purification fraction vs tracking timepoint
      - Multi-phase tracking (add-more, scavenger, temperature changes)
      - Modifier stripping (-re, -AmB, -W9, etc.)
      - Special file filtering (-MS, -LC, etc.)

    Args:
        filenames: All LCMS PDF filenames for the experiment (basenames).
        experiment_id: Experiment ID (e.g. "KL-1001-065"). When None it
            is derived from the first filename.

    Returns:
        BatchResult holding the per-file classifications, the tracking
        groups, the filtered (special) files and the has_final flag.
    """
    if not filenames:
        return BatchResult(experiment_id=experiment_id or "", files={},
                           tracking_groups=[], filtered_files=[])

    if experiment_id is None:
        experiment_id = _extract_experiment_id(filenames[0])

    result = BatchResult(
        experiment_id=experiment_id,
        files={},
        tracking_groups=[],
        filtered_files=[],
    )

    # Phase 1: drop modifiers and the extension, set special files aside,
    # and keep the suffix + modifiers of everything else.
    parsed = {}  # filename -> (suffix, modifiers)
    for name in filenames:
        bare_name, modifiers = _strip_modifiers(name)
        if '.' in bare_name:
            bare_name = os.path.splitext(bare_name)[0]

        if _is_special_file(bare_name):
            result.filtered_files.append(name)
        else:
            parsed[name] = (_extract_suffix(bare_name, experiment_id),
                            modifiers)

    # Phase 2: does any file carry a time token?  The flag is handed to
    # _categorize_suffix() so it can resolve the tNN ambiguity.
    has_explicit_time = any(
        _extract_time_token(suffix) is not None
        for suffix, _ in parsed.values()
    )

    # Phase 3 + 4: categorize each file and, for tracking files that carry
    # a time token, bucket them by the text preceding that token.
    groups_dict = defaultdict(list)  # prefix -> [(filename, time_min), ...]
    for name, (suffix, modifiers) in parsed.items():
        category, sort_key = _categorize_suffix(suffix, has_explicit_time)
        classification = FileClassification(
            category=category,
            sort_key=sort_key,
            modifiers=modifiers,
            temperature=_extract_temperature(suffix),
        )

        if category == 'tracking':
            token = _extract_time_token(suffix)
            if token is None:
                classification.group_prefix = "__notime__"
            else:
                time_min, token_start, _ = token
                # Group prefix = everything before the time token, minus
                # trailing separators.
                prefix = (suffix[:token_start]
                          .rstrip('-').rstrip(' ').rstrip('_'))
                classification.group_prefix = prefix
                classification.sort_key = time_min
                groups_dict[prefix].append((name, time_min))

        result.files[name] = classification

    # Order the groups by the median of their time values, and the files
    # inside each group by time.
    ordered_prefixes = sorted(
        groups_dict,
        key=lambda p: median([t for _, t in groups_dict[p]]),
    )
    sorted_groups = []
    for prefix in ordered_prefixes:
        members = groups_dict[prefix]
        members.sort(key=lambda item: item[1])
        sorted_groups.append(TrackingGroup(prefix=prefix, files=members))

    # Phase 5: no PDF timestamps are passed here, so
    # calibrate_sort_keys_hybrid() runs in its fallback mode (fixed
    # +100 min gap between groups).  Callers can invoke it again with
    # run_datetimes once the PDFs have been parsed.
    calibrate_sort_keys_hybrid(sorted_groups, result)

    # Record the group prefix on each tracking file.
    for group in sorted_groups:
        for name, _ in group.files:
            entry = result.files.get(name)
            if entry is not None:
                entry.group_prefix = group.prefix

    result.tracking_groups = sorted_groups

    # Phase 6: flag whether any file was categorized as 'final'.
    result.has_final = any(
        entry.category == 'final' for entry in result.files.values()
    )

    return result
|
|
831
|
+
|
|
832
|
+
|
|
833
|
+
# ---------------------------------------------------------------------------
|
|
834
|
+
# Hybrid sort key calibration (filename for group 1, timestamps for group 2+)
|
|
835
|
+
# ---------------------------------------------------------------------------
|
|
836
|
+
|
|
837
|
+
def calibrate_sort_keys_hybrid(
    sorted_groups: List['TrackingGroup'],
    result: 'BatchResult',
    run_datetimes: Optional[Dict[str, str]] = None,
) -> None:
    """
    Assign sort keys to tracking files across multiple tracking groups.

    Group 1 (or single-group reactions): uses ONLY filename-derived time
    tokens.  The chemist often prepares samples ahead and may submit them
    out of order on the instrument queue, so filename order reflects the
    intended chronology.

    Groups 2+: use the PDF acquisition timestamps when every file of the
    group *and* the last file of the previous group has a parseable
    timestamp.  The group's files are then re-ordered chronologically and
    each sort key becomes ``previous group's max sort key + minutes since
    the previous group's last file`` (negative gaps are clamped to 0).

    If any timestamp is missing or malformed, the group falls back to an
    arbitrary +100 min gap after the previous group plus the
    filename-derived time.  All timestamps are parsed before anything is
    modified, so a fallback never leaves the group half re-ordered.

    A group (other than the first) with no files is skipped: its offset is
    set to the previous group's max sort key and the running anchor is
    left unchanged.

    Args:
        sorted_groups: TrackingGroup list sorted by median time.  Each
            group's ``files`` (list of ``(filename, time_min)``) and
            ``offset`` are updated in place.
        result: BatchResult whose ``files[filename].sort_key`` values are
            updated in place.
        run_datetimes: Optional mapping of filename -> "YYYY-MM-DD HH:MM:SS".
            When None, every group after the first uses the +100 min
            fallback (suitable at categorization time, before PDFs are
            parsed).

    Returns:
        None.  All results are written to ``sorted_groups`` and ``result``.
    """
    from datetime import datetime as _dt

    if not sorted_groups:
        return

    dt_format = "%Y-%m-%d %H:%M:%S"
    prev_max_sk = 0.0
    prev_max_fn = None  # filename of file with highest sort_key in prev group

    for i, group in enumerate(sorted_groups):
        if i == 0:
            # Group 1: ONLY filename-derived time tokens.
            group.offset = 0.0
            for fn, time_min in group.files:
                result.files[fn].sort_key = time_min
            if group.files:
                prev_max_fn, prev_max_sk = max(group.files,
                                               key=lambda x: x[1])
            continue

        if not group.files:
            # Nothing to order; max() below would raise on an empty group.
            group.offset = prev_max_sk
            continue

        # Parse every timestamp up front so that a bad value cannot leave
        # the group partially updated before we fall back.
        prev_dt = None
        acquired = None
        if run_datetimes and prev_max_fn:
            raw = [run_datetimes.get(prev_max_fn)]
            raw.extend(run_datetimes.get(fn) for fn, _ in group.files)
            if all(stamp is not None for stamp in raw):
                try:
                    parsed = [_dt.strptime(stamp, dt_format) for stamp in raw]
                except (ValueError, TypeError):
                    parsed = None  # malformed timestamp -> fallback below
                if parsed is not None:
                    prev_dt = parsed[0]
                    acquired = parsed[1:]

        if acquired is not None:
            # Order by the parsed datetime (not the raw string, which
            # mis-sorts non-zero-padded values such as "9:50:00").
            timed = sorted(zip(group.files, acquired),
                           key=lambda pair: pair[1])
            group.files = [(fn, t) for (fn, t), _ in timed]

            # Sort key = previous group's max + real gap in minutes.
            for (fn, _orig_t), curr_dt in timed:
                offset_min = (curr_dt - prev_dt).total_seconds() / 60
                if offset_min < 0:
                    offset_min = 0  # safety: clock skew
                result.files[fn].sort_key = prev_max_sk + offset_min

            group.offset = prev_max_sk
            prev_max_sk = max(result.files[fn].sort_key
                              for fn, _ in group.files)
            prev_max_fn = max(group.files,
                              key=lambda x: result.files[x[0]].sort_key)[0]
        else:
            # Fallback: arbitrary +100 min gap after the previous group.
            group.offset = prev_max_sk + 100
            for fn, time_min in group.files:
                result.files[fn].sort_key = group.offset + time_min
            prev_max_sk = max(result.files[fn].sort_key
                              for fn, _ in group.files)
            prev_max_fn = max(group.files, key=lambda x: x[1])[0]
|