cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,928 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LCMS File Categorizer
4
+
5
+ Categorizes LCMS PDF filenames into experiment phases: tracking, workup,
6
+ purification, final, reference. Two APIs:
7
+
8
+ - categorize_lcms_file(filename) — simple per-file categorization
9
+ - categorize_lcms_files_batch(filenames, experiment_id) — context-aware
10
+ batch categorization with prefix-based tracking groups, modifier
11
+ stripping, special file filtering, and hybrid sort key calibration
12
+
13
+ Pure string-processing engine — no PDF parsing, no external dependencies
14
+ beyond stdlib.
15
+ """
16
+
17
+ import os
18
+ import re
19
+ from collections import defaultdict
20
+ from dataclasses import dataclass, field
21
+ from statistics import median
22
+ from typing import List, Optional, Dict, Tuple
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Simple file categorization (per-file, no cross-file context)
27
+ # ---------------------------------------------------------------------------
28
+
29
def categorize_lcms_file(filename: str) -> Tuple[str, float]:
    """
    Categorize a single LCMS file and return (category, sort_key).

    Categories: "tracking", "workup", "purification", "final", "reference"

    The filename goes through the shared helpers: analytical modifiers
    (-re, -AmB, -W9, etc.) are stripped, the experiment ID is removed, and
    the remaining suffix is handed to _categorize_suffix. No cross-file
    context is used.

    Args:
        filename: LCMS report filename, with or without an extension.

    Returns:
        The (category, sort_key) tuple produced by _categorize_suffix.
    """
    # _strip_modifiers() already removes the file extension.  Running
    # os.path.splitext() a second time would truncate any stem that still
    # contains a dot (e.g. "KL-1-1-crude.v2" -> "KL-1-1-crude"), so the
    # stripped stem is used as-is.
    cleaned_base, _mods = _strip_modifiers(filename)
    experiment_id = _extract_experiment_id(filename)
    suffix = _extract_suffix(cleaned_base, experiment_id)

    # Without sibling files there is no way to know whether explicit time
    # tokens exist, so assume they do (conservative: bare tNN is treated as
    # a purification fraction).
    return _categorize_suffix(suffix, has_explicit_time=True)
49
+
50
+
51
# Ambiguous sort keys: files whose position in the timeline can't be
# reliably determined from the filename alone.
# 500 is the preliminary sort key that _categorize_suffix returns for
# qualitative timepoints: "beforeadd", "beforescav", "morning", "monmorn",
# "beforeleave" and "longtime".
_AMBIGUOUS_SORT_KEYS = {500}  # 500 = "beforeadd"
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Batch file categorization (v2 — prefix-based grouping)
58
+ # ---------------------------------------------------------------------------
59
+
60
@dataclass
class FileModifiers:
    """Analytical modifiers removed from a filename ahead of categorization.

    Attributes:
        rerun_count: Number of reruns encoded in the name (-re = 1, -rere = 2).
        duplicate_num: Duplicate counter, e.g. "(2)" -> 2; None when absent.
        method_variant: Buffer method tag (AmB, AmF, AmBfoc, AmFfoc).
        method_program: Method program tag (W1, W9, W13, W17, W19).
        long_method: True when the name carried a -long suffix.
        concentrated: True when the name carried a -conc suffix.
        focused: True when the name carried a -focus / -foc suffix.
    """

    rerun_count: int = 0
    duplicate_num: Optional[int] = None
    method_variant: Optional[str] = None
    method_program: Optional[str] = None
    long_method: bool = False
    concentrated: bool = False
    focused: bool = False
70
+
71
+
72
@dataclass
class TrackingGroup:
    """Tracking files that share one common prefix.

    Attributes:
        prefix: The shared prefix, e.g. "", "add50mgDEAD", "70C".
        files: (filename, time_in_minutes) pairs belonging to the group.
        offset: Calibrated offset applied to the group's sort keys.
    """

    prefix: str
    files: List[Tuple[str, float]]
    offset: float = 0.0
78
+
79
+
80
@dataclass
class FileClassification:
    """Outcome of classifying a single LCMS file.

    Attributes:
        category: One of tracking, workup, purification, final, reference.
        sort_key: Ordering key for the file.
        modifiers: Modifiers that were stripped from the filename.
        group_prefix: Tracking-group prefix (tracking files only).
        temperature: Parsed temperature in Celsius, when one was found.
    """

    category: str
    sort_key: float
    modifiers: FileModifiers
    group_prefix: Optional[str] = None
    temperature: Optional[float] = None
88
+
89
+
90
@dataclass
class BatchResult:
    """Everything batch categorization produced for one experiment.

    Attributes:
        experiment_id: The experiment the files belong to.
        files: Mapping of filename -> classification.
        tracking_groups: Prefix-based groups of tracking files.
        filtered_files: Special files (MS, LC, etc.).
        has_final: Defaults to False.
    """

    experiment_id: str
    files: Dict[str, FileClassification]
    tracking_groups: List[TrackingGroup]
    filtered_files: List[str]
    has_final: bool = False
98
+
99
+
100
+ # --- Modifier stripping ---
101
+
102
def _strip_modifiers(filename: str) -> Tuple[str, FileModifiers]:
    """Remove trailing analytical modifiers from a filename.

    The extension is dropped first, then each modifier is looked for at the
    end of the remaining stem, in a fixed order, and cut off when present.

    Returns:
        (stem without modifiers, FileModifiers describing what was removed)
    """
    stem = os.path.splitext(filename)[0]
    found = FileModifiers()

    # 1. Duplicate counter: " (2)" or "(2)" at the end.
    dup = re.search(r'\s*\((\d+)\)\s*$', stem)
    if dup is not None:
        found.duplicate_num = int(dup.group(1))
        stem = stem[:dup.start()]

    # 2. Rerun marker.  First rule that hits wins; -rere must be tried
    #    before -re.  Each rule: (pattern, flags, rerun count, chars to cut).
    rerun_rules = (
        (r'-[Rr][Ee][Rr][Ee]$', 0, 2, 5),
        (r'-[Rr][Ee]$', 0, 1, 3),
        (r'-rerun$', re.IGNORECASE, 1, 6),
    )
    for pattern, flags, count, width in rerun_rules:
        if re.search(pattern, stem, flags):
            found.rerun_count = count
            stem = stem[:-width]
            break

    # 3. -focus / -foc
    focus = re.search(r'-foc(?:us)?$', stem, re.IGNORECASE)
    if focus is not None:
        found.focused = True
        stem = stem[:focus.start()]

    # 4. -conc
    if re.search(r'-conc$', stem, re.IGNORECASE):
        found.concentrated = True
        stem = stem[:-5]

    # 5. -long
    if re.search(r'-long$', stem, re.IGNORECASE):
        found.long_method = True
        stem = stem[:-5]

    # 6. Method program: -W1, -W3, -W4, -W9, -W13, -W17, -W19
    program = re.search(r'-(W\d+)$', stem, re.IGNORECASE)
    if program is not None:
        found.method_program = program.group(1).upper()
        stem = stem[:program.start()]

    # 7. Buffer method: -AmB, -AmF, -AmBfoc, -AmFfoc (checked after the
    #    -foc step above).
    variant = re.search(r'-(Am[BF](?:foc)?)$', stem, re.IGNORECASE)
    if variant is not None:
        found.method_variant = variant.group(1)
        stem = stem[:variant.start()]

    return stem, found
155
+
156
+
157
+ # --- Special file detection ---
158
+
159
# Trailing markers of non-standard reports: -MS, -LC / -LC-COPY, -LCtrace,
# -UV, -manint, -landscape and -int (integration screenshot).
_SPECIAL_SUFFIXES_RE = re.compile(
    r'(?:-MS|-LC(?:-COPY)?|-LCtrace|-UV|-manint|-landscape|-int)$',
    re.IGNORECASE,
)


def _is_special_file(cleaned_base: str) -> bool:
    """Return True when the name ends in a special-report marker.

    Such files are non-standard LCMS reports (MS-only, LC-only, etc.).
    """
    return _SPECIAL_SUFFIXES_RE.search(cleaned_base) is not None
176
+
177
+
178
+ # --- Experiment ID / suffix extraction ---
179
+
180
def _extract_experiment_id(filename: str) -> str:
    """Return the upper-cased KL-XXXX-NNN experiment ID found in a filename.

    The extension and any trailing "(N)" duplicate counter are ignored.
    When the name does not start with a KL-digits-digits ID, the whole
    remaining stem is returned upper-cased instead.
    """
    stem = os.path.splitext(filename)[0]
    stem = re.sub(r'\s*\(\d+\)\s*$', '', stem)
    found = re.match(r'(KL-\d+-\d+)', stem, re.IGNORECASE)
    identifier = found.group(1) if found else stem
    return identifier.upper()
189
+
190
+
191
def _extract_suffix(cleaned_base: str, experiment_id: str) -> str:
    """Extract the suffix after the experiment ID from a cleaned filename.

    Args:
        cleaned_base: Filename stem with extension and modifiers removed.
        experiment_id: Experiment ID expected at the start of the stem
            (compared case-insensitively).

    Returns:
        The text following the experiment ID with all leading separators
        (dash, space, underscore) removed, or ``cleaned_base`` unchanged
        when it does not start with the experiment ID.
    """
    prefix_len = len(experiment_id)
    if cleaned_base[:prefix_len].upper() != experiment_id.upper():
        return cleaned_base

    # Strip any mix of leading separators in one pass.  Chaining separate
    # lstrip('-'), lstrip(' '), lstrip('_') calls is order-dependent and
    # leaves e.g. " - 30min" as "- 30min" and "_-t12" as "-t12".
    return cleaned_base[prefix_len:].lstrip('-_ ')
201
+
202
+
203
+ # --- Time token extraction ---
204
+
205
def _extract_time_token(suffix: str) -> Optional[Tuple[float, int, int]]:
    """
    Find the best time token in the suffix.

    Every known time pattern is tried against the suffix; each hit becomes
    a candidate (start, end, minutes).  The candidate with the longest span
    wins, ties going to the rightmost start.

    Returns (time_in_minutes, token_start, token_end) or None.
    token_start/end define the span of the time+temperature cluster
    (for prefix extraction: everything before token_start is the group prefix).
    For NNC+time patterns token_start points at the time portion, so the
    temperature stays in the group prefix; for time+NNC patterns the token
    starts at the time itself.
    """
    candidates = []  # (start, end, time_minutes)

    def collect(pattern, minutes, flags=re.IGNORECASE, start_group=0,
                skip_after_temp=False):
        """Append one candidate per match of ``pattern`` in ``suffix``."""
        for m in re.finditer(pattern, suffix, flags):
            # Standalone tokens directly after "NNC" belong to a combined
            # temperature+time pattern and are collected there.
            if skip_after_temp and _preceded_by_temp(suffix, m.start()):
                continue
            candidates.append((m.start(start_group), m.end(), minutes(m)))

    def fixed(value):
        return lambda _m: value

    def mins(group):
        return lambda m: float(m.group(group))

    def hours(group):
        return lambda m: float(m.group(group)) * 60

    def hours_mins(hour_group, minute_group):
        return lambda m: (float(m.group(hour_group)) * 60
                          + float(m.group(minute_group)))

    def neg_mins(group):
        return lambda m: -float(m.group(group))

    # --- Combined temperature+time patterns ---
    # ON / OWE are case-sensitive (flags=0); ON = 960 min, OWE = 2880 min.
    collect(r'(\d{2,3})C(ON)\b', fixed(960.0), flags=0, start_group=2)       # 40CON
    collect(r'(\d{2,3})C-(ON)\b', fixed(960.0), flags=0, start_group=2)      # 65C-ON
    collect(r'(\d{2,3})C-?(OWE)\b', fixed(2880.0), flags=0, start_group=2)   # 130C-OWE
    collect(r'(\d{2,3})C-?(\d+)h(\d+)\s*m(?:in)?', hours_mins(2, 3),
            start_group=2)                                                  # 40C1h25min
    collect(r'(\d{2,3})C-?(\d+)h(?!\d)', hours(2), start_group=2)            # 80C8h, 80C-2h
    collect(r'(\d{2,3})C-?(\d+)\s*min', mins(2), start_group=2)              # 100C30min
    collect(r'(\d{2,3})C-?(\d+)m\b', mins(2), start_group=2)                 # 50C12m
    collect(r'(\d+)\s*min(\d{2,3})C', mins(1))                               # 30min80C
    collect(r'(\d+)m(\d{2,3})C', mins(1))                                    # 90m70C
    collect(r'(\d+)h(\d{2,3})C', hours(1))                                   # 9h125C
    collect(r'(\d+)h(\d+)m(\d{2,3})C', hours_mins(1, 2))                     # 1h30m80C

    # --- Standalone time patterns ---
    # premix happens before t=0, hence the negative times.
    collect(r'premix-?(\d+)\s*min', neg_mins(1))                             # premix10min
    collect(r'premix-?(\d+)\s*m\b', neg_mins(1))                             # premix10m
    bare_premix = re.search(r'\bpremix\b', suffix, re.IGNORECASE)
    if bare_premix and not re.search(r'premix-?\d+', suffix, re.IGNORECASE):
        # premix alone (no time attached anywhere in the suffix)
        candidates.append((bare_premix.start(), bare_premix.end(), -10.0))

    collect(r'(\d+)h(\d+)\s*min', hours_mins(1, 2), skip_after_temp=True)    # 1h30min
    # "m" may be followed by any non-digit (1h50mrerun), not only a boundary.
    collect(r'(\d+)h(\d+)m(?![0-9])', hours_mins(1, 2), skip_after_temp=True)
    # Upper-case ON / OWE not followed by another upper-case letter (avoids
    # ONCE, ONLY); allowed after/before lower case (airdryON, ONrecheck).
    collect(r'ON(?![A-Z])', fixed(960.0), flags=0, skip_after_temp=True)
    collect(r'OWE(?![A-Z])', fixed(2880.0), flags=0, skip_after_temp=True)
    collect(r'(\d+)h(?!\d)', hours(1), skip_after_temp=True)                 # 1h, 16h
    collect(r'(\d+)\s*min(?:s)?', mins(1), skip_after_temp=True)             # 30min
    # Nm: 90m, 30mrt.  Rejected when followed by "in" (already handled as
    # Nmin), "m"/"ol" (mm, mmol, mol) or "g": a quantity such as "50mg" in
    # "add50mgDEAD-1h" is a mass, not 50 minutes, and must not outrank the
    # real time token.
    collect(r'(\d+)m(?!in|m|ol|g)', mins(1), skip_after_temp=True)

    # "onehour" / "overnight" as special text
    collect(r'\bonehour\b', fixed(60.0))
    collect(r'\bovernight\b', fixed(960.0))

    if not candidates:
        return None

    # Longest span first, then rightmost start; max() keeps the earliest
    # collected candidate among exact ties.
    start, end, minutes = max(candidates, key=lambda c: (c[1] - c[0], c[0]))
    return (minutes, start, end)


def _preceded_by_temp(suffix: str, pos: int) -> bool:
    """Check if position is immediately preceded by a NNC temperature pattern.

    An optional dash between the temperature and ``pos`` is allowed
    (e.g. "80C-" before "2h").
    """
    before = suffix[:pos]
    return bool(re.search(r'\d{2,3}C-?$', before, re.IGNORECASE))
348
+
349
+
350
def _extract_temperature(suffix: str) -> Optional[float]:
    """Extract temperature in Celsius from suffix if present.

    A temperature is a 2-3 digit number directly followed by an upper-case
    "C" (e.g. "80C8h" -> 80.0).  Returns None when no such token exists.
    """
    # The lookbehind rejects both "t" (tube numbers such as t12C) and a
    # preceding digit.  Without the digit guard the search could restart in
    # the middle of a number and defeat the tube guard ("t123C" -> 23.0) or
    # truncate longer numbers ("1000C" -> 0.0).
    m = re.search(r'(?<![tT\d])(\d{2,3})C', suffix)
    return float(m.group(1)) if m else None
356
+
357
+
358
+ # --- Categorization logic ---
359
+
360
# Final-product markers found anywhere in the suffix:
#   purified (nppurified, rppurified, THFRPpurified, c18purified),
#   lyo (lyo, repurlyo, lyotwice, rerelyo), verify (verify, AmBverify),
#   prodchk (product check), and NMRsample / NMRsamp / QC / NMR when not
#   embedded in a longer run of letters.
_FINAL_RE = re.compile(
    '(?:' + '|'.join((
        r'purified',
        r'lyo',
        r'verify',
        r'prodchk',
        r'(?:^|[^a-zA-Z])(?:NMRsample|NMRsamp|QC|NMR)(?:[^a-zA-Z]|$)',
    )) + ')',
    re.IGNORECASE,
)

# Suffixes that mean "final product" only when they are the whole suffix.
_FINAL_STANDALONE_RE = re.compile(
    r'^(?:NMR|final|finalNMR|finalvial|prod|final\d?)$',
    re.IGNORECASE,
)

# Purification method/column prefix, optionally followed by a dash.
# Includes fchk/meohtest (fraction check, MeOH test), p1-/v1-/vial1-
# (column/vial prefixes), first/second (injection) and stepN.
_PURIF_PREFIX_RE = re.compile(
    '^(?:' + '|'.join((
        r'NP\d?', r'RP\d?', r'C18(?:-\d)?', r'prep\d?', r'col\d?', r'I\d',
        r'scout', r'THF(?:RP)?',
        r'THFrecov', r'EArecov', r'recol', r'scavNP', r'KADrecov',
        r'final\d?', r'recov(?:NP)?',
        r'MeCN(?:col)?', r'actual', r'firstinj', r'prevbatch',
        r'fchk\d?', r'meohtest',
        r'p\d', r'v\d', r'vial\d',
        r'first', r'second',
        r'step\d',
    )) + ')-?',
    re.IGNORECASE,
)

# Tube number tNN or tube range tNNtoNN.
_TUBE_NUM_RE = re.compile(r't(\d+)(?:to(\d+))?', re.IGNORECASE)

# Purification keywords that are not tube numbers.
_PURIF_KEYWORDS = {
    'comb', 'combed', 'peakcomb',
    'colload', 'load', 'loading',
    'flush', 'tflush', 'tload',
    'tail', 'tails',
    'repur', 'repurified',
    'npcomb', 'npcombed', 'rpcomb', 'rprecomb',
    'c18comb', 'c18load',
    'thfcol-comb', 'meccol-i1to4comb',
    'impfrac', 'reload',
    'onetube', 'nptails', 'npminor',
    # tube end / blob / last
    'tend', 'tblob', 'tlast',
    # fraction check
    'fchk', 'fchk1', 'fchk2',
}

# Workup keywords.
_WORKUP_KEYWORDS = {
    'crude', 'cr',
    'extract', 'ext',
    'wash', 'washed', 'washing', 'rewash', 'rewashed',
    'aq', 'org', 'brine',
    'dried', 'driedonce', 'redried', 'combdried',
    'filter', 'filtered', 'filtrate', 'fil', 'filtersolid',
    'pellet', 'pel', 'super', 'ppt',
    'quench', 'quenched',
    'silfil', 'cefil',
    'rotovap', 'rotatrap', 'rota',
    'slurry', 'recryst', 'workup',
    'nofil', 'or',
}

# Reference samples (starting-material checks, not reaction monitoring);
# the name must end there or continue with a dash.
_REFERENCE_RE = re.compile(
    '^(?:' + '|'.join((
        r'SM\d?', r'RAE', r'RAESM', r'ArI', r'ArBr', r'ArBrSM', r'ArISM',
        r'aniline', r'chloride', r'SMchloride',
        r'TP-SM', r'Clref', r'tolref', r'SMPDref', r'SMPDCT', r'DDQSM',
        r'chlorideSM', r'SManiline',
        r'X\d{3}', r'E\d{3}', r'INT\d+', r'KADDP', r'SMwith\w+',
        r'spiking', r'SMcheck', r'SMchk', r'SMconfirm', r'SMrecov',
        r'SM-verify',
        r'aminopySM', r'AmPySM', r'smmix', r'smix', r'smrtmix', r'SMS',
        r'byprod', r'ref\d?',
    )) + ')(?:$|-)',
    re.IGNORECASE,
)

# NpNF fluorine equivalents (titration experiments), also reference.
_FLUORINE_EQUIV_RE = re.compile(
    r'^(?:\d*p?\d+F(?:mol)?|\d+F)$',
    re.IGNORECASE,
)
434
+
435
+
436
def _categorize_suffix(suffix: str, has_explicit_time: bool) -> Tuple[str, float]:
    """
    Categorize a single file's suffix.

    The checks run in strict priority order: final-IN tube fractions, final
    product, fluorine-equivalent references, purification, workup, tracking
    (time tokens), reference, then keyword fallbacks.  Anything unmatched is
    tracking with sort key 100.

    Args:
        suffix: The cleaned suffix (modifiers stripped, experiment ID removed).
        has_explicit_time: True if other files in the experiment have
            explicit time tokens (Nmin, Nh, ON, OWE — not tNN).  Bare tNN
            is classified as purification either way, so the flag does not
            currently change the result; it is kept for callers.

    Returns:
        (category, preliminary_sort_key)
    """
    if not suffix:
        return 'tracking', 100

    lower = suffix.lower()

    # --- Priority 0.5: "final-IN-tNN" purification tube fractions ---
    # Must come before the final product check since "final" is a prefix here
    if re.match(r'final\d?-(?:I\d|i\d)', suffix, re.IGNORECASE):
        return 'purification', 3000

    # --- Priority 1: Final product ---
    if _FINAL_RE.search(suffix) or _FINAL_STANDALONE_RE.match(suffix):
        # Exception: "crude-NMR" or "crude-NMRsample" is workup
        if 'crude' in lower:
            return 'workup', 2000
        # Exception: method-prefix + "purified" = purification, not final
        # (NPpurified, RPpurified, C18purified, THFRPpurified, scavNPpurified)
        if re.match(r'^(?:NP|RP|C18|THFRP|THF|scavNP|col)\d?-?purified',
                    suffix, re.IGNORECASE):
            return 'purification', 3000
        # Exception: "beforelyo" is a sample taken before lyophilization,
        # i.e. tracking.  It must be handled here because the "lyo" final
        # pattern matches it, which made a later tracking check unreachable.
        if 'beforelyo' in lower:
            return 'tracking', 100
        return 'final', 9000

    # --- Priority 1.5: Fluorine equivalents (reference) ---
    # 1p55F, 2p7F, 0p6F, p28Fmol, p8F, etc.  Checked before purification
    # because the "p<digit>" purification prefix would otherwise swallow the
    # p-leading forms (p8F, p28Fmol) and report them as purification.
    if _FLUORINE_EQUIV_RE.match(suffix):
        return 'reference', 50

    # --- Priority 2: Purification ---
    # Tube numbers: [prefix]-tNN[toNN].  Strip the purification prefix, then
    # up to three further nested prefixes (p1-c18-t16, final-I1-t4, etc.).
    purif_prefix_match = _PURIF_PREFIX_RE.match(suffix)
    test_suffix = suffix[purif_prefix_match.end():] if purif_prefix_match else suffix
    for _ in range(3):
        nested = _PURIF_PREFIX_RE.match(test_suffix)
        if not nested:
            break
        test_suffix = test_suffix[nested.end():]

    # Inline injection number: I1-, I2-, I1t, etc. (case-sensitive)
    inj_match = re.match(r'I\d+-?', test_suffix)
    if inj_match:
        test_suffix = test_suffix[inj_match.end():]

    # Leading dash: -t12 -> t12
    if test_suffix.startswith('-'):
        test_suffix = test_suffix[1:]

    tube_match = _TUBE_NUM_RE.match(test_suffix)
    if tube_match:
        # Prefixed (NP-t13) and bare tNN alike are purification fractions;
        # tube numbers are rarely used for tracking.
        return 'purification', 3000 + int(tube_match.group(1))
    if purif_prefix_match:
        # Bare numbers after a purification prefix (first-17, vial2-24)
        bare_number = re.match(r'^(\d+)(?:to\d+)?$', test_suffix)
        if bare_number:
            return 'purification', 3000 + int(bare_number.group(1))

    # Purification keywords: whole suffix or any dash/underscore/space part.
    # The whole-suffix test is needed for keywords that themselves contain a
    # dash (e.g. "meccol-i1to4comb"), which can never equal a split part.
    parts_lower = set(re.split(r'[-_\s]', lower))
    if lower in _PURIF_KEYWORDS or parts_lower & _PURIF_KEYWORDS:
        return 'purification', 3000

    # More flexible purification keyword match (substring)
    purif_substrings = (
        'peakcomb', 'peak-comb', 'colload',
        'rpload', 'npcomb', 'npcombed',
        'rpcomb', 'c18comb', 'c18load',
        'tpeakcomb',
        'c18repur', 'rprepur',
        'meohwash',  # column wash
        'impfrac', 'reload',
        'loading',
        'mecncol', 'mecnrecov',
        'purefracs', 'lesspure', 'morepure',
        'combed',
    )
    if any(kw in lower for kw in purif_substrings):
        return 'purification', 3000

    # Any suffix starting with a purification method prefix (C18, NP, RP,
    # etc.) is purification — even if the remainder looks like workup
    # (e.g. C18-DCMext = DCM extraction from C18 eluate; NP-dried = dried
    # NP fractions).
    if purif_prefix_match and purif_prefix_match.end() > 0:
        return 'purification', 3000

    # RN-tNN patterns (R2t5 = round 2, tube 5)
    if re.match(r'^R\d+t\d+', suffix, re.IGNORECASE):
        return 'purification', 3000

    # Bare tube range: NNtoNN (39to41, 46to49)
    if re.match(r'^\d+to\d+$', suffix, re.IGNORECASE):
        return 'purification', 3000

    # peak1, peak2 — purification peak fractions
    if re.match(r'^peak\d+$', lower):
        return 'purification', 3000

    # --- Priority 3: Workup ---
    if parts_lower & _WORKUP_KEYWORDS:
        return 'workup', 2000

    # More flexible workup match (substring for compound words)
    workup_substrings = (
        'crude', 'washed', 'dried', 'quench',
        'silfil', 'cefil', 'rotovap',
        'rotatrap', 'workup', 'extraction',
        'filtrate', 'pellet', 'super',
        'slurry', 'recryst',
        'dcmext',      # DCM extraction
        'ipawash',     # IPA wash
        'hclwash',     # HCl wash
        'washing',     # washing steps
        'orgph',       # organic phase
        'orgwash',     # organic wash
        'nahco3wu', 'naohwu', 'waterwu',  # wu = workup
        'bicarbwash', 'naohwash',
        'syrfilter',   # syringe filter
        'eaext',       # EA extraction
        'rotaeaext',   # rotavap + EA extraction
        'wutest',      # workup test
        'mainpeak',    # mainpeakhclwash etc.
        'dmsodil',     # DMSO dilution
        'b4rota',      # before rotavap
    )
    if any(kw in lower for kw in workup_substrings):
        return 'workup', 2000

    # Standalone workup: aq1, aq2, org1
    if re.match(r'^(?:aq|org)\d*$', lower):
        return 'workup', 2000
    # Numbered workup: 1orgwash, 2aq, 3Cr
    if re.match(r'^\d+(?:aq|org|cr|wash|ext)', lower):
        return 'workup', 2000
    # cent (centrifuge)
    if lower == 'cent':
        return 'workup', 2000

    # --- Priority 4: Tracking (files with time tokens) ---
    time_result = _extract_time_token(suffix)
    if time_result:
        return 'tracking', time_result[0]

    # --- Priority 5: Reference ---
    if _REFERENCE_RE.match(suffix):
        return 'reference', 50
    # N-ref patterns: 2-ref, 3-ref
    if re.match(r'^\d+-ref', lower):
        return 'reference', 50
    # 4-br — aryl bromide reference
    if re.match(r'^\d+-(?:br|cl|i)\b', lower):
        return 'reference', 50

    # --- Fallback ---
    # Patterns that clearly belong to a category but didn't match above.
    # Order matters: the first matching test decides.

    # Purification-related fallbacks
    if 'flush' in lower:  # also covers "ipaflush"
        return 'purification', 3000
    if 'trap' in lower and lower != 'rotatrap':
        return 'purification', 3000
    if 'recov' in lower:
        return 'purification', 3000
    if lower.startswith('rp') and any(x in lower for x in ('peak', 'inj', 'chk', '-i')):
        return 'purification', 3000
    if 'kadmix' in lower or 'fracks' in lower:
        return 'purification', 3000
    if re.match(r'^rp\d?-', lower):  # rp-sm, rp-peak, rp2-...
        # rp-sm is reference, others are purification
        if 'sm' in lower:
            return 'reference', 50
        return 'purification', 3000
    # step1-meoh, step2-X — purification step procedures
    if re.match(r'^step\d+', lower):
        return 'purification', 3000

    # Workup-related fallbacks
    if 'solid' in lower and 'gold' not in lower:
        return 'workup', 2000
    if 'sludge' in lower or 'residue' in lower:
        return 'workup', 2000
    if lower.startswith('or') and len(lower) <= 3:  # "or", "or1"
        return 'workup', 2000
    if lower in ('rext', 'res'):
        return 'workup', 2000
    if lower == 'ea':  # ethyl acetate workup
        return 'workup', 2000
    if 'trit' in lower:  # trituration
        return 'workup', 2000

    # Reference fallbacks
    if 'smmix' in lower or 'smix' in lower or 'smrtmix' in lower:
        return 'reference', 50
    if lower.startswith('imp') and not lower.startswith('impfrac'):
        return 'reference', 50
    if 'arbrsm' in lower or 'arism' in lower:
        return 'reference', 50

    # Tracking qualitative timepoints
    if 'morning' in lower or 'monmorn' in lower or 'beforeleave' in lower:
        return 'tracking', 500  # qualitative timepoint, ambiguous ordering
    if 'beforeadd' in lower or 'beforescav' in lower:
        return 'tracking', 500
    if 'afteradd' in lower:
        return 'tracking', 600
    if 'startmix' in lower or lower == 'start' or lower == 'mix':
        return 'tracking', 0
    if 'check' in lower or 'chk' in lower:
        return 'tracking', 100
    if lower.startswith('step'):
        return 'tracking', 100
    if 'befvac' in lower or 'aftvac' in lower:
        return 'tracking', 100
    if lower.startswith('add') and not lower.startswith('adduct'):
        return 'tracking', 600  # addDCM, addEt3N, etc. — after main reaction
    if 'insert' in lower or 'lc' in lower.split('-'):
        return 'tracking', 100
    # "onemorehour" contains "moreh", so it has to be tested first or its
    # sort key of 60 can never be returned.
    if 'onemorehour' in lower:
        return 'tracking', 60
    if 'moreh' in lower:  # also covers "1moreh"
        return 'tracking', 100
    if 'rxnmix' in lower or 'crmix' in lower:
        return 'tracking', 0
    if 'longtime' in lower:
        return 'tracking', 500
    if 'aftersfc' in lower:
        return 'tracking', 600  # after SFC purification (but tracking, not purif)

    # Everything else — including moreconc, NNNul volumes, heatint / rtint —
    # is tracking with the default sort key.
    return 'tracking', 100
703
+
704
+
705
+ # --- Batch categorization main entry ---
706
+
707
def categorize_lcms_files_batch(
    filenames: List[str],
    experiment_id: Optional[str] = None,
) -> BatchResult:
    """
    Categorize every LCMS file of a single experiment in one pass.

    In contrast to categorize_lcms_file(), which looks at each file in
    isolation, the batch variant inspects the whole set so that it can:
    - decide whether tNN means a purification fraction or a timepoint
    - split tracking files into phases (add-more, scavenger, temperature)
    - strip modifiers (-re, -AmB, -W9, etc.)
    - drop special files (-MS, -LC, etc.)

    Args:
        filenames: All LCMS PDF filenames for the experiment (basenames).
        experiment_id: Experiment ID (e.g. "KL-1001-065"). When None it is
            derived from the first filename.

    Returns:
        BatchResult holding the per-file classifications, the tracking
        groups, the filtered-out files and the has_final flag. An empty
        ``filenames`` yields an empty BatchResult straight away.
    """
    if not filenames:
        return BatchResult(experiment_id=experiment_id or "", files={},
                           tracking_groups=[], filtered_files=[])

    if experiment_id is None:
        experiment_id = _extract_experiment_id(filenames[0])

    batch = BatchResult(
        experiment_id=experiment_id,
        files={},
        tracking_groups=[],
        filtered_files=[],
    )

    # Phase 1: strip modifiers, set special files aside, pull out suffixes.
    parsed = {}  # filename -> (suffix, modifiers)
    for name in filenames:
        bare, modifiers = _strip_modifiers(name)
        # splitext returns the input unchanged when there is no extension.
        root = os.path.splitext(bare)[0]

        if _is_special_file(root):
            batch.filtered_files.append(name)
            continue

        parsed[name] = (_extract_suffix(root, experiment_id), modifiers)

    # Phase 2: does any file carry an explicit (non-tube) time token?
    # The answer is what disambiguates tNN in phase 3.
    has_explicit_time = any(
        _extract_time_token(sfx) is not None for sfx, _ in parsed.values()
    )

    # Phase 3: classify each remaining file.
    timed = []  # (filename, minutes, group prefix)
    for name, (sfx, modifiers) in parsed.items():
        category, order = _categorize_suffix(sfx, has_explicit_time)
        temperature = _extract_temperature(sfx)

        entry = FileClassification(
            category=category,
            sort_key=order,
            modifiers=modifiers,
            temperature=temperature,
        )

        if category == 'tracking':
            token = _extract_time_token(sfx)
            if token is None:
                entry.group_prefix = "__notime__"
            else:
                minutes, token_start, _token_end = token
                # The group prefix is whatever precedes the time token.
                head = sfx[:token_start].rstrip('-').rstrip(' ').rstrip('_')
                entry.group_prefix = head
                entry.sort_key = minutes
                timed.append((name, minutes, head))

        batch.files[name] = entry

    # Phase 4: bucket timed tracking files by prefix, then order the
    # buckets by their median time and the members by their own time.
    by_prefix = defaultdict(list)
    for name, minutes, head in timed:
        by_prefix[head].append((name, minutes))

    ordered_groups = []
    for head in sorted(
            by_prefix,
            key=lambda key: median([t for _, t in by_prefix[key]])):
        members = by_prefix[head]
        members.sort(key=lambda pair: pair[1])
        ordered_groups.append(TrackingGroup(prefix=head, files=members))

    # Phase 5: spread sort keys across groups. No PDF timestamps exist at
    # this point, so this call uses the fallback mode (arbitrary +100 min
    # gap between groups); callers may recalibrate later by calling
    # calibrate_sort_keys_hybrid() again with real timestamps.
    calibrate_sort_keys_hybrid(ordered_groups, batch)

    # Stamp the owning group's prefix onto each tracking file.
    for group in ordered_groups:
        for name, _ in group.files:
            if name in batch.files:
                batch.files[name].group_prefix = group.prefix

    batch.tracking_groups = ordered_groups

    # Phase 6: note whether any file was classified as 'final'.
    batch.has_final = any(
        entry.category == 'final' for entry in batch.files.values()
    )

    return batch
831
+
832
+
833
+ # ---------------------------------------------------------------------------
834
+ # Hybrid sort key calibration (filename for group 1, timestamps for group 2+)
835
+ # ---------------------------------------------------------------------------
836
+
837
def calibrate_sort_keys_hybrid(
    sorted_groups: List['TrackingGroup'],
    result: 'BatchResult',
    run_datetimes: Optional[Dict[str, str]] = None,
) -> None:
    """
    Assign sort keys to tracking files across multiple tracking groups.

    Group 1 (or single-group reactions): uses ONLY filename-derived time
    tokens. The chemist often prepares samples ahead and may submit them
    out of order on the instrument queue — filename order reflects the
    intended chronology.

    Groups 2+: uses actual PDF acquisition timestamps when available.
    The real time gap between the last sample of group N-1 and each
    sample of group N is added to the previous group's highest sort key,
    and the group's ``files`` list is re-ordered by acquisition time.

    If any timestamp needed for a group is missing or cannot be parsed,
    that group is handled entirely by the fallback (previous maximum
    + 100 min + filename-derived time) and its ``files`` order is left
    untouched, so the list order always agrees with the assigned keys.

    Groups with no files receive ``offset`` equal to the running maximum
    and are otherwise skipped.

    Args:
        sorted_groups: TrackingGroup list sorted by median time. Each
            group's ``offset`` (and possibly ``files``) is updated in place.
        result: BatchResult whose ``files[fn].sort_key`` values are updated.
        run_datetimes: Optional mapping of filename → "YYYY-MM-DD HH:MM:SS".
            When None, falls back to arbitrary +100 min gap
            (suitable for categorization-time before PDFs are parsed).
    """
    from datetime import datetime as _dt

    if not sorted_groups:
        return

    ts_format = "%Y-%m-%d %H:%M:%S"

    def _parse_group_times(prev_fn, files):
        """Parse all needed timestamps up front; None if any is unusable."""
        if not run_datetimes or not prev_fn:
            return None
        try:
            # strptime raises TypeError for a missing (None) entry and
            # ValueError for a malformed one; either means "no real gap".
            anchor = _dt.strptime(run_datetimes.get(prev_fn), ts_format)
            stamped = [
                (fn, t, _dt.strptime(run_datetimes.get(fn), ts_format))
                for fn, t in files
            ]
        except (ValueError, TypeError):
            return None
        return anchor, stamped

    prev_max_sk = 0.0
    prev_max_fn = None  # filename of file with highest sort_key in prev group

    for i, group in enumerate(sorted_groups):
        if not group.files:
            # Nothing to key; keep the running maximum for the next group.
            group.offset = prev_max_sk
            continue

        if i == 0:
            # Group 1: ONLY filename-derived time tokens
            group.offset = 0.0
            for fn, time_min in group.files:
                result.files[fn].sort_key = time_min
            prev_max_sk = max(t for _, t in group.files)
            prev_max_fn = max(group.files, key=lambda x: x[1])[0]
            continue

        parsed = _parse_group_times(prev_max_fn, group.files)

        if parsed is not None:
            # Groups 2+: every timestamp parsed, so it is now safe to
            # mutate the group.
            prev_dt, stamped = parsed
            stamped.sort(key=lambda item: item[2])
            group.files = [(fn, t) for fn, t, _ in stamped]
            group.offset = prev_max_sk

            for fn, _orig_t, curr_dt in stamped:
                offset_min = (curr_dt - prev_dt).total_seconds() / 60
                if offset_min < 0:
                    offset_min = 0  # safety: clock skew
                result.files[fn].sort_key = prev_max_sk + offset_min

            prev_max_fn = max(
                group.files,
                key=lambda x: result.files[x[0]].sort_key)[0]
            prev_max_sk = max(result.files[fn].sort_key
                              for fn, _ in group.files)
        else:
            # Fallback: arbitrary +100 min gap
            group.offset = prev_max_sk + 100
            for fn, time_min in group.files:
                result.files[fn].sort_key = group.offset + time_min
            prev_max_sk = max(result.files[fn].sort_key
                              for fn, _ in group.files)
            prev_max_fn = max(group.files, key=lambda x: x[1])[0]