parseet 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,426 @@
1
+ from collections import defaultdict
2
+ from pathlib import Path
3
+ import logging
4
+ import re
5
+ import uuid
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ # ---------------------------------------------------------------------------
10
+ # Constants
11
+ # ---------------------------------------------------------------------------
12
+
13
# Expected raw-file name layout: <index>_<sample>_<column>_<mode>.raw
#   index  : one or more digits
#   sample : free text (may itself contain underscores)
#   column : C18 or HILIC
#   mode   : n, p or np
FILENAME_PATTERN = re.compile(
    r"^(?P<index>\d+)_(?P<sample>.+)_(?P<column>C18|HILIC)_(?P<mode>n|p|np)\.raw$",
)

# Sample names that are left out of the raw-file side of the samplesheet
# cross-check.
CONTROL_SAMPLES: frozenset[str] = frozenset(
    ("Blank", "QC", "FM", "FMA", "FMB"),
)
18
+
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # Result helper
22
+ # ---------------------------------------------------------------------------
23
+
24
class CheckResult:
    """Collector for the messages produced during a validation run.

    Messages are stored in three lists (``errors``, ``warnings``, ``ok``)
    and each one is also sent to the module logger at the matching level.
    """

    def __init__(self):
        self.errors: list[str] = []
        self.warnings: list[str] = []
        self.ok: list[str] = []

    def error(self, msg: str) -> None:
        """Store *msg* as an error and log it at ERROR level."""
        self.errors.append(msg)
        logger.error(msg)

    def warn(self, msg: str) -> None:
        """Store *msg* as a warning and log it at WARNING level."""
        self.warnings.append(msg)
        logger.warning(msg)

    def good(self, msg: str) -> None:
        """Store *msg* as a passed-check note and log it at INFO level."""
        self.ok.append(msg)
        logger.info(msg)

    @property
    def has_errors(self) -> bool:
        """True once at least one error has been recorded."""
        return len(self.errors) > 0
47
+
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Filename helpers
51
+ # ---------------------------------------------------------------------------
52
+
53
+ def _numeric_prefix(path: Path) -> int:
54
+ """Return the leading integer of a filename, or inf if unparseable."""
55
+ try:
56
+ return int(path.name.split("_", 1)[0])
57
+ except ValueError:
58
+ return float("inf")
59
+
60
+
61
def _parse_filename(filename: str) -> dict | None:
    """Split *filename* into the named groups of FILENAME_PATTERN.

    Returns the match's ``groupdict()``, or ``None`` when *filename* does
    not match the pattern.
    """
    match = FILENAME_PATTERN.match(filename)
    if match is None:
        return None
    return match.groupdict()
65
+
66
+
67
+ def _build_filename(index: int, sample: str, column: str, mode: str, leading_zeros: int) -> str:
68
+ return f"{index:0{leading_zeros}d}_{sample.replace('_', '-')}_{column}_{mode}.raw"
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Sub-functions
73
+ # ---------------------------------------------------------------------------
74
+
75
def _collect_raw_files(folder: Path, result: CheckResult) -> list[Path]:
    """
    List the ``*.raw`` entries directly inside *folder*, sorted by path.

    When nothing matches, an error is registered on *result* and the empty
    list is returned; otherwise the number of matches is registered as a
    passed check.
    """
    matches = list(folder.glob("*.raw"))
    matches.sort()
    if matches:
        result.good(f"Found {len(matches)} .raw file(s) in '{folder}'")
        return matches
    result.error(f"No .raw files found in '{folder}'")
    return matches
87
+
88
+
89
def _check_time_vs_alpha_order(
    raw_files: list[Path],
    result: CheckResult,
) -> tuple[list[Path], list[Path], bool]:
    """
    Compare numeric-prefix order against last-modified order.

    Returns
    -------
    alpha_order, time_order:
        *raw_files* sorted by numeric filename prefix and by ``st_mtime``.
    already_correct:
        True when the two orders match, the first file's index is ``1``
        padded to at least the number of digits the file count requires
        (the width the rename map pads to), and every name has exactly
        3 underscores.  An empty *raw_files* yields ``([], [], True)``.
    """
    # Nothing to compare (and nothing to rename); also avoids indexing an
    # empty list below.
    if not raw_files:
        return [], [], True

    alpha_order = sorted(raw_files, key=_numeric_prefix)
    time_order = sorted(raw_files, key=lambda f: f.stat().st_mtime)

    orders_match = alpha_order == time_order

    # A literal "01" check cannot be satisfied by the "1_"/"001_" names the
    # rename map generates for <10 or >=100 files, and it wrongly accepts
    # "010_".  Compare the numeric value and the minimum padding instead.
    first_prefix = alpha_order[0].name.split("_", 1)[0]
    min_width = len(str(len(alpha_order)))
    starts_at_one = (
        _numeric_prefix(alpha_order[0]) == 1 and len(first_prefix) >= min_width
    )
    underscores_ok = all(f.name.count("_") == 3 for f in alpha_order)

    already_correct = orders_match and starts_at_one and underscores_ok

    if orders_match:
        result.good("Alphabetical order matches last-modified time order")
        # Report the actual reason instead of claiming an order mismatch.
        if not starts_at_one:
            result.warn(
                "File numbering does not start at index 1 with the expected "
                "zero-padding"
            )
        if not underscores_ok:
            result.warn("Not every filename has exactly 3 underscores")
    else:
        result.warn("Alphabetical order does NOT match last-modified time order")

    return alpha_order, time_order, already_correct
119
+
120
+
121
def _parse_and_build_rename_map(
    time_order: list[Path],
    result: CheckResult,
) -> tuple[dict[str, str], list[str], dict[str, list[Path]]]:
    """
    Parse every file in *time_order* and build the rename map.

    Files are numbered from 1 in the order given; the index is zero-padded
    to the number of digits in ``len(time_order)``.

    Returns
    -------
    rename_map:
        ``{old_name: new_name}`` for every successfully parsed file.
    samples:
        Ordered list of sample names (one per parsed file), with
        underscores replaced by hyphens.
    sample_to_files:
        Mapping of sample name → list of Path objects.

    Raises
    ------
    ValueError
        If any filename does not match FILENAME_PATTERN.
    """
    rename_map: dict[str, str] = {}
    samples: list[str] = []
    sample_to_files: dict[str, list[Path]] = defaultdict(list)
    invalid_files: list[str] = []

    # The padding width depends only on the file count, so compute it once.
    leading_zeros = len(str(len(time_order)))

    for idx, file in enumerate(time_order, start=1):
        parsed = _parse_filename(file.name)
        if not parsed:
            invalid_files.append(file.name)
            continue

        sample = parsed["sample"].replace("_", "-")
        new_name = _build_filename(idx, sample, parsed["column"], parsed["mode"], leading_zeros)

        rename_map[file.name] = new_name
        samples.append(sample)
        sample_to_files[sample].append(file)
        logger.debug("Parsed '%s' → sample='%s' new_name='%s'", file.name, sample, new_name)

    if invalid_files:
        raise ValueError("Invalid filenames:\n" + "\n".join(invalid_files))

    # Warn about duplicate sample names.  sample_to_files preserves
    # first-seen order and already groups the files per sample, so one pass
    # over it replaces the repeated list.count() scans.
    for sample, files in sample_to_files.items():
        if len(files) > 1:
            result.warn(
                f"Duplicate sample name '{sample}' appears {len(files)} time(s) "
                "in raw files — may cause renaming and validation issues"
            )

    return rename_map, samples, sample_to_files
177
+
178
+
179
def _validate_against_samplesheet(
    samplesheet,  # pd.DataFrame | None
    samples: list[str],
    sample_to_files: dict[str, list[Path]],
    ignore_id: bool,
    result: CheckResult,
) -> None:
    """
    Cross-check parsed raw-file sample names against the samplesheet.

    Errors are registered for:
    - samples present in raw files but absent from the samplesheet
      (control samples are exempt; ``ID``-prefixed samples are treated
      separately and can be suppressed with *ignore_id*)
    - samples present in the samplesheet but absent from raw files

    Duplicate sample names in the samplesheet are registered as warnings.
    Missing (NaN) and blank ``"Sample name"`` cells are ignored.
    """
    if samplesheet is None:
        logger.debug("No samplesheet provided — skipping cross-validation")
        return

    # Drop missing cells before the str conversion; otherwise NaN becomes
    # the literal sample name "nan" and is reported as a missing raw file.
    sheet_names = samplesheet["Sample name"].dropna().astype(str).str.strip()
    sheet_names = sheet_names[sheet_names != ""]
    sheet_samples = set(sheet_names)

    # Warn about samplesheet duplicates
    name_counts = sheet_names.value_counts().to_dict()
    for name in sorted(n for n, c in name_counts.items() if c > 1):
        result.warn(
            f"Duplicate samplesheet entry: '{name}' appears {name_counts[name]} time(s)"
        )

    # NOTE(review): raw-file sample names arrive with '_' already replaced
    # by '-', while samplesheet names are compared verbatim — confirm the
    # samplesheet follows the same convention.
    all_raw_samples = set(samples)

    # Raw samples, excluding known controls (these need no samplesheet row)
    raw_samples = {
        s for s in all_raw_samples
        if s not in CONTROL_SAMPLES and not s.startswith("QC")
    }

    missing_in_sheet = raw_samples - sheet_samples
    # Compare against *all* raw samples: a control that is listed in the
    # samplesheet and has a raw file must not be reported as missing.
    missing_in_raw = sheet_samples - all_raw_samples

    if missing_in_sheet:
        id_samples = {s for s in missing_in_sheet if s.startswith("ID")}
        other_samples = missing_in_sheet - id_samples

        if id_samples:
            if ignore_id:
                result.good(
                    f"Ignoring {len(id_samples)} 'ID'-prefixed sample(s) "
                    "missing from samplesheet (--ignore-id active)"
                )
            else:
                for s in sorted(id_samples):
                    files = ", ".join(f.name for f in sample_to_files[s])
                    result.error(
                        f"Sample '{s}' starts with 'ID' and is missing from "
                        f"the samplesheet — files: {files}. "
                        "Use --ignore-id to skip."
                    )

        for s in sorted(other_samples):
            files = ", ".join(f.name for f in sample_to_files[s])
            result.error(
                f"Sample '{s}' is present in raw files but missing from "
                f"the samplesheet — files: {files}"
            )

    if missing_in_raw:
        for s in sorted(missing_in_raw):
            result.error(
                f"Sample '{s}' is present in the samplesheet but has no "
                "matching .raw file"
            )

    if not missing_in_sheet and not missing_in_raw:
        result.good("All samples in raw files are present in the samplesheet")
257
+
258
+
259
def _log_rename_proposals(rename_map: dict[str, str], result: CheckResult) -> None:
    """Register one message per entry whose new name differs from the old.

    When no entry differs, a single "no renames needed" message is
    registered instead.
    """
    any_change = False
    for old, new in rename_map.items():
        if old == new:
            continue
        any_change = True
        result.good(f"Proposed rename: '{old}' → '{new}'")

    if not any_change:
        result.good("No renames needed — all filenames already correct")
267
+
268
+
269
def _apply_renames(folder: Path, rename_map: dict[str, str], result: CheckResult) -> None:
    """
    Rename files using a two-phase temp-name strategy so that swapped or
    chained names never collide.

    Phase 1 moves every file whose name changes to a unique temporary name;
    phase 2 moves each temporary file to its final name.  The sequence as a
    whole is not atomic: if a step fails, the renames already performed are
    undone on a best-effort basis and the error is re-raised.

    Raises
    ------
    OSError
        If a rename fails, or (as ``FileExistsError``) if a final name is
        already taken by a file that is not part of this rename.
    """
    staged: list[tuple[Path, Path, Path]] = []  # (original, temp, final)
    finalized: list[tuple[Path, Path]] = []     # (temp, final) done in phase 2

    try:
        # Phase 1: move every changed file to a unique temp name
        for old_name, new_name in rename_map.items():
            if old_name == new_name:
                continue
            original = folder / old_name
            temp = folder / f".tmp_{uuid.uuid4().hex}_{old_name}"
            original.rename(temp)
            staged.append((original, temp, folder / new_name))
            logger.debug("Phase-1 rename: '%s' → '%s'", old_name, temp.name)

        # Phase 2: move each temp file to its final name
        for _original, temp, final in staged:
            # Path.rename silently replaces an existing file on POSIX;
            # refuse rather than destroy data.
            if final.exists():
                raise FileExistsError(
                    f"Refusing to overwrite existing file '{final}'"
                )
            temp.rename(final)
            finalized.append((temp, final))
            logger.debug("Phase-2 rename: '%s' → '%s'", temp.name, final.name)
    except OSError:
        logger.exception("Rename failed — restoring original filenames")
        # Undo in two passes (final → temp, then temp → original) so that
        # restoring one file can never clobber another.
        for temp, final in reversed(finalized):
            try:
                final.rename(temp)
            except OSError:
                logger.exception("Could not move '%s' back to '%s'", final.name, temp.name)
        for original, temp, _final in reversed(staged):
            try:
                temp.rename(original)
            except OSError:
                logger.exception("Could not restore '%s' from '%s'", original.name, temp.name)
        raise

    result.good(f"Renamed {len(staged)} file(s)")
292
+
293
+
294
def _write_rename_log(
    folder: Path,
    rename_map: dict[str, str],
    output_txt: str,
    result: CheckResult,
) -> None:
    """Write the old→new mapping to a tab-separated text file.

    The file is created (or overwritten) at ``folder / output_txt`` with one
    ``old_name<TAB>new_name`` line per entry of *rename_map*, and its path
    is registered on *result*.
    """
    output_path = folder / output_txt
    # Explicit UTF-8: without it the encoding follows the platform locale,
    # so the log could differ between machines or fail on non-ASCII names.
    with open(output_path, "w", encoding="utf-8") as fh:
        fh.writelines(
            f"{old_name}\t{new_name}\n" for old_name, new_name in rename_map.items()
        )
    result.good(f"Rename log written to: '{output_path}'")
306
+
307
+
308
+ # ---------------------------------------------------------------------------
309
+ # Public entry point
310
+ # ---------------------------------------------------------------------------
311
+
312
def check_raw_file_order(
    folder_path,
    samplesheet=None,
    output_txt: str = "correct_time_order.txt",
    dry_run: bool = True,
    ignore_id: bool = False,
) -> tuple[dict[str, str], list[str], list[str], list[str]]:
    """
    Check that the numeric prefixes of ``.raw`` files follow acquisition order.

    The files in *folder_path* are ordered once by numeric filename prefix
    and once by last-modified time.  Sample names parsed from the filenames
    are cross-checked against the samplesheet (when one is given).  If the
    naming is not already correct, a rename map following the time order is
    proposed, applied unless *dry_run* is set or validation errors exist,
    and written to a log file.

    Parameters
    ----------
    folder_path:
        Directory containing the ``.raw`` files.
    samplesheet:
        Optional DataFrame with a ``"Sample name"`` column used for the
        sample-name cross-check.
    output_txt:
        Name of the tab-separated rename-log file written inside
        *folder_path*.
    dry_run:
        When ``True`` (default), renames are only proposed, not applied.
    ignore_id:
        When ``True``, samples whose names start with ``"ID"`` and are
        absent from the samplesheet do not produce errors.

    Returns
    -------
    rename_map:
        ``{old_filename: new_filename}`` for every parsed .raw file
        (empty when no .raw files were found).
    errors:
        Blocking error messages collected during validation.
    warnings:
        Non-blocking warning messages.
    ok:
        Passed-check messages.

    Raises
    ------
    ValueError
        If any filename does not match the expected pattern.
    """
    report = CheckResult()
    target_dir = Path(folder_path)
    logger.info("Checking raw file order in '%s'", target_dir)

    # Step 1 - discover the .raw files; without any there is nothing to do.
    discovered = _collect_raw_files(target_dir, report)
    if not discovered:
        return {}, report.errors, report.warnings, report.ok

    # Step 2 - prefix order versus modification-time order.
    _, by_mtime, order_ok = _check_time_vs_alpha_order(discovered, report)

    # Step 3 - parse names and derive the rename map (raises on bad names).
    rename_map, sample_names, files_by_sample = _parse_and_build_rename_map(
        by_mtime, report
    )

    # Step 4 - samplesheet cross-check, performed whatever the order is.
    _validate_against_samplesheet(
        samplesheet, sample_names, files_by_sample, ignore_id, report
    )

    # Step 5 - a correct order needs no renaming and no log file.
    if order_ok:
        logger.info("Order is already correct — nothing to rename")
        return rename_map, report.errors, report.warnings, report.ok

    # Step 6 - announce what would change.
    _log_rename_proposals(rename_map, report)

    # Step 7 - rename only when validation passed and this is not a dry run.
    if report.has_errors:
        report.warn("Skipping rename step due to validation errors")
    elif dry_run:
        report.good("Dry run enabled — no files were renamed")
    else:
        _apply_renames(target_dir, rename_map, report)

    # Step 8 - persist the mapping.
    _write_rename_log(target_dir, rename_map, output_txt, report)

    logger.info(
        "Done — %d error(s), %d warning(s)",
        len(report.errors),
        len(report.warnings),
    )

    return rename_map, report.errors, report.warnings, report.ok
408
+
409
if __name__ == "__main__":
    # Manual smoke run against the bundled test files (dry run only).
    import pandas as pd
    from parse_samplesheet import parse_samplesheet
    from setup_logger import configure_logging

    configure_logging(level=logging.INFO)

    df = pd.read_excel("test_files/LC-MS_metabolomics_PLASMA.xlsx", header=None)
    # Only the first of the five values returned by parse_samplesheet is used.
    sample_df, _, _, _, _ = parse_samplesheet(df)

    # The returned map and message lists stay bound for interactive
    # inspection; every message has already been emitted through logging.
    rename_map, errors, warnings, ok = check_raw_file_order(
        folder_path="test_files/raw_files",
        samplesheet=sample_df,
        output_txt="proposed_renames.txt",
        dry_run=True,
        ignore_id=True,
    )