parseet 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parseet/.DS_Store +0 -0
- parseet/__init__.py +2 -0
- parseet/app.py +63 -0
- parseet/backend.py +283 -0
- parseet/cli.py +177 -0
- parseet/config/__init__.py +6 -0
- parseet/config/config.toml +7 -0
- parseet/core/single_process.py +45 -0
- parseet/core/utils/__init__.py +5 -0
- parseet/core/utils/build_parser.py +33 -0
- parseet/core/utils/check_order.py +426 -0
- parseet/core/utils/lcms_check_samples.py +397 -0
- parseet/core/utils/parse_samplesheet.py +363 -0
- parseet/core/utils/setup_logger.py +24 -0
- parseet/main.qml +1057 -0
- parseet/version.py +1 -0
- parseet-0.2.0.dist-info/METADATA +56 -0
- parseet-0.2.0.dist-info/RECORD +20 -0
- parseet-0.2.0.dist-info/WHEEL +4 -0
- parseet-0.2.0.dist-info/entry_points.txt +6 -0
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
import uuid
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
# ---------------------------------------------------------------------------
|
|
10
|
+
# Constants
|
|
11
|
+
# ---------------------------------------------------------------------------
|
|
12
|
+
|
|
13
|
+
# Expected raw-file name layout: <index>_<sample>_<column>_<mode>.raw
# The pattern is split per field; the adjacent literals are concatenated into
# one regular expression.
FILENAME_PATTERN = re.compile(
    r"^(?P<index>\d+)"          # leading run of digits
    r"_(?P<sample>.+)"          # sample name (greedy, may contain underscores)
    r"_(?P<column>C18|HILIC)"   # one of the two accepted column labels
    r"_(?P<mode>n|p|np)"        # one of the three accepted mode labels
    r"\.raw$"                   # literal ".raw" extension
)

# Names treated as known controls; they are left out of the raw-file sample
# set when it is checked for samples missing from the samplesheet.
CONTROL_SAMPLES: frozenset[str] = frozenset(("Blank", "QC", "FM", "FMA", "FMB"))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
# Result helper
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
class CheckResult:
    """Collector for the messages produced during a validation run.

    Messages are kept in three separate lists (``errors``, ``warnings`` and
    ``ok``). Each recording method stores the message and also forwards it to
    the module logger at the matching level.
    """

    def __init__(self):
        # One independent list per severity.
        self.errors: list[str] = []
        self.warnings: list[str] = []
        self.ok: list[str] = []

    @property
    def has_errors(self) -> bool:
        """Whether at least one error has been recorded."""
        return len(self.errors) > 0

    def error(self, msg: str) -> None:
        """Store *msg* as an error and log it at ERROR level."""
        self.errors.append(msg)
        logger.error(msg)

    def warn(self, msg: str) -> None:
        """Store *msg* as a warning and log it at WARNING level."""
        self.warnings.append(msg)
        logger.warning(msg)

    def good(self, msg: str) -> None:
        """Store *msg* as a passed check and log it at INFO level."""
        self.ok.append(msg)
        logger.info(msg)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
# Filename helpers
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
|
|
53
|
+
def _numeric_prefix(path: Path) -> int:
|
|
54
|
+
"""Return the leading integer of a filename, or inf if unparseable."""
|
|
55
|
+
try:
|
|
56
|
+
return int(path.name.split("_", 1)[0])
|
|
57
|
+
except ValueError:
|
|
58
|
+
return float("inf")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _parse_filename(filename: str) -> dict | None:
    """Match *filename* against FILENAME_PATTERN.

    Returns the dictionary of named groups when the name matches, and
    ``None`` when it does not.
    """
    match = FILENAME_PATTERN.match(filename)
    if match is None:
        return None
    return match.groupdict()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _build_filename(index: int, sample: str, column: str, mode: str, leading_zeros: int) -> str:
    """Assemble a name of the form ``<index>_<sample>_<column>_<mode>.raw``.

    *index* is zero-padded to *leading_zeros* digits, and every underscore in
    *sample* is replaced by a hyphen.
    """
    padded_index = format(index, f"0{leading_zeros}d")
    hyphenated_sample = sample.replace("_", "-")
    return f"{padded_index}_{hyphenated_sample}_{column}_{mode}.raw"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
# Sub-functions
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
|
|
75
|
+
def _collect_raw_files(folder: Path, result: CheckResult) -> list[Path]:
    """
    List the files matching ``*.raw`` directly inside *folder*, sorted.

    When nothing matches, an error is recorded on *result* and the returned
    list is empty; otherwise the number of files found is recorded as a
    passed check.
    """
    found = list(folder.glob("*.raw"))
    found.sort()

    if found:
        result.good(f"Found {len(found)} .raw file(s) in '{folder}'")
        return found

    result.error(f"No .raw files found in '{folder}'")
    return found
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _check_time_vs_alpha_order(
    raw_files: list[Path],
    result: CheckResult,
) -> tuple[list[Path], list[Path], bool]:
    """
    Compare alphabetical (numeric-prefix) order against last-modified order.

    Parameters
    ----------
    raw_files:
        The ``.raw`` files to inspect.
    result:
        Collector that receives one passed-check or warning message. No
        message is recorded when *raw_files* is empty.

    Returns
    -------
    alpha_order, time_order:
        *raw_files* sorted by numeric filename prefix and by modification
        time (``st_mtime``) respectively.
    already_correct:
        True when both orders are identical, the first file carries index 1
        zero-padded to the width used when building new names (the number of
        digits in the file count), and all names have exactly 3 underscores.
        Always False for an empty *raw_files*.
    """
    if not raw_files:
        # Nothing to compare, and there is no first file to inspect below.
        return [], [], False

    alpha_order = sorted(raw_files, key=_numeric_prefix)
    time_order = sorted(raw_files, key=lambda f: f.stat().st_mtime)

    # New names are padded to as many digits as the file count has, so the
    # first index is "1", "01", "001", ... depending on how many files exist.
    expected_first = "1".zfill(len(str(len(raw_files))))
    first_prefix = alpha_order[0].name.split("_", 1)[0]

    already_correct = (
        alpha_order == time_order
        and first_prefix == expected_first
        and all(f.name.count("_") == 3 for f in alpha_order)
    )

    if already_correct:
        result.good("Alphabetical order matches last-modified time order")
    else:
        result.warn("Alphabetical order does NOT match last-modified time order")

    return alpha_order, time_order, already_correct
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _parse_and_build_rename_map(
    time_order: list[Path],
    result: CheckResult,
) -> tuple[dict[str, str], list[str], dict[str, list[Path]]]:
    """
    Parse every file in *time_order* and build the rename map.

    Each file receives a new name whose index is its 1-based position in
    *time_order*, zero-padded to the number of digits in ``len(time_order)``.
    Underscores inside the sample name are replaced by hyphens. A warning is
    recorded on *result* for every sample name that occurs in more than one
    file.

    Returns
    -------
    rename_map:
        ``{old_name: new_name}`` for every successfully parsed file.
    samples:
        Ordered list of sample names (one per parsed file).
    sample_to_files:
        Mapping of sample name → list of Path objects.

    Raises
    ------
    ValueError
        If any filename does not match FILENAME_PATTERN. All offending names
        are listed in the message.
    """
    rename_map: dict[str, str] = {}
    samples: list[str] = []
    sample_to_files: dict[str, list[Path]] = defaultdict(list)
    invalid_files: list[str] = []

    # Loop-invariant: the padding width depends only on the number of files.
    leading_zeros = len(str(len(time_order)))

    for idx, file in enumerate(time_order, start=1):
        parsed = _parse_filename(file.name)
        if not parsed:
            # Keep going so that every invalid name is reported at once.
            invalid_files.append(file.name)
            continue

        sample = parsed["sample"].replace("_", "-")
        new_name = _build_filename(idx, sample, parsed["column"], parsed["mode"], leading_zeros)

        rename_map[file.name] = new_name
        samples.append(sample)
        sample_to_files[sample].append(file)
        logger.debug("Parsed '%s' → sample='%s' new_name='%s'", file.name, sample, new_name)

    if invalid_files:
        raise ValueError("Invalid filenames:\n" + "\n".join(invalid_files))

    # Warn about duplicate sample names. sample_to_files already groups the
    # files per sample in first-seen order, so a duplicate is simply a group
    # holding more than one file.
    for sample, files in sample_to_files.items():
        if len(files) > 1:
            result.warn(
                f"Duplicate sample name '{sample}' appears {len(files)} time(s) "
                "in raw files — may cause renaming and validation issues"
            )

    return rename_map, samples, sample_to_files
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _validate_against_samplesheet(
    samplesheet,  # pd.DataFrame | None
    samples: list[str],
    sample_to_files: dict[str, list[Path]],
    ignore_id: bool,
    result: CheckResult,
) -> None:
    """
    Cross-check parsed raw-file sample names against the samplesheet.

    Errors are registered for:
      - samples present in raw files but absent from the samplesheet
        (names in CONTROL_SAMPLES and ``QC``-prefixed names are exempt;
        ``ID``-prefixed samples are treated separately and can be suppressed
        with *ignore_id*)
      - samples present in the samplesheet but absent from raw files

    Duplicate sample names in the samplesheet are registered as warnings.
    Nothing is checked when *samplesheet* is ``None``.

    Parameters
    ----------
    samplesheet:
        DataFrame with a ``"Sample name"`` column, or ``None``. Names are
        converted to ``str`` and stripped of surrounding whitespace before
        comparison.
    samples:
        Sample names parsed from the raw filenames.
    sample_to_files:
        Mapping of sample name → raw files, used to list the affected files
        in error messages.
    ignore_id:
        When True, ``ID``-prefixed samples missing from the samplesheet are
        recorded as a passed check instead of as errors.
    result:
        Collector receiving the messages.
    """
    if samplesheet is None:
        logger.debug("No samplesheet provided — skipping cross-validation")
        return

    sheet_names_series = samplesheet["Sample name"].astype(str).str.strip()
    sheet_samples = set(sheet_names_series)

    # Warn about samplesheet duplicates (counted once, not once per name).
    if len(sheet_samples) != len(sheet_names_series):
        dupes = sheet_names_series[sheet_names_series.duplicated(keep=False)]
        dupe_counts = dupes.value_counts()
        for name in sorted(dupe_counts.index):
            result.warn(
                f"Duplicate samplesheet entry: '{name}' appears {dupe_counts[name]} time(s)"
            )

    all_raw_samples = set(samples)

    # Raw samples, excluding known controls: controls are not required to be
    # listed in the samplesheet.
    raw_samples = {
        s for s in all_raw_samples
        if s not in CONTROL_SAMPLES and not s.startswith("QC")
    }

    missing_in_sheet = raw_samples - sheet_samples
    # Compare against ALL raw samples here: a control that is listed in the
    # samplesheet and does have a raw file must not be reported as missing.
    missing_in_raw = sheet_samples - all_raw_samples

    if missing_in_sheet:
        id_samples = {s for s in missing_in_sheet if s.startswith("ID")}
        other_samples = missing_in_sheet - id_samples

        if id_samples:
            if ignore_id:
                result.good(
                    f"Ignoring {len(id_samples)} 'ID'-prefixed sample(s) "
                    "missing from samplesheet (--ignore-id active)"
                )
            else:
                for s in sorted(id_samples):
                    files = ", ".join(f.name for f in sample_to_files[s])
                    result.error(
                        f"Sample '{s}' starts with 'ID' and is missing from "
                        f"the samplesheet — files: {files}. "
                        "Use --ignore-id to skip."
                    )

        for s in sorted(other_samples):
            files = ", ".join(f.name for f in sample_to_files[s])
            result.error(
                f"Sample '{s}' is present in raw files but missing from "
                f"the samplesheet — files: {files}"
            )

    for s in sorted(missing_in_raw):
        result.error(
            f"Sample '{s}' is present in the samplesheet but has no "
            "matching .raw file"
        )

    if not missing_in_sheet and not missing_in_raw:
        result.good("All samples in raw files are present in the samplesheet")
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _log_rename_proposals(rename_map: dict[str, str], result: CheckResult) -> None:
    """Record one passed-check message per entry whose name would change.

    When no entry of *rename_map* changes a name, a single message stating
    that no renames are needed is recorded instead.
    """
    any_change = False
    for current, proposed in rename_map.items():
        if current == proposed:
            continue
        any_change = True
        result.good(f"Proposed rename: '{current}' → '{proposed}'")

    if not any_change:
        result.good("No renames needed — all filenames already correct")
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _apply_renames(folder: Path, rename_map: dict[str, str], result: CheckResult) -> None:
    """
    Rename files inside *folder* according to *rename_map*, in two passes.

    Pass one moves each file whose name changes to a unique temporary name;
    pass two moves every temporary file to its final name. Going through
    temporary names avoids collisions when names are swapped. Entries whose
    old and new names are equal are left untouched. The number of renamed
    files is recorded on *result*.
    """
    pending: dict[str, str] = {}

    # Pass 1: park every file that changes under a unique temporary name.
    for source_name, target_name in rename_map.items():
        if source_name != target_name:
            parked_name = f".tmp_{uuid.uuid4().hex}_{source_name}"
            (folder / source_name).rename(folder / parked_name)
            pending[parked_name] = target_name
            logger.debug("Phase-1 rename: '%s' → '%s'", source_name, parked_name)

    # Pass 2: give each parked file its final name.
    for parked_name, target_name in pending.items():
        parked_path = folder / parked_name
        parked_path.rename(folder / target_name)
        logger.debug("Phase-2 rename: '%s' → '%s'", parked_name, target_name)

    result.good(f"Renamed {len(pending)} file(s)")
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _write_rename_log(
    folder: Path,
    rename_map: dict[str, str],
    output_txt: str,
    result: CheckResult,
) -> None:
    """Write the old→new mapping to a tab-separated text file.

    The file is created (or overwritten) at ``folder / output_txt`` with one
    ``old_name<TAB>new_name`` line per entry of *rename_map*, encoded as
    UTF-8. The location of the written file is recorded on *result*.
    """
    output_path = folder / output_txt
    # Explicit UTF-8 so the log does not depend on the platform's default
    # encoding, which may be unable to represent some file names.
    with open(output_path, "w", encoding="utf-8") as fh:
        fh.writelines(
            f"{old_name}\t{new_name}\n" for old_name, new_name in rename_map.items()
        )
    result.good(f"Rename log written to: '{output_path}'")
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
# ---------------------------------------------------------------------------
|
|
309
|
+
# Public entry point
|
|
310
|
+
# ---------------------------------------------------------------------------
|
|
311
|
+
|
|
312
|
+
def check_raw_file_order(
    folder_path,
    samplesheet=None,
    output_txt: str = "correct_time_order.txt",
    dry_run: bool = True,
    ignore_id: bool = False,
) -> tuple[dict[str, str], list[str], list[str], list[str]]:
    """
    Check, and optionally repair, the numbering of the .raw files in a folder.

    The files are ordered once by numeric filename prefix and once by
    last-modified time. Sample names taken from the filenames are compared
    with the samplesheet (when one is given). Unless the numbering is already
    correct, the proposed renames are reported, applied when allowed, and
    written to a log file.

    Parameters
    ----------
    folder_path:
        Directory holding the ``.raw`` files.
    samplesheet:
        Optional DataFrame with a ``"Sample name"`` column used for the
        cross-check. ``None`` skips the cross-check.
    output_txt:
        File name of the tab-separated rename log created inside
        *folder_path*.
    dry_run:
        ``True`` (default) only proposes renames; ``False`` applies them,
        provided no validation error was recorded.
    ignore_id:
        ``True`` suppresses errors for ``"ID"``-prefixed samples that are
        missing from the samplesheet.

    Returns
    -------
    rename_map:
        ``{old_filename: new_filename}`` for every parsed .raw file (empty
        when the folder holds no .raw files).
    errors:
        Blocking error messages collected during validation.
    warnings:
        Non-blocking warning messages.
    ok:
        Passed-check messages.

    Raises
    ------
    ValueError
        If any filename does not match the expected pattern.
    """
    folder = Path(folder_path)
    report = CheckResult()
    logger.info("Checking raw file order in '%s'", folder)

    # Step 1: gather the .raw files; without any there is nothing to check.
    raw_files = _collect_raw_files(folder, report)
    if not raw_files:
        return {}, report.errors, report.warnings, report.ok

    # Step 2: prefix order versus modification-time order.
    ordering = _check_time_vs_alpha_order(raw_files, report)
    time_order = ordering[1]
    already_correct = ordering[2]

    # Step 3: parse the names in time order (raises on invalid filenames).
    rename_map, samples, sample_to_files = _parse_and_build_rename_map(time_order, report)

    # Step 4: the samplesheet cross-check runs whether or not the order is fine.
    _validate_against_samplesheet(samplesheet, samples, sample_to_files, ignore_id, report)

    # Step 5: correct numbering means no proposals, no renames and no log file.
    if already_correct:
        logger.info("Order is already correct — nothing to rename")
        return rename_map, report.errors, report.warnings, report.ok

    # Step 6: report what would be renamed.
    _log_rename_proposals(rename_map, report)

    # Step 7: rename only when validation passed and this is not a dry run.
    if report.has_errors:
        report.warn("Skipping rename step due to validation errors")
    elif dry_run:
        report.good("Dry run enabled — no files were renamed")
    else:
        _apply_renames(folder, rename_map, report)

    # Step 8: the log is written in all three cases of step 7.
    _write_rename_log(folder, rename_map, output_txt, report)

    error_count = len(report.errors)
    warning_count = len(report.warnings)
    logger.info("Done — %d error(s), %d warning(s)", error_count, warning_count)

    return rename_map, report.errors, report.warnings, report.ok
|
|
408
|
+
|
|
409
|
+
if __name__ == "__main__":
    # Ad-hoc manual run against local test files. dry_run=True, so no file
    # is renamed.
    import pandas as pd
    from parse_samplesheet import parse_samplesheet
    from setup_logger import configure_logging

    configure_logging(level=logging.INFO)

    sheet_path = "test_files/LC-MS_metabolomics_PLASMA.xlsx"
    df = pd.read_excel(sheet_path, header=None)
    # Only the first of the five returned values is needed here.
    sample_df, _, _, _, _ = parse_samplesheet(df)

    rename_map, errors, warnings, ok = check_raw_file_order(
        "test_files/raw_files",
        samplesheet=sample_df,
        output_txt="proposed_renames.txt",
        dry_run=True,
        ignore_id=True,
    )
    # The collected messages are not printed here; each one was already
    # passed to the module logger when it was recorded.
|