parseet 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parseet/.DS_Store +0 -0
- parseet/__init__.py +2 -0
- parseet/app.py +63 -0
- parseet/backend.py +283 -0
- parseet/cli.py +177 -0
- parseet/config/__init__.py +6 -0
- parseet/config/config.toml +7 -0
- parseet/core/single_process.py +45 -0
- parseet/core/utils/__init__.py +5 -0
- parseet/core/utils/build_parser.py +33 -0
- parseet/core/utils/check_order.py +426 -0
- parseet/core/utils/lcms_check_samples.py +397 -0
- parseet/core/utils/parse_samplesheet.py +363 -0
- parseet/core/utils/setup_logger.py +24 -0
- parseet/main.qml +1057 -0
- parseet/version.py +1 -0
- parseet-0.2.0.dist-info/METADATA +56 -0
- parseet-0.2.0.dist-info/RECORD +20 -0
- parseet-0.2.0.dist-info/WHEEL +4 -0
- parseet-0.2.0.dist-info/entry_points.txt +6 -0
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Integration File Sample Checker
|
|
4
|
+
================================
|
|
5
|
+
|
|
6
|
+
Validates and aligns an integration file (Excel or tab-separated) against a reference
|
|
7
|
+
samplesheet.
|
|
8
|
+
|
|
9
|
+
It verifies that:
|
|
10
|
+
- All integration file columns correspond to valid sample names, QC, FM,
|
|
11
|
+
or Blank entries.
|
|
12
|
+
- Naming conventions are respected.
|
|
13
|
+
- All samples appear in both the samplesheet and the integration file.
|
|
14
|
+
|
|
15
|
+
The output is a merged samplesheet ordered to match the integration file
|
|
16
|
+
column order.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import re
|
|
21
|
+
|
|
22
|
+
import pandas as pd
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
# Special column types recognised in integration files
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
_SPECIAL_TYPES: tuple[tuple[str, str], ...] = (
|
|
30
|
+
("QC", "QC"),
|
|
31
|
+
("FM", "FM"),
|
|
32
|
+
("Blank", "Blank"),
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Result helper
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
class CheckResult:
    """Collect validation messages grouped by severity.

    Each recording method stores the message in the list for its severity
    and then forwards the same message to the module logger at the matching
    level (ERROR, WARNING or INFO).
    """

    # Blocking problems, in the order they were reported.
    errors: list[str]
    # Non-blocking problems, in the order they were reported.
    warnings: list[str]
    # Checks that passed, in the order they were reported.
    ok: list[str]

    def __init__(self):
        # Every instance gets its own three independent, initially empty lists.
        self.errors, self.warnings, self.ok = [], [], []

    def error(self, msg: str) -> None:
        """Store *msg* as an error and log it at ERROR level."""
        self.errors += [msg]
        logger.error(msg)

    def warn(self, msg: str) -> None:
        """Store *msg* as a warning and log it at WARNING level."""
        self.warnings += [msg]
        logger.warning(msg)

    def good(self, msg: str) -> None:
        """Store *msg* as a passed check and log it at INFO level."""
        self.ok += [msg]
        logger.info(msg)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
# Sub-functions
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
def _read_data(integration_file: str, software: str) -> pd.DataFrame:
|
|
66
|
+
"""
|
|
67
|
+
Read the integration file based on its software type.
|
|
68
|
+
|
|
69
|
+
Parameters
|
|
70
|
+
----------
|
|
71
|
+
integration_file:
|
|
72
|
+
Path to the Excel / TSV integration file.
|
|
73
|
+
software:
|
|
74
|
+
``"crommy"`` or ``"msdial"``.
|
|
75
|
+
|
|
76
|
+
Returns
|
|
77
|
+
-------
|
|
78
|
+
pandas.DataFrame
|
|
79
|
+
|
|
80
|
+
Raises
|
|
81
|
+
------
|
|
82
|
+
ValueError
|
|
83
|
+
If *software* is not supported.
|
|
84
|
+
"""
|
|
85
|
+
if software == "crommy":
|
|
86
|
+
logger.debug("Reading crommy file: %s", integration_file)
|
|
87
|
+
return pd.read_excel(integration_file, sheet_name="Sheet1", index_col=0)
|
|
88
|
+
if software == "msdial":
|
|
89
|
+
logger.debug("Reading msdial file: %s", integration_file)
|
|
90
|
+
return pd.read_table(integration_file, sep="\t", header=4)
|
|
91
|
+
raise ValueError(f"Unsupported integration file type: '{software}'")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _extract_sample_columns(data: pd.DataFrame, software: str) -> pd.Index:
|
|
95
|
+
"""
|
|
96
|
+
Slice out the intensity/peak columns that follow the sentinel column
|
|
97
|
+
(``"Charge"`` for crommy, ``"MS/MS spectrum"`` for msdial).
|
|
98
|
+
|
|
99
|
+
Parameters
|
|
100
|
+
----------
|
|
101
|
+
data:
|
|
102
|
+
Raw integration DataFrame (already sorted by index).
|
|
103
|
+
software:
|
|
104
|
+
``"crommy"`` or ``"msdial"``.
|
|
105
|
+
|
|
106
|
+
Returns
|
|
107
|
+
-------
|
|
108
|
+
pandas.Index
|
|
109
|
+
Ordered column labels for the sample/QC/blank entries.
|
|
110
|
+
"""
|
|
111
|
+
sentinel = "MS/MS spectrum" if software == "msdial" else "Charge"
|
|
112
|
+
start_index = list(data.columns).index(sentinel)
|
|
113
|
+
last_index = data.columns.get_loc("1") if software == "msdial" else len(data.columns)
|
|
114
|
+
columns = data.columns[start_index + 1 : last_index]
|
|
115
|
+
logger.debug("Extracted %d sample columns after '%s'", len(columns), sentinel)
|
|
116
|
+
return columns
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _build_sample_pattern(samplesheet: pd.DataFrame) -> re.Pattern:
|
|
120
|
+
"""
|
|
121
|
+
Compile a single regex that matches ``<order>_<sample>_<phase>_<polarity>``
|
|
122
|
+
column names and captures the sample name in group 1.
|
|
123
|
+
|
|
124
|
+
Parameters
|
|
125
|
+
----------
|
|
126
|
+
samplesheet:
|
|
127
|
+
Must contain a ``"Sample name"`` column.
|
|
128
|
+
|
|
129
|
+
Returns
|
|
130
|
+
-------
|
|
131
|
+
re.Pattern
|
|
132
|
+
"""
|
|
133
|
+
sample_names = samplesheet["Sample name"].tolist()
|
|
134
|
+
samples_alternation = "|".join(re.escape(s) for s in sample_names)
|
|
135
|
+
pattern = re.compile(rf"\d+_({samples_alternation})_(C18|HILIC)_(np|p|n)")
|
|
136
|
+
logger.debug("Built sample-matching pattern for %d sample name(s)", len(sample_names))
|
|
137
|
+
return pattern
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _classify_columns(
    columns: pd.Index,
    pattern: re.Pattern,
    result: CheckResult,
) -> pd.DataFrame:
    """
    Classify each integration column as a sample or a QC/FM/Blank entry.

    A label must contain exactly three underscores. It is then matched
    against *pattern* (sample columns) and, failing that, against the
    substrings listed in ``_SPECIAL_TYPES``.

    Parameters
    ----------
    columns:
        Integration file column labels to classify.
    pattern:
        Compiled regex from :func:`_build_sample_pattern`.
    result:
        Accumulates the outcome messages; blocking errors are recorded on it
        before the exception is raised.

    Returns
    -------
    pandas.DataFrame
        Columns: ``Sample name``, ``filename``, ``_type``. These three
        columns are present even when *columns* is empty.

    Raises
    ------
    ValueError
        If any column has the wrong number of underscores, or if any column
        cannot be matched to a sample or a known special type. The
        underscore check is reported first.
    """
    matched_entries: list[dict] = []
    wrong_name_entries: list[str] = []
    unmatched_entries: list[str] = []

    for entry in columns:
        # str() keeps a non-string label from raising AttributeError; it is
        # then reported through the normal naming error instead.
        label = str(entry)

        if label.count("_") != 3:
            wrong_name_entries.append(label)
            # The whole check fails below, so do not classify (or report as
            # "good") a column whose name is already known to be malformed.
            continue

        m = pattern.search(label)
        if m:
            matched_entries.append({
                "Sample name": m.group(1),
                "filename": entry,
                "_type": "sample",
            })
            result.good(f"Matched sample column: '{entry}' → '{m.group(1)}'")
            continue

        special_type = next(
            (t for key, t in _SPECIAL_TYPES if key in label), None
        )
        if special_type is None:
            unmatched_entries.append(label)
            continue

        matched_entries.append({
            "Sample name": None,
            "filename": entry,
            "_type": special_type,
        })
        result.good(f"Recognised special column: '{entry}' → {special_type}")

    if wrong_name_entries:
        error_message = (
            "These columns have a wrong number of underscores: "
            + ", ".join(wrong_name_entries)
        )
        result.error(error_message)
        raise ValueError(error_message)
    if unmatched_entries:
        error_message = (
            "These columns have no match in the samplesheet: "
            + ", ".join(unmatched_entries)
        )
        result.error(error_message)
        raise ValueError(error_message)

    # Explicit column names keep the schema stable when nothing was classified.
    return pd.DataFrame(
        matched_entries, columns=["Sample name", "filename", "_type"]
    )
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _merge_and_validate(
    samplesheet: pd.DataFrame,
    matched_df: pd.DataFrame,
    columns: pd.Index,
    result: CheckResult,
) -> pd.DataFrame:
    """
    Outer-merge the samplesheet with the classified columns and verify that
    every samplesheet entry has a corresponding integration column.

    Parameters
    ----------
    samplesheet:
        Original sample metadata DataFrame.
    matched_df:
        Output of :func:`_classify_columns`. It is copied, not modified.
    columns:
        Original ordered column index (used to assign ``col_order``).
    result:
        Accumulates the outcome messages; the blocking error is recorded on
        it before the exception is raised.

    Returns
    -------
    pandas.DataFrame
        Merged DataFrame with ``col_order``, ``polarity``, and
        ``sample_order`` columns added.

    Raises
    ------
    ValueError
        If any samplesheet entry has no matching integration column. The
        integer conversion of the filename prefix can also raise it when the
        text before the first underscore is not a number.
    """
    # Position of each label in the integration file, used later for sorting.
    col_order_map = {name: i for i, name in enumerate(columns)}
    matched_df = matched_df.copy()
    matched_df["col_order"] = matched_df["filename"].map(col_order_map)

    merged_df = pd.merge(samplesheet, matched_df, on="Sample name", how="outer")

    # Rows without a filename came only from the samplesheet side of the merge.
    missing_in_data = merged_df.loc[merged_df["filename"].isna(), "Sample name"]
    if not missing_in_data.empty:
        error_message = (
            "These samplesheet entries have no match in the integration file: "
            # map(str, ...) so a non-string name cannot hide this error
            # behind a TypeError from str.join.
            + ", ".join(map(str, missing_in_data))
        )
        result.error(error_message)
        raise ValueError(error_message)

    result.good(
        f"All {(merged_df['_type'] == 'sample').sum()} sample(s) matched successfully"
    )

    # Derive polarity and injection order from the filename:
    # last underscore-separated token -> polarity, first token -> order.
    split = merged_df["filename"].str.split("_")
    merged_df["polarity"] = split.str[-1].str.lower().astype("category")
    merged_df["sample_order"] = split.str[0].astype(int)

    return merged_df
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _handle_batch_column(
    merged_df: pd.DataFrame,
    result: CheckResult,
) -> tuple[pd.DataFrame, str]:
    """
    Return a copy of *merged_df* that has a string-typed ``"Batch"`` column.

    When the column is absent, every row is assigned batch ``"1"`` and the
    returned message is empty. When it is present, its values are converted
    with ``astype(str)`` and a reminder message is returned and recorded as
    a warning.

    Parameters
    ----------
    merged_df:
        Merged sample DataFrame. The input is never modified.
    result:
        Accumulates the informational or warning message.

    Returns
    -------
    tuple[pandas.DataFrame, str]
        The copied DataFrame and the message string described above.
    """
    # Work on a copy in both cases so the caller's frame stays untouched.
    frame = merged_df.copy()

    if "Batch" in frame.columns:
        # NOTE(review): astype(str) also stringifies missing values
        # (NaN becomes "nan") — confirm downstream code expects that.
        frame["Batch"] = frame["Batch"].astype(str)
        message = "Please fill Batch info for blanks and QC."
        result.warn(message)
    else:
        frame["Batch"] = "1"
        message = ""
        result.good("No 'Batch' column found — defaulting all samples to batch '1'")

    return frame, message
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# ---------------------------------------------------------------------------
|
|
303
|
+
# Public entry point
|
|
304
|
+
# ---------------------------------------------------------------------------
|
|
305
|
+
|
|
306
|
+
def check_samples(
    samplesheet: pd.DataFrame,
    integration_file: str,
    software: str = "crommy",
) -> tuple[pd.DataFrame, str, list[str], list[str], list[str]]:
    """
    Validate and match samples between a samplesheet and an integration file.

    The integration file is read, its sample-related columns are extracted
    and classified against the ``"Sample name"`` column of *samplesheet*,
    and both sides are merged. The merged table is returned sorted in the
    column order of the integration file.

    Parameters
    ----------
    samplesheet:
        DataFrame containing sample metadata. Must include a
        ``"Sample name"`` column.
    integration_file:
        Path to the integration file (Excel for crommy, tab-separated text
        for msdial).
    software:
        Integration software: ``"crommy"`` (default) or ``"msdial"``.

    Returns
    -------
    merged_df:
        DataFrame combining the samplesheet with matched integration file
        entries, ordered to match the integration file.
    message:
        Informational message regarding batch handling (empty string when
        no action is needed).
    errors:
        Error messages collected during validation.
    warnings:
        Non-blocking warning messages collected during validation.
    ok:
        Passed-check messages collected during validation.

    Raises
    ------
    ValueError
        Raised by the helper steps when *software* is unsupported, column
        naming conventions are violated, or samples are missing from either
        side of the merge.
    """
    result = CheckResult()
    logger.info(
        "Starting sample check — software='%s', file='%s'",
        software, integration_file,
    )

    # Step 1: load the integration table and order its rows by index.
    data = _read_data(integration_file, software).sort_index()

    # Step 2: keep only the sample/QC/blank column labels.
    columns = _extract_sample_columns(data, software)

    # Steps 3-4: build the sample regex and classify every column with it.
    matched_df = _classify_columns(
        columns, _build_sample_pattern(samplesheet), result
    )

    # Step 5: merge with the samplesheet and check completeness.
    merged_df = _merge_and_validate(samplesheet, matched_df, columns, result)

    # Step 6: make sure a string "Batch" column exists.
    merged_df, message = _handle_batch_column(merged_df, result)

    # Step 7: restore the integration file's column order, then drop the
    # helper column that carried it.
    ordered_df = merged_df.sort_values("col_order").reset_index(drop=True)
    ordered_df = ordered_df.drop(columns="col_order")

    logger.info(
        "Sample check complete — %d error(s), %d warning(s), %d ok",
        len(result.errors), len(result.warnings), len(result.ok),
    )

    return ordered_df, message, result.errors, result.warnings, result.ok
|
|
390
|
+
|
|
391
|
+
if __name__ == "__main__":
    # Manual smoke run: only executes when this file is run as a script.
    # The import is resolved as a top-level module, so it presumably expects
    # the script to be launched from the directory that holds
    # parse_samplesheet.py — TODO confirm.
    from parse_samplesheet import parse_samplesheet
    # Read the samplesheet workbook without treating any row as a header.
    df = pd.read_excel("test_files/LC-MS_metabolomics_PLASMA.xlsx", header=None)
    # Only the first of the five returned values is used here.
    sample_df, _, _, _, _ = parse_samplesheet(df, )

    # Check the parsed samplesheet against an msdial export and show the result.
    merged_df, message, errors, warnings, ok = check_samples(sample_df, "test_files/HILICp_Areas.txt", software="msdial")
    print(merged_df)
|