parseet 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parseet/.DS_Store +0 -0
- parseet/__init__.py +2 -0
- parseet/app.py +63 -0
- parseet/backend.py +283 -0
- parseet/cli.py +177 -0
- parseet/config/__init__.py +6 -0
- parseet/config/config.toml +7 -0
- parseet/core/single_process.py +45 -0
- parseet/core/utils/__init__.py +5 -0
- parseet/core/utils/build_parser.py +33 -0
- parseet/core/utils/check_order.py +426 -0
- parseet/core/utils/lcms_check_samples.py +397 -0
- parseet/core/utils/parse_samplesheet.py +363 -0
- parseet/core/utils/setup_logger.py +24 -0
- parseet/main.qml +1057 -0
- parseet/version.py +1 -0
- parseet-0.2.0.dist-info/METADATA +56 -0
- parseet-0.2.0.dist-info/RECORD +20 -0
- parseet-0.2.0.dist-info/WHEEL +4 -0
- parseet-0.2.0.dist-info/entry_points.txt +6 -0
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import logging
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger(__name__)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# ---------------------------------------------------------------------------
|
|
9
|
+
# Result helpers
|
|
10
|
+
# ---------------------------------------------------------------------------
|
|
11
|
+
|
|
12
|
+
class ParseResult:
    """Collector for the outcome of a samplesheet parse.

    Keeps three message lists (errors, warnings, passed checks) and a
    mapping of per-column descriptions. Every recording method stores the
    message and then forwards it to the module logger at the matching level.
    """

    def __init__(self):
        # One independent list per message severity.
        self.errors: list[str] = list()
        self.warnings: list[str] = list()
        self.ok: list[str] = list()
        # Column name -> description dict.
        self.col_descriptions: dict[str, dict] = dict()

    def error(self, msg: str):
        """Store *msg* as an error and log it at ERROR level."""
        self.errors += [msg]
        logger.error(msg)

    def warn(self, msg: str):
        """Store *msg* as a warning and log it at WARNING level."""
        self.warnings += [msg]
        logger.warning(msg)

    def good(self, msg: str):
        """Store *msg* as a passed check and log it at INFO level."""
        self.ok += [msg]
        logger.info(msg)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
# Sub-functions
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
def _check_metadata(df_input: pd.DataFrame, result: ParseResult) -> None:
    """Validate the top-level metadata rows (rows 0-3, column B).

    For each expected label the cell in the matching row of the second
    column is read. A missing cell (NaN, or a row / column that does not
    exist in *df_input*) is recorded as an error; a present value is
    recorded as a passed check.

    Parameters
    ----------
    df_input:
        Raw samplesheet DataFrame.
    result:
        Collector that receives the error / ok messages.
    """
    meta_labels = ["Referent Name", "Referent Email", "Group", "Multi-round"]

    # Column B only exists when the frame has at least two columns; without
    # this guard ``iloc[i, 1]`` raises IndexError on a single-column sheet.
    has_value_column = df_input.shape[1] > 1
    n_rows = len(df_input)

    for i, label in enumerate(meta_labels):
        if has_value_column and i < n_rows:
            value = df_input.iloc[i, 1]
        else:
            value = None

        if pd.isna(value):
            result.error(f"Missing {label}")
        else:
            result.good(f"{label} found: {value}")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _find_sample_block(df_input: pd.DataFrame, result: ParseResult) -> pd.DataFrame:
    """
    Locate the header row that contains 'Sample name' and return the
    sub-DataFrame starting from that row (with the header promoted).

    The header row is the first row in which any cell, converted to text,
    contains "Sample name" (case-insensitive). Rows whose 'Ignore' cell is
    not empty are dropped when an 'Ignore' column exists.

    Parameters
    ----------
    df_input:
        Raw samplesheet DataFrame.
    result:
        Collector that receives the error / ok messages.

    Returns
    -------
    The sample table with the header row promoted to column names, a fresh
    0-based index and stripped, string-typed 'Sample name' values.

    Raises ValueError if the header row cannot be found, or if the header
    row has no column that can be identified as 'Sample name'.
    """
    # Track the row *position*, not the index label yielded by iterrows():
    # the slice below is positional (iloc), so labels would pick the wrong
    # row whenever df_input does not carry a default 0..n-1 index.
    header_pos = None
    for pos, (_, row) in enumerate(df_input.iterrows()):
        if row.astype(str).str.contains("Sample name", case=False).any():
            header_pos = pos
            break

    if header_pos is None:
        result.error("Header row not found")
        raise ValueError("No header row found in samplesheet")

    logger.info("Found header row at index %d", header_pos)

    # Promote that row to column names
    df = df_input.iloc[header_pos:].copy()
    df.columns = df.iloc[0]
    df = df.iloc[1:].reset_index(drop=True)

    # The detection above is case-insensitive, but the lookups below need the
    # exact name. Normalise a header that differs only by case or surrounding
    # whitespace; anything else is reported instead of raising a KeyError.
    if "Sample name" not in df.columns:
        candidates = [
            c for c in df.columns
            if isinstance(c, str) and c.strip().casefold() == "sample name"
        ]
        if len(candidates) != 1:
            result.error("'Sample name' column not found in header row")
            raise ValueError("No 'Sample name' column found in samplesheet")
        df = df.rename(columns={candidates[0]: "Sample name"})

    # Basic clean-up
    df["Sample name"] = df["Sample name"].astype(str).str.strip()
    if "Ignore" in df.columns:
        df = df[df["Ignore"].isna()].reset_index(drop=True)

    result.good(f"Sample block found: {len(df)} sample(s)")
    return df
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _check_sample_names(df_samples: pd.DataFrame, result: ParseResult) -> pd.DataFrame:
    """Sanitise the 'Sample name' column.

    A name is valid when it matches ``^[A-Za-z0-9\\-]+$``; null entries are
    not counted as invalid. When at least one invalid name exists, a warning
    is recorded and a copy of the frame is returned in which every character
    outside that set is replaced by '-'. Otherwise a passed check is recorded
    and the input frame is returned as is.
    """
    valid_name = re.compile(r'^[A-Za-z0-9\-]+$')

    def _is_invalid(name) -> bool:
        # Nulls are skipped rather than reported.
        if not pd.notnull(name):
            return False
        return valid_name.match(str(name)) is None

    n_bad = df_samples["Sample name"].apply(_is_invalid).sum()

    if n_bad <= 0:
        result.good("All sample names are valid")
        return df_samples

    result.warn(
        f"Found {n_bad} sample name(s) with disallowed characters — replacing with '-'."
    )
    cleaned = df_samples.copy()
    cleaned["Sample name"] = cleaned["Sample name"].apply(
        lambda name: re.sub(r'[^A-Za-z0-9\-]', '-', str(name))
    )
    return cleaned
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _describe_metadata_columns(
    df_samples: pd.DataFrame, result: ParseResult
) -> None:
    """
    Describe the metadata columns of the sample table.

    The inspected range starts at the 'Quantity' column (inclusive) and
    stops before the first column named 'Contrast-<N>', or at the last
    column when no such column exists. Each column in that range is handed
    to ``_describe_one_column``, which fills result.col_descriptions and
    records warnings. When 'Quantity' is absent a warning is recorded and
    nothing is inspected.
    """
    columns = df_samples.columns
    if "Quantity" not in columns:
        result.warn("'Quantity' column not found — skipping metadata column inspection")
        return

    start = columns.get_loc("Quantity")

    # Default to "until the end"; stop early at the first Contrast-N column.
    stop = len(columns)
    for name in columns:
        if re.fullmatch(r"Contrast-\d+", str(name)):
            stop = columns.get_loc(name)
            break

    selected = columns[start:stop]
    logger.info("Found %d metadata column(s)", len(selected))

    for name in selected:
        _describe_one_column(name, df_samples[name], result)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _describe_one_column(
    col: str, col_data: pd.Series, result: ParseResult
) -> None:
    """Build a description for a single metadata column and register warnings.

    A column whose dtype compares equal to "float64" or "int64" is described
    by its min / mean / max. Any other column is treated as categorical and
    described by its value counts, with missing values listed under the
    "__missing__" key. The description is stored in
    ``result.col_descriptions[col]``.

    Parameters
    ----------
    col:
        Column name, used in messages and as the description key.
    col_data:
        The column's values.
    result:
        Collector that receives warnings and the description.
    """
    col_type = col_data.dtype

    if col_type in ("float64", "int64"):
        description: dict = {
            "type": str(col_type),
            "max": col_data.max(),
            "mean": col_data.mean(),
            "min": col_data.min(),
        }
        if col_data.isna().any():
            result.warn(f"Missing values in numeric column '{col}'")
            description["has_missing"] = True

        logger.info(
            "Metadata '%s' [numeric]: min=%.2f  mean=%.2f  max=%.2f",
            col, description["min"], description["mean"], description["max"],
        )

    else:
        cat_counts = col_data.value_counts(dropna=False)
        categories = {
            (str(val) if pd.notna(val) else "__missing__"): int(cnt)
            for val, cnt in cat_counts.items()
        }
        description = {
            "type": "categorical",
            # Count distinct non-missing values only. Calling dropna() on the
            # value_counts result does not help here: the NaN sits in its
            # index, not in its (count) values, so the missing bucket would
            # be counted as a category.
            "n_categories": int(col_data.nunique(dropna=True)),
            "categories": categories,
        }

        # Missing values
        missing_count = col_data.isna().sum()
        if missing_count > 0:
            result.warn(f"Missing values in categorical column '{col}' ({missing_count} rows)")
            description["has_missing"] = True

        # Label-consistency check (flag rare case variants)
        label_issues = _find_label_inconsistencies(col, col_data)
        if label_issues:
            for issue in label_issues:
                result.warn(issue)
            description["label_warnings"] = label_issues

        logger.info(
            "Metadata '%s' [categorical]: %d category/ies — %s",
            col,
            description["n_categories"],
            ", ".join(f"{k}={v}" for k, v in categories.items()),
        )

    result.col_descriptions[col] = description
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _find_label_inconsistencies(col: str, col_data: pd.Series) -> list[str]:
|
|
187
|
+
"""
|
|
188
|
+
Return warning strings for labels that look like case variants of each
|
|
189
|
+
other but appear far less often than the dominant spelling.
|
|
190
|
+
"""
|
|
191
|
+
lowered_map: dict[str, list[str]] = {}
|
|
192
|
+
for label in col_data.dropna().unique():
|
|
193
|
+
lowered_map.setdefault(str(label).lower(), []).append(str(label))
|
|
194
|
+
|
|
195
|
+
issues = []
|
|
196
|
+
for variants in lowered_map.values():
|
|
197
|
+
if len(variants) < 2:
|
|
198
|
+
continue
|
|
199
|
+
counts = {v: int((col_data == v).sum()) for v in variants}
|
|
200
|
+
max_count = max(counts.values())
|
|
201
|
+
for label, count in counts.items():
|
|
202
|
+
if count < 0.2 * max_count:
|
|
203
|
+
issues.append(
|
|
204
|
+
f"Column '{col}': label '{label}' appears {count} time(s) "
|
|
205
|
+
f"vs similar variant(s) {variants} — possible typo or case mismatch"
|
|
206
|
+
)
|
|
207
|
+
return issues
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _validate_contrast_columns(df_samples: pd.DataFrame, result: ParseResult) -> None:
    """
    Check every column whose name starts with 'Contrast'.
    Valid names match Contrast-<N>; valid values are 1 or 2 (or NaN).

    A badly named column is recorded as an error and its values are not
    inspected. For a well-named column, either the invalid values are
    recorded as an error or the sizes of group 1 and group 2 are recorded
    as a passed check.
    """
    contrast_columns = [c for c in df_samples.columns if str(c).startswith("Contrast")]

    for col in contrast_columns:
        if not re.fullmatch(r"Contrast-\d+", str(col)):
            result.error(f"Invalid contrast column name: '{col}' (expected Contrast-<number>)")
            continue

        invalid_values = _collect_invalid_int_values(df_samples[col], allowed={1, 2})
        if invalid_values:
            result.error(f"Invalid value(s) in '{col}': {invalid_values}")
        else:
            # Count numerically so the tally agrees with the validation
            # above: a cell holding 1.0 passes validation but its text form
            # is "1.0", which a comparison against "1" would not count.
            numeric = pd.to_numeric(df_samples[col], errors="coerce")
            c1 = int((numeric == 1).sum())
            c2 = int((numeric == 2).sum())
            result.good(f"Contrast '{col}': group-1={c1}, group-2={c2}")
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _validate_timecourse_columns(df_samples: pd.DataFrame, result: ParseResult) -> None:
    """
    Check every column whose name starts with 'Timecourse'.
    Valid names match Timecourse-<N>; values must be non-negative integers.

    A badly named column is recorded as an error and its values are not
    inspected. For a well-named column, either the invalid values are
    recorded as an error or a "<time point>:<count>" summary, sorted by
    time point, is recorded as a passed check.
    """
    timecourse_columns = [c for c in df_samples.columns if str(c).startswith("Timecourse")]

    for col in timecourse_columns:
        if not re.fullmatch(r"Timecourse-\d+", str(col)):
            result.error(f"Invalid timecourse column name: '{col}' (expected Timecourse-<number>)")
            continue

        series = df_samples[col]
        numeric = pd.to_numeric(series, errors="coerce")

        invalid_values = _collect_invalid_int_values(series, allowed=None)
        # The integer check alone lets negative numbers through; enforce the
        # documented "non-negative" rule here. NaN compares False and is
        # therefore never selected.
        invalid_values |= set(series[numeric < 0].dropna().unique())

        if invalid_values:
            result.error(f"Invalid value(s) in '{col}': {invalid_values}")
        else:
            # Count on the numeric form so that equal time points stored with
            # different types (1, 1.0, "1") are merged and the index can be
            # sorted without comparing str to int.
            counts = numeric.dropna().astype(int).value_counts().sort_index()
            summary = ", ".join(f"{int(v)}:{c}" for v, c in counts.items())
            result.good(f"Timecourse '{col}': {summary}")
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _collect_invalid_int_values(series: pd.Series, allowed: set[int] | None) -> set:
|
|
253
|
+
"""
|
|
254
|
+
Return the set of values in *series* that cannot be cast to int, or
|
|
255
|
+
(when *allowed* is provided) that are not in the allowed set.
|
|
256
|
+
NaN values are ignored.
|
|
257
|
+
"""
|
|
258
|
+
invalid: set = set()
|
|
259
|
+
for v in series.dropna().unique():
|
|
260
|
+
try:
|
|
261
|
+
v_int = int(v)
|
|
262
|
+
if float(v) != v_int: # reject 1.5, etc.
|
|
263
|
+
raise ValueError
|
|
264
|
+
if allowed is not None and v_int not in allowed:
|
|
265
|
+
raise ValueError
|
|
266
|
+
except (ValueError, TypeError):
|
|
267
|
+
invalid.add(v)
|
|
268
|
+
return invalid
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _coerce_group_columns(df_samples: pd.DataFrame) -> pd.DataFrame:
|
|
272
|
+
"""Cast contrast and timecourse columns to nullable Int64, then to clean strings."""
|
|
273
|
+
group_cols = [
|
|
274
|
+
c for c in df_samples.columns
|
|
275
|
+
if re.fullmatch(r"(Contrast|Timecourse)-\d+", str(c))
|
|
276
|
+
]
|
|
277
|
+
if group_cols:
|
|
278
|
+
df_samples = df_samples.copy()
|
|
279
|
+
df_samples[group_cols] = (
|
|
280
|
+
df_samples[group_cols]
|
|
281
|
+
.apply(pd.to_numeric, errors="coerce")
|
|
282
|
+
.astype("Int64")
|
|
283
|
+
.astype(str)
|
|
284
|
+
.replace("<NA>", "")
|
|
285
|
+
)
|
|
286
|
+
return df_samples
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
# ---------------------------------------------------------------------------
|
|
290
|
+
# Public entry point
|
|
291
|
+
# ---------------------------------------------------------------------------
|
|
292
|
+
|
|
293
|
+
def parse_samplesheet(
    df_input: pd.DataFrame,
) -> tuple[pd.DataFrame, list[str], list[str], list[str], dict[str, dict]]:
    """
    Run the full samplesheet validation pipeline on a raw DataFrame.

    Steps, in order: check the top-level metadata rows, locate the sample
    block, sanitise sample names, describe metadata columns, validate
    contrast columns, validate timecourse columns, and finally convert the
    contrast / timecourse columns to clean strings.

    Parameters
    ----------
    df_input:
        Raw DataFrame as loaded from the spreadsheet (no header manipulation).

    Returns
    -------
    df_samples:
        Cleaned sample table.
    errors:
        List of blocking error messages.
    warnings:
        List of non-blocking warning messages.
    ok:
        List of passed-check messages.
    col_descriptions:
        Per-column summary statistics / category info.

    Raises
    ------
    ValueError
        Propagated from the sample-block lookup when no header row is found.
    """
    report = ParseResult()

    # Metadata header first, then carve out and clean the sample table.
    _check_metadata(df_input, report)
    samples = _find_sample_block(df_input, report)
    samples = _check_sample_names(samples, report)

    # These checks only read the table and write their findings to the report.
    for inspect in (
        _describe_metadata_columns,
        _validate_contrast_columns,
        _validate_timecourse_columns,
    ):
        inspect(samples, report)

    # Coerce group columns to their clean string representation.
    samples = _coerce_group_columns(samples)

    n_errors, n_warnings = len(report.errors), len(report.warnings)
    logger.info(
        "Parsing complete — %d error(s), %d warning(s)",
        n_errors, n_warnings,
    )

    return samples, report.errors, report.warnings, report.ok, report.col_descriptions
|
|
346
|
+
|
|
347
|
+
"""
|
|
348
|
+
if __name__ == "__main__":
|
|
349
|
+
|
|
350
|
+
for test_file in [
|
|
351
|
+
"test_files/LC-MS_metabolomics_PLASMA.xlsx",
|
|
352
|
+
"test_files/LC-MS_metabolomics_PLASMA_warnings.xlsx",
|
|
353
|
+
]:
|
|
354
|
+
df = pd.read_excel(test_file, header=None)
|
|
355
|
+
df_samples, errors, warnings, ok, col_desc = parse_samplesheet(df)
|
|
356
|
+
print(f"\nResults for {test_file}:")
|
|
357
|
+
print("Errors:", errors)
|
|
358
|
+
print("Warnings:", warnings)
|
|
359
|
+
print("OK:", ok)
|
|
360
|
+
print("Column descriptions:")
|
|
361
|
+
for col, desc in col_desc.items():
|
|
362
|
+
print(f" {col}: {desc}")
|
|
363
|
+
"""
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
def configure_logging(
    *,
    level=logging.INFO,
    log_file: Path | None = None,
):
    """Set up root logging through ``logging.basicConfig``.

    A stream (console) handler is always installed; a file handler writing
    to *log_file* is added when a path is given. Both arguments are
    keyword-only.

    Parameters
    ----------
    level:
        Logging level handed to ``basicConfig``.
    log_file:
        Optional path of a log file.
    """
    # The console handler is always present; the file handler is optional.
    handlers = [logging.StreamHandler()]
    if log_file is not None:
        handlers.append(logging.FileHandler(log_file))

    log_format = "%(asctime)s [%(levelname)s] %(name)s.%(funcName)s: %(message)s"
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
|