parseet 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,363 @@
1
+ import pandas as pd
2
+ import logging
3
+ import re
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+
8
+ # ---------------------------------------------------------------------------
9
+ # Result helpers
10
+ # ---------------------------------------------------------------------------
11
+
12
class ParseResult:
    """Collector for the outcome of a samplesheet parse.

    Four containers are filled while the checks run:

    * ``errors`` - messages recorded through :meth:`error`
    * ``warnings`` - messages recorded through :meth:`warn`
    * ``ok`` - messages recorded through :meth:`good`
    * ``col_descriptions`` - per-column description dicts, keyed by column name

    Every recorded message is also forwarded to the module logger at the
    matching level (error / warning / info).
    """

    def __init__(self) -> None:
        # Each instance gets its own fresh containers.
        self.col_descriptions: dict[str, dict] = {}
        self.ok: list[str] = []
        self.warnings: list[str] = []
        self.errors: list[str] = []

    def error(self, msg: str) -> None:
        """Store *msg* in ``errors`` and log it at ERROR level."""
        self.errors.append(msg)
        logger.error(msg)

    def warn(self, msg: str) -> None:
        """Store *msg* in ``warnings`` and log it at WARNING level."""
        self.warnings.append(msg)
        logger.warning(msg)

    def good(self, msg: str) -> None:
        """Store *msg* in ``ok`` and log it at INFO level."""
        self.ok.append(msg)
        logger.info(msg)
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Sub-functions
36
+ # ---------------------------------------------------------------------------
37
+
38
def _check_metadata(df_input: pd.DataFrame, result: ParseResult) -> None:
    """Validate the top-level metadata rows (rows 0-3, column B).

    The four expected labels are looked up positionally: label *i* is read
    from row *i*, second column.  A value is reported as missing when the
    sheet is too short or too narrow to contain that cell, when the cell is
    NaN/None, or when it holds only whitespace.

    Parameters
    ----------
    df_input:
        Raw sheet; only positions ``[0..3, 1]`` are read.
    result:
        Receives one error (missing) or one ok message (found) per label.
    """
    meta_labels = ["Referent Name", "Referent Email", "Group", "Multi-round"]
    n_rows, n_cols = df_input.shape

    for i, label in enumerate(meta_labels):
        # Guard both axes: indexing column 1 of a single-column sheet would
        # raise IndexError instead of producing a "Missing ..." error.
        value = df_input.iloc[i, 1] if (i < n_rows and n_cols > 1) else None

        is_blank_text = isinstance(value, str) and not value.strip()
        if pd.isna(value) or is_blank_text:
            result.error(f"Missing {label}")
        else:
            result.good(f"{label} found: {value}")
47
+
48
+
49
def _find_sample_block(df_input: pd.DataFrame, result: ParseResult) -> pd.DataFrame:
    """
    Locate the header row that contains 'Sample name' and return the
    sub-DataFrame below that row (with the header promoted to column names).

    The header cell is matched after stripping whitespace and ignoring case;
    whatever its original spelling, the matching column is always named
    exactly ``"Sample name"`` in the returned frame so later lookups succeed.

    Rows are located by position, so the function also works when
    ``df_input`` does not carry a default ``RangeIndex``.

    Rows whose 'Ignore' cell is non-empty are dropped when an 'Ignore'
    column exists.

    Raises ValueError if the header row cannot be found.
    """
    target = "sample name"
    header_pos = None
    name_col_pos = None

    # Walk rows positionally: iterrows() yields index *labels*, which are not
    # valid .iloc offsets unless the index happens to be 0..n-1.
    for pos in range(len(df_input)):
        cells = [str(cell).strip().casefold() for cell in df_input.iloc[pos].tolist()]
        if target in cells:
            header_pos = pos
            name_col_pos = cells.index(target)
            break

    if header_pos is None:
        result.error("Header row not found")
        raise ValueError("No header row found in samplesheet")

    logger.info("Found header row at index %d", header_pos)

    # Promote that row to column names, canonicalising the sample-name header
    # so that variants such as "sample name" or " Sample Name " do not cause
    # a KeyError below.
    header = df_input.iloc[header_pos].tolist()
    header[name_col_pos] = "Sample name"
    df = df_input.iloc[header_pos + 1:].reset_index(drop=True).copy()
    df.columns = header

    # Basic clean-up
    # NOTE(review): astype(str) turns empty sample-name cells into the text
    # "nan"; confirm whether fully empty rows should be dropped instead.
    df["Sample name"] = df["Sample name"].astype(str).str.strip()
    if "Ignore" in df.columns:
        df = df[df["Ignore"].isna()].reset_index(drop=True)

    result.good(f"Sample block found: {len(df)} sample(s)")
    return df
80
+
81
+
82
def _check_sample_names(df_samples: pd.DataFrame, result: ParseResult) -> pd.DataFrame:
    """Flag sample names with characters outside ``A-Z a-z 0-9 -`` and clean them.

    When at least one name is invalid a single warning is recorded and a copy
    of the frame is returned in which every disallowed character of every
    sample name is replaced by ``-``.  Otherwise an ok message is recorded
    and the input frame is returned unchanged.
    """
    valid_name = re.compile(r'^[A-Za-z0-9\-]+$')

    def _is_invalid(name) -> bool:
        # Null entries are never counted as invalid.
        if not pd.notnull(name):
            return False
        return not bool(valid_name.match(str(name)))

    n_invalid = df_samples["Sample name"].apply(_is_invalid).sum()

    if not n_invalid > 0:
        result.good("All sample names are valid")
        return df_samples

    result.warn(
        f"Found {n_invalid} sample name(s) with disallowed characters — replacing with '-'."
    )
    cleaned = df_samples.copy()
    cleaned["Sample name"] = cleaned["Sample name"].apply(
        lambda name: re.sub(r'[^A-Za-z0-9\-]', '-', str(name))
    )
    return cleaned
102
+
103
+
104
def _describe_metadata_columns(
    df_samples: pd.DataFrame, result: ParseResult
) -> None:
    """
    Describe the metadata columns of the sample table.

    The inspected range starts at the 'Quantity' column (inclusive) and stops
    just before the first column named 'Contrast-<N>', or at the last column
    when no such column exists.  Each column in that range is handed to
    ``_describe_one_column``, which fills ``result.col_descriptions`` and
    records warnings.  If 'Quantity' is absent a warning is recorded and
    nothing is inspected.
    """
    columns = df_samples.columns

    if "Quantity" not in columns:
        result.warn("'Quantity' column not found — skipping metadata column inspection")
        return

    start = columns.get_loc("Quantity")

    # Default to "all remaining columns"; narrow to the first contrast column
    # if one is present.
    stop = len(columns)
    for name in columns:
        if re.fullmatch(r"Contrast-\d+", str(name)):
            stop = columns.get_loc(name)
            break

    selected = columns[start:stop]
    logger.info("Found %d metadata column(s)", len(selected))

    for name in selected:
        _describe_one_column(name, df_samples[name], result)
127
+
128
+
129
def _describe_one_column(
    col: str, col_data: pd.Series, result: ParseResult
) -> None:
    """Build a description for a single metadata column and register warnings.

    The column is first passed through ``infer_objects()``: the sample block
    is cut out of a header-less sheet, so a column holding only numbers
    normally still has ``object`` dtype and would otherwise never be
    recognised as numeric.

    * ``float64`` / ``int64`` columns get ``type``, ``min``, ``mean`` and
      ``max``.
    * Every other column is described as ``"categorical"`` with the number of
      distinct non-missing labels (``n_categories``) and a ``categories``
      mapping of label -> row count, where missing values are listed under
      the key ``"__missing__"``.

    ``has_missing`` is added when the column contains missing values, and
    ``label_warnings`` when rare case-variants of a label are detected.  The
    finished description is stored in ``result.col_descriptions[col]``.
    """
    col_data = col_data.infer_objects()
    col_type = col_data.dtype

    if str(col_type) in ("float64", "int64"):
        description: dict = {
            "type": str(col_type),
            "max": col_data.max(),
            "mean": col_data.mean(),
            "min": col_data.min(),
        }
        if col_data.isna().any():
            result.warn(f"Missing values in numeric column '{col}'")
            description["has_missing"] = True

        logger.info(
            "Metadata '%s' [numeric]: min=%.2f mean=%.2f max=%.2f",
            col, description["min"], description["mean"], description["max"],
        )

    else:
        cat_counts = col_data.value_counts(dropna=False)
        categories = {
            (str(val) if pd.notna(val) else "__missing__"): int(cnt)
            for val, cnt in cat_counts.items()
        }
        description = {
            "type": "categorical",
            # Count distinct real labels only.  Calling dropna() on the
            # value_counts result would drop NaN *counts* (there are none),
            # leaving the missing-value bucket counted as a category.
            "n_categories": int(col_data.nunique(dropna=True)),
            "categories": categories,
        }

        # Missing values
        missing_count = col_data.isna().sum()
        if missing_count > 0:
            result.warn(f"Missing values in categorical column '{col}' ({missing_count} rows)")
            description["has_missing"] = True

        # Label-consistency check (flag rare case variants)
        label_issues = _find_label_inconsistencies(col, col_data)
        if label_issues:
            for issue in label_issues:
                result.warn(issue)
            description["label_warnings"] = label_issues

        logger.info(
            "Metadata '%s' [categorical]: %d category/ies — %s",
            col,
            description["n_categories"],
            ", ".join(f"{k}={v}" for k, v in categories.items()),
        )

    result.col_descriptions[col] = description
184
+
185
+
186
+ def _find_label_inconsistencies(col: str, col_data: pd.Series) -> list[str]:
187
+ """
188
+ Return warning strings for labels that look like case variants of each
189
+ other but appear far less often than the dominant spelling.
190
+ """
191
+ lowered_map: dict[str, list[str]] = {}
192
+ for label in col_data.dropna().unique():
193
+ lowered_map.setdefault(str(label).lower(), []).append(str(label))
194
+
195
+ issues = []
196
+ for variants in lowered_map.values():
197
+ if len(variants) < 2:
198
+ continue
199
+ counts = {v: int((col_data == v).sum()) for v in variants}
200
+ max_count = max(counts.values())
201
+ for label, count in counts.items():
202
+ if count < 0.2 * max_count:
203
+ issues.append(
204
+ f"Column '{col}': label '{label}' appears {count} time(s) "
205
+ f"vs similar variant(s) {variants} — possible typo or case mismatch"
206
+ )
207
+ return issues
208
+
209
+
210
def _validate_contrast_columns(df_samples: pd.DataFrame, result: ParseResult) -> None:
    """
    Check every column whose name starts with 'Contrast'.
    Valid names match Contrast-<N>; valid values are 1 or 2 (or NaN).

    For each column exactly one message is recorded: an error for a bad name,
    an error listing the invalid values, or an ok message with the number of
    rows in group 1 and group 2.
    """
    contrast_columns = [c for c in df_samples.columns if str(c).startswith("Contrast")]

    for col in contrast_columns:
        if not re.fullmatch(r"Contrast-\d+", str(col)):
            result.error(f"Invalid contrast column name: '{col}' (expected Contrast-<number>)")
            continue

        invalid_values = _collect_invalid_int_values(df_samples[col], allowed={1, 2})
        if invalid_values:
            result.error(f"Invalid value(s) in '{col}': {invalid_values}")
            continue

        # Count numerically rather than via the string form: a column with
        # gaps is stored as float, so its values stringify as "1.0"/"2.0" and
        # would never equal "1"/"2" even though they passed validation.
        numeric = pd.to_numeric(df_samples[col], errors="coerce")
        c1 = int((numeric == 1).sum())
        c2 = int((numeric == 2).sum())
        result.good(f"Contrast '{col}': group-1={c1}, group-2={c2}")
229
+
230
+
231
def _validate_timecourse_columns(df_samples: pd.DataFrame, result: ParseResult) -> None:
    """
    Check every column whose name starts with 'Timecourse'.
    Valid names match Timecourse-<N>; values must be non-negative integers.

    For each column exactly one message is recorded: an error for a bad name,
    an error listing the invalid values (non-integers and negatives), or an
    ok message summarising how many rows fall on each time point.
    """
    timecourse_columns = [c for c in df_samples.columns if str(c).startswith("Timecourse")]

    for col in timecourse_columns:
        if not re.fullmatch(r"Timecourse-\d+", str(col)):
            result.error(f"Invalid timecourse column name: '{col}' (expected Timecourse-<number>)")
            continue

        present = df_samples[col].dropna()
        invalid_values = _collect_invalid_int_values(df_samples[col], allowed=None)

        # The helper only checks that values are whole numbers; enforce the
        # "non-negative" half of the contract here.
        negatives = set()
        for value in present.unique():
            if value not in invalid_values and int(value) < 0:
                negatives.add(value)
        invalid_values = invalid_values | negatives

        if invalid_values:
            result.error(f"Invalid value(s) in '{col}': {invalid_values}")
            continue

        # Normalise to int before counting so that 1, 1.0 and "1" share one
        # bucket and sort_index() never has to compare str with int.
        counts = present.map(int).value_counts().sort_index()
        summary = ", ".join(f"{int(v)}:{c}" for v, c in counts.items())
        result.good(f"Timecourse '{col}': {summary}")
250
+
251
+
252
+ def _collect_invalid_int_values(series: pd.Series, allowed: set[int] | None) -> set:
253
+ """
254
+ Return the set of values in *series* that cannot be cast to int, or
255
+ (when *allowed* is provided) that are not in the allowed set.
256
+ NaN values are ignored.
257
+ """
258
+ invalid: set = set()
259
+ for v in series.dropna().unique():
260
+ try:
261
+ v_int = int(v)
262
+ if float(v) != v_int: # reject 1.5, etc.
263
+ raise ValueError
264
+ if allowed is not None and v_int not in allowed:
265
+ raise ValueError
266
+ except (ValueError, TypeError):
267
+ invalid.add(v)
268
+ return invalid
269
+
270
+
271
+ def _coerce_group_columns(df_samples: pd.DataFrame) -> pd.DataFrame:
272
+ """Cast contrast and timecourse columns to nullable Int64, then to clean strings."""
273
+ group_cols = [
274
+ c for c in df_samples.columns
275
+ if re.fullmatch(r"(Contrast|Timecourse)-\d+", str(c))
276
+ ]
277
+ if group_cols:
278
+ df_samples = df_samples.copy()
279
+ df_samples[group_cols] = (
280
+ df_samples[group_cols]
281
+ .apply(pd.to_numeric, errors="coerce")
282
+ .astype("Int64")
283
+ .astype(str)
284
+ .replace("<NA>", "")
285
+ )
286
+ return df_samples
287
+
288
+
289
+ # ---------------------------------------------------------------------------
290
+ # Public entry point
291
+ # ---------------------------------------------------------------------------
292
+
293
def parse_samplesheet(
    df_input: pd.DataFrame,
) -> tuple[pd.DataFrame, list[str], list[str], list[str], dict[str, dict]]:
    """
    Run the full samplesheet validation pipeline on a raw sheet.

    The steps run in a fixed order: top-level metadata check, sample block
    extraction, sample-name sanitising, metadata column description,
    contrast validation, timecourse validation, and finally conversion of
    the contrast/timecourse columns to clean strings.

    Parameters
    ----------
    df_input:
        Raw DataFrame as loaded from the spreadsheet (no header manipulation).

    Returns
    -------
    df_samples:
        Cleaned sample table.
    errors:
        List of blocking error messages.
    warnings:
        List of non-blocking warning messages.
    ok:
        List of passed-check messages.
    col_descriptions:
        Per-column summary statistics / category info.

    Raises
    ------
    ValueError
        Propagated from the sample-block lookup when no header row exists.
    """
    result = ParseResult()

    # Steps 1-3: metadata rows, then extract and sanitise the sample table.
    _check_metadata(df_input, result)
    df_samples = _check_sample_names(_find_sample_block(df_input, result), result)

    # Steps 4-6: read-only inspections that only record messages on `result`.
    inspections = (
        _describe_metadata_columns,
        _validate_contrast_columns,
        _validate_timecourse_columns,
    )
    for inspect in inspections:
        inspect(df_samples, result)

    # Step 7: group columns become clean string labels.
    df_samples = _coerce_group_columns(df_samples)

    n_errors = len(result.errors)
    n_warnings = len(result.warnings)
    logger.info("Parsing complete — %d error(s), %d warning(s)", n_errors, n_warnings)

    return (
        df_samples,
        result.errors,
        result.warnings,
        result.ok,
        result.col_descriptions,
    )
346
+
347
+ """
348
+ if __name__ == "__main__":
349
+
350
+ for test_file in [
351
+ "test_files/LC-MS_metabolomics_PLASMA.xlsx",
352
+ "test_files/LC-MS_metabolomics_PLASMA_warnings.xlsx",
353
+ ]:
354
+ df = pd.read_excel(test_file, header=None)
355
+ df_samples, errors, warnings, ok, col_desc = parse_samplesheet(df)
356
+ print(f"\nResults for {test_file}:")
357
+ print("Errors:", errors)
358
+ print("Warnings:", warnings)
359
+ print("OK:", ok)
360
+ print("Column descriptions:")
361
+ for col, desc in col_desc.items():
362
+ print(f" {col}: {desc}")
363
+ """
@@ -0,0 +1,24 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
def configure_logging(
    *,
    level=logging.INFO,
    log_file: Path | None = None,
) -> None:
    """Configure root logging with a console handler and an optional file handler.

    Parameters
    ----------
    level:
        Logging level passed to ``logging.basicConfig``.
    log_file:
        When given, log records are additionally written to this file.  The
        file is opened as UTF-8 so that messages containing non-ASCII
        characters are written identically regardless of the platform's
        default encoding.

    Notes
    -----
    ``logging.basicConfig`` leaves an already-configured root logger
    untouched, so calling this after handlers were installed elsewhere has no
    effect on the active handlers.
    """
    # Console handler
    handlers: list[logging.Handler] = [logging.StreamHandler()]

    # Optional file handler
    if log_file is not None:
        handlers.append(logging.FileHandler(log_file, encoding="utf-8"))

    logging.basicConfig(
        level=level,
        format="%(asctime)s [%(levelname)s] %(name)s.%(funcName)s: %(message)s",
        handlers=handlers,
    )