parseet 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,397 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ Integration File Sample Checker
4
+ ================================
5
+
6
+ Validates and aligns an integration Excel file against a reference
7
+ samplesheet.
8
+
9
+ It verifies that:
10
+ - All integration file columns correspond to valid sample names, QC, FM,
11
+ or Blank entries.
12
+ - Naming conventions are respected.
13
+ - All samples appear in both the samplesheet and the integration file.
14
+
15
+ The output is a merged samplesheet ordered to match the integration file
16
+ column order.
17
+ """
18
+
19
+ import logging
20
+ import re
21
+
22
+ import pandas as pd
23
+
24
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Special column types recognised in integration files
# ---------------------------------------------------------------------------
# Each pair is (substring looked for in a column label, type label assigned to
# the column when that substring is present). The two members are currently
# identical for every entry.
_SPECIAL_TYPES: tuple[tuple[str, str], ...] = tuple(
    (label, label) for label in ("QC", "FM", "Blank")
)
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Result helper
38
+ # ---------------------------------------------------------------------------
39
+
40
class CheckResult:
    """
    Collect validation messages in three buckets and mirror each to the log.

    Attributes
    ----------
    errors:
        Messages recorded through :meth:`error`.
    warnings:
        Messages recorded through :meth:`warn`.
    ok:
        Messages recorded through :meth:`good`.
    """

    # Declared for readers and type checkers only; the lists themselves are
    # created per instance in ``__init__`` so instances never share state.
    errors: list[str]
    warnings: list[str]
    ok: list[str]

    def __init__(self):
        self.errors = []
        self.warnings = []
        self.ok = []

    def error(self, msg: str) -> None:
        """Store *msg* in ``errors`` and log it at ERROR level."""
        self.errors.append(msg)
        logger.error(msg)

    def warn(self, msg: str) -> None:
        """Store *msg* in ``warnings`` and log it at WARNING level."""
        self.warnings.append(msg)
        logger.warning(msg)

    def good(self, msg: str) -> None:
        """Store *msg* in ``ok`` and log it at INFO level."""
        self.ok.append(msg)
        logger.info(msg)
59
+
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Sub-functions
63
+ # ---------------------------------------------------------------------------
64
+
65
def _read_data(integration_file: str, software: str) -> pd.DataFrame:
    """
    Load an integration file with the reader that matches *software*.

    Parameters
    ----------
    integration_file:
        Path to the integration file (Excel workbook for crommy,
        tab-separated text for msdial).
    software:
        ``"crommy"`` or ``"msdial"``.

    Returns
    -------
    pandas.DataFrame
        For crommy: sheet ``"Sheet1"`` with its first column as the index.
        For msdial: the table whose header is on row 4 (zero-based).

    Raises
    ------
    ValueError
        If *software* is neither ``"crommy"`` nor ``"msdial"``.
    """
    # Reject unknown software up front so nothing is read for a bad request.
    if software not in ("crommy", "msdial"):
        raise ValueError(f"Unsupported integration file type: '{software}'")

    if software == "msdial":
        logger.debug("Reading msdial file: %s", integration_file)
        return pd.read_table(integration_file, sep="\t", header=4)

    logger.debug("Reading crommy file: %s", integration_file)
    return pd.read_excel(integration_file, sheet_name="Sheet1", index_col=0)
92
+
93
+
94
def _extract_sample_columns(data: pd.DataFrame, software: str) -> pd.Index:
    """
    Slice out the intensity/peak columns that follow the sentinel column
    (``"Charge"`` for crommy, ``"MS/MS spectrum"`` for msdial).

    For msdial the slice stops just before the first column labelled ``"1"``
    found after the sentinel; for any other software it runs to the last
    column.

    Parameters
    ----------
    data:
        Raw integration DataFrame.
    software:
        ``"msdial"`` selects the msdial layout; any other value is treated
        as crommy.

    Returns
    -------
    pandas.Index
        Ordered column labels for the sample/QC/blank entries.

    Raises
    ------
    ValueError
        If the sentinel column is not present.
    KeyError
        If *software* is ``"msdial"`` and no ``"1"`` column follows the
        sentinel.
    """
    labels = list(data.columns)
    sentinel = "MS/MS spectrum" if software == "msdial" else "Charge"

    try:
        start_index = labels.index(sentinel)
    except ValueError:
        # Same exception type as the bare list.index failure, but the message
        # now says which column is missing and for which layout.
        raise ValueError(
            f"Sentinel column '{sentinel}' not found in the {software} "
            "integration file"
        ) from None

    if software == "msdial":
        try:
            # Only look after the sentinel: a "1" label located before it
            # cannot terminate the sample block.
            last_index = labels.index("1", start_index + 1)
        except ValueError:
            # KeyError is kept because that is what the previous
            # Index.get_loc lookup raised for a missing label.
            raise KeyError(
                f"End-marker column '1' not found after '{sentinel}' in the "
                "msdial integration file"
            ) from None
    else:
        last_index = len(labels)

    columns = data.columns[start_index + 1 : last_index]
    logger.debug("Extracted %d sample columns after '%s'", len(columns), sentinel)
    return columns
117
+
118
+
119
def _build_sample_pattern(samplesheet: pd.DataFrame) -> re.Pattern:
    """
    Compile a single regex that matches ``<order>_<sample>_<phase>_<polarity>``
    column names and captures the sample name in group 1.

    Missing sample names are ignored, remaining names are converted to
    ``str`` and de-duplicated (first occurrence wins) before being escaped
    and joined into the alternation. When no usable name is left, the
    returned pattern matches nothing.

    Parameters
    ----------
    samplesheet:
        Must contain a ``"Sample name"`` column.

    Returns
    -------
    re.Pattern
        Unanchored pattern; group 1 is the sample name, group 2 the phase
        (``C18`` or ``HILIC``) and group 3 the polarity (``np``, ``p`` or
        ``n``).
    """
    # NaN cells and non-string values (e.g. numeric IDs) would make
    # re.escape raise TypeError, so drop the former and stringify the latter.
    raw_names = samplesheet["Sample name"].dropna().astype(str)
    sample_names = list(dict.fromkeys(raw_names))

    if sample_names:
        samples_alternation = "|".join(re.escape(s) for s in sample_names)
    else:
        # An empty alternation would match an empty sample name; "(?!)" is a
        # negative lookahead that can never succeed.
        samples_alternation = r"(?!)"

    pattern = re.compile(rf"\d+_({samples_alternation})_(C18|HILIC)_(np|p|n)")
    logger.debug("Built sample-matching pattern for %d sample name(s)", len(sample_names))
    return pattern
138
+
139
+
140
def _classify_columns(
    columns: pd.Index,
    pattern: re.Pattern,
    result: "CheckResult",
) -> pd.DataFrame:
    """
    Classify each integration column as a sample, QC/FM/Blank, or unmatched.

    A label is a sample when *pattern* is found in it; otherwise it takes the
    type of the first ``_SPECIAL_TYPES`` key contained in the label, and is
    unmatched when none is. Labels are compared through their ``str`` form so
    that non-string labels are reported as naming violations instead of
    raising ``AttributeError``.

    Parameters
    ----------
    columns:
        Integration file column labels to classify.
    pattern:
        Compiled regex from :func:`_build_sample_pattern`.
    result:
        Receives one ``good`` message per recognised column and one ``error``
        message for the failure that is raised, if any.

    Returns
    -------
    pandas.DataFrame
        Columns: ``Sample name``, ``filename``, ``_type`` (present even when
        *columns* is empty).

    Raises
    ------
    ValueError
        If any column has the wrong number of underscores, or if any column
        cannot be matched to a sample or a known special type. The
        underscore check takes precedence when both problems occur.
    """
    matched_entries: list[dict] = []
    wrong_name_entries: list[str] = []
    unmatched_entries: list[str] = []

    for entry in columns:
        label = str(entry)

        # Naming problems are collected, not raised immediately, so a single
        # error can list every offending column.
        if label.count("_") != 3:
            wrong_name_entries.append(label)

        m = pattern.search(label)
        if m:
            matched_entries.append({
                "Sample name": m.group(1),
                "filename": entry,
                "_type": "sample",
            })
            result.good(f"Matched sample column: '{entry}' → '{m.group(1)}'")
            continue

        special_type = next(
            (t for key, t in _SPECIAL_TYPES if key in label), None
        )
        matched_entries.append({
            "Sample name": None,
            "filename": entry,
            "_type": special_type or "Unmatched",
        })
        if special_type:
            result.good(f"Recognised special column: '{entry}' → {special_type}")
        else:
            unmatched_entries.append(label)

    if wrong_name_entries:
        message = (
            "These columns have a wrong number of underscores: "
            + ", ".join(wrong_name_entries)
        )
        # Record before raising so the failure reaches the error log/bucket.
        result.error(message)
        raise ValueError(message)
    if unmatched_entries:
        message = (
            "These columns have no match in the samplesheet: "
            + ", ".join(unmatched_entries)
        )
        result.error(message)
        raise ValueError(message)

    # Explicit column list keeps the documented schema for an empty input.
    return pd.DataFrame(
        matched_entries, columns=["Sample name", "filename", "_type"]
    )
210
+
211
+
212
def _merge_and_validate(
    samplesheet: pd.DataFrame,
    matched_df: pd.DataFrame,
    columns: pd.Index,
    result: "CheckResult",
) -> pd.DataFrame:
    """
    Outer-merge the samplesheet with the classified columns and verify that
    every samplesheet entry has a corresponding integration column.

    Parameters
    ----------
    samplesheet:
        Original sample metadata DataFrame.
    matched_df:
        Output of :func:`_classify_columns`. It is copied, not modified.
    columns:
        Original ordered column index (used to assign ``col_order``).
    result:
        Receives an ``error`` message when validation fails and a ``good``
        message with the number of matched sample rows otherwise.

    Returns
    -------
    pandas.DataFrame
        Merged DataFrame with ``col_order``, ``polarity``, and
        ``sample_order`` columns added.

    Raises
    ------
    ValueError
        If any samplesheet entry has no matching integration column.
    """
    # Position of each label in the integration file, used later for sorting.
    col_order_map = {name: i for i, name in enumerate(columns)}
    matched_df = matched_df.copy()
    matched_df["col_order"] = matched_df["filename"].map(col_order_map)

    merged_df = pd.merge(samplesheet, matched_df, on="Sample name", how="outer")

    # Rows that came only from the samplesheet have no filename after the merge.
    missing_in_data = merged_df.loc[merged_df["filename"].isna(), "Sample name"]
    if not missing_in_data.empty:
        # str() keeps the report usable when sample names are not strings;
        # a bare join would raise TypeError and hide the real problem.
        message = (
            "These samplesheet entries have no match in the integration file: "
            + ", ".join(str(name) for name in missing_in_data)
        )
        result.error(message)
        raise ValueError(message)

    result.good(
        f"All {(merged_df['_type'] == 'sample').sum()} sample(s) matched successfully"
    )

    # Derive polarity and injection order from the filename
    # (first token = order, last token = polarity).
    split = merged_df["filename"].str.split("_")
    merged_df["polarity"] = split.str[-1].str.lower().astype("category")
    merged_df["sample_order"] = split.str[0].astype(int)

    return merged_df
267
+
268
+
269
def _format_batch_value(value: object) -> str:
    """Return *value* as text, writing integral floats without a ``.0`` suffix."""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _handle_batch_column(
    merged_df: pd.DataFrame,
    result: "CheckResult",
) -> tuple[pd.DataFrame, str]:
    """
    Ensure a ``"Batch"`` column is present and is of string type.

    When the column is absent, every row is assigned batch ``"1"``. When it
    is present, each value is converted to text; integral floats are written
    as integers (``1.0`` becomes ``"1"``) so they agree with the ``"1"``
    default, and missing values keep the ``"nan"`` text that a plain string
    cast produces.

    Parameters
    ----------
    merged_df:
        Merged sample DataFrame. It is copied, not modified.
    result:
        Receives a ``good`` message when the default is applied, otherwise a
        ``warn`` message asking for batch info.

    Returns
    -------
    tuple[pandas.DataFrame, str]
        The modified copy and an informational message string (empty when
        the default batch was applied).
    """
    merged_df = merged_df.copy()

    if "Batch" not in merged_df.columns:
        merged_df["Batch"] = "1"
        message = ""
        result.good("No 'Batch' column found — defaulting all samples to batch '1'")
        return merged_df, message

    # A Batch column holding integers plus missing values is stored as floats
    # by pandas (presumably the usual state after the outer merge, where
    # QC/blank rows have no samplesheet metadata); a plain astype(str) would
    # then label batch 1 as "1.0".
    merged_df["Batch"] = merged_df["Batch"].map(_format_batch_value).astype(str)
    message = "Please fill Batch info for blanks and QC."
    result.warn(message)

    return merged_df, message
300
+
301
+
302
+ # ---------------------------------------------------------------------------
303
+ # Public entry point
304
+ # ---------------------------------------------------------------------------
305
+
306
def check_samples(
    samplesheet: pd.DataFrame,
    integration_file: str,
    software: str = "crommy",
) -> tuple[pd.DataFrame, str, list[str], list[str], list[str]]:
    """
    Check an integration file against a samplesheet and merge the two.

    The integration file is read and sorted by index, its sample-related
    columns are extracted and classified against the ``"Sample name"``
    column of *samplesheet*, the classification is outer-merged with the
    samplesheet, a ``"Batch"`` column is ensured, and the merged rows are
    finally ordered like the integration file columns.

    Parameters
    ----------
    samplesheet:
        DataFrame containing sample metadata. Must include a
        ``"Sample name"`` column.
    integration_file:
        Path to the integration file.
    software:
        Integration software: ``"crommy"`` (default) or ``"msdial"``.

    Returns
    -------
    merged_df:
        Samplesheet combined with the matched integration file entries,
        ordered to match the integration file; the helper ``col_order``
        column is removed.
    message:
        Informational message regarding batch handling (empty string when
        no action is needed).
    errors:
        Error messages collected during validation.
    warnings:
        Warning messages collected during validation.
    ok:
        Passed-check messages collected during validation.

    Raises
    ------
    ValueError
        If *software* is unsupported, column naming conventions are
        violated, or samples are missing from either side of the merge.
    """
    report = CheckResult()
    logger.info(
        "Starting sample check — software='%s', file='%s'",
        software, integration_file,
    )

    # Step 1: load the integration table, sorted by its index.
    raw_table = _read_data(integration_file, software).sort_index()

    # Step 2: keep only the sample/QC/blank column labels.
    sample_columns = _extract_sample_columns(raw_table, software)

    # Steps 3-4: build the matcher from the samplesheet, then classify.
    name_pattern = _build_sample_pattern(samplesheet)
    classified = _classify_columns(sample_columns, name_pattern, report)

    # Step 5: merge with the samplesheet and check completeness.
    combined = _merge_and_validate(samplesheet, classified, sample_columns, report)

    # Step 6: make sure a textual Batch column exists.
    combined, message = _handle_batch_column(combined, report)

    # Step 7: restore integration-file order, then drop the helper column.
    ordered = combined.sort_values("col_order", ignore_index=True)
    ordered = ordered.drop(columns="col_order")

    logger.info(
        "Sample check complete — %d error(s), %d warning(s), %d ok",
        len(report.errors), len(report.warnings), len(report.ok),
    )

    return ordered, message, report.errors, report.warnings, report.ok
390
+
391
if __name__ == "__main__":
    # Manual smoke run against local test files; executed only when this
    # module is run as a script.
    from parse_samplesheet import parse_samplesheet

    raw_sheet = pd.read_excel(
        "test_files/LC-MS_metabolomics_PLASMA.xlsx",
        header=None,
    )
    # Only the first element of the five-item result is used here.
    sample_df, _, _, _, _ = parse_samplesheet(raw_sheet)

    merged_df, message, errors, warnings, ok = check_samples(
        sample_df,
        "test_files/HILICp_Areas.txt",
        software="msdial",
    )
    print(merged_df)