pywombat 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pywombat/__init__.py +2 -0
- pywombat/cli.py +935 -0
- pywombat-0.1.0.dist-info/METADATA +142 -0
- pywombat-0.1.0.dist-info/RECORD +6 -0
- pywombat-0.1.0.dist-info/WHEEL +4 -0
- pywombat-0.1.0.dist-info/entry_points.txt +2 -0
pywombat/__init__.py
ADDED
pywombat/cli.py
ADDED
|
@@ -0,0 +1,935 @@
|
|
|
1
|
+
"""CLI for wombat tool."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import warnings
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
import polars as pl
|
|
10
|
+
import yaml
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@click.command()
@click.argument("input_file", type=click.Path(exists=True, path_type=Path))
@click.option(
    "-o",
    "--output",
    type=str,
    help="Output file prefix. If not specified, prints to stdout.",
)
@click.option(
    "-f",
    "--format",
    "output_format",
    type=click.Choice(["tsv", "parquet"], case_sensitive=False),
    default="tsv",
    help="Output format: tsv (default) or parquet.",
)
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
@click.option(
    "-p",
    "--pedigree",
    type=click.Path(exists=True, path_type=Path),
    help="Pedigree file to add father and mother genotype columns.",
)
@click.option(
    "-F",
    "--filter-config",
    type=click.Path(exists=True, path_type=Path),
    help="Filter configuration YAML file to apply quality and impact filters.",
)
def cli(
    input_file: Path,
    output: Optional[str],
    output_format: str,
    verbose: bool,
    pedigree: Optional[Path],
    filter_config: Optional[Path],
):
    """
    Wombat: A tool for processing bcftools tabulated TSV files.

    This command:

    \b
    1. Expands the '(null)' column containing NAME=value pairs separated by ';'
    2. Preserves the CSQ (Consequence) column without melting
    3. Melts sample columns into rows with sample names
    4. Splits sample values (GT:DP:GQ:AD format) into separate columns:
       - sample_gt: Genotype
       - sample_dp: Read depth
       - sample_gq: Genotype quality
       - sample_ad: Allele depth (second value from comma-separated list)
       - sample_vaf: Variant allele frequency (sample_ad / sample_dp)

    \b
    Examples:
        wombat input.tsv -o output
        wombat input.tsv -o output -f parquet
        wombat input.tsv > output.tsv
    """
    try:
        if verbose:
            click.echo(f"Reading input file: {input_file}", err=True)

        # Read the TSV file
        df = pl.read_csv(input_file, separator="\t")

        if verbose:
            click.echo(
                f"Input shape: {df.shape[0]} rows, {df.shape[1]} columns", err=True
            )

        # Read pedigree file if provided
        pedigree_df = None
        if pedigree:
            if verbose:
                click.echo(f"Reading pedigree file: {pedigree}", err=True)
            pedigree_df = read_pedigree(pedigree)

        # Process the dataframe
        formatted_df = format_bcftools_tsv(df, pedigree_df)

        if verbose:
            click.echo(
                f"Output shape: {formatted_df.shape[0]} rows, {formatted_df.shape[1]} columns",
                err=True,
            )

        # Apply filters if provided
        filter_config_data = None
        if filter_config:
            if verbose:
                click.echo(f"Reading filter config: {filter_config}", err=True)
            filter_config_data = load_filter_config(filter_config)

        # Apply filters and write output
        if filter_config_data:
            apply_filters_and_write(
                formatted_df,
                filter_config_data,
                output,
                output_format,
                verbose,
            )
        else:
            # No filters - write single output file
            if output:
                # Construct output filename with prefix and format
                output_path = Path(f"{output}.{output_format}")

                if output_format == "tsv":
                    formatted_df.write_csv(output_path, separator="\t")
                elif output_format == "parquet":
                    formatted_df.write_parquet(output_path)

                click.echo(f"Formatted data written to {output_path}", err=True)
            else:
                # Write to stdout (only for TSV format)
                if output_format != "tsv":
                    click.echo(
                        "Error: stdout output only supported for TSV format. Use -o to specify an output prefix for parquet.",
                        err=True,
                    )
                    raise click.Abort()
                click.echo(formatted_df.write_csv(separator="\t"), nl=False)

    except click.Abort:
        # Deliberate aborts have already reported their reason; re-raise
        # unchanged instead of wrapping them in a second "Error:" message
        # (click.Abort is an Exception and would otherwise be caught below).
        raise
    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        raise click.Abort()
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def load_filter_config(config_path: Path) -> dict:
    """Parse the YAML filter configuration at *config_path* into a dict."""
    with config_path.open("r") as handle:
        return yaml.safe_load(handle)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _apply_parent_quality_filters(
    df: pl.DataFrame, quality_config: dict, parent: str
) -> pl.DataFrame:
    """Apply the sample-level quality thresholds to one parent's columns.

    Rows whose parent columns are null (no parent genotype was joined in)
    are always kept; a threshold only removes rows where the parent value
    is present and fails the cut.

    Args:
        df: DataFrame containing ``<parent>_gt/dp/gq/vaf`` columns.
        quality_config: Same threshold dict used for the sample columns.
        parent: Column prefix, "father" or "mother".

    Returns:
        The filtered DataFrame.
    """
    gt = f"{parent}_gt"
    dp = f"{parent}_dp"
    gq = f"{parent}_gq"
    vaf = f"{parent}_vaf"

    if "sample_dp_min" in quality_config:
        min_dp = quality_config["sample_dp_min"]
        df = df.filter(
            pl.col(dp).is_null()
            | (pl.col(dp).cast(pl.Float64, strict=False) >= min_dp)
        )

    if "sample_gq_min" in quality_config:
        min_gq = quality_config["sample_gq_min"]
        df = df.filter(
            pl.col(gq).is_null()
            | (pl.col(gq).cast(pl.Float64, strict=False) >= min_gq)
        )

    # Genotype classes: het has exactly one '1' and no '2'; hom alt/ref are
    # the literal 1/1 and 0/0 calls.
    is_het = (pl.col(gt).str.count_matches("1") == 1) & ~pl.col(gt).str.contains("2")
    is_hom_alt = pl.col(gt) == "1/1"
    is_hom_ref = pl.col(gt) == "0/0"

    if "sample_vaf_het_min" in quality_config:
        min_vaf_het = quality_config["sample_vaf_het_min"]
        df = df.filter(
            pl.col(vaf).is_null() | ~is_het | (pl.col(vaf) >= min_vaf_het)
        )

    if "sample_vaf_het_max" in quality_config:
        max_vaf_het = quality_config["sample_vaf_het_max"]
        df = df.filter(
            pl.col(vaf).is_null() | ~is_het | (pl.col(vaf) <= max_vaf_het)
        )

    if "sample_vaf_homalt_min" in quality_config:
        min_vaf_homalt = quality_config["sample_vaf_homalt_min"]
        df = df.filter(
            pl.col(vaf).is_null() | ~is_hom_alt | (pl.col(vaf) >= min_vaf_homalt)
        )

    if "sample_vaf_hom_ref_max" in quality_config:
        max_vaf_hom_ref = quality_config["sample_vaf_hom_ref_max"]
        df = df.filter(
            pl.col(vaf).is_null() | ~is_hom_ref | (pl.col(vaf) <= max_vaf_hom_ref)
        )

    return df


def apply_quality_filters(
    df: pl.DataFrame, quality_config: dict, verbose: bool = False
) -> pl.DataFrame:
    """Apply quality filters to the dataframe.

    Thresholds recognized in ``quality_config``: ``filter_no_alt_allele``
    (default True), ``sample_dp_min``, ``sample_gq_min``,
    ``sample_vaf_het_min/max``, ``sample_vaf_homalt_min``,
    ``sample_vaf_hom_ref_max`` and ``apply_to_parents``.

    Args:
        df: Melted per-sample variant table with sample_gt/dp/gq/vaf columns.
        quality_config: Threshold dict, or None to disable filtering.
        verbose: When True, report row counts before/after on stderr.

    Returns:
        The filtered DataFrame.
    """
    if quality_config is None:
        return df

    original_rows = df.shape[0]

    # Warn about (but keep) rows with rare genotypes containing allele '2'.
    rare_gts = df.filter(pl.col("sample_gt").str.contains("2"))
    if rare_gts.shape[0] > 0:
        warnings.warn(
            f"Found {rare_gts.shape[0]} rows with rare genotypes containing '2'. These will be kept."
        )

    # Filter: sample_gt must contain at least one alt allele (default: true)
    if quality_config.get("filter_no_alt_allele", True):
        df = df.filter(
            pl.col("sample_gt").str.contains("1")
            | pl.col("sample_gt").str.contains("2")
        )

    # Minimum read depth / genotype quality on the sample itself.
    if "sample_dp_min" in quality_config:
        min_dp = quality_config["sample_dp_min"]
        df = df.filter(pl.col("sample_dp").cast(pl.Float64, strict=False) >= min_dp)

    if "sample_gq_min" in quality_config:
        min_gq = quality_config["sample_gq_min"]
        df = df.filter(pl.col("sample_gq").cast(pl.Float64, strict=False) >= min_gq)

    # Genotype classes for the VAF filters:
    # Het: exactly one '1' and no '2' (0/1 or 1/0); HomAlt: 1/1; HomRef: 0/0.
    is_het = (pl.col("sample_gt").str.count_matches("1") == 1) & ~pl.col(
        "sample_gt"
    ).str.contains("2")
    is_hom_alt = pl.col("sample_gt") == "1/1"
    is_hom_ref = pl.col("sample_gt") == "0/0"

    if "sample_vaf_het_min" in quality_config:
        min_vaf_het = quality_config["sample_vaf_het_min"]
        df = df.filter(~is_het | (pl.col("sample_vaf") >= min_vaf_het))

    if "sample_vaf_het_max" in quality_config:
        max_vaf_het = quality_config["sample_vaf_het_max"]
        df = df.filter(~is_het | (pl.col("sample_vaf") <= max_vaf_het))

    if "sample_vaf_homalt_min" in quality_config:
        min_vaf_homalt = quality_config["sample_vaf_homalt_min"]
        df = df.filter(~is_hom_alt | (pl.col("sample_vaf") >= min_vaf_homalt))

    if "sample_vaf_hom_ref_max" in quality_config:
        max_vaf_hom_ref = quality_config["sample_vaf_hom_ref_max"]
        df = df.filter(~is_hom_ref | (pl.col("sample_vaf") <= max_vaf_hom_ref))

    # NOTE(review): parent filtering is gated on the *father* column being
    # present (matching the original behavior) — a mother-only pedigree
    # skips both parents. Confirm this gate is intentional.
    if quality_config.get("apply_to_parents", False) and "father" in df.columns:
        df = _apply_parent_quality_filters(df, quality_config, "father")
        df = _apply_parent_quality_filters(df, quality_config, "mother")

    if verbose:
        filtered_rows = df.shape[0]
        click.echo(
            f"Quality filters: {original_rows} -> {filtered_rows} rows ({original_rows - filtered_rows} filtered out)",
            err=True,
        )

    return df
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def parse_impact_filter_expression(expression: str, df: pl.DataFrame) -> pl.Expr:
    """Parse a filter expression string into a Polars expression.

    Supported syntax: comparisons (=, !=, <=, >=, <, >) on column names,
    combined with '&', '|' and parentheses. A right-hand side that parses
    as a float is compared numerically (column cast to Float64, non-castable
    values become null); otherwise a case-insensitive string comparison is
    used, where only '=' and '!=' are allowed.

    NOTE(review): '&' and '|' hand the *remainder* of the token stream to a
    recursive call, so mixed operators associate to the right — e.g.
    'a & b | c' evaluates as 'a & (b | c)'. Confirm this matches the
    intended precedence before writing mixed expressions.

    Args:
        expression: Filter expression, e.g. "IMPACT=HIGH & gnomAD_AF<0.01".
        df: DataFrame whose columns the expression refers to (used only to
            validate column names at parse time).

    Returns:
        A Polars boolean expression suitable for ``DataFrame.filter``.

    Raises:
        ValueError: If a column is unknown, an operator is unsupported for
            string comparison, or a condition cannot be parsed.
    """
    # Replace operators with Polars equivalents
    # Support: =, !=, <=, >=, <, >, &, |, ()

    expr_str = expression.strip()

    # Split by logical operators while preserving them
    tokens = re.split(r"(\s*[&|]\s*|\(|\))", expr_str)
    tokens = [t.strip() for t in tokens if t.strip()]

    def parse_condition(condition: str) -> pl.Expr:
        """Parse a single condition into a Polars expression."""
        condition = condition.strip()

        # Try different operators in order of specificity
        # (two-character operators first so "<=" is not matched as "<").
        for op in ["<=", ">=", "!=", "=", "<", ">"]:
            if op in condition:
                parts = condition.split(op, 1)
                if len(parts) == 2:
                    col_name = parts[0].strip()
                    value = parts[1].strip()

                    # Check if column exists
                    if col_name not in df.columns:
                        raise ValueError(f"Column '{col_name}' not found in dataframe")

                    # Try to convert value to number, otherwise treat as string
                    try:
                        value_num = float(value)
                        col_expr = pl.col(col_name).cast(pl.Float64, strict=False)

                        if op == "=":
                            return col_expr == value_num
                        elif op == "!=":
                            return col_expr != value_num
                        elif op == "<=":
                            return col_expr <= value_num
                        elif op == ">=":
                            return col_expr >= value_num
                        elif op == "<":
                            return col_expr < value_num
                        elif op == ">":
                            return col_expr > value_num
                    except ValueError:
                        # String comparison (case-insensitive)
                        value = value.strip("'\"")
                        col_expr = pl.col(col_name).str.to_lowercase()
                        value_lower = value.lower()

                        if op == "=":
                            return col_expr == value_lower
                        elif op == "!=":
                            return col_expr != value_lower
                        else:
                            # Ordering operators are meaningless for strings
                            # here; surface the mistake instead of guessing.
                            raise ValueError(
                                f"Operator '{op}' not supported for string comparison"
                            )
                break

        raise ValueError(f"Could not parse condition: {condition}")

    def build_expression(tokens: list, idx: int = 0) -> tuple[pl.Expr, int]:
        """Recursively build expression from tokens."""
        if idx >= len(tokens):
            return None, idx

        result = None
        i = idx

        while i < len(tokens):
            token = tokens[i]

            if token == "(":
                # Parse sub-expression; the recursive call returns the index
                # just past the matching ")".
                sub_expr, new_i = build_expression(tokens, i + 1)
                if result is None:
                    result = sub_expr
                i = new_i
            elif token == ")":
                # End of sub-expression
                return result, i + 1
            elif token == "&":
                # AND operator — consumes the rest of the stream (right-assoc).
                next_expr, new_i = build_expression(tokens, i + 1)
                if next_expr is not None:
                    result = result & next_expr if result is not None else next_expr
                return result, new_i
            elif token == "|":
                # OR operator — consumes the rest of the stream (right-assoc).
                next_expr, new_i = build_expression(tokens, i + 1)
                if next_expr is not None:
                    result = result | next_expr if result is not None else next_expr
                return result, new_i
            else:
                # It's a condition
                cond_expr = parse_condition(token)
                if result is None:
                    result = cond_expr
                else:
                    # If we have a result and encounter another condition without an operator,
                    # assume AND
                    result = result & cond_expr

                i += 1

        return result, i

    expr, _ = build_expression(tokens)
    return expr
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def apply_impact_filters(
    df: pl.DataFrame,
    impact_config: list,
    output_prefix: str,
    output_format: str,
    verbose: bool,
):
    """Apply impact filters and create separate output files.

    Each entry in ``impact_config`` is a dict with ``name``, ``expression``
    and optional ``priority`` (lower = higher priority, default 999). One
    output file is written per filter, named
    ``<output_prefix>_<name>.<output_format>``. Each file gains a
    ``flag_higher_impact`` column listing the higher-priority filters
    (comma-separated) that also matched the same variant, or "" if none.

    Args:
        df: Quality-filtered variant table (must contain #CHROM/POS/REF/ALT).
        impact_config: List of impact filter definitions from the YAML config.
        output_prefix: Prefix for the per-filter output files.
        output_format: "tsv" or "parquet".
        verbose: When True, report per-filter counts on stderr.

    Raises:
        Exception: Re-raised from expression parsing/filtering after being
            reported on stderr.
    """
    if not impact_config:
        return

    # Sort impact filters by priority (lower number = higher priority)
    impact_filters = sorted(impact_config, key=lambda x: x.get("priority", 999))

    # Filtered dataframe + priority, keyed by filter name (insertion order
    # follows the priority sort above).
    impact_variants = {}

    for impact_filter in impact_filters:
        name = impact_filter["name"]
        priority = impact_filter.get("priority", 999)
        expression = impact_filter["expression"]

        if verbose:
            click.echo(
                f"Applying impact filter '{name}' (priority {priority})...", err=True
            )

        try:
            filter_expr = parse_impact_filter_expression(expression, df)
            filtered_df = df.filter(filter_expr)

            if verbose:
                click.echo(
                    f"  Impact filter '{name}': {filtered_df.shape[0]} variants",
                    err=True,
                )

            impact_variants[name] = {
                "df": filtered_df,
                "priority": priority,
            }
        except Exception as e:
            click.echo(
                f"Error applying impact filter '{name}': {e}",
                err=True,
            )
            raise

    # Precompute, for every filter, the set of variant keys it contains so
    # the flagging pass below is a dict lookup per row instead of a full
    # DataFrame scan per row (the naive approach is quadratic).
    keys_by_filter = {
        name: {
            tuple(row)
            for row in data["df"].select(["#CHROM", "POS", "REF", "ALT"]).iter_rows()
        }
        for name, data in impact_variants.items()
    }

    # Add flag_higher_impact column to each filtered dataframe and write it.
    for name, data in impact_variants.items():
        filtered_df = data["df"]
        priority = data["priority"]

        # Filters strictly higher priority than this one, in priority order.
        higher_priority_filters = [
            other_name
            for other_name, other_data in impact_variants.items()
            if other_data["priority"] < priority
        ]

        if higher_priority_filters:

            def flag_for_row(row, _filters=tuple(higher_priority_filters)):
                """Comma-join the higher-priority filters containing this variant."""
                key = (row["#CHROM"], row["POS"], row["REF"], row["ALT"])
                hits = [f for f in _filters if key in keys_by_filter[f]]
                return ", ".join(hits)

            filtered_df = filtered_df.with_columns(
                [
                    pl.struct(["#CHROM", "POS", "REF", "ALT"])
                    .map_elements(flag_for_row, return_dtype=pl.Utf8)
                    .alias("flag_higher_impact")
                ]
            )
        else:
            # No higher priority filters
            filtered_df = filtered_df.with_columns(
                [pl.lit("").alias("flag_higher_impact")]
            )

        # Write to file
        output_path = Path(f"{output_prefix}_{name}.{output_format}")

        if output_format == "tsv":
            filtered_df.write_csv(output_path, separator="\t")
        elif output_format == "parquet":
            filtered_df.write_parquet(output_path)

        click.echo(
            f"Written {filtered_df.shape[0]} variants to {output_path}", err=True
        )
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def apply_filters_and_write(
    df: pl.DataFrame,
    filter_config: dict,
    output_prefix: Optional[str],
    output_format: str,
    verbose: bool,
):
    """Run the configured filters over *df* and write the results.

    Quality filters are applied first. With impact filters configured, one
    file per impact filter is produced (an output prefix is then required);
    otherwise a single file is written, or TSV is streamed to stdout when
    no prefix was given.
    """
    filtered = apply_quality_filters(df, filter_config.get("quality", {}), verbose)
    impact_filters = filter_config.get("impact", [])

    if impact_filters:
        # Fan-out mode: one file per impact filter, so a prefix is mandatory.
        if not output_prefix:
            click.echo(
                "Error: Output prefix required when using impact filters.",
                err=True,
            )
            raise click.Abort()
        apply_impact_filters(
            filtered, impact_filters, output_prefix, output_format, verbose
        )
        return

    if output_prefix:
        # Single-file mode.
        destination = Path(f"{output_prefix}.{output_format}")
        if output_format == "tsv":
            filtered.write_csv(destination, separator="\t")
        elif output_format == "parquet":
            filtered.write_parquet(destination)
        click.echo(f"Formatted data written to {destination}", err=True)
        return

    # Stdout mode: binary formats cannot go to the terminal.
    if output_format != "tsv":
        click.echo(
            "Error: stdout output only supported for TSV format.",
            err=True,
        )
        raise click.Abort()
    click.echo(filtered.write_csv(separator="\t"), nl=False)
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def read_pedigree(pedigree_path: Path) -> pl.DataFrame:
    """
    Read a pedigree file and return a DataFrame with sample relationships.

    Accepts PLINK-style files with or without a header; parent IDs of
    "0" or "-9" are normalized to null (no parent).

    Args:
        pedigree_path: Path to the pedigree file

    Returns:
        DataFrame with columns: sample_id, father_id, mother_id

    Raises:
        ValueError: If the required columns cannot be identified.
    """
    # Try reading with header first
    df = pl.read_csv(pedigree_path, separator="\t")

    # Check if first row has 'FID' in first column (indicates header)
    if df.columns[0] == "FID" or "sample_id" in df.columns:
        # Has header - use it as-is
        pass
    else:
        # No header - the first read consumed the first data row as a
        # header, so re-read with has_header=False to recover that row,
        # assuming standard pedigree format:
        # FID, sample_id, father_id, mother_id, sex, phenotype
        df = pl.read_csv(
            pedigree_path,
            separator="\t",
            has_header=False,
            new_columns=[
                "FID",
                "sample_id",
                "father_id",
                "mother_id",
                "sex",
                "phenotype",
            ],
        )

    # Ensure we have the required columns (try different possible names)
    if "sample_id" not in df.columns and len(df.columns) >= 4:
        # Try to identify columns by position
        df = df.rename(
            {
                df.columns[1]: "sample_id",
                df.columns[2]: "father_id",
                df.columns[3]: "mother_id",
            }
        )

    # Handle different column names for father/mother
    if "FatherBarcode" in df.columns:
        df = df.rename({"FatherBarcode": "father_id", "MotherBarcode": "mother_id"})

    # Fail with a clear message rather than an opaque polars column error.
    missing = [
        col
        for col in ("sample_id", "father_id", "mother_id")
        if col not in df.columns
    ]
    if missing:
        raise ValueError(
            f"Pedigree file is missing required columns: {', '.join(missing)}"
        )

    # Select only the columns we need
    pedigree_df = df.select(["sample_id", "father_id", "mother_id"])

    # Replace 0 and -9 with null (indicating no parent)
    pedigree_df = pedigree_df.with_columns(
        [
            pl.when(pl.col("father_id").cast(pl.Utf8).is_in(["0", "-9"]))
            .then(None)
            .otherwise(pl.col("father_id"))
            .alias("father_id"),
            pl.when(pl.col("mother_id").cast(pl.Utf8).is_in(["0", "-9"]))
            .then(None)
            .otherwise(pl.col("mother_id"))
            .alias("mother_id"),
        ]
    )

    return pedigree_df
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
def add_parent_genotypes(df: pl.DataFrame, pedigree_df: pl.DataFrame) -> pl.DataFrame:
    """
    Add father and mother genotype columns to the DataFrame.

    Args:
        df: DataFrame with sample genotype information
        pedigree_df: DataFrame with parent relationships

    Returns:
        DataFrame with added parent genotype columns
        (father/mother gt, dp, gq, ad, vaf; plus father/mother ID columns)
    """
    # Join with pedigree to get father and mother IDs for each sample
    df = df.join(pedigree_df, left_on="sample", right_on="sample_id", how="left")

    # Join parent genotypes on genomic position only, not on annotation
    # columns — this matches parents even if their VEP annotations differ.
    core_variant_cols = ["#CHROM", "POS", "REF", "ALT"]
    join_cols = [col for col in core_variant_cols if col in df.columns]

    # Self-join-friendly lookup: position columns + per-sample genotype info.
    parent_lookup = df.select(
        join_cols
        + [
            pl.col("sample"),
            pl.col("sample_gt"),
            pl.col("sample_dp"),
            pl.col("sample_gq"),
            pl.col("sample_ad"),
            pl.col("sample_vaf"),
        ]
    ).unique()

    def _parent_lookup(parent: str) -> pl.DataFrame:
        """Rename sample_* columns to <parent>_* for the parent self-join."""
        renames = {"sample": f"{parent}_id"}
        renames.update(
            {f"sample_{field}": f"{parent}_{field}"
             for field in ("gt", "dp", "gq", "ad", "vaf")}
        )
        return parent_lookup.rename(renames)

    # Attach each parent's genotype by matching position AND parent ID ==
    # that parent's own sample row.
    df = df.join(_parent_lookup("father"), on=join_cols + ["father_id"], how="left")
    df = df.join(_parent_lookup("mother"), on=join_cols + ["mother_id"], how="left")

    # Rename father_id and mother_id to father and mother for debugging
    df = df.rename({"father_id": "father", "mother_id": "mother"})

    # Replace '.' placeholders with '0' for parent DP and GQ columns so
    # later numeric casts treat "missing" as zero coverage/quality.
    df = df.with_columns(
        [
            pl.when(pl.col(col) == ".")
            .then(pl.lit("0"))
            .otherwise(pl.col(col))
            .alias(col)
            for col in ("father_dp", "father_gq", "mother_dp", "mother_gq")
        ]
    )

    return df
|
|
771
|
+
|
|
772
|
+
|
|
773
|
+
def format_bcftools_tsv(
    df: pl.DataFrame, pedigree_df: Optional[pl.DataFrame] = None
) -> pl.DataFrame:
    """
    Format a bcftools tabulated TSV DataFrame.

    Processing steps:
      1. Expand the ``(null)`` INFO column (``NAME=value`` pairs separated
         by ``;``) into one column per field.
      2. Drop the raw ``(null)`` column and any ``CSQ`` column.
      3. Melt per-sample columns (header form ``sample_name:...``) into
         long format with a ``sample`` column.
      4. Split each sample value (``GT:DP:GQ:AD``) into ``sample_gt``,
         ``sample_dp``, ``sample_gq``, ``sample_ad`` and compute
         ``sample_vaf = sample_ad / sample_dp``.
      5. Optionally join parent genotypes from the pedigree.

    Args:
        df: Input DataFrame from bcftools
        pedigree_df: Optional pedigree DataFrame with parent information

    Returns:
        Formatted DataFrame with expanded fields and melted samples

    Raises:
        ValueError: If the ``(null)`` column is missing from the input.
    """
    if "(null)" not in df.columns:
        raise ValueError("Column '(null)' not found in the input file")

    # Columns after (null) are the per-sample genotype columns.
    null_col_idx = df.columns.index("(null)")
    cols_after = df.columns[null_col_idx + 1 :]

    # Step 1: collect every field name appearing in the (null) column so we
    # know which columns to create.
    null_values = df.select("(null)").to_series()
    all_fields = set()
    for value in null_values:
        # Skip null/NaN entries (polars yields None for nulls; NaN is float).
        if value and not isinstance(value, float):
            for pair in str(value).split(";"):
                if "=" in pair:
                    all_fields.add(pair.split("=", 1)[0])

    # Extract each field's value. The pattern is anchored to the string
    # start or a preceding ';' and the field name is regex-escaped, so
    # extracting "DP" no longer matches inside "RDP=..." and field names
    # containing regex metacharacters (e.g. '.') are matched literally.
    for field in sorted(all_fields):
        df = df.with_columns(
            pl.col("(null)")
            .str.extract(rf"(?:^|;){re.escape(field)}=([^;]+)")
            .alias(field)
        )

    # The raw INFO column has been fully expanded; drop it.
    df = df.drop("(null)")

    # Drop the CSQ (VEP annotation) column extracted above — it is handled
    # separately and must not be melted with the sample columns.
    if "CSQ" in df.columns:
        df = df.drop("CSQ")

    # Step 2: identify sample columns; headers look like "sample_name:...".
    sample_cols = []
    sample_names = []
    for col in cols_after:
        if col == "CSQ":
            continue
        if ":" in col:
            sample_cols.append(col)
            sample_names.append(col.split(":", 1)[0])
        else:
            # No colon: the whole column name is the sample name.
            sample_cols.append(col)
            sample_names.append(col)

    if not sample_cols:
        # No sample columns to melt; return the expanded data as-is.
        return df

    # Step 3: melt sample columns into long format; everything else is kept
    # as identifier columns.
    id_vars = [col for col in df.columns if col not in sample_cols]
    df = df.rename(dict(zip(sample_cols, sample_names)))
    melted_df = df.melt(
        id_vars=id_vars,
        value_vars=sample_names,
        variable_name="sample",
        value_name="sample_value",
    )

    # Step 4: split "GT:DP:GQ:AD" into separate columns. null_on_oob makes
    # missing trailing fields come out as null instead of raising.
    fields = pl.col("sample_value").str.split(":")
    melted_df = melted_df.with_columns(
        [
            fields.list.get(0, null_on_oob=True).alias("sample_gt"),
            fields.list.get(1, null_on_oob=True).alias("sample_dp"),
            fields.list.get(2, null_on_oob=True).alias("sample_gq"),
            # AD is comma-separated ref,alt depths; keep the alt depth.
            fields.list.get(3, null_on_oob=True)
            .str.split(",")
            .list.get(1, null_on_oob=True)
            .alias("sample_ad"),
        ]
    )

    # Missing DP/GQ are encoded as '.'; normalise them to "0".
    melted_df = melted_df.with_columns(
        [
            pl.when(pl.col("sample_dp") == ".")
            .then(pl.lit("0"))
            .otherwise(pl.col("sample_dp"))
            .alias("sample_dp"),
            pl.when(pl.col("sample_gq") == ".")
            .then(pl.lit("0"))
            .otherwise(pl.col("sample_gq"))
            .alias("sample_gq"),
        ]
    )

    # Step 5: VAF = alt allele depth / total depth. strict=False turns
    # non-numeric values into null.
    # NOTE(review): a zero depth produces a non-finite/null VAF here —
    # confirm downstream consumers handle that case.
    melted_df = melted_df.with_columns(
        [
            (
                pl.col("sample_ad").cast(pl.Float64, strict=False)
                / pl.col("sample_dp").cast(pl.Float64, strict=False)
            ).alias("sample_vaf")
        ]
    )

    # The combined per-sample string is no longer needed.
    melted_df = melted_df.drop("sample_value")

    # Step 6: attach father/mother genotype columns when a pedigree is given.
    if pedigree_df is not None:
        melted_df = add_parent_genotypes(melted_df, pedigree_df)

    return melted_df
|
932
|
+
|
|
933
|
+
|
|
934
|
+
# Run the click CLI when this module is executed directly.
if __name__ == "__main__":
    cli()
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pywombat
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
|
|
5
|
+
Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
|
|
6
|
+
Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
|
|
7
|
+
Project-URL: Issues, https://github.com/bourgeron-lab/pywombat/issues
|
|
8
|
+
Author-email: Freddy Cliquet <fcliquet@pasteur.fr>
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: bioinformatics,genomics,pedigree,variant-calling,vcf
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Requires-Dist: click>=8.1.0
|
|
18
|
+
Requires-Dist: polars>=0.19.0
|
|
19
|
+
Requires-Dist: pyyaml>=6.0
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# PyWombat
|
|
23
|
+
|
|
24
|
+
A CLI tool for processing bcftools tabulated TSV files.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
This project is managed with [uv](https://docs.astral.sh/uv/). To install the package and its dependencies:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv sync
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
The `wombat` command processes bcftools tabulated TSV files:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# Format a bcftools TSV file and print to stdout
|
|
40
|
+
wombat input.tsv
|
|
41
|
+
|
|
42
|
+
# Format and save to output file (creates output.tsv by default)
|
|
43
|
+
wombat input.tsv -o output
|
|
44
|
+
|
|
45
|
+
# Format and save as parquet
|
|
46
|
+
wombat input.tsv -o output -f parquet
|
|
47
|
+
wombat input.tsv -o output --format parquet
|
|
48
|
+
|
|
49
|
+
# Format with pedigree information to add parent genotypes
|
|
50
|
+
wombat input.tsv --pedigree pedigree.tsv -o output
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### What does `wombat` do?
|
|
54
|
+
|
|
55
|
+
The `wombat` command processes bcftools tabulated TSV files by:
|
|
56
|
+
|
|
57
|
+
1. **Expanding the `(null)` column**: This column contains multiple fields in the format `NAME=value` separated by semicolons (e.g., `DP=30;AF=0.5;AC=2`). Each field is extracted into its own column.
|
|
58
|
+
|
|
59
|
+
2. **Handling the `CSQ` column**: The CSQ (Consequence) column extracted from the `(null)` field is dropped before the sample columns are melted, so VEP annotation strings do not interfere with the long-format output.
|
|
60
|
+
|
|
61
|
+
3. **Melting and splitting sample columns**: After the `(null)` column, there are typically sample columns with values in `GT:DP:GQ:AD` format. The tool:
|
|
62
|
+
- Extracts the sample name (the part before the first `:` character)
|
|
63
|
+
- Transforms the wide format into long format
|
|
64
|
+
- Creates a `sample` column with the sample names
|
|
65
|
+
- Splits the sample values into separate columns:
|
|
66
|
+
- `sample_gt`: Genotype (e.g., 0/1, 1/1)
|
|
67
|
+
- `sample_dp`: Read depth
|
|
68
|
+
- `sample_gq`: Genotype quality
|
|
69
|
+
- `sample_ad`: Allele depth (takes the second value from comma-separated list)
|
|
70
|
+
- `sample_vaf`: Variant allele frequency (calculated as sample_ad / sample_dp)
|
|
71
|
+
|
|
72
|
+
### Example
|
|
73
|
+
|
|
74
|
+
**Input:**
|
|
75
|
+
|
|
76
|
+
```tsv
|
|
77
|
+
CHROM POS REF ALT (null) Sample1:GT:Sample1:DP:Sample1:GQ:Sample1:AD Sample2:GT:Sample2:DP:Sample2:GQ:Sample2:AD
|
|
78
|
+
chr1 100 A T DP=30;AF=0.5;AC=2 0/1:15:99:5,10 1/1:18:99:0,18
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Output:**
|
|
82
|
+
|
|
83
|
+
```tsv
|
|
84
|
+
CHROM POS REF ALT AC AF DP sample sample_gt sample_dp sample_gq sample_ad sample_vaf
|
|
85
|
+
chr1 100 A T 2 0.5 30 Sample1 0/1 15 99 10 0.6667
|
|
86
|
+
chr1 100 A T 2 0.5 30 Sample2 1/1 18 99 18 1.0
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Notes:
|
|
90
|
+
|
|
91
|
+
- The `sample_ad` column contains the second value from the AD field (e.g., from `5,10` it extracts `10`)
|
|
92
|
+
- The `sample_vaf` column is the variant allele frequency calculated as `sample_ad / sample_dp`
|
|
93
|
+
- By default, output is in TSV format. Use `-f parquet` to output as Parquet files
|
|
94
|
+
- The `-o` option specifies an output prefix (e.g., `-o output` creates `output.tsv` or `output.parquet`)
|
|
95
|
+
|
|
96
|
+
### Pedigree Support
|
|
97
|
+
|
|
98
|
+
You can provide a pedigree file with the `--pedigree` option to add parent genotype information to the output. This enables trio analysis by including the father's and mother's genotypes for each sample.
|
|
99
|
+
|
|
100
|
+
**Pedigree File Format:**
|
|
101
|
+
|
|
102
|
+
The pedigree file should be a tab-separated file with the following columns:
|
|
103
|
+
|
|
104
|
+
- `FID`: Family ID
|
|
105
|
+
- `sample_id`: Sample identifier (matches the sample names in the VCF)
|
|
106
|
+
- `FatherBarcode`: Father's sample identifier (use `0` or `-9` if unknown)
|
|
107
|
+
- `MotherBarcode`: Mother's sample identifier (use `0` or `-9` if unknown)
|
|
108
|
+
- `Sex`: Sex of the sample (optional)
|
|
109
|
+
- `Pheno`: Phenotype information (optional)
|
|
110
|
+
|
|
111
|
+
Example pedigree file:
|
|
112
|
+
|
|
113
|
+
```tsv
|
|
114
|
+
FID sample_id FatherBarcode MotherBarcode Sex Pheno
|
|
115
|
+
FAM1 Child1 Father1 Mother1 1 2
|
|
116
|
+
FAM1 Father1 0 0 1 1
|
|
117
|
+
FAM1 Mother1 0 0 2 1
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
**Output with Pedigree:**
|
|
121
|
+
|
|
122
|
+
When using `--pedigree`, the output will include additional columns for each parent:
|
|
123
|
+
|
|
124
|
+
- `father_gt`, `father_dp`, `father_gq`, `father_ad`, `father_vaf`: Father's genotype information
|
|
125
|
+
- `mother_gt`, `mother_dp`, `mother_gq`, `mother_ad`, `mother_vaf`: Mother's genotype information
|
|
126
|
+
|
|
127
|
+
These columns will contain the parent's genotype data for the same variant, allowing you to analyze inheritance patterns.
|
|
128
|
+
|
|
129
|
+
## Development
|
|
130
|
+
|
|
131
|
+
This project uses:
|
|
132
|
+
|
|
133
|
+
- **UV** for package management
|
|
134
|
+
- **Polars** for fast data processing
|
|
135
|
+
- **Click** for CLI interface
|
|
136
|
+
|
|
137
|
+
## Testing
|
|
138
|
+
|
|
139
|
+
Test files are available in the `tests/` directory:
|
|
140
|
+
|
|
141
|
+
- `test.tabulated.tsv` - Real bcftools output
|
|
142
|
+
- `test_small.tsv` - Small example for quick testing
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
|
|
2
|
+
pywombat/cli.py,sha256=kUokXfnaSCKLXiCu7jXbYOPlGTtL5wSzocM9gFtPy30,32801
|
|
3
|
+
pywombat-0.1.0.dist-info/METADATA,sha256=3RlA_lLC7hKUxIrhQvvbBKEolYGOl_EVJgyDfDLI0sU,4982
|
|
4
|
+
pywombat-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
5
|
+
pywombat-0.1.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
|
|
6
|
+
pywombat-0.1.0.dist-info/RECORD,,
|