pywombat-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pywombat/__init__.py ADDED
@@ -0,0 +1,2 @@
+ def main() -> None:
+     print("Hello from pywombat!")
pywombat/cli.py ADDED
@@ -0,0 +1,935 @@
1
+ """CLI for wombat tool."""
2
+
3
+ import re
4
+ import warnings
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ import click
9
+ import polars as pl
10
+ import yaml
11
+
12
+
13
+ @click.command()
14
+ @click.argument("input_file", type=click.Path(exists=True, path_type=Path))
15
+ @click.option(
16
+ "-o",
17
+ "--output",
18
+ type=str,
19
+ help="Output file prefix. If not specified, prints to stdout.",
20
+ )
21
+ @click.option(
22
+ "-f",
23
+ "--format",
24
+ "output_format",
25
+ type=click.Choice(["tsv", "parquet"], case_sensitive=False),
26
+ default="tsv",
27
+ help="Output format: tsv (default) or parquet.",
28
+ )
29
+ @click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
30
+ @click.option(
31
+ "-p",
32
+ "--pedigree",
33
+ type=click.Path(exists=True, path_type=Path),
34
+ help="Pedigree file to add father and mother genotype columns.",
35
+ )
36
+ @click.option(
37
+ "-F",
38
+ "--filter-config",
39
+ type=click.Path(exists=True, path_type=Path),
40
+ help="Filter configuration YAML file to apply quality and impact filters.",
41
+ )
42
+ def cli(
43
+ input_file: Path,
44
+ output: Optional[str],
45
+ output_format: str,
46
+ verbose: bool,
47
+ pedigree: Optional[Path],
48
+ filter_config: Optional[Path],
49
+ ):
50
+ """
51
+ Wombat: A tool for processing bcftools tabulated TSV files.
52
+
53
+ This command:
54
+
55
+ \b
56
+ 1. Expands the '(null)' column containing NAME=value pairs separated by ';'
57
+ 2. Preserves the CSQ (Consequence) column without melting
58
+ 3. Melts sample columns into rows with sample names
59
+ 4. Splits sample values (GT:DP:GQ:AD format) into separate columns:
60
+ - sample_gt: Genotype
61
+ - sample_dp: Read depth
62
+ - sample_gq: Genotype quality
63
+ - sample_ad: Allele depth (second value from comma-separated list)
64
+ - sample_vaf: Variant allele frequency (sample_ad / sample_dp)
65
+
66
+ \b
67
+ Examples:
68
+ wombat input.tsv -o output
69
+ wombat input.tsv -o output -f parquet
70
+ wombat input.tsv > output.tsv
71
+ """
72
+ try:
73
+ if verbose:
74
+ click.echo(f"Reading input file: {input_file}", err=True)
75
+
76
+ # Read the TSV file
77
+ df = pl.read_csv(input_file, separator="\t")
78
+
79
+ if verbose:
80
+ click.echo(
81
+ f"Input shape: {df.shape[0]} rows, {df.shape[1]} columns", err=True
82
+ )
83
+
84
+ # Read pedigree file if provided
85
+ pedigree_df = None
86
+ if pedigree:
87
+ if verbose:
88
+ click.echo(f"Reading pedigree file: {pedigree}", err=True)
89
+ pedigree_df = read_pedigree(pedigree)
90
+
91
+ # Process the dataframe
92
+ formatted_df = format_bcftools_tsv(df, pedigree_df)
93
+
94
+ if verbose:
95
+ click.echo(
96
+ f"Output shape: {formatted_df.shape[0]} rows, {formatted_df.shape[1]} columns",
97
+ err=True,
98
+ )
99
+
100
+ # Apply filters if provided
101
+ filter_config_data = None
102
+ if filter_config:
103
+ if verbose:
104
+ click.echo(f"Reading filter config: {filter_config}", err=True)
105
+ filter_config_data = load_filter_config(filter_config)
106
+
107
+ # Apply filters and write output
108
+ if filter_config_data:
109
+ apply_filters_and_write(
110
+ formatted_df,
111
+ filter_config_data,
112
+ output,
113
+ output_format,
114
+ verbose,
115
+ )
116
+ else:
117
+ # No filters - write single output file
118
+ if output:
119
+ # Construct output filename with prefix and format
120
+ output_path = Path(f"{output}.{output_format}")
121
+
122
+ if output_format == "tsv":
123
+ formatted_df.write_csv(output_path, separator="\t")
124
+ elif output_format == "parquet":
125
+ formatted_df.write_parquet(output_path)
126
+
127
+ click.echo(f"Formatted data written to {output_path}", err=True)
128
+ else:
129
+ # Write to stdout (only for TSV format)
130
+ if output_format != "tsv":
131
+ click.echo(
132
+ "Error: stdout output only supported for TSV format. Use -o to specify an output prefix for parquet.",
133
+ err=True,
134
+ )
135
+ raise click.Abort()
136
+ click.echo(formatted_df.write_csv(separator="\t"), nl=False)
137
+
138
+ except Exception as e:
139
+ click.echo(f"Error: {e}", err=True)
140
+ raise click.Abort()
141
+
142
+
143
+ def load_filter_config(config_path: Path) -> dict:
144
+ """Load and parse filter configuration from YAML file."""
145
+ with open(config_path, "r") as f:
146
+ config = yaml.safe_load(f)
147
+ return config
148
+
149
+
150
+ def apply_quality_filters(
151
+ df: pl.DataFrame, quality_config: dict, verbose: bool = False
152
+ ) -> pl.DataFrame:
153
+ """Apply quality filters to the dataframe."""
154
+ if quality_config is None:
155
+ return df
156
+
157
+ original_rows = df.shape[0]
158
+
159
+ # Check for rare genotypes with '2' and warn
160
+ if "2" in str(df["sample_gt"].to_list()):
161
+ rare_gts = df.filter(pl.col("sample_gt").str.contains("2"))
162
+ if rare_gts.shape[0] > 0:
163
+ warnings.warn(
164
+ f"Found {rare_gts.shape[0]} rows with rare genotypes containing '2'. These will be kept."
165
+ )
166
+
167
+ # Filter: sample_gt must contain at least one '1' (default: true)
168
+ filter_no_alt = quality_config.get("filter_no_alt_allele", True)
169
+ if filter_no_alt:
170
+ df = df.filter(
171
+ pl.col("sample_gt").str.contains("1")
172
+ | pl.col("sample_gt").str.contains("2")
173
+ )
174
+
175
+ # Apply minimum depth filter
176
+ if "sample_dp_min" in quality_config:
177
+ min_dp = quality_config["sample_dp_min"]
178
+ df = df.filter(pl.col("sample_dp").cast(pl.Float64, strict=False) >= min_dp)
179
+
180
+ # Apply minimum GQ filter
181
+ if "sample_gq_min" in quality_config:
182
+ min_gq = quality_config["sample_gq_min"]
183
+ df = df.filter(pl.col("sample_gq").cast(pl.Float64, strict=False) >= min_gq)
184
+
185
+ # Determine genotype for VAF filters
186
+ # Het: contains exactly one '1' (0/1 or 1/0)
187
+ # HomAlt: 1/1
188
+ # HomRef: 0/0
189
+ is_het = (pl.col("sample_gt").str.count_matches("1") == 1) & ~pl.col(
190
+ "sample_gt"
191
+ ).str.contains("2")
192
+ is_hom_alt = pl.col("sample_gt") == "1/1"
193
+ is_hom_ref = pl.col("sample_gt") == "0/0"
194
+
195
+ # VAF filters for heterozygous
196
+ if "sample_vaf_het_min" in quality_config:
197
+ min_vaf_het = quality_config["sample_vaf_het_min"]
198
+ df = df.filter(~is_het | (pl.col("sample_vaf") >= min_vaf_het))
199
+
200
+ if "sample_vaf_het_max" in quality_config:
201
+ max_vaf_het = quality_config["sample_vaf_het_max"]
202
+ df = df.filter(~is_het | (pl.col("sample_vaf") <= max_vaf_het))
203
+
204
+ # VAF filters for homozygous alternate
205
+ if "sample_vaf_homalt_min" in quality_config:
206
+ min_vaf_homalt = quality_config["sample_vaf_homalt_min"]
207
+ df = df.filter(~is_hom_alt | (pl.col("sample_vaf") >= min_vaf_homalt))
208
+
209
+ # VAF filters for homozygous reference (wild type)
210
+ if "sample_vaf_hom_ref_max" in quality_config:
211
+ max_vaf_hom_ref = quality_config["sample_vaf_hom_ref_max"]
212
+ df = df.filter(~is_hom_ref | (pl.col("sample_vaf") <= max_vaf_hom_ref))
213
+
214
+ # Apply filters to parents if they exist and option is enabled
215
+ apply_to_parents = quality_config.get("apply_to_parents", False)
216
+ if apply_to_parents and "father" in df.columns:
217
+ # Apply same filters to father columns
218
+ if "sample_dp_min" in quality_config:
219
+ min_dp = quality_config["sample_dp_min"]
220
+ df = df.filter(
221
+ (pl.col("father_dp").is_null())
222
+ | (pl.col("father_dp").cast(pl.Float64, strict=False) >= min_dp)
223
+ )
224
+
225
+ if "sample_gq_min" in quality_config:
226
+ min_gq = quality_config["sample_gq_min"]
227
+ df = df.filter(
228
+ (pl.col("father_gq").is_null())
229
+ | (pl.col("father_gq").cast(pl.Float64, strict=False) >= min_gq)
230
+ )
231
+
232
+ # Father genotype checks
233
+ father_is_het = (pl.col("father_gt").str.count_matches("1") == 1) & ~pl.col(
234
+ "father_gt"
235
+ ).str.contains("2")
236
+ father_is_hom_alt = pl.col("father_gt") == "1/1"
237
+ father_is_hom_ref = pl.col("father_gt") == "0/0"
238
+
239
+ if "sample_vaf_het_min" in quality_config:
240
+ min_vaf_het = quality_config["sample_vaf_het_min"]
241
+ df = df.filter(
242
+ pl.col("father_vaf").is_null()
243
+ | ~father_is_het
244
+ | (pl.col("father_vaf") >= min_vaf_het)
245
+ )
246
+
247
+ if "sample_vaf_het_max" in quality_config:
248
+ max_vaf_het = quality_config["sample_vaf_het_max"]
249
+ df = df.filter(
250
+ pl.col("father_vaf").is_null()
251
+ | ~father_is_het
252
+ | (pl.col("father_vaf") <= max_vaf_het)
253
+ )
254
+
255
+ if "sample_vaf_homalt_min" in quality_config:
256
+ min_vaf_homalt = quality_config["sample_vaf_homalt_min"]
257
+ df = df.filter(
258
+ pl.col("father_vaf").is_null()
259
+ | ~father_is_hom_alt
260
+ | (pl.col("father_vaf") >= min_vaf_homalt)
261
+ )
262
+
263
+ if "sample_vaf_hom_ref_max" in quality_config:
264
+ max_vaf_hom_ref = quality_config["sample_vaf_hom_ref_max"]
265
+ df = df.filter(
266
+ pl.col("father_vaf").is_null()
267
+ | ~father_is_hom_ref
268
+ | (pl.col("father_vaf") <= max_vaf_hom_ref)
269
+ )
270
+
271
+ # Apply same filters to mother columns
272
+ if "sample_dp_min" in quality_config:
273
+ min_dp = quality_config["sample_dp_min"]
274
+ df = df.filter(
275
+ (pl.col("mother_dp").is_null())
276
+ | (pl.col("mother_dp").cast(pl.Float64, strict=False) >= min_dp)
277
+ )
278
+
279
+ if "sample_gq_min" in quality_config:
280
+ min_gq = quality_config["sample_gq_min"]
281
+ df = df.filter(
282
+ (pl.col("mother_gq").is_null())
283
+ | (pl.col("mother_gq").cast(pl.Float64, strict=False) >= min_gq)
284
+ )
285
+
286
+ # Mother genotype checks
287
+ mother_is_het = (pl.col("mother_gt").str.count_matches("1") == 1) & ~pl.col(
288
+ "mother_gt"
289
+ ).str.contains("2")
290
+ mother_is_hom_alt = pl.col("mother_gt") == "1/1"
291
+ mother_is_hom_ref = pl.col("mother_gt") == "0/0"
292
+
293
+ if "sample_vaf_het_min" in quality_config:
294
+ min_vaf_het = quality_config["sample_vaf_het_min"]
295
+ df = df.filter(
296
+ pl.col("mother_vaf").is_null()
297
+ | ~mother_is_het
298
+ | (pl.col("mother_vaf") >= min_vaf_het)
299
+ )
300
+
301
+ if "sample_vaf_het_max" in quality_config:
302
+ max_vaf_het = quality_config["sample_vaf_het_max"]
303
+ df = df.filter(
304
+ pl.col("mother_vaf").is_null()
305
+ | ~mother_is_het
306
+ | (pl.col("mother_vaf") <= max_vaf_het)
307
+ )
308
+
309
+ if "sample_vaf_homalt_min" in quality_config:
310
+ min_vaf_homalt = quality_config["sample_vaf_homalt_min"]
311
+ df = df.filter(
312
+ pl.col("mother_vaf").is_null()
313
+ | ~mother_is_hom_alt
314
+ | (pl.col("mother_vaf") >= min_vaf_homalt)
315
+ )
316
+
317
+ if "sample_vaf_hom_ref_max" in quality_config:
318
+ max_vaf_hom_ref = quality_config["sample_vaf_hom_ref_max"]
319
+ df = df.filter(
320
+ pl.col("mother_vaf").is_null()
321
+ | ~mother_is_hom_ref
322
+ | (pl.col("mother_vaf") <= max_vaf_hom_ref)
323
+ )
324
+
325
+ if verbose:
326
+ filtered_rows = df.shape[0]
327
+ click.echo(
328
+ f"Quality filters: {original_rows} -> {filtered_rows} rows ({original_rows - filtered_rows} filtered out)",
329
+ err=True,
330
+ )
331
+
332
+ return df
333
+
334
+
335
+ def parse_impact_filter_expression(expression: str, df: pl.DataFrame) -> pl.Expr:
336
+ """Parse a filter expression string into a Polars expression."""
337
+ # Replace operators with Polars equivalents
338
+ # Support: =, !=, <=, >=, <, >, &, |, ()
339
+
340
+ expr_str = expression.strip()
341
+
342
+ # Split by logical operators while preserving them
343
+ tokens = re.split(r"(\s*[&|]\s*|\(|\))", expr_str)
344
+ tokens = [t.strip() for t in tokens if t.strip()]
345
+
346
+ def parse_condition(condition: str) -> pl.Expr:
347
+ """Parse a single condition into a Polars expression."""
348
+ condition = condition.strip()
349
+
350
+ # Try different operators in order of specificity
351
+ for op in ["<=", ">=", "!=", "=", "<", ">"]:
352
+ if op in condition:
353
+ parts = condition.split(op, 1)
354
+ if len(parts) == 2:
355
+ col_name = parts[0].strip()
356
+ value = parts[1].strip()
357
+
358
+ # Check if column exists
359
+ if col_name not in df.columns:
360
+ raise ValueError(f"Column '{col_name}' not found in dataframe")
361
+
362
+ # Try to convert value to number, otherwise treat as string
363
+ try:
364
+ value_num = float(value)
365
+ col_expr = pl.col(col_name).cast(pl.Float64, strict=False)
366
+
367
+ if op == "=":
368
+ return col_expr == value_num
369
+ elif op == "!=":
370
+ return col_expr != value_num
371
+ elif op == "<=":
372
+ return col_expr <= value_num
373
+ elif op == ">=":
374
+ return col_expr >= value_num
375
+ elif op == "<":
376
+ return col_expr < value_num
377
+ elif op == ">":
378
+ return col_expr > value_num
379
+ except ValueError:
380
+ # String comparison (case-insensitive)
381
+ value = value.strip("'\"")
382
+ col_expr = pl.col(col_name).str.to_lowercase()
383
+ value_lower = value.lower()
384
+
385
+ if op == "=":
386
+ return col_expr == value_lower
387
+ elif op == "!=":
388
+ return col_expr != value_lower
389
+ else:
390
+ raise ValueError(
391
+ f"Operator '{op}' not supported for string comparison"
392
+ )
393
+ break
394
+
395
+ raise ValueError(f"Could not parse condition: {condition}")
396
+
397
+ def build_expression(tokens: list, idx: int = 0) -> tuple[pl.Expr, int]:
398
+ """Recursively build expression from tokens."""
399
+ if idx >= len(tokens):
400
+ return None, idx
401
+
402
+ result = None
403
+ i = idx
404
+
405
+ while i < len(tokens):
406
+ token = tokens[i]
407
+
408
+ if token == "(":
409
+ # Parse sub-expression
410
+ sub_expr, new_i = build_expression(tokens, i + 1)
411
+ if result is None:
412
+ result = sub_expr
413
+ i = new_i
414
+ elif token == ")":
415
+ # End of sub-expression
416
+ return result, i + 1
417
+ elif token == "&":
418
+ # AND operator
419
+ next_expr, new_i = build_expression(tokens, i + 1)
420
+ if next_expr is not None:
421
+ result = result & next_expr if result is not None else next_expr
422
+ return result, new_i
423
+ elif token == "|":
424
+ # OR operator
425
+ next_expr, new_i = build_expression(tokens, i + 1)
426
+ if next_expr is not None:
427
+ result = result | next_expr if result is not None else next_expr
428
+ return result, new_i
429
+ else:
430
+ # It's a condition
431
+ cond_expr = parse_condition(token)
432
+ if result is None:
433
+ result = cond_expr
434
+ else:
435
+ # If we have a result and encounter another condition without an operator,
436
+ # assume AND
437
+ result = result & cond_expr
438
+
439
+ i += 1
440
+
441
+ return result, i
442
+
443
+ expr, _ = build_expression(tokens)
444
+ return expr
445
+
446
+
447
+ def apply_impact_filters(
448
+ df: pl.DataFrame,
449
+ impact_config: list,
450
+ output_prefix: str,
451
+ output_format: str,
452
+ verbose: bool,
453
+ ):
454
+ """Apply impact filters and create separate output files."""
455
+ if not impact_config:
456
+ return
457
+
458
+ # Sort impact filters by priority (lower number = higher priority)
459
+ impact_filters = sorted(impact_config, key=lambda x: x.get("priority", 999))
460
+
461
+ # Create a dict to store variants by impact filter
462
+ impact_variants = {}
463
+
464
+ # Apply each impact filter
465
+ for impact_filter in impact_filters:
466
+ name = impact_filter["name"]
467
+ priority = impact_filter.get("priority", 999)
468
+ expression = impact_filter["expression"]
469
+
470
+ if verbose:
471
+ click.echo(
472
+ f"Applying impact filter '{name}' (priority {priority})...", err=True
473
+ )
474
+
475
+ try:
476
+ # Parse and apply the filter expression
477
+ filter_expr = parse_impact_filter_expression(expression, df)
478
+ filtered_df = df.filter(filter_expr)
479
+
480
+ if verbose:
481
+ click.echo(
482
+ f" Impact filter '{name}': {filtered_df.shape[0]} variants",
483
+ err=True,
484
+ )
485
+
486
+ # Store the filtered dataframe with its priority
487
+ impact_variants[name] = {
488
+ "df": filtered_df,
489
+ "priority": priority,
490
+ }
491
+ except Exception as e:
492
+ click.echo(
493
+ f"Error applying impact filter '{name}': {e}",
494
+ err=True,
495
+ )
496
+ raise
497
+
498
+ # Add flag_higher_impact column to each filtered dataframe
499
+ for name, data in impact_variants.items():
500
+ filtered_df = data["df"]
501
+ priority = data["priority"]
502
+
503
+ # Find variants that appear in higher priority filters
504
+ higher_priority_filters = []
505
+ for other_name, other_data in impact_variants.items():
506
+ if other_data["priority"] < priority:
507
+ higher_priority_filters.append(other_name)
508
+
509
+ if higher_priority_filters:
510
+ # Create a set of variant keys from higher priority filters
511
+ variant_keys_in_higher = set()
512
+ for other_name in higher_priority_filters:
513
+ other_df = impact_variants[other_name]["df"]
514
+ for row in other_df.select(["#CHROM", "POS", "REF", "ALT"]).iter_rows():
515
+ variant_keys_in_higher.add(tuple(row))
516
+
517
+ # Add flag_higher_impact column
518
+ def check_higher_impact(chrom, pos, ref, alt):
519
+ key = (chrom, pos, ref, alt)
520
+ if key in variant_keys_in_higher:
521
+ # Find which filters it appears in
522
+ filters_with_variant = []
523
+ for other_name in higher_priority_filters:
524
+ other_df = impact_variants[other_name]["df"]
525
+ match = other_df.filter(
526
+ (pl.col("#CHROM") == chrom)
527
+ & (pl.col("POS") == pos)
528
+ & (pl.col("REF") == ref)
529
+ & (pl.col("ALT") == alt)
530
+ )
531
+ if match.shape[0] > 0:
532
+ filters_with_variant.append(other_name)
533
+ return (
534
+ ", ".join(filters_with_variant) if filters_with_variant else ""
535
+ )
536
+ return ""
537
+
538
+ # Add the flag column
539
+ filtered_df = filtered_df.with_columns(
540
+ [
541
+ pl.struct(["#CHROM", "POS", "REF", "ALT"])
542
+ .map_elements(
543
+ lambda x: check_higher_impact(
544
+ x["#CHROM"], x["POS"], x["REF"], x["ALT"]
545
+ ),
546
+ return_dtype=pl.Utf8,
547
+ )
548
+ .alias("flag_higher_impact")
549
+ ]
550
+ )
551
+ else:
552
+ # No higher priority filters
553
+ filtered_df = filtered_df.with_columns(
554
+ [pl.lit("").alias("flag_higher_impact")]
555
+ )
556
+
557
+ # Write to file
558
+ output_filename = f"{output_prefix}_{name}.{output_format}"
559
+ output_path = Path(output_filename)
560
+
561
+ if output_format == "tsv":
562
+ filtered_df.write_csv(output_path, separator="\t")
563
+ elif output_format == "parquet":
564
+ filtered_df.write_parquet(output_path)
565
+
566
+ click.echo(
567
+ f"Written {filtered_df.shape[0]} variants to {output_path}", err=True
568
+ )
569
+
570
+
571
+ def apply_filters_and_write(
572
+ df: pl.DataFrame,
573
+ filter_config: dict,
574
+ output_prefix: Optional[str],
575
+ output_format: str,
576
+ verbose: bool,
577
+ ):
578
+ """Apply filters and write output files."""
579
+ # Apply quality filters first
580
+ quality_config = filter_config.get("quality", {})
581
+ filtered_df = apply_quality_filters(df, quality_config, verbose)
582
+
583
+ # Get impact filters
584
+ impact_config = filter_config.get("impact", [])
585
+
586
+ if not impact_config:
587
+ # No impact filters - write single output file
588
+ if not output_prefix:
589
+ # Write to stdout
590
+ if output_format != "tsv":
591
+ click.echo(
592
+ "Error: stdout output only supported for TSV format.",
593
+ err=True,
594
+ )
595
+ raise click.Abort()
596
+ click.echo(filtered_df.write_csv(separator="\t"), nl=False)
597
+ else:
598
+ output_path = Path(f"{output_prefix}.{output_format}")
599
+
600
+ if output_format == "tsv":
601
+ filtered_df.write_csv(output_path, separator="\t")
602
+ elif output_format == "parquet":
603
+ filtered_df.write_parquet(output_path)
604
+
605
+ click.echo(f"Formatted data written to {output_path}", err=True)
606
+ else:
607
+ # Apply impact filters and create multiple output files
608
+ if not output_prefix:
609
+ click.echo(
610
+ "Error: Output prefix required when using impact filters.",
611
+ err=True,
612
+ )
613
+ raise click.Abort()
614
+
615
+ apply_impact_filters(
616
+ filtered_df,
617
+ impact_config,
618
+ output_prefix,
619
+ output_format,
620
+ verbose,
621
+ )
622
+
623
+
624
+ def read_pedigree(pedigree_path: Path) -> pl.DataFrame:
625
+ """
626
+ Read a pedigree file and return a DataFrame with sample relationships.
627
+
628
+ Args:
629
+ pedigree_path: Path to the pedigree file
630
+
631
+ Returns:
632
+ DataFrame with columns: sample_id, father_id, mother_id
633
+ """
634
+ # Try reading with header first
635
+ df = pl.read_csv(pedigree_path, separator="\t")
636
+
637
+ # Check if first row has 'FID' in first column (indicates header)
638
+ if df.columns[0] == "FID" or "sample_id" in df.columns:
639
+ # Has header - use it as-is
640
+ pass
641
+ else:
642
+ # No header - assume standard pedigree format
643
+ # FID, sample_id, father_id, mother_id, sex, phenotype
644
+ df.columns = ["FID", "sample_id", "father_id", "mother_id", "sex", "phenotype"]
645
+
646
+ # Ensure we have the required columns (try different possible names)
647
+ if "sample_id" not in df.columns and len(df.columns) >= 4:
648
+ # Try to identify columns by position
649
+ df = df.rename(
650
+ {
651
+ df.columns[1]: "sample_id",
652
+ df.columns[2]: "father_id",
653
+ df.columns[3]: "mother_id",
654
+ }
655
+ )
656
+
657
+ # Handle different column names for father/mother
658
+ if "FatherBarcode" in df.columns:
659
+ df = df.rename({"FatherBarcode": "father_id", "MotherBarcode": "mother_id"})
660
+
661
+ # Select only the columns we need
662
+ pedigree_df = df.select(["sample_id", "father_id", "mother_id"])
663
+
664
+ # Replace 0 and -9 with null (indicating no parent)
665
+ pedigree_df = pedigree_df.with_columns(
666
+ [
667
+ pl.when(pl.col("father_id").cast(pl.Utf8).is_in(["0", "-9"]))
668
+ .then(None)
669
+ .otherwise(pl.col("father_id"))
670
+ .alias("father_id"),
671
+ pl.when(pl.col("mother_id").cast(pl.Utf8).is_in(["0", "-9"]))
672
+ .then(None)
673
+ .otherwise(pl.col("mother_id"))
674
+ .alias("mother_id"),
675
+ ]
676
+ )
677
+
678
+ return pedigree_df
679
+
680
+
681
+ def add_parent_genotypes(df: pl.DataFrame, pedigree_df: pl.DataFrame) -> pl.DataFrame:
682
+ """
683
+ Add father and mother genotype columns to the DataFrame.
684
+
685
+ Args:
686
+ df: DataFrame with sample genotype information
687
+ pedigree_df: DataFrame with parent relationships
688
+
689
+ Returns:
690
+ DataFrame with added parent genotype columns
691
+ """
692
+ # Join with pedigree to get father and mother IDs for each sample
693
+ df = df.join(pedigree_df, left_on="sample", right_on="sample_id", how="left")
694
+
695
+ # Define the core variant-identifying columns for joining parent genotypes
696
+ # We only want to join on genomic position, not on annotation columns
697
+ # This ensures we match parents even if they have different VEP annotations
698
+ core_variant_cols = ["#CHROM", "POS", "REF", "ALT"]
699
+ # Check which columns actually exist in the dataframe
700
+ join_cols = [col for col in core_variant_cols if col in df.columns]
701
+
702
+ # Create a self-join friendly version of the data for looking up parent genotypes
703
+ # We select only the join columns + sample genotype information
704
+ parent_lookup = df.select(
705
+ join_cols
706
+ + [
707
+ pl.col("sample"),
708
+ pl.col("sample_gt"),
709
+ pl.col("sample_dp"),
710
+ pl.col("sample_gq"),
711
+ pl.col("sample_ad"),
712
+ pl.col("sample_vaf"),
713
+ ]
714
+ ).unique()
715
+
716
+ # Join for father's genotypes
717
+ # Match on genomic position AND father_id == sample
718
+ father_data = parent_lookup.rename(
719
+ {
720
+ "sample": "father_id",
721
+ "sample_gt": "father_gt",
722
+ "sample_dp": "father_dp",
723
+ "sample_gq": "father_gq",
724
+ "sample_ad": "father_ad",
725
+ "sample_vaf": "father_vaf",
726
+ }
727
+ )
728
+
729
+ df = df.join(father_data, on=join_cols + ["father_id"], how="left")
730
+
731
+ # Join for mother's genotypes
732
+ mother_data = parent_lookup.rename(
733
+ {
734
+ "sample": "mother_id",
735
+ "sample_gt": "mother_gt",
736
+ "sample_dp": "mother_dp",
737
+ "sample_gq": "mother_gq",
738
+ "sample_ad": "mother_ad",
739
+ "sample_vaf": "mother_vaf",
740
+ }
741
+ )
742
+
743
+ df = df.join(mother_data, on=join_cols + ["mother_id"], how="left")
744
+
745
+ # Rename father_id and mother_id to father and mother for debugging
746
+ df = df.rename({"father_id": "father", "mother_id": "mother"})
747
+
748
+ # Replace '.' with '0' for parent DP and GQ columns
749
+ df = df.with_columns(
750
+ [
751
+ pl.when(pl.col("father_dp") == ".")
752
+ .then(pl.lit("0"))
753
+ .otherwise(pl.col("father_dp"))
754
+ .alias("father_dp"),
755
+ pl.when(pl.col("father_gq") == ".")
756
+ .then(pl.lit("0"))
757
+ .otherwise(pl.col("father_gq"))
758
+ .alias("father_gq"),
759
+ pl.when(pl.col("mother_dp") == ".")
760
+ .then(pl.lit("0"))
761
+ .otherwise(pl.col("mother_dp"))
762
+ .alias("mother_dp"),
763
+ pl.when(pl.col("mother_gq") == ".")
764
+ .then(pl.lit("0"))
765
+ .otherwise(pl.col("mother_gq"))
766
+ .alias("mother_gq"),
767
+ ]
768
+ )
769
+
770
+ return df
771
+
772
+
773
+ def format_bcftools_tsv(
774
+ df: pl.DataFrame, pedigree_df: Optional[pl.DataFrame] = None
775
+ ) -> pl.DataFrame:
776
+ """
777
+ Format a bcftools tabulated TSV DataFrame.
778
+
779
+ Args:
780
+ df: Input DataFrame from bcftools
781
+ pedigree_df: Optional pedigree DataFrame with parent information
782
+
783
+ Returns:
784
+ Formatted DataFrame with expanded fields and melted samples
785
+ """
786
+ # Find the (null) column
787
+ if "(null)" not in df.columns:
788
+ raise ValueError("Column '(null)' not found in the input file")
789
+
790
+ # Get column index of (null)
791
+ null_col_idx = df.columns.index("(null)")
792
+
793
+ # Split columns into: before (null), (null), and after (null)
794
+ cols_after = df.columns[null_col_idx + 1 :]
795
+
796
+ # Step 1: Expand the (null) column
797
+ # Split by semicolon and create new columns
798
+
799
+ # First, we need to extract all unique field names from the (null) column
800
+ # to know what columns to create
801
+ null_values = df.select("(null)").to_series()
802
+ all_fields = set()
803
+
804
+ for value in null_values:
805
+ if value and not (isinstance(value, float)): # Skip null/NaN values
806
+ pairs = str(value).split(";")
807
+ for pair in pairs:
808
+ if "=" in pair:
809
+ field_name = pair.split("=", 1)[0]
810
+ all_fields.add(field_name)
811
+
812
+ # Create expressions to extract each field
813
+ for field in sorted(all_fields):
814
+ # Extract the field value from the (null) column
815
+ # Pattern: extract value after "field=" and before ";" or end of string
816
+ df = df.with_columns(
817
+ pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
818
+ )
819
+
820
+ # Drop the original (null) column
821
+ df = df.drop("(null)")
822
+
823
+ # Drop CSQ column if it exists (it was extracted from (null) column)
824
+ if "CSQ" in df.columns:
825
+ df = df.drop("CSQ")
826
+
827
+ # Step 2: Identify sample columns and extract sample names
828
+ # Sample columns have format "sample_name:..." in the header
829
+ # Skip the CSQ column as it should not be melted (handled above)
830
+ sample_cols = []
831
+ sample_names = []
832
+
833
+ for col in cols_after:
834
+ # Skip CSQ column
835
+ if col == "CSQ":
836
+ continue
837
+
838
+ if ":" in col:
839
+ sample_name = col.split(":", 1)[0]
840
+ sample_cols.append(col)
841
+ sample_names.append(sample_name)
842
+ else:
843
+ # If no colon, treat the whole column name as sample name
844
+ sample_cols.append(col)
845
+ sample_names.append(col)
846
+
847
+ if not sample_cols:
848
+ # No sample columns to melt, just return expanded data
849
+ return df
850
+
851
+ # Step 3: Melt the sample columns
852
+ # Keep all columns except sample columns as id_vars
853
+ id_vars = [col for col in df.columns if col not in sample_cols]
854
+
855
+ # Create a mapping of old column names to sample names
856
+ rename_map = {old: new for old, new in zip(sample_cols, sample_names)}
857
+
858
+ # Rename sample columns to just sample names before melting
859
+ df = df.rename(rename_map)
860
+
861
+ # Melt the dataframe
862
+ melted_df = df.melt(
863
+ id_vars=id_vars,
864
+ value_vars=sample_names,
865
+ variable_name="sample",
866
+ value_name="sample_value",
867
+ )
868
+
869
+ # Step 4: Split sample_value into GT:DP:GQ:AD format
870
+ # Split on ':' to get individual fields
871
+ # Use nullable=True to handle missing fields gracefully
872
+ melted_df = melted_df.with_columns(
873
+ [
874
+ # GT - first field (nullable for missing data)
875
+ pl.col("sample_value")
876
+ .str.split(":")
877
+ .list.get(0, null_on_oob=True)
878
+ .alias("sample_gt"),
879
+ # DP - second field (nullable for missing data)
880
+ pl.col("sample_value")
881
+ .str.split(":")
882
+ .list.get(1, null_on_oob=True)
883
+ .alias("sample_dp"),
884
+ # GQ - third field (nullable for missing data)
885
+ pl.col("sample_value")
886
+ .str.split(":")
887
+ .list.get(2, null_on_oob=True)
888
+ .alias("sample_gq"),
889
+ # AD - fourth field, split on ',' and keep second value (nullable)
890
+ pl.col("sample_value")
891
+ .str.split(":")
892
+ .list.get(3, null_on_oob=True)
893
+ .str.split(",")
894
+ .list.get(1, null_on_oob=True)
895
+ .alias("sample_ad"),
896
+ ]
897
+ )
898
+
899
+ # Replace '.' with '0' for DP and GQ columns
900
+ melted_df = melted_df.with_columns(
901
+ [
902
+ pl.when(pl.col("sample_dp") == ".")
903
+ .then(pl.lit("0"))
904
+ .otherwise(pl.col("sample_dp"))
905
+ .alias("sample_dp"),
906
+ pl.when(pl.col("sample_gq") == ".")
907
+ .then(pl.lit("0"))
908
+ .otherwise(pl.col("sample_gq"))
909
+ .alias("sample_gq"),
910
+ ]
911
+ )
912
+
913
+ # Step 5: Calculate sample_vaf as sample_ad / sample_dp
914
+ # Convert to numeric, calculate ratio, handle division by zero
915
+ melted_df = melted_df.with_columns(
916
+ [
917
+ (
918
+ pl.col("sample_ad").cast(pl.Float64, strict=False)
919
+ / pl.col("sample_dp").cast(pl.Float64, strict=False)
920
+ ).alias("sample_vaf")
921
+ ]
922
+ )
923
+
924
+ # Drop the original sample_value column
925
+ melted_df = melted_df.drop("sample_value")
926
+
927
+ # Step 6: Add parent genotype information if pedigree is provided
928
+ if pedigree_df is not None:
929
+ melted_df = add_parent_genotypes(melted_df, pedigree_df)
930
+
931
+ return melted_df
932
+
933
+
934
+ if __name__ == "__main__":
935
+ cli()
pywombat-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,142 @@
+ Metadata-Version: 2.4
+ Name: pywombat
+ Version: 0.1.0
+ Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
+ Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
+ Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
+ Project-URL: Issues, https://github.com/bourgeron-lab/pywombat/issues
+ Author-email: Freddy Cliquet <fcliquet@pasteur.fr>
+ License: MIT
+ Keywords: bioinformatics,genomics,pedigree,variant-calling,vcf
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+ Requires-Python: >=3.12
+ Requires-Dist: click>=8.1.0
+ Requires-Dist: polars>=0.19.0
+ Requires-Dist: pyyaml>=6.0
+ Description-Content-Type: text/markdown
+
+ # PyWombat
+
+ A CLI tool for processing bcftools tabulated TSV files.
+
+ ## Installation
+
+ This is a UV-managed Python package. To install:
+
+ ```bash
+ uv sync
+ ```
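+
+ Alternatively, the built wheel can be installed into any environment with pip (the filename below is this release's wheel):
+
+ ```bash
+ pip install pywombat-0.1.0-py3-none-any.whl
+ ```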
+
+ ## Usage
+
+ The `wombat` command processes bcftools tabulated TSV files:
+
+ ```bash
+ # Format a bcftools TSV file and print to stdout
+ wombat input.tsv
+
+ # Format and save to output file (creates output.tsv by default)
+ wombat input.tsv -o output
+
+ # Format and save as parquet
+ wombat input.tsv -o output -f parquet
+ wombat input.tsv -o output --format parquet
+
+ # Format with pedigree information to add parent genotypes
+ wombat input.tsv --pedigree pedigree.tsv -o output
+ ```
+
+ ### What does `wombat` do?
+
+ The `wombat` command processes bcftools tabulated TSV files by:
+
+ 1. **Expanding the `(null)` column**: This column contains multiple fields in the format `NAME=value` separated by semicolons (e.g., `DP=30;AF=0.5;AC=2`). Each field is extracted into its own column.
+
+ 2. **Preserving the `CSQ` column**: The CSQ (Consequence) column is preserved as-is and not melted, allowing VEP annotations to remain intact.
+
+ 3. **Melting and splitting sample columns**: After the `(null)` column, there are typically sample columns with values in `GT:DP:GQ:AD` format. The tool:
+    - Extracts the sample name (the part before the first `:` character)
+    - Transforms the wide format into long format
+    - Creates a `sample` column with the sample names
+    - Splits the sample values into separate columns:
+      - `sample_gt`: Genotype (e.g., 0/1, 1/1)
+      - `sample_dp`: Read depth
+      - `sample_gq`: Genotype quality
+      - `sample_ad`: Allele depth (the second value from the comma-separated list)
+      - `sample_vaf`: Variant allele frequency (calculated as sample_ad / sample_dp)
+
+ ### Example
+
+ **Input:**
+
+ ```tsv
+ CHROM POS REF ALT (null) Sample1:GT:Sample1:DP:Sample1:GQ:Sample1:AD Sample2:GT:Sample2:DP:Sample2:GQ:Sample2:AD
+ chr1 100 A T DP=30;AF=0.5;AC=2 0/1:15:99:5,10 1/1:18:99:0,18
+ ```
+
+ **Output:**
+
+ ```tsv
+ CHROM POS REF ALT AC AF DP sample sample_gt sample_dp sample_gq sample_ad sample_vaf
+ chr1 100 A T 2 0.5 30 Sample1 0/1 15 99 10 0.6667
+ chr1 100 A T 2 0.5 30 Sample2 1/1 18 99 18 1.0
+ ```
+
+ Notes:
+
+ - The `sample_ad` column contains the second value from the AD field (e.g., from `5,10` it extracts `10`)
+ - The `sample_vaf` column is the variant allele frequency calculated as `sample_ad / sample_dp`
+ - By default, output is in TSV format. Use `-f parquet` to output as Parquet files (see the snippet below for reading them back)
+ - The `-o` option specifies an output prefix (e.g., `-o output` creates `output.tsv` or `output.parquet`)
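+
+ Because the parquet output is a plain Polars-readable file, it is easy to inspect or post-filter programmatically. A minimal sketch (the `output.parquet` name assumes the `-o output -f parquet` invocation above; the 0.3 VAF threshold is purely illustrative):
+
+ ```python
+ import polars as pl
+
+ # Load the long-format table written by wombat
+ df = pl.read_parquet("output.parquet")
+
+ # Keep heterozygous calls with a reasonable allele balance
+ het = df.filter((pl.col("sample_gt") == "0/1") & (pl.col("sample_vaf") >= 0.3))
+ print(het.select(["sample", "sample_gt", "sample_dp", "sample_vaf"]))
+ ```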
+
+ ### Pedigree Support
+
+ You can provide a pedigree file with the `--pedigree` option to add parent genotype information to the output. This enables trio analysis by including the father's and mother's genotypes for each sample.
+
+ **Pedigree File Format:**
+
+ The pedigree file should be a tab-separated file with the following columns:
+
+ - `FID`: Family ID
+ - `sample_id`: Sample identifier (matches the sample names in the VCF)
+ - `FatherBarcode`: Father's sample identifier (use `0` or `-9` if unknown)
+ - `MotherBarcode`: Mother's sample identifier (use `0` or `-9` if unknown)
+ - `Sex`: Sex of the sample (optional)
+ - `Pheno`: Phenotype information (optional)
+
+ Example pedigree file:
+
+ ```tsv
+ FID sample_id FatherBarcode MotherBarcode Sex Pheno
+ FAM1 Child1 Father1 Mother1 1 2
+ FAM1 Father1 0 0 1 1
+ FAM1 Mother1 0 0 2 1
+ ```
+
+ **Output with Pedigree:**
+
+ When using `--pedigree`, the output will include additional columns for each parent:
+
+ - `father_gt`, `father_dp`, `father_gq`, `father_ad`, `father_vaf`: Father's genotype information
+ - `mother_gt`, `mother_dp`, `mother_gq`, `mother_ad`, `mother_vaf`: Mother's genotype information
+
+ These columns will contain the parent's genotype data for the same variant, allowing you to analyze inheritance patterns.
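+
+ ### Filter Configuration
+
+ The CLI also accepts a `-F`/`--filter-config` option pointing to a YAML file with a `quality` section and an `impact` section. The sketch below is illustrative rather than canonical: the key names are the ones `cli.py` reads, but the thresholds and the `IMPACT`/`gnomAD_AF` column names are placeholders for whatever INFO fields your input actually contains.
+
+ ```yaml
+ quality:
+   filter_no_alt_allele: true    # drop rows whose genotype carries no ALT allele (default: true)
+   sample_dp_min: 10             # minimum read depth
+   sample_gq_min: 20             # minimum genotype quality
+   sample_vaf_het_min: 0.25      # VAF bounds for heterozygous calls
+   sample_vaf_het_max: 0.75
+   sample_vaf_homalt_min: 0.9    # minimum VAF for 1/1 calls
+   sample_vaf_hom_ref_max: 0.05  # maximum VAF for 0/0 calls
+   apply_to_parents: false       # also apply depth/GQ/VAF checks to father_*/mother_* columns
+
+ impact:
+   - name: high_impact
+     priority: 1                 # lower number = higher priority
+     expression: "IMPACT = HIGH & gnomAD_AF <= 0.001"
+   - name: moderate_impact
+     priority: 2
+     expression: "IMPACT = MODERATE | IMPACT = HIGH"
+ ```
+
+ Expressions combine simple `column OP value` comparisons (`=`, `!=`, `<`, `<=`, `>`, `>=`) with `&`, `|` and parentheses. When impact filters are present an output prefix is required: each filter writes its own file (for example `output_high_impact.tsv`) and gains a `flag_higher_impact` column listing any higher-priority filters that also matched the variant:
+
+ ```bash
+ wombat input.tsv -o output -F filters.yaml
+ ```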
+
+ ## Development
+
+ This project uses:
+
+ - **UV** for package management
+ - **Polars** for fast data processing
+ - **Click** for CLI interface
+
+ ## Testing
+
+ Test files are available in the `tests/` directory:
+
+ - `test.tabulated.tsv` - Real bcftools output
+ - `test_small.tsv` - Small example for quick testing
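+
+ For a quick smoke test, the bundled example can be run directly (the path assumes a source checkout; drop `uv run` if `wombat` is already on your PATH):
+
+ ```bash
+ uv run wombat tests/test_small.tsv | head
+ ```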
pywombat-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,6 @@
+ pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
+ pywombat/cli.py,sha256=kUokXfnaSCKLXiCu7jXbYOPlGTtL5wSzocM9gFtPy30,32801
+ pywombat-0.1.0.dist-info/METADATA,sha256=3RlA_lLC7hKUxIrhQvvbBKEolYGOl_EVJgyDfDLI0sU,4982
+ pywombat-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ pywombat-0.1.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
+ pywombat-0.1.0.dist-info/RECORD,,
pywombat-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
pywombat-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ wombat = pywombat.cli:cli