pywombat 0.5.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pywombat/cli.py +907 -30
- pywombat-1.0.1.dist-info/METADATA +641 -0
- pywombat-1.0.1.dist-info/RECORD +6 -0
- pywombat-0.5.0.dist-info/METADATA +0 -142
- pywombat-0.5.0.dist-info/RECORD +0 -6
- {pywombat-0.5.0.dist-info → pywombat-1.0.1.dist-info}/WHEEL +0 -0
- {pywombat-0.5.0.dist-info → pywombat-1.0.1.dist-info}/entry_points.txt +0 -0
pywombat/cli.py
CHANGED
```diff
@@ -122,7 +122,7 @@ def cli(
     else:
         output = input_stem
 
-    #
+    # Process using streaming mode
    if verbose:
         click.echo("Processing with streaming mode...", err=True)
 
@@ -147,7 +147,9 @@ def cli(
 
     # Apply filters if provided
     if filter_config_data:
-        lazy_df = apply_filters_lazy(lazy_df, filter_config_data, verbose)
+        lazy_df = apply_filters_lazy(
+            lazy_df, filter_config_data, verbose, pedigree_df
+        )
 
     # Write output
     output_path = Path(f"{output}.{output_format}")
```
```diff
@@ -463,6 +465,321 @@ def apply_quality_filters(
     return df
 
 
+# ------------------ De novo (DNM) filter helpers ------------------
+
+
+def _chrom_short(chrom: str) -> str:
+    """Normalize chromosome name to short form (e.g., 'chrX' -> 'X')."""
+    if chrom is None:
+        return ""
+    chrom = str(chrom)
+    return chrom[3:] if chrom.lower().startswith("chr") else chrom
+
+
+def _pos_in_par(chrom: str, pos: int, par_regions: dict) -> bool:
+    """Return True if (chrom,pos) falls in any PAR region from config.
+
+    Normalizes chromosome names to match both 'X'/'chrX' formats.
+    """
+    if not par_regions:
+        return False
+
+    chrom_short = _chrom_short(chrom)
+
+    for assembly, regions in par_regions.items():
+        for region_name, region in regions.items():
+            region_chrom = _chrom_short(region.get("chrom", "X"))
+            start = int(region.get("start"))
+            end = int(region.get("end"))
+            # Normalize both to uppercase for comparison
+            if region_chrom.upper() == chrom_short.upper() and start <= pos <= end:
+                return True
+    return False
+
+
+def apply_de_novo_filter(
+    df: pl.DataFrame,
+    dnm_config: dict,
+    verbose: bool = False,
+    pedigree_df: Optional[pl.DataFrame] = None,
+) -> pl.DataFrame:
+    """Apply de novo detection filters to dataframe using vectorized operations.
+
+    dnm_config expected keys:
+    - sample_dp_min, sample_gq_min, sample_vaf_min
+    - parent_dp_min, parent_gq_min, parent_vaf_max
+    - par_regions: dict with PAR regions keyed by assembly
+
+    This function will read `sex` from `df` when present; otherwise it will use
+    the `pedigree_df` (which should contain `sample_id` and `sex`).
+    """
+    if not dnm_config:
+        return df
+
+    # Required thresholds
+    s_dp = dnm_config.get("sample_dp_min", 10)
+    s_gq = dnm_config.get("sample_gq_min", 18)
+    s_vaf = dnm_config.get("sample_vaf_min", 0.15)
+    s_vaf_hemizygous = dnm_config.get("sample_vaf_hemizygous_min", 0.85)
+
+    p_dp = dnm_config.get("parent_dp_min", 10)
+    p_gq = dnm_config.get("parent_gq_min", 18)
+    p_vaf = dnm_config.get("parent_vaf_max", 0.02)
+
+    fafmax_max = dnm_config.get("fafmax_faf95_max_genomes_max", None)
+    genomes_filters_pass_only = dnm_config.get("genomes_filters_pass_only", False)
+
+    par_regions = dnm_config.get("par_regions", {})
+
+    original = df.shape[0]
+
+    # Ensure we have parent identifiers (father/mother). Try to add from pedigree if missing.
+    if "father_id" not in df.columns or "mother_id" not in df.columns:
+        if pedigree_df is not None:
+            # Join pedigree to get father/mother/sample sex if needed
+            df = df.join(
+                pedigree_df, left_on="sample", right_on="sample_id", how="left"
+            )
+        else:
+            raise click.Abort(
+                "DNM filtering requires a pedigree (with father/mother IDs and sex)."
+            )
+
+    if verbose:
+        click.echo(
+            f"DNM: Starting with {df.shape[0]} variants after pedigree join", err=True
+        )
+
+    # Optimization #6: Early exit for variants with missing parents
+    df = df.filter(
+        pl.col("father_id").is_not_null() & pl.col("mother_id").is_not_null()
+    )
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after filtering missing parents", err=True
+        )
+
+    if df.shape[0] == 0:
+        if verbose:
+            click.echo(
+                f"De novo filter: {original} -> 0 rows (all missing parents)", err=True
+            )
+        return df
+
+    # Ensure we have sex information
+    if "sex" not in df.columns:
+        if pedigree_df is not None and "sex" in pedigree_df.columns:
+            # Re-join to get sex if not already present
+            if "sex" not in df.columns:
+                df = df.join(
+                    pedigree_df.select(["sample_id", "sex"]),
+                    left_on="sample",
+                    right_on="sample_id",
+                    how="left",
+                )
+        else:
+            raise click.Abort(
+                "DNM filtering requires sex information in the pedigree (column 'sex')."
+            )
+
+    # Filter out variants with missing/partial genotypes (containing '.')
+    # Sample genotype must be fully called (no '.', './0', '1/.', './.' etc.)
+    df = df.filter(
+        ~pl.col("sample_gt").str.contains(r"\.") & pl.col("sample_gt").is_not_null()
+    )
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after removing samples with missing/partial GT",
+            err=True,
+        )
+
+    # Ensure proband passes basic sample thresholds (DP, GQ)
+    # VAF threshold will be applied differently for hemizygous vs diploid variants
+    df = df.filter(
+        (pl.col("sample_dp").cast(pl.Float64, strict=False) >= s_dp)
+        & (pl.col("sample_gq").cast(pl.Float64, strict=False) >= s_gq)
+    )
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after proband QC (DP>={s_dp}, GQ>={s_gq})",
+            err=True,
+        )
+
+    if df.shape[0] == 0:
+        if verbose:
+            click.echo(
+                f"De novo filter: {original} -> 0 rows (sample QC failed)", err=True
+            )
+        return df
+
+    # Optimization #7: Vectorized parent filtering using when/then expressions
+    # Build chromosome-specific parent filters
+
+    # Ensure #CHROM is string type for operations (convert from categorical if needed)
+    if "#CHROM" in df.columns and df.schema["#CHROM"] == pl.Categorical:
+        df = df.with_columns(pl.col("#CHROM").cast(pl.Utf8))
+
+    # Normalize chromosome to short form for comparison
+    df = df.with_columns(
+        pl.col("#CHROM")
+        .str.replace("^chr", "")
+        .str.to_uppercase()
+        .alias("_chrom_short")
+    )
+
+    # Determine if variant is in PAR region (vectorized)
+    par_mask = pl.lit(False)
+    if par_regions:
+        for assembly, regions in par_regions.items():
+            for region_name, region in regions.items():
+                region_chrom = _chrom_short(region.get("chrom", "X")).upper()
+                start = int(region.get("start"))
+                end = int(region.get("end"))
+                par_mask = par_mask | (
+                    (pl.col("_chrom_short") == region_chrom)
+                    & (pl.col("POS") >= start)
+                    & (pl.col("POS") <= end)
+                )
+
+    df = df.with_columns(par_mask.alias("_in_par"))
+
+    # Normalize sex to uppercase string for comparison
+    df = df.with_columns(
+        pl.col("sex").cast(pl.Utf8).str.to_uppercase().alias("_sex_norm")
+    )
+
+    # Determine if variant is hemizygous (X in males outside PAR, or Y in males)
+    is_hemizygous = (
+        (pl.col("_chrom_short") == "X")
+        & (pl.col("_sex_norm").is_in(["1", "M"]))
+        & ~pl.col("_in_par")
+    ) | (pl.col("_chrom_short") == "Y")
+
+    # Determine if variant is homozygous (1/1, 2/2, etc.)
+    is_homozygous = pl.col("sample_gt").is_in(["1/1", "2/2", "3/3"])
+
+    # Apply VAF threshold: hemizygous and homozygous variants require higher VAF (>=0.85)
+    sample_vaf_filter = (
+        pl.when(is_hemizygous | is_homozygous)
+        .then(pl.col("sample_vaf") >= s_vaf_hemizygous)
+        .otherwise(pl.col("sample_vaf") > s_vaf)
+    )
+
+    df = df.filter(sample_vaf_filter)
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after VAF filtering (het>{s_vaf}, hom/hemizygous>={s_vaf_hemizygous})",
+            err=True,
+        )
+
+    # Apply fafmax_faf95_max_genomes filter if specified
+    if fafmax_max is not None:
+        if "fafmax_faf95_max_genomes" in df.columns:
+            df = df.filter(
+                (
+                    pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False)
+                    <= fafmax_max
+                )
+                | pl.col("fafmax_faf95_max_genomes").is_null()
+            )
+            if verbose:
+                click.echo(
+                    f"DNM: {df.shape[0]} variants after fafmax_faf95_max_genomes filter (<={fafmax_max})",
+                    err=True,
+                )
+        elif verbose:
+            click.echo(
+                "DNM: Warning - fafmax_faf95_max_genomes column not found, skipping frequency filter",
+                err=True,
+            )
+
+    # Apply genomes_filters filter if specified
+    if genomes_filters_pass_only:
+        if "genomes_filters" in df.columns:
+            df = df.filter(
+                (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
+            )
+            if verbose:
+                click.echo(
+                    f"DNM: {df.shape[0]} variants after genomes_filters filter (pass only)",
+                    err=True,
+                )
+        elif verbose:
+            click.echo(
+                "DNM: Warning - genomes_filters column not found, skipping genomes_filters filter",
+                err=True,
+            )
+
+    # Build parent quality checks (common to all)
+    father_qual_ok = (pl.col("father_dp").cast(pl.Float64, strict=False) >= p_dp) & (
+        pl.col("father_gq").cast(pl.Float64, strict=False) >= p_gq
+    )
+    mother_qual_ok = (pl.col("mother_dp").cast(pl.Float64, strict=False) >= p_dp) & (
+        pl.col("mother_gq").cast(pl.Float64, strict=False) >= p_gq
+    )
+
+    father_vaf_ok = pl.col("father_vaf").is_null() | (pl.col("father_vaf") < p_vaf)
+    mother_vaf_ok = pl.col("mother_vaf").is_null() | (pl.col("mother_vaf") < p_vaf)
+
+    # Parent genotype checks: ensure no '.' in genotypes (no ./., 0/., 1/. etc.)
+    father_gt_ok = (
+        ~pl.col("father_gt").str.contains(r"\.") & pl.col("father_gt").is_not_null()
+    )
+    mother_gt_ok = (
+        ~pl.col("mother_gt").str.contains(r"\.") & pl.col("mother_gt").is_not_null()
+    )
+
+    # Build comprehensive parent filter using when/then logic
+    parent_filter = (
+        pl.when(pl.col("_in_par"))
+        # PAR region: both parents must be reference (autosomal-like) with valid GTs
+        .then(
+            father_qual_ok
+            & father_vaf_ok
+            & father_gt_ok
+            & mother_qual_ok
+            & mother_vaf_ok
+            & mother_gt_ok
+        )
+        .when(pl.col("_chrom_short") == "Y")
+        # Y chromosome: only check father (mother doesn't have Y), father GT must be valid
+        .then(father_qual_ok & father_vaf_ok & father_gt_ok)
+        .when((pl.col("_chrom_short") == "X") & (pl.col("_sex_norm").is_in(["1", "M"])))
+        # X chromosome, male proband: father is hemizygous, only check mother VAF and GT
+        .then(father_qual_ok & mother_qual_ok & mother_vaf_ok & mother_gt_ok)
+        # Default (autosomes or X/female): both parents must be reference with valid GTs
+        .otherwise(
+            father_qual_ok
+            & father_vaf_ok
+            & father_gt_ok
+            & mother_qual_ok
+            & mother_vaf_ok
+            & mother_gt_ok
+        )
+    )
+
+    # Apply parent filter
+    result = df.filter(parent_filter)
+
+    if verbose:
+        click.echo(
+            f"DNM: {result.shape[0]} variants after parent genotype filtering (parent DP>={p_dp}, GQ>={p_gq}, VAF<{p_vaf})",
+            err=True,
+        )
+
+    # Drop temporary columns
+    result = result.drop(["_chrom_short", "_in_par", "_sex_norm"])
+
+    if verbose:
+        click.echo(f"De novo filter: {original} -> {result.shape[0]} rows", err=True)
+
+    return result
+
+
 def parse_impact_filter_expression(expression: str, df: pl.DataFrame) -> pl.Expr:
     """Parse a filter expression string into a Polars expression."""
     # Replace operators with Polars equivalents
```
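The docstring above lists the expected `dnm_config` keys. For orientation, a config consistent with the `.get()` calls and their fallback defaults could look like the following sketch; the frequency cap and the PAR entry are illustrative placeholders, not values taken from this diff:

```python
# Illustrative dnm_config; numeric values mirror the .get() fallbacks in
# apply_de_novo_filter, and the PAR region entry is a made-up example
# (assembly -> region name -> chrom/start/end).
dnm_config = {
    "sample_dp_min": 10,
    "sample_gq_min": 18,
    "sample_vaf_min": 0.15,
    "sample_vaf_hemizygous_min": 0.85,
    "parent_dp_min": 10,
    "parent_gq_min": 18,
    "parent_vaf_max": 0.02,
    "fafmax_faf95_max_genomes_max": 1e-4,  # optional; None disables the check
    "genomes_filters_pass_only": True,     # optional; defaults to False
    "par_regions": {
        "GRCh38": {
            "PAR1": {"chrom": "chrX", "start": 10001, "end": 2781479},
        },
    },
}
```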
```diff
@@ -733,8 +1050,45 @@ def apply_filters_and_write(
     output_prefix: Optional[str],
     output_format: str,
     verbose: bool,
+    pedigree_df: Optional[pl.DataFrame] = None,
 ):
     """Apply filters and write output files."""
+    # If DNM mode is enabled, apply DNM-specific criteria and skip impact/frequency filters
+    if filter_config.get("dnm", {}).get("enabled", False):
+        dnm_cfg = {}
+        # Merge quality & dnm-specific thresholds into a single config for the function
+        dnm_cfg.update(filter_config.get("quality", {}))
+        dnm_cfg.update(filter_config.get("dnm", {}))
+
+        filtered_df = apply_de_novo_filter(
+            df, dnm_cfg, verbose, pedigree_df=pedigree_df
+        )
+
+        # Write result (same behavior as non-impact single-output)
+        if not output_prefix:
+            if output_format != "tsv":
+                click.echo(
+                    "Error: stdout output only supported for TSV format.",
+                    err=True,
+                )
+                raise click.Abort()
+            click.echo(filtered_df.write_csv(separator="\t"), nl=False)
+        else:
+            output_path = Path(f"{output_prefix}.{output_format}")
+
+            if output_format == "tsv":
+                filtered_df.write_csv(output_path, separator="\t")
+            elif output_format == "tsv.gz":
+                csv_content = filtered_df.write_csv(separator="\t")
+                with gzip.open(output_path, "wt") as f:
+                    f.write(csv_content)
+            elif output_format == "parquet":
+                filtered_df.write_parquet(output_path)
+
+            click.echo(f"De novo variants written to {output_path}", err=True)
+
+        return
+
     # Apply quality filters first
     quality_config = filter_config.get("quality", {})
     filtered_df = apply_quality_filters(df, quality_config, verbose)
```
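For context, the DNM branch keys off `filter_config["dnm"]["enabled"]` and merges the `quality` and `dnm` sub-dicts before calling `apply_de_novo_filter`. A minimal sketch of that shape, with hypothetical values:

```python
# Hypothetical top-level filter_config demonstrating the merge performed above.
filter_config = {
    "quality": {"sample_dp_min": 10, "sample_gq_min": 18},
    "dnm": {"enabled": True, "parent_vaf_max": 0.02},
}

dnm_cfg = {}
dnm_cfg.update(filter_config.get("quality", {}))
dnm_cfg.update(filter_config.get("dnm", {}))
# dnm_cfg now holds both quality and DNM thresholds:
# {"sample_dp_min": 10, "sample_gq_min": 18, "enabled": True, "parent_vaf_max": 0.02}
```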
```diff
@@ -832,8 +1186,16 @@ def read_pedigree(pedigree_path: Path) -> pl.DataFrame:
     if "FatherBarcode" in df.columns:
         df = df.rename({"FatherBarcode": "father_id", "MotherBarcode": "mother_id"})
 
-    #
-
+    # Normalize sex column name if present (e.g., 'Sex' or 'sex')
+    sex_col = next((c for c in df.columns if c.lower() == "sex"), None)
+    if sex_col and sex_col != "sex":
+        df = df.rename({sex_col: "sex"})
+
+    # Select only the columns we need (include sex if present)
+    select_cols = ["sample_id", "father_id", "mother_id"]
+    if "sex" in df.columns:
+        select_cols.append("sex")
+    pedigree_df = df.select(select_cols)
 
     # Replace 0 and -9 with null (indicating no parent)
     pedigree_df = pedigree_df.with_columns(
```
```diff
@@ -944,36 +1306,32 @@ def add_parent_genotypes(df: pl.DataFrame, pedigree_df: pl.DataFrame) -> pl.Data
     return df
 
 
-def format_bcftools_tsv(
-    df: pl.DataFrame, pedigree_df: Optional[pl.DataFrame] = None
-) -> pl.DataFrame:
+def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
     """
-
+    Expand the (null) annotation column into separate columns.
+
+    This is a separate step that can be applied after filtering to avoid
+    expensive annotation expansion on variants that will be filtered out.
+
+    Handles two types of INFO fields:
+    - Key-value pairs (e.g., "DP=30") -> extracted as string values
+    - Boolean flags (e.g., "PASS", "DB") -> created as True/False columns
 
     Args:
-        df: Input DataFrame from bcftools
-        pedigree_df: Optional pedigree DataFrame with parent information
+        df: DataFrame with (null) column
 
     Returns:
-
+        DataFrame with expanded annotation columns
     """
     # Find the (null) column
     if "(null)" not in df.columns:
-        raise ValueError("Column '(null)' not found in the input file")
-
-    # Get column index of (null)
-    null_col_idx = df.columns.index("(null)")
-
-    # Split columns into: before (null), (null), and after (null)
-    cols_after = df.columns[null_col_idx + 1 :]
-
-    # Step 1: Expand the (null) column
-    # Split by semicolon and create new columns
+        # Already expanded or missing - return as-is
+        return df
 
-    #
-    # to know what columns to create
+    # Extract all unique field names and flags from the (null) column
     null_values = df.select("(null)").to_series()
     all_fields = set()
+    all_flags = set()
 
     for value in null_values:
         if value and not (isinstance(value, float)):  # Skip null/NaN values
@@ -982,8 +1340,10 @@ def format_bcftools_tsv(
             if "=" in pair:
                 field_name = pair.split("=", 1)[0]
                 all_fields.add(field_name)
+            elif pair.strip():  # Boolean flag (no '=')
+                all_flags.add(pair.strip())
 
-    # Create expressions to extract each field
+    # Create expressions to extract each key-value field
     for field in sorted(all_fields):
         # Extract the field value from the (null) column
         # Pattern: extract value after "field=" and before ";" or end of string
@@ -991,6 +1351,14 @@ def format_bcftools_tsv(
             pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
         )
 
+    # Create boolean columns for flags
+    for flag in sorted(all_flags):
+        # Check if flag appears in the (null) column (as whole word)
+        # Use regex to match flag as a separate field (not part of another field name)
+        df = df.with_columns(
+            pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag)
+        )
+
     # Drop the original (null) column
     df = df.drop("(null)")
 
```
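To make the two expansion paths concrete, here is a toy round-trip using the same `str.extract`/`str.contains` patterns as the code above; the field names are invented for the example:

```python
import polars as pl

df = pl.DataFrame({"(null)": ["DP=30;DB;AF=0.01", "DP=12;AF=0.20"]})

df = df.with_columns(
    [
        pl.col("(null)").str.extract(r"DP=([^;]+)").alias("DP"),    # key-value -> string
        pl.col("(null)").str.extract(r"AF=([^;]+)").alias("AF"),
        pl.col("(null)").str.contains(r"(^|;)DB(;|$)").alias("DB"),  # bare flag -> bool
    ]
)
# Row 1: DP="30", AF="0.01", DB=True
# Row 2: DP="12", AF="0.20", DB=False
```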
```diff
@@ -998,9 +1366,42 @@ def format_bcftools_tsv(
     if "CSQ" in df.columns:
         df = df.drop("CSQ")
 
-
-
-
+    return df
+
+
+def format_bcftools_tsv_minimal(
+    df: pl.DataFrame, pedigree_df: Optional[pl.DataFrame] = None
+) -> pl.DataFrame:
+    """
+    Format a bcftools tabulated TSV with minimal processing.
+
+    This version SKIPS expanding the (null) annotation field and only:
+    - Melts sample columns into rows
+    - Extracts GT:DP:GQ:AD:VAF from sample values
+    - Optionally adds parent genotypes if pedigree provided
+
+    Use this for filtering workflows where you want to apply filters BEFORE
+    expensive annotation expansion. Call format_expand_annotations() afterwards
+    on filtered results.
+
+    Args:
+        df: Input DataFrame from bcftools
+        pedigree_df: Optional pedigree DataFrame with parent information
+
+    Returns:
+        Formatted DataFrame with melted samples (annotations still in (null) column)
+    """
+    # Find the (null) column
+    if "(null)" not in df.columns:
+        raise ValueError("Column '(null)' not found in the input file")
+
+    # Get column index of (null)
+    null_col_idx = df.columns.index("(null)")
+
+    # Split columns into: before (null), (null), and after (null)
+    cols_after = df.columns[null_col_idx + 1 :]
+
+    # Step 1: Identify sample columns (SKIP annotation expansion)
     sample_cols = []
     sample_names = []
 
```
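The melt step itself falls outside this hunk, but a rough sketch of what "melting sample columns" means here may help; the column names and the GT:DP:GQ:AD:VAF layout follow the docstring, and the actual implementation may differ:

```python
import polars as pl

wide = pl.DataFrame(
    {
        "#CHROM": ["chr1"],
        "POS": [12345],
        "SAMPLE_A": ["0/1:30:99:15,15:0.5"],
        "SAMPLE_B": ["0/0:28:99:28,0:0.0"],
    }
)

# One row per (variant, sample); then split the colon-separated value.
long = wide.melt(
    id_vars=["#CHROM", "POS"], variable_name="sample", value_name="value"
).with_columns(
    [
        pl.col("value").str.split(":").list.get(0).alias("sample_gt"),
        pl.col("value").str.split(":").list.get(1).alias("sample_dp"),
    ]
)
```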
```diff
@@ -1019,10 +1420,10 @@ def format_bcftools_tsv(
         sample_names.append(col)
 
     if not sample_cols:
-        # No sample columns to melt
+        # No sample columns to melt
         return df
 
-    # Step
+    # Step 2: Melt the sample columns
     # Keep all columns except sample columns as id_vars
     id_vars = [col for col in df.columns if col not in sample_cols]
 
@@ -1105,6 +1506,37 @@ def format_bcftools_tsv(
     return melted_df
 
 
+def format_bcftools_tsv(
+    df: pl.DataFrame, pedigree_df: Optional[pl.DataFrame] = None
+) -> pl.DataFrame:
+    """
+    Format a bcftools tabulated TSV DataFrame (full processing).
+
+    This is the complete formatting that:
+    1. Melts samples and extracts GT:DP:GQ:AD:VAF
+    2. Expands (null) annotation field into separate columns
+    3. Adds parent genotypes if pedigree provided
+
+    For DNM filtering workflows, consider using format_bcftools_tsv_minimal()
+    + apply_de_novo_filter() + format_expand_annotations() to avoid expanding
+    annotations on variants that will be filtered out.
+
+    Args:
+        df: Input DataFrame from bcftools
+        pedigree_df: Optional pedigree DataFrame with parent information
+
+    Returns:
+        Formatted DataFrame with expanded fields and melted samples
+    """
+    # First do minimal formatting (melt + sample columns)
+    melted_df = format_bcftools_tsv_minimal(df, pedigree_df)
+
+    # Then expand annotations
+    expanded_df = format_expand_annotations(melted_df)
+
+    return expanded_df
+
+
 def format_bcftools_tsv_lazy(
     lazy_df: pl.LazyFrame, pedigree_df: Optional[pl.DataFrame] = None
 ) -> pl.LazyFrame:
```
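Putting the docstring's recommendation into a sketch, the filter-first DNM workflow composes the three functions from this diff; the inputs `raw_df`, `ped`, and `cfg` are assumed:

```python
# Sketch of the filter-first workflow recommended above.
from pywombat.cli import (
    apply_de_novo_filter,
    format_bcftools_tsv_minimal,
    format_expand_annotations,
)

melted = format_bcftools_tsv_minimal(raw_df, pedigree_df=ped)  # melt samples only
dnm = apply_de_novo_filter(melted, cfg, pedigree_df=ped)       # filter cheap columns
result = format_expand_annotations(dnm)                        # expand survivors only
```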
```diff
@@ -1120,10 +1552,455 @@ def format_bcftools_tsv_lazy(
     return formatted_df.lazy()
 
 
+# ------------------ Chunked two-pass processing with progress ------------------
+import io
+import math
+
+
+def _open_file_lines(path: Path):
+    """Yield header line and an iterator over the remaining lines (text)."""
+    if str(path).endswith(".gz"):
+        import gzip as _gzip
+
+        f = _gzip.open(path, "rt")
+    else:
+        f = open(path, "rt")
+
+    try:
+        header = f.readline()
+        for line in f:
+            yield header, line
+    finally:
+        f.close()
+
+
+def _line_iterator(path: Path, chunk_size: int = 50000):
+    """Yield (header, chunk_lines) tuples where chunk_lines is a list of lines."""
+    if str(path).endswith(".gz"):
+        import gzip as _gzip
+
+        f = _gzip.open(path, "rt")
+    else:
+        f = open(path, "rt")
+
+    try:
+        header = f.readline()
+        while True:
+            chunk = []
+            for _ in range(chunk_size):
+                line = f.readline()
+                if not line:
+                    break
+                chunk.append(line)
+            if not chunk:
+                break
+            yield header, chunk
+    finally:
+        f.close()
+
+
+def build_parent_lookup_from_file(
+    path: Path,
+    pedigree_df: Optional[pl.DataFrame] = None,
+    progress_bar=None,
+    verbose: bool = False,
+    chunk_size: int = 50000,
+):
+    """First pass: build a minimal parent lookup DataFrame with per-sample genotypes.
+
+    Returns a tuple (lookup_df, total_lines) where total_lines is the approximate number
+    of data lines processed (excluding header). If a `progress_bar` is provided it will
+    be updated as we process chunks.
+    """
+    parts = []
+
+    schema_overrides = {
+        col: pl.Utf8
+        for col in [
+            "FID",
+            "sample_id",
+            "father_id",
+            "mother_id",
+            "FatherBarcode",
+            "MotherBarcode",
+            "sample",
+        ]
+    }
+
+    processed = 0
+    chunk_idx = 0
+    for header, chunk in _line_iterator(path, chunk_size=chunk_size):
+        chunk_idx += 1
+        content = header + "".join(chunk)
+        try:
+            df_chunk = pl.read_csv(
+                io.StringIO(content), separator="\t", schema_overrides=schema_overrides
+            )
+        except Exception:
+            # Skip unparsable chunk
+            processed += len(chunk)
+            if progress_bar is not None:
+                progress_bar.update(len(chunk))
+            elif verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Building lookup: processed ~{processed} lines...", err=True
+                )
+            continue
+
+        try:
+            # Use minimal format for lookup building (skip annotation expansion)
+            formatted = format_bcftools_tsv_minimal(df_chunk, pedigree_df=None)
+        except Exception:
+            # If chunk cannot be parsed into variants, skip
+            processed += len(chunk)
+            if progress_bar is not None:
+                progress_bar.update(len(chunk))
+            elif verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Building lookup: processed ~{processed} lines...", err=True
+                )
+            continue
+
+        cols = [
+            "#CHROM",
+            "POS",
+            "REF",
+            "ALT",
+            "sample",
+            "sample_gt",
+            "sample_dp",
+            "sample_gq",
+            "sample_ad",
+            "sample_vaf",
+        ]
+        sel = [c for c in cols if c in formatted.columns]
+        # Optimization: Only store non-reference genotypes in lookup (skip 0/0)
+        part = formatted.select(sel).filter(pl.col("sample_gt") != "0/0").unique()
+        parts.append(part)
+
+        # Update progress
+        processed += len(chunk)
+        if progress_bar is not None:
+            progress_bar.update(len(chunk))
+        elif verbose and (chunk_idx % 10 == 0):
+            click.echo(f"Building lookup: processed ~{processed} lines...", err=True)
+
+    if parts:
+        lookup = pl.concat(parts).unique()
+    else:
+        lookup = pl.DataFrame([])
+
+    # processed currently counts number of data lines seen
+    return lookup, processed
+
+
+def add_parent_genotypes_from_lookup(
+    df: pl.DataFrame, parent_lookup: pl.DataFrame
+) -> pl.DataFrame:
+    """Join father/mother genotype info from the parent_lookup into df.
+
+    Assumes df has columns: #CHROM, POS, REF, ALT, father_id, mother_id
+    """
+    join_cols = [c for c in ["#CHROM", "POS", "REF", "ALT"] if c in df.columns]
+
+    if parent_lookup.is_empty():
+        # Create empty parent columns
+        return df
+
+    # Prepare father lookup
+    father_lookup = parent_lookup.rename(
+        {
+            "sample": "father",
+            "sample_gt": "father_gt",
+            "sample_dp": "father_dp",
+            "sample_gq": "father_gq",
+            "sample_ad": "father_ad",
+            "sample_vaf": "father_vaf",
+        }
+    )
+
+    # Left join on join_cols + ['father']
+    if "father_id" in df.columns:
+        df = df.join(
+            father_lookup,
+            left_on=join_cols + ["father_id"],
+            right_on=join_cols + ["father"],
+            how="left",
+        )
+    else:
+        # No father id, attempt join on 'father' column
+        df = df.join(
+            father_lookup,
+            on=join_cols + ["father"],
+            how="left",
+        )
+
+    # Prepare mother lookup
+    mother_lookup = parent_lookup.rename(
+        {
+            "sample": "mother",
+            "sample_gt": "mother_gt",
+            "sample_dp": "mother_dp",
+            "sample_gq": "mother_gq",
+            "sample_ad": "mother_ad",
+            "sample_vaf": "mother_vaf",
+        }
+    )
+
+    if "mother_id" in df.columns:
+        df = df.join(
+            mother_lookup,
+            left_on=join_cols + ["mother_id"],
+            right_on=join_cols + ["mother"],
+            how="left",
+        )
+    else:
+        df = df.join(
+            mother_lookup,
+            on=join_cols + ["mother"],
+            how="left",
+        )
+
+    # Normalize '.' to '0' for DP/GQ like previous function
+    df = df.with_columns(
+        [
+            pl.when(pl.col("father_dp") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("father_dp"))
+            .alias("father_dp"),
+            pl.when(pl.col("father_gq") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("father_gq"))
+            .alias("father_gq"),
+            pl.when(pl.col("mother_dp") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("mother_dp"))
+            .alias("mother_dp"),
+            pl.when(pl.col("mother_gq") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("mother_gq"))
+            .alias("mother_gq"),
+        ]
+    )
+
+    return df
+
+
+def process_with_progress(
+    input_path: Path,
+    output_prefix: str,
+    output_format: str,
+    pedigree_df: Optional[pl.DataFrame],
+    filter_config: Optional[dict],
+    verbose: bool,
+    chunk_size: int = 50000,
+):
+    """Process input in two passes and show a progress bar.
+
+    Pass 1: build parent lookup
+    Pass 2: process chunks, join parent genotypes, apply filters, and write incrementally
+    """
+    # tqdm optional
+    try:
+        from tqdm.auto import tqdm
+    except Exception:
+        tqdm = None
+
+    # Build parent lookup in a single pass (counts lines while building lookup)
+    if verbose:
+        click.echo("Pass 1: building parent genotype lookup (single pass)...", err=True)
+
+    pbar_lookup = None
+    if tqdm is not None:
+        # No known total yet; tqdm will show progress increasing
+        pbar_lookup = tqdm(desc="Building parent lookup", unit="lines")
+
+    parent_lookup, total_lines = build_parent_lookup_from_file(
+        input_path,
+        pedigree_df,
+        progress_bar=pbar_lookup,
+        verbose=verbose,
+        chunk_size=chunk_size,
+    )
+
+    if pbar_lookup is not None:
+        pbar_lookup.close()
+
+    if verbose:
+        click.echo(
+            f"Parent lookup contains {parent_lookup.shape[0]} genotype entries (from ~{total_lines} lines)",
+            err=True,
+        )
+
+    total_chunks = math.ceil(total_lines / chunk_size) if chunk_size > 0 else 1
+
+    # Prepare output paths
+    if output_format == "tsv":
+        out_path = Path(f"{output_prefix}.tsv")
+    elif output_format == "tsv.gz":
+        out_path = Path(f"{output_prefix}.tsv.gz")
+    else:
+        # We'll write parquet parts
+        out_path = Path(f"{output_prefix}")
+
+    first_write = True
+
+    # Iterate chunks and process
+    iterator = _line_iterator(input_path, chunk_size=chunk_size)
+    chunk_idx = 0
+
+    progress_bar = None
+    processed_lines = 0
+    if tqdm is not None:
+        progress_bar = tqdm(total=total_lines, desc="Processing variants")
+
+    for header, chunk in iterator:
+        chunk_idx += 1
+        chunk_count = len(chunk)
+
+        # Update progress at start of chunk to show we're working
+        if progress_bar is not None:
+            progress_bar.set_postfix_str(f"chunk {chunk_idx}")
+
+        content = header + "".join(chunk)
+        df_chunk = pl.read_csv(io.StringIO(content), separator="\t")
+
+        # Use MINIMAL format (skip annotation expansion for now)
+        try:
+            melted = format_bcftools_tsv_minimal(df_chunk, pedigree_df=None)
+        except Exception:
+            # If parse fails for chunk, skip
+            processed_lines += chunk_count
+            if progress_bar is not None:
+                progress_bar.update(chunk_count)
+            elif verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Processed {processed_lines}/{total_lines} lines...", err=True
+                )
+            continue
+
+        # Optimization #1: Early GT filtering for DNM mode - skip reference-only variants
+        if filter_config and filter_config.get("dnm", {}).get("enabled", False):
+            melted = melted.filter(
+                pl.col("sample_gt").str.contains("1")
+                | pl.col("sample_gt").str.contains("2")
+            )
+            if melted.shape[0] == 0:
+                # All variants filtered out, skip to next chunk
+                processed_lines += chunk_count
+                if progress_bar is not None:
+                    progress_bar.update(chunk_count)
+                continue
+
+        # Attach parent ids from pedigree
+        if pedigree_df is not None and "sample" in melted.columns:
+            melted = melted.join(
+                pedigree_df, left_on="sample", right_on="sample_id", how="left"
+            )
+
+        # Add parent genotypes from global lookup
+        melted = add_parent_genotypes_from_lookup(melted, parent_lookup)
+
+        # Apply filters BEFORE expanding annotations (key optimization)
+        if filter_config and filter_config.get("dnm", {}).get("enabled", False):
+            cfg = {}
+            cfg.update(filter_config.get("quality", {}))
+            cfg.update(filter_config.get("dnm", {}))
+            filtered = apply_de_novo_filter(
+                melted, cfg, verbose=False, pedigree_df=pedigree_df
+            )
+        else:
+            # Apply standard quality filters
+            filtered = melted
+            quality_cfg = filter_config.get("quality", {}) if filter_config else {}
+            if quality_cfg:
+                filtered = apply_quality_filters(filtered, quality_cfg, verbose=False)
+
+        # Apply expression filter if present
+        expr = filter_config.get("expression") if filter_config else None
+        if expr:
+            try:
+                expr_parsed = parse_impact_filter_expression(expr, filtered)
+                filtered = filtered.filter(expr_parsed)
+            except Exception:
+                # If expression parsing fails on a chunk, skip applying it
+                pass
+
+        # NOW expand annotations only for variants that passed filters
+        if filtered.shape[0] > 0 and "(null)" in filtered.columns:
+            if progress_bar is not None:
+                progress_bar.set_postfix_str(f"expanding annotations chunk {chunk_idx}")
+            filtered = format_expand_annotations(filtered)
+
+        # Update progress after filtering (before write)
+        if progress_bar is not None:
+            progress_bar.set_postfix_str(f"writing chunk {chunk_idx}")
+
+        # Write filtered chunk to file (skip if empty)
+        if filtered.shape[0] > 0:
+            if output_format in ("tsv", "tsv.gz"):
+                csv_text = filtered.write_csv(separator="\t")
+                # First write includes header; subsequent writes skip header
+                if first_write:
+                    write_text = csv_text
+                    first_write = False
+                    if output_format == "tsv.gz":
+                        with gzip.open(out_path, "wt") as f:
+                            f.write(write_text)
+                    else:
+                        with open(out_path, "wt") as f:
+                            f.write(write_text)
+                else:
+                    # Skip header
+                    tail = "\n".join(csv_text.splitlines()[1:])
+                    if output_format == "tsv.gz":
+                        with gzip.open(out_path, "at") as f:
+                            f.write("\n" + tail)
+                    else:
+                        with open(out_path, "at") as f:
+                            f.write("\n" + tail)
+            else:
+                # Parquet: write part file
+                part_path = out_path.with_suffix(f".part{chunk_idx}.parquet")
+                filtered.write_parquet(part_path)
+
+        if progress_bar is not None:
+            progress_bar.update(chunk_count)
+            progress_bar.set_postfix_str("")  # Clear status
+        else:
+            processed_lines += chunk_count
+            if verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Processed {processed_lines}/{total_lines} lines...", err=True
+                )
+
+    if progress_bar is not None:
+        progress_bar.close()
+
+    if verbose:
+        click.echo("Processing complete.", err=True)
+
+
 def apply_filters_lazy(
-    lazy_df: pl.LazyFrame, filter_config: dict, verbose: bool = False
+    lazy_df: pl.LazyFrame,
+    filter_config: dict,
+    verbose: bool = False,
+    pedigree_df: Optional[pl.DataFrame] = None,
 ) -> pl.LazyFrame:
     """Apply quality and expression filters using lazy operations."""
+    # If DNM mode is enabled, we need to collect and apply DNM logic
+    if filter_config.get("dnm", {}).get("enabled", False):
+        dnm_cfg = {}
+        dnm_cfg.update(filter_config.get("quality", {}))
+        dnm_cfg.update(filter_config.get("dnm", {}))
+
+        # Collect minimally and apply DNM filter eagerly, then return lazy frame
+        df = lazy_df.collect(streaming=True)
+        filtered_df = apply_de_novo_filter(
+            df, dnm_cfg, verbose, pedigree_df=pedigree_df
+        )
+        return filtered_df.lazy()
+
     quality_config = filter_config.get("quality", {})
     expression = filter_config.get("expression")
 
```
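Finally, a hypothetical invocation of the new two-pass driver added above; the argument values are illustrative:

```python
from pathlib import Path

from pywombat.cli import process_with_progress

process_with_progress(
    input_path=Path("trio.bcftools.tsv.gz"),
    output_prefix="trio_dnm",
    output_format="tsv.gz",
    pedigree_df=pedigree_df,
    filter_config={"quality": {}, "dnm": {"enabled": True}},
    verbose=True,
    chunk_size=50000,
)
```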