pywombat 0.4.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pywombat/cli.py CHANGED
@@ -122,19 +122,34 @@ def cli(
     else:
         output = input_stem

-    # Use streaming approach with lazy API
+    # Process using streaming mode
     if verbose:
         click.echo("Processing with streaming mode...", err=True)

     # Build lazy query
-    lazy_df = pl.scan_csv(input_file, separator="\t")
+    # Force certain columns to string type
+    string_columns = [
+        "FID",
+        "sample_id",
+        "father_id",
+        "mother_id",
+        "FatherBarcode",
+        "MotherBarcode",
+        "sample",
+    ]
+    schema_overrides = {col: pl.Utf8 for col in string_columns}
+    lazy_df = pl.scan_csv(
+        input_file, separator="\t", schema_overrides=schema_overrides
+    )

     # Apply formatting transformations
     lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)

     # Apply filters if provided
     if filter_config_data:
-        lazy_df = apply_filters_lazy(lazy_df, filter_config_data, verbose)
+        lazy_df = apply_filters_lazy(
+            lazy_df, filter_config_data, verbose, pedigree_df
+        )

     # Write output
     output_path = Path(f"{output}.{output_format}")
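
The schema override guards against Polars' dtype inference: IDs that happen to be purely numeric would otherwise be read as integers, mangling values like "000789" and breaking string joins against the pedigree. A minimal sketch of the failure mode, with hypothetical data and assuming a recent Polars where the keyword is `schema_overrides` (older releases spell it `dtypes`):

    import io
    import polars as pl

    tsv = "sample_id\tfather_id\nS007\t0\n1234\t000789\n"

    # Inferred: father_id becomes an integer column, so "000789" -> 789
    inferred = pl.read_csv(io.StringIO(tsv), separator="\t")

    # Forced: IDs stay strings and pedigree joins remain exact
    forced = pl.read_csv(
        io.StringIO(tsv),
        separator="\t",
        schema_overrides={"sample_id": pl.Utf8, "father_id": pl.Utf8},
    )
    print(inferred.schema, forced.schema)
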
@@ -185,7 +200,18 @@ def debug_variant(
     click.echo(f"Debug mode: searching for {chrom}:{pos}", err=True)

     # Read and format the data
-    df = pl.read_csv(input_file, separator="\t")
+    # Force certain columns to string type
+    string_columns = [
+        "FID",
+        "sample_id",
+        "father_id",
+        "mother_id",
+        "FatherBarcode",
+        "MotherBarcode",
+        "sample",
+    ]
+    schema_overrides = {col: pl.Utf8 for col in string_columns}
+    df = pl.read_csv(input_file, separator="\t", schema_overrides=schema_overrides)
     formatted_df = format_bcftools_tsv(df, pedigree_df)

     # Filter to matching rows
@@ -439,6 +465,321 @@ def apply_quality_filters(
     return df


+# ------------------ De novo (DNM) filter helpers ------------------
+
+
+def _chrom_short(chrom: str) -> str:
+    """Normalize chromosome name to short form (e.g., 'chrX' -> 'X')."""
+    if chrom is None:
+        return ""
+    chrom = str(chrom)
+    return chrom[3:] if chrom.lower().startswith("chr") else chrom
+
+
+def _pos_in_par(chrom: str, pos: int, par_regions: dict) -> bool:
+    """Return True if (chrom,pos) falls in any PAR region from config.
+
+    Normalizes chromosome names to match both 'X'/'chrX' formats.
+    """
+    if not par_regions:
+        return False
+
+    chrom_short = _chrom_short(chrom)
+
+    for assembly, regions in par_regions.items():
+        for region_name, region in regions.items():
+            region_chrom = _chrom_short(region.get("chrom", "X"))
+            start = int(region.get("start"))
+            end = int(region.get("end"))
+            # Normalize both to uppercase for comparison
+            if region_chrom.upper() == chrom_short.upper() and start <= pos <= end:
+                return True
+    return False
+
+
+def apply_de_novo_filter(
+    df: pl.DataFrame,
+    dnm_config: dict,
+    verbose: bool = False,
+    pedigree_df: Optional[pl.DataFrame] = None,
+) -> pl.DataFrame:
+    """Apply de novo detection filters to dataframe using vectorized operations.
+
+    dnm_config expected keys:
+      - sample_dp_min, sample_gq_min, sample_vaf_min
+      - parent_dp_min, parent_gq_min, parent_vaf_max
+      - par_regions: dict with PAR regions keyed by assembly
+
+    This function will read `sex` from `df` when present; otherwise it will use
+    the `pedigree_df` (which should contain `sample_id` and `sex`).
+    """
+    if not dnm_config:
+        return df
+
+    # Required thresholds
+    s_dp = dnm_config.get("sample_dp_min", 10)
+    s_gq = dnm_config.get("sample_gq_min", 18)
+    s_vaf = dnm_config.get("sample_vaf_min", 0.15)
+    s_vaf_hemizygous = dnm_config.get("sample_vaf_hemizygous_min", 0.85)
+
+    p_dp = dnm_config.get("parent_dp_min", 10)
+    p_gq = dnm_config.get("parent_gq_min", 18)
+    p_vaf = dnm_config.get("parent_vaf_max", 0.02)
+
+    fafmax_max = dnm_config.get("fafmax_faf95_max_genomes_max", None)
+    genomes_filters_pass_only = dnm_config.get("genomes_filters_pass_only", False)
+
+    par_regions = dnm_config.get("par_regions", {})
+
+    original = df.shape[0]
+
+    # Ensure we have parent identifiers (father/mother). Try to add from pedigree if missing.
+    if "father_id" not in df.columns or "mother_id" not in df.columns:
+        if pedigree_df is not None:
+            # Join pedigree to get father/mother/sample sex if needed
+            df = df.join(
+                pedigree_df, left_on="sample", right_on="sample_id", how="left"
+            )
+        else:
+            raise click.Abort(
+                "DNM filtering requires a pedigree (with father/mother IDs and sex)."
+            )
+
+    if verbose:
+        click.echo(
+            f"DNM: Starting with {df.shape[0]} variants after pedigree join", err=True
+        )
+
+    # Optimization #6: Early exit for variants with missing parents
+    df = df.filter(
+        pl.col("father_id").is_not_null() & pl.col("mother_id").is_not_null()
+    )
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after filtering missing parents", err=True
+        )
+
+    if df.shape[0] == 0:
+        if verbose:
+            click.echo(
+                f"De novo filter: {original} -> 0 rows (all missing parents)", err=True
+            )
+        return df
+
+    # Ensure we have sex information
+    if "sex" not in df.columns:
+        if pedigree_df is not None and "sex" in pedigree_df.columns:
+            # Re-join to get sex if not already present
+            if "sex" not in df.columns:
+                df = df.join(
+                    pedigree_df.select(["sample_id", "sex"]),
+                    left_on="sample",
+                    right_on="sample_id",
+                    how="left",
+                )
+        else:
+            raise click.Abort(
+                "DNM filtering requires sex information in the pedigree (column 'sex')."
+            )
+
+    # Filter out variants with missing/partial genotypes (containing '.')
+    # Sample genotype must be fully called (no '.', './0', '1/.', './.' etc.)
+    df = df.filter(
+        ~pl.col("sample_gt").str.contains(r"\.") & pl.col("sample_gt").is_not_null()
+    )
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after removing samples with missing/partial GT",
+            err=True,
+        )
+
+    # Ensure proband passes basic sample thresholds (DP, GQ)
+    # VAF threshold will be applied differently for hemizygous vs diploid variants
+    df = df.filter(
+        (pl.col("sample_dp").cast(pl.Float64, strict=False) >= s_dp)
+        & (pl.col("sample_gq").cast(pl.Float64, strict=False) >= s_gq)
+    )
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after proband QC (DP>={s_dp}, GQ>={s_gq})",
+            err=True,
+        )
+
+    if df.shape[0] == 0:
+        if verbose:
+            click.echo(
+                f"De novo filter: {original} -> 0 rows (sample QC failed)", err=True
+            )
+        return df
+
+    # Optimization #7: Vectorized parent filtering using when/then expressions
+    # Build chromosome-specific parent filters
+
+    # Ensure #CHROM is string type for operations (convert from categorical if needed)
+    if "#CHROM" in df.columns and df.schema["#CHROM"] == pl.Categorical:
+        df = df.with_columns(pl.col("#CHROM").cast(pl.Utf8))
+
+    # Normalize chromosome to short form for comparison
+    df = df.with_columns(
+        pl.col("#CHROM")
+        .str.replace("^chr", "")
+        .str.to_uppercase()
+        .alias("_chrom_short")
+    )
+
+    # Determine if variant is in PAR region (vectorized)
+    par_mask = pl.lit(False)
+    if par_regions:
+        for assembly, regions in par_regions.items():
+            for region_name, region in regions.items():
+                region_chrom = _chrom_short(region.get("chrom", "X")).upper()
+                start = int(region.get("start"))
+                end = int(region.get("end"))
+                par_mask = par_mask | (
+                    (pl.col("_chrom_short") == region_chrom)
+                    & (pl.col("POS") >= start)
+                    & (pl.col("POS") <= end)
+                )
+
+    df = df.with_columns(par_mask.alias("_in_par"))
+
+    # Normalize sex to uppercase string for comparison
+    df = df.with_columns(
+        pl.col("sex").cast(pl.Utf8).str.to_uppercase().alias("_sex_norm")
+    )
+
+    # Determine if variant is hemizygous (X in males outside PAR, or Y in males)
+    is_hemizygous = (
+        (pl.col("_chrom_short") == "X")
+        & (pl.col("_sex_norm").is_in(["1", "M"]))
+        & ~pl.col("_in_par")
+    ) | (pl.col("_chrom_short") == "Y")
+
+    # Determine if variant is homozygous (1/1, 2/2, etc.)
+    is_homozygous = pl.col("sample_gt").is_in(["1/1", "2/2", "3/3"])
+
+    # Apply VAF threshold: hemizygous and homozygous variants require higher VAF (>=0.85)
+    sample_vaf_filter = (
+        pl.when(is_hemizygous | is_homozygous)
+        .then(pl.col("sample_vaf") >= s_vaf_hemizygous)
+        .otherwise(pl.col("sample_vaf") > s_vaf)
+    )
+
+    df = df.filter(sample_vaf_filter)
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after VAF filtering (het>{s_vaf}, hom/hemizygous>={s_vaf_hemizygous})",
+            err=True,
+        )
+
+    # Apply fafmax_faf95_max_genomes filter if specified
+    if fafmax_max is not None:
+        if "fafmax_faf95_max_genomes" in df.columns:
+            df = df.filter(
+                (
+                    pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False)
+                    <= fafmax_max
+                )
+                | pl.col("fafmax_faf95_max_genomes").is_null()
+            )
+            if verbose:
+                click.echo(
+                    f"DNM: {df.shape[0]} variants after fafmax_faf95_max_genomes filter (<={fafmax_max})",
+                    err=True,
+                )
+        elif verbose:
+            click.echo(
+                "DNM: Warning - fafmax_faf95_max_genomes column not found, skipping frequency filter",
+                err=True,
+            )
+
+    # Apply genomes_filters filter if specified
+    if genomes_filters_pass_only:
+        if "genomes_filters" in df.columns:
+            df = df.filter(
+                (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
+            )
+            if verbose:
+                click.echo(
+                    f"DNM: {df.shape[0]} variants after genomes_filters filter (pass only)",
+                    err=True,
+                )
+        elif verbose:
+            click.echo(
+                "DNM: Warning - genomes_filters column not found, skipping genomes_filters filter",
+                err=True,
+            )
+
+    # Build parent quality checks (common to all)
+    father_qual_ok = (pl.col("father_dp").cast(pl.Float64, strict=False) >= p_dp) & (
+        pl.col("father_gq").cast(pl.Float64, strict=False) >= p_gq
+    )
+    mother_qual_ok = (pl.col("mother_dp").cast(pl.Float64, strict=False) >= p_dp) & (
+        pl.col("mother_gq").cast(pl.Float64, strict=False) >= p_gq
+    )
+
+    father_vaf_ok = pl.col("father_vaf").is_null() | (pl.col("father_vaf") < p_vaf)
+    mother_vaf_ok = pl.col("mother_vaf").is_null() | (pl.col("mother_vaf") < p_vaf)
+
+    # Parent genotype checks: ensure no '.' in genotypes (no ./., 0/., 1/. etc.)
+    father_gt_ok = (
+        ~pl.col("father_gt").str.contains(r"\.") & pl.col("father_gt").is_not_null()
+    )
+    mother_gt_ok = (
+        ~pl.col("mother_gt").str.contains(r"\.") & pl.col("mother_gt").is_not_null()
+    )
+
+    # Build comprehensive parent filter using when/then logic
+    parent_filter = (
+        pl.when(pl.col("_in_par"))
+        # PAR region: both parents must be reference (autosomal-like) with valid GTs
+        .then(
+            father_qual_ok
+            & father_vaf_ok
+            & father_gt_ok
+            & mother_qual_ok
+            & mother_vaf_ok
+            & mother_gt_ok
+        )
+        .when(pl.col("_chrom_short") == "Y")
+        # Y chromosome: only check father (mother doesn't have Y), father GT must be valid
+        .then(father_qual_ok & father_vaf_ok & father_gt_ok)
+        .when((pl.col("_chrom_short") == "X") & (pl.col("_sex_norm").is_in(["1", "M"])))
+        # X chromosome, male proband: father is hemizygous, only check mother VAF and GT
+        .then(father_qual_ok & mother_qual_ok & mother_vaf_ok & mother_gt_ok)
+        # Default (autosomes or X/female): both parents must be reference with valid GTs
+        .otherwise(
+            father_qual_ok
+            & father_vaf_ok
+            & father_gt_ok
+            & mother_qual_ok
+            & mother_vaf_ok
+            & mother_gt_ok
+        )
+    )
+
+    # Apply parent filter
+    result = df.filter(parent_filter)
+
+    if verbose:
+        click.echo(
+            f"DNM: {result.shape[0]} variants after parent genotype filtering (parent DP>={p_dp}, GQ>={p_gq}, VAF<{p_vaf})",
+            err=True,
+        )
+
+    # Drop temporary columns
+    result = result.drop(["_chrom_short", "_in_par", "_sex_norm"])
+
+    if verbose:
+        click.echo(f"De novo filter: {original} -> {result.shape[0]} rows", err=True)
+
+    return result
+
+
 def parse_impact_filter_expression(expression: str, df: pl.DataFrame) -> pl.Expr:
     """Parse a filter expression string into a Polars expression."""
     # Replace operators with Polars equivalents
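
For reference, a config dict shaped the way `apply_de_novo_filter()` and `_pos_in_par()` consume it. The threshold values mirror the defaults read above; the `par_regions` nesting is assembly -> region name -> chrom/start/end, and the GRCh38 coordinates shown are illustrative, not taken from this package:

    # Hypothetical dnm_config; keys match the .get() calls above.
    dnm_config = {
        "sample_dp_min": 10,
        "sample_gq_min": 18,
        "sample_vaf_min": 0.15,
        "sample_vaf_hemizygous_min": 0.85,
        "parent_dp_min": 10,
        "parent_gq_min": 18,
        "parent_vaf_max": 0.02,
        "fafmax_faf95_max_genomes_max": 0.0001,  # hypothetical value
        "genomes_filters_pass_only": True,
        "par_regions": {
            "GRCh38": {
                "PAR1": {"chrom": "chrX", "start": 10001, "end": 2781479},
                "PAR2": {"chrom": "chrX", "start": 155701383, "end": 156030895},
            }
        },
    }
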
@@ -709,8 +1050,45 @@ def apply_filters_and_write(
     output_prefix: Optional[str],
     output_format: str,
     verbose: bool,
+    pedigree_df: Optional[pl.DataFrame] = None,
 ):
     """Apply filters and write output files."""
+    # If DNM mode is enabled, apply DNM-specific criteria and skip impact/frequency filters
+    if filter_config.get("dnm", {}).get("enabled", False):
+        dnm_cfg = {}
+        # Merge quality & dnm-specific thresholds into a single config for the function
+        dnm_cfg.update(filter_config.get("quality", {}))
+        dnm_cfg.update(filter_config.get("dnm", {}))
+
+        filtered_df = apply_de_novo_filter(
+            df, dnm_cfg, verbose, pedigree_df=pedigree_df
+        )
+
+        # Write result (same behavior as non-impact single-output)
+        if not output_prefix:
+            if output_format != "tsv":
+                click.echo(
+                    "Error: stdout output only supported for TSV format.",
+                    err=True,
+                )
+                raise click.Abort()
+            click.echo(filtered_df.write_csv(separator="\t"), nl=False)
+        else:
+            output_path = Path(f"{output_prefix}.{output_format}")
+
+            if output_format == "tsv":
+                filtered_df.write_csv(output_path, separator="\t")
+            elif output_format == "tsv.gz":
+                csv_content = filtered_df.write_csv(separator="\t")
+                with gzip.open(output_path, "wt") as f:
+                    f.write(csv_content)
+            elif output_format == "parquet":
+                filtered_df.write_parquet(output_path)
+
+            click.echo(f"De novo variants written to {output_path}", err=True)
+
+        return
+
     # Apply quality filters first
     quality_config = filter_config.get("quality", {})
     filtered_df = apply_quality_filters(df, quality_config, verbose)
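
One subtlety in the merge above: because the `quality` section is applied first and `dnm` second, any threshold present in both sections is won by the `dnm` value. A short sketch with hypothetical numbers:

    filter_config = {
        "quality": {"sample_dp_min": 10, "sample_gq_min": 18},
        "dnm": {"enabled": True, "sample_dp_min": 15, "parent_vaf_max": 0.02},
    }

    dnm_cfg = {}
    dnm_cfg.update(filter_config.get("quality", {}))
    dnm_cfg.update(filter_config.get("dnm", {}))

    assert dnm_cfg["sample_dp_min"] == 15  # dnm section overrides quality
    assert dnm_cfg["sample_gq_min"] == 18  # quality-only keys survive
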
@@ -771,7 +1149,18 @@ def read_pedigree(pedigree_path: Path) -> pl.DataFrame:
         DataFrame with columns: sample_id, father_id, mother_id
     """
     # Try reading with header first
-    df = pl.read_csv(pedigree_path, separator="\t")
+    # Force certain columns to string type
+    string_columns = [
+        "FID",
+        "sample_id",
+        "father_id",
+        "mother_id",
+        "FatherBarcode",
+        "MotherBarcode",
+        "sample",
+    ]
+    schema_overrides = {col: pl.Utf8 for col in string_columns}
+    df = pl.read_csv(pedigree_path, separator="\t", schema_overrides=schema_overrides)

     # Check if first row has 'FID' in first column (indicates header)
     if df.columns[0] == "FID" or "sample_id" in df.columns:
@@ -797,8 +1186,16 @@ def read_pedigree(pedigree_path: Path) -> pl.DataFrame:
     if "FatherBarcode" in df.columns:
         df = df.rename({"FatherBarcode": "father_id", "MotherBarcode": "mother_id"})

-    # Select only the columns we need
-    pedigree_df = df.select(["sample_id", "father_id", "mother_id"])
+    # Normalize sex column name if present (e.g., 'Sex' or 'sex')
+    sex_col = next((c for c in df.columns if c.lower() == "sex"), None)
+    if sex_col and sex_col != "sex":
+        df = df.rename({sex_col: "sex"})
+
+    # Select only the columns we need (include sex if present)
+    select_cols = ["sample_id", "father_id", "mother_id"]
+    if "sex" in df.columns:
+        select_cols.append("sex")
+    pedigree_df = df.select(select_cols)

     # Replace 0 and -9 with null (indicating no parent)
     pedigree_df = pedigree_df.with_columns(
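
To make the normalization concrete, here is a hypothetical PLINK-style pedigree and the frame `read_pedigree()` would produce from it: barcode columns renamed, a `Sex` header lowercased, and `0`/`-9` parent placeholders nulled.

    # Input TSV (hypothetical):
    #   FID    sample_id  FatherBarcode  MotherBarcode  Sex
    #   FAM1   child1     dad1           mum1           1
    #   FAM1   dad1       0              -9             1
    #
    # Resulting pedigree_df:
    #   sample_id  father_id  mother_id  sex
    #   child1     dad1       mum1       1
    #   dad1       null       null       1
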
@@ -909,34 +1306,25 @@ def add_parent_genotypes(df: pl.DataFrame, pedigree_df: pl.DataFrame) -> pl.Data
     return df


-def format_bcftools_tsv(
-    df: pl.DataFrame, pedigree_df: Optional[pl.DataFrame] = None
-) -> pl.DataFrame:
+def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
     """
-    Format a bcftools tabulated TSV DataFrame.
+    Expand the (null) annotation column into separate columns.
+
+    This is a separate step that can be applied after filtering to avoid
+    expensive annotation expansion on variants that will be filtered out.

     Args:
-        df: Input DataFrame from bcftools
-        pedigree_df: Optional pedigree DataFrame with parent information
+        df: DataFrame with (null) column

     Returns:
-        Formatted DataFrame with expanded fields and melted samples
+        DataFrame with expanded annotation columns
     """
     # Find the (null) column
     if "(null)" not in df.columns:
-        raise ValueError("Column '(null)' not found in the input file")
-
-    # Get column index of (null)
-    null_col_idx = df.columns.index("(null)")
-
-    # Split columns into: before (null), (null), and after (null)
-    cols_after = df.columns[null_col_idx + 1 :]
-
-    # Step 1: Expand the (null) column
-    # Split by semicolon and create new columns
+        # Already expanded or missing - return as-is
+        return df

-    # First, we need to extract all unique field names from the (null) column
-    # to know what columns to create
+    # Extract all unique field names from the (null) column
     null_values = df.select("(null)").to_series()
     all_fields = set()
@@ -963,9 +1351,42 @@ def format_bcftools_tsv(
     if "CSQ" in df.columns:
         df = df.drop("CSQ")

-    # Step 2: Identify sample columns and extract sample names
-    # Sample columns have format "sample_name:..." in the header
-    # Skip the CSQ column as it should not be melted (handled above)
+    return df
+
+
+def format_bcftools_tsv_minimal(
+    df: pl.DataFrame, pedigree_df: Optional[pl.DataFrame] = None
+) -> pl.DataFrame:
+    """
+    Format a bcftools tabulated TSV with minimal processing.
+
+    This version SKIPS expanding the (null) annotation field and only:
+    - Melts sample columns into rows
+    - Extracts GT:DP:GQ:AD:VAF from sample values
+    - Optionally adds parent genotypes if pedigree provided
+
+    Use this for filtering workflows where you want to apply filters BEFORE
+    expensive annotation expansion. Call format_expand_annotations() afterwards
+    on filtered results.
+
+    Args:
+        df: Input DataFrame from bcftools
+        pedigree_df: Optional pedigree DataFrame with parent information
+
+    Returns:
+        Formatted DataFrame with melted samples (annotations still in (null) column)
+    """
+    # Find the (null) column
+    if "(null)" not in df.columns:
+        raise ValueError("Column '(null)' not found in the input file")
+
+    # Get column index of (null)
+    null_col_idx = df.columns.index("(null)")
+
+    # Split columns into: before (null), (null), and after (null)
+    cols_after = df.columns[null_col_idx + 1 :]
+
+    # Step 1: Identify sample columns (SKIP annotation expansion)
     sample_cols = []
     sample_names = []
@@ -984,10 +1405,10 @@ def format_bcftools_tsv(
             sample_names.append(col)

     if not sample_cols:
-        # No sample columns to melt, just return expanded data
+        # No sample columns to melt
         return df

-    # Step 3: Melt the sample columns
+    # Step 2: Melt the sample columns
     # Keep all columns except sample columns as id_vars
     id_vars = [col for col in df.columns if col not in sample_cols]
@@ -1070,6 +1491,37 @@ def format_bcftools_tsv(
     return melted_df


+def format_bcftools_tsv(
+    df: pl.DataFrame, pedigree_df: Optional[pl.DataFrame] = None
+) -> pl.DataFrame:
+    """
+    Format a bcftools tabulated TSV DataFrame (full processing).
+
+    This is the complete formatting that:
+    1. Melts samples and extracts GT:DP:GQ:AD:VAF
+    2. Expands (null) annotation field into separate columns
+    3. Adds parent genotypes if pedigree provided
+
+    For DNM filtering workflows, consider using format_bcftools_tsv_minimal()
+    + apply_de_novo_filter() + format_expand_annotations() to avoid expanding
+    annotations on variants that will be filtered out.
+
+    Args:
+        df: Input DataFrame from bcftools
+        pedigree_df: Optional pedigree DataFrame with parent information
+
+    Returns:
+        Formatted DataFrame with expanded fields and melted samples
+    """
+    # First do minimal formatting (melt + sample columns)
+    melted_df = format_bcftools_tsv_minimal(df, pedigree_df)
+
+    # Then expand annotations
+    expanded_df = format_expand_annotations(melted_df)
+
+    return expanded_df
+
+
 def format_bcftools_tsv_lazy(
     lazy_df: pl.LazyFrame, pedigree_df: Optional[pl.DataFrame] = None
 ) -> pl.LazyFrame:
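
Putting the pieces together, the deferred-expansion workflow the docstring recommends looks roughly like this (paths and config objects hypothetical):

    df = pl.read_csv("trio.tsv", separator="\t", schema_overrides=schema_overrides)

    melted = format_bcftools_tsv_minimal(df, pedigree_df)  # cheap melt only
    dnm_only = apply_de_novo_filter(melted, dnm_cfg, pedigree_df=pedigree_df)
    final = format_expand_annotations(dnm_only)  # expand survivors only
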
@@ -1085,10 +1537,455 @@ def format_bcftools_tsv_lazy(
     return formatted_df.lazy()


+# ------------------ Chunked two-pass processing with progress ------------------
+import io
+import math
+
+
+def _open_file_lines(path: Path):
+    """Yield header line and an iterator over the remaining lines (text)."""
+    if str(path).endswith(".gz"):
+        import gzip as _gzip
+
+        f = _gzip.open(path, "rt")
+    else:
+        f = open(path, "rt")
+
+    try:
+        header = f.readline()
+        for line in f:
+            yield header, line
+    finally:
+        f.close()
+
+
+def _line_iterator(path: Path, chunk_size: int = 50000):
+    """Yield (header, chunk_lines) tuples where chunk_lines is a list of lines."""
+    if str(path).endswith(".gz"):
+        import gzip as _gzip
+
+        f = _gzip.open(path, "rt")
+    else:
+        f = open(path, "rt")
+
+    try:
+        header = f.readline()
+        while True:
+            chunk = []
+            for _ in range(chunk_size):
+                line = f.readline()
+                if not line:
+                    break
+                chunk.append(line)
+            if not chunk:
+                break
+            yield header, chunk
+    finally:
+        f.close()
+
+
+def build_parent_lookup_from_file(
+    path: Path,
+    pedigree_df: Optional[pl.DataFrame] = None,
+    progress_bar=None,
+    verbose: bool = False,
+    chunk_size: int = 50000,
+):
+    """First pass: build a minimal parent lookup DataFrame with per-sample genotypes.
+
+    Returns a tuple (lookup_df, total_lines) where total_lines is the approximate number
+    of data lines processed (excluding header). If a `progress_bar` is provided it will
+    be updated as we process chunks.
+    """
+    parts = []
+
+    schema_overrides = {
+        col: pl.Utf8
+        for col in [
+            "FID",
+            "sample_id",
+            "father_id",
+            "mother_id",
+            "FatherBarcode",
+            "MotherBarcode",
+            "sample",
+        ]
+    }
+
+    processed = 0
+    chunk_idx = 0
+    for header, chunk in _line_iterator(path, chunk_size=chunk_size):
+        chunk_idx += 1
+        content = header + "".join(chunk)
+        try:
+            df_chunk = pl.read_csv(
+                io.StringIO(content), separator="\t", schema_overrides=schema_overrides
+            )
+        except Exception:
+            # Skip unparsable chunk
+            processed += len(chunk)
+            if progress_bar is not None:
+                progress_bar.update(len(chunk))
+            elif verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Building lookup: processed ~{processed} lines...", err=True
+                )
+            continue
+
+        try:
+            # Use minimal format for lookup building (skip annotation expansion)
+            formatted = format_bcftools_tsv_minimal(df_chunk, pedigree_df=None)
+        except Exception:
+            # If chunk cannot be parsed into variants, skip
+            processed += len(chunk)
+            if progress_bar is not None:
+                progress_bar.update(len(chunk))
+            elif verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Building lookup: processed ~{processed} lines...", err=True
+                )
+            continue
+
+        cols = [
+            "#CHROM",
+            "POS",
+            "REF",
+            "ALT",
+            "sample",
+            "sample_gt",
+            "sample_dp",
+            "sample_gq",
+            "sample_ad",
+            "sample_vaf",
+        ]
+        sel = [c for c in cols if c in formatted.columns]
+        # Optimization: Only store non-reference genotypes in lookup (skip 0/0)
+        part = formatted.select(sel).filter(pl.col("sample_gt") != "0/0").unique()
+        parts.append(part)
+
+        # Update progress
+        processed += len(chunk)
+        if progress_bar is not None:
+            progress_bar.update(len(chunk))
+        elif verbose and (chunk_idx % 10 == 0):
+            click.echo(f"Building lookup: processed ~{processed} lines...", err=True)
+
+    if parts:
+        lookup = pl.concat(parts).unique()
+    else:
+        lookup = pl.DataFrame([])
+
+    # processed currently counts number of data lines seen
+    return lookup, processed
+
+
+def add_parent_genotypes_from_lookup(
+    df: pl.DataFrame, parent_lookup: pl.DataFrame
+) -> pl.DataFrame:
+    """Join father/mother genotype info from the parent_lookup into df.
+
+    Assumes df has columns: #CHROM, POS, REF, ALT, father_id, mother_id
+    """
+    join_cols = [c for c in ["#CHROM", "POS", "REF", "ALT"] if c in df.columns]
+
+    if parent_lookup.is_empty():
+        # Create empty parent columns
+        return df
+
+    # Prepare father lookup
+    father_lookup = parent_lookup.rename(
+        {
+            "sample": "father",
+            "sample_gt": "father_gt",
+            "sample_dp": "father_dp",
+            "sample_gq": "father_gq",
+            "sample_ad": "father_ad",
+            "sample_vaf": "father_vaf",
+        }
+    )
+
+    # Left join on join_cols + ['father']
+    if "father_id" in df.columns:
+        df = df.join(
+            father_lookup,
+            left_on=join_cols + ["father_id"],
+            right_on=join_cols + ["father"],
+            how="left",
+        )
+    else:
+        # No father id, attempt join on 'father' column
+        df = df.join(
+            father_lookup,
+            on=join_cols + ["father"],
+            how="left",
+        )
+
+    # Prepare mother lookup
+    mother_lookup = parent_lookup.rename(
+        {
+            "sample": "mother",
+            "sample_gt": "mother_gt",
+            "sample_dp": "mother_dp",
+            "sample_gq": "mother_gq",
+            "sample_ad": "mother_ad",
+            "sample_vaf": "mother_vaf",
+        }
+    )
+
+    if "mother_id" in df.columns:
+        df = df.join(
+            mother_lookup,
+            left_on=join_cols + ["mother_id"],
+            right_on=join_cols + ["mother"],
+            how="left",
+        )
+    else:
+        df = df.join(
+            mother_lookup,
+            on=join_cols + ["mother"],
+            how="left",
+        )
+
+    # Normalize '.' to '0' for DP/GQ like previous function
+    df = df.with_columns(
+        [
+            pl.when(pl.col("father_dp") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("father_dp"))
+            .alias("father_dp"),
+            pl.when(pl.col("father_gq") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("father_gq"))
+            .alias("father_gq"),
+            pl.when(pl.col("mother_dp") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("mother_dp"))
+            .alias("mother_dp"),
+            pl.when(pl.col("mother_gq") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("mother_gq"))
+            .alias("mother_gq"),
+        ]
+    )
+
+    return df
+
+
+def process_with_progress(
+    input_path: Path,
+    output_prefix: str,
+    output_format: str,
+    pedigree_df: Optional[pl.DataFrame],
+    filter_config: Optional[dict],
+    verbose: bool,
+    chunk_size: int = 50000,
+):
+    """Process input in two passes and show a progress bar.
+
+    Pass 1: build parent lookup
+    Pass 2: process chunks, join parent genotypes, apply filters, and write incrementally
+    """
+    # tqdm optional
+    try:
+        from tqdm.auto import tqdm
+    except Exception:
+        tqdm = None
+
+    # Build parent lookup in a single pass (counts lines while building lookup)
+    if verbose:
+        click.echo("Pass 1: building parent genotype lookup (single pass)...", err=True)
+
+    pbar_lookup = None
+    if tqdm is not None:
+        # No known total yet; tqdm will show progress increasing
+        pbar_lookup = tqdm(desc="Building parent lookup", unit="lines")
+
+    parent_lookup, total_lines = build_parent_lookup_from_file(
+        input_path,
+        pedigree_df,
+        progress_bar=pbar_lookup,
+        verbose=verbose,
+        chunk_size=chunk_size,
+    )
+
+    if pbar_lookup is not None:
+        pbar_lookup.close()
+
+    if verbose:
+        click.echo(
+            f"Parent lookup contains {parent_lookup.shape[0]} genotype entries (from ~{total_lines} lines)",
+            err=True,
+        )
+
+    total_chunks = math.ceil(total_lines / chunk_size) if chunk_size > 0 else 1
+
+    # Prepare output paths
+    if output_format == "tsv":
+        out_path = Path(f"{output_prefix}.tsv")
+    elif output_format == "tsv.gz":
+        out_path = Path(f"{output_prefix}.tsv.gz")
+    else:
+        # We'll write parquet parts
+        out_path = Path(f"{output_prefix}")
+
+    first_write = True
+
+    # Iterate chunks and process
+    iterator = _line_iterator(input_path, chunk_size=chunk_size)
+    chunk_idx = 0
+
+    progress_bar = None
+    processed_lines = 0
+    if tqdm is not None:
+        progress_bar = tqdm(total=total_lines, desc="Processing variants")
+
+    for header, chunk in iterator:
+        chunk_idx += 1
+        chunk_count = len(chunk)
+
+        # Update progress at start of chunk to show we're working
+        if progress_bar is not None:
+            progress_bar.set_postfix_str(f"chunk {chunk_idx}")
+
+        content = header + "".join(chunk)
+        df_chunk = pl.read_csv(io.StringIO(content), separator="\t")
+
+        # Use MINIMAL format (skip annotation expansion for now)
+        try:
+            melted = format_bcftools_tsv_minimal(df_chunk, pedigree_df=None)
+        except Exception:
+            # If parse fails for chunk, skip
+            processed_lines += chunk_count
+            if progress_bar is not None:
+                progress_bar.update(chunk_count)
+            elif verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Processed {processed_lines}/{total_lines} lines...", err=True
+                )
+            continue
+
+        # Optimization #1: Early GT filtering for DNM mode - skip reference-only variants
+        if filter_config and filter_config.get("dnm", {}).get("enabled", False):
+            melted = melted.filter(
+                pl.col("sample_gt").str.contains("1")
+                | pl.col("sample_gt").str.contains("2")
+            )
+            if melted.shape[0] == 0:
+                # All variants filtered out, skip to next chunk
+                processed_lines += chunk_count
+                if progress_bar is not None:
+                    progress_bar.update(chunk_count)
+                continue
+
+        # Attach parent ids from pedigree
+        if pedigree_df is not None and "sample" in melted.columns:
+            melted = melted.join(
+                pedigree_df, left_on="sample", right_on="sample_id", how="left"
+            )
+
+        # Add parent genotypes from global lookup
+        melted = add_parent_genotypes_from_lookup(melted, parent_lookup)
+
+        # Apply filters BEFORE expanding annotations (key optimization)
+        if filter_config and filter_config.get("dnm", {}).get("enabled", False):
+            cfg = {}
+            cfg.update(filter_config.get("quality", {}))
+            cfg.update(filter_config.get("dnm", {}))
+            filtered = apply_de_novo_filter(
+                melted, cfg, verbose=False, pedigree_df=pedigree_df
+            )
+        else:
+            # Apply standard quality filters
+            filtered = melted
+            quality_cfg = filter_config.get("quality", {}) if filter_config else {}
+            if quality_cfg:
+                filtered = apply_quality_filters(filtered, quality_cfg, verbose=False)
+
+            # Apply expression filter if present
+            expr = filter_config.get("expression") if filter_config else None
+            if expr:
+                try:
+                    expr_parsed = parse_impact_filter_expression(expr, filtered)
+                    filtered = filtered.filter(expr_parsed)
+                except Exception:
+                    # If expression parsing fails on a chunk, skip applying it
+                    pass
+
+        # NOW expand annotations only for variants that passed filters
+        if filtered.shape[0] > 0 and "(null)" in filtered.columns:
+            if progress_bar is not None:
+                progress_bar.set_postfix_str(f"expanding annotations chunk {chunk_idx}")
+            filtered = format_expand_annotations(filtered)
+
+        # Update progress after filtering (before write)
+        if progress_bar is not None:
+            progress_bar.set_postfix_str(f"writing chunk {chunk_idx}")
+
+        # Write filtered chunk to file (skip if empty)
+        if filtered.shape[0] > 0:
+            if output_format in ("tsv", "tsv.gz"):
+                csv_text = filtered.write_csv(separator="\t")
+                # First write includes header; subsequent writes skip header
+                if first_write:
+                    write_text = csv_text
+                    first_write = False
+                    if output_format == "tsv.gz":
+                        with gzip.open(out_path, "wt") as f:
+                            f.write(write_text)
+                    else:
+                        with open(out_path, "wt") as f:
+                            f.write(write_text)
+                else:
+                    # Skip header
+                    tail = "\n".join(csv_text.splitlines()[1:])
+                    if output_format == "tsv.gz":
+                        with gzip.open(out_path, "at") as f:
+                            f.write("\n" + tail)
+                    else:
+                        with open(out_path, "at") as f:
+                            f.write("\n" + tail)
+            else:
+                # Parquet: write part file
+                part_path = out_path.with_suffix(f".part{chunk_idx}.parquet")
+                filtered.write_parquet(part_path)
+
+        if progress_bar is not None:
+            progress_bar.update(chunk_count)
+            progress_bar.set_postfix_str("")  # Clear status
+        else:
+            processed_lines += chunk_count
+            if verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Processed {processed_lines}/{total_lines} lines...", err=True
+                )
+
+    if progress_bar is not None:
+        progress_bar.close()
+
+    if verbose:
+        click.echo("Processing complete.", err=True)
+
+
 def apply_filters_lazy(
-    lazy_df: pl.LazyFrame, filter_config: dict, verbose: bool = False
+    lazy_df: pl.LazyFrame,
+    filter_config: dict,
+    verbose: bool = False,
+    pedigree_df: Optional[pl.DataFrame] = None,
 ) -> pl.LazyFrame:
     """Apply quality and expression filters using lazy operations."""
+    # If DNM mode is enabled, we need to collect and apply DNM logic
+    if filter_config.get("dnm", {}).get("enabled", False):
+        dnm_cfg = {}
+        dnm_cfg.update(filter_config.get("quality", {}))
+        dnm_cfg.update(filter_config.get("dnm", {}))
+
+        # Collect minimally and apply DNM filter eagerly, then return lazy frame
+        df = lazy_df.collect(streaming=True)
+        filtered_df = apply_de_novo_filter(
+            df, dnm_cfg, verbose, pedigree_df=pedigree_df
+        )
+        return filtered_df.lazy()
+
     quality_config = filter_config.get("quality", {})
     expression = filter_config.get("expression")
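
A hypothetical end-to-end invocation of the chunked two-pass path; `tqdm` is optional, and without it the function falls back to periodic `click.echo` progress messages:

    from pathlib import Path

    pedigree = read_pedigree(Path("cohort.ped"))
    config = {"dnm": {"enabled": True}, "quality": {"sample_dp_min": 10}}

    process_with_progress(
        input_path=Path("cohort.tsv.gz"),
        output_prefix="cohort.dnm",
        output_format="tsv.gz",
        pedigree_df=pedigree,
        filter_config=config,
        verbose=True,
        chunk_size=50000,
    )
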