pywombat 0.5.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pywombat/cli.py CHANGED
@@ -122,7 +122,7 @@ def cli(
     else:
         output = input_stem
 
-    # Use streaming approach with lazy API
+    # Process using streaming mode
     if verbose:
         click.echo("Processing with streaming mode...", err=True)
 
@@ -147,7 +147,9 @@ def cli(
 
     # Apply filters if provided
     if filter_config_data:
-        lazy_df = apply_filters_lazy(lazy_df, filter_config_data, verbose)
+        lazy_df = apply_filters_lazy(
+            lazy_df, filter_config_data, verbose, pedigree_df
+        )
 
     # Write output
     output_path = Path(f"{output}.{output_format}")
@@ -463,6 +465,321 @@ def apply_quality_filters(
     return df
 
 
+# ------------------ De novo (DNM) filter helpers ------------------
+
+
+def _chrom_short(chrom: str) -> str:
+    """Normalize chromosome name to short form (e.g., 'chrX' -> 'X')."""
+    if chrom is None:
+        return ""
+    chrom = str(chrom)
+    return chrom[3:] if chrom.lower().startswith("chr") else chrom
+
+
+def _pos_in_par(chrom: str, pos: int, par_regions: dict) -> bool:
+    """Return True if (chrom, pos) falls in any PAR region from config.
+
+    Normalizes chromosome names to match both 'X'/'chrX' formats.
+    """
+    if not par_regions:
+        return False
+
+    chrom_short = _chrom_short(chrom)
+
+    for assembly, regions in par_regions.items():
+        for region_name, region in regions.items():
+            region_chrom = _chrom_short(region.get("chrom", "X"))
+            start = int(region.get("start"))
+            end = int(region.get("end"))
+            # Normalize both to uppercase for comparison
+            if region_chrom.upper() == chrom_short.upper() and start <= pos <= end:
+                return True
+    return False
+
+
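Note (not part of the diff): a minimal sketch of how the new PAR helper resolves a coordinate, using an illustrative par_regions mapping. The structure (assembly -> region -> chrom/start/end) follows the docstring above; the GRCh38 coordinates shown are example values only.

    # Illustrative config shape; values are examples, not authoritative PAR bounds.
    par_regions = {
        "GRCh38": {
            "PAR1": {"chrom": "chrX", "start": 10001, "end": 2781479},
            "PAR2": {"chrom": "chrX", "start": 155701383, "end": 156030895},
        }
    }

    print(_pos_in_par("X", 12345, par_regions))         # True: inside PAR1, 'X' matches 'chrX'
    print(_pos_in_par("chrX", 5_000_000, par_regions))  # False: between PAR1 and PAR2
    print(_pos_in_par("chrY", 12345, par_regions))      # False: no Y region in this config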
+def apply_de_novo_filter(
+    df: pl.DataFrame,
+    dnm_config: dict,
+    verbose: bool = False,
+    pedigree_df: Optional[pl.DataFrame] = None,
+) -> pl.DataFrame:
+    """Apply de novo detection filters to dataframe using vectorized operations.
+
+    dnm_config expected keys:
+    - sample_dp_min, sample_gq_min, sample_vaf_min
+    - parent_dp_min, parent_gq_min, parent_vaf_max
+    - par_regions: dict with PAR regions keyed by assembly
+
+    This function will read `sex` from `df` when present; otherwise it will use
+    the `pedigree_df` (which should contain `sample_id` and `sex`).
+    """
+    if not dnm_config:
+        return df
+
+    # Required thresholds
+    s_dp = dnm_config.get("sample_dp_min", 10)
+    s_gq = dnm_config.get("sample_gq_min", 18)
+    s_vaf = dnm_config.get("sample_vaf_min", 0.15)
+    s_vaf_hemizygous = dnm_config.get("sample_vaf_hemizygous_min", 0.85)
+
+    p_dp = dnm_config.get("parent_dp_min", 10)
+    p_gq = dnm_config.get("parent_gq_min", 18)
+    p_vaf = dnm_config.get("parent_vaf_max", 0.02)
+
+    fafmax_max = dnm_config.get("fafmax_faf95_max_genomes_max", None)
+    genomes_filters_pass_only = dnm_config.get("genomes_filters_pass_only", False)
+
+    par_regions = dnm_config.get("par_regions", {})
+
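Note (not part of the diff): a sketch of a dnm_config dict wired to the fallbacks above. The keys are exactly the ones the function reads; the values shown are the code's own defaults.

    dnm_config = {
        "sample_dp_min": 10,                 # proband depth floor
        "sample_gq_min": 18,                 # proband genotype quality floor
        "sample_vaf_min": 0.15,              # het proband VAF must exceed this
        "sample_vaf_hemizygous_min": 0.85,   # hom/hemizygous proband VAF floor
        "parent_dp_min": 10,
        "parent_gq_min": 18,
        "parent_vaf_max": 0.02,              # parents must look reference-like
        "fafmax_faf95_max_genomes_max": None,  # optional population-frequency cap
        "genomes_filters_pass_only": False,
        "par_regions": {},                   # e.g., the PAR mapping sketched earlier
    }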
+    original = df.shape[0]
+
+    # Ensure we have parent identifiers (father/mother). Try to add from pedigree if missing.
+    if "father_id" not in df.columns or "mother_id" not in df.columns:
+        if pedigree_df is not None:
+            # Join pedigree to get father/mother/sample sex if needed
+            df = df.join(
+                pedigree_df, left_on="sample", right_on="sample_id", how="left"
+            )
+        else:
+            raise click.Abort(
+                "DNM filtering requires a pedigree (with father/mother IDs and sex)."
+            )
+
+    if verbose:
+        click.echo(
+            f"DNM: Starting with {df.shape[0]} variants after pedigree join", err=True
+        )
+
+    # Optimization #6: Early exit for variants with missing parents
+    df = df.filter(
+        pl.col("father_id").is_not_null() & pl.col("mother_id").is_not_null()
+    )
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after filtering missing parents", err=True
+        )
+
+    if df.shape[0] == 0:
+        if verbose:
+            click.echo(
+                f"De novo filter: {original} -> 0 rows (all missing parents)", err=True
+            )
+        return df
+
+    # Ensure we have sex information
+    if "sex" not in df.columns:
+        if pedigree_df is not None and "sex" in pedigree_df.columns:
+            # Re-join to get sex if not already present
+            if "sex" not in df.columns:
+                df = df.join(
+                    pedigree_df.select(["sample_id", "sex"]),
+                    left_on="sample",
+                    right_on="sample_id",
+                    how="left",
+                )
+        else:
+            raise click.Abort(
+                "DNM filtering requires sex information in the pedigree (column 'sex')."
+            )
+
+    # Filter out variants with missing/partial genotypes (containing '.')
+    # Sample genotype must be fully called (no '.', './0', '1/.', './.' etc.)
+    df = df.filter(
+        ~pl.col("sample_gt").str.contains(r"\.") & pl.col("sample_gt").is_not_null()
+    )
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after removing samples with missing/partial GT",
+            err=True,
+        )
+
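Note (not part of the diff): a toy illustration of the partial-genotype gate above, using the same contains(r"\.") logic on a hypothetical frame.

    import polars as pl

    gts = pl.DataFrame({"sample_gt": ["0/1", "1/1", "./.", "1/.", "./0", None]})
    called = gts.filter(
        ~pl.col("sample_gt").str.contains(r"\.") & pl.col("sample_gt").is_not_null()
    )
    print(called["sample_gt"].to_list())  # ['0/1', '1/1']: only fully called genotypes survive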
+    # Ensure proband passes basic sample thresholds (DP, GQ)
+    # VAF threshold will be applied differently for hemizygous vs diploid variants
+    df = df.filter(
+        (pl.col("sample_dp").cast(pl.Float64, strict=False) >= s_dp)
+        & (pl.col("sample_gq").cast(pl.Float64, strict=False) >= s_gq)
+    )
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after proband QC (DP>={s_dp}, GQ>={s_gq})",
+            err=True,
+        )
+
+    if df.shape[0] == 0:
+        if verbose:
+            click.echo(
+                f"De novo filter: {original} -> 0 rows (sample QC failed)", err=True
+            )
+        return df
+
+    # Optimization #7: Vectorized parent filtering using when/then expressions
+    # Build chromosome-specific parent filters
+
+    # Ensure #CHROM is string type for operations (convert from categorical if needed)
+    if "#CHROM" in df.columns and df.schema["#CHROM"] == pl.Categorical:
+        df = df.with_columns(pl.col("#CHROM").cast(pl.Utf8))
+
+    # Normalize chromosome to short form for comparison
+    df = df.with_columns(
+        pl.col("#CHROM")
+        .str.replace("^chr", "")
+        .str.to_uppercase()
+        .alias("_chrom_short")
+    )
+
+    # Determine if variant is in PAR region (vectorized)
+    par_mask = pl.lit(False)
+    if par_regions:
+        for assembly, regions in par_regions.items():
+            for region_name, region in regions.items():
+                region_chrom = _chrom_short(region.get("chrom", "X")).upper()
+                start = int(region.get("start"))
+                end = int(region.get("end"))
+                par_mask = par_mask | (
+                    (pl.col("_chrom_short") == region_chrom)
+                    & (pl.col("POS") >= start)
+                    & (pl.col("POS") <= end)
+                )
+
+    df = df.with_columns(par_mask.alias("_in_par"))
+
+    # Normalize sex to uppercase string for comparison
+    df = df.with_columns(
+        pl.col("sex").cast(pl.Utf8).str.to_uppercase().alias("_sex_norm")
+    )
+
+    # Determine if variant is hemizygous (X in males outside PAR, or Y in males)
+    is_hemizygous = (
+        (pl.col("_chrom_short") == "X")
+        & (pl.col("_sex_norm").is_in(["1", "M"]))
+        & ~pl.col("_in_par")
+    ) | (pl.col("_chrom_short") == "Y")
+
+    # Determine if variant is homozygous (1/1, 2/2, etc.)
+    is_homozygous = pl.col("sample_gt").is_in(["1/1", "2/2", "3/3"])
+
+    # Apply VAF threshold: hemizygous and homozygous variants require higher VAF (>=0.85)
+    sample_vaf_filter = (
+        pl.when(is_hemizygous | is_homozygous)
+        .then(pl.col("sample_vaf") >= s_vaf_hemizygous)
+        .otherwise(pl.col("sample_vaf") > s_vaf)
+    )
+
+    df = df.filter(sample_vaf_filter)
+
+    if verbose:
+        click.echo(
+            f"DNM: {df.shape[0]} variants after VAF filtering (het>{s_vaf}, hom/hemizygous>={s_vaf_hemizygous})",
+            err=True,
+        )
+
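Note (not part of the diff): with the default thresholds, the zygosity-aware VAF gate above behaves like this self-contained sketch (toy frame, same expressions as in the code).

    import polars as pl

    toy = pl.DataFrame({
        "_chrom_short": ["1", "X", "Y", "2"],
        "_sex_norm":    ["M", "M", "M", "F"],
        "_in_par":      [False, False, False, False],
        "sample_gt":    ["0/1", "0/1", "1/1", "1/1"],
        "sample_vaf":   [0.40, 0.60, 0.95, 0.90],
    })

    is_hemi = (
        (pl.col("_chrom_short") == "X")
        & pl.col("_sex_norm").is_in(["1", "M"])
        & ~pl.col("_in_par")
    ) | (pl.col("_chrom_short") == "Y")
    is_hom = pl.col("sample_gt").is_in(["1/1", "2/2", "3/3"])
    gate = (
        pl.when(is_hemi | is_hom)
        .then(pl.col("sample_vaf") >= 0.85)   # hom/hemizygous floor
        .otherwise(pl.col("sample_vaf") > 0.15)  # diploid het threshold
    )

    print(toy.filter(gate))  # keeps rows 1, 3, 4; drops the X-male het at VAF 0.60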
+    # Apply fafmax_faf95_max_genomes filter if specified
+    if fafmax_max is not None:
+        if "fafmax_faf95_max_genomes" in df.columns:
+            df = df.filter(
+                (
+                    pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False)
+                    <= fafmax_max
+                )
+                | pl.col("fafmax_faf95_max_genomes").is_null()
+            )
+            if verbose:
+                click.echo(
+                    f"DNM: {df.shape[0]} variants after fafmax_faf95_max_genomes filter (<={fafmax_max})",
+                    err=True,
+                )
+        elif verbose:
+            click.echo(
+                "DNM: Warning - fafmax_faf95_max_genomes column not found, skipping frequency filter",
+                err=True,
+            )
+
+    # Apply genomes_filters filter if specified
+    if genomes_filters_pass_only:
+        if "genomes_filters" in df.columns:
+            df = df.filter(
+                (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
+            )
+            if verbose:
+                click.echo(
+                    f"DNM: {df.shape[0]} variants after genomes_filters filter (pass only)",
+                    err=True,
+                )
+        elif verbose:
+            click.echo(
+                "DNM: Warning - genomes_filters column not found, skipping genomes_filters filter",
+                err=True,
+            )
+
+    # Build parent quality checks (common to all)
+    father_qual_ok = (pl.col("father_dp").cast(pl.Float64, strict=False) >= p_dp) & (
+        pl.col("father_gq").cast(pl.Float64, strict=False) >= p_gq
+    )
+    mother_qual_ok = (pl.col("mother_dp").cast(pl.Float64, strict=False) >= p_dp) & (
+        pl.col("mother_gq").cast(pl.Float64, strict=False) >= p_gq
+    )
+
+    father_vaf_ok = pl.col("father_vaf").is_null() | (pl.col("father_vaf") < p_vaf)
+    mother_vaf_ok = pl.col("mother_vaf").is_null() | (pl.col("mother_vaf") < p_vaf)
+
+    # Parent genotype checks: ensure no '.' in genotypes (no ./., 0/., 1/. etc.)
+    father_gt_ok = (
+        ~pl.col("father_gt").str.contains(r"\.") & pl.col("father_gt").is_not_null()
+    )
+    mother_gt_ok = (
+        ~pl.col("mother_gt").str.contains(r"\.") & pl.col("mother_gt").is_not_null()
+    )
+
+    # Build comprehensive parent filter using when/then logic
+    parent_filter = (
+        pl.when(pl.col("_in_par"))
+        # PAR region: both parents must be reference (autosomal-like) with valid GTs
+        .then(
+            father_qual_ok
+            & father_vaf_ok
+            & father_gt_ok
+            & mother_qual_ok
+            & mother_vaf_ok
+            & mother_gt_ok
+        )
+        .when(pl.col("_chrom_short") == "Y")
+        # Y chromosome: only check father (mother doesn't have Y), father GT must be valid
+        .then(father_qual_ok & father_vaf_ok & father_gt_ok)
+        .when((pl.col("_chrom_short") == "X") & (pl.col("_sex_norm").is_in(["1", "M"])))
+        # X chromosome, male proband: father is hemizygous, only check mother VAF and GT
+        .then(father_qual_ok & mother_qual_ok & mother_vaf_ok & mother_gt_ok)
+        # Default (autosomes or X/female): both parents must be reference with valid GTs
+        .otherwise(
+            father_qual_ok
+            & father_vaf_ok
+            & father_gt_ok
+            & mother_qual_ok
+            & mother_vaf_ok
+            & mother_gt_ok
+        )
+    )
+
+    # Apply parent filter
+    result = df.filter(parent_filter)
+
+    if verbose:
+        click.echo(
+            f"DNM: {result.shape[0]} variants after parent genotype filtering (parent DP>={p_dp}, GQ>={p_gq}, VAF<{p_vaf})",
+            err=True,
+        )
+
+    # Drop temporary columns
+    result = result.drop(["_chrom_short", "_in_par", "_sex_norm"])
+
+    if verbose:
+        click.echo(f"De novo filter: {original} -> {result.shape[0]} rows", err=True)
+
+    return result
+
+
 def parse_impact_filter_expression(expression: str, df: pl.DataFrame) -> pl.Expr:
     """Parse a filter expression string into a Polars expression."""
     # Replace operators with Polars equivalents
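Note (not part of the diff): a sketch of the call order the new DNM path is built for, assuming a bcftools-style TSV on disk and the helpers introduced later in this diff (format_bcftools_tsv_minimal, format_expand_annotations). File names and dnm_config are illustrative.

    import polars as pl
    from pathlib import Path

    raw = pl.read_csv("trio.tsv", separator="\t")   # hypothetical bcftools export with a (null) column
    ped = read_pedigree(Path("trio.ped"))           # must carry sample_id/father_id/mother_id/sex

    melted = format_bcftools_tsv_minimal(raw)       # melt samples, keep INFO packed in (null)
    dnm = apply_de_novo_filter(melted, dnm_config, verbose=True, pedigree_df=ped)
    dnm = format_expand_annotations(dnm)            # expand INFO only for the survivors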
@@ -733,8 +1050,45 @@ def apply_filters_and_write(
     output_prefix: Optional[str],
     output_format: str,
     verbose: bool,
+    pedigree_df: Optional[pl.DataFrame] = None,
 ):
     """Apply filters and write output files."""
+    # If DNM mode is enabled, apply DNM-specific criteria and skip impact/frequency filters
+    if filter_config.get("dnm", {}).get("enabled", False):
+        dnm_cfg = {}
+        # Merge quality & dnm-specific thresholds into a single config for the function
+        dnm_cfg.update(filter_config.get("quality", {}))
+        dnm_cfg.update(filter_config.get("dnm", {}))
+
+        filtered_df = apply_de_novo_filter(
+            df, dnm_cfg, verbose, pedigree_df=pedigree_df
+        )
+
+        # Write result (same behavior as non-impact single-output)
+        if not output_prefix:
+            if output_format != "tsv":
+                click.echo(
+                    "Error: stdout output only supported for TSV format.",
+                    err=True,
+                )
+                raise click.Abort()
+            click.echo(filtered_df.write_csv(separator="\t"), nl=False)
+        else:
+            output_path = Path(f"{output_prefix}.{output_format}")
+
+            if output_format == "tsv":
+                filtered_df.write_csv(output_path, separator="\t")
+            elif output_format == "tsv.gz":
+                csv_content = filtered_df.write_csv(separator="\t")
+                with gzip.open(output_path, "wt") as f:
+                    f.write(csv_content)
+            elif output_format == "parquet":
+                filtered_df.write_parquet(output_path)
+
+            click.echo(f"De novo variants written to {output_path}", err=True)
+
+        return
+
     # Apply quality filters first
     quality_config = filter_config.get("quality", {})
     filtered_df = apply_quality_filters(df, quality_config, verbose)
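Note (not part of the diff): a sketch of a filter_config that routes apply_filters_and_write into the new DNM branch. Section names match the filter_config.get calls above; threshold values are illustrative. Because quality is merged first, any key repeated under dnm wins.

    filter_config = {
        "quality": {"sample_dp_min": 10, "sample_gq_min": 18},
        "dnm": {
            "enabled": True,             # flips apply_filters_and_write into the DNM branch
            "sample_vaf_min": 0.15,
            "sample_vaf_hemizygous_min": 0.85,
            "parent_vaf_max": 0.02,
            "par_regions": {},           # optionally the PAR mapping sketched earlier
        },
    }
    # Effective dnm_cfg == {**filter_config["quality"], **filter_config["dnm"]}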
@@ -832,8 +1186,16 @@ def read_pedigree(pedigree_path: Path) -> pl.DataFrame:
     if "FatherBarcode" in df.columns:
         df = df.rename({"FatherBarcode": "father_id", "MotherBarcode": "mother_id"})
 
-    # Select only the columns we need
-    pedigree_df = df.select(["sample_id", "father_id", "mother_id"])
+    # Normalize sex column name if present (e.g., 'Sex' or 'sex')
+    sex_col = next((c for c in df.columns if c.lower() == "sex"), None)
+    if sex_col and sex_col != "sex":
+        df = df.rename({sex_col: "sex"})
+
+    # Select only the columns we need (include sex if present)
+    select_cols = ["sample_id", "father_id", "mother_id"]
+    if "sex" in df.columns:
+        select_cols.append("sex")
+    pedigree_df = df.select(select_cols)
 
     # Replace 0 and -9 with null (indicating no parent)
     pedigree_df = pedigree_df.with_columns(
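Note (not part of the diff): the case-insensitive sex-column normalization above, replayed on a hypothetical PED-style frame.

    import polars as pl

    ped = pl.DataFrame({
        "sample_id": ["PROBAND01", "FATHER01", "MOTHER01"],
        "father_id": ["FATHER01", "0", "0"],
        "mother_id": ["MOTHER01", "0", "0"],
        "Sex":       ["1", "1", "2"],   # mixed-case header, as in some PED exports
    })

    sex_col = next((c for c in ped.columns if c.lower() == "sex"), None)
    if sex_col and sex_col != "sex":
        ped = ped.rename({sex_col: "sex"})   # same normalization as in read_pedigree
    print(ped.columns)  # ['sample_id', 'father_id', 'mother_id', 'sex']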
@@ -944,36 +1306,32 @@ def add_parent_genotypes(df: pl.DataFrame, pedigree_df: pl.DataFrame) -> pl.Data
     return df
 
 
-def format_bcftools_tsv(
-    df: pl.DataFrame, pedigree_df: Optional[pl.DataFrame] = None
-) -> pl.DataFrame:
+def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
     """
-    Format a bcftools tabulated TSV DataFrame.
+    Expand the (null) annotation column into separate columns.
+
+    This is a separate step that can be applied after filtering to avoid
+    expensive annotation expansion on variants that will be filtered out.
+
+    Handles two types of INFO fields:
+    - Key-value pairs (e.g., "DP=30") -> extracted as string values
+    - Boolean flags (e.g., "PASS", "DB") -> created as True/False columns
 
     Args:
-        df: Input DataFrame from bcftools
-        pedigree_df: Optional pedigree DataFrame with parent information
+        df: DataFrame with (null) column
 
     Returns:
-        Formatted DataFrame with expanded fields and melted samples
+        DataFrame with expanded annotation columns
     """
     # Find the (null) column
     if "(null)" not in df.columns:
-        raise ValueError("Column '(null)' not found in the input file")
-
-    # Get column index of (null)
-    null_col_idx = df.columns.index("(null)")
-
-    # Split columns into: before (null), (null), and after (null)
-    cols_after = df.columns[null_col_idx + 1 :]
-
-    # Step 1: Expand the (null) column
-    # Split by semicolon and create new columns
+        # Already expanded or missing - return as-is
+        return df
 
-    # First, we need to extract all unique field names from the (null) column
-    # to know what columns to create
+    # Extract all unique field names and flags from the (null) column
     null_values = df.select("(null)").to_series()
     all_fields = set()
+    all_flags = set()
 
     for value in null_values:
         if value and not (isinstance(value, float)):  # Skip null/NaN values
@@ -982,8 +1340,10 @@ def format_bcftools_tsv(
             if "=" in pair:
                 field_name = pair.split("=", 1)[0]
                 all_fields.add(field_name)
+            elif pair.strip():  # Boolean flag (no '=')
+                all_flags.add(pair.strip())
 
-    # Create expressions to extract each field
+    # Create expressions to extract each key-value field
     for field in sorted(all_fields):
         # Extract the field value from the (null) column
        # Pattern: extract value after "field=" and before ";" or end of string
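Note (not part of the diff): a toy run of the two INFO shapes the expansion now handles. Key-value pairs become string columns; bare flags become booleans (the flag loop is added in the next hunk).

    import polars as pl

    info = pl.DataFrame({"(null)": ["DP=30;DB;AF=0.01", "DP=12", "DB"]})

    # Key-value extraction, as in the loop above
    info = info.with_columns(pl.col("(null)").str.extract("DP=([^;]+)").alias("DP"))
    # Flag detection, as in the flag loop below
    info = info.with_columns(pl.col("(null)").str.contains("(^|;)DB(;|$)").alias("DB"))

    print(info.select(["DP", "DB"]))
    # DP: "30", "12", null ; DB: true, false, true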
@@ -991,6 +1351,14 @@ def format_bcftools_tsv(
             pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
         )
 
+    # Create boolean columns for flags
+    for flag in sorted(all_flags):
+        # Check if flag appears in the (null) column (as whole word)
+        # Use regex to match flag as a separate field (not part of another field name)
+        df = df.with_columns(
+            pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag)
+        )
+
     # Drop the original (null) column
     df = df.drop("(null)")
 
@@ -998,9 +1366,42 @@ def format_bcftools_tsv(
     if "CSQ" in df.columns:
         df = df.drop("CSQ")
 
-    # Step 2: Identify sample columns and extract sample names
-    # Sample columns have format "sample_name:..." in the header
-    # Skip the CSQ column as it should not be melted (handled above)
+    return df
+
+
+def format_bcftools_tsv_minimal(
+    df: pl.DataFrame, pedigree_df: Optional[pl.DataFrame] = None
+) -> pl.DataFrame:
+    """
+    Format a bcftools tabulated TSV with minimal processing.
+
+    This version SKIPS expanding the (null) annotation field and only:
+    - Melts sample columns into rows
+    - Extracts GT:DP:GQ:AD:VAF from sample values
+    - Optionally adds parent genotypes if pedigree provided
+
+    Use this for filtering workflows where you want to apply filters BEFORE
+    expensive annotation expansion. Call format_expand_annotations() afterwards
+    on filtered results.
+
+    Args:
+        df: Input DataFrame from bcftools
+        pedigree_df: Optional pedigree DataFrame with parent information
+
+    Returns:
+        Formatted DataFrame with melted samples (annotations still in (null) column)
+    """
+    # Find the (null) column
+    if "(null)" not in df.columns:
+        raise ValueError("Column '(null)' not found in the input file")
+
+    # Get column index of (null)
+    null_col_idx = df.columns.index("(null)")
+
+    # Split columns into: before (null), (null), and after (null)
+    cols_after = df.columns[null_col_idx + 1 :]
+
+    # Step 1: Identify sample columns (SKIP annotation expansion)
     sample_cols = []
     sample_names = []
 
1019
1420
  sample_names.append(col)
1020
1421
 
1021
1422
  if not sample_cols:
1022
- # No sample columns to melt, just return expanded data
1423
+ # No sample columns to melt
1023
1424
  return df
1024
1425
 
1025
- # Step 3: Melt the sample columns
1426
+ # Step 2: Melt the sample columns
1026
1427
  # Keep all columns except sample columns as id_vars
1027
1428
  id_vars = [col for col in df.columns if col not in sample_cols]
1028
1429
 
@@ -1105,6 +1506,37 @@ def format_bcftools_tsv(
1105
1506
  return melted_df
1106
1507
 
1107
1508
 
1509
+ def format_bcftools_tsv(
1510
+ df: pl.DataFrame, pedigree_df: Optional[pl.DataFrame] = None
1511
+ ) -> pl.DataFrame:
1512
+ """
1513
+ Format a bcftools tabulated TSV DataFrame (full processing).
1514
+
1515
+ This is the complete formatting that:
1516
+ 1. Melts samples and extracts GT:DP:GQ:AD:VAF
1517
+ 2. Expands (null) annotation field into separate columns
1518
+ 3. Adds parent genotypes if pedigree provided
1519
+
1520
+ For DNM filtering workflows, consider using format_bcftools_tsv_minimal()
1521
+ + apply_de_novo_filter() + format_expand_annotations() to avoid expanding
1522
+ annotations on variants that will be filtered out.
1523
+
1524
+ Args:
1525
+ df: Input DataFrame from bcftools
1526
+ pedigree_df: Optional pedigree DataFrame with parent information
1527
+
1528
+ Returns:
1529
+ Formatted DataFrame with expanded fields and melted samples
1530
+ """
1531
+ # First do minimal formatting (melt + sample columns)
1532
+ melted_df = format_bcftools_tsv_minimal(df, pedigree_df)
1533
+
1534
+ # Then expand annotations
1535
+ expanded_df = format_expand_annotations(melted_df)
1536
+
1537
+ return expanded_df
1538
+
1539
+
1108
1540
  def format_bcftools_tsv_lazy(
1109
1541
  lazy_df: pl.LazyFrame, pedigree_df: Optional[pl.DataFrame] = None
1110
1542
  ) -> pl.LazyFrame:
@@ -1120,10 +1552,455 @@ def format_bcftools_tsv_lazy(
     return formatted_df.lazy()
 
 
+# ------------------ Chunked two-pass processing with progress ------------------
+import io
+import math
+
+
+def _open_file_lines(path: Path):
+    """Yield header line and an iterator over the remaining lines (text)."""
+    if str(path).endswith(".gz"):
+        import gzip as _gzip
+
+        f = _gzip.open(path, "rt")
+    else:
+        f = open(path, "rt")
+
+    try:
+        header = f.readline()
+        for line in f:
+            yield header, line
+    finally:
+        f.close()
+
+
+def _line_iterator(path: Path, chunk_size: int = 50000):
+    """Yield (header, chunk_lines) tuples where chunk_lines is a list of lines."""
+    if str(path).endswith(".gz"):
+        import gzip as _gzip
+
+        f = _gzip.open(path, "rt")
+    else:
+        f = open(path, "rt")
+
+    try:
+        header = f.readline()
+        while True:
+            chunk = []
+            for _ in range(chunk_size):
+                line = f.readline()
+                if not line:
+                    break
+                chunk.append(line)
+            if not chunk:
+                break
+            yield header, chunk
+    finally:
+        f.close()
+
+
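Note (not part of the diff): what the chunk iterator yields, assuming a plain TSV (a .gz path takes the gzip branch the same way). File name and line count are hypothetical.

    from pathlib import Path

    # For a 120,000-line data file, yields three (header, chunk) pairs
    # of 50000, 50000, and 20000 lines respectively.
    for header, chunk in _line_iterator(Path("variants.tsv"), chunk_size=50000):
        print(header.rstrip()[:40], len(chunk))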
+def build_parent_lookup_from_file(
+    path: Path,
+    pedigree_df: Optional[pl.DataFrame] = None,
+    progress_bar=None,
+    verbose: bool = False,
+    chunk_size: int = 50000,
+):
+    """First pass: build a minimal parent lookup DataFrame with per-sample genotypes.
+
+    Returns a tuple (lookup_df, total_lines) where total_lines is the approximate number
+    of data lines processed (excluding header). If a `progress_bar` is provided it will
+    be updated as we process chunks.
+    """
+    parts = []
+
+    schema_overrides = {
+        col: pl.Utf8
+        for col in [
+            "FID",
+            "sample_id",
+            "father_id",
+            "mother_id",
+            "FatherBarcode",
+            "MotherBarcode",
+            "sample",
+        ]
+    }
+
+    processed = 0
+    chunk_idx = 0
+    for header, chunk in _line_iterator(path, chunk_size=chunk_size):
+        chunk_idx += 1
+        content = header + "".join(chunk)
+        try:
+            df_chunk = pl.read_csv(
+                io.StringIO(content), separator="\t", schema_overrides=schema_overrides
+            )
+        except Exception:
+            # Skip unparsable chunk
+            processed += len(chunk)
+            if progress_bar is not None:
+                progress_bar.update(len(chunk))
+            elif verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Building lookup: processed ~{processed} lines...", err=True
+                )
+            continue
+
+        try:
+            # Use minimal format for lookup building (skip annotation expansion)
+            formatted = format_bcftools_tsv_minimal(df_chunk, pedigree_df=None)
+        except Exception:
+            # If chunk cannot be parsed into variants, skip
+            processed += len(chunk)
+            if progress_bar is not None:
+                progress_bar.update(len(chunk))
+            elif verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Building lookup: processed ~{processed} lines...", err=True
+                )
+            continue
+
+        cols = [
+            "#CHROM",
+            "POS",
+            "REF",
+            "ALT",
+            "sample",
+            "sample_gt",
+            "sample_dp",
+            "sample_gq",
+            "sample_ad",
+            "sample_vaf",
+        ]
+        sel = [c for c in cols if c in formatted.columns]
+        # Optimization: Only store non-reference genotypes in lookup (skip 0/0)
+        part = formatted.select(sel).filter(pl.col("sample_gt") != "0/0").unique()
+        parts.append(part)
+
+        # Update progress
+        processed += len(chunk)
+        if progress_bar is not None:
+            progress_bar.update(len(chunk))
+        elif verbose and (chunk_idx % 10 == 0):
+            click.echo(f"Building lookup: processed ~{processed} lines...", err=True)
+
+    if parts:
+        lookup = pl.concat(parts).unique()
+    else:
+        lookup = pl.DataFrame([])
+
+    # processed currently counts number of data lines seen
+    return lookup, processed
+
+
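Note (not part of the diff): the lookup deliberately keeps only non-reference calls; a toy frame showing the effect of the != "0/0" filter plus unique().

    import polars as pl

    calls = pl.DataFrame({
        "#CHROM":    ["chr1", "chr1", "chr1"],
        "POS":       [1000, 1000, 2000],
        "sample":    ["FATHER01", "MOTHER01", "FATHER01"],
        "sample_gt": ["0/0", "0/1", "0/0"],
    })
    lookup = calls.filter(pl.col("sample_gt") != "0/0").unique()
    print(lookup)  # a single MOTHER01 row; homozygous-reference parents are never stored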
+def add_parent_genotypes_from_lookup(
+    df: pl.DataFrame, parent_lookup: pl.DataFrame
+) -> pl.DataFrame:
+    """Join father/mother genotype info from the parent_lookup into df.
+
+    Assumes df has columns: #CHROM, POS, REF, ALT, father_id, mother_id
+    """
+    join_cols = [c for c in ["#CHROM", "POS", "REF", "ALT"] if c in df.columns]
+
+    if parent_lookup.is_empty():
+        # Create empty parent columns
+        return df
+
+    # Prepare father lookup
+    father_lookup = parent_lookup.rename(
+        {
+            "sample": "father",
+            "sample_gt": "father_gt",
+            "sample_dp": "father_dp",
+            "sample_gq": "father_gq",
+            "sample_ad": "father_ad",
+            "sample_vaf": "father_vaf",
+        }
+    )
+
+    # Left join on join_cols + ['father']
+    if "father_id" in df.columns:
+        df = df.join(
+            father_lookup,
+            left_on=join_cols + ["father_id"],
+            right_on=join_cols + ["father"],
+            how="left",
+        )
+    else:
+        # No father id, attempt join on 'father' column
+        df = df.join(
+            father_lookup,
+            on=join_cols + ["father"],
+            how="left",
+        )
+
+    # Prepare mother lookup
+    mother_lookup = parent_lookup.rename(
+        {
+            "sample": "mother",
+            "sample_gt": "mother_gt",
+            "sample_dp": "mother_dp",
+            "sample_gq": "mother_gq",
+            "sample_ad": "mother_ad",
+            "sample_vaf": "mother_vaf",
+        }
+    )
+
+    if "mother_id" in df.columns:
+        df = df.join(
+            mother_lookup,
+            left_on=join_cols + ["mother_id"],
+            right_on=join_cols + ["mother"],
+            how="left",
+        )
+    else:
+        df = df.join(
+            mother_lookup,
+            on=join_cols + ["mother"],
+            how="left",
+        )
+
+    # Normalize '.' to '0' for DP/GQ like previous function
+    df = df.with_columns(
+        [
+            pl.when(pl.col("father_dp") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("father_dp"))
+            .alias("father_dp"),
+            pl.when(pl.col("father_gq") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("father_gq"))
+            .alias("father_gq"),
+            pl.when(pl.col("mother_dp") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("mother_dp"))
+            .alias("mother_dp"),
+            pl.when(pl.col("mother_gq") == ".")
+            .then(pl.lit("0"))
+            .otherwise(pl.col("mother_gq"))
+            .alias("mother_gq"),
+        ]
+    )
+
+    return df
+
+
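Note (not part of the diff): the rename-then-join trick above, replayed on a hypothetical one-row chunk. FATHER01 was reference so it never entered the lookup and its columns come back null; the mother's row joins through.

    import polars as pl

    chunk = pl.DataFrame({
        "#CHROM": ["chr1"], "POS": [1000],
        "sample": ["PROBAND01"], "father_id": ["FATHER01"], "mother_id": ["MOTHER01"],
    })
    lookup = pl.DataFrame({
        "#CHROM": ["chr1"], "POS": [1000], "sample": ["MOTHER01"],
        "sample_gt": ["0/1"], "sample_dp": ["25"], "sample_gq": ["60"],
        "sample_ad": ["12,13"], "sample_vaf": [0.52],
    })

    out = add_parent_genotypes_from_lookup(chunk, lookup)
    # father_gt is null (no lookup entry); mother_gt == "0/1" from the joined row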
+def process_with_progress(
+    input_path: Path,
+    output_prefix: str,
+    output_format: str,
+    pedigree_df: Optional[pl.DataFrame],
+    filter_config: Optional[dict],
+    verbose: bool,
+    chunk_size: int = 50000,
+):
+    """Process input in two passes and show a progress bar.
+
+    Pass 1: build parent lookup
+    Pass 2: process chunks, join parent genotypes, apply filters, and write incrementally
+    """
+    # tqdm optional
+    try:
+        from tqdm.auto import tqdm
+    except Exception:
+        tqdm = None
+
+    # Build parent lookup in a single pass (counts lines while building lookup)
+    if verbose:
+        click.echo("Pass 1: building parent genotype lookup (single pass)...", err=True)
+
+    pbar_lookup = None
+    if tqdm is not None:
+        # No known total yet; tqdm will show progress increasing
+        pbar_lookup = tqdm(desc="Building parent lookup", unit="lines")
+
+    parent_lookup, total_lines = build_parent_lookup_from_file(
+        input_path,
+        pedigree_df,
+        progress_bar=pbar_lookup,
+        verbose=verbose,
+        chunk_size=chunk_size,
+    )
+
+    if pbar_lookup is not None:
+        pbar_lookup.close()
+
+    if verbose:
+        click.echo(
+            f"Parent lookup contains {parent_lookup.shape[0]} genotype entries (from ~{total_lines} lines)",
+            err=True,
+        )
+
+    total_chunks = math.ceil(total_lines / chunk_size) if chunk_size > 0 else 1
+
+    # Prepare output paths
+    if output_format == "tsv":
+        out_path = Path(f"{output_prefix}.tsv")
+    elif output_format == "tsv.gz":
+        out_path = Path(f"{output_prefix}.tsv.gz")
+    else:
+        # We'll write parquet parts
+        out_path = Path(f"{output_prefix}")
+
+    first_write = True
+
+    # Iterate chunks and process
+    iterator = _line_iterator(input_path, chunk_size=chunk_size)
+    chunk_idx = 0
+
+    progress_bar = None
+    processed_lines = 0
+    if tqdm is not None:
+        progress_bar = tqdm(total=total_lines, desc="Processing variants")
+
+    for header, chunk in iterator:
+        chunk_idx += 1
+        chunk_count = len(chunk)
+
+        # Update progress at start of chunk to show we're working
+        if progress_bar is not None:
+            progress_bar.set_postfix_str(f"chunk {chunk_idx}")
+
+        content = header + "".join(chunk)
+        df_chunk = pl.read_csv(io.StringIO(content), separator="\t")
+
+        # Use MINIMAL format (skip annotation expansion for now)
+        try:
+            melted = format_bcftools_tsv_minimal(df_chunk, pedigree_df=None)
+        except Exception:
+            # If parse fails for chunk, skip
+            processed_lines += chunk_count
+            if progress_bar is not None:
+                progress_bar.update(chunk_count)
+            elif verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Processed {processed_lines}/{total_lines} lines...", err=True
+                )
+            continue
+
+        # Optimization #1: Early GT filtering for DNM mode - skip reference-only variants
+        if filter_config and filter_config.get("dnm", {}).get("enabled", False):
+            melted = melted.filter(
+                pl.col("sample_gt").str.contains("1")
+                | pl.col("sample_gt").str.contains("2")
+            )
+            if melted.shape[0] == 0:
+                # All variants filtered out, skip to next chunk
+                processed_lines += chunk_count
+                if progress_bar is not None:
+                    progress_bar.update(chunk_count)
+                continue
+
+        # Attach parent ids from pedigree
+        if pedigree_df is not None and "sample" in melted.columns:
+            melted = melted.join(
+                pedigree_df, left_on="sample", right_on="sample_id", how="left"
+            )
+
+        # Add parent genotypes from global lookup
+        melted = add_parent_genotypes_from_lookup(melted, parent_lookup)
+
+        # Apply filters BEFORE expanding annotations (key optimization)
+        if filter_config and filter_config.get("dnm", {}).get("enabled", False):
+            cfg = {}
+            cfg.update(filter_config.get("quality", {}))
+            cfg.update(filter_config.get("dnm", {}))
+            filtered = apply_de_novo_filter(
+                melted, cfg, verbose=False, pedigree_df=pedigree_df
+            )
+        else:
+            # Apply standard quality filters
+            filtered = melted
+            quality_cfg = filter_config.get("quality", {}) if filter_config else {}
+            if quality_cfg:
+                filtered = apply_quality_filters(filtered, quality_cfg, verbose=False)
+
+            # Apply expression filter if present
+            expr = filter_config.get("expression") if filter_config else None
+            if expr:
+                try:
+                    expr_parsed = parse_impact_filter_expression(expr, filtered)
+                    filtered = filtered.filter(expr_parsed)
+                except Exception:
+                    # If expression parsing fails on a chunk, skip applying it
+                    pass
+
+        # NOW expand annotations only for variants that passed filters
+        if filtered.shape[0] > 0 and "(null)" in filtered.columns:
+            if progress_bar is not None:
+                progress_bar.set_postfix_str(f"expanding annotations chunk {chunk_idx}")
+            filtered = format_expand_annotations(filtered)
+
+        # Update progress after filtering (before write)
+        if progress_bar is not None:
+            progress_bar.set_postfix_str(f"writing chunk {chunk_idx}")
+
+        # Write filtered chunk to file (skip if empty)
+        if filtered.shape[0] > 0:
+            if output_format in ("tsv", "tsv.gz"):
+                csv_text = filtered.write_csv(separator="\t")
+                # First write includes header; subsequent writes skip header
+                if first_write:
+                    write_text = csv_text
+                    first_write = False
+                    if output_format == "tsv.gz":
+                        with gzip.open(out_path, "wt") as f:
+                            f.write(write_text)
+                    else:
+                        with open(out_path, "wt") as f:
+                            f.write(write_text)
+                else:
+                    # Skip header
+                    tail = "\n".join(csv_text.splitlines()[1:])
+                    if output_format == "tsv.gz":
+                        with gzip.open(out_path, "at") as f:
+                            f.write("\n" + tail)
+                    else:
+                        with open(out_path, "at") as f:
+                            f.write("\n" + tail)
+            else:
+                # Parquet: write part file
+                part_path = out_path.with_suffix(f".part{chunk_idx}.parquet")
+                filtered.write_parquet(part_path)
+
+        if progress_bar is not None:
+            progress_bar.update(chunk_count)
+            progress_bar.set_postfix_str("")  # Clear status
+        else:
+            processed_lines += chunk_count
+            if verbose and (chunk_idx % 10 == 0):
+                click.echo(
+                    f"Processed {processed_lines}/{total_lines} lines...", err=True
+                )
+
+    if progress_bar is not None:
+        progress_bar.close()
+
+    if verbose:
+        click.echo("Processing complete.", err=True)
+
+
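Note (not part of the diff): a sketch of driving the two-pass path directly; argument names follow the signature above, and the paths are hypothetical.

    from pathlib import Path

    process_with_progress(
        input_path=Path("cohort.tsv.gz"),
        output_prefix="cohort_dnm",
        output_format="tsv.gz",
        pedigree_df=read_pedigree(Path("cohort.ped")),
        filter_config={"dnm": {"enabled": True}, "quality": {}},
        verbose=True,
    )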
 def apply_filters_lazy(
-    lazy_df: pl.LazyFrame, filter_config: dict, verbose: bool = False
+    lazy_df: pl.LazyFrame,
+    filter_config: dict,
+    verbose: bool = False,
+    pedigree_df: Optional[pl.DataFrame] = None,
 ) -> pl.LazyFrame:
     """Apply quality and expression filters using lazy operations."""
+    # If DNM mode is enabled, we need to collect and apply DNM logic
+    if filter_config.get("dnm", {}).get("enabled", False):
+        dnm_cfg = {}
+        dnm_cfg.update(filter_config.get("quality", {}))
+        dnm_cfg.update(filter_config.get("dnm", {}))
+
+        # Collect minimally and apply DNM filter eagerly, then return lazy frame
+        df = lazy_df.collect(streaming=True)
+        filtered_df = apply_de_novo_filter(
+            df, dnm_cfg, verbose, pedigree_df=pedigree_df
+        )
+        return filtered_df.lazy()
+
     quality_config = filter_config.get("quality", {})
     expression = filter_config.get("expression")
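Note (not part of the diff): how the lazy entry point now degrades to an eager pass when DNM is on. A sketch only; the scan_csv arguments and file names are illustrative, and ped is a pedigree frame from read_pedigree as shown earlier.

    import polars as pl

    lazy = pl.scan_csv("trio_formatted.tsv", separator="\t")
    cfg = {"dnm": {"enabled": True}, "quality": {"sample_dp_min": 10}}

    # With dnm.enabled, the frame is collected (streaming), filtered eagerly by
    # apply_de_novo_filter, then handed back as a LazyFrame for the writer.
    result = apply_filters_lazy(lazy, cfg, verbose=False, pedigree_df=ped).collect()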