pywombat-1.1.0-py3-none-any.whl → pywombat-1.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pywombat/cli.py CHANGED
@@ -264,6 +264,111 @@ def _process_chunk(
      return df
 
 
+ def process_dnm_by_chromosome(
+     input_file: Path,
+     pedigree_df: pl.DataFrame,
+     filter_config: dict,
+     output_format: str,
+     verbose: bool
+ ) -> pl.DataFrame:
+     """Process DNM filtering chromosome by chromosome to reduce memory usage.
+
+     Processes each chromosome separately:
+     1. Load one chromosome at a time from Parquet
+     2. Apply frequency/quality prefilters (before melting)
+     3. Melt samples
+     4. Apply DNM filters
+     5. Combine results from all chromosomes
+
+     This reduces peak memory from (total_variants × samples) to
+     (max_chr_variants × samples).
+
+     Args:
+         input_file: Path to Parquet file
+         pedigree_df: Pedigree DataFrame with sample relationships
+         filter_config: Filter configuration dict
+         output_format: Output format (tsv, tsv.gz, parquet)
+         verbose: Whether to print progress messages
+
+     Returns:
+         Combined DataFrame with DNM-filtered variants from all chromosomes
+     """
+     # Get list of chromosomes
+     chromosomes = get_unique_chromosomes(input_file)
+
+     if verbose:
+         click.echo(
+             f"DNM per-chromosome processing: {len(chromosomes)} chromosomes", err=True
+         )
+
+     results = []
+     dnm_cfg = {}
+     dnm_cfg.update(filter_config.get("quality", {}))
+     dnm_cfg.update(filter_config.get("dnm", {}))
+
+     for chrom in chromosomes:
+         if verbose:
+             click.echo(f"Processing chromosome {chrom}...", err=True)
+
+         # Load only this chromosome
+         lazy_df = pl.scan_parquet(input_file).filter(
+             pl.col("#CHROM") == chrom
+         )
+
+         # Apply frequency filters BEFORE melting (Optimization 2)
+         lazy_df = apply_dnm_prefilters(lazy_df, filter_config, verbose=False)
+
+         # Count variants after prefiltering
+         if verbose:
+             pre_count = lazy_df.select(pl.count()).collect().item()
+             click.echo(f" Chromosome {chrom}: {pre_count} variants after prefilter", err=True)
+
+         # Collect, melt, and apply DNM filters
+         df = lazy_df.collect()
+
+         if df.shape[0] == 0:
+             if verbose:
+                 click.echo(f" Chromosome {chrom}: No variants after prefilter, skipping", err=True)
+             continue
+
+         formatted_df = format_bcftools_tsv_minimal(df, pedigree_df)
+
+         if verbose:
+             click.echo(
+                 f" Chromosome {chrom}: {formatted_df.shape[0]} rows after melting", err=True
+             )
+
+         # Apply DNM filters (skip prefilters since already applied)
+         filtered_df = apply_de_novo_filter(
+             formatted_df, dnm_cfg, verbose=False, pedigree_df=pedigree_df,
+             skip_prefilters=True
+         )
+
+         if verbose:
+             click.echo(
+                 f" Chromosome {chrom}: {filtered_df.shape[0]} variants passed DNM filter", err=True
+             )
+
+         if filtered_df.shape[0] > 0:
+             results.append(filtered_df)
+
+     # Combine results
+     if not results:
+         if verbose:
+             click.echo("No variants passed DNM filters across all chromosomes", err=True)
+         # Return empty DataFrame with correct schema
+         return pl.DataFrame()
+
+     final_df = pl.concat(results)
+
+     if verbose:
+         click.echo(
+             f"DNM filtering complete: {final_df.shape[0]} total variants", err=True
+         )
+
+     return final_df
+
+
  @cli.command("filter")
  @click.argument("input_file", type=click.Path(exists=True, path_type=Path))
  @click.option(
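The new `process_dnm_by_chromosome` helper is the core of this release. Stripped of the DNM-specific steps, the pattern it relies on is a per-chromosome lazy scan followed by a concat; the following is a minimal standalone sketch of that pattern, where `per_chromosome_collect` and `predicate` are illustrative names rather than pywombat APIs:

import polars as pl


def per_chromosome_collect(parquet_path, predicate):
    """Collect a filtered Parquet file one chromosome at a time.

    Peak memory is bounded by the largest single chromosome rather than
    the whole file, which is the same idea process_dnm_by_chromosome uses.
    """
    # Distinct chromosome values, read lazily from the #CHROM column only.
    chroms = (
        pl.scan_parquet(parquet_path)
        .select("#CHROM")
        .unique()
        .collect()["#CHROM"]
        .to_list()
    )

    parts = []
    for chrom in chroms:
        part = (
            pl.scan_parquet(parquet_path)
            .filter(pl.col("#CHROM") == chrom)  # load only this chromosome
            .filter(predicate)                  # variant-level filter, applied while still wide
            .collect()
        )
        if part.height > 0:
            parts.append(part)

    return pl.concat(parts) if parts else pl.DataFrame()

Calling it with a hypothetical predicate such as pl.col("FILTER") == "PASS" would keep only passing variants while never holding more than one chromosome of rows in memory at once.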
@@ -391,6 +496,42 @@ def filter_cmd(
  # Parquet input: INFO fields already expanded by 'wombat prepare'
  lazy_df = pl.scan_parquet(input_file)
 
+ # Check if DNM mode is enabled - use per-chromosome processing
+ if filter_config_data and filter_config_data.get("dnm", {}).get("enabled", False):
+     if verbose:
+         click.echo("DNM mode: Using per-chromosome processing for memory efficiency", err=True)
+
+     # DNM requires pedigree
+     if pedigree_df is None:
+         click.echo("Error: DNM filtering requires a pedigree file (--pedigree option)", err=True)
+         raise click.Abort()
+
+     # Process DNM filtering chromosome by chromosome
+     formatted_df = process_dnm_by_chromosome(
+         input_file,
+         pedigree_df,
+         filter_config_data,
+         output_format,
+         verbose
+     )
+
+     # Write output directly
+     output_path = Path(f"{output}.{output_format}")
+
+     if output_format == "tsv":
+         formatted_df.write_csv(output_path, separator="\t")
+     elif output_format == "tsv.gz":
+         csv_content = formatted_df.write_csv(separator="\t")
+         with gzip.open(output_path, "wt") as f:
+             f.write(csv_content)
+     elif output_format == "parquet":
+         formatted_df.write_parquet(output_path)
+
+     if verbose:
+         click.echo(f"DNM variants written to {output_path}", err=True)
+
+     return
+
  # OPTIMIZATION: Apply expression filter BEFORE melting
  # Expression filters (VEP_IMPACT, etc.) don't depend on sample data
  if filter_config_data and "expression" in filter_config_data:
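The branch added above only triggers when the loaded filter configuration enables DNM mode. Pieced together from the keys this diff reads, a configuration that takes this code path would look roughly like the following; the threshold value is illustrative and the contents of the quality section are not shown in this diff:

# Hypothetical filter configuration, shown as the Python dict the CLI works with
# after the user's YAML file has been loaded.
filter_config_data = {
    "dnm": {
        "enabled": True,  # routes filter_cmd into process_dnm_by_chromosome
        "fafmax_faf95_max_genomes_max": 0.0001,  # illustrative threshold, not a project default
        "genomes_filters_pass_only": True,  # keep rows where genomes_filters is "." or null
    },
    "quality": {},  # merged into dnm_cfg before apply_de_novo_filter; keys not shown in this diff
}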
@@ -800,11 +941,47 @@ def _pos_in_par(chrom: str, pos: int, par_regions: dict) -> bool:
      return False
 
 
+ def get_unique_chromosomes(parquet_file: Path) -> list[str]:
+     """Get list of unique chromosomes from Parquet file, sorted naturally.
+
+     Args:
+         parquet_file: Path to Parquet file
+
+     Returns:
+         Sorted list of chromosome names (e.g., ['1', '2', ..., '22', 'X', 'Y', 'MT'])
+     """
+     # Read just the #CHROM column to get unique values
+     df = pl.scan_parquet(parquet_file).select("#CHROM").unique().collect()
+     chroms = df["#CHROM"].to_list()
+
+     # Sort chromosomes properly (1, 2, ..., 22, X, Y, MT)
+     def chrom_sort_key(chrom: str) -> tuple:
+         """Sort key for natural chromosome ordering."""
+         chrom_norm = chrom.replace("chr", "").replace("Chr", "").replace("CHR", "").upper()
+
+         # Try to parse as integer (autosomes)
+         try:
+             return (0, int(chrom_norm), "")
+         except ValueError:
+             pass
+
+         # Sex chromosomes and mitochondrial
+         if chrom_norm in ["X", "Y", "MT", "M"]:
+             order = {"X": 23, "Y": 24, "MT": 25, "M": 25}
+             return (1, order.get(chrom_norm, 99), chrom_norm)
+
+         # Other chromosomes (e.g., scaffolds)
+         return (2, 0, chrom_norm)
+
+     return sorted(chroms, key=chrom_sort_key)
+
+
  def apply_de_novo_filter(
      df: pl.DataFrame,
      dnm_config: dict,
      verbose: bool = False,
      pedigree_df: Optional[pl.DataFrame] = None,
+     skip_prefilters: bool = False,
  ) -> pl.DataFrame:
      """Apply de novo detection filters to dataframe using vectorized operations.
@@ -815,6 +992,13 @@ def apply_de_novo_filter(
 
      This function will read `sex` from `df` when present; otherwise it will use
      the `pedigree_df` (which should contain `sample_id` and `sex`).
+
+     Args:
+         df: DataFrame with melted samples
+         dnm_config: DNM configuration dict
+         verbose: Whether to print progress messages
+         pedigree_df: Pedigree DataFrame
+         skip_prefilters: If True, skips frequency/genomes_filters (assumes already applied)
      """
      if not dnm_config:
          return df
@@ -979,43 +1163,45 @@ def apply_de_novo_filter(
              err=True,
          )
 
-     # Apply fafmax_faf95_max_genomes filter if specified
-     if fafmax_max is not None:
-         if "fafmax_faf95_max_genomes" in df.columns:
-             df = df.filter(
-                 (
-                     pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False)
-                     <= fafmax_max
+     # Apply frequency/quality prefilters if not already applied
+     if not skip_prefilters:
+         # Apply fafmax_faf95_max_genomes filter if specified
+         if fafmax_max is not None:
+             if "fafmax_faf95_max_genomes" in df.columns:
+                 df = df.filter(
+                     (
+                         pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False)
+                         <= fafmax_max
+                     )
+                     | pl.col("fafmax_faf95_max_genomes").is_null()
                  )
-                 | pl.col("fafmax_faf95_max_genomes").is_null()
-             )
-             if verbose:
+                 if verbose:
+                     click.echo(
+                         f"DNM: {df.shape[0]} variants after fafmax_faf95_max_genomes filter (<={fafmax_max})",
+                         err=True,
+                     )
+             elif verbose:
                  click.echo(
-                     f"DNM: {df.shape[0]} variants after fafmax_faf95_max_genomes filter (<={fafmax_max})",
+                     "DNM: Warning - fafmax_faf95_max_genomes column not found, skipping frequency filter",
                      err=True,
                  )
-         elif verbose:
-             click.echo(
-                 "DNM: Warning - fafmax_faf95_max_genomes column not found, skipping frequency filter",
-                 err=True,
-             )
 
-     # Apply genomes_filters filter if specified
-     if genomes_filters_pass_only:
-         if "genomes_filters" in df.columns:
-             df = df.filter(
-                 (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
-             )
-             if verbose:
+         # Apply genomes_filters filter if specified
+         if genomes_filters_pass_only:
+             if "genomes_filters" in df.columns:
+                 df = df.filter(
+                     (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
+                 )
+                 if verbose:
+                     click.echo(
+                         f"DNM: {df.shape[0]} variants after genomes_filters filter (pass only)",
+                         err=True,
+                     )
+             elif verbose:
                  click.echo(
-                     f"DNM: {df.shape[0]} variants after genomes_filters filter (pass only)",
+                     "DNM: Warning - genomes_filters column not found, skipping genomes_filters filter",
                      err=True,
                  )
-         elif verbose:
-             click.echo(
-                 "DNM: Warning - genomes_filters column not found, skipping genomes_filters filter",
-                 err=True,
-             )
 
      # Build parent quality checks (common to all)
      father_qual_ok = (pl.col("father_dp").cast(pl.Float64, strict=False) >= p_dp) & (
@@ -2293,6 +2479,55 @@ def process_with_progress(
      click.echo("Processing complete.", err=True)
 
 
+ def apply_dnm_prefilters(
+     lazy_df: pl.LazyFrame,
+     filter_config: dict,
+     verbose: bool = False
+ ) -> pl.LazyFrame:
+     """Apply variant-level DNM filters before melting.
+
+     These filters don't require sample-level data and can be applied
+     on wide-format data to reduce memory usage.
+
+     Applies:
+     - Population frequency filters (fafmax_faf95_max_genomes_max)
+     - Quality filters (genomes_filters PASS only)
+
+     Args:
+         lazy_df: LazyFrame with wide-format data (not melted)
+         filter_config: Filter configuration dict
+         verbose: Whether to print progress messages
+
+     Returns:
+         Filtered LazyFrame
+     """
+     dnm_config = filter_config.get("dnm", {})
+
+     # Frequency filter
+     fafmax_max = dnm_config.get("fafmax_faf95_max_genomes_max")
+     if fafmax_max is not None:
+         lazy_df = lazy_df.filter(
+             (pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False) <= fafmax_max)
+             | pl.col("fafmax_faf95_max_genomes").is_null()
+         )
+         if verbose:
+             click.echo(
+                 f"DNM prefilter: Applied frequency filter (fafmax <= {fafmax_max})", err=True
+             )
+
+     # Quality filter (genomes_filters PASS only)
+     if dnm_config.get("genomes_filters_pass_only", False):
+         lazy_df = lazy_df.filter(
+             (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
+         )
+         if verbose:
+             click.echo(
+                 "DNM prefilter: Applied genomes_filters PASS filter", err=True
+             )
+
+     return lazy_df
+
+
  def apply_filters_lazy(
      lazy_df: pl.LazyFrame,
      filter_config: dict,
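A small self-contained illustration of what these prefilter expressions do on wide-format data; the values and the 0.001 threshold are toy examples, not real gnomAD numbers:

import polars as pl

# Toy wide-format variant table with the two columns the prefilter inspects.
lf = pl.LazyFrame({
    "#CHROM": ["1", "1", "2"],
    "fafmax_faf95_max_genomes": ["0.00001", None, "0.2"],
    "genomes_filters": [".", None, "AC0"],
})

fafmax_max = 0.001  # illustrative threshold

# Same shape of expressions as apply_dnm_prefilters: keep rare-or-missing
# frequency values and PASS ("." or null) genomes_filters, all before melting.
filtered = (
    lf.filter(
        (pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False) <= fafmax_max)
        | pl.col("fafmax_faf95_max_genomes").is_null()
    )
    .filter((pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null())
    .collect()
)
print(filtered)  # the two chromosome-1 rows survive; the common, non-PASS row on 2 is dropped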
pywombat-1.1.0.dist-info/METADATA → pywombat-1.2.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pywombat
- Version: 1.1.0
+ Version: 1.2.1
  Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
  Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
  Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
@@ -16,6 +16,7 @@ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
  Requires-Python: >=3.12
  Requires-Dist: click>=8.1.0
  Requires-Dist: polars>=0.19.0
+ Requires-Dist: pyarrow>=14.0.0
  Requires-Dist: pyyaml>=6.0
  Requires-Dist: tqdm>=4.67.1
  Provides-Extra: dev
@@ -598,8 +599,15 @@ Each configuration file is fully documented with:
  2. **Parquet format benefits**:
  - Columnar storage enables selective column loading
  - Pre-filtering before melting (expression filters applied before expanding to per-sample rows)
+ - **Per-chromosome processing for DNM**: Automatically processes DNM filtering chromosome-by-chromosome
  - 30% smaller file size vs gzipped TSV
 
+ 3. **De Novo Mutation (DNM) filtering optimization**:
+ - Automatically uses per-chromosome processing when DNM mode is enabled
+ - Processes one chromosome at a time to reduce peak memory
+ - Applies frequency filters before melting to reduce data expansion
+ - Example: 38-sample family with 4.2M variants completes in 20 seconds with ~24GB RAM (vs 200GB+ OOM failure)
+
  ### For All Files
 
  3. **Pre-filter with bcftools**: Filter by region/gene before PyWombat
@@ -608,12 +616,23 @@ Each configuration file is fully documented with:
 
  ### Memory Comparison
 
+ **Expression Filtering** (e.g., VEP_IMPACT filters):
+
  | Approach | 38 samples, 4.2M variants | Memory | Time |
  |----------|---------------------------|--------|------|
  | Direct TSV | ❌ OOM (>200GB) | 200+ GB | Failed |
  | TSV with chunking | ⚠️ Slow | ~30GB | ~3 min |
  | **Parquet + pre-filter** | ✅ **Optimal** | **~1.2GB** | **<1 sec** |
 
+ **De Novo Mutation (DNM) Filtering**:
+
+ | Approach | 38 samples, 4.2M variants | Memory | Time | Result |
+ |----------|---------------------------|--------|------|--------|
+ | Without optimization | ❌ OOM (>200GB) | 200+ GB | Failed | N/A |
+ | **Parquet + per-chromosome** | ✅ **Success** | **~24GB** | **20 sec** | **6,788 DNM variants** |
+
+ *DNM filtering requires sample-level data (cannot pre-filter before melting), but per-chromosome processing reduces peak memory by 88%.*
+
  ---
 
  ## Development
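The figures in the two comparison tables above are easy to sanity-check: melting expands the data to one row per variant per sample, and the per-chromosome path only changes how much of that expansion exists at any one time.

# Back-of-the-envelope check of the numbers quoted in the memory comparison.
total_variants = 4_200_000
samples = 38
print(f"{total_variants * samples:,} melted rows if done in one pass")  # 159,600,000

peak_before_gb, peak_after_gb = 200, 24  # from the DNM table
print(f"peak memory reduction: {1 - peak_after_gb / peak_before_gb:.0%}")  # 88%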
pywombat-1.2.1.dist-info/RECORD ADDED
@@ -0,0 +1,6 @@
+ pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
+ pywombat/cli.py,sha256=pEPvUTww5Nvj-WqSRZ0QEePnORrcYkhWJv3uVi5DnxM,93728
+ pywombat-1.2.1.dist-info/METADATA,sha256=62AZedUVlnxki2ZrSWQ5v2Eh0WGFZf5Lb8dlG_u6h3w,21337
+ pywombat-1.2.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ pywombat-1.2.1.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
+ pywombat-1.2.1.dist-info/RECORD,,
pywombat-1.1.0.dist-info/RECORD REMOVED
@@ -1,6 +0,0 @@
- pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
- pywombat/cli.py,sha256=-tzD2UJxlByP8aE5uSZ1C6UvgoriJqPMXRNs7xY65nE,85545
- pywombat-1.1.0.dist-info/METADATA,sha256=lYL6me-3Cw1wDa_yFdRX5Qj4cre6GMpY3Uqjy0LRwLg,20289
- pywombat-1.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- pywombat-1.1.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
- pywombat-1.1.0.dist-info/RECORD,,