pywombat 1.0.2__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pywombat/cli.py CHANGED
@@ -11,13 +11,371 @@ import polars as pl
  import yaml


- @click.command()
+ @click.group()
+ def cli():
+     """
+     Wombat: A tool for processing bcftools tabulated TSV files.
+
+     \b
+     Commands:
+         filter   Process and filter variant data
+         prepare  Convert TSV to optimized Parquet format
+     """
+     pass
+
+
+ @cli.command("prepare")
+ @click.argument("input_file", type=click.Path(exists=True, path_type=Path))
+ @click.option(
+     "-o",
+     "--output",
+     type=click.Path(path_type=Path),
+     required=True,
+     help="Output Parquet file path.",
+ )
+ @click.option(
+     "--chunk-size",
+     type=int,
+     default=50000,
+     help="Number of rows to process at a time (default: 50000).",
+ )
+ @click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
+ def prepare_cmd(
+     input_file: Path,
+     output: Path,
+     chunk_size: int,
+     verbose: bool,
+ ):
+     """
+     Convert bcftools TSV to optimized Parquet format.
+
+     This command pre-processes a TSV file by:
+
+     \b
+     1. Extracting all INFO fields from the '(null)' column into separate columns
+     2. Applying memory-efficient data types (Categorical for CHROM, UInt32 for POS)
+     3. Writing to Parquet format for efficient columnar access
+
+     The output Parquet file can then be used with 'wombat filter' for much faster
+     and more memory-efficient filtering, especially for large files.
+
+     \b
+     Examples:
+         wombat prepare input.tsv.gz -o prepared.parquet
+         wombat prepare input.tsv.gz -o prepared.parquet --chunk-size 100000
+     """
+     try:
+         if verbose:
+             click.echo(f"Preparing {input_file} -> {output}", err=True)
+
+         # Ensure output has .parquet extension
+         if not str(output).endswith(".parquet"):
+             output = Path(f"{output}.parquet")
+
+         # Process the file
+         prepare_parquet(input_file, output, chunk_size, verbose)
+
+         if verbose:
+             click.echo(f"Successfully created {output}", err=True)
+
+     except Exception as e:
+         click.echo(f"Error: {e}", err=True)
+         raise click.Abort()
+
+
+ def prepare_parquet(
+     input_file: Path,
+     output: Path,
+     chunk_size: int = 50000,
+     verbose: bool = False,
+ ) -> None:
+     """
+     Convert a bcftools TSV file to Parquet with pre-expanded INFO fields.
+
+     Processes the file in chunks to handle large files without running out of memory.
+
+     Args:
+         input_file: Path to input TSV or TSV.gz file
+         output: Path to output Parquet file
+         chunk_size: Number of rows to process per chunk
+         verbose: Whether to print progress
+     """
+     from tqdm import tqdm
+
+     # First pass: discover all INFO fields
+     if verbose:
+         click.echo("Pass 1: Discovering INFO fields...", err=True)
+
+     all_fields = set()
+     all_flags = set()
+     total_lines = 0
+
+     is_gzipped = str(input_file).endswith(".gz")
+     opener = gzip.open if is_gzipped else open
+
+     with opener(input_file, "rt") as f:
+         header_line = f.readline().strip()
+         header_cols = header_line.split("\t")
+
+         # Find the (null) column index dynamically
+         null_col_idx = None
+         for i, col in enumerate(header_cols):
+             if col == "(null)":
+                 null_col_idx = i
+                 break
+
+         if null_col_idx is None:
+             if verbose:
+                 click.echo("Warning: No (null) column found in input", err=True)
+         else:
+             for line in tqdm(f, desc="Scanning", disable=not verbose):
+                 total_lines += 1
+                 parts = line.split("\t")
+                 if len(parts) > null_col_idx:
+                     null_value = parts[null_col_idx]
+                     if null_value and null_value != ".":
+                         pairs = null_value.split(";")
+                         for pair in pairs:
+                             if "=" in pair:
+                                 field_name = pair.split("=", 1)[0]
+                                 all_fields.add(field_name)
+                             elif pair.strip():
+                                 all_flags.add(pair.strip())
+
+     if verbose:
+         click.echo(
+             f"Found {len(all_fields)} key-value fields and {len(all_flags)} flags in {total_lines} variants",
+             err=True,
+         )
+
+     # Second pass: process chunks and write Parquet
+     if verbose:
+         click.echo("Pass 2: Converting to Parquet...", err=True)
+
+     # Define memory-efficient dtypes
+     dtype_overrides = {
+         "#CHROM": pl.Categorical,
+         "POS": pl.UInt32,
+         "FILTER": pl.Categorical,
+     }
+
+     # Create a temporary directory for chunk files
+     import tempfile
+     import shutil
+
+     temp_dir = Path(tempfile.mkdtemp(prefix="wombat_prepare_"))
+     part_files = []
+
+     try:
+         with opener(input_file, "rt") as f:
+             header_line = f.readline().strip()
+
+             # Process in chunks
+             chunk_lines = []
+             pbar = tqdm(total=total_lines, desc="Converting", disable=not verbose)
+
+             for line in f:
+                 chunk_lines.append(line)
+                 if len(chunk_lines) >= chunk_size:
+                     df_chunk = _process_chunk(
+                         header_line, chunk_lines, all_fields, all_flags, dtype_overrides
+                     )
+                     part_file = temp_dir / f"part_{len(part_files):06d}.parquet"
+                     df_chunk.write_parquet(part_file)
+                     part_files.append(part_file)
+                     pbar.update(len(chunk_lines))
+                     chunk_lines = []
+
+             # Process remaining lines
+             if chunk_lines:
+                 df_chunk = _process_chunk(
+                     header_line, chunk_lines, all_fields, all_flags, dtype_overrides
+                 )
+                 part_file = temp_dir / f"part_{len(part_files):06d}.parquet"
+                 df_chunk.write_parquet(part_file)
+                 part_files.append(part_file)
+                 pbar.update(len(chunk_lines))
+
+             pbar.close()
+
+         # Combine all parts into final output using lazy scanning
+         if verbose:
+             click.echo(f"Combining {len(part_files)} parts into final output...", err=True)
+
+         if part_files:
+             # Use scan_parquet to lazily read all parts and write combined output
+             combined = pl.scan_parquet(part_files).collect()
+             combined.write_parquet(output)
+
+             if verbose:
+                 click.echo(f"Wrote {len(part_files)} chunks to {output}", err=True)
+
+     finally:
+         # Clean up temporary directory
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+
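Pass 1 above is a plain split-and-classify over the `(null)` INFO strings: `key=value` entries become fields, bare tokens become flags. A standalone sketch of that step, using a made-up INFO value:

```python
# Standalone sketch of the Pass 1 field discovery above;
# the INFO string is invented (the real code reads the '(null)' column).
all_fields, all_flags = set(), set()
for pair in "AF=0.001;DP=30;DB;STR".split(";"):
    if "=" in pair:
        all_fields.add(pair.split("=", 1)[0])  # key=value entry -> field name
    elif pair.strip():
        all_flags.add(pair.strip())            # bare token -> boolean flag

print(sorted(all_fields), sorted(all_flags))   # ['AF', 'DP'] ['DB', 'STR']
```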
+ def _process_chunk(
+     header: str,
+     lines: list,
+     fields: set,
+     flags: set,
+     dtype_overrides: dict,
+ ) -> pl.DataFrame:
+     """Process a chunk of lines into a DataFrame with expanded INFO fields."""
+     import io
+
+     content = header + "\n" + "".join(lines)
+     df = pl.read_csv(
+         io.StringIO(content),
+         separator="\t",
+         infer_schema_length=10000,
+     )
+
+     # Expand INFO fields from (null) column
+     if "(null)" in df.columns:
+         # Extract key-value fields
+         for field in sorted(fields):
+             df = df.with_columns(
+                 pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
+             )
+
+         # Extract boolean flags
+         for flag in sorted(flags):
+             df = df.with_columns(
+                 pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag)
+             )
+
+         # Drop the original (null) column
+         df = df.drop("(null)")
+
+     # Drop CSQ column if it exists (redundant after expansion)
+     if "CSQ" in df.columns:
+         df = df.drop("CSQ")
+
+     # Apply memory-efficient dtypes
+     for col, dtype in dtype_overrides.items():
+         if col in df.columns:
+             try:
+                 df = df.with_columns(pl.col(col).cast(dtype))
+             except Exception:
+                 pass  # Skip if cast fails
+
+     return df
+
+
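The INFO expansion in `_process_chunk` comes down to two Polars string expressions: `str.extract` for `key=value` fields and `str.contains` for bare flags. A minimal, self-contained sketch (the `(null)` values here are invented):

```python
import polars as pl

# Invented data; the real input comes from bcftools' '(null)' column.
df = pl.DataFrame({"(null)": ["AF=0.001;DB;DP=30", "AF=0.2;DP=7", "."]})
df = df.with_columns(
    pl.col("(null)").str.extract(r"AF=([^;]+)").alias("AF"),     # key=value field
    pl.col("(null)").str.contains(r"(^|;)DB(;|$)").alias("DB"),  # flag -> True/False
)
print(df)
```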
+ def process_dnm_by_chromosome(
+     input_file: Path,
+     pedigree_df: pl.DataFrame,
+     filter_config: dict,
+     output_format: str,
+     verbose: bool
+ ) -> pl.DataFrame:
+     """Process DNM filtering chromosome by chromosome to reduce memory usage.
+
+     Processes each chromosome separately:
+     1. Load one chromosome at a time from Parquet
+     2. Apply frequency/quality prefilters (before melting)
+     3. Melt samples
+     4. Apply DNM filters
+     5. Combine results from all chromosomes
+
+     This reduces peak memory from (total_variants × samples) to
+     (max_chr_variants × samples).
+
+     Args:
+         input_file: Path to Parquet file
+         pedigree_df: Pedigree DataFrame with sample relationships
+         filter_config: Filter configuration dict
+         output_format: Output format (tsv, tsv.gz, parquet)
+         verbose: Whether to print progress messages
+
+     Returns:
+         Combined DataFrame with DNM-filtered variants from all chromosomes
+     """
+     # Get list of chromosomes
+     chromosomes = get_unique_chromosomes(input_file)
+
+     if verbose:
+         click.echo(
+             f"DNM per-chromosome processing: {len(chromosomes)} chromosomes", err=True
+         )
+
+     results = []
+     dnm_cfg = {}
+     dnm_cfg.update(filter_config.get("quality", {}))
+     dnm_cfg.update(filter_config.get("dnm", {}))
+
+     for chrom in chromosomes:
+         if verbose:
+             click.echo(f"Processing chromosome {chrom}...", err=True)
+
+         # Load only this chromosome
+         lazy_df = pl.scan_parquet(input_file).filter(
+             pl.col("#CHROM") == chrom
+         )
+
+         # Apply frequency filters BEFORE melting (Optimization 2)
+         lazy_df = apply_dnm_prefilters(lazy_df, filter_config, verbose=False)
+
+         # Count variants after prefiltering
+         if verbose:
+             pre_count = lazy_df.select(pl.count()).collect().item()
+             click.echo(f" Chromosome {chrom}: {pre_count} variants after prefilter", err=True)
+
+         # Collect, melt, and apply DNM filters
+         df = lazy_df.collect()
+
+         if df.shape[0] == 0:
+             if verbose:
+                 click.echo(f" Chromosome {chrom}: No variants after prefilter, skipping", err=True)
+             continue
+
+         formatted_df = format_bcftools_tsv_minimal(df, pedigree_df)
+
+         if verbose:
+             click.echo(
+                 f" Chromosome {chrom}: {formatted_df.shape[0]} rows after melting", err=True
+             )
+
+         # Apply DNM filters (skip prefilters since already applied)
+         filtered_df = apply_de_novo_filter(
+             formatted_df, dnm_cfg, verbose=False, pedigree_df=pedigree_df,
+             skip_prefilters=True
+         )
+
+         if verbose:
+             click.echo(
+                 f" Chromosome {chrom}: {filtered_df.shape[0]} variants passed DNM filter", err=True
+             )
+
+         if filtered_df.shape[0] > 0:
+             results.append(filtered_df)
+
+     # Combine results
+     if not results:
+         if verbose:
+             click.echo("No variants passed DNM filters across all chromosomes", err=True)
+         # Return empty DataFrame with correct schema
+         return pl.DataFrame()
+
+     final_df = pl.concat(results)
+
+     if verbose:
+         click.echo(
+             f"DNM filtering complete: {final_df.shape[0]} total variants", err=True
+         )
+
+     return final_df
+
+
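Stripped to its core, the per-chromosome loop above scans the same Parquet file once per chromosome and holds only the matching rows in memory at a time. A sketch of that pattern (the file path and chromosome list are placeholders):

```python
import polars as pl

results = []
for chrom in ["chr1", "chr2"]:  # in the real code: get_unique_chromosomes(...)
    df = (
        pl.scan_parquet("prepared.parquet")   # placeholder path
        .filter(pl.col("#CHROM") == chrom)    # load one chromosome only
        .collect()
    )
    if df.height > 0:
        results.append(df)

final = pl.concat(results) if results else pl.DataFrame()
```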
+ @cli.command("filter")
  @click.argument("input_file", type=click.Path(exists=True, path_type=Path))
  @click.option(
      "-o",
      "--output",
      type=str,
-     help="Output file prefix. If not specified, prints to stdout.",
+     help="Output file prefix. If not specified, generates from input filename.",
  )
  @click.option(
      "-f",
@@ -43,9 +401,9 @@ import yaml
  @click.option(
      "--debug",
      type=str,
-     help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013). Displays #CHROM, POS, VEP_SYMBOL, and columns from filter expression.",
+     help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013).",
  )
- def cli(
+ def filter_cmd(
      input_file: Path,
      output: Optional[str],
      output_format: str,
@@ -55,36 +413,43 @@ def cli(
      debug: Optional[str],
  ):
      """
-     Wombat: A tool for processing bcftools tabulated TSV files.
+     Process and filter variant data from TSV or Parquet files.

-     This command:
+     \b
+     Supports two input formats:
+     - TSV/TSV.gz: Full processing (INFO expansion + melting)
+     - Parquet: Fast processing (melting only, INFO already expanded)

      \b
-     1. Expands the '(null)' column containing NAME=value pairs separated by ';'
-     2. Preserves the CSQ (Consequence) column without melting
-     3. Melts sample columns into rows with sample names
-     4. Splits sample values (GT:DP:GQ:AD format) into separate columns:
-        - sample_gt: Genotype
-        - sample_dp: Read depth
-        - sample_gq: Genotype quality
-        - sample_ad: Allele depth (second value from comma-separated list)
-        - sample_vaf: Variant allele frequency (sample_ad / sample_dp)
+     For large files, use 'wombat prepare' first to convert to Parquet,
+     then use 'wombat filter' on the Parquet file for better performance.
+
+     \b
+     This command:
+     1. Expands the '(null)' column (TSV only) into separate columns
+     2. Melts sample columns into rows with sample names
+     3. Splits sample values (GT:DP:GQ:AD format) into separate columns
+     4. Applies quality and expression filters (if config provided)

      \b
      Examples:
-         wombat input.tsv -o output
-         wombat input.tsv -o output -f parquet
-         wombat input.tsv > output.tsv
+         wombat filter input.tsv -o output
+         wombat filter prepared.parquet -o output -f parquet
+         wombat filter input.tsv -p pedigree.tsv -F config.yml
      """
      try:
          if verbose:
              click.echo(f"Reading input file: {input_file}", err=True)

-         # Detect if file is gzipped based on extension
+         # Detect input format
+         is_parquet = str(input_file).endswith(".parquet")
          is_gzipped = str(input_file).endswith(".gz")

-         if verbose and is_gzipped:
-             click.echo("Detected gzipped file", err=True)
+         if verbose:
+             if is_parquet:
+                 click.echo("Detected Parquet input (pre-processed)", err=True)
+             elif is_gzipped:
+                 click.echo("Detected gzipped TSV file", err=True)

          # Read pedigree file if provided
          pedigree_df = None
@@ -109,11 +474,11 @@ def cli(
          if output is None:
              # Generate default output prefix from input filename
              input_stem = input_file.name
-             # Remove .tsv.gz or .tsv extension
-             if input_stem.endswith(".tsv.gz"):
-                 input_stem = input_stem[:-7]  # Remove .tsv.gz
-             elif input_stem.endswith(".tsv"):
-                 input_stem = input_stem[:-4]  # Remove .tsv
+             # Remove known extensions
+             for ext in [".tsv.gz", ".tsv", ".parquet"]:
+                 if input_stem.endswith(ext):
+                     input_stem = input_stem[: -len(ext)]
+                     break

              # Add config name if filter is provided
              if filter_config:
@@ -126,24 +491,103 @@ def cli(
          if verbose:
              click.echo("Processing with streaming mode...", err=True)

-         # Build lazy query
-         # Force certain columns to string type
-         string_columns = [
-             "FID",
-             "sample_id",
-             "father_id",
-             "mother_id",
-             "FatherBarcode",
-             "MotherBarcode",
-             "sample",
-         ]
-         schema_overrides = {col: pl.Utf8 for col in string_columns}
-         lazy_df = pl.scan_csv(
-             input_file, separator="\t", schema_overrides=schema_overrides
-         )
+         # Build lazy query based on input format
+         if is_parquet:
+             # Parquet input: INFO fields already expanded by 'wombat prepare'
+             lazy_df = pl.scan_parquet(input_file)
+
+             # Check if DNM mode is enabled - use per-chromosome processing
+             if filter_config_data and filter_config_data.get("dnm", {}).get("enabled", False):
+                 if verbose:
+                     click.echo("DNM mode: Using per-chromosome processing for memory efficiency", err=True)
+
+                 # DNM requires pedigree
+                 if pedigree_df is None:
+                     click.echo("Error: DNM filtering requires a pedigree file (--pedigree option)", err=True)
+                     raise click.Abort()
+
+                 # Process DNM filtering chromosome by chromosome
+                 formatted_df = process_dnm_by_chromosome(
+                     input_file,
+                     pedigree_df,
+                     filter_config_data,
+                     output_format,
+                     verbose
+                 )
+
+                 # Write output directly
+                 output_path = Path(f"{output}.{output_format}")
+
+                 if output_format == "tsv":
+                     formatted_df.write_csv(output_path, separator="\t")
+                 elif output_format == "tsv.gz":
+                     csv_content = formatted_df.write_csv(separator="\t")
+                     with gzip.open(output_path, "wt") as f:
+                         f.write(csv_content)
+                 elif output_format == "parquet":
+                     formatted_df.write_parquet(output_path)
+
+                 if verbose:
+                     click.echo(f"DNM variants written to {output_path}", err=True)
+
+                 return
+
+             # OPTIMIZATION: Apply expression filter BEFORE melting
+             # Expression filters (VEP_IMPACT, etc.) don't depend on sample data
+             if filter_config_data and "expression" in filter_config_data:
+                 expression = filter_config_data["expression"]
+                 if expression and verbose:
+                     click.echo(
+                         f"Applying expression filter before melting: {expression}",
+                         err=True,
+                     )
+
+                 # Collect a small sample to get schema for expression parsing
+                 schema_df = lazy_df.head(1).collect()
+                 try:
+                     filter_expr = parse_impact_filter_expression(expression, schema_df)
+                     lazy_df = lazy_df.filter(filter_expr)
+
+                     # Count filtered variants
+                     if verbose:
+                         filtered_count = lazy_df.select(pl.len()).collect().item()
+                         click.echo(
+                             f"Variants after expression filter: {filtered_count}",
+                             err=True,
+                         )
+                 except ValueError as e:
+                     if verbose:
+                         click.echo(
+                             f"Warning: Could not apply early filter: {e}", err=True
+                         )

-         # Apply formatting transformations
-         lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
+             # Now collect and melt (on filtered variants only)
+             df = lazy_df.collect()
+             formatted_df = format_bcftools_tsv_minimal(df, pedigree_df)
+             lazy_df = formatted_df.lazy()
+
+             # Remove expression from config so it's not applied again
+             if filter_config_data and "expression" in filter_config_data:
+                 filter_config_data = filter_config_data.copy()
+                 del filter_config_data["expression"]
+         else:
+             # TSV input: need full processing (melt + annotation expansion)
+             string_columns = [
+                 "FID",
+                 "sample_id",
+                 "father_id",
+                 "mother_id",
+                 "FatherBarcode",
+                 "MotherBarcode",
+                 "sample",
+             ]
+             schema_overrides = {col: pl.Utf8 for col in string_columns}
+             lazy_df = pl.scan_csv(
+                 input_file, separator="\t", schema_overrides=schema_overrides
+             )
+
+             # Apply formatting transformations (melt + expand annotations)
+             lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)

          # Apply filters if provided
          if filter_config_data:
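The early-filter path above collects only after pruning, so melting multiplies far fewer rows by the sample count. The idea in isolation (the path and column are placeholders):

```python
import polars as pl

# Variant-level predicates shrink the table while it is still one row per
# variant; melting afterwards expands only the surviving rows per sample.
lazy = pl.scan_parquet("prepared.parquet")          # placeholder path
lazy = lazy.filter(pl.col("VEP_IMPACT") == "HIGH")  # variant-level filter
df = lazy.collect()                                 # small frame; melt this next
```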
@@ -497,11 +941,47 @@ def _pos_in_par(chrom: str, pos: int, par_regions: dict) -> bool:
      return False


+ def get_unique_chromosomes(parquet_file: Path) -> list[str]:
+     """Get list of unique chromosomes from Parquet file, sorted naturally.
+
+     Args:
+         parquet_file: Path to Parquet file
+
+     Returns:
+         Sorted list of chromosome names (e.g., ['1', '2', ..., '22', 'X', 'Y', 'MT'])
+     """
+     # Read just the #CHROM column to get unique values
+     df = pl.scan_parquet(parquet_file).select("#CHROM").unique().collect()
+     chroms = df["#CHROM"].to_list()
+
+     # Sort chromosomes properly (1, 2, ..., 22, X, Y, MT)
+     def chrom_sort_key(chrom: str) -> tuple:
+         """Sort key for natural chromosome ordering."""
+         chrom_norm = chrom.replace("chr", "").replace("Chr", "").replace("CHR", "").upper()
+
+         # Try to parse as integer (autosomes)
+         try:
+             return (0, int(chrom_norm), "")
+         except ValueError:
+             pass
+
+         # Sex chromosomes and mitochondrial
+         if chrom_norm in ["X", "Y", "MT", "M"]:
+             order = {"X": 23, "Y": 24, "MT": 25, "M": 25}
+             return (1, order.get(chrom_norm, 99), chrom_norm)
+
+         # Other chromosomes (e.g., scaffolds)
+         return (2, 0, chrom_norm)
+
+     return sorted(chroms, key=chrom_sort_key)
+
+
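The ordering produced by `chrom_sort_key` is easy to check in isolation; this sketch restates the key in condensed form (assumes Python 3.9+ for `removeprefix`):

```python
def chrom_sort_key(chrom: str) -> tuple:
    # Condensed restatement of the nested helper above.
    c = chrom.upper().removeprefix("CHR")
    try:
        return (0, int(c), "")                 # autosomes first, numerically
    except ValueError:
        pass
    if c in {"X", "Y", "MT", "M"}:
        return (1, {"X": 23, "Y": 24, "MT": 25, "M": 25}[c], c)
    return (2, 0, c)                           # scaffolds last, alphabetically

print(sorted(["chr10", "chrX", "chr2", "chrMT", "chr1"], key=chrom_sort_key))
# ['chr1', 'chr2', 'chr10', 'chrX', 'chrMT']
```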
  def apply_de_novo_filter(
      df: pl.DataFrame,
      dnm_config: dict,
      verbose: bool = False,
      pedigree_df: Optional[pl.DataFrame] = None,
+     skip_prefilters: bool = False,
  ) -> pl.DataFrame:
      """Apply de novo detection filters to dataframe using vectorized operations.

@@ -512,6 +992,13 @@ def apply_de_novo_filter(

      This function will read `sex` from `df` when present; otherwise it will use
      the `pedigree_df` (which should contain `sample_id` and `sex`).
+
+     Args:
+         df: DataFrame with melted samples
+         dnm_config: DNM configuration dict
+         verbose: Whether to print progress messages
+         pedigree_df: Pedigree DataFrame
+         skip_prefilters: If True, skips frequency/genomes_filters (assumes already applied)
      """
      if not dnm_config:
          return df
@@ -676,43 +1163,45 @@ def apply_de_novo_filter(
              err=True,
          )

-     # Apply fafmax_faf95_max_genomes filter if specified
-     if fafmax_max is not None:
-         if "fafmax_faf95_max_genomes" in df.columns:
-             df = df.filter(
-                 (
-                     pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False)
-                     <= fafmax_max
+     # Apply frequency/quality prefilters if not already applied
+     if not skip_prefilters:
+         # Apply fafmax_faf95_max_genomes filter if specified
+         if fafmax_max is not None:
+             if "fafmax_faf95_max_genomes" in df.columns:
+                 df = df.filter(
+                     (
+                         pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False)
+                         <= fafmax_max
+                     )
+                     | pl.col("fafmax_faf95_max_genomes").is_null()
                  )
-                 | pl.col("fafmax_faf95_max_genomes").is_null()
-             )
-             if verbose:
+                 if verbose:
+                     click.echo(
+                         f"DNM: {df.shape[0]} variants after fafmax_faf95_max_genomes filter (<={fafmax_max})",
+                         err=True,
+                     )
+             elif verbose:
                  click.echo(
-                     f"DNM: {df.shape[0]} variants after fafmax_faf95_max_genomes filter (<={fafmax_max})",
+                     "DNM: Warning - fafmax_faf95_max_genomes column not found, skipping frequency filter",
                      err=True,
                  )
-     elif verbose:
-         click.echo(
-             "DNM: Warning - fafmax_faf95_max_genomes column not found, skipping frequency filter",
-             err=True,
-         )

-     # Apply genomes_filters filter if specified
-     if genomes_filters_pass_only:
-         if "genomes_filters" in df.columns:
-             df = df.filter(
-                 (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
-             )
-             if verbose:
+         # Apply genomes_filters filter if specified
+         if genomes_filters_pass_only:
+             if "genomes_filters" in df.columns:
+                 df = df.filter(
+                     (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
+                 )
+                 if verbose:
+                     click.echo(
+                         f"DNM: {df.shape[0]} variants after genomes_filters filter (pass only)",
+                         err=True,
+                     )
+             elif verbose:
                  click.echo(
-                     f"DNM: {df.shape[0]} variants after genomes_filters filter (pass only)",
+                     "DNM: Warning - genomes_filters column not found, skipping genomes_filters filter",
                      err=True,
                  )
-     elif verbose:
-         click.echo(
-             "DNM: Warning - genomes_filters column not found, skipping genomes_filters filter",
-             err=True,
-         )

      # Build parent quality checks (common to all)
      father_qual_ok = (pl.col("father_dp").cast(pl.Float64, strict=False) >= p_dp) & (
@@ -1394,33 +1883,39 @@ def format_bcftools_tsv_minimal(
      Returns:
          Formatted DataFrame with melted samples (annotations still in (null) column)
      """
-     # Find the (null) column
-     if "(null)" not in df.columns:
-         raise ValueError("Column '(null)' not found in the input file")
-
-     # Get column index of (null)
-     null_col_idx = df.columns.index("(null)")
-
-     # Split columns into: before (null), (null), and after (null)
-     cols_after = df.columns[null_col_idx + 1 :]
+     # Determine which columns are sample columns
+     # Sample columns have format "SampleName:GT:SampleName:DP:..." or similar
+     # Non-sample columns are standard VCF columns or annotation columns
+
+     # Standard VCF/annotation columns (not samples)
+     standard_cols = {
+         "#CHROM", "POS", "REF", "ALT", "FILTER", "(null)", "CSQ",
+         "QUAL", "ID", "INFO", "FORMAT"
+     }

-     # Step 1: Identify sample columns (SKIP annotation expansion)
+     # Find sample columns by looking for columns with ":" in the name
+     # that aren't standard columns
      sample_cols = []
      sample_names = []

-     for col in cols_after:
-         # Skip CSQ column
-         if col == "CSQ":
+     for col in df.columns:
+         # Skip standard columns
+         if col in standard_cols:
+             continue
+
+         # Skip columns that look like VEP annotation fields
+         if col.startswith("VEP_") or col.startswith("AF") or col.startswith("AC"):
              continue

+         # Sample columns typically have ":" in them (GT:DP:GQ:AD format)
          if ":" in col:
              sample_name = col.split(":", 1)[0]
              sample_cols.append(col)
              sample_names.append(sample_name)
-         else:
-             # If no colon, treat the whole column name as sample name
-             sample_cols.append(col)
-             sample_names.append(col)
+         elif col not in df.columns[:10]:
+             # Columns after position 10 that don't match known patterns might be samples
+             # This is a heuristic for unusual sample column formats
+             pass

      if not sample_cols:
          # No sample columns to melt
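The sample-column heuristic above can be exercised on its own; in this sketch the column list and sample names (NA12878, NA12891) are invented:

```python
standard_cols = {"#CHROM", "POS", "REF", "ALT", "FILTER", "(null)", "CSQ",
                 "QUAL", "ID", "INFO", "FORMAT"}
columns = ["#CHROM", "POS", "VEP_IMPACT",
           "NA12878:GT:NA12878:DP", "NA12891:GT:NA12891:DP"]

sample_cols = [
    c for c in columns
    if c not in standard_cols
    and not c.startswith(("VEP_", "AF", "AC"))
    and ":" in c                       # GT:DP:GQ:AD-style sample columns
]
sample_names = [c.split(":", 1)[0] for c in sample_cols]
print(sample_names)  # ['NA12878', 'NA12891']
```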
@@ -1984,6 +2479,55 @@ def process_with_progress(
          click.echo("Processing complete.", err=True)


+ def apply_dnm_prefilters(
+     lazy_df: pl.LazyFrame,
+     filter_config: dict,
+     verbose: bool = False
+ ) -> pl.LazyFrame:
+     """Apply variant-level DNM filters before melting.
+
+     These filters don't require sample-level data and can be applied
+     on wide-format data to reduce memory usage.
+
+     Applies:
+     - Population frequency filters (fafmax_faf95_max_genomes_max)
+     - Quality filters (genomes_filters PASS only)
+
+     Args:
+         lazy_df: LazyFrame with wide-format data (not melted)
+         filter_config: Filter configuration dict
+         verbose: Whether to print progress messages
+
+     Returns:
+         Filtered LazyFrame
+     """
+     dnm_config = filter_config.get("dnm", {})
+
+     # Frequency filter
+     fafmax_max = dnm_config.get("fafmax_faf95_max_genomes_max")
+     if fafmax_max is not None:
+         lazy_df = lazy_df.filter(
+             (pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False) <= fafmax_max)
+             | pl.col("fafmax_faf95_max_genomes").is_null()
+         )
+         if verbose:
+             click.echo(
+                 f"DNM prefilter: Applied frequency filter (fafmax <= {fafmax_max})", err=True
+             )
+
+     # Quality filter (genomes_filters PASS only)
+     if dnm_config.get("genomes_filters_pass_only", False):
+         lazy_df = lazy_df.filter(
+             (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
+         )
+         if verbose:
+             click.echo(
+                 "DNM prefilter: Applied genomes_filters PASS filter", err=True
+             )
+
+     return lazy_df
+
+
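An illustrative call to the new `apply_dnm_prefilters` (the threshold value and file path are made up; the config keys match the ones read above):

```python
import polars as pl
from pywombat.cli import apply_dnm_prefilters

cfg = {
    "dnm": {
        "fafmax_faf95_max_genomes_max": 0.0001,  # made-up threshold
        "genomes_filters_pass_only": True,
    }
}
# Returns a LazyFrame with the variant-level prefilters applied.
lazy = apply_dnm_prefilters(pl.scan_parquet("prepared.parquet"), cfg, verbose=True)
```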
  def apply_filters_lazy(
      lazy_df: pl.LazyFrame,
      filter_config: dict,

pywombat-1.0.2.dist-info/METADATA → pywombat-1.2.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pywombat
- Version: 1.0.2
+ Version: 1.2.0
  Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
  Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
  Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
@@ -18,6 +18,9 @@ Requires-Dist: click>=8.1.0
  Requires-Dist: polars>=0.19.0
  Requires-Dist: pyyaml>=6.0
  Requires-Dist: tqdm>=4.67.1
+ Provides-Extra: dev
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
  Description-Content-Type: text/markdown

  # PyWombat 🦘
@@ -29,14 +32,15 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV

  ## Features

- ✨ **Fast Processing**: Uses Polars for efficient data handling
- 🔬 **Quality Filtering**: Configurable depth, quality, and VAF thresholds
- 👨‍👩‍👧 **Pedigree Support**: Trio and family analysis with parent genotypes
- 🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
- 📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
- 🎯 **Expression Filters**: Complex filtering with logical expressions
- 🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
- ⚡ **Streaming Mode**: Memory-efficient processing of large files
+ ✨ **Fast Processing**: Uses Polars for efficient data handling
+ 🔬 **Quality Filtering**: Configurable depth, quality, and VAF thresholds
+ 👨‍👩‍👧 **Pedigree Support**: Trio and family analysis with parent genotypes
+ 🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
+ 📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
+ 🎯 **Expression Filters**: Complex filtering with logical expressions
+ 🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
+ ⚡ **Memory Optimized**: Two-step workflow for large files (prepare → filter)
+ 💾 **Parquet Support**: Pre-process large files for repeated, memory-efficient analysis

  ---

@@ -47,17 +51,37 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
  Use `uvx` to run PyWombat without installation:

  ```bash
- # Basic formatting
- uvx pywombat input.tsv -o output
+ # Basic filtering
+ uvx pywombat filter input.tsv -o output

- # With filtering
- uvx pywombat input.tsv -F examples/rare_variants_high_impact.yml -o output
+ # With filter configuration
+ uvx pywombat filter input.tsv -F examples/rare_variants_high_impact.yml -o output

  # De novo mutation detection
- uvx pywombat input.tsv --pedigree pedigree.tsv \
+ uvx pywombat filter input.tsv --pedigree pedigree.tsv \
    -F examples/de_novo_mutations.yml -o denovo
  ```

+ ### For Large Files (>1GB or >50 samples)
+
+ Use the two-step workflow for memory-efficient processing:
+
+ ```bash
+ # Step 1: Prepare (one-time preprocessing)
+ uvx pywombat prepare input.tsv.gz -o prepared.parquet
+
+ # Step 2: Filter (fast, memory-efficient, can be run multiple times)
+ uvx pywombat filter prepared.parquet \
+   -p pedigree.tsv \
+   -F config.yml \
+   -o filtered
+ ```
+
+ **Benefits:**
+ - Pre-expands INFO fields once (saves time on repeated filtering)
+ - Applies filters before melting samples (reduces memory by 95%+)
+ - Parquet format enables fast columnar access
+
  ### Installation for Development/Repeated Use

  ```bash
@@ -69,7 +93,7 @@ cd pywombat
  uv sync

  # Run with uv run
- uv run wombat input.tsv -o output
+ uv run wombat filter input.tsv -o output
  ```

  ---
@@ -114,25 +138,62 @@ chr1 100 A T 2 0.5 30 true Sample2 1/1 18 99

  ---

- ## Basic Usage
+ ## Commands
+
+ PyWombat has two main commands:
+
+ ### `wombat prepare` - Preprocess Large Files
+
+ Converts TSV/TSV.gz to optimized Parquet format with pre-expanded INFO fields:
+
+ ```bash
+ # Basic usage
+ wombat prepare input.tsv.gz -o prepared.parquet
+
+ # With verbose output
+ wombat prepare input.tsv.gz -o prepared.parquet -v
+
+ # Adjust chunk size for memory constraints
+ wombat prepare input.tsv.gz -o prepared.parquet --chunk-size 25000
+ ```
+
+ **What it does:**
+ - Extracts all INFO fields (VEP_*, AF, etc.) as separate columns
+ - Keeps samples in wide format (not melted yet)
+ - Writes memory-efficient Parquet format
+ - Processes in chunks to handle files of any size
+
+ **When to use:**
+ - Files >1GB or >50 samples
+ - Large families (>10 members)
+ - Running multiple filter configurations
+ - Repeated analysis of the same dataset
+
+ ### `wombat filter` - Process and Filter Data

- ### Format Without Filtering
+ Transforms and filters variant data (works with both TSV and Parquet input):

  ```bash
- # Output to file
- uvx pywombat input.tsv -o output
+ # Basic filtering (TSV input)
+ wombat filter input.tsv -o output

- # Output to stdout (useful for piping)
- uvx pywombat input.tsv
+ # From prepared Parquet (faster, more memory-efficient)
+ wombat filter prepared.parquet -o output
+
+ # With filter configuration
+ wombat filter input.tsv -F config.yml -o output
+
+ # With pedigree
+ wombat filter input.tsv -p pedigree.tsv -o output

  # Compressed output
- uvx pywombat input.tsv -o output -f tsv.gz
+ wombat filter input.tsv -o output -f tsv.gz

- # Parquet format (fastest for large files)
- uvx pywombat input.tsv -o output -f parquet
+ # Parquet output
+ wombat filter input.tsv -o output -f parquet

  # With verbose output
- uvx pywombat input.tsv -o output --verbose
+ wombat filter input.tsv -o output -v
  ```

  ### With Pedigree (Trio/Family Analysis)
@@ -140,7 +201,7 @@ uvx pywombat input.tsv -o output --verbose
  Add parent genotype information for inheritance analysis:

  ```bash
- uvx pywombat input.tsv --pedigree pedigree.tsv -o output
+ wombat filter input.tsv --pedigree pedigree.tsv -o output
  ```

  **Pedigree File Format** (tab-separated):
@@ -178,7 +239,7 @@ PyWombat supports two types of filtering:
  Filter for ultra-rare, high-impact variants:

  ```bash
- uvx pywombat input.tsv \
+ wombat filter input.tsv \
    -F examples/rare_variants_high_impact.yml \
    -o rare_variants
  ```
@@ -210,7 +271,7 @@ expression: "VEP_CANONICAL = YES & VEP_IMPACT = HIGH & VEP_LoF = HC & VEP_LoF_fl
  Identify de novo mutations in trio data:

  ```bash
- uvx pywombat input.tsv \
+ wombat filter input.tsv \
    --pedigree pedigree.tsv \
    -F examples/de_novo_mutations.yml \
    -o denovo
@@ -290,7 +351,7 @@ expression: "VEP_IMPACT = HIGH & VEP_CANONICAL = YES & gnomad_AF < 0.01 & CADD_P
  Inspect specific variants for troubleshooting:

  ```bash
- uvx pywombat input.tsv \
+ wombat filter input.tsv \
    -F config.yml \
    --debug chr11:70486013
  ```
@@ -309,20 +370,20 @@ Shows:
  ### TSV (Default)

  ```bash
- uvx pywombat input.tsv -o output          # Creates output.tsv
- uvx pywombat input.tsv -o output -f tsv   # Same as above
+ wombat filter input.tsv -o output          # Creates output.tsv
+ wombat filter input.tsv -o output -f tsv   # Same as above
  ```

  ### Compressed TSV

  ```bash
- uvx pywombat input.tsv -o output -f tsv.gz   # Creates output.tsv.gz
+ wombat filter input.tsv -o output -f tsv.gz   # Creates output.tsv.gz
  ```

  ### Parquet (Fastest for Large Files)

  ```bash
- uvx pywombat input.tsv -o output -f parquet   # Creates output.parquet
+ wombat filter input.tsv -o output -f parquet   # Creates output.parquet
  ```

  **When to use Parquet:**
@@ -340,7 +401,7 @@ uvx pywombat input.tsv -o output -f parquet   # Creates output.parquet

  ```bash
  # Step 1: Filter for rare, high-impact variants
- uvx pywombat cohort.tsv \
+ wombat filter cohort.tsv \
    -F examples/rare_variants_high_impact.yml \
    -o rare_variants

@@ -352,24 +413,34 @@ uvx pywombat cohort.tsv \
  ```bash
  # Identify de novo mutations in autism cohort
- uvx pywombat autism_trios.tsv \
+ wombat filter autism_trios.tsv \
    --pedigree autism_pedigree.tsv \
    -F examples/de_novo_mutations.yml \
    -o autism_denovo \
-   --verbose
+   -v

  # Review output for genes in autism risk lists
  ```

- ### 3. Multi-Family Rare Variant Analysis
+ ### 3. Large Multi-Family Analysis (Memory-Optimized)

  ```bash
- # Process multiple families together
- uvx pywombat families.tsv \
+ # Step 1: Prepare once (preprocesses INFO fields)
+ wombat prepare large_cohort.tsv.gz -o prepared.parquet -v
+
+ # Step 2: Filter with different configurations (fast, memory-efficient)
+ wombat filter prepared.parquet \
    --pedigree families_pedigree.tsv \
    -F examples/rare_variants_high_impact.yml \
    -o families_rare_variants \
-   -f parquet   # Parquet for fast downstream analysis
+   -v
+
+ # Step 3: Run additional filters without re-preparing
+ wombat filter prepared.parquet \
+   --pedigree families_pedigree.tsv \
+   -F examples/de_novo_mutations.yml \
+   -o families_denovo \
+   -v
  ```

  ### 4. Custom Expression Filter
@@ -389,7 +460,7 @@ expression: "VEP_IMPACT = HIGH & (gnomad_AF < 0.0001 | gnomad_AF = null)"
  Apply:

  ```bash
- uvx pywombat input.tsv -F custom_filter.yml -o output
+ wombat filter input.tsv -F custom_filter.yml -o output
  ```

  ---
@@ -464,7 +535,7 @@ bcftools query -HH \
    annotated.split.bcf > annotated.tsv

  # 4. Process with PyWombat
- uvx pywombat annotated.tsv -F examples/rare_variants_high_impact.yml -o output
+ wombat filter annotated.tsv -F examples/rare_variants_high_impact.yml -o output
  ```

  **Why split-vep is required:**
@@ -481,7 +552,7 @@ For production workflows, these commands can be piped together:
  # Efficient pipeline (single pass through data)
  bcftools +split-vep -c - -p VEP_ input.vcf.gz | \
    bcftools query -HH -f '%CHROM\t%POS\t%REF\t%ALT\t%FILTER\t%INFO[\t%GT:%DP:%GQ:%AD]\n' | \
-   uvx pywombat - -F config.yml -o output
+   wombat filter - -F config.yml -o output
  ```

  **Note**: For multiple filter configurations, it's more efficient to save the intermediate TSV file rather than regenerating it each time.
@@ -517,11 +588,49 @@ Each configuration file is fully documented with:

  ## Performance Tips

- 1. **Use streaming mode** (default): Efficient for most workflows
- 2. **Parquet output**: Faster for large files and repeated analysis
+ ### For Large Files (>1GB or >50 samples)
+
+ 1. **Use the two-step workflow**: `wombat prepare` → `wombat filter`
+    - Reduces memory usage by 95%+ (4.2M variants → ~100 after early filtering)
+    - Pre-expands INFO fields once, reuse for multiple filter configurations
+    - Example: 38-sample family with 4.2M variants processes in <1 second with ~1.2GB RAM
+
+ 2. **Parquet format benefits**:
+    - Columnar storage enables selective column loading
+    - Pre-filtering before melting (expression filters applied before expanding to per-sample rows)
+    - **Per-chromosome processing for DNM**: Automatically processes DNM filtering chromosome-by-chromosome
+    - 30% smaller file size vs gzipped TSV
+
+ 3. **De Novo Mutation (DNM) filtering optimization**:
+    - Automatically uses per-chromosome processing when DNM mode is enabled
+    - Processes one chromosome at a time to reduce peak memory
+    - Applies frequency filters before melting to reduce data expansion
+    - Example: 38-sample family with 4.2M variants completes in 20 seconds with ~24GB RAM (vs 200GB+ OOM failure)
+
+ ### For All Files
+
  3. **Pre-filter with bcftools**: Filter by region/gene before PyWombat
  4. **Compressed input**: PyWombat handles `.gz` files natively
- 5. **Filter early**: Apply quality filters before complex expression filters
+ 5. **Use verbose mode** (`-v`): Monitor progress and filtering statistics
+
+ ### Memory Comparison
+
+ **Expression Filtering** (e.g., VEP_IMPACT filters):
+
+ | Approach | 38 samples, 4.2M variants | Memory | Time |
+ |----------|---------------------------|--------|------|
+ | Direct TSV | ❌ OOM (>200GB) | 200+ GB | Failed |
+ | TSV with chunking | ⚠️ Slow | ~30GB | ~3 min |
+ | **Parquet + pre-filter** | ✅ **Optimal** | **~1.2GB** | **<1 sec** |
+
+ **De Novo Mutation (DNM) Filtering**:
+
+ | Approach | 38 samples, 4.2M variants | Memory | Time | Result |
+ |----------|---------------------------|--------|------|--------|
+ | Without optimization | ❌ OOM (>200GB) | 200+ GB | Failed | N/A |
+ | **Parquet + per-chromosome** | ✅ **Success** | **~24GB** | **20 sec** | **6,788 DNM variants** |
+
+ *DNM filtering requires sample-level data (cannot pre-filter before melting), but per-chromosome processing reduces peak memory by 88%.*

  ---

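On the "selective column loading" point above: because Parquet is columnar, Polars reads only the projected columns from disk. A sketch (the path and column choice are illustrative):

```python
import polars as pl

# Only the three selected columns are read from the Parquet file.
cols = (
    pl.scan_parquet("prepared.parquet")       # placeholder path
    .select(["#CHROM", "POS", "VEP_IMPACT"])
    .collect()
)
```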
@@ -588,11 +697,15 @@ pywombat/

  **Issue**: Memory errors on large files

- - **Solution**: Files are processed in streaming mode by default; if issues persist, pre-filter with bcftools
+ - **Solution**: Use the two-step workflow: `wombat prepare` then `wombat filter` for 95%+ memory reduction
+
+ **Issue**: Command not found after upgrading
+
+ - **Solution**: PyWombat now uses subcommands - use `wombat filter` instead of just `wombat`

  ### Getting Help

- 1. Check `--help` for command options: `uvx pywombat --help`
+ 1. Check `--help` for command options: `wombat --help` or `wombat filter --help`
  2. Review example configurations in [`examples/`](examples/)
  3. Use `--debug` mode to inspect specific variants
  4. Use `--verbose` to see filtering steps

pywombat-1.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,6 @@
+ pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
+ pywombat/cli.py,sha256=pEPvUTww5Nvj-WqSRZ0QEePnORrcYkhWJv3uVi5DnxM,93728
+ pywombat-1.2.0.dist-info/METADATA,sha256=3TeUY6jzQCfrFaQ_BuocdB8374Esqwkoug9L-iZtLT0,21306
+ pywombat-1.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ pywombat-1.2.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
+ pywombat-1.2.0.dist-info/RECORD,,

pywombat-1.0.2.dist-info/RECORD DELETED
@@ -1,6 +0,0 @@
- pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
- pywombat/cli.py,sha256=76pVpYYyl9rCm6TCq86j7xAqEC9pOvjiWCX9MmrBB_o,74994
- pywombat-1.0.2.dist-info/METADATA,sha256=slcagFwSvA99GVzKLRZggFPy5dkLEh_09O3dIB8Hfr4,17168
- pywombat-1.0.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- pywombat-1.0.2.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
- pywombat-1.0.2.dist-info/RECORD,,