pywombat 1.0.2__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pywombat/cli.py +633 -89
- {pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/METADATA +161 -48
- pywombat-1.2.0.dist-info/RECORD +6 -0
- pywombat-1.0.2.dist-info/RECORD +0 -6
- {pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/WHEEL +0 -0
- {pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/entry_points.txt +0 -0
pywombat/cli.py
CHANGED
@@ -11,13 +11,371 @@ import polars as pl
 import yaml
 
 
-@click.
+@click.group()
+def cli():
+    """
+    Wombat: A tool for processing bcftools tabulated TSV files.
+
+    \b
+    Commands:
+        filter   Process and filter variant data
+        prepare  Convert TSV to optimized Parquet format
+    """
+    pass
+
+
+@cli.command("prepare")
+@click.argument("input_file", type=click.Path(exists=True, path_type=Path))
+@click.option(
+    "-o",
+    "--output",
+    type=click.Path(path_type=Path),
+    required=True,
+    help="Output Parquet file path.",
+)
+@click.option(
+    "--chunk-size",
+    type=int,
+    default=50000,
+    help="Number of rows to process at a time (default: 50000).",
+)
+@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
+def prepare_cmd(
+    input_file: Path,
+    output: Path,
+    chunk_size: int,
+    verbose: bool,
+):
+    """
+    Convert bcftools TSV to optimized Parquet format.
+
+    This command pre-processes a TSV file by:
+
+    \b
+    1. Extracting all INFO fields from the '(null)' column into separate columns
+    2. Applying memory-efficient data types (Categorical for CHROM, UInt32 for POS)
+    3. Writing to Parquet format for efficient columnar access
+
+    The output Parquet file can then be used with 'wombat filter' for much faster
+    and more memory-efficient filtering, especially for large files.
+
+    \b
+    Examples:
+        wombat prepare input.tsv.gz -o prepared.parquet
+        wombat prepare input.tsv.gz -o prepared.parquet --chunk-size 100000
+    """
+    try:
+        if verbose:
+            click.echo(f"Preparing {input_file} -> {output}", err=True)
+
+        # Ensure output has .parquet extension
+        if not str(output).endswith(".parquet"):
+            output = Path(f"{output}.parquet")
+
+        # Process the file
+        prepare_parquet(input_file, output, chunk_size, verbose)
+
+        if verbose:
+            click.echo(f"Successfully created {output}", err=True)
+
+    except Exception as e:
+        click.echo(f"Error: {e}", err=True)
+        raise click.Abort()
+
+
+def prepare_parquet(
+    input_file: Path,
+    output: Path,
+    chunk_size: int = 50000,
+    verbose: bool = False,
+) -> None:
+    """
+    Convert a bcftools TSV file to Parquet with pre-expanded INFO fields.
+
+    Processes the file in chunks to handle large files without running out of memory.
+
+    Args:
+        input_file: Path to input TSV or TSV.gz file
+        output: Path to output Parquet file
+        chunk_size: Number of rows to process per chunk
+        verbose: Whether to print progress
+    """
+    from tqdm import tqdm
+
+    # First pass: discover all INFO fields
+    if verbose:
+        click.echo("Pass 1: Discovering INFO fields...", err=True)
+
+    all_fields = set()
+    all_flags = set()
+    total_lines = 0
+
+    is_gzipped = str(input_file).endswith(".gz")
+    opener = gzip.open if is_gzipped else open
+
+    with opener(input_file, "rt") as f:
+        header_line = f.readline().strip()
+        header_cols = header_line.split("\t")
+
+        # Find the (null) column index dynamically
+        null_col_idx = None
+        for i, col in enumerate(header_cols):
+            if col == "(null)":
+                null_col_idx = i
+                break
+
+        if null_col_idx is None:
+            if verbose:
+                click.echo("Warning: No (null) column found in input", err=True)
+        else:
+            for line in tqdm(f, desc="Scanning", disable=not verbose):
+                total_lines += 1
+                parts = line.split("\t")
+                if len(parts) > null_col_idx:
+                    null_value = parts[null_col_idx]
+                    if null_value and null_value != ".":
+                        pairs = null_value.split(";")
+                        for pair in pairs:
+                            if "=" in pair:
+                                field_name = pair.split("=", 1)[0]
+                                all_fields.add(field_name)
+                            elif pair.strip():
+                                all_flags.add(pair.strip())
+
+    if verbose:
+        click.echo(
+            f"Found {len(all_fields)} key-value fields and {len(all_flags)} flags in {total_lines} variants",
+            err=True,
+        )
+
+    # Second pass: process chunks and write Parquet
+    if verbose:
+        click.echo("Pass 2: Converting to Parquet...", err=True)
+
+    # Define memory-efficient dtypes
+    dtype_overrides = {
+        "#CHROM": pl.Categorical,
+        "POS": pl.UInt32,
+        "FILTER": pl.Categorical,
+    }
+
+    # Create a temporary directory for chunk files
+    import tempfile
+    import shutil
+
+    temp_dir = Path(tempfile.mkdtemp(prefix="wombat_prepare_"))
+    part_files = []
+
+    try:
+        with opener(input_file, "rt") as f:
+            header_line = f.readline().strip()
+
+            # Process in chunks
+            chunk_lines = []
+            pbar = tqdm(total=total_lines, desc="Converting", disable=not verbose)
+
+            for line in f:
+                chunk_lines.append(line)
+                if len(chunk_lines) >= chunk_size:
+                    df_chunk = _process_chunk(
+                        header_line, chunk_lines, all_fields, all_flags, dtype_overrides
+                    )
+                    part_file = temp_dir / f"part_{len(part_files):06d}.parquet"
+                    df_chunk.write_parquet(part_file)
+                    part_files.append(part_file)
+                    pbar.update(len(chunk_lines))
+                    chunk_lines = []
+
+            # Process remaining lines
+            if chunk_lines:
+                df_chunk = _process_chunk(
+                    header_line, chunk_lines, all_fields, all_flags, dtype_overrides
+                )
+                part_file = temp_dir / f"part_{len(part_files):06d}.parquet"
+                df_chunk.write_parquet(part_file)
+                part_files.append(part_file)
+                pbar.update(len(chunk_lines))
+
+            pbar.close()
+
+        # Combine all parts into final output using lazy scanning
+        if verbose:
+            click.echo(f"Combining {len(part_files)} parts into final output...", err=True)
+
+        if part_files:
+            # Use scan_parquet to lazily read all parts and write combined output
+            combined = pl.scan_parquet(part_files).collect()
+            combined.write_parquet(output)
+
+        if verbose:
+            click.echo(f"Wrote {len(part_files)} chunks to {output}", err=True)
+
+    finally:
+        # Clean up temporary directory
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+def _process_chunk(
+    header: str,
+    lines: list,
+    fields: set,
+    flags: set,
+    dtype_overrides: dict,
+) -> pl.DataFrame:
+    """Process a chunk of lines into a DataFrame with expanded INFO fields."""
+    import io
+
+    content = header + "\n" + "".join(lines)
+    df = pl.read_csv(
+        io.StringIO(content),
+        separator="\t",
+        infer_schema_length=10000,
+    )
+
+    # Expand INFO fields from (null) column
+    if "(null)" in df.columns:
+        # Extract key-value fields
+        for field in sorted(fields):
+            df = df.with_columns(
+                pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
+            )
+
+        # Extract boolean flags
+        for flag in sorted(flags):
+            df = df.with_columns(
+                pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag)
+            )
+
+        # Drop the original (null) column
+        df = df.drop("(null)")
+
+    # Drop CSQ column if it exists (redundant after expansion)
+    if "CSQ" in df.columns:
+        df = df.drop("CSQ")
+
+    # Apply memory-efficient dtypes
+    for col, dtype in dtype_overrides.items():
+        if col in df.columns:
+            try:
+                df = df.with_columns(pl.col(col).cast(dtype))
+            except Exception:
+                pass  # Skip if cast fails
+
+    return df
+
+
+def process_dnm_by_chromosome(
+    input_file: Path,
+    pedigree_df: pl.DataFrame,
+    filter_config: dict,
+    output_format: str,
+    verbose: bool
+) -> pl.DataFrame:
+    """Process DNM filtering chromosome by chromosome to reduce memory usage.
+
+    Processes each chromosome separately:
+    1. Load one chromosome at a time from Parquet
+    2. Apply frequency/quality prefilters (before melting)
+    3. Melt samples
+    4. Apply DNM filters
+    5. Combine results from all chromosomes
+
+    This reduces peak memory from (total_variants × samples) to
+    (max_chr_variants × samples).
+
+    Args:
+        input_file: Path to Parquet file
+        pedigree_df: Pedigree DataFrame with sample relationships
+        filter_config: Filter configuration dict
+        output_format: Output format (tsv, tsv.gz, parquet)
+        verbose: Whether to print progress messages
+
+    Returns:
+        Combined DataFrame with DNM-filtered variants from all chromosomes
+    """
+    # Get list of chromosomes
+    chromosomes = get_unique_chromosomes(input_file)
+
+    if verbose:
+        click.echo(
+            f"DNM per-chromosome processing: {len(chromosomes)} chromosomes", err=True
+        )
+
+    results = []
+    dnm_cfg = {}
+    dnm_cfg.update(filter_config.get("quality", {}))
+    dnm_cfg.update(filter_config.get("dnm", {}))
+
+    for chrom in chromosomes:
+        if verbose:
+            click.echo(f"Processing chromosome {chrom}...", err=True)
+
+        # Load only this chromosome
+        lazy_df = pl.scan_parquet(input_file).filter(
+            pl.col("#CHROM") == chrom
+        )
+
+        # Apply frequency filters BEFORE melting (Optimization 2)
+        lazy_df = apply_dnm_prefilters(lazy_df, filter_config, verbose=False)
+
+        # Count variants after prefiltering
+        if verbose:
+            pre_count = lazy_df.select(pl.count()).collect().item()
+            click.echo(f"  Chromosome {chrom}: {pre_count} variants after prefilter", err=True)
+
+        # Collect, melt, and apply DNM filters
+        df = lazy_df.collect()
+
+        if df.shape[0] == 0:
+            if verbose:
+                click.echo(f"  Chromosome {chrom}: No variants after prefilter, skipping", err=True)
+            continue
+
+        formatted_df = format_bcftools_tsv_minimal(df, pedigree_df)
+
+        if verbose:
+            click.echo(
+                f"  Chromosome {chrom}: {formatted_df.shape[0]} rows after melting", err=True
+            )
+
+        # Apply DNM filters (skip prefilters since already applied)
+        filtered_df = apply_de_novo_filter(
+            formatted_df, dnm_cfg, verbose=False, pedigree_df=pedigree_df,
+            skip_prefilters=True
+        )
+
+        if verbose:
+            click.echo(
+                f"  Chromosome {chrom}: {filtered_df.shape[0]} variants passed DNM filter", err=True
+            )
+
+        if filtered_df.shape[0] > 0:
+            results.append(filtered_df)
+
+    # Combine results
+    if not results:
+        if verbose:
+            click.echo("No variants passed DNM filters across all chromosomes", err=True)
+        # Return empty DataFrame with correct schema
+        return pl.DataFrame()
+
+    final_df = pl.concat(results)
+
+    if verbose:
+        click.echo(
+            f"DNM filtering complete: {final_df.shape[0]} total variants", err=True
+        )
+
+    return final_df
+
+
+@cli.command("filter")
 @click.argument("input_file", type=click.Path(exists=True, path_type=Path))
 @click.option(
     "-o",
     "--output",
     type=str,
-    help="Output file prefix. If not specified,
+    help="Output file prefix. If not specified, generates from input filename.",
 )
 @click.option(
     "-f",
@@ -43,9 +401,9 @@ import yaml
 @click.option(
     "--debug",
     type=str,
-    help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013).
+    help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013).",
 )
-def
+def filter_cmd(
     input_file: Path,
     output: Optional[str],
     output_format: str,
@@ -55,36 +413,43 @@ def cli(
     debug: Optional[str],
 ):
     """
-
+    Process and filter variant data from TSV or Parquet files.
 
-
+    \b
+    Supports two input formats:
+    - TSV/TSV.gz: Full processing (INFO expansion + melting)
+    - Parquet: Fast processing (melting only, INFO already expanded)
 
     \b
-
-
-
-
-
-
-
-
-
+    For large files, use 'wombat prepare' first to convert to Parquet,
+    then use 'wombat filter' on the Parquet file for better performance.
+
+    \b
+    This command:
+    1. Expands the '(null)' column (TSV only) into separate columns
+    2. Melts sample columns into rows with sample names
+    3. Splits sample values (GT:DP:GQ:AD format) into separate columns
+    4. Applies quality and expression filters (if config provided)
 
     \b
     Examples:
-        wombat input.tsv -o output
-        wombat
-        wombat input.tsv
+        wombat filter input.tsv -o output
+        wombat filter prepared.parquet -o output -f parquet
+        wombat filter input.tsv -p pedigree.tsv -F config.yml
     """
     try:
        if verbose:
            click.echo(f"Reading input file: {input_file}", err=True)
 
-        # Detect
+        # Detect input format
+        is_parquet = str(input_file).endswith(".parquet")
        is_gzipped = str(input_file).endswith(".gz")
 
-        if verbose
-
+        if verbose:
+            if is_parquet:
+                click.echo("Detected Parquet input (pre-processed)", err=True)
+            elif is_gzipped:
+                click.echo("Detected gzipped TSV file", err=True)
 
        # Read pedigree file if provided
        pedigree_df = None
@@ -109,11 +474,11 @@ def cli(
        if output is None:
            # Generate default output prefix from input filename
            input_stem = input_file.name
-            # Remove
-
-
-
-
+            # Remove known extensions
+            for ext in [".tsv.gz", ".tsv", ".parquet"]:
+                if input_stem.endswith(ext):
+                    input_stem = input_stem[: -len(ext)]
+                    break
 
        # Add config name if filter is provided
        if filter_config:
@@ -126,24 +491,103 @@ def cli(
        if verbose:
            click.echo("Processing with streaming mode...", err=True)
 
-        # Build lazy query
-
-
-
-
-            "
-
-
-
-
-
-
-
+        # Build lazy query based on input format
+        if is_parquet:
+            # Parquet input: INFO fields already expanded by 'wombat prepare'
+            lazy_df = pl.scan_parquet(input_file)
+
+            # Check if DNM mode is enabled - use per-chromosome processing
+            if filter_config_data and filter_config_data.get("dnm", {}).get("enabled", False):
+                if verbose:
+                    click.echo("DNM mode: Using per-chromosome processing for memory efficiency", err=True)
+
+                # DNM requires pedigree
+                if pedigree_df is None:
+                    click.echo("Error: DNM filtering requires a pedigree file (--pedigree option)", err=True)
+                    raise click.Abort()
+
+                # Process DNM filtering chromosome by chromosome
+                formatted_df = process_dnm_by_chromosome(
+                    input_file,
+                    pedigree_df,
+                    filter_config_data,
+                    output_format,
+                    verbose
+                )
+
+                # Write output directly
+                output_path = Path(f"{output}.{output_format}")
+
+                if output_format == "tsv":
+                    formatted_df.write_csv(output_path, separator="\t")
+                elif output_format == "tsv.gz":
+                    csv_content = formatted_df.write_csv(separator="\t")
+                    with gzip.open(output_path, "wt") as f:
+                        f.write(csv_content)
+                elif output_format == "parquet":
+                    formatted_df.write_parquet(output_path)
+
+                if verbose:
+                    click.echo(f"DNM variants written to {output_path}", err=True)
+
+                return
+
+            # OPTIMIZATION: Apply expression filter BEFORE melting
+            # Expression filters (VEP_IMPACT, etc.) don't depend on sample data
+            if filter_config_data and "expression" in filter_config_data:
+                expression = filter_config_data["expression"]
+                if expression and verbose:
+                    click.echo(
+                        f"Applying expression filter before melting: {expression}",
+                        err=True,
+                    )
+
+                # Collect a small sample to get schema for expression parsing
+                schema_df = lazy_df.head(1).collect()
+                try:
+                    filter_expr = parse_impact_filter_expression(expression, schema_df)
+                    lazy_df = lazy_df.filter(filter_expr)
+
+                    # Count filtered variants
+                    if verbose:
+                        filtered_count = lazy_df.select(pl.len()).collect().item()
+                        click.echo(
+                            f"Variants after expression filter: {filtered_count}",
+                            err=True,
+                        )
+                except ValueError as e:
+                    if verbose:
+                        click.echo(
+                            f"Warning: Could not apply early filter: {e}", err=True
+                        )
 
-
-
+            # Now collect and melt (on filtered variants only)
+            df = lazy_df.collect()
+            formatted_df = format_bcftools_tsv_minimal(df, pedigree_df)
+            lazy_df = formatted_df.lazy()
+
+            # Remove expression from config so it's not applied again
+            if filter_config_data and "expression" in filter_config_data:
+                filter_config_data = filter_config_data.copy()
+                del filter_config_data["expression"]
+        else:
+            # TSV input: need full processing (melt + annotation expansion)
+            string_columns = [
+                "FID",
+                "sample_id",
+                "father_id",
+                "mother_id",
+                "FatherBarcode",
+                "MotherBarcode",
+                "sample",
+            ]
+            schema_overrides = {col: pl.Utf8 for col in string_columns}
+            lazy_df = pl.scan_csv(
+                input_file, separator="\t", schema_overrides=schema_overrides
+            )
+
+            # Apply formatting transformations (melt + expand annotations)
+            lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
 
        # Apply filters if provided
        if filter_config_data:
@@ -497,11 +941,47 @@ def _pos_in_par(chrom: str, pos: int, par_regions: dict) -> bool:
     return False
 
 
+def get_unique_chromosomes(parquet_file: Path) -> list[str]:
+    """Get list of unique chromosomes from Parquet file, sorted naturally.
+
+    Args:
+        parquet_file: Path to Parquet file
+
+    Returns:
+        Sorted list of chromosome names (e.g., ['1', '2', ..., '22', 'X', 'Y', 'MT'])
+    """
+    # Read just the #CHROM column to get unique values
+    df = pl.scan_parquet(parquet_file).select("#CHROM").unique().collect()
+    chroms = df["#CHROM"].to_list()
+
+    # Sort chromosomes properly (1, 2, ..., 22, X, Y, MT)
+    def chrom_sort_key(chrom: str) -> tuple:
+        """Sort key for natural chromosome ordering."""
+        chrom_norm = chrom.replace("chr", "").replace("Chr", "").replace("CHR", "").upper()
+
+        # Try to parse as integer (autosomes)
+        try:
+            return (0, int(chrom_norm), "")
+        except ValueError:
+            pass
+
+        # Sex chromosomes and mitochondrial
+        if chrom_norm in ["X", "Y", "MT", "M"]:
+            order = {"X": 23, "Y": 24, "MT": 25, "M": 25}
+            return (1, order.get(chrom_norm, 99), chrom_norm)
+
+        # Other chromosomes (e.g., scaffolds)
+        return (2, 0, chrom_norm)
+
+    return sorted(chroms, key=chrom_sort_key)
+
+
 def apply_de_novo_filter(
     df: pl.DataFrame,
     dnm_config: dict,
     verbose: bool = False,
     pedigree_df: Optional[pl.DataFrame] = None,
+    skip_prefilters: bool = False,
 ) -> pl.DataFrame:
     """Apply de novo detection filters to dataframe using vectorized operations.
 
@@ -512,6 +992,13 @@ def apply_de_novo_filter(
 
     This function will read `sex` from `df` when present; otherwise it will use
     the `pedigree_df` (which should contain `sample_id` and `sex`).
+
+    Args:
+        df: DataFrame with melted samples
+        dnm_config: DNM configuration dict
+        verbose: Whether to print progress messages
+        pedigree_df: Pedigree DataFrame
+        skip_prefilters: If True, skips frequency/genomes_filters (assumes already applied)
     """
     if not dnm_config:
         return df
@@ -676,43 +1163,45 @@ def apply_de_novo_filter(
                 err=True,
             )
 
-    # Apply
-    if
-
-
-
-
+    # Apply frequency/quality prefilters if not already applied
+    if not skip_prefilters:
+        # Apply fafmax_faf95_max_genomes filter if specified
+        if fafmax_max is not None:
+            if "fafmax_faf95_max_genomes" in df.columns:
+                df = df.filter(
+                    (
+                        pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False)
+                        <= fafmax_max
+                    )
+                    | pl.col("fafmax_faf95_max_genomes").is_null()
                 )
-
-
-
+                if verbose:
+                    click.echo(
+                        f"DNM: {df.shape[0]} variants after fafmax_faf95_max_genomes filter (<={fafmax_max})",
+                        err=True,
+                    )
+            elif verbose:
                 click.echo(
-
+                    "DNM: Warning - fafmax_faf95_max_genomes column not found, skipping frequency filter",
                     err=True,
                 )
-        elif verbose:
-            click.echo(
-                "DNM: Warning - fafmax_faf95_max_genomes column not found, skipping frequency filter",
-                err=True,
-            )
 
-
-
-
-
-
-
+        # Apply genomes_filters filter if specified
+        if genomes_filters_pass_only:
+            if "genomes_filters" in df.columns:
+                df = df.filter(
+                    (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
+                )
+                if verbose:
+                    click.echo(
+                        f"DNM: {df.shape[0]} variants after genomes_filters filter (pass only)",
+                        err=True,
+                    )
+            elif verbose:
                 click.echo(
-
+                    "DNM: Warning - genomes_filters column not found, skipping genomes_filters filter",
                     err=True,
                 )
-        elif verbose:
-            click.echo(
-                "DNM: Warning - genomes_filters column not found, skipping genomes_filters filter",
-                err=True,
-            )
 
     # Build parent quality checks (common to all)
     father_qual_ok = (pl.col("father_dp").cast(pl.Float64, strict=False) >= p_dp) & (
@@ -1394,33 +1883,39 @@ def format_bcftools_tsv_minimal(
     Returns:
         Formatted DataFrame with melted samples (annotations still in (null) column)
     """
-    #
-
-
-
-    #
-
-
-
-
+    # Determine which columns are sample columns
+    # Sample columns have format "SampleName:GT:SampleName:DP:..." or similar
+    # Non-sample columns are standard VCF columns or annotation columns
+
+    # Standard VCF/annotation columns (not samples)
+    standard_cols = {
+        "#CHROM", "POS", "REF", "ALT", "FILTER", "(null)", "CSQ",
+        "QUAL", "ID", "INFO", "FORMAT"
+    }
 
-    #
+    # Find sample columns by looking for columns with ":" in the name
+    # that aren't standard columns
     sample_cols = []
     sample_names = []
 
-    for col in
-        # Skip
-        if col
+    for col in df.columns:
+        # Skip standard columns
+        if col in standard_cols:
+            continue
+
+        # Skip columns that look like VEP annotation fields
+        if col.startswith("VEP_") or col.startswith("AF") or col.startswith("AC"):
            continue
 
+        # Sample columns typically have ":" in them (GT:DP:GQ:AD format)
        if ":" in col:
            sample_name = col.split(":", 1)[0]
            sample_cols.append(col)
            sample_names.append(sample_name)
-
-        #
-
-
+        elif col not in df.columns[:10]:
+            # Columns after position 10 that don't match known patterns might be samples
+            # This is a heuristic for unusual sample column formats
+            pass
 
     if not sample_cols:
        # No sample columns to melt
@@ -1984,6 +2479,55 @@ def process_with_progress(
     click.echo("Processing complete.", err=True)
 
 
+def apply_dnm_prefilters(
+    lazy_df: pl.LazyFrame,
+    filter_config: dict,
+    verbose: bool = False
+) -> pl.LazyFrame:
+    """Apply variant-level DNM filters before melting.
+
+    These filters don't require sample-level data and can be applied
+    on wide-format data to reduce memory usage.
+
+    Applies:
+    - Population frequency filters (fafmax_faf95_max_genomes_max)
+    - Quality filters (genomes_filters PASS only)
+
+    Args:
+        lazy_df: LazyFrame with wide-format data (not melted)
+        filter_config: Filter configuration dict
+        verbose: Whether to print progress messages
+
+    Returns:
+        Filtered LazyFrame
+    """
+    dnm_config = filter_config.get("dnm", {})
+
+    # Frequency filter
+    fafmax_max = dnm_config.get("fafmax_faf95_max_genomes_max")
+    if fafmax_max is not None:
+        lazy_df = lazy_df.filter(
+            (pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False) <= fafmax_max)
+            | pl.col("fafmax_faf95_max_genomes").is_null()
+        )
+        if verbose:
+            click.echo(
+                f"DNM prefilter: Applied frequency filter (fafmax <= {fafmax_max})", err=True
+            )
+
+    # Quality filter (genomes_filters PASS only)
+    if dnm_config.get("genomes_filters_pass_only", False):
+        lazy_df = lazy_df.filter(
+            (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
+        )
+        if verbose:
+            click.echo(
+                "DNM prefilter: Applied genomes_filters PASS filter", err=True
+            )
+
+    return lazy_df
+
+
 def apply_filters_lazy(
     lazy_df: pl.LazyFrame,
     filter_config: dict,
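At its core, the new `process_dnm_by_chromosome` is a partition-and-concat loop over a Parquet file: scan lazily, restrict to one chromosome, collect, process, and concatenate the survivors. A minimal standalone sketch of that pattern, stripped of the pedigree/DNM logic (the `variants.parquet` path is illustrative, not from the package):

```python
# Partition-and-concat over a Parquet file, as in process_dnm_by_chromosome:
# only one chromosome's rows are materialized at a time, so peak memory
# scales with the largest chromosome rather than the whole file.
from pathlib import Path

import polars as pl

parquet_file = Path("variants.parquet")  # hypothetical output of 'wombat prepare'

# Distinct chromosome names, read lazily from the single #CHROM column.
chromosomes = (
    pl.scan_parquet(parquet_file)
    .select("#CHROM")
    .unique()
    .collect()["#CHROM"]
    .to_list()
)

results = []
for chrom in chromosomes:
    # The filter is pushed down into the Parquet scan, so collect() reads
    # (roughly) only the row groups containing this chromosome.
    chunk = pl.scan_parquet(parquet_file).filter(pl.col("#CHROM") == chrom).collect()
    if chunk.height == 0:
        continue
    results.append(chunk)  # the real code melts samples and applies DNM filters here

final_df = pl.concat(results) if results else pl.DataFrame()
```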
{pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pywombat
-Version: 1.0
+Version: 1.2.0
 Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
 Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
 Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
@@ -18,6 +18,9 @@ Requires-Dist: click>=8.1.0
 Requires-Dist: polars>=0.19.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: tqdm>=4.67.1
+Provides-Extra: dev
+Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
+Requires-Dist: pytest>=7.0.0; extra == 'dev'
 Description-Content-Type: text/markdown
 
 # PyWombat 🦘
@@ -29,14 +32,15 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
 
 ## Features
 
-✨ **Fast Processing**: Uses Polars for efficient data handling
-🔬 **Quality Filtering**: Configurable depth, quality, and VAF thresholds
-👨‍👩‍👧 **Pedigree Support**: Trio and family analysis with parent genotypes
-🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
-📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
-🎯 **Expression Filters**: Complex filtering with logical expressions
-🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
-⚡ **
+✨ **Fast Processing**: Uses Polars for efficient data handling
+🔬 **Quality Filtering**: Configurable depth, quality, and VAF thresholds
+👨‍👩‍👧 **Pedigree Support**: Trio and family analysis with parent genotypes
+🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
+📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
+🎯 **Expression Filters**: Complex filtering with logical expressions
+🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
+⚡ **Memory Optimized**: Two-step workflow for large files (prepare → filter)
+💾 **Parquet Support**: Pre-process large files for repeated, memory-efficient analysis
 
 ---
 
@@ -47,17 +51,37 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
 Use `uvx` to run PyWombat without installation:
 
 ```bash
-# Basic
-uvx pywombat input.tsv -o output
+# Basic filtering
+uvx pywombat filter input.tsv -o output
 
-# With
-uvx pywombat input.tsv -F examples/rare_variants_high_impact.yml -o output
+# With filter configuration
+uvx pywombat filter input.tsv -F examples/rare_variants_high_impact.yml -o output
 
 # De novo mutation detection
-uvx pywombat input.tsv --pedigree pedigree.tsv \
+uvx pywombat filter input.tsv --pedigree pedigree.tsv \
   -F examples/de_novo_mutations.yml -o denovo
 ```
 
+### For Large Files (>1GB or >50 samples)
+
+Use the two-step workflow for memory-efficient processing:
+
+```bash
+# Step 1: Prepare (one-time preprocessing)
+uvx pywombat prepare input.tsv.gz -o prepared.parquet
+
+# Step 2: Filter (fast, memory-efficient, can be run multiple times)
+uvx pywombat filter prepared.parquet \
+  -p pedigree.tsv \
+  -F config.yml \
+  -o filtered
+```
+
+**Benefits:**
+- Pre-expands INFO fields once (saves time on repeated filtering)
+- Applies filters before melting samples (reduces memory by 95%+)
+- Parquet format enables fast columnar access
+
 ### Installation for Development/Repeated Use
 
 ```bash
@@ -69,7 +93,7 @@ cd pywombat
 uv sync
 
 # Run with uv run
-uv run wombat input.tsv -o output
+uv run wombat filter input.tsv -o output
 ```
 
 ---
@@ -114,25 +138,62 @@ chr1 100 A T 2 0.5 30 true Sample2 1/1 18 99
 
 ---
 
-##
+## Commands
+
+PyWombat has two main commands:
+
+### `wombat prepare` - Preprocess Large Files
+
+Converts TSV/TSV.gz to optimized Parquet format with pre-expanded INFO fields:
+
+```bash
+# Basic usage
+wombat prepare input.tsv.gz -o prepared.parquet
+
+# With verbose output
+wombat prepare input.tsv.gz -o prepared.parquet -v
+
+# Adjust chunk size for memory constraints
+wombat prepare input.tsv.gz -o prepared.parquet --chunk-size 25000
+```
+
+**What it does:**
+- Extracts all INFO fields (VEP_*, AF, etc.) as separate columns
+- Keeps samples in wide format (not melted yet)
+- Writes memory-efficient Parquet format
+- Processes in chunks to handle files of any size
+
+**When to use:**
+- Files >1GB or >50 samples
+- Large families (>10 members)
+- Running multiple filter configurations
+- Repeated analysis of the same dataset
+
+### `wombat filter` - Process and Filter Data
 
-
+Transforms and filters variant data (works with both TSV and Parquet input):
 
 ```bash
-#
-
+# Basic filtering (TSV input)
+wombat filter input.tsv -o output
 
-#
-
+# From prepared Parquet (faster, more memory-efficient)
+wombat filter prepared.parquet -o output
+
+# With filter configuration
+wombat filter input.tsv -F config.yml -o output
+
+# With pedigree
+wombat filter input.tsv -p pedigree.tsv -o output
 
 # Compressed output
-
+wombat filter input.tsv -o output -f tsv.gz
 
-# Parquet
-
+# Parquet output
+wombat filter input.tsv -o output -f parquet
 
 # With verbose output
-
+wombat filter input.tsv -o output -v
 ```
 
 ### With Pedigree (Trio/Family Analysis)
@@ -140,7 +201,7 @@ uvx pywombat input.tsv -o output --verbose
 Add parent genotype information for inheritance analysis:
 
 ```bash
-
+wombat filter input.tsv --pedigree pedigree.tsv -o output
 ```
 
 **Pedigree File Format** (tab-separated):
@@ -178,7 +239,7 @@ PyWombat supports two types of filtering:
 Filter for ultra-rare, high-impact variants:
 
 ```bash
-
+wombat filter input.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o rare_variants
 ```
@@ -210,7 +271,7 @@ expression: "VEP_CANONICAL = YES & VEP_IMPACT = HIGH & VEP_LoF = HC & VEP_LoF_fl
 Identify de novo mutations in trio data:
 
 ```bash
-
+wombat filter input.tsv \
   --pedigree pedigree.tsv \
   -F examples/de_novo_mutations.yml \
   -o denovo
@@ -290,7 +351,7 @@ expression: "VEP_IMPACT = HIGH & VEP_CANONICAL = YES & gnomad_AF < 0.01 & CADD_P
 Inspect specific variants for troubleshooting:
 
 ```bash
-
+wombat filter input.tsv \
   -F config.yml \
   --debug chr11:70486013
 ```
@@ -309,20 +370,20 @@ Shows:
 ### TSV (Default)
 
 ```bash
-
-
+wombat filter input.tsv -o output         # Creates output.tsv
+wombat filter input.tsv -o output -f tsv  # Same as above
 ```
 
 ### Compressed TSV
 
 ```bash
-
+wombat filter input.tsv -o output -f tsv.gz  # Creates output.tsv.gz
 ```
 
 ### Parquet (Fastest for Large Files)
 
 ```bash
-
+wombat filter input.tsv -o output -f parquet  # Creates output.parquet
 ```
 
 **When to use Parquet:**
@@ -340,7 +401,7 @@ uvx pywombat input.tsv -o output -f parquet  # Creates output.parquet
 
 ```bash
 # Step 1: Filter for rare, high-impact variants
-
+wombat filter cohort.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o rare_variants
 
@@ -352,24 +413,34 @@ uvx pywombat cohort.tsv \
 
 ```bash
 # Identify de novo mutations in autism cohort
-
+wombat filter autism_trios.tsv \
   --pedigree autism_pedigree.tsv \
   -F examples/de_novo_mutations.yml \
   -o autism_denovo \
-
+  -v
 
 # Review output for genes in autism risk lists
 ```
 
-### 3. Multi-Family
+### 3. Large Multi-Family Analysis (Memory-Optimized)
 
 ```bash
-#
-
+# Step 1: Prepare once (preprocesses INFO fields)
+wombat prepare large_cohort.tsv.gz -o prepared.parquet -v
+
+# Step 2: Filter with different configurations (fast, memory-efficient)
+wombat filter prepared.parquet \
   --pedigree families_pedigree.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o families_rare_variants \
-  -
+  -v
+
+# Step 3: Run additional filters without re-preparing
+wombat filter prepared.parquet \
+  --pedigree families_pedigree.tsv \
+  -F examples/de_novo_mutations.yml \
+  -o families_denovo \
+  -v
 ```
 
 ### 4. Custom Expression Filter
@@ -389,7 +460,7 @@ expression: "VEP_IMPACT = HIGH & (gnomad_AF < 0.0001 | gnomad_AF = null)"
 Apply:
 
 ```bash
-
+wombat filter input.tsv -F custom_filter.yml -o output
 ```
 
 ---
@@ -464,7 +535,7 @@ bcftools query -HH \
   annotated.split.bcf > annotated.tsv
 
 # 4. Process with PyWombat
-
+wombat filter annotated.tsv -F examples/rare_variants_high_impact.yml -o output
 ```
 
 **Why split-vep is required:**
@@ -481,7 +552,7 @@ For production workflows, these commands can be piped together:
 # Efficient pipeline (single pass through data)
 bcftools +split-vep -c - -p VEP_ input.vcf.gz | \
   bcftools query -HH -f '%CHROM\t%POS\t%REF\t%ALT\t%FILTER\t%INFO[\t%GT:%DP:%GQ:%AD]\n' | \
-
+  wombat filter - -F config.yml -o output
 ```
 
 **Note**: For multiple filter configurations, it's more efficient to save the intermediate TSV file rather than regenerating it each time.
@@ -517,11 +588,49 @@ Each configuration file is fully documented with:
 
 ## Performance Tips
 
-
-
+### For Large Files (>1GB or >50 samples)
+
+1. **Use the two-step workflow**: `wombat prepare` → `wombat filter`
+   - Reduces memory usage by 95%+ (4.2M variants → ~100 after early filtering)
+   - Pre-expands INFO fields once, reuse for multiple filter configurations
+   - Example: 38-sample family with 4.2M variants processes in <1 second with ~1.2GB RAM
+
+2. **Parquet format benefits**:
+   - Columnar storage enables selective column loading
+   - Pre-filtering before melting (expression filters applied before expanding to per-sample rows)
+   - **Per-chromosome processing for DNM**: Automatically processes DNM filtering chromosome-by-chromosome
+   - 30% smaller file size vs gzipped TSV
+
+3. **De Novo Mutation (DNM) filtering optimization**:
+   - Automatically uses per-chromosome processing when DNM mode is enabled
+   - Processes one chromosome at a time to reduce peak memory
+   - Applies frequency filters before melting to reduce data expansion
+   - Example: 38-sample family with 4.2M variants completes in 20 seconds with ~24GB RAM (vs 200GB+ OOM failure)
+
+### For All Files
+
 3. **Pre-filter with bcftools**: Filter by region/gene before PyWombat
 4. **Compressed input**: PyWombat handles `.gz` files natively
-5. **
+5. **Use verbose mode** (`-v`): Monitor progress and filtering statistics
+
+### Memory Comparison
+
+**Expression Filtering** (e.g., VEP_IMPACT filters):
+
+| Approach | 38 samples, 4.2M variants | Memory | Time |
+|----------|---------------------------|--------|------|
+| Direct TSV | ❌ OOM (>200GB) | 200+ GB | Failed |
+| TSV with chunking | ⚠️ Slow | ~30GB | ~3 min |
+| **Parquet + pre-filter** | ✅ **Optimal** | **~1.2GB** | **<1 sec** |
+
+**De Novo Mutation (DNM) Filtering**:
+
+| Approach | 38 samples, 4.2M variants | Memory | Time | Result |
+|----------|---------------------------|--------|------|--------|
+| Without optimization | ❌ OOM (>200GB) | 200+ GB | Failed | N/A |
+| **Parquet + per-chromosome** | ✅ **Success** | **~24GB** | **20 sec** | **6,788 DNM variants** |
+
+*DNM filtering requires sample-level data (cannot pre-filter before melting), but per-chromosome processing reduces peak memory by 88%.*
 
 ---
 
@@ -588,11 +697,15 @@ pywombat/
 
 **Issue**: Memory errors on large files
 
-- **Solution**:
+- **Solution**: Use the two-step workflow: `wombat prepare` then `wombat filter` for 95%+ memory reduction
+
+**Issue**: Command not found after upgrading
+
+- **Solution**: PyWombat now uses subcommands - use `wombat filter` instead of just `wombat`
 
 ### Getting Help
 
-1. Check `--help` for command options: `
+1. Check `--help` for command options: `wombat --help` or `wombat filter --help`
 2. Review example configurations in [`examples/`](examples/)
 3. Use `--debug` mode to inspect specific variants
 4. Use `--verbose` to see filtering steps
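The README's "applies filters before melting samples" benefit is a statement about operation ordering: variant-level predicates shrink the table while it is still one row per variant, so the wide-to-long reshape only multiplies the surviving rows by the sample count. A toy sketch of that reordering with made-up columns (polars ≥1.0 spells the melt operation `unpivot`; the polars>=0.19.0 floor in the requirements would use `melt`):

```python
# Filter-before-melt: dropping variants while the table is still wide means
# the expensive wide-to-long reshape only touches rows that pass the filter.
import polars as pl

wide = pl.DataFrame({
    "#CHROM": ["chr1", "chr1", "chr2"],
    "POS": [100, 200, 300],
    "VEP_IMPACT": ["HIGH", "LOW", "HIGH"],
    "S1:GT:DP": ["0/1:30", "0/0:25", "1/1:40"],  # toy per-sample columns
    "S2:GT:DP": ["0/0:28", "0/1:31", "0/1:22"],
})

# Sample columns are recognizable by the ":" in their names.
sample_cols = [c for c in wide.columns if ":" in c]

# 1. Filter on variant-level columns first (no sample data needed).
kept = wide.filter(pl.col("VEP_IMPACT") == "HIGH")

# 2. Only then melt samples into rows: output size is
#    kept variants × samples, not all variants × samples.
long = kept.unpivot(
    index=["#CHROM", "POS", "VEP_IMPACT"],
    on=sample_cols,
    variable_name="sample",
    value_name="values",
)
print(long)  # 2 variants × 2 samples = 4 rows instead of 6
```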
pywombat-1.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
+pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
+pywombat/cli.py,sha256=pEPvUTww5Nvj-WqSRZ0QEePnORrcYkhWJv3uVi5DnxM,93728
+pywombat-1.2.0.dist-info/METADATA,sha256=3TeUY6jzQCfrFaQ_BuocdB8374Esqwkoug9L-iZtLT0,21306
+pywombat-1.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+pywombat-1.2.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
+pywombat-1.2.0.dist-info/RECORD,,
pywombat-1.0.2.dist-info/RECORD
DELETED
@@ -1,6 +0,0 @@
-pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
-pywombat/cli.py,sha256=76pVpYYyl9rCm6TCq86j7xAqEC9pOvjiWCX9MmrBB_o,74994
-pywombat-1.0.2.dist-info/METADATA,sha256=slcagFwSvA99GVzKLRZggFPy5dkLEh_09O3dIB8Hfr4,17168
-pywombat-1.0.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-pywombat-1.0.2.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
-pywombat-1.0.2.dist-info/RECORD,,

{pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/WHEEL
File without changes

{pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/entry_points.txt
File without changes