pywombat 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pywombat/cli.py +372 -60
- {pywombat-1.0.1.dist-info → pywombat-1.1.0.dist-info}/METADATA +143 -48
- pywombat-1.1.0.dist-info/RECORD +6 -0
- pywombat-1.0.1.dist-info/RECORD +0 -6
- {pywombat-1.0.1.dist-info → pywombat-1.1.0.dist-info}/WHEEL +0 -0
- {pywombat-1.0.1.dist-info → pywombat-1.1.0.dist-info}/entry_points.txt +0 -0
pywombat/cli.py
CHANGED
@@ -11,13 +11,266 @@ import polars as pl
 import yaml
 
 
-@click.
+@click.group()
+def cli():
+    """
+    Wombat: A tool for processing bcftools tabulated TSV files.
+
+    \b
+    Commands:
+        filter   Process and filter variant data
+        prepare  Convert TSV to optimized Parquet format
+    """
+    pass
+
+
+@cli.command("prepare")
+@click.argument("input_file", type=click.Path(exists=True, path_type=Path))
+@click.option(
+    "-o",
+    "--output",
+    type=click.Path(path_type=Path),
+    required=True,
+    help="Output Parquet file path.",
+)
+@click.option(
+    "--chunk-size",
+    type=int,
+    default=50000,
+    help="Number of rows to process at a time (default: 50000).",
+)
+@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
+def prepare_cmd(
+    input_file: Path,
+    output: Path,
+    chunk_size: int,
+    verbose: bool,
+):
+    """
+    Convert bcftools TSV to optimized Parquet format.
+
+    This command pre-processes a TSV file by:
+
+    \b
+    1. Extracting all INFO fields from the '(null)' column into separate columns
+    2. Applying memory-efficient data types (Categorical for CHROM, UInt32 for POS)
+    3. Writing to Parquet format for efficient columnar access
+
+    The output Parquet file can then be used with 'wombat filter' for much faster
+    and more memory-efficient filtering, especially for large files.
+
+    \b
+    Examples:
+        wombat prepare input.tsv.gz -o prepared.parquet
+        wombat prepare input.tsv.gz -o prepared.parquet --chunk-size 100000
+    """
+    try:
+        if verbose:
+            click.echo(f"Preparing {input_file} -> {output}", err=True)
+
+        # Ensure output has .parquet extension
+        if not str(output).endswith(".parquet"):
+            output = Path(f"{output}.parquet")
+
+        # Process the file
+        prepare_parquet(input_file, output, chunk_size, verbose)
+
+        if verbose:
+            click.echo(f"Successfully created {output}", err=True)
+
+    except Exception as e:
+        click.echo(f"Error: {e}", err=True)
+        raise click.Abort()
+
+
+def prepare_parquet(
+    input_file: Path,
+    output: Path,
+    chunk_size: int = 50000,
+    verbose: bool = False,
+) -> None:
+    """
+    Convert a bcftools TSV file to Parquet with pre-expanded INFO fields.
+
+    Processes the file in chunks to handle large files without running out of memory.
+
+    Args:
+        input_file: Path to input TSV or TSV.gz file
+        output: Path to output Parquet file
+        chunk_size: Number of rows to process per chunk
+        verbose: Whether to print progress
+    """
+    from tqdm import tqdm
+
+    # First pass: discover all INFO fields
+    if verbose:
+        click.echo("Pass 1: Discovering INFO fields...", err=True)
+
+    all_fields = set()
+    all_flags = set()
+    total_lines = 0
+
+    is_gzipped = str(input_file).endswith(".gz")
+    opener = gzip.open if is_gzipped else open
+
+    with opener(input_file, "rt") as f:
+        header_line = f.readline().strip()
+        header_cols = header_line.split("\t")
+
+        # Find the (null) column index dynamically
+        null_col_idx = None
+        for i, col in enumerate(header_cols):
+            if col == "(null)":
+                null_col_idx = i
+                break
+
+        if null_col_idx is None:
+            if verbose:
+                click.echo("Warning: No (null) column found in input", err=True)
+        else:
+            for line in tqdm(f, desc="Scanning", disable=not verbose):
+                total_lines += 1
+                parts = line.split("\t")
+                if len(parts) > null_col_idx:
+                    null_value = parts[null_col_idx]
+                    if null_value and null_value != ".":
+                        pairs = null_value.split(";")
+                        for pair in pairs:
+                            if "=" in pair:
+                                field_name = pair.split("=", 1)[0]
+                                all_fields.add(field_name)
+                            elif pair.strip():
+                                all_flags.add(pair.strip())
+
+    if verbose:
+        click.echo(
+            f"Found {len(all_fields)} key-value fields and {len(all_flags)} flags in {total_lines} variants",
+            err=True,
+        )
+
+    # Second pass: process chunks and write Parquet
+    if verbose:
+        click.echo("Pass 2: Converting to Parquet...", err=True)
+
+    # Define memory-efficient dtypes
+    dtype_overrides = {
+        "#CHROM": pl.Categorical,
+        "POS": pl.UInt32,
+        "FILTER": pl.Categorical,
+    }
+
+    # Create a temporary directory for chunk files
+    import tempfile
+    import shutil
+
+    temp_dir = Path(tempfile.mkdtemp(prefix="wombat_prepare_"))
+    part_files = []
+
+    try:
+        with opener(input_file, "rt") as f:
+            header_line = f.readline().strip()
+
+            # Process in chunks
+            chunk_lines = []
+            pbar = tqdm(total=total_lines, desc="Converting", disable=not verbose)
+
+            for line in f:
+                chunk_lines.append(line)
+                if len(chunk_lines) >= chunk_size:
+                    df_chunk = _process_chunk(
+                        header_line, chunk_lines, all_fields, all_flags, dtype_overrides
+                    )
+                    part_file = temp_dir / f"part_{len(part_files):06d}.parquet"
+                    df_chunk.write_parquet(part_file)
+                    part_files.append(part_file)
+                    pbar.update(len(chunk_lines))
+                    chunk_lines = []
+
+            # Process remaining lines
+            if chunk_lines:
+                df_chunk = _process_chunk(
+                    header_line, chunk_lines, all_fields, all_flags, dtype_overrides
+                )
+                part_file = temp_dir / f"part_{len(part_files):06d}.parquet"
+                df_chunk.write_parquet(part_file)
+                part_files.append(part_file)
+                pbar.update(len(chunk_lines))
+
+            pbar.close()
+
+        # Combine all parts into final output using lazy scanning
+        if verbose:
+            click.echo(f"Combining {len(part_files)} parts into final output...", err=True)
+
+        if part_files:
+            # Use scan_parquet to lazily read all parts and write combined output
+            combined = pl.scan_parquet(part_files).collect()
+            combined.write_parquet(output)
+
+            if verbose:
+                click.echo(f"Wrote {len(part_files)} chunks to {output}", err=True)
+
+    finally:
+        # Clean up temporary directory
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+def _process_chunk(
+    header: str,
+    lines: list,
+    fields: set,
+    flags: set,
+    dtype_overrides: dict,
+) -> pl.DataFrame:
+    """Process a chunk of lines into a DataFrame with expanded INFO fields."""
+    import io
+
+    content = header + "\n" + "".join(lines)
+    df = pl.read_csv(
+        io.StringIO(content),
+        separator="\t",
+        infer_schema_length=10000,
+    )
+
+    # Expand INFO fields from (null) column
+    if "(null)" in df.columns:
+        # Extract key-value fields
+        for field in sorted(fields):
+            df = df.with_columns(
+                pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
+            )
+
+        # Extract boolean flags
+        for flag in sorted(flags):
+            df = df.with_columns(
+                pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag)
+            )
+
+        # Drop the original (null) column
+        df = df.drop("(null)")
+
+    # Drop CSQ column if it exists (redundant after expansion)
+    if "CSQ" in df.columns:
+        df = df.drop("CSQ")
+
+    # Apply memory-efficient dtypes
+    for col, dtype in dtype_overrides.items():
+        if col in df.columns:
+            try:
+                df = df.with_columns(pl.col(col).cast(dtype))
+            except Exception:
+                pass  # Skip if cast fails
+
+    return df
+
+
+@cli.command("filter")
 @click.argument("input_file", type=click.Path(exists=True, path_type=Path))
 @click.option(
     "-o",
     "--output",
     type=str,
-    help="Output file prefix. If not specified,
+    help="Output file prefix. If not specified, generates from input filename.",
 )
 @click.option(
     "-f",
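As context for the expansion loop in `_process_chunk`, here is a minimal, self-contained sketch of the same idea on invented data (the `AF`, `VEP_IMPACT`, and `DB` names are illustrative, not taken from the package):

```python
import polars as pl

# Invented rows mimicking the bcftools "(null)" INFO column.
df = pl.DataFrame({"(null)": ["AF=0.01;DB;VEP_IMPACT=HIGH", "AF=0.5", "."]})

# key=value pairs become their own columns (null where the key is absent)...
for field in ["AF", "VEP_IMPACT"]:
    df = df.with_columns(pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field))

# ...and bare flags become boolean columns.
for flag in ["DB"]:
    df = df.with_columns(pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag))

print(df.drop("(null)"))  # AF: "0.01"/"0.5"/null, VEP_IMPACT: "HIGH"/null/null, DB: true/false/false
```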
@@ -43,9 +296,9 @@ import yaml
 @click.option(
     "--debug",
     type=str,
-    help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013).
+    help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013).",
 )
-def
+def filter_cmd(
     input_file: Path,
     output: Optional[str],
     output_format: str,
@@ -55,36 +308,43 @@ def cli(
     debug: Optional[str],
 ):
     """
-
+    Process and filter variant data from TSV or Parquet files.
 
-
+    \b
+    Supports two input formats:
+    - TSV/TSV.gz: Full processing (INFO expansion + melting)
+    - Parquet: Fast processing (melting only, INFO already expanded)
 
     \b
-
-
-
-
-
-
-
-
-
+    For large files, use 'wombat prepare' first to convert to Parquet,
+    then use 'wombat filter' on the Parquet file for better performance.
+
+    \b
+    This command:
+    1. Expands the '(null)' column (TSV only) into separate columns
+    2. Melts sample columns into rows with sample names
+    3. Splits sample values (GT:DP:GQ:AD format) into separate columns
+    4. Applies quality and expression filters (if config provided)
 
     \b
     Examples:
-        wombat input.tsv -o output
-        wombat
-        wombat input.tsv
+        wombat filter input.tsv -o output
+        wombat filter prepared.parquet -o output -f parquet
+        wombat filter input.tsv -p pedigree.tsv -F config.yml
     """
     try:
         if verbose:
             click.echo(f"Reading input file: {input_file}", err=True)
 
-        # Detect
+        # Detect input format
+        is_parquet = str(input_file).endswith(".parquet")
         is_gzipped = str(input_file).endswith(".gz")
 
-        if verbose
-
+        if verbose:
+            if is_parquet:
+                click.echo("Detected Parquet input (pre-processed)", err=True)
+            elif is_gzipped:
+                click.echo("Detected gzipped TSV file", err=True)
 
         # Read pedigree file if provided
         pedigree_df = None
@@ -109,11 +369,11 @@ def cli(
         if output is None:
             # Generate default output prefix from input filename
             input_stem = input_file.name
-            # Remove
-
-
-
-
+            # Remove known extensions
+            for ext in [".tsv.gz", ".tsv", ".parquet"]:
+                if input_stem.endswith(ext):
+                    input_stem = input_stem[: -len(ext)]
+                    break
 
         # Add config name if filter is provided
         if filter_config:
@@ -126,24 +386,67 @@ def cli(
         if verbose:
             click.echo("Processing with streaming mode...", err=True)
 
-        # Build lazy query
-
-
-
-
-
-            "
-
-
-
-
-
-        )
+        # Build lazy query based on input format
+        if is_parquet:
+            # Parquet input: INFO fields already expanded by 'wombat prepare'
+            lazy_df = pl.scan_parquet(input_file)
+
+            # OPTIMIZATION: Apply expression filter BEFORE melting
+            # Expression filters (VEP_IMPACT, etc.) don't depend on sample data
+            if filter_config_data and "expression" in filter_config_data:
+                expression = filter_config_data["expression"]
+                if expression and verbose:
+                    click.echo(
+                        f"Applying expression filter before melting: {expression}",
+                        err=True,
+                    )
 
-
-
+                # Collect a small sample to get schema for expression parsing
+                schema_df = lazy_df.head(1).collect()
+                try:
+                    filter_expr = parse_impact_filter_expression(expression, schema_df)
+                    lazy_df = lazy_df.filter(filter_expr)
+
+                    # Count filtered variants
+                    if verbose:
+                        filtered_count = lazy_df.select(pl.len()).collect().item()
+                        click.echo(
+                            f"Variants after expression filter: {filtered_count}",
+                            err=True,
+                        )
+                except ValueError as e:
+                    if verbose:
+                        click.echo(
+                            f"Warning: Could not apply early filter: {e}", err=True
+                        )
+
+            # Now collect and melt (on filtered variants only)
+            df = lazy_df.collect()
+            formatted_df = format_bcftools_tsv_minimal(df, pedigree_df)
+            lazy_df = formatted_df.lazy()
+
+            # Remove expression from config so it's not applied again
+            if filter_config_data and "expression" in filter_config_data:
+                filter_config_data = filter_config_data.copy()
+                del filter_config_data["expression"]
+        else:
+            # TSV input: need full processing (melt + annotation expansion)
+            string_columns = [
+                "FID",
+                "sample_id",
+                "father_id",
+                "mother_id",
+                "FatherBarcode",
+                "MotherBarcode",
+                "sample",
+            ]
+            schema_overrides = {col: pl.Utf8 for col in string_columns}
+            lazy_df = pl.scan_csv(
+                input_file, separator="\t", schema_overrides=schema_overrides
+            )
+
+            # Apply formatting transformations (melt + expand annotations)
+            lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
 
         # Apply filters if provided
         if filter_config_data:
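The memory saving in the Parquet branch above comes from when the melt happens: with S sample columns, melting multiplies each variant row by S, so a variant-level expression filter applied first shrinks the melt input. A minimal sketch with invented data (`unpivot` is the current Polars name for melt; older releases call it `melt`):

```python
import polars as pl

df = pl.DataFrame({
    "#CHROM": ["chr1", "chr1", "chr2"],
    "POS": [100, 200, 300],
    "VEP_IMPACT": ["HIGH", "LOW", "HIGH"],
    "Sample1:GT:DP": ["0/1:30", "0/0:28", "1/1:25"],
    "Sample2:GT:DP": ["0/0:31", "0/1:27", "0/1:22"],
})

# Variant-level filter first: 3 variants -> 2.
kept = df.filter(pl.col("VEP_IMPACT") == "HIGH")

# Melt afterwards: 2 variants x 2 samples = 4 rows.
# Melting first would have produced 6 rows and filtered them later.
melted = kept.unpivot(
    index=["#CHROM", "POS", "VEP_IMPACT"],
    variable_name="sample_col",
    value_name="value",
)
print(melted)
```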
@@ -1198,15 +1501,18 @@ def read_pedigree(pedigree_path: Path) -> pl.DataFrame:
     pedigree_df = df.select(select_cols)
 
     # Replace 0 and -9 with null (indicating no parent)
+    # Explicit cast to Utf8 ensures type is preserved even when all values become null
     pedigree_df = pedigree_df.with_columns(
         [
             pl.when(pl.col("father_id").cast(pl.Utf8).is_in(["0", "-9"]))
             .then(None)
             .otherwise(pl.col("father_id"))
+            .cast(pl.Utf8)
             .alias("father_id"),
             pl.when(pl.col("mother_id").cast(pl.Utf8).is_in(["0", "-9"]))
             .then(None)
             .otherwise(pl.col("mother_id"))
+            .cast(pl.Utf8)
             .alias("mother_id"),
         ]
     )
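The new `.cast(pl.Utf8)` lines guard the failure mode described in the added comment; a minimal sketch of that mode with invented data: if every parent ID in a pedigree is 0 or -9, the column is read as integers, and without the cast the nulled-out result keeps the integer dtype.

```python
import polars as pl

ped = pl.DataFrame({"father_id": [0, -9, 0]})  # all "no parent" -> inferred Int64

ped = ped.with_columns(
    pl.when(pl.col("father_id").cast(pl.Utf8).is_in(["0", "-9"]))
    .then(None)
    .otherwise(pl.col("father_id"))
    .cast(pl.Utf8)  # without this, the result stays Int64 (all null)
    .alias("father_id")
)
print(ped.schema)  # father_id is a string column, even though every value is null
```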
@@ -1391,33 +1697,39 @@ def format_bcftools_tsv_minimal(
     Returns:
         Formatted DataFrame with melted samples (annotations still in (null) column)
     """
-    #
-
-
-
-    #
-
-
-
-
+    # Determine which columns are sample columns
+    # Sample columns have format "SampleName:GT:SampleName:DP:..." or similar
+    # Non-sample columns are standard VCF columns or annotation columns
+
+    # Standard VCF/annotation columns (not samples)
+    standard_cols = {
+        "#CHROM", "POS", "REF", "ALT", "FILTER", "(null)", "CSQ",
+        "QUAL", "ID", "INFO", "FORMAT"
+    }
 
-    #
+    # Find sample columns by looking for columns with ":" in the name
+    # that aren't standard columns
     sample_cols = []
     sample_names = []
 
-    for col in
-        # Skip
-        if col
+    for col in df.columns:
+        # Skip standard columns
+        if col in standard_cols:
             continue
 
+        # Skip columns that look like VEP annotation fields
+        if col.startswith("VEP_") or col.startswith("AF") or col.startswith("AC"):
+            continue
+
+        # Sample columns typically have ":" in them (GT:DP:GQ:AD format)
         if ":" in col:
             sample_name = col.split(":", 1)[0]
             sample_cols.append(col)
             sample_names.append(sample_name)
-
-        #
-
-
+        elif col not in df.columns[:10]:
+            # Columns after position 10 that don't match known patterns might be samples
+            # This is a heuristic for unusual sample column formats
+            pass
 
     if not sample_cols:
         # No sample columns to melt
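Downstream of this detection, the filter command splits the packed per-sample values into separate columns (step 3 in the filter docstring earlier in this diff). A hypothetical sketch of such a split, not the package's exact code:

```python
import polars as pl

melted = pl.DataFrame({
    "sample": ["Sample1", "Sample2"],
    "value": ["0/1:30:99:15,15", "1/1:25:80:0,25"],  # invented GT:DP:GQ:AD values
})

parts = pl.col("value").str.split(":")
result = melted.with_columns(
    parts.list.get(0).alias("GT"),
    parts.list.get(1).cast(pl.Int64).alias("DP"),
    parts.list.get(2).cast(pl.Int64).alias("GQ"),
    parts.list.get(3).alias("AD"),  # allelic depths kept as "ref,alt" text
).drop("value")
print(result)
```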
{pywombat-1.0.1.dist-info → pywombat-1.1.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pywombat
-Version: 1.0.1
+Version: 1.1.0
 Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
 Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
 Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
@@ -18,6 +18,9 @@ Requires-Dist: click>=8.1.0
 Requires-Dist: polars>=0.19.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: tqdm>=4.67.1
+Provides-Extra: dev
+Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
+Requires-Dist: pytest>=7.0.0; extra == 'dev'
 Description-Content-Type: text/markdown
 
 # PyWombat 🦘
@@ -29,14 +32,15 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
 
 ## Features
 
-✨ **Fast Processing**: Uses Polars for efficient data handling
-🔬 **Quality Filtering**: Configurable depth, quality, and VAF thresholds
-👨👩👧 **Pedigree Support**: Trio and family analysis with parent genotypes
-🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
-📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
-🎯 **Expression Filters**: Complex filtering with logical expressions
-🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
-⚡ **
+✨ **Fast Processing**: Uses Polars for efficient data handling
+🔬 **Quality Filtering**: Configurable depth, quality, and VAF thresholds
+👨👩👧 **Pedigree Support**: Trio and family analysis with parent genotypes
+🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
+📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
+🎯 **Expression Filters**: Complex filtering with logical expressions
+🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
+⚡ **Memory Optimized**: Two-step workflow for large files (prepare → filter)
+💾 **Parquet Support**: Pre-process large files for repeated, memory-efficient analysis
 
 ---
 
@@ -47,17 +51,37 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
 Use `uvx` to run PyWombat without installation:
 
 ```bash
-# Basic
-uvx pywombat input.tsv -o output
+# Basic filtering
+uvx pywombat filter input.tsv -o output
 
-# With
-uvx pywombat input.tsv -F examples/rare_variants_high_impact.yml -o output
+# With filter configuration
+uvx pywombat filter input.tsv -F examples/rare_variants_high_impact.yml -o output
 
 # De novo mutation detection
-uvx pywombat input.tsv --pedigree pedigree.tsv \
+uvx pywombat filter input.tsv --pedigree pedigree.tsv \
   -F examples/de_novo_mutations.yml -o denovo
 ```
 
+### For Large Files (>1GB or >50 samples)
+
+Use the two-step workflow for memory-efficient processing:
+
+```bash
+# Step 1: Prepare (one-time preprocessing)
+uvx pywombat prepare input.tsv.gz -o prepared.parquet
+
+# Step 2: Filter (fast, memory-efficient, can be run multiple times)
+uvx pywombat filter prepared.parquet \
+  -p pedigree.tsv \
+  -F config.yml \
+  -o filtered
+```
+
+**Benefits:**
+- Pre-expands INFO fields once (saves time on repeated filtering)
+- Applies filters before melting samples (reduces memory by 95%+)
+- Parquet format enables fast columnar access
+
 ### Installation for Development/Repeated Use
 
 ```bash
@@ -69,7 +93,7 @@ cd pywombat
 uv sync
 
 # Run with uv run
-uv run wombat input.tsv -o output
+uv run wombat filter input.tsv -o output
 ```
 
 ---
@@ -114,25 +138,62 @@ chr1	100	A	T	2	0.5	30	true	Sample2	1/1	18	99
 
 ---
 
-##
+## Commands
+
+PyWombat has two main commands:
+
+### `wombat prepare` - Preprocess Large Files
+
+Converts TSV/TSV.gz to optimized Parquet format with pre-expanded INFO fields:
+
+```bash
+# Basic usage
+wombat prepare input.tsv.gz -o prepared.parquet
+
+# With verbose output
+wombat prepare input.tsv.gz -o prepared.parquet -v
+
+# Adjust chunk size for memory constraints
+wombat prepare input.tsv.gz -o prepared.parquet --chunk-size 25000
+```
+
+**What it does:**
+- Extracts all INFO fields (VEP_*, AF, etc.) as separate columns
+- Keeps samples in wide format (not melted yet)
+- Writes memory-efficient Parquet format
+- Processes in chunks to handle files of any size
+
+**When to use:**
+- Files >1GB or >50 samples
+- Large families (>10 members)
+- Running multiple filter configurations
+- Repeated analysis of the same dataset
+
+### `wombat filter` - Process and Filter Data
 
-
+Transforms and filters variant data (works with both TSV and Parquet input):
 
 ```bash
-#
-
+# Basic filtering (TSV input)
+wombat filter input.tsv -o output
+
+# From prepared Parquet (faster, more memory-efficient)
+wombat filter prepared.parquet -o output
+
+# With filter configuration
+wombat filter input.tsv -F config.yml -o output
 
-#
-
+# With pedigree
+wombat filter input.tsv -p pedigree.tsv -o output
 
 # Compressed output
-
+wombat filter input.tsv -o output -f tsv.gz
 
-# Parquet
-
+# Parquet output
+wombat filter input.tsv -o output -f parquet
 
 # With verbose output
-
+wombat filter input.tsv -o output -v
 ```
 
 ### With Pedigree (Trio/Family Analysis)
@@ -140,7 +201,7 @@ uvx pywombat input.tsv -o output --verbose
 Add parent genotype information for inheritance analysis:
 
 ```bash
-
+wombat filter input.tsv --pedigree pedigree.tsv -o output
 ```
 
 **Pedigree File Format** (tab-separated):
@@ -178,7 +239,7 @@ PyWombat supports two types of filtering:
 Filter for ultra-rare, high-impact variants:
 
 ```bash
-
+wombat filter input.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o rare_variants
 ```
@@ -210,7 +271,7 @@ expression: "VEP_CANONICAL = YES & VEP_IMPACT = HIGH & VEP_LoF = HC & VEP_LoF_fl
 Identify de novo mutations in trio data:
 
 ```bash
-
+wombat filter input.tsv \
   --pedigree pedigree.tsv \
   -F examples/de_novo_mutations.yml \
   -o denovo
@@ -290,7 +351,7 @@ expression: "VEP_IMPACT = HIGH & VEP_CANONICAL = YES & gnomad_AF < 0.01 & CADD_P
 Inspect specific variants for troubleshooting:
 
 ```bash
-
+wombat filter input.tsv \
   -F config.yml \
   --debug chr11:70486013
 ```
@@ -309,20 +370,20 @@ Shows:
 ### TSV (Default)
 
 ```bash
-
-
+wombat filter input.tsv -o output         # Creates output.tsv
+wombat filter input.tsv -o output -f tsv  # Same as above
 ```
 
 ### Compressed TSV
 
 ```bash
-
+wombat filter input.tsv -o output -f tsv.gz  # Creates output.tsv.gz
 ```
 
 ### Parquet (Fastest for Large Files)
 
 ```bash
-
+wombat filter input.tsv -o output -f parquet  # Creates output.parquet
 ```
 
 **When to use Parquet:**
@@ -340,7 +401,7 @@ uvx pywombat input.tsv -o output -f parquet # Creates output.parquet
 
 ```bash
 # Step 1: Filter for rare, high-impact variants
-
+wombat filter cohort.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o rare_variants
 
@@ -352,24 +413,34 @@ uvx pywombat cohort.tsv \
 
 ```bash
 # Identify de novo mutations in autism cohort
-
+wombat filter autism_trios.tsv \
   --pedigree autism_pedigree.tsv \
   -F examples/de_novo_mutations.yml \
   -o autism_denovo \
-
+  -v
 
 # Review output for genes in autism risk lists
 ```
 
-### 3. Multi-Family
+### 3. Large Multi-Family Analysis (Memory-Optimized)
 
 ```bash
-#
-
+# Step 1: Prepare once (preprocesses INFO fields)
+wombat prepare large_cohort.tsv.gz -o prepared.parquet -v
+
+# Step 2: Filter with different configurations (fast, memory-efficient)
+wombat filter prepared.parquet \
   --pedigree families_pedigree.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o families_rare_variants \
-  -
+  -v
+
+# Step 3: Run additional filters without re-preparing
+wombat filter prepared.parquet \
+  --pedigree families_pedigree.tsv \
+  -F examples/de_novo_mutations.yml \
+  -o families_denovo \
+  -v
 ```
 
 ### 4. Custom Expression Filter
@@ -389,7 +460,7 @@ expression: "VEP_IMPACT = HIGH & (gnomad_AF < 0.0001 | gnomad_AF = null)"
 Apply:
 
 ```bash
-
+wombat filter input.tsv -F custom_filter.yml -o output
 ```
 
 ---
@@ -464,7 +535,7 @@ bcftools query -HH \
   annotated.split.bcf > annotated.tsv
 
 # 4. Process with PyWombat
-
+wombat filter annotated.tsv -F examples/rare_variants_high_impact.yml -o output
 ```
 
 **Why split-vep is required:**
@@ -481,7 +552,7 @@ For production workflows, these commands can be piped together:
 # Efficient pipeline (single pass through data)
 bcftools +split-vep -c - -p VEP_ input.vcf.gz | \
   bcftools query -HH -f '%CHROM\t%POS\t%REF\t%ALT\t%FILTER\t%INFO[\t%GT:%DP:%GQ:%AD]\n' | \
-
+  wombat filter - -F config.yml -o output
 ```
 
 **Note**: For multiple filter configurations, it's more efficient to save the intermediate TSV file rather than regenerating it each time.
@@ -517,11 +588,31 @@ Each configuration file is fully documented with:
 
 ## Performance Tips
 
-
-
+### For Large Files (>1GB or >50 samples)
+
+1. **Use the two-step workflow**: `wombat prepare` → `wombat filter`
+   - Reduces memory usage by 95%+ (4.2M variants → ~100 after early filtering)
+   - Pre-expands INFO fields once, reuse for multiple filter configurations
+   - Example: a 38-sample family with 4.2M variants processes in <1 second with ~1.2GB RAM
+
+2. **Parquet format benefits**:
+   - Columnar storage enables selective column loading
+   - Pre-filtering before melting (expression filters applied before expanding to per-sample rows)
+   - 30% smaller file size vs gzipped TSV
+
+### For All Files
+
 3. **Pre-filter with bcftools**: Filter by region/gene before PyWombat
 4. **Compressed input**: PyWombat handles `.gz` files natively
-5. **
+5. **Use verbose mode** (`-v`): Monitor progress and filtering statistics
+
+### Memory Comparison
+
+| Approach | 38 samples, 4.2M variants | Memory | Time |
+|----------|---------------------------|--------|------|
+| Direct TSV | ❌ OOM (>200GB) | 200+ GB | Failed |
+| TSV with chunking | ⚠️ Slow | ~30GB | ~3 min |
+| **Parquet + pre-filter** | ✅ **Optimal** | **~1.2GB** | **<1 sec** |
 
 ---
 
@@ -588,11 +679,15 @@ pywombat/
 
 **Issue**: Memory errors on large files
 
-- **Solution**:
+- **Solution**: Use the two-step workflow: `wombat prepare` then `wombat filter` for 95%+ memory reduction
+
+**Issue**: Command not found after upgrading
+
+- **Solution**: PyWombat now uses subcommands - use `wombat filter` instead of just `wombat`
 
 ### Getting Help
 
-1. Check `--help` for command options: `
+1. Check `--help` for command options: `wombat --help` or `wombat filter --help`
 2. Review example configurations in [`examples/`](examples/)
 3. Use `--debug` mode to inspect specific variants
 4. Use `--verbose` to see filtering steps
pywombat-1.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
+pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
+pywombat/cli.py,sha256=-tzD2UJxlByP8aE5uSZ1C6UvgoriJqPMXRNs7xY65nE,85545
+pywombat-1.1.0.dist-info/METADATA,sha256=lYL6me-3Cw1wDa_yFdRX5Qj4cre6GMpY3Uqjy0LRwLg,20289
+pywombat-1.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+pywombat-1.1.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
+pywombat-1.1.0.dist-info/RECORD,,
pywombat-1.0.1.dist-info/RECORD
DELETED
@@ -1,6 +0,0 @@
-pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
-pywombat/cli.py,sha256=eaChYSTxEc3lXxVRKe3X8bRGKmgxUE0Vuy9Cr5wPTi4,74853
-pywombat-1.0.1.dist-info/METADATA,sha256=G0xdJEOwfB-J1ZOy6qphijM4JBygZppMeRs0J8mzSj0,17168
-pywombat-1.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-pywombat-1.0.1.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
-pywombat-1.0.1.dist-info/RECORD,,
{pywombat-1.0.1.dist-info → pywombat-1.1.0.dist-info}/WHEEL
File without changes

{pywombat-1.0.1.dist-info → pywombat-1.1.0.dist-info}/entry_points.txt
File without changes