pywombat 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pywombat/cli.py CHANGED
@@ -11,13 +11,266 @@ import polars as pl
 import yaml
 
 
-@click.command()
+@click.group()
+def cli():
+    """
+    Wombat: A tool for processing bcftools tabulated TSV files.
+
+    \b
+    Commands:
+        filter   Process and filter variant data
+        prepare  Convert TSV to optimized Parquet format
+    """
+    pass
+
+
+@cli.command("prepare")
+@click.argument("input_file", type=click.Path(exists=True, path_type=Path))
+@click.option(
+    "-o",
+    "--output",
+    type=click.Path(path_type=Path),
+    required=True,
+    help="Output Parquet file path.",
+)
+@click.option(
+    "--chunk-size",
+    type=int,
+    default=50000,
+    help="Number of rows to process at a time (default: 50000).",
+)
+@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
+def prepare_cmd(
+    input_file: Path,
+    output: Path,
+    chunk_size: int,
+    verbose: bool,
+):
+    """
+    Convert bcftools TSV to optimized Parquet format.
+
+    This command pre-processes a TSV file by:
+
+    \b
+    1. Extracting all INFO fields from the '(null)' column into separate columns
+    2. Applying memory-efficient data types (Categorical for CHROM, UInt32 for POS)
+    3. Writing to Parquet format for efficient columnar access
+
+    The output Parquet file can then be used with 'wombat filter' for much faster
+    and more memory-efficient filtering, especially for large files.
+
+    \b
+    Examples:
+        wombat prepare input.tsv.gz -o prepared.parquet
+        wombat prepare input.tsv.gz -o prepared.parquet --chunk-size 100000
+    """
+    try:
+        if verbose:
+            click.echo(f"Preparing {input_file} -> {output}", err=True)
+
+        # Ensure output has .parquet extension
+        if not str(output).endswith(".parquet"):
+            output = Path(f"{output}.parquet")
+
+        # Process the file
+        prepare_parquet(input_file, output, chunk_size, verbose)
+
+        if verbose:
+            click.echo(f"Successfully created {output}", err=True)
+
+    except Exception as e:
+        click.echo(f"Error: {e}", err=True)
+        raise click.Abort()
+
+
+def prepare_parquet(
+    input_file: Path,
+    output: Path,
+    chunk_size: int = 50000,
+    verbose: bool = False,
+) -> None:
+    """
+    Convert a bcftools TSV file to Parquet with pre-expanded INFO fields.
+
+    Processes the file in chunks to handle large files without running out of memory.
+
+    Args:
+        input_file: Path to input TSV or TSV.gz file
+        output: Path to output Parquet file
+        chunk_size: Number of rows to process per chunk
+        verbose: Whether to print progress
+    """
+    from tqdm import tqdm
+
+    # First pass: discover all INFO fields
+    if verbose:
+        click.echo("Pass 1: Discovering INFO fields...", err=True)
+
+    all_fields = set()
+    all_flags = set()
+    total_lines = 0
+
+    is_gzipped = str(input_file).endswith(".gz")
+    opener = gzip.open if is_gzipped else open
+
+    with opener(input_file, "rt") as f:
+        header_line = f.readline().strip()
+        header_cols = header_line.split("\t")
+
+        # Find the (null) column index dynamically
+        null_col_idx = None
+        for i, col in enumerate(header_cols):
+            if col == "(null)":
+                null_col_idx = i
+                break
+
+        if null_col_idx is None:
+            if verbose:
+                click.echo("Warning: No (null) column found in input", err=True)
+        else:
+            for line in tqdm(f, desc="Scanning", disable=not verbose):
+                total_lines += 1
+                parts = line.split("\t")
+                if len(parts) > null_col_idx:
+                    null_value = parts[null_col_idx]
+                    if null_value and null_value != ".":
+                        pairs = null_value.split(";")
+                        for pair in pairs:
+                            if "=" in pair:
+                                field_name = pair.split("=", 1)[0]
+                                all_fields.add(field_name)
+                            elif pair.strip():
+                                all_flags.add(pair.strip())
+
+    if verbose:
+        click.echo(
+            f"Found {len(all_fields)} key-value fields and {len(all_flags)} flags in {total_lines} variants",
+            err=True,
+        )
+
+    # Second pass: process chunks and write Parquet
+    if verbose:
+        click.echo("Pass 2: Converting to Parquet...", err=True)
+
+    # Define memory-efficient dtypes
+    dtype_overrides = {
+        "#CHROM": pl.Categorical,
+        "POS": pl.UInt32,
+        "FILTER": pl.Categorical,
+    }
+
+    # Create a temporary directory for chunk files
+    import tempfile
+    import shutil
+
+    temp_dir = Path(tempfile.mkdtemp(prefix="wombat_prepare_"))
+    part_files = []
+
+    try:
+        with opener(input_file, "rt") as f:
+            header_line = f.readline().strip()
+
+            # Process in chunks
+            chunk_lines = []
+            pbar = tqdm(total=total_lines, desc="Converting", disable=not verbose)
+
+            for line in f:
+                chunk_lines.append(line)
+                if len(chunk_lines) >= chunk_size:
+                    df_chunk = _process_chunk(
+                        header_line, chunk_lines, all_fields, all_flags, dtype_overrides
+                    )
+                    part_file = temp_dir / f"part_{len(part_files):06d}.parquet"
+                    df_chunk.write_parquet(part_file)
+                    part_files.append(part_file)
+                    pbar.update(len(chunk_lines))
+                    chunk_lines = []
+
+            # Process remaining lines
+            if chunk_lines:
+                df_chunk = _process_chunk(
+                    header_line, chunk_lines, all_fields, all_flags, dtype_overrides
+                )
+                part_file = temp_dir / f"part_{len(part_files):06d}.parquet"
+                df_chunk.write_parquet(part_file)
+                part_files.append(part_file)
+                pbar.update(len(chunk_lines))
+
+            pbar.close()
+
+        # Combine all parts into final output using lazy scanning
+        if verbose:
+            click.echo(f"Combining {len(part_files)} parts into final output...", err=True)
+
+        if part_files:
+            # Use scan_parquet to lazily read all parts and write combined output
+            combined = pl.scan_parquet(part_files).collect()
+            combined.write_parquet(output)
+
+            if verbose:
+                click.echo(f"Wrote {len(part_files)} chunks to {output}", err=True)
+
+    finally:
+        # Clean up temporary directory
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+def _process_chunk(
+    header: str,
+    lines: list,
+    fields: set,
+    flags: set,
+    dtype_overrides: dict,
+) -> pl.DataFrame:
+    """Process a chunk of lines into a DataFrame with expanded INFO fields."""
+    import io
+
+    content = header + "\n" + "".join(lines)
+    df = pl.read_csv(
+        io.StringIO(content),
+        separator="\t",
+        infer_schema_length=10000,
+    )
+
+    # Expand INFO fields from (null) column
+    if "(null)" in df.columns:
+        # Extract key-value fields
+        for field in sorted(fields):
+            df = df.with_columns(
+                pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
+            )
+
+        # Extract boolean flags
+        for flag in sorted(flags):
+            df = df.with_columns(
+                pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag)
+            )
+
+        # Drop the original (null) column
+        df = df.drop("(null)")
+
+    # Drop CSQ column if it exists (redundant after expansion)
+    if "CSQ" in df.columns:
+        df = df.drop("CSQ")
+
+    # Apply memory-efficient dtypes
+    for col, dtype in dtype_overrides.items():
+        if col in df.columns:
+            try:
+                df = df.with_columns(pl.col(col).cast(dtype))
+            except Exception:
+                pass  # Skip if cast fails
+
+    return df
+
+
+@cli.command("filter")
 @click.argument("input_file", type=click.Path(exists=True, path_type=Path))
 @click.option(
     "-o",
     "--output",
     type=str,
-    help="Output file prefix. If not specified, prints to stdout.",
+    help="Output file prefix. If not specified, generates from input filename.",
 )
 @click.option(
     "-f",
@@ -43,9 +296,9 @@ import yaml
 @click.option(
     "--debug",
     type=str,
-    help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013). Displays #CHROM, POS, VEP_SYMBOL, and columns from filter expression.",
+    help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013).",
 )
-def cli(
+def filter_cmd(
     input_file: Path,
     output: Optional[str],
     output_format: str,
@@ -55,36 +308,43 @@ def cli(
     debug: Optional[str],
 ):
     """
-    Wombat: A tool for processing bcftools tabulated TSV files.
+    Process and filter variant data from TSV or Parquet files.
 
-    This command:
+    \b
+    Supports two input formats:
+    - TSV/TSV.gz: Full processing (INFO expansion + melting)
+    - Parquet: Fast processing (melting only, INFO already expanded)
 
     \b
-    1. Expands the '(null)' column containing NAME=value pairs separated by ';'
-    2. Preserves the CSQ (Consequence) column without melting
-    3. Melts sample columns into rows with sample names
-    4. Splits sample values (GT:DP:GQ:AD format) into separate columns:
-       - sample_gt: Genotype
-       - sample_dp: Read depth
-       - sample_gq: Genotype quality
-       - sample_ad: Allele depth (second value from comma-separated list)
-       - sample_vaf: Variant allele frequency (sample_ad / sample_dp)
+    For large files, use 'wombat prepare' first to convert to Parquet,
+    then use 'wombat filter' on the Parquet file for better performance.
+
+    \b
+    This command:
+    1. Expands the '(null)' column (TSV only) into separate columns
+    2. Melts sample columns into rows with sample names
+    3. Splits sample values (GT:DP:GQ:AD format) into separate columns
+    4. Applies quality and expression filters (if config provided)
 
     \b
     Examples:
-        wombat input.tsv -o output
-        wombat input.tsv -o output -f parquet
-        wombat input.tsv > output.tsv
+        wombat filter input.tsv -o output
+        wombat filter prepared.parquet -o output -f parquet
+        wombat filter input.tsv -p pedigree.tsv -F config.yml
     """
     try:
         if verbose:
             click.echo(f"Reading input file: {input_file}", err=True)
 
-        # Detect if file is gzipped based on extension
+        # Detect input format
+        is_parquet = str(input_file).endswith(".parquet")
         is_gzipped = str(input_file).endswith(".gz")
 
-        if verbose and is_gzipped:
-            click.echo("Detected gzipped file", err=True)
+        if verbose:
+            if is_parquet:
+                click.echo("Detected Parquet input (pre-processed)", err=True)
+            elif is_gzipped:
+                click.echo("Detected gzipped TSV file", err=True)
 
         # Read pedigree file if provided
         pedigree_df = None
@@ -109,11 +369,11 @@ def cli(
         if output is None:
             # Generate default output prefix from input filename
             input_stem = input_file.name
-            # Remove .tsv.gz or .tsv extension
-            if input_stem.endswith(".tsv.gz"):
-                input_stem = input_stem[:-7]  # Remove .tsv.gz
-            elif input_stem.endswith(".tsv"):
-                input_stem = input_stem[:-4]  # Remove .tsv
+            # Remove known extensions
+            for ext in [".tsv.gz", ".tsv", ".parquet"]:
+                if input_stem.endswith(ext):
+                    input_stem = input_stem[: -len(ext)]
+                    break
 
             # Add config name if filter is provided
             if filter_config:
@@ -126,24 +386,67 @@ def cli(
         if verbose:
             click.echo("Processing with streaming mode...", err=True)
 
-        # Build lazy query
-        # Force certain columns to string type
-        string_columns = [
-            "FID",
-            "sample_id",
-            "father_id",
-            "mother_id",
-            "FatherBarcode",
-            "MotherBarcode",
-            "sample",
-        ]
-        schema_overrides = {col: pl.Utf8 for col in string_columns}
-        lazy_df = pl.scan_csv(
-            input_file, separator="\t", schema_overrides=schema_overrides
-        )
+        # Build lazy query based on input format
+        if is_parquet:
+            # Parquet input: INFO fields already expanded by 'wombat prepare'
+            lazy_df = pl.scan_parquet(input_file)
+
+            # OPTIMIZATION: Apply expression filter BEFORE melting
+            # Expression filters (VEP_IMPACT, etc.) don't depend on sample data
+            if filter_config_data and "expression" in filter_config_data:
+                expression = filter_config_data["expression"]
+                if expression and verbose:
+                    click.echo(
+                        f"Applying expression filter before melting: {expression}",
+                        err=True,
+                    )
 
-        # Apply formatting transformations
-        lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
+                # Collect a small sample to get schema for expression parsing
+                schema_df = lazy_df.head(1).collect()
+                try:
+                    filter_expr = parse_impact_filter_expression(expression, schema_df)
+                    lazy_df = lazy_df.filter(filter_expr)
+
+                    # Count filtered variants
+                    if verbose:
+                        filtered_count = lazy_df.select(pl.len()).collect().item()
+                        click.echo(
+                            f"Variants after expression filter: {filtered_count}",
+                            err=True,
+                        )
+                except ValueError as e:
+                    if verbose:
+                        click.echo(
+                            f"Warning: Could not apply early filter: {e}", err=True
+                        )
+
+            # Now collect and melt (on filtered variants only)
+            df = lazy_df.collect()
+            formatted_df = format_bcftools_tsv_minimal(df, pedigree_df)
+            lazy_df = formatted_df.lazy()
+
+            # Remove expression from config so it's not applied again
+            if filter_config_data and "expression" in filter_config_data:
+                filter_config_data = filter_config_data.copy()
+                del filter_config_data["expression"]
+        else:
+            # TSV input: need full processing (melt + annotation expansion)
+            string_columns = [
+                "FID",
+                "sample_id",
+                "father_id",
+                "mother_id",
+                "FatherBarcode",
+                "MotherBarcode",
+                "sample",
+            ]
+            schema_overrides = {col: pl.Utf8 for col in string_columns}
+            lazy_df = pl.scan_csv(
+                input_file, separator="\t", schema_overrides=schema_overrides
+            )
+
+            # Apply formatting transformations (melt + expand annotations)
+            lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
 
         # Apply filters if provided
         if filter_config_data:
@@ -1394,33 +1697,39 @@ def format_bcftools_tsv_minimal(
     Returns:
         Formatted DataFrame with melted samples (annotations still in (null) column)
     """
-    # Find the (null) column
-    if "(null)" not in df.columns:
-        raise ValueError("Column '(null)' not found in the input file")
-
-    # Get column index of (null)
-    null_col_idx = df.columns.index("(null)")
-
-    # Split columns into: before (null), (null), and after (null)
-    cols_after = df.columns[null_col_idx + 1 :]
+    # Determine which columns are sample columns
+    # Sample columns have format "SampleName:GT:SampleName:DP:..." or similar
+    # Non-sample columns are standard VCF columns or annotation columns
+
+    # Standard VCF/annotation columns (not samples)
+    standard_cols = {
+        "#CHROM", "POS", "REF", "ALT", "FILTER", "(null)", "CSQ",
+        "QUAL", "ID", "INFO", "FORMAT"
+    }
 
-    # Step 1: Identify sample columns (SKIP annotation expansion)
+    # Find sample columns by looking for columns with ":" in the name
+    # that aren't standard columns
     sample_cols = []
     sample_names = []
 
-    for col in cols_after:
-        # Skip CSQ column
-        if col == "CSQ":
+    for col in df.columns:
+        # Skip standard columns
+        if col in standard_cols:
             continue
 
+        # Skip columns that look like VEP annotation fields
+        if col.startswith("VEP_") or col.startswith("AF") or col.startswith("AC"):
+            continue
+
+        # Sample columns typically have ":" in them (GT:DP:GQ:AD format)
         if ":" in col:
             sample_name = col.split(":", 1)[0]
             sample_cols.append(col)
             sample_names.append(sample_name)
-        else:
-            # If no colon, treat the whole column name as sample name
-            sample_cols.append(col)
-            sample_names.append(col)
+        elif col not in df.columns[:10]:
+            # Columns after position 10 that don't match known patterns might be samples
+            # This is a heuristic for unusual sample column formats
+            pass
 
     if not sample_cols:
         # No sample columns to melt
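
To make the new two-pass `prepare` logic concrete, here is a minimal standalone sketch of the INFO expansion performed by `_process_chunk` in the diff above, using the same polars calls (`str.extract` for `key=value` entries, `str.contains` for bare flags). The frame and field values are made up for illustration; they are not taken from the package:

```python
import polars as pl

# Toy rows mimicking bcftools output, with INFO packed into the "(null)" column.
df = pl.DataFrame({
    "#CHROM": ["chr1", "chr1"],
    "POS": [100, 200],
    "(null)": ["AF=0.01;DB;VEP_IMPACT=HIGH", "AF=0.20;VEP_IMPACT=LOW"],
})

fields = ["AF", "VEP_IMPACT"]  # key=value names a first pass would discover
flags = ["DB"]                 # bare flags a first pass would discover

# key=value entries become string columns (null where the key is absent)
for field in fields:
    df = df.with_columns(
        pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
    )

# bare flags become boolean columns
for flag in flags:
    df = df.with_columns(
        pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag)
    )

print(df.drop("(null)"))  # AF and VEP_IMPACT as strings, DB as true/false
```
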
pywombat-1.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pywombat
-Version: 1.0.2
+Version: 1.1.0
 Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
 Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
 Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
@@ -18,6 +18,9 @@ Requires-Dist: click>=8.1.0
 Requires-Dist: polars>=0.19.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: tqdm>=4.67.1
+Provides-Extra: dev
+Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
+Requires-Dist: pytest>=7.0.0; extra == 'dev'
 Description-Content-Type: text/markdown
 
 # PyWombat 🦘
@@ -29,14 +32,15 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
 
 ## Features
 
-✨ **Fast Processing**: Uses Polars for efficient data handling
-🔬 **Quality Filtering**: Configurable depth, quality, and VAF thresholds
-👨‍👩‍👧 **Pedigree Support**: Trio and family analysis with parent genotypes
-🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
-📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
-🎯 **Expression Filters**: Complex filtering with logical expressions
-🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
-⚡ **Streaming Mode**: Memory-efficient processing of large files
+✨ **Fast Processing**: Uses Polars for efficient data handling
+🔬 **Quality Filtering**: Configurable depth, quality, and VAF thresholds
+👨‍👩‍👧 **Pedigree Support**: Trio and family analysis with parent genotypes
+🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
+📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
+🎯 **Expression Filters**: Complex filtering with logical expressions
+🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
+⚡ **Memory Optimized**: Two-step workflow for large files (prepare → filter)
+💾 **Parquet Support**: Pre-process large files for repeated, memory-efficient analysis
 
 ---
 
@@ -47,17 +51,37 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
 Use `uvx` to run PyWombat without installation:
 
 ```bash
-# Basic formatting
-uvx pywombat input.tsv -o output
+# Basic filtering
+uvx pywombat filter input.tsv -o output
 
-# With filtering
-uvx pywombat input.tsv -F examples/rare_variants_high_impact.yml -o output
+# With filter configuration
+uvx pywombat filter input.tsv -F examples/rare_variants_high_impact.yml -o output
 
 # De novo mutation detection
-uvx pywombat input.tsv --pedigree pedigree.tsv \
+uvx pywombat filter input.tsv --pedigree pedigree.tsv \
   -F examples/de_novo_mutations.yml -o denovo
 ```
 
+### For Large Files (>1GB or >50 samples)
+
+Use the two-step workflow for memory-efficient processing:
+
+```bash
+# Step 1: Prepare (one-time preprocessing)
+uvx pywombat prepare input.tsv.gz -o prepared.parquet
+
+# Step 2: Filter (fast, memory-efficient, can be run multiple times)
+uvx pywombat filter prepared.parquet \
+  -p pedigree.tsv \
+  -F config.yml \
+  -o filtered
+```
+
+**Benefits:**
+- Pre-expands INFO fields once (saves time on repeated filtering)
+- Applies filters before melting samples (reduces memory by 95%+)
+- Parquet format enables fast columnar access
+
 ### Installation for Development/Repeated Use
 
 ```bash
@@ -69,7 +93,7 @@ cd pywombat
 uv sync
 
 # Run with uv run
-uv run wombat input.tsv -o output
+uv run wombat filter input.tsv -o output
 ```
 
 ---
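
Unpacking the "fast columnar access" benefit listed above: a Parquet scan only reads the columns a query touches. A hedged sketch, reusing the `prepared.parquet` name from the example and assuming `prepare` exposed a `VEP_IMPACT` column:

```python
import polars as pl

# Lazy scan: only the columns referenced below are read from disk.
lf = pl.scan_parquet("prepared.parquet")

high_impact = (
    lf.filter(pl.col("VEP_IMPACT") == "HIGH")  # column name is an assumption
      .select(["#CHROM", "POS", "REF", "ALT", "VEP_IMPACT"])
      .collect()
)
print(high_impact)
```
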
@@ -114,25 +138,62 @@ chr1 100 A T 2 0.5 30 true Sample2 1/1 18 99
 
 ---
 
-## Basic Usage
+## Commands
+
+PyWombat has two main commands:
+
+### `wombat prepare` - Preprocess Large Files
+
+Converts TSV/TSV.gz to optimized Parquet format with pre-expanded INFO fields:
+
+```bash
+# Basic usage
+wombat prepare input.tsv.gz -o prepared.parquet
+
+# With verbose output
+wombat prepare input.tsv.gz -o prepared.parquet -v
+
+# Adjust chunk size for memory constraints
+wombat prepare input.tsv.gz -o prepared.parquet --chunk-size 25000
+```
+
+**What it does:**
+- Extracts all INFO fields (VEP_*, AF, etc.) as separate columns
+- Keeps samples in wide format (not melted yet)
+- Writes memory-efficient Parquet format
+- Processes in chunks to handle files of any size
+
+**When to use:**
+- Files >1GB or >50 samples
+- Large families (>10 members)
+- Running multiple filter configurations
+- Repeated analysis of the same dataset
+
+### `wombat filter` - Process and Filter Data
 
-### Format Without Filtering
+Transforms and filters variant data (works with both TSV and Parquet input):
 
 ```bash
-# Output to file
-uvx pywombat input.tsv -o output
+# Basic filtering (TSV input)
+wombat filter input.tsv -o output
+
+# From prepared Parquet (faster, more memory-efficient)
+wombat filter prepared.parquet -o output
+
+# With filter configuration
+wombat filter input.tsv -F config.yml -o output
 
-# Output to stdout (useful for piping)
-uvx pywombat input.tsv
+# With pedigree
+wombat filter input.tsv -p pedigree.tsv -o output
 
 # Compressed output
-uvx pywombat input.tsv -o output -f tsv.gz
+wombat filter input.tsv -o output -f tsv.gz
 
-# Parquet format (fastest for large files)
-uvx pywombat input.tsv -o output -f parquet
+# Parquet output
+wombat filter input.tsv -o output -f parquet
 
 # With verbose output
-uvx pywombat input.tsv -o output --verbose
+wombat filter input.tsv -o output -v
 ```
 
 ### With Pedigree (Trio/Family Analysis)
@@ -140,7 +201,7 @@ uvx pywombat input.tsv -o output --verbose
 Add parent genotype information for inheritance analysis:
 
 ```bash
-uvx pywombat input.tsv --pedigree pedigree.tsv -o output
+wombat filter input.tsv --pedigree pedigree.tsv -o output
 ```
 
 **Pedigree File Format** (tab-separated):
@@ -178,7 +239,7 @@ PyWombat supports two types of filtering:
 Filter for ultra-rare, high-impact variants:
 
 ```bash
-uvx pywombat input.tsv \
+wombat filter input.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o rare_variants
 ```
@@ -210,7 +271,7 @@ expression: "VEP_CANONICAL = YES & VEP_IMPACT = HIGH & VEP_LoF = HC & VEP_LoF_fl
 Identify de novo mutations in trio data:
 
 ```bash
-uvx pywombat input.tsv \
+wombat filter input.tsv \
   --pedigree pedigree.tsv \
   -F examples/de_novo_mutations.yml \
   -o denovo
@@ -290,7 +351,7 @@ expression: "VEP_IMPACT = HIGH & VEP_CANONICAL = YES & gnomad_AF < 0.01 & CADD_P
 Inspect specific variants for troubleshooting:
 
 ```bash
-uvx pywombat input.tsv \
+wombat filter input.tsv \
   -F config.yml \
   --debug chr11:70486013
 ```
@@ -309,20 +370,20 @@ Shows:
 ### TSV (Default)
 
 ```bash
-uvx pywombat input.tsv -o output            # Creates output.tsv
-uvx pywombat input.tsv -o output -f tsv     # Same as above
+wombat filter input.tsv -o output           # Creates output.tsv
+wombat filter input.tsv -o output -f tsv    # Same as above
 ```
 
 ### Compressed TSV
 
 ```bash
-uvx pywombat input.tsv -o output -f tsv.gz    # Creates output.tsv.gz
+wombat filter input.tsv -o output -f tsv.gz   # Creates output.tsv.gz
 ```
 
 ### Parquet (Fastest for Large Files)
 
 ```bash
-uvx pywombat input.tsv -o output -f parquet    # Creates output.parquet
+wombat filter input.tsv -o output -f parquet   # Creates output.parquet
 ```
 
 **When to use Parquet:**
@@ -340,7 +401,7 @@ uvx pywombat input.tsv -o output -f parquet # Creates output.parquet
 
 ```bash
 # Step 1: Filter for rare, high-impact variants
-uvx pywombat cohort.tsv \
+wombat filter cohort.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o rare_variants
 
@@ -352,24 +413,34 @@ uvx pywombat cohort.tsv \
 
 ```bash
 # Identify de novo mutations in autism cohort
-uvx pywombat autism_trios.tsv \
+wombat filter autism_trios.tsv \
   --pedigree autism_pedigree.tsv \
   -F examples/de_novo_mutations.yml \
   -o autism_denovo \
-  --verbose
+  -v
 
 # Review output for genes in autism risk lists
 ```
 
-### 3. Multi-Family Rare Variant Analysis
+### 3. Large Multi-Family Analysis (Memory-Optimized)
 
 ```bash
-# Process multiple families together
-uvx pywombat families.tsv \
+# Step 1: Prepare once (preprocesses INFO fields)
+wombat prepare large_cohort.tsv.gz -o prepared.parquet -v
+
+# Step 2: Filter with different configurations (fast, memory-efficient)
+wombat filter prepared.parquet \
   --pedigree families_pedigree.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o families_rare_variants \
-  -f parquet  # Parquet for fast downstream analysis
+  -v
+
+# Step 3: Run additional filters without re-preparing
+wombat filter prepared.parquet \
+  --pedigree families_pedigree.tsv \
+  -F examples/de_novo_mutations.yml \
+  -o families_denovo \
+  -v
 ```
 
 ### 4. Custom Expression Filter
@@ -389,7 +460,7 @@ expression: "VEP_IMPACT = HIGH & (gnomad_AF < 0.0001 | gnomad_AF = null)"
 Apply:
 
 ```bash
-uvx pywombat input.tsv -F custom_filter.yml -o output
+wombat filter input.tsv -F custom_filter.yml -o output
 ```
 
 ---
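
For intuition, the expression above translates to roughly the following polars filter. This is only a sketch: pywombat builds the real predicate internally via `parse_impact_filter_expression`, and the `strict=False` cast is an assumption about `gnomad_AF` being stored as text:

```python
import polars as pl

# Rough equivalent of:
#   VEP_IMPACT = HIGH & (gnomad_AF < 0.0001 | gnomad_AF = null)
expr = (pl.col("VEP_IMPACT") == "HIGH") & (
    (pl.col("gnomad_AF").cast(pl.Float64, strict=False) < 0.0001)
    | pl.col("gnomad_AF").is_null()
)

filtered = pl.scan_parquet("prepared.parquet").filter(expr).collect()
```
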
@@ -464,7 +535,7 @@ bcftools query -HH \
   annotated.split.bcf > annotated.tsv
 
 # 4. Process with PyWombat
-uvx pywombat annotated.tsv -F examples/rare_variants_high_impact.yml -o output
+wombat filter annotated.tsv -F examples/rare_variants_high_impact.yml -o output
 ```
 
 **Why split-vep is required:**
@@ -481,7 +552,7 @@ For production workflows, these commands can be piped together:
 # Efficient pipeline (single pass through data)
 bcftools +split-vep -c - -p VEP_ input.vcf.gz | \
   bcftools query -HH -f '%CHROM\t%POS\t%REF\t%ALT\t%FILTER\t%INFO[\t%GT:%DP:%GQ:%AD]\n' | \
-  uvx pywombat - -F config.yml -o output
+  wombat filter - -F config.yml -o output
 ```
 
 **Note**: For multiple filter configurations, it's more efficient to save the intermediate TSV file rather than regenerating it each time.
@@ -517,11 +588,31 @@ Each configuration file is fully documented with:
 ## Performance Tips
 
-1. **Use streaming mode** (default): Efficient for most workflows
-2. **Parquet output**: Faster for large files and repeated analysis
+### For Large Files (>1GB or >50 samples)
+
+1. **Use the two-step workflow**: `wombat prepare` → `wombat filter`
+   - Reduces memory usage by 95%+ (4.2M variants → ~100 after early filtering)
+   - Pre-expands INFO fields once, reuse for multiple filter configurations
+   - Example: 38-sample family with 4.2M variants processes in <1 second with ~1.2GB RAM
+
+2. **Parquet format benefits**:
+   - Columnar storage enables selective column loading
+   - Pre-filtering before melting (expression filters applied before expanding to per-sample rows)
+   - 30% smaller file size vs gzipped TSV
+
+### For All Files
+
 3. **Pre-filter with bcftools**: Filter by region/gene before PyWombat
 4. **Compressed input**: PyWombat handles `.gz` files natively
-5. **Filter early**: Apply quality filters before complex expression filters
+5. **Use verbose mode** (`-v`): Monitor progress and filtering statistics
+
+### Memory Comparison
+
+| Approach | 38 samples, 4.2M variants | Memory | Time |
+|----------|---------------------------|--------|------|
+| Direct TSV | ❌ OOM (>200GB) | 200+ GB | Failed |
+| TSV with chunking | ⚠️ Slow | ~30GB | ~3 min |
+| **Parquet + pre-filter** | ✅ **Optimal** | **~1.2GB** | **<1 sec** |
 
 ---
 
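The table follows from simple row arithmetic: melting produces variants × samples rows, so 4.2M variants × 38 samples is roughly 160M melted rows, while filtering first (~100 variants × 38 samples) leaves only a few thousand. Below is a minimal wide-to-long sketch with made-up sample columns, using the polars `unpivot` spelling (older releases call this `melt`):

```python
import polars as pl

# Wide layout: one column per sample, values packed as GT:DP:GQ:AD.
wide = pl.DataFrame({
    "#CHROM": ["chr1", "chr1"],
    "POS": [100, 200],
    "Sample1:GT:DP:GQ:AD": ["0/1:25:99:12", "0/0:30:99:0"],
    "Sample2:GT:DP:GQ:AD": ["1/1:18:99:18", "0/1:22:99:10"],
})

long = wide.unpivot(
    on=["Sample1:GT:DP:GQ:AD", "Sample2:GT:DP:GQ:AD"],
    index=["#CHROM", "POS"],
    variable_name="sample",
    value_name="raw",
).with_columns(
    pl.col("sample").str.split(":").list.first(),                 # keep sample name
    pl.col("raw").str.split(":").list.get(0).alias("sample_gt"),  # genotype
    pl.col("raw").str.split(":").list.get(1).cast(pl.Int64).alias("sample_dp"),
)

print(long)  # 2 variants x 2 samples -> 4 rows
```
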
@@ -588,11 +679,15 @@ pywombat/
 
 **Issue**: Memory errors on large files
 
-- **Solution**: Files are processed in streaming mode by default; if issues persist, pre-filter with bcftools
+- **Solution**: Use the two-step workflow: `wombat prepare` then `wombat filter` for 95%+ memory reduction
+
+**Issue**: Command not found after upgrading
+
+- **Solution**: PyWombat now uses subcommands - use `wombat filter` instead of just `wombat`
 
 ### Getting Help
 
-1. Check `--help` for command options: `uvx pywombat --help`
+1. Check `--help` for command options: `wombat --help` or `wombat filter --help`
 2. Review example configurations in [`examples/`](examples/)
 3. Use `--debug` mode to inspect specific variants
 4. Use `--verbose` to see filtering steps
pywombat-1.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,6 @@
+pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
+pywombat/cli.py,sha256=-tzD2UJxlByP8aE5uSZ1C6UvgoriJqPMXRNs7xY65nE,85545
+pywombat-1.1.0.dist-info/METADATA,sha256=lYL6me-3Cw1wDa_yFdRX5Qj4cre6GMpY3Uqjy0LRwLg,20289
+pywombat-1.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+pywombat-1.1.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
+pywombat-1.1.0.dist-info/RECORD,,
pywombat-1.0.2.dist-info/RECORD DELETED
@@ -1,6 +0,0 @@
-pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
-pywombat/cli.py,sha256=76pVpYYyl9rCm6TCq86j7xAqEC9pOvjiWCX9MmrBB_o,74994
-pywombat-1.0.2.dist-info/METADATA,sha256=slcagFwSvA99GVzKLRZggFPy5dkLEh_09O3dIB8Hfr4,17168
-pywombat-1.0.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-pywombat-1.0.2.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
-pywombat-1.0.2.dist-info/RECORD,,