pywombat 1.0.2__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pywombat/cli.py +633 -89
- {pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/METADATA +161 -48
- pywombat-1.2.0.dist-info/RECORD +6 -0
- pywombat-1.0.2.dist-info/RECORD +0 -6
- {pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/WHEEL +0 -0
- {pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/entry_points.txt +0 -0
pywombat/cli.py
CHANGED
@@ -11,13 +11,371 @@ import polars as pl
 import yaml
 
 
-@click.
+@click.group()
+def cli():
+    """
+    Wombat: A tool for processing bcftools tabulated TSV files.
+
+    \b
+    Commands:
+        filter   Process and filter variant data
+        prepare  Convert TSV to optimized Parquet format
+    """
+    pass
+
+
+@cli.command("prepare")
+@click.argument("input_file", type=click.Path(exists=True, path_type=Path))
+@click.option(
+    "-o",
+    "--output",
+    type=click.Path(path_type=Path),
+    required=True,
+    help="Output Parquet file path.",
+)
+@click.option(
+    "--chunk-size",
+    type=int,
+    default=50000,
+    help="Number of rows to process at a time (default: 50000).",
+)
+@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output.")
+def prepare_cmd(
+    input_file: Path,
+    output: Path,
+    chunk_size: int,
+    verbose: bool,
+):
+    """
+    Convert bcftools TSV to optimized Parquet format.
+
+    This command pre-processes a TSV file by:
+
+    \b
+    1. Extracting all INFO fields from the '(null)' column into separate columns
+    2. Applying memory-efficient data types (Categorical for CHROM, UInt32 for POS)
+    3. Writing to Parquet format for efficient columnar access
+
+    The output Parquet file can then be used with 'wombat filter' for much faster
+    and more memory-efficient filtering, especially for large files.
+
+    \b
+    Examples:
+        wombat prepare input.tsv.gz -o prepared.parquet
+        wombat prepare input.tsv.gz -o prepared.parquet --chunk-size 100000
+    """
+    try:
+        if verbose:
+            click.echo(f"Preparing {input_file} -> {output}", err=True)
+
+        # Ensure output has .parquet extension
+        if not str(output).endswith(".parquet"):
+            output = Path(f"{output}.parquet")
+
+        # Process the file
+        prepare_parquet(input_file, output, chunk_size, verbose)
+
+        if verbose:
+            click.echo(f"Successfully created {output}", err=True)
+
+    except Exception as e:
+        click.echo(f"Error: {e}", err=True)
+        raise click.Abort()
+
+
+def prepare_parquet(
+    input_file: Path,
+    output: Path,
+    chunk_size: int = 50000,
+    verbose: bool = False,
+) -> None:
+    """
+    Convert a bcftools TSV file to Parquet with pre-expanded INFO fields.
+
+    Processes the file in chunks to handle large files without running out of memory.
+
+    Args:
+        input_file: Path to input TSV or TSV.gz file
+        output: Path to output Parquet file
+        chunk_size: Number of rows to process per chunk
+        verbose: Whether to print progress
+    """
+    from tqdm import tqdm
+
+    # First pass: discover all INFO fields
+    if verbose:
+        click.echo("Pass 1: Discovering INFO fields...", err=True)
+
+    all_fields = set()
+    all_flags = set()
+    total_lines = 0
+
+    is_gzipped = str(input_file).endswith(".gz")
+    opener = gzip.open if is_gzipped else open
+
+    with opener(input_file, "rt") as f:
+        header_line = f.readline().strip()
+        header_cols = header_line.split("\t")
+
+        # Find the (null) column index dynamically
+        null_col_idx = None
+        for i, col in enumerate(header_cols):
+            if col == "(null)":
+                null_col_idx = i
+                break
+
+        if null_col_idx is None:
+            if verbose:
+                click.echo("Warning: No (null) column found in input", err=True)
+        else:
+            for line in tqdm(f, desc="Scanning", disable=not verbose):
+                total_lines += 1
+                parts = line.split("\t")
+                if len(parts) > null_col_idx:
+                    null_value = parts[null_col_idx]
+                    if null_value and null_value != ".":
+                        pairs = null_value.split(";")
+                        for pair in pairs:
+                            if "=" in pair:
+                                field_name = pair.split("=", 1)[0]
+                                all_fields.add(field_name)
+                            elif pair.strip():
+                                all_flags.add(pair.strip())
+
+    if verbose:
+        click.echo(
+            f"Found {len(all_fields)} key-value fields and {len(all_flags)} flags in {total_lines} variants",
+            err=True,
+        )
+
+    # Second pass: process chunks and write Parquet
+    if verbose:
+        click.echo("Pass 2: Converting to Parquet...", err=True)
+
+    # Define memory-efficient dtypes
+    dtype_overrides = {
+        "#CHROM": pl.Categorical,
+        "POS": pl.UInt32,
+        "FILTER": pl.Categorical,
+    }
+
+    # Create a temporary directory for chunk files
+    import tempfile
+    import shutil
+
+    temp_dir = Path(tempfile.mkdtemp(prefix="wombat_prepare_"))
+    part_files = []
+
+    try:
+        with opener(input_file, "rt") as f:
+            header_line = f.readline().strip()
+
+            # Process in chunks
+            chunk_lines = []
+            pbar = tqdm(total=total_lines, desc="Converting", disable=not verbose)
+
+            for line in f:
+                chunk_lines.append(line)
+                if len(chunk_lines) >= chunk_size:
+                    df_chunk = _process_chunk(
+                        header_line, chunk_lines, all_fields, all_flags, dtype_overrides
+                    )
+                    part_file = temp_dir / f"part_{len(part_files):06d}.parquet"
+                    df_chunk.write_parquet(part_file)
+                    part_files.append(part_file)
+                    pbar.update(len(chunk_lines))
+                    chunk_lines = []
+
+            # Process remaining lines
+            if chunk_lines:
+                df_chunk = _process_chunk(
+                    header_line, chunk_lines, all_fields, all_flags, dtype_overrides
+                )
+                part_file = temp_dir / f"part_{len(part_files):06d}.parquet"
+                df_chunk.write_parquet(part_file)
+                part_files.append(part_file)
+                pbar.update(len(chunk_lines))
+
+            pbar.close()
+
+        # Combine all parts into final output using lazy scanning
+        if verbose:
+            click.echo(f"Combining {len(part_files)} parts into final output...", err=True)
+
+        if part_files:
+            # Use scan_parquet to lazily read all parts and write combined output
+            combined = pl.scan_parquet(part_files).collect()
+            combined.write_parquet(output)
+
+        if verbose:
+            click.echo(f"Wrote {len(part_files)} chunks to {output}", err=True)
+
+    finally:
+        # Clean up temporary directory
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+def _process_chunk(
+    header: str,
+    lines: list,
+    fields: set,
+    flags: set,
+    dtype_overrides: dict,
+) -> pl.DataFrame:
+    """Process a chunk of lines into a DataFrame with expanded INFO fields."""
+    import io
+
+    content = header + "\n" + "".join(lines)
+    df = pl.read_csv(
+        io.StringIO(content),
+        separator="\t",
+        infer_schema_length=10000,
+    )
+
+    # Expand INFO fields from (null) column
+    if "(null)" in df.columns:
+        # Extract key-value fields
+        for field in sorted(fields):
+            df = df.with_columns(
+                pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
+            )
+
+        # Extract boolean flags
+        for flag in sorted(flags):
+            df = df.with_columns(
+                pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag)
+            )
+
+        # Drop the original (null) column
+        df = df.drop("(null)")
+
+    # Drop CSQ column if it exists (redundant after expansion)
+    if "CSQ" in df.columns:
+        df = df.drop("CSQ")
+
+    # Apply memory-efficient dtypes
+    for col, dtype in dtype_overrides.items():
+        if col in df.columns:
+            try:
+                df = df.with_columns(pl.col(col).cast(dtype))
+            except Exception:
+                pass  # Skip if cast fails
+
+    return df
+
+
+def process_dnm_by_chromosome(
+    input_file: Path,
+    pedigree_df: pl.DataFrame,
+    filter_config: dict,
+    output_format: str,
+    verbose: bool
+) -> pl.DataFrame:
+    """Process DNM filtering chromosome by chromosome to reduce memory usage.
+
+    Processes each chromosome separately:
+    1. Load one chromosome at a time from Parquet
+    2. Apply frequency/quality prefilters (before melting)
+    3. Melt samples
+    4. Apply DNM filters
+    5. Combine results from all chromosomes
+
+    This reduces peak memory from (total_variants × samples) to
+    (max_chr_variants × samples).
+
+    Args:
+        input_file: Path to Parquet file
+        pedigree_df: Pedigree DataFrame with sample relationships
+        filter_config: Filter configuration dict
+        output_format: Output format (tsv, tsv.gz, parquet)
+        verbose: Whether to print progress messages
+
+    Returns:
+        Combined DataFrame with DNM-filtered variants from all chromosomes
+    """
+    # Get list of chromosomes
+    chromosomes = get_unique_chromosomes(input_file)
+
+    if verbose:
+        click.echo(
+            f"DNM per-chromosome processing: {len(chromosomes)} chromosomes", err=True
+        )
+
+    results = []
+    dnm_cfg = {}
+    dnm_cfg.update(filter_config.get("quality", {}))
+    dnm_cfg.update(filter_config.get("dnm", {}))
+
+    for chrom in chromosomes:
+        if verbose:
+            click.echo(f"Processing chromosome {chrom}...", err=True)
+
+        # Load only this chromosome
+        lazy_df = pl.scan_parquet(input_file).filter(
+            pl.col("#CHROM") == chrom
+        )
+
+        # Apply frequency filters BEFORE melting (Optimization 2)
+        lazy_df = apply_dnm_prefilters(lazy_df, filter_config, verbose=False)
+
+        # Count variants after prefiltering
+        if verbose:
+            pre_count = lazy_df.select(pl.count()).collect().item()
+            click.echo(f"  Chromosome {chrom}: {pre_count} variants after prefilter", err=True)
+
+        # Collect, melt, and apply DNM filters
+        df = lazy_df.collect()
+
+        if df.shape[0] == 0:
+            if verbose:
+                click.echo(f"  Chromosome {chrom}: No variants after prefilter, skipping", err=True)
+            continue
+
+        formatted_df = format_bcftools_tsv_minimal(df, pedigree_df)
+
+        if verbose:
+            click.echo(
+                f"  Chromosome {chrom}: {formatted_df.shape[0]} rows after melting", err=True
+            )
+
+        # Apply DNM filters (skip prefilters since already applied)
+        filtered_df = apply_de_novo_filter(
+            formatted_df, dnm_cfg, verbose=False, pedigree_df=pedigree_df,
+            skip_prefilters=True
+        )
+
+        if verbose:
+            click.echo(
+                f"  Chromosome {chrom}: {filtered_df.shape[0]} variants passed DNM filter", err=True
+            )
+
+        if filtered_df.shape[0] > 0:
+            results.append(filtered_df)
+
+    # Combine results
+    if not results:
+        if verbose:
+            click.echo("No variants passed DNM filters across all chromosomes", err=True)
+        # Return empty DataFrame with correct schema
+        return pl.DataFrame()
+
+    final_df = pl.concat(results)
+
+    if verbose:
+        click.echo(
+            f"DNM filtering complete: {final_df.shape[0]} total variants", err=True
+        )
+
+    return final_df
+
+
+@cli.command("filter")
 @click.argument("input_file", type=click.Path(exists=True, path_type=Path))
 @click.option(
     "-o",
     "--output",
     type=str,
-    help="Output file prefix. If not specified,
+    help="Output file prefix. If not specified, generates from input filename.",
 )
 @click.option(
     "-f",
@@ -43,9 +401,9 @@ import yaml
 @click.option(
     "--debug",
     type=str,
-    help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013).
+    help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013).",
 )
-def
+def filter_cmd(
     input_file: Path,
     output: Optional[str],
     output_format: str,
@@ -55,36 +413,43 @@ def cli(
     debug: Optional[str],
 ):
     """
-
+    Process and filter variant data from TSV or Parquet files.
 
-
+    \b
+    Supports two input formats:
+    - TSV/TSV.gz: Full processing (INFO expansion + melting)
+    - Parquet: Fast processing (melting only, INFO already expanded)
 
     \b
-
-
-
-
-
-
-
-
-
+    For large files, use 'wombat prepare' first to convert to Parquet,
+    then use 'wombat filter' on the Parquet file for better performance.
+
+    \b
+    This command:
+    1. Expands the '(null)' column (TSV only) into separate columns
+    2. Melts sample columns into rows with sample names
+    3. Splits sample values (GT:DP:GQ:AD format) into separate columns
+    4. Applies quality and expression filters (if config provided)
 
     \b
     Examples:
-        wombat input.tsv -o output
-        wombat
-        wombat input.tsv
+        wombat filter input.tsv -o output
+        wombat filter prepared.parquet -o output -f parquet
+        wombat filter input.tsv -p pedigree.tsv -F config.yml
     """
     try:
        if verbose:
            click.echo(f"Reading input file: {input_file}", err=True)
 
-        # Detect
+        # Detect input format
+        is_parquet = str(input_file).endswith(".parquet")
        is_gzipped = str(input_file).endswith(".gz")
 
-        if verbose
-
+        if verbose:
+            if is_parquet:
+                click.echo("Detected Parquet input (pre-processed)", err=True)
+            elif is_gzipped:
+                click.echo("Detected gzipped TSV file", err=True)
 
        # Read pedigree file if provided
        pedigree_df = None
@@ -109,11 +474,11 @@ def cli(
        if output is None:
            # Generate default output prefix from input filename
            input_stem = input_file.name
-            # Remove
-
-
-
-
+            # Remove known extensions
+            for ext in [".tsv.gz", ".tsv", ".parquet"]:
+                if input_stem.endswith(ext):
+                    input_stem = input_stem[: -len(ext)]
+                    break
 
        # Add config name if filter is provided
        if filter_config:
@@ -126,24 +491,103 @@ def cli(
        if verbose:
            click.echo("Processing with streaming mode...", err=True)
 
-        # Build lazy query
-
-
-
-
-            "
-
-
-
-
-
-
-
+        # Build lazy query based on input format
+        if is_parquet:
+            # Parquet input: INFO fields already expanded by 'wombat prepare'
+            lazy_df = pl.scan_parquet(input_file)
+
+            # Check if DNM mode is enabled - use per-chromosome processing
+            if filter_config_data and filter_config_data.get("dnm", {}).get("enabled", False):
+                if verbose:
+                    click.echo("DNM mode: Using per-chromosome processing for memory efficiency", err=True)
+
+                # DNM requires pedigree
+                if pedigree_df is None:
+                    click.echo("Error: DNM filtering requires a pedigree file (--pedigree option)", err=True)
+                    raise click.Abort()
+
+                # Process DNM filtering chromosome by chromosome
+                formatted_df = process_dnm_by_chromosome(
+                    input_file,
+                    pedigree_df,
+                    filter_config_data,
+                    output_format,
+                    verbose
+                )
+
+                # Write output directly
+                output_path = Path(f"{output}.{output_format}")
+
+                if output_format == "tsv":
+                    formatted_df.write_csv(output_path, separator="\t")
+                elif output_format == "tsv.gz":
+                    csv_content = formatted_df.write_csv(separator="\t")
+                    with gzip.open(output_path, "wt") as f:
+                        f.write(csv_content)
+                elif output_format == "parquet":
+                    formatted_df.write_parquet(output_path)
+
+                if verbose:
+                    click.echo(f"DNM variants written to {output_path}", err=True)
+
+                return
+
+            # OPTIMIZATION: Apply expression filter BEFORE melting
+            # Expression filters (VEP_IMPACT, etc.) don't depend on sample data
+            if filter_config_data and "expression" in filter_config_data:
+                expression = filter_config_data["expression"]
+                if expression and verbose:
+                    click.echo(
+                        f"Applying expression filter before melting: {expression}",
+                        err=True,
+                    )
+
+                # Collect a small sample to get schema for expression parsing
+                schema_df = lazy_df.head(1).collect()
+                try:
+                    filter_expr = parse_impact_filter_expression(expression, schema_df)
+                    lazy_df = lazy_df.filter(filter_expr)
+
+                    # Count filtered variants
+                    if verbose:
+                        filtered_count = lazy_df.select(pl.len()).collect().item()
+                        click.echo(
+                            f"Variants after expression filter: {filtered_count}",
+                            err=True,
+                        )
+                except ValueError as e:
+                    if verbose:
+                        click.echo(
+                            f"Warning: Could not apply early filter: {e}", err=True
+                        )
 
-
-
+            # Now collect and melt (on filtered variants only)
+            df = lazy_df.collect()
+            formatted_df = format_bcftools_tsv_minimal(df, pedigree_df)
+            lazy_df = formatted_df.lazy()
+
+            # Remove expression from config so it's not applied again
+            if filter_config_data and "expression" in filter_config_data:
+                filter_config_data = filter_config_data.copy()
+                del filter_config_data["expression"]
+        else:
+            # TSV input: need full processing (melt + annotation expansion)
+            string_columns = [
+                "FID",
+                "sample_id",
+                "father_id",
+                "mother_id",
+                "FatherBarcode",
+                "MotherBarcode",
+                "sample",
+            ]
+            schema_overrides = {col: pl.Utf8 for col in string_columns}
+            lazy_df = pl.scan_csv(
+                input_file, separator="\t", schema_overrides=schema_overrides
+            )
+
+            # Apply formatting transformations (melt + expand annotations)
+            lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
 
        # Apply filters if provided
        if filter_config_data:
@@ -497,11 +941,47 @@ def _pos_in_par(chrom: str, pos: int, par_regions: dict) -> bool:
     return False
 
 
+def get_unique_chromosomes(parquet_file: Path) -> list[str]:
+    """Get list of unique chromosomes from Parquet file, sorted naturally.
+
+    Args:
+        parquet_file: Path to Parquet file
+
+    Returns:
+        Sorted list of chromosome names (e.g., ['1', '2', ..., '22', 'X', 'Y', 'MT'])
+    """
+    # Read just the #CHROM column to get unique values
+    df = pl.scan_parquet(parquet_file).select("#CHROM").unique().collect()
+    chroms = df["#CHROM"].to_list()
+
+    # Sort chromosomes properly (1, 2, ..., 22, X, Y, MT)
+    def chrom_sort_key(chrom: str) -> tuple:
+        """Sort key for natural chromosome ordering."""
+        chrom_norm = chrom.replace("chr", "").replace("Chr", "").replace("CHR", "").upper()
+
+        # Try to parse as integer (autosomes)
+        try:
+            return (0, int(chrom_norm), "")
+        except ValueError:
+            pass
+
+        # Sex chromosomes and mitochondrial
+        if chrom_norm in ["X", "Y", "MT", "M"]:
+            order = {"X": 23, "Y": 24, "MT": 25, "M": 25}
+            return (1, order.get(chrom_norm, 99), chrom_norm)
+
+        # Other chromosomes (e.g., scaffolds)
+        return (2, 0, chrom_norm)
+
+    return sorted(chroms, key=chrom_sort_key)
+
+
 def apply_de_novo_filter(
     df: pl.DataFrame,
     dnm_config: dict,
     verbose: bool = False,
     pedigree_df: Optional[pl.DataFrame] = None,
+    skip_prefilters: bool = False,
 ) -> pl.DataFrame:
     """Apply de novo detection filters to dataframe using vectorized operations.
 
@@ -512,6 +992,13 @@ def apply_de_novo_filter(
 
     This function will read `sex` from `df` when present; otherwise it will use
     the `pedigree_df` (which should contain `sample_id` and `sex`).
+
+    Args:
+        df: DataFrame with melted samples
+        dnm_config: DNM configuration dict
+        verbose: Whether to print progress messages
+        pedigree_df: Pedigree DataFrame
+        skip_prefilters: If True, skips frequency/genomes_filters (assumes already applied)
     """
     if not dnm_config:
         return df
@@ -676,43 +1163,45 @@ def apply_de_novo_filter(
                 err=True,
             )
 
-    # Apply
-    if
-
-
-
-
+    # Apply frequency/quality prefilters if not already applied
+    if not skip_prefilters:
+        # Apply fafmax_faf95_max_genomes filter if specified
+        if fafmax_max is not None:
+            if "fafmax_faf95_max_genomes" in df.columns:
+                df = df.filter(
+                    (
+                        pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False)
+                        <= fafmax_max
+                    )
+                    | pl.col("fafmax_faf95_max_genomes").is_null()
                 )
-
-
-
+                if verbose:
+                    click.echo(
+                        f"DNM: {df.shape[0]} variants after fafmax_faf95_max_genomes filter (<={fafmax_max})",
+                        err=True,
+                    )
+            elif verbose:
                 click.echo(
-
+                    "DNM: Warning - fafmax_faf95_max_genomes column not found, skipping frequency filter",
                     err=True,
                 )
-        elif verbose:
-            click.echo(
-                "DNM: Warning - fafmax_faf95_max_genomes column not found, skipping frequency filter",
-                err=True,
-            )
 
-
-
-
-
-
-
+        # Apply genomes_filters filter if specified
+        if genomes_filters_pass_only:
+            if "genomes_filters" in df.columns:
+                df = df.filter(
+                    (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
+                )
+                if verbose:
+                    click.echo(
+                        f"DNM: {df.shape[0]} variants after genomes_filters filter (pass only)",
+                        err=True,
+                    )
+            elif verbose:
                 click.echo(
-
+                    "DNM: Warning - genomes_filters column not found, skipping genomes_filters filter",
                     err=True,
                 )
-        elif verbose:
-            click.echo(
-                "DNM: Warning - genomes_filters column not found, skipping genomes_filters filter",
-                err=True,
-            )
 
     # Build parent quality checks (common to all)
     father_qual_ok = (pl.col("father_dp").cast(pl.Float64, strict=False) >= p_dp) & (
@@ -1394,33 +1883,39 @@ def format_bcftools_tsv_minimal(
     Returns:
         Formatted DataFrame with melted samples (annotations still in (null) column)
     """
-    #
-
-
-
-    #
-
-
-
-
+    # Determine which columns are sample columns
+    # Sample columns have format "SampleName:GT:SampleName:DP:..." or similar
+    # Non-sample columns are standard VCF columns or annotation columns
+
+    # Standard VCF/annotation columns (not samples)
+    standard_cols = {
+        "#CHROM", "POS", "REF", "ALT", "FILTER", "(null)", "CSQ",
+        "QUAL", "ID", "INFO", "FORMAT"
+    }
 
-    #
+    # Find sample columns by looking for columns with ":" in the name
+    # that aren't standard columns
     sample_cols = []
     sample_names = []
 
-    for col in
-        # Skip
-        if col
+    for col in df.columns:
+        # Skip standard columns
+        if col in standard_cols:
+            continue
+
+        # Skip columns that look like VEP annotation fields
+        if col.startswith("VEP_") or col.startswith("AF") or col.startswith("AC"):
            continue
 
+        # Sample columns typically have ":" in them (GT:DP:GQ:AD format)
        if ":" in col:
            sample_name = col.split(":", 1)[0]
            sample_cols.append(col)
            sample_names.append(sample_name)
-
-        #
-
-
+        elif col not in df.columns[:10]:
+            # Columns after position 10 that don't match known patterns might be samples
+            # This is a heuristic for unusual sample column formats
+            pass
 
     if not sample_cols:
        # No sample columns to melt
@@ -1984,6 +2479,55 @@ def process_with_progress(
     click.echo("Processing complete.", err=True)
 
 
+def apply_dnm_prefilters(
+    lazy_df: pl.LazyFrame,
+    filter_config: dict,
+    verbose: bool = False
+) -> pl.LazyFrame:
+    """Apply variant-level DNM filters before melting.
+
+    These filters don't require sample-level data and can be applied
+    on wide-format data to reduce memory usage.
+
+    Applies:
+    - Population frequency filters (fafmax_faf95_max_genomes_max)
+    - Quality filters (genomes_filters PASS only)
+
+    Args:
+        lazy_df: LazyFrame with wide-format data (not melted)
+        filter_config: Filter configuration dict
+        verbose: Whether to print progress messages
+
+    Returns:
+        Filtered LazyFrame
+    """
+    dnm_config = filter_config.get("dnm", {})
+
+    # Frequency filter
+    fafmax_max = dnm_config.get("fafmax_faf95_max_genomes_max")
+    if fafmax_max is not None:
+        lazy_df = lazy_df.filter(
+            (pl.col("fafmax_faf95_max_genomes").cast(pl.Float64, strict=False) <= fafmax_max)
+            | pl.col("fafmax_faf95_max_genomes").is_null()
+        )
+        if verbose:
+            click.echo(
+                f"DNM prefilter: Applied frequency filter (fafmax <= {fafmax_max})", err=True
+            )
+
+    # Quality filter (genomes_filters PASS only)
+    if dnm_config.get("genomes_filters_pass_only", False):
+        lazy_df = lazy_df.filter(
+            (pl.col("genomes_filters") == ".") | pl.col("genomes_filters").is_null()
+        )
+        if verbose:
+            click.echo(
+                "DNM prefilter: Applied genomes_filters PASS filter", err=True
+            )
+
+    return lazy_df
+
+
 def apply_filters_lazy(
     lazy_df: pl.LazyFrame,
     filter_config: dict,
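At its core, the new `process_dnm_by_chromosome` is a partition-and-concat loop over a Parquet file: scan lazily, restrict to one chromosome, collect, process, and concatenate the survivors. A minimal standalone sketch of that pattern, stripped of the pedigree/DNM logic (the `variants.parquet` path is illustrative, not from the package):

```python
# Partition-and-concat over a Parquet file, as in process_dnm_by_chromosome:
# only one chromosome's rows are materialized at a time, so peak memory
# scales with the largest chromosome rather than the whole file.
from pathlib import Path

import polars as pl

parquet_file = Path("variants.parquet")  # hypothetical output of 'wombat prepare'

# Distinct chromosome names, read lazily from the single #CHROM column.
chromosomes = (
    pl.scan_parquet(parquet_file)
    .select("#CHROM")
    .unique()
    .collect()["#CHROM"]
    .to_list()
)

results = []
for chrom in chromosomes:
    # The filter is pushed down into the Parquet scan, so collect() reads
    # (roughly) only the row groups containing this chromosome.
    chunk = pl.scan_parquet(parquet_file).filter(pl.col("#CHROM") == chrom).collect()
    if chunk.height == 0:
        continue
    results.append(chunk)  # the real code melts samples and applies DNM filters here

final_df = pl.concat(results) if results else pl.DataFrame()
```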
{pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pywombat
-Version: 1.0
+Version: 1.2.0
 Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
 Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
 Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
@@ -18,6 +18,9 @@ Requires-Dist: click>=8.1.0
 Requires-Dist: polars>=0.19.0
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: tqdm>=4.67.1
+Provides-Extra: dev
+Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
+Requires-Dist: pytest>=7.0.0; extra == 'dev'
 Description-Content-Type: text/markdown
 
 # PyWombat 🦘
@@ -29,14 +32,15 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
 
 ## Features
 
-✨ **Fast Processing**: Uses Polars for efficient data handling
-🔬 **Quality Filtering**: Configurable depth, quality, and VAF thresholds
-👨‍👩‍👧 **Pedigree Support**: Trio and family analysis with parent genotypes
-🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
-📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
-🎯 **Expression Filters**: Complex filtering with logical expressions
-🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
-⚡ **
+✨ **Fast Processing**: Uses Polars for efficient data handling
+🔬 **Quality Filtering**: Configurable depth, quality, and VAF thresholds
+👨‍👩‍👧 **Pedigree Support**: Trio and family analysis with parent genotypes
+🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
+📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
+🎯 **Expression Filters**: Complex filtering with logical expressions
+🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
+⚡ **Memory Optimized**: Two-step workflow for large files (prepare → filter)
+💾 **Parquet Support**: Pre-process large files for repeated, memory-efficient analysis
 
 ---
 
@@ -47,17 +51,37 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
 Use `uvx` to run PyWombat without installation:
 
 ```bash
-# Basic
-uvx pywombat input.tsv -o output
+# Basic filtering
+uvx pywombat filter input.tsv -o output
 
-# With
-uvx pywombat input.tsv -F examples/rare_variants_high_impact.yml -o output
+# With filter configuration
+uvx pywombat filter input.tsv -F examples/rare_variants_high_impact.yml -o output
 
 # De novo mutation detection
-uvx pywombat input.tsv --pedigree pedigree.tsv \
+uvx pywombat filter input.tsv --pedigree pedigree.tsv \
   -F examples/de_novo_mutations.yml -o denovo
 ```
 
+### For Large Files (>1GB or >50 samples)
+
+Use the two-step workflow for memory-efficient processing:
+
+```bash
+# Step 1: Prepare (one-time preprocessing)
+uvx pywombat prepare input.tsv.gz -o prepared.parquet
+
+# Step 2: Filter (fast, memory-efficient, can be run multiple times)
+uvx pywombat filter prepared.parquet \
+  -p pedigree.tsv \
+  -F config.yml \
+  -o filtered
+```
+
+**Benefits:**
+- Pre-expands INFO fields once (saves time on repeated filtering)
+- Applies filters before melting samples (reduces memory by 95%+)
+- Parquet format enables fast columnar access
+
 ### Installation for Development/Repeated Use
 
 ```bash
@@ -69,7 +93,7 @@ cd pywombat
 uv sync
 
 # Run with uv run
-uv run wombat input.tsv -o output
+uv run wombat filter input.tsv -o output
 ```
 
 ---
@@ -114,25 +138,62 @@ chr1 100 A T 2 0.5 30 true Sample2 1/1 18 99
 
 ---
 
-##
+## Commands
+
+PyWombat has two main commands:
+
+### `wombat prepare` - Preprocess Large Files
+
+Converts TSV/TSV.gz to optimized Parquet format with pre-expanded INFO fields:
+
+```bash
+# Basic usage
+wombat prepare input.tsv.gz -o prepared.parquet
+
+# With verbose output
+wombat prepare input.tsv.gz -o prepared.parquet -v
+
+# Adjust chunk size for memory constraints
+wombat prepare input.tsv.gz -o prepared.parquet --chunk-size 25000
+```
+
+**What it does:**
+- Extracts all INFO fields (VEP_*, AF, etc.) as separate columns
+- Keeps samples in wide format (not melted yet)
+- Writes memory-efficient Parquet format
+- Processes in chunks to handle files of any size
+
+**When to use:**
+- Files >1GB or >50 samples
+- Large families (>10 members)
+- Running multiple filter configurations
+- Repeated analysis of the same dataset
+
+### `wombat filter` - Process and Filter Data
 
-
+Transforms and filters variant data (works with both TSV and Parquet input):
 
 ```bash
-#
-
+# Basic filtering (TSV input)
+wombat filter input.tsv -o output
 
-#
-
+# From prepared Parquet (faster, more memory-efficient)
+wombat filter prepared.parquet -o output
+
+# With filter configuration
+wombat filter input.tsv -F config.yml -o output
+
+# With pedigree
+wombat filter input.tsv -p pedigree.tsv -o output
 
 # Compressed output
-
+wombat filter input.tsv -o output -f tsv.gz
 
-# Parquet
-
+# Parquet output
+wombat filter input.tsv -o output -f parquet
 
 # With verbose output
-
+wombat filter input.tsv -o output -v
 ```
 
 ### With Pedigree (Trio/Family Analysis)
@@ -140,7 +201,7 @@ uvx pywombat input.tsv -o output --verbose
 Add parent genotype information for inheritance analysis:
 
 ```bash
-
+wombat filter input.tsv --pedigree pedigree.tsv -o output
 ```
 
 **Pedigree File Format** (tab-separated):
@@ -178,7 +239,7 @@ PyWombat supports two types of filtering:
 Filter for ultra-rare, high-impact variants:
 
 ```bash
-
+wombat filter input.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o rare_variants
 ```
@@ -210,7 +271,7 @@ expression: "VEP_CANONICAL = YES & VEP_IMPACT = HIGH & VEP_LoF = HC & VEP_LoF_fl
 Identify de novo mutations in trio data:
 
 ```bash
-
+wombat filter input.tsv \
   --pedigree pedigree.tsv \
   -F examples/de_novo_mutations.yml \
   -o denovo
@@ -290,7 +351,7 @@ expression: "VEP_IMPACT = HIGH & VEP_CANONICAL = YES & gnomad_AF < 0.01 & CADD_P
 Inspect specific variants for troubleshooting:
 
 ```bash
-
+wombat filter input.tsv \
   -F config.yml \
   --debug chr11:70486013
 ```
@@ -309,20 +370,20 @@ Shows:
 ### TSV (Default)
 
 ```bash
-
-
+wombat filter input.tsv -o output         # Creates output.tsv
+wombat filter input.tsv -o output -f tsv  # Same as above
 ```
 
 ### Compressed TSV
 
 ```bash
-
+wombat filter input.tsv -o output -f tsv.gz  # Creates output.tsv.gz
 ```
 
 ### Parquet (Fastest for Large Files)
 
 ```bash
-
+wombat filter input.tsv -o output -f parquet  # Creates output.parquet
 ```
 
 **When to use Parquet:**
@@ -340,7 +401,7 @@ uvx pywombat input.tsv -o output -f parquet  # Creates output.parquet
 
 ```bash
 # Step 1: Filter for rare, high-impact variants
-
+wombat filter cohort.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o rare_variants
 
@@ -352,24 +413,34 @@ uvx pywombat cohort.tsv \
 
 ```bash
 # Identify de novo mutations in autism cohort
-
+wombat filter autism_trios.tsv \
   --pedigree autism_pedigree.tsv \
   -F examples/de_novo_mutations.yml \
   -o autism_denovo \
-
+  -v
 
 # Review output for genes in autism risk lists
 ```
 
-### 3. Multi-Family
+### 3. Large Multi-Family Analysis (Memory-Optimized)
 
 ```bash
-#
-
+# Step 1: Prepare once (preprocesses INFO fields)
+wombat prepare large_cohort.tsv.gz -o prepared.parquet -v
+
+# Step 2: Filter with different configurations (fast, memory-efficient)
+wombat filter prepared.parquet \
   --pedigree families_pedigree.tsv \
   -F examples/rare_variants_high_impact.yml \
   -o families_rare_variants \
-  -
+  -v
+
+# Step 3: Run additional filters without re-preparing
+wombat filter prepared.parquet \
+  --pedigree families_pedigree.tsv \
+  -F examples/de_novo_mutations.yml \
+  -o families_denovo \
+  -v
 ```
 
 ### 4. Custom Expression Filter
@@ -389,7 +460,7 @@ expression: "VEP_IMPACT = HIGH & (gnomad_AF < 0.0001 | gnomad_AF = null)"
 Apply:
 
 ```bash
-
+wombat filter input.tsv -F custom_filter.yml -o output
 ```
 
 ---
@@ -464,7 +535,7 @@ bcftools query -HH \
   annotated.split.bcf > annotated.tsv
 
 # 4. Process with PyWombat
-
+wombat filter annotated.tsv -F examples/rare_variants_high_impact.yml -o output
 ```
 
 **Why split-vep is required:**
@@ -481,7 +552,7 @@ For production workflows, these commands can be piped together:
 # Efficient pipeline (single pass through data)
 bcftools +split-vep -c - -p VEP_ input.vcf.gz | \
   bcftools query -HH -f '%CHROM\t%POS\t%REF\t%ALT\t%FILTER\t%INFO[\t%GT:%DP:%GQ:%AD]\n' | \
-
+  wombat filter - -F config.yml -o output
 ```
 
 **Note**: For multiple filter configurations, it's more efficient to save the intermediate TSV file rather than regenerating it each time.
@@ -517,11 +588,49 @@ Each configuration file is fully documented with:
 
 ## Performance Tips
 
-
-
+### For Large Files (>1GB or >50 samples)
+
+1. **Use the two-step workflow**: `wombat prepare` → `wombat filter`
+   - Reduces memory usage by 95%+ (4.2M variants → ~100 after early filtering)
+   - Pre-expands INFO fields once, reuse for multiple filter configurations
+   - Example: 38-sample family with 4.2M variants processes in <1 second with ~1.2GB RAM
+
+2. **Parquet format benefits**:
+   - Columnar storage enables selective column loading
+   - Pre-filtering before melting (expression filters applied before expanding to per-sample rows)
+   - **Per-chromosome processing for DNM**: Automatically processes DNM filtering chromosome-by-chromosome
+   - 30% smaller file size vs gzipped TSV
+
+3. **De Novo Mutation (DNM) filtering optimization**:
+   - Automatically uses per-chromosome processing when DNM mode is enabled
+   - Processes one chromosome at a time to reduce peak memory
+   - Applies frequency filters before melting to reduce data expansion
+   - Example: 38-sample family with 4.2M variants completes in 20 seconds with ~24GB RAM (vs 200GB+ OOM failure)
+
+### For All Files
+
 3. **Pre-filter with bcftools**: Filter by region/gene before PyWombat
 4. **Compressed input**: PyWombat handles `.gz` files natively
-5. **
+5. **Use verbose mode** (`-v`): Monitor progress and filtering statistics
+
+### Memory Comparison
+
+**Expression Filtering** (e.g., VEP_IMPACT filters):
+
+| Approach | 38 samples, 4.2M variants | Memory | Time |
+|----------|---------------------------|--------|------|
+| Direct TSV | ❌ OOM (>200GB) | 200+ GB | Failed |
+| TSV with chunking | ⚠️ Slow | ~30GB | ~3 min |
+| **Parquet + pre-filter** | ✅ **Optimal** | **~1.2GB** | **<1 sec** |
+
+**De Novo Mutation (DNM) Filtering**:
+
+| Approach | 38 samples, 4.2M variants | Memory | Time | Result |
+|----------|---------------------------|--------|------|--------|
+| Without optimization | ❌ OOM (>200GB) | 200+ GB | Failed | N/A |
+| **Parquet + per-chromosome** | ✅ **Success** | **~24GB** | **20 sec** | **6,788 DNM variants** |
+
+*DNM filtering requires sample-level data (cannot pre-filter before melting), but per-chromosome processing reduces peak memory by 88%.*
 
 ---
 
@@ -588,11 +697,15 @@ pywombat/
 
 **Issue**: Memory errors on large files
 
-- **Solution**:
+- **Solution**: Use the two-step workflow: `wombat prepare` then `wombat filter` for 95%+ memory reduction
+
+**Issue**: Command not found after upgrading
+
+- **Solution**: PyWombat now uses subcommands - use `wombat filter` instead of just `wombat`
 
 ### Getting Help
 
-1. Check `--help` for command options: `
+1. Check `--help` for command options: `wombat --help` or `wombat filter --help`
 2. Review example configurations in [`examples/`](examples/)
 3. Use `--debug` mode to inspect specific variants
 4. Use `--verbose` to see filtering steps
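The README's "applies filters before melting samples" benefit is a statement about operation ordering: variant-level predicates shrink the table while it is still one row per variant, so the wide-to-long reshape only multiplies the surviving rows by the sample count. A toy sketch of that reordering with made-up columns (polars ≥1.0 spells the melt operation `unpivot`; the polars>=0.19.0 floor in the requirements would use `melt`):

```python
# Filter-before-melt: dropping variants while the table is still wide means
# the expensive wide-to-long reshape only touches rows that pass the filter.
import polars as pl

wide = pl.DataFrame({
    "#CHROM": ["chr1", "chr1", "chr2"],
    "POS": [100, 200, 300],
    "VEP_IMPACT": ["HIGH", "LOW", "HIGH"],
    "S1:GT:DP": ["0/1:30", "0/0:25", "1/1:40"],  # toy per-sample columns
    "S2:GT:DP": ["0/0:28", "0/1:31", "0/1:22"],
})

# Sample columns are recognizable by the ":" in their names.
sample_cols = [c for c in wide.columns if ":" in c]

# 1. Filter on variant-level columns first (no sample data needed).
kept = wide.filter(pl.col("VEP_IMPACT") == "HIGH")

# 2. Only then melt samples into rows: output size is
#    kept variants × samples, not all variants × samples.
long = kept.unpivot(
    index=["#CHROM", "POS", "VEP_IMPACT"],
    on=sample_cols,
    variable_name="sample",
    value_name="values",
)
print(long)  # 2 variants × 2 samples = 4 rows instead of 6
```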
pywombat-1.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
+pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
+pywombat/cli.py,sha256=pEPvUTww5Nvj-WqSRZ0QEePnORrcYkhWJv3uVi5DnxM,93728
+pywombat-1.2.0.dist-info/METADATA,sha256=3TeUY6jzQCfrFaQ_BuocdB8374Esqwkoug9L-iZtLT0,21306
+pywombat-1.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+pywombat-1.2.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
+pywombat-1.2.0.dist-info/RECORD,,
pywombat-1.0.2.dist-info/RECORD
DELETED
@@ -1,6 +0,0 @@
-pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
-pywombat/cli.py,sha256=76pVpYYyl9rCm6TCq86j7xAqEC9pOvjiWCX9MmrBB_o,74994
-pywombat-1.0.2.dist-info/METADATA,sha256=slcagFwSvA99GVzKLRZggFPy5dkLEh_09O3dIB8Hfr4,17168
-pywombat-1.0.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-pywombat-1.0.2.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
-pywombat-1.0.2.dist-info/RECORD,,

{pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/WHEEL
File without changes

{pywombat-1.0.2.dist-info → pywombat-1.2.0.dist-info}/entry_points.txt
File without changes