PyPI - pywombat - Versions diffs - 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

pywombat 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

pywombat/cli.py +289 -39
{pywombat-0.2.0.dist-info → pywombat-0.4.0.dist-info}/METADATA +1 -1
pywombat-0.4.0.dist-info/RECORD +6 -0
pywombat-0.2.0.dist-info/RECORD +0 -6
{pywombat-0.2.0.dist-info → pywombat-0.4.0.dist-info}/WHEEL +0 -0
{pywombat-0.2.0.dist-info → pywombat-0.4.0.dist-info}/entry_points.txt +0 -0

pywombat/cli.py CHANGED Viewed

@@ -40,6 +40,11 @@ import yaml
     type=click.Path(exists=True, path_type=Path),
     help="Filter configuration YAML file to apply quality and impact filters.",
 )
+@click.option(
+    "--debug",
+    type=str,
+    help="Debug mode: show rows matching chrom:pos (e.g., chr11:70486013). Displays #CHROM, POS, VEP_SYMBOL, and columns from filter expression.",
+)
 def cli(
     input_file: Path,
     output: Optional[str],
@@ -47,6 +52,7 @@ def cli(
     verbose: bool,
     pedigree: Optional[Path],
     filter_config: Optional[Path],
+    debug: Optional[str],
 ):
     """
     Wombat: A tool for processing bcftools tabulated TSV files.
@@ -80,14 +86,6 @@ def cli(
         if verbose and is_gzipped:
             click.echo("Detected gzipped file", err=True)
-        # Read the TSV file (handles both plain and gzipped)
-        df = pl.read_csv(input_file, separator="\t")
-        if verbose:
-            click.echo(
-                f"Input shape: {df.shape[0]} rows, {df.shape[1]} columns", err=True
-            )
         # Read pedigree file if provided
         pedigree_df = None
         if pedigree:
@@ -95,22 +93,18 @@ def cli(
                 click.echo(f"Reading pedigree file: {pedigree}", err=True)
             pedigree_df = read_pedigree(pedigree)
-        # Process the dataframe
-        formatted_df = format_bcftools_tsv(df, pedigree_df)
-        if verbose:
-            click.echo(
-                f"Output shape: {formatted_df.shape[0]} rows, {formatted_df.shape[1]} columns",
-                err=True,
-            )
-        # Apply filters if provided
+        # Load filter config if provided
         filter_config_data = None
         if filter_config:
             if verbose:
                 click.echo(f"Reading filter config: {filter_config}", err=True)
             filter_config_data = load_filter_config(filter_config)
+        # Debug mode: show specific variant
+        if debug:
+            debug_variant(input_file, pedigree_df, filter_config_data, debug, verbose)
+            return
         # Determine output prefix
         if output is None:
             # Generate default output prefix from input filename
@@ -128,36 +122,131 @@ def cli(
             else:
                 output = input_stem
-        # Apply filters and write output
+        # Use streaming approach with lazy API
+        if verbose:
+            click.echo("Processing with streaming mode...", err=True)
+        # Build lazy query
+        lazy_df = pl.scan_csv(input_file, separator="\t")
+        # Apply formatting transformations
+        lazy_df = format_bcftools_tsv_lazy(lazy_df, pedigree_df)
+        # Apply filters if provided
         if filter_config_data:
-            apply_filters_and_write(
-                formatted_df,
-                filter_config_data,
-                output,
-                output_format,
-                verbose,
-            )
-        else:
-            # No filters - write single output file
-            # Construct output filename with prefix and format
-            output_path = Path(f"{output}.{output_format}")
+            lazy_df = apply_filters_lazy(lazy_df, filter_config_data, verbose)
-            if output_format == "tsv":
-                formatted_df.write_csv(output_path, separator="\t")
-            elif output_format == "tsv.gz":
-                csv_content = formatted_df.write_csv(separator="\t")
-                with gzip.open(output_path, "wt") as f:
-                    f.write(csv_content)
-            elif output_format == "parquet":
-                formatted_df.write_parquet(output_path)
+        # Write output
+        output_path = Path(f"{output}.{output_format}")
-            click.echo(f"Formatted data written to {output_path}", err=True)
+        if output_format == "tsv":
+            lazy_df.sink_csv(output_path, separator="\t")
+        elif output_format == "tsv.gz":
+            # For gzip, we need to collect and write
+            df = lazy_df.collect()
+            csv_content = df.write_csv(separator="\t")
+            with gzip.open(output_path, "wt") as f:
+                f.write(csv_content)
+        elif output_format == "parquet":
+            lazy_df.sink_parquet(output_path)
+        if verbose:
+            click.echo(f"Data written to {output_path}", err=True)
     except Exception as e:
         click.echo(f"Error: {e}", err=True)
         raise click.Abort()
+def debug_variant(
+    input_file: Path,
+    pedigree_df: Optional[pl.DataFrame],
+    filter_config: Optional[dict],
+    debug_pos: str,
+    verbose: bool,
+):
+    """Debug mode: display rows matching a specific chrom:pos."""
+    # Parse debug position
+    if ":" not in debug_pos:
+        click.echo(
+            "Error: Debug position must be in format 'chrom:pos' (e.g., chr11:70486013)",
+            err=True,
+        )
+        raise click.Abort()
+    chrom, pos = debug_pos.split(":", 1)
+    try:
+        pos = int(pos)
+    except ValueError:
+        click.echo(f"Error: Position must be an integer, got '{pos}'", err=True)
+        raise click.Abort()
+    if verbose:
+        click.echo(f"Debug mode: searching for {chrom}:{pos}", err=True)
+    # Read and format the data
+    df = pl.read_csv(input_file, separator="\t")
+    formatted_df = format_bcftools_tsv(df, pedigree_df)
+    # Filter to matching rows
+    matching_rows = formatted_df.filter(
+        (pl.col("#CHROM") == chrom) & (pl.col("POS") == pos)
+    )
+    if matching_rows.shape[0] == 0:
+        click.echo(f"No rows found matching {chrom}:{pos}", err=True)
+        return
+    # Determine which columns to display
+    columns_to_show = ["#CHROM", "POS"]
+    # Add VEP_SYMBOL if it exists
+    if "VEP_SYMBOL" in matching_rows.columns:
+        columns_to_show.append("VEP_SYMBOL")
+    # Extract column names from expression if filter config provided
+    if filter_config and "expression" in filter_config:
+        expression = filter_config["expression"]
+        # Extract column names from expression using regex
+        # Match patterns like "column_name" before operators
+        column_pattern = r"\b([A-Za-z_][A-Za-z0-9_]*)\b\s*[=!<>]"
+        found_columns = re.findall(column_pattern, expression)
+        for col in found_columns:
+            if col in matching_rows.columns and col not in columns_to_show:
+                columns_to_show.append(col)
+    # Select only the columns we want to display
+    display_df = matching_rows.select(
+        [col for col in columns_to_show if col in matching_rows.columns]
+    )
+    # Replace null and NaN values with <null> and <NaN> for display
+    for col in display_df.columns:
+        if display_df[col].dtype in [pl.Float32, pl.Float64]:
+            # For numeric columns, handle both NaN and null
+            display_df = display_df.with_columns(
+                pl.when(pl.col(col).is_null())
+                .then(pl.lit("<null>"))
+                .when(pl.col(col).is_nan())
+                .then(pl.lit("<NaN>"))
+                .otherwise(pl.col(col).cast(pl.Utf8))
+                .alias(col)
+            )
+        else:
+            # For non-numeric columns, only handle null
+            display_df = display_df.with_columns(
+                pl.when(pl.col(col).is_null())
+                .then(pl.lit("<null>"))
+                .otherwise(pl.col(col).cast(pl.Utf8))
+                .alias(col)
+            )
+    # Display the results
+    click.echo(f"\nFound {matching_rows.shape[0]} row(s) matching {chrom}:{pos}:\n")
+    click.echo(display_df.write_csv(separator="\t"))
 def load_filter_config(config_path: Path) -> dict:
     """Load and parse filter configuration from YAML file."""
     with open(config_path, "r") as f:
@@ -377,6 +466,30 @@ def parse_impact_filter_expression(expression: str, df: pl.DataFrame) -> pl.Expr
                     if col_name not in df.columns:
                         raise ValueError(f"Column '{col_name}' not found in dataframe")
+                    # Check for null value
+                    if value.upper() == "NULL":
+                        col_expr = pl.col(col_name)
+                        if op == "=":
+                            return col_expr.is_null()
+                        elif op == "!=":
+                            return ~col_expr.is_null()
+                        else:
+                            raise ValueError(
+                                f"Operator '{op}' not supported for null comparison, use = or !="
+                            )
+                    # Check for NaN value
+                    if value.upper() == "NAN":
+                        col_expr = pl.col(col_name).cast(pl.Float64, strict=False)
+                        if op == "=":
+                            return col_expr.is_nan()
+                        elif op == "!=":
+                            return ~col_expr.is_nan()
+                        else:
+                            raise ValueError(
+                                f"Operator '{op}' not supported for NaN comparison, use = or !="
+                            )
                     # Try to convert value to number, otherwise treat as string
                     try:
                         value_num = float(value)
@@ -957,5 +1070,142 @@ def format_bcftools_tsv(
     return melted_df
+def format_bcftools_tsv_lazy(
+    lazy_df: pl.LazyFrame, pedigree_df: Optional[pl.DataFrame] = None
+) -> pl.LazyFrame:
+    """
+    Format a bcftools tabulated TSV using lazy operations for streaming.
+    This is a simplified version that collects minimally for complex operations.
+    """
+    # For complex transformations like melting, we need to collect temporarily
+    # but we do this in a streaming fashion
+    df = lazy_df.collect(streaming=True)
+    formatted_df = format_bcftools_tsv(df, pedigree_df)
+    return formatted_df.lazy()
+def apply_filters_lazy(
+    lazy_df: pl.LazyFrame, filter_config: dict, verbose: bool = False
+) -> pl.LazyFrame:
+    """Apply quality and expression filters using lazy operations."""
+    quality_config = filter_config.get("quality", {})
+    expression = filter_config.get("expression")
+    # Apply quality filters
+    if quality_config:
+        # Filter: sample_gt must contain at least one '1' (default: true)
+        filter_no_alt = quality_config.get("filter_no_alt_allele", True)
+        if filter_no_alt:
+            lazy_df = lazy_df.filter(
+                pl.col("sample_gt").str.contains("1")
+                | pl.col("sample_gt").str.contains("2")
+            )
+        # Apply minimum depth filter
+        if "sample_dp_min" in quality_config:
+            min_dp = quality_config["sample_dp_min"]
+            lazy_df = lazy_df.filter(
+                pl.col("sample_dp").cast(pl.Float64, strict=False) >= min_dp
+            )
+        # Apply minimum GQ filter
+        if "sample_gq_min" in quality_config:
+            min_gq = quality_config["sample_gq_min"]
+            lazy_df = lazy_df.filter(
+                pl.col("sample_gq").cast(pl.Float64, strict=False) >= min_gq
+            )
+        # VAF filters for heterozygous (0/1 or 1/0)
+        if (
+            "sample_vaf_het_min" in quality_config
+            or "sample_vaf_het_max" in quality_config
+        ):
+            # Check if genotype is het (contains one '1' and one '0', no '2')
+            is_het = (
+                (pl.col("sample_gt").str.count_matches("1") == 1)
+                & (pl.col("sample_gt").str.count_matches("0") == 1)
+                & (~pl.col("sample_gt").str.contains("2"))
+            )
+            het_conditions = []
+            if "sample_vaf_het_min" in quality_config:
+                het_conditions.append(
+                    pl.col("sample_vaf") >= quality_config["sample_vaf_het_min"]
+                )
+            if "sample_vaf_het_max" in quality_config:
+                het_conditions.append(
+                    pl.col("sample_vaf") <= quality_config["sample_vaf_het_max"]
+                )
+            if het_conditions:
+                het_filter = het_conditions[0]
+                for cond in het_conditions[1:]:
+                    het_filter = het_filter & cond
+                lazy_df = lazy_df.filter(~is_het | het_filter)
+        # VAF filter for homozygous alternate (1/1)
+        if "sample_vaf_homalt_min" in quality_config:
+            is_homalt = pl.col("sample_gt") == "1/1"
+            lazy_df = lazy_df.filter(
+                ~is_homalt
+                | (pl.col("sample_vaf") >= quality_config["sample_vaf_homalt_min"])
+            )
+        # VAF filter for homozygous reference (0/0)
+        if "sample_vaf_hom_ref_max" in quality_config:
+            is_hom_ref = pl.col("sample_gt") == "0/0"
+            lazy_df = lazy_df.filter(
+                ~is_hom_ref
+                | (pl.col("sample_vaf") <= quality_config["sample_vaf_hom_ref_max"])
+            )
+        # Apply same filters to parents if requested
+        apply_to_parents = quality_config.get("apply_to_parents", False)
+        if apply_to_parents:
+            # Father filters
+            if "sample_dp_min" in quality_config:
+                min_dp = quality_config["sample_dp_min"]
+                lazy_df = lazy_df.filter(
+                    (pl.col("father_dp").is_null())
+                    | (pl.col("father_dp").cast(pl.Float64, strict=False) >= min_dp)
+                )
+            if "sample_gq_min" in quality_config:
+                min_gq = quality_config["sample_gq_min"]
+                lazy_df = lazy_df.filter(
+                    (pl.col("father_gq").is_null())
+                    | (pl.col("father_gq").cast(pl.Float64, strict=False) >= min_gq)
+                )
+            # Mother filters
+            if "sample_dp_min" in quality_config:
+                min_dp = quality_config["sample_dp_min"]
+                lazy_df = lazy_df.filter(
+                    (pl.col("mother_dp").is_null())
+                    | (pl.col("mother_dp").cast(pl.Float64, strict=False) >= min_dp)
+                )
+            if "sample_gq_min" in quality_config:
+                min_gq = quality_config["sample_gq_min"]
+                lazy_df = lazy_df.filter(
+                    (pl.col("mother_gq").is_null())
+                    | (pl.col("mother_gq").cast(pl.Float64, strict=False) >= min_gq)
+                )
+    # Apply expression filter if provided
+    if expression:
+        if verbose:
+            click.echo(f"Applying expression filter: {expression}", err=True)
+        # We need to collect temporarily to use parse_impact_filter_expression
+        df = lazy_df.collect(streaming=True)
+        filter_expr = parse_impact_filter_expression(expression, df)
+        lazy_df = df.lazy().filter(filter_expr)
+    return lazy_df
 if __name__ == "__main__":
     cli()

{pywombat-0.2.0.dist-info → pywombat-0.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pywombat
-Version: 0.2.0
+Version: 0.4.0
 Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
 Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
 Project-URL: Repository, https://github.com/bourgeron-lab/pywombat

pywombat-0.4.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,6 @@
+pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
+pywombat/cli.py,sha256=dg38E39VpdJhKQt3aGSHwSiLWn1W8JnUkcsy3ZUHD5w,43518
+pywombat-0.4.0.dist-info/METADATA,sha256=ZKPTIp9ud2AIVbcujg4ciq900DX-UkGs5oafa41jxTQ,4982
+pywombat-0.4.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+pywombat-0.4.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
+pywombat-0.4.0.dist-info/RECORD,,

pywombat-0.2.0.dist-info/RECORD DELETED Viewed

@@ -1,6 +0,0 @@
-pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
-pywombat/cli.py,sha256=PZKV6FoqZyGgG7_mMIO2FzyeONdBaCqnhDATYsQJqMo,33899
-pywombat-0.2.0.dist-info/METADATA,sha256=7Qg2XnaTM92pmIewu5fw_vrcQW5JCVkkj2q6mNC9v88,4982
-pywombat-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-pywombat-0.2.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
-pywombat-0.2.0.dist-info/RECORD,,

{pywombat-0.2.0.dist-info → pywombat-0.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{pywombat-0.2.0.dist-info → pywombat-0.4.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

pywombat 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

pywombat 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl