PyPI - pywombat - Versions diffs - 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl - Mend

pywombat 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

pywombat/cli.py CHANGED Viewed

@@ -1198,15 +1198,18 @@ def read_pedigree(pedigree_path: Path) -> pl.DataFrame:
     pedigree_df = df.select(select_cols)
     # Replace 0 and -9 with null (indicating no parent)
+    # Explicit cast to Utf8 ensures type is preserved even when all values become null
     pedigree_df = pedigree_df.with_columns(
         [
             pl.when(pl.col("father_id").cast(pl.Utf8).is_in(["0", "-9"]))
             .then(None)
             .otherwise(pl.col("father_id"))
+            .cast(pl.Utf8)
             .alias("father_id"),
             pl.when(pl.col("mother_id").cast(pl.Utf8).is_in(["0", "-9"]))
             .then(None)
             .otherwise(pl.col("mother_id"))
+            .cast(pl.Utf8)
             .alias("mother_id"),
         ]
     )
@@ -1313,6 +1316,10 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
     This is a separate step that can be applied after filtering to avoid
     expensive annotation expansion on variants that will be filtered out.
+    Handles two types of INFO fields:
+    - Key-value pairs (e.g., "DP=30") -> extracted as string values
+    - Boolean flags (e.g., "PASS", "DB") -> created as True/False columns
     Args:
         df: DataFrame with (null) column
@@ -1324,9 +1331,10 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
         # Already expanded or missing - return as-is
         return df
-    # Extract all unique field names from the (null) column
+    # Extract all unique field names and flags from the (null) column
     null_values = df.select("(null)").to_series()
     all_fields = set()
+    all_flags = set()
     for value in null_values:
         if value and not (isinstance(value, float)):  # Skip null/NaN values
@@ -1335,8 +1343,10 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
                 if "=" in pair:
                     field_name = pair.split("=", 1)[0]
                     all_fields.add(field_name)
+                elif pair.strip():  # Boolean flag (no '=')
+                    all_flags.add(pair.strip())
-    # Create expressions to extract each field
+    # Create expressions to extract each key-value field
     for field in sorted(all_fields):
         # Extract the field value from the (null) column
         # Pattern: extract value after "field=" and before ";" or end of string
@@ -1344,6 +1354,14 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
             pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
         )
+    # Create boolean columns for flags
+    for flag in sorted(all_flags):
+        # Check if flag appears in the (null) column (as whole word)
+        # Use regex to match flag as a separate field (not part of another field name)
+        df = df.with_columns(
+            pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag)
+        )
     # Drop the original (null) column
     df = df.drop("(null)")

{pywombat-1.0.0.dist-info → pywombat-1.0.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pywombat
-Version: 1.0.0
+Version: 1.0.2
 Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
 Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
 Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
@@ -35,6 +35,7 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
 🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
 📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
 🎯 **Expression Filters**: Complex filtering with logical expressions
+🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
 ⚡ **Streaming Mode**: Memory-efficient processing of large files
 ---
@@ -77,7 +78,7 @@ uv run wombat input.tsv -o output
 PyWombat transforms bcftools tabulated TSV files into analysis-ready formats by:
-1. **Expanding the `(null)` INFO column**: Extracts all `NAME=value` fields (e.g., `DP=30;AF=0.5;AC=2`) into separate columns
+1. **Expanding the `(null)` INFO column**: Extracts all `NAME=value` fields (e.g., `DP=30;AF=0.5;AC=2`) and boolean flags (e.g., `PASS`, `DB`) into separate columns
 2. **Melting sample columns**: Converts wide-format sample data into long format with one row per variant-sample combination
 3. **Extracting genotype data**: Parses `GT:DP:GQ:AD` format into separate columns with calculated VAF
 4. **Adding parent data**: Joins father/mother genotypes when pedigree is provided
@@ -88,20 +89,22 @@ PyWombat transforms bcftools tabulated TSV files into analysis-ready formats by:
 **Input (Wide Format):**
 ```tsv
-#CHROM  POS  REF  ALT  (null)              Sample1:GT:DP:GQ:AD  Sample2:GT:DP:GQ:AD
-chr1    100  A    T    DP=30;AF=0.5;AC=2   0/1:15:99:5,10      1/1:18:99:0,18
+#CHROM  POS  REF  ALT  (null)                      Sample1:GT:DP:GQ:AD  Sample2:GT:DP:GQ:AD
+chr1    100  A    T    DP=30;AF=0.5;PASS;AC=2      0/1:15:99:5,10      1/1:18:99:0,18
 ```
 **Output (Long Format):**
 ```tsv
-#CHROM  POS  REF  ALT  AC  AF   DP  sample   sample_gt  sample_dp  sample_gq  sample_ad  sample_vaf
-chr1    100  A    T    2   0.5  30  Sample1  0/1        15         99         10         0.6667
-chr1    100  A    T    2   0.5  30  Sample2  1/1        18         99         18         1.0
+#CHROM  POS  REF  ALT  AC  AF   DP  PASS  sample   sample_gt  sample_dp  sample_gq  sample_ad  sample_vaf
+chr1    100  A    T    2   0.5  30  true  Sample1  0/1        15         99         10         0.6667
+chr1    100  A    T    2   0.5  30  true  Sample2  1/1        18         99         18         1.0
 ```
 **Generated Columns:**
+- INFO fields with `=`: Extracted as separate columns (e.g., `DP`, `AF`, `AC`)
+- INFO boolean flags: Extracted as True/False columns (e.g., `PASS`, `DB`, `SOMATIC`)
 - `sample`: Sample identifier
 - `sample_gt`: Genotype (e.g., 0/1, 1/1)
 - `sample_dp`: Read depth (total coverage)

pywombat-1.0.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,6 @@
+pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
+pywombat/cli.py,sha256=76pVpYYyl9rCm6TCq86j7xAqEC9pOvjiWCX9MmrBB_o,74994
+pywombat-1.0.2.dist-info/METADATA,sha256=slcagFwSvA99GVzKLRZggFPy5dkLEh_09O3dIB8Hfr4,17168
+pywombat-1.0.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+pywombat-1.0.2.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
+pywombat-1.0.2.dist-info/RECORD,,

pywombat-1.0.0.dist-info/RECORD DELETED Viewed

@@ -1,6 +0,0 @@
-pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
-pywombat/cli.py,sha256=FK1bEKtFD1Drp5LNdXaVie4zyjYbZc3wTbsjms-wISU,74176
-pywombat-1.0.0.dist-info/METADATA,sha256=bIm5-Az795PLluvA_6yBPcHkcq6EOZbvB_g-4jPjx_U,16828
-pywombat-1.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-pywombat-1.0.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
-pywombat-1.0.0.dist-info/RECORD,,

{pywombat-1.0.0.dist-info → pywombat-1.0.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{pywombat-1.0.0.dist-info → pywombat-1.0.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

pywombat 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

pywombat 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl