pywombat 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pywombat/cli.py +17 -2
- {pywombat-1.0.0.dist-info → pywombat-1.0.1.dist-info}/METADATA +10 -7
- pywombat-1.0.1.dist-info/RECORD +6 -0
- pywombat-1.0.0.dist-info/RECORD +0 -6
- {pywombat-1.0.0.dist-info → pywombat-1.0.1.dist-info}/WHEEL +0 -0
- {pywombat-1.0.0.dist-info → pywombat-1.0.1.dist-info}/entry_points.txt +0 -0
pywombat/cli.py
CHANGED
|
@@ -1313,6 +1313,10 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
|
|
|
1313
1313
|
This is a separate step that can be applied after filtering to avoid
|
|
1314
1314
|
expensive annotation expansion on variants that will be filtered out.
|
|
1315
1315
|
|
|
1316
|
+
Handles two types of INFO fields:
|
|
1317
|
+
- Key-value pairs (e.g., "DP=30") -> extracted as string values
|
|
1318
|
+
- Boolean flags (e.g., "PASS", "DB") -> created as True/False columns
|
|
1319
|
+
|
|
1316
1320
|
Args:
|
|
1317
1321
|
df: DataFrame with (null) column
|
|
1318
1322
|
|
|
@@ -1324,9 +1328,10 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
|
|
|
1324
1328
|
# Already expanded or missing - return as-is
|
|
1325
1329
|
return df
|
|
1326
1330
|
|
|
1327
|
-
# Extract all unique field names from the (null) column
|
|
1331
|
+
# Extract all unique field names and flags from the (null) column
|
|
1328
1332
|
null_values = df.select("(null)").to_series()
|
|
1329
1333
|
all_fields = set()
|
|
1334
|
+
all_flags = set()
|
|
1330
1335
|
|
|
1331
1336
|
for value in null_values:
|
|
1332
1337
|
if value and not (isinstance(value, float)): # Skip null/NaN values
|
|
@@ -1335,8 +1340,10 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
|
|
|
1335
1340
|
if "=" in pair:
|
|
1336
1341
|
field_name = pair.split("=", 1)[0]
|
|
1337
1342
|
all_fields.add(field_name)
|
|
1343
|
+
elif pair.strip(): # Boolean flag (no '=')
|
|
1344
|
+
all_flags.add(pair.strip())
|
|
1338
1345
|
|
|
1339
|
-
# Create expressions to extract each field
|
|
1346
|
+
# Create expressions to extract each key-value field
|
|
1340
1347
|
for field in sorted(all_fields):
|
|
1341
1348
|
# Extract the field value from the (null) column
|
|
1342
1349
|
# Pattern: extract value after "field=" and before ";" or end of string
|
|
@@ -1344,6 +1351,14 @@ def format_expand_annotations(df: pl.DataFrame) -> pl.DataFrame:
|
|
|
1344
1351
|
pl.col("(null)").str.extract(f"{field}=([^;]+)").alias(field)
|
|
1345
1352
|
)
|
|
1346
1353
|
|
|
1354
|
+
# Create boolean columns for flags
|
|
1355
|
+
for flag in sorted(all_flags):
|
|
1356
|
+
# Check if flag appears in the (null) column (as whole word)
|
|
1357
|
+
# Use regex to match flag as a separate field (not part of another field name)
|
|
1358
|
+
df = df.with_columns(
|
|
1359
|
+
pl.col("(null)").str.contains(f"(^|;){flag}(;|$)").alias(flag)
|
|
1360
|
+
)
|
|
1361
|
+
|
|
1347
1362
|
# Drop the original (null) column
|
|
1348
1363
|
df = df.drop("(null)")
|
|
1349
1364
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pywombat
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.0.1
|
|
4
4
|
Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
|
|
5
5
|
Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
|
|
6
6
|
Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
|
|
@@ -35,6 +35,7 @@ A high-performance CLI tool for processing and filtering bcftools tabulated TSV
|
|
|
35
35
|
🧬 **De Novo Detection**: Sex-chromosome-aware DNM identification
|
|
36
36
|
📊 **Flexible Output**: TSV, compressed TSV, or Parquet formats
|
|
37
37
|
🎯 **Expression Filters**: Complex filtering with logical expressions
|
|
38
|
+
🏷️ **Boolean Flag Support**: INFO field flags (PASS, DB, etc.) extracted as True/False columns
|
|
38
39
|
⚡ **Streaming Mode**: Memory-efficient processing of large files
|
|
39
40
|
|
|
40
41
|
---
|
|
@@ -77,7 +78,7 @@ uv run wombat input.tsv -o output
|
|
|
77
78
|
|
|
78
79
|
PyWombat transforms bcftools tabulated TSV files into analysis-ready formats by:
|
|
79
80
|
|
|
80
|
-
1. **Expanding the `(null)` INFO column**: Extracts all `NAME=value` fields (e.g., `DP=30;AF=0.5;AC=2`) into separate columns
|
|
81
|
+
1. **Expanding the `(null)` INFO column**: Extracts all `NAME=value` fields (e.g., `DP=30;AF=0.5;AC=2`) and boolean flags (e.g., `PASS`, `DB`) into separate columns
|
|
81
82
|
2. **Melting sample columns**: Converts wide-format sample data into long format with one row per variant-sample combination
|
|
82
83
|
3. **Extracting genotype data**: Parses `GT:DP:GQ:AD` format into separate columns with calculated VAF
|
|
83
84
|
4. **Adding parent data**: Joins father/mother genotypes when pedigree is provided
|
|
@@ -88,20 +89,22 @@ PyWombat transforms bcftools tabulated TSV files into analysis-ready formats by:
|
|
|
88
89
|
**Input (Wide Format):**
|
|
89
90
|
|
|
90
91
|
```tsv
|
|
91
|
-
#CHROM POS REF ALT (null)
|
|
92
|
-
chr1 100 A T DP=30;AF=0.5;AC=2
|
|
92
|
+
#CHROM POS REF ALT (null) Sample1:GT:DP:GQ:AD Sample2:GT:DP:GQ:AD
|
|
93
|
+
chr1 100 A T DP=30;AF=0.5;PASS;AC=2 0/1:15:99:5,10 1/1:18:99:0,18
|
|
93
94
|
```
|
|
94
95
|
|
|
95
96
|
**Output (Long Format):**
|
|
96
97
|
|
|
97
98
|
```tsv
|
|
98
|
-
#CHROM POS REF ALT AC AF DP sample sample_gt sample_dp sample_gq sample_ad sample_vaf
|
|
99
|
-
chr1 100 A T 2 0.5 30 Sample1 0/1 15 99 10 0.6667
|
|
100
|
-
chr1 100 A T 2 0.5 30 Sample2 1/1 18 99 18 1.0
|
|
99
|
+
#CHROM POS REF ALT AC AF DP PASS sample sample_gt sample_dp sample_gq sample_ad sample_vaf
|
|
100
|
+
chr1 100 A T 2 0.5 30 true Sample1 0/1 15 99 10 0.6667
|
|
101
|
+
chr1 100 A T 2 0.5 30 true Sample2 1/1 18 99 18 1.0
|
|
101
102
|
```
|
|
102
103
|
|
|
103
104
|
**Generated Columns:**
|
|
104
105
|
|
|
106
|
+
- INFO fields with `=`: Extracted as separate columns (e.g., `DP`, `AF`, `AC`)
|
|
107
|
+
- INFO boolean flags: Extracted as True/False columns (e.g., `PASS`, `DB`, `SOMATIC`)
|
|
105
108
|
- `sample`: Sample identifier
|
|
106
109
|
- `sample_gt`: Genotype (e.g., 0/1, 1/1)
|
|
107
110
|
- `sample_dp`: Read depth (total coverage)
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
|
|
2
|
+
pywombat/cli.py,sha256=eaChYSTxEc3lXxVRKe3X8bRGKmgxUE0Vuy9Cr5wPTi4,74853
|
|
3
|
+
pywombat-1.0.1.dist-info/METADATA,sha256=G0xdJEOwfB-J1ZOy6qphijM4JBygZppMeRs0J8mzSj0,17168
|
|
4
|
+
pywombat-1.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
5
|
+
pywombat-1.0.1.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
|
|
6
|
+
pywombat-1.0.1.dist-info/RECORD,,
|
pywombat-1.0.0.dist-info/RECORD
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
pywombat/__init__.py,sha256=iIPN9vJtsIUhl_DiKNnknxCamLinfayodLLFK8y-aJg,54
|
|
2
|
-
pywombat/cli.py,sha256=FK1bEKtFD1Drp5LNdXaVie4zyjYbZc3wTbsjms-wISU,74176
|
|
3
|
-
pywombat-1.0.0.dist-info/METADATA,sha256=bIm5-Az795PLluvA_6yBPcHkcq6EOZbvB_g-4jPjx_U,16828
|
|
4
|
-
pywombat-1.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
5
|
-
pywombat-1.0.0.dist-info/entry_points.txt,sha256=Vt7U2ypbiEgCBlEV71ZPk287H5_HKmPBT4iBu6duEcE,44
|
|
6
|
-
pywombat-1.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|