pywombat 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
# Builds the package and hands the result to the PyPI publish action.
name: Publish to PyPI

on:
  release:
    types:
      - published
  # Allows manual trigger
  workflow_dispatch:

jobs:
  build-and-publish:
    runs-on: ubuntu-latest
    permissions:
      # Required for trusted publishing
      id-token: write
      contents: read

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install build dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build hatchling

      - name: Build package
        run: python -m build

      # No token or password input is configured for this step.
      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,12 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+ output.tsv
12
+ tests/*
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,142 @@
1
+ Metadata-Version: 2.4
2
+ Name: pywombat
3
+ Version: 0.1.0
4
+ Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
5
+ Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
6
+ Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
7
+ Project-URL: Issues, https://github.com/bourgeron-lab/pywombat/issues
8
+ Author-email: Freddy Cliquet <fcliquet@pasteur.fr>
9
+ License: MIT
10
+ Keywords: bioinformatics,genomics,pedigree,variant-calling,vcf
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
16
+ Requires-Python: >=3.12
17
+ Requires-Dist: click>=8.1.0
18
+ Requires-Dist: polars>=0.19.0
19
+ Requires-Dist: pyyaml>=6.0
20
+ Description-Content-Type: text/markdown
21
+
22
+ # PyWombat
23
+
24
+ A CLI tool for processing bcftools tabulated TSV files.
25
+
26
+ ## Installation
27
+
28
+ This is a UV-managed Python package. To install:
29
+
30
+ ```bash
31
+ uv sync
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ The `wombat` command processes bcftools tabulated TSV files:
37
+
38
+ ```bash
39
+ # Format a bcftools TSV file and print to stdout
40
+ wombat input.tsv
41
+
42
+ # Format and save to output file (creates output.tsv by default)
43
+ wombat input.tsv -o output
44
+
45
+ # Format and save as parquet
46
+ wombat input.tsv -o output -f parquet
47
+ wombat input.tsv -o output --format parquet
48
+
49
+ # Format with pedigree information to add parent genotypes
50
+ wombat input.tsv --pedigree pedigree.tsv -o output
51
+ ```
52
+
53
+ ### What does `wombat` do?
54
+
55
+ The `wombat` command processes bcftools tabulated TSV files by:
56
+
57
+ 1. **Expanding the `(null)` column**: This column contains multiple fields in the format `NAME=value` separated by semicolons (e.g., `DP=30;AF=0.5;AC=2`). Each field is extracted into its own column.
58
+
59
+ 2. **Preserving the `CSQ` column**: The CSQ (Consequence) column is preserved as-is and not melted, allowing VEP annotations to remain intact.
60
+
61
+ 3. **Melting and splitting sample columns**: After the `(null)` column, there are typically sample columns with values in `GT:DP:GQ:AD` format. The tool:
62
+ - Extracts the sample name (the part before the first `:` character)
63
+ - Transforms the wide format into long format
64
+ - Creates a `sample` column with the sample names
65
+ - Splits the sample values into separate columns:
66
+ - `sample_gt`: Genotype (e.g., 0/1, 1/1)
67
+ - `sample_dp`: Read depth
68
+ - `sample_gq`: Genotype quality
69
+ - `sample_ad`: Allele depth (takes the second value from the comma-separated list)
70
+ - `sample_vaf`: Variant allele frequency (calculated as sample_ad / sample_dp)
71
+
72
+ ### Example
73
+
74
+ **Input:**
75
+
76
+ ```tsv
77
+ CHROM POS REF ALT (null) Sample1:GT:Sample1:DP:Sample1:GQ:Sample1:AD Sample2:GT:Sample2:DP:Sample2:GQ:Sample2:AD
78
+ chr1 100 A T DP=30;AF=0.5;AC=2 0/1:15:99:5,10 1/1:18:99:0,18
79
+ ```
80
+
81
+ **Output:**
82
+
83
+ ```tsv
84
+ CHROM POS REF ALT AC AF DP sample sample_gt sample_dp sample_gq sample_ad sample_vaf
85
+ chr1 100 A T 2 0.5 30 Sample1 0/1 15 99 10 0.6667
86
+ chr1 100 A T 2 0.5 30 Sample2 1/1 18 99 18 1.0
87
+ ```
88
+
89
+ Notes:
90
+
91
+ - The `sample_ad` column contains the second value from the AD field (e.g., from `5,10` it extracts `10`)
92
+ - The `sample_vaf` column is the variant allele frequency calculated as `sample_ad / sample_dp`
93
+ - By default, output is in TSV format. Use `-f parquet` to output as Parquet files
94
+ - The `-o` option specifies an output prefix (e.g., `-o output` creates `output.tsv` or `output.parquet`)
95
+
96
+ ### Pedigree Support
97
+
98
+ You can provide a pedigree file with the `--pedigree` option to add parent genotype information to the output. This enables trio analysis by including the father's and mother's genotypes for each sample.
99
+
100
+ **Pedigree File Format:**
101
+
102
+ The pedigree file should be a tab-separated file with the following columns:
103
+
104
+ - `FID`: Family ID
105
+ - `sample_id`: Sample identifier (matches the sample names in the VCF)
106
+ - `FatherBarcode`: Father's sample identifier (use `0` or `-9` if unknown)
107
+ - `MotherBarcode`: Mother's sample identifier (use `0` or `-9` if unknown)
108
+ - `Sex`: Sex of the sample (optional)
109
+ - `Pheno`: Phenotype information (optional)
110
+
111
+ Example pedigree file:
112
+
113
+ ```tsv
114
+ FID sample_id FatherBarcode MotherBarcode Sex Pheno
115
+ FAM1 Child1 Father1 Mother1 1 2
116
+ FAM1 Father1 0 0 1 1
117
+ FAM1 Mother1 0 0 2 1
118
+ ```
119
+
120
+ **Output with Pedigree:**
121
+
122
+ When using `--pedigree`, the output will include additional columns for each parent:
123
+
124
+ - `father_gt`, `father_dp`, `father_gq`, `father_ad`, `father_vaf`: Father's genotype information
125
+ - `mother_gt`, `mother_dp`, `mother_gq`, `mother_ad`, `mother_vaf`: Mother's genotype information
126
+
127
+ These columns will contain the parent's genotype data for the same variant, allowing you to analyze inheritance patterns.
128
+
129
+ ## Development
130
+
131
+ This project uses:
132
+
133
+ - **UV** for package management
134
+ - **Polars** for fast data processing
135
+ - **Click** for the command-line interface
136
+
137
+ ## Testing
138
+
139
+ Test files are available in the `tests/` directory:
140
+
141
+ - `test.tabulated.tsv` - Real bcftools output
142
+ - `test_small.tsv` - Small example for quick testing
@@ -0,0 +1,92 @@
1
+ # Wombat Quick Start
2
+
3
+ ## Installation
4
+
5
+ ```bash
6
+ cd pywombat  # run from the root of your local clone of the repository
7
+ uv sync
8
+ ```
9
+
10
+ ## Basic Usage
11
+
12
+ ```bash
13
+ # Format a bcftools TSV file
14
+ uv run wombat format tests/test.tabulated.tsv -o output.tsv
15
+
16
+ # With verbose output to see progress
17
+ uv run wombat format tests/test.tabulated.tsv -o output.tsv --verbose
18
+
19
+ # Output to stdout
20
+ uv run wombat format tests/test.tabulated.tsv
21
+
22
+ # View help
23
+ uv run wombat --help
24
+ uv run wombat format --help
25
+ ```
26
+
27
+ ## What It Does
28
+
29
+ ### Input (Wide Format)
30
+
31
+ ```
32
+ CHROM POS REF ALT (null) Sample1:GT Sample2:GT
33
+ chr1 100 A T DP=30;AF=0.5;AC=2 0/1 1/1
34
+ ```
35
+
36
+ ### Output (Long Format)
37
+
38
+ ```
39
+ CHROM POS REF ALT AC AF DP sample sample_value
40
+ chr1 100 A T 2 0.5 30 Sample1 0/1
41
+ chr1 100 A T 2 0.5 30 Sample2 1/1
42
+ ```
43
+
44
+ ## Features
45
+
46
+ 1. **Expands (null) column**: Splits `DP=30;AF=0.5;AC=2` into separate columns
47
+ 2. **Melts samples**: Converts wide sample columns to long format
48
+ 3. **Fast processing**: Uses Polars for efficient data handling
49
+
50
+ ## Testing
51
+
52
+ ```bash
53
+ # Run the test suite
54
+ uv run python tests/test_format.py
55
+
56
+ # Test with small example
57
+ uv run wombat format tests/test_small.tsv -o tests/output_small.tsv
58
+
59
+ # Test with real data
60
+ uv run wombat format tests/test.tabulated.tsv -o tests/output.tsv
61
+ ```
62
+
63
+ ## Project Structure
64
+
65
+ ```
66
+ pywombat/
67
+ ├── pyproject.toml # Project configuration with dependencies
68
+ ├── README.md # Main documentation
69
+ ├── USAGE.md # Detailed usage guide
70
+ ├── src/
71
+ │ └── pywombat/
72
+ │ ├── __init__.py # Package init
73
+ │ └── cli.py # Main CLI implementation
74
+ └── tests/
75
+ ├── test_format.py # Unit tests
76
+ ├── test_small.tsv # Small test file
77
+ └── test.tabulated.tsv # Real bcftools output
78
+ ```
79
+
80
+ ## Dependencies
81
+
82
+ - **polars** (>=0.19.0): Fast DataFrame library for data processing
83
+ - **click** (>=8.1.0): Command-line interface framework
84
+
85
+ ## Troubleshooting
86
+
87
+ If you encounter issues:
88
+
89
+ 1. Make sure dependencies are installed: `uv sync`
90
+ 2. Check that the input file is a valid TSV with a `(null)` column
91
+ 3. Use `--verbose` flag to see processing details
92
+ 4. For large files, ensure sufficient memory is available
@@ -0,0 +1,121 @@
1
+ # PyWombat
2
+
3
+ A CLI tool for processing bcftools tabulated TSV files.
4
+
5
+ ## Installation
6
+
7
+ This is a UV-managed Python package. To install:
8
+
9
+ ```bash
10
+ uv sync
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ The `wombat` command processes bcftools tabulated TSV files:
16
+
17
+ ```bash
18
+ # Format a bcftools TSV file and print to stdout
19
+ wombat input.tsv
20
+
21
+ # Format and save to output file (creates output.tsv by default)
22
+ wombat input.tsv -o output
23
+
24
+ # Format and save as parquet
25
+ wombat input.tsv -o output -f parquet
26
+ wombat input.tsv -o output --format parquet
27
+
28
+ # Format with pedigree information to add parent genotypes
29
+ wombat input.tsv --pedigree pedigree.tsv -o output
30
+ ```
31
+
32
+ ### What does `wombat` do?
33
+
34
+ The `wombat` command processes bcftools tabulated TSV files by:
35
+
36
+ 1. **Expanding the `(null)` column**: This column contains multiple fields in the format `NAME=value` separated by semicolons (e.g., `DP=30;AF=0.5;AC=2`). Each field is extracted into its own column.
37
+
38
+ 2. **Preserving the `CSQ` column**: The CSQ (Consequence) column is preserved as-is and not melted, allowing VEP annotations to remain intact.
39
+
40
+ 3. **Melting and splitting sample columns**: After the `(null)` column, there are typically sample columns with values in `GT:DP:GQ:AD` format. The tool:
41
+ - Extracts the sample name (the part before the first `:` character)
42
+ - Transforms the wide format into long format
43
+ - Creates a `sample` column with the sample names
44
+ - Splits the sample values into separate columns:
45
+ - `sample_gt`: Genotype (e.g., 0/1, 1/1)
46
+ - `sample_dp`: Read depth
47
+ - `sample_gq`: Genotype quality
48
+ - `sample_ad`: Allele depth (takes the second value from the comma-separated list)
49
+ - `sample_vaf`: Variant allele frequency (calculated as sample_ad / sample_dp)
50
+
51
+ ### Example
52
+
53
+ **Input:**
54
+
55
+ ```tsv
56
+ CHROM POS REF ALT (null) Sample1:GT:Sample1:DP:Sample1:GQ:Sample1:AD Sample2:GT:Sample2:DP:Sample2:GQ:Sample2:AD
57
+ chr1 100 A T DP=30;AF=0.5;AC=2 0/1:15:99:5,10 1/1:18:99:0,18
58
+ ```
59
+
60
+ **Output:**
61
+
62
+ ```tsv
63
+ CHROM POS REF ALT AC AF DP sample sample_gt sample_dp sample_gq sample_ad sample_vaf
64
+ chr1 100 A T 2 0.5 30 Sample1 0/1 15 99 10 0.6667
65
+ chr1 100 A T 2 0.5 30 Sample2 1/1 18 99 18 1.0
66
+ ```
67
+
68
+ Notes:
69
+
70
+ - The `sample_ad` column contains the second value from the AD field (e.g., from `5,10` it extracts `10`)
71
+ - The `sample_vaf` column is the variant allele frequency calculated as `sample_ad / sample_dp`
72
+ - By default, output is in TSV format. Use `-f parquet` to output as Parquet files
73
+ - The `-o` option specifies an output prefix (e.g., `-o output` creates `output.tsv` or `output.parquet`)
74
+
75
+ ### Pedigree Support
76
+
77
+ You can provide a pedigree file with the `--pedigree` option to add parent genotype information to the output. This enables trio analysis by including the father's and mother's genotypes for each sample.
78
+
79
+ **Pedigree File Format:**
80
+
81
+ The pedigree file should be a tab-separated file with the following columns:
82
+
83
+ - `FID`: Family ID
84
+ - `sample_id`: Sample identifier (matches the sample names in the VCF)
85
+ - `FatherBarcode`: Father's sample identifier (use `0` or `-9` if unknown)
86
+ - `MotherBarcode`: Mother's sample identifier (use `0` or `-9` if unknown)
87
+ - `Sex`: Sex of the sample (optional)
88
+ - `Pheno`: Phenotype information (optional)
89
+
90
+ Example pedigree file:
91
+
92
+ ```tsv
93
+ FID sample_id FatherBarcode MotherBarcode Sex Pheno
94
+ FAM1 Child1 Father1 Mother1 1 2
95
+ FAM1 Father1 0 0 1 1
96
+ FAM1 Mother1 0 0 2 1
97
+ ```
98
+
99
+ **Output with Pedigree:**
100
+
101
+ When using `--pedigree`, the output will include additional columns for each parent:
102
+
103
+ - `father_gt`, `father_dp`, `father_gq`, `father_ad`, `father_vaf`: Father's genotype information
104
+ - `mother_gt`, `mother_dp`, `mother_gq`, `mother_ad`, `mother_vaf`: Mother's genotype information
105
+
106
+ These columns will contain the parent's genotype data for the same variant, allowing you to analyze inheritance patterns.
107
+
108
+ ## Development
109
+
110
+ This project uses:
111
+
112
+ - **UV** for package management
113
+ - **Polars** for fast data processing
114
+ - **Click** for the command-line interface
115
+
116
+ ## Testing
117
+
118
+ Test files are available in the `tests/` directory:
119
+
120
+ - `test.tabulated.tsv` - Real bcftools output
121
+ - `test_small.tsv` - Small example for quick testing
@@ -0,0 +1,29 @@
1
# Build backend used to produce the distribution artifacts.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "pywombat"
version = "0.1.0"
description = "A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support"
readme = "README.md"
requires-python = ">=3.12"
license = { text = "MIT" }
authors = [{ name = "Freddy Cliquet", email = "fcliquet@pasteur.fr" }]
keywords = [
    "vcf",
    "bioinformatics",
    "genomics",
    "pedigree",
    "variant-calling",
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.12",
]
dependencies = [
    "click>=8.1.0",
    "polars>=0.19.0",
    "pyyaml>=6.0",
]

[project.urls]
Homepage = "https://github.com/bourgeron-lab/pywombat"
Repository = "https://github.com/bourgeron-lab/pywombat"
Issues = "https://github.com/bourgeron-lab/pywombat/issues"

# Console command `wombat`, bound to the `cli` object in `pywombat.cli`.
[project.scripts]
wombat = "pywombat.cli:cli"
@@ -0,0 +1,2 @@
1
def main() -> None:
    """Write the package's placeholder greeting to standard output."""
    greeting = "Hello from pywombat!"
    print(greeting)