pywombat 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pywombat-0.1.0/.github/workflows/publish.yml +33 -0
- pywombat-0.1.0/.gitignore +12 -0
- pywombat-0.1.0/.python-version +1 -0
- pywombat-0.1.0/PKG-INFO +142 -0
- pywombat-0.1.0/QUICKSTART.md +92 -0
- pywombat-0.1.0/README.md +121 -0
- pywombat-0.1.0/pyproject.toml +29 -0
- pywombat-0.1.0/src/pywombat/__init__.py +2 -0
- pywombat-0.1.0/src/pywombat/cli.py +935 -0
- pywombat-0.1.0/uv.lock +113 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
workflow_dispatch: # Allows manual trigger
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build-and-publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
permissions:
|
|
12
|
+
id-token: write # Required for trusted publishing
|
|
13
|
+
contents: read
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- name: Checkout code
|
|
17
|
+
uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: '3.12'
|
|
23
|
+
|
|
24
|
+
- name: Install build dependencies
|
|
25
|
+
run: |
|
|
26
|
+
python -m pip install --upgrade pip
|
|
27
|
+
pip install build hatchling
|
|
28
|
+
|
|
29
|
+
- name: Build package
|
|
30
|
+
run: python -m build
|
|
31
|
+
|
|
32
|
+
- name: Publish to PyPI
|
|
33
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
pywombat-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pywombat
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support
|
|
5
|
+
Project-URL: Homepage, https://github.com/bourgeron-lab/pywombat
|
|
6
|
+
Project-URL: Repository, https://github.com/bourgeron-lab/pywombat
|
|
7
|
+
Project-URL: Issues, https://github.com/bourgeron-lab/pywombat/issues
|
|
8
|
+
Author-email: Freddy Cliquet <fcliquet@pasteur.fr>
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: bioinformatics,genomics,pedigree,variant-calling,vcf
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Requires-Dist: click>=8.1.0
|
|
18
|
+
Requires-Dist: polars>=0.19.0
|
|
19
|
+
Requires-Dist: pyyaml>=6.0
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# PyWombat
|
|
23
|
+
|
|
24
|
+
A CLI tool for processing bcftools tabulated TSV files.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
This is a UV-managed Python package. To install:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv sync
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
The `wombat` command processes bcftools tabulated TSV files:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# Format a bcftools TSV file and print to stdout
|
|
40
|
+
wombat input.tsv
|
|
41
|
+
|
|
42
|
+
# Format and save to output file (creates output.tsv by default)
|
|
43
|
+
wombat input.tsv -o output
|
|
44
|
+
|
|
45
|
+
# Format and save as parquet
|
|
46
|
+
wombat input.tsv -o output -f parquet
|
|
47
|
+
wombat input.tsv -o output --format parquet
|
|
48
|
+
|
|
49
|
+
# Format with pedigree information to add parent genotypes
|
|
50
|
+
wombat input.tsv --pedigree pedigree.tsv -o output
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### What does `wombat` do?
|
|
54
|
+
|
|
55
|
+
The `wombat` command processes bcftools tabulated TSV files by:
|
|
56
|
+
|
|
57
|
+
1. **Expanding the `(null)` column**: This column contains multiple fields in the format `NAME=value` separated by semicolons (e.g., `DP=30;AF=0.5;AC=2`). Each field is extracted into its own column.
|
|
58
|
+
|
|
59
|
+
2. **Preserving the `CSQ` column**: The CSQ (Consequence) column is preserved as-is and not melted, allowing VEP annotations to remain intact.
|
|
60
|
+
|
|
61
|
+
3. **Melting and splitting sample columns**: After the `(null)` column, there are typically sample columns with values in `GT:DP:GQ:AD` format. The tool:
|
|
62
|
+
- Extracts the sample name (the part before the first `:` character)
|
|
63
|
+
- Transforms the wide format into long format
|
|
64
|
+
- Creates a `sample` column with the sample names
|
|
65
|
+
- Splits the sample values into separate columns:
|
|
66
|
+
- `sample_gt`: Genotype (e.g., 0/1, 1/1)
|
|
67
|
+
- `sample_dp`: Read depth
|
|
68
|
+
- `sample_gq`: Genotype quality
|
|
69
|
+
- `sample_ad`: Allele depth (takes the second value from the comma-separated `AD` list, e.g. `10` from `5,10`)
|
|
70
|
+
- `sample_vaf`: Variant allele frequency (calculated as sample_ad / sample_dp)
|
|
71
|
+
|
|
72
|
+
### Example
|
|
73
|
+
|
|
74
|
+
**Input:**
|
|
75
|
+
|
|
76
|
+
```tsv
|
|
77
|
+
CHROM POS REF ALT (null) Sample1:GT:Sample1:DP:Sample1:GQ:Sample1:AD Sample2:GT:Sample2:DP:Sample2:GQ:Sample2:AD
|
|
78
|
+
chr1 100 A T DP=30;AF=0.5;AC=2 0/1:15:99:5,10 1/1:18:99:0,18
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Output:**
|
|
82
|
+
|
|
83
|
+
```tsv
|
|
84
|
+
CHROM POS REF ALT AC AF DP sample sample_gt sample_dp sample_gq sample_ad sample_vaf
|
|
85
|
+
chr1 100 A T 2 0.5 30 Sample1 0/1 15 99 10 0.6667
|
|
86
|
+
chr1 100 A T 2 0.5 30 Sample2 1/1 18 99 18 1.0
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Notes:
|
|
90
|
+
|
|
91
|
+
- The `sample_ad` column contains the second value from the AD field (e.g., from `5,10` it extracts `10`)
|
|
92
|
+
- The `sample_vaf` column is the variant allele frequency calculated as `sample_ad / sample_dp`
|
|
93
|
+
- By default, output is in TSV format. Use `-f parquet` to output as Parquet files
|
|
94
|
+
- The `-o` option specifies an output prefix (e.g., `-o output` creates `output.tsv` or `output.parquet`)
|
|
95
|
+
|
|
96
|
+
### Pedigree Support
|
|
97
|
+
|
|
98
|
+
You can provide a pedigree file with the `--pedigree` option to add parent genotype information to the output. This enables trio analysis by including the father's and mother's genotypes for each sample.
|
|
99
|
+
|
|
100
|
+
**Pedigree File Format:**
|
|
101
|
+
|
|
102
|
+
The pedigree file should be a tab-separated file with the following columns:
|
|
103
|
+
|
|
104
|
+
- `FID`: Family ID
|
|
105
|
+
- `sample_id`: Sample identifier (matches the sample names in the VCF)
|
|
106
|
+
- `FatherBarcode`: Father's sample identifier (use `0` or `-9` if unknown)
|
|
107
|
+
- `MotherBarcode`: Mother's sample identifier (use `0` or `-9` if unknown)
|
|
108
|
+
- `Sex`: Sex of the sample (optional)
|
|
109
|
+
- `Pheno`: Phenotype information (optional)
|
|
110
|
+
|
|
111
|
+
Example pedigree file:
|
|
112
|
+
|
|
113
|
+
```tsv
|
|
114
|
+
FID sample_id FatherBarcode MotherBarcode Sex Pheno
|
|
115
|
+
FAM1 Child1 Father1 Mother1 1 2
|
|
116
|
+
FAM1 Father1 0 0 1 1
|
|
117
|
+
FAM1 Mother1 0 0 2 1
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
**Output with Pedigree:**
|
|
121
|
+
|
|
122
|
+
When using `--pedigree`, the output will include additional columns for each parent:
|
|
123
|
+
|
|
124
|
+
- `father_gt`, `father_dp`, `father_gq`, `father_ad`, `father_vaf`: Father's genotype information
|
|
125
|
+
- `mother_gt`, `mother_dp`, `mother_gq`, `mother_ad`, `mother_vaf`: Mother's genotype information
|
|
126
|
+
|
|
127
|
+
These columns will contain the parent's genotype data for the same variant, allowing you to analyze inheritance patterns.
|
|
128
|
+
|
|
129
|
+
## Development
|
|
130
|
+
|
|
131
|
+
This project uses:
|
|
132
|
+
|
|
133
|
+
- **UV** for package management
|
|
134
|
+
- **Polars** for fast data processing
|
|
135
|
+
- **Click** for the command-line interface
|
|
136
|
+
|
|
137
|
+
## Testing
|
|
138
|
+
|
|
139
|
+
Test files are available in the `tests/` directory:
|
|
140
|
+
|
|
141
|
+
- `test.tabulated.tsv` - Real bcftools output
|
|
142
|
+
- `test_small.tsv` - Small example for quick testing
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Wombat Quick Start
|
|
2
|
+
|
|
3
|
+
## Installation
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
cd /path/to/pywombat
|
|
7
|
+
uv sync
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
## Basic Usage
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
# Format a bcftools TSV file
|
|
14
|
+
uv run wombat format tests/test.tabulated.tsv -o output.tsv
|
|
15
|
+
|
|
16
|
+
# With verbose output to see progress
|
|
17
|
+
uv run wombat format tests/test.tabulated.tsv -o output.tsv --verbose
|
|
18
|
+
|
|
19
|
+
# Output to stdout
|
|
20
|
+
uv run wombat format tests/test.tabulated.tsv
|
|
21
|
+
|
|
22
|
+
# View help
|
|
23
|
+
uv run wombat --help
|
|
24
|
+
uv run wombat format --help
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## What It Does
|
|
28
|
+
|
|
29
|
+
### Input (Wide Format)
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
CHROM POS REF ALT (null) Sample1:GT Sample2:GT
|
|
33
|
+
chr1 100 A T DP=30;AF=0.5;AC=2 0/1 1/1
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### Output (Long Format)
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
CHROM POS REF ALT AC AF DP sample sample_value
|
|
40
|
+
chr1 100 A T 2 0.5 30 Sample1 0/1
|
|
41
|
+
chr1 100 A T 2 0.5 30 Sample2 1/1
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Features
|
|
45
|
+
|
|
46
|
+
1. **Expands the `(null)` column**: Splits `DP=30;AF=0.5;AC=2` into separate columns
|
|
47
|
+
2. **Melts samples**: Converts wide sample columns to long format
|
|
48
|
+
3. **Fast processing**: Uses Polars for efficient data handling
|
|
49
|
+
|
|
50
|
+
## Testing
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Run the test suite
|
|
54
|
+
uv run python tests/test_format.py
|
|
55
|
+
|
|
56
|
+
# Test with small example
|
|
57
|
+
uv run wombat format tests/test_small.tsv -o tests/output_small.tsv
|
|
58
|
+
|
|
59
|
+
# Test with real data
|
|
60
|
+
uv run wombat format tests/test.tabulated.tsv -o tests/output.tsv
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Project Structure
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
pywombat/
|
|
67
|
+
├── pyproject.toml # Project configuration with dependencies
|
|
68
|
+
├── README.md # Main documentation
|
|
69
|
+
├── USAGE.md # Detailed usage guide
|
|
70
|
+
├── src/
|
|
71
|
+
│ └── pywombat/
|
|
72
|
+
│ ├── __init__.py # Package init
|
|
73
|
+
│ └── cli.py # Main CLI implementation
|
|
74
|
+
└── tests/
|
|
75
|
+
├── test_format.py # Unit tests
|
|
76
|
+
├── test_small.tsv # Small test file
|
|
77
|
+
└── test.tabulated.tsv # Real bcftools output
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Dependencies
|
|
81
|
+
|
|
82
|
+
- **polars** (>=0.19.0): Fast DataFrame library for data processing
|
|
83
|
+
- **click** (>=8.1.0): Command-line interface framework
|
|
84
|
+
|
|
85
|
+
## Troubleshooting
|
|
86
|
+
|
|
87
|
+
If you encounter issues:
|
|
88
|
+
|
|
89
|
+
1. Make sure dependencies are installed: `uv sync`
|
|
90
|
+
2. Check that the input file is a valid TSV with a `(null)` column
|
|
91
|
+
3. Use `--verbose` flag to see processing details
|
|
92
|
+
4. For large files, ensure sufficient memory is available
|
pywombat-0.1.0/README.md
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# PyWombat
|
|
2
|
+
|
|
3
|
+
A CLI tool for processing bcftools tabulated TSV files.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
This is a UV-managed Python package. To install:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
uv sync
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
The `wombat` command processes bcftools tabulated TSV files:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# Format a bcftools TSV file and print to stdout
|
|
19
|
+
wombat input.tsv
|
|
20
|
+
|
|
21
|
+
# Format and save to output file (creates output.tsv by default)
|
|
22
|
+
wombat input.tsv -o output
|
|
23
|
+
|
|
24
|
+
# Format and save as parquet
|
|
25
|
+
wombat input.tsv -o output -f parquet
|
|
26
|
+
wombat input.tsv -o output --format parquet
|
|
27
|
+
|
|
28
|
+
# Format with pedigree information to add parent genotypes
|
|
29
|
+
wombat input.tsv --pedigree pedigree.tsv -o output
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### What does `wombat` do?
|
|
33
|
+
|
|
34
|
+
The `wombat` command processes bcftools tabulated TSV files by:
|
|
35
|
+
|
|
36
|
+
1. **Expanding the `(null)` column**: This column contains multiple fields in the format `NAME=value` separated by semicolons (e.g., `DP=30;AF=0.5;AC=2`). Each field is extracted into its own column.
|
|
37
|
+
|
|
38
|
+
2. **Preserving the `CSQ` column**: The CSQ (Consequence) column is preserved as-is and not melted, allowing VEP annotations to remain intact.
|
|
39
|
+
|
|
40
|
+
3. **Melting and splitting sample columns**: After the `(null)` column, there are typically sample columns with values in `GT:DP:GQ:AD` format. The tool:
|
|
41
|
+
- Extracts the sample name (the part before the first `:` character)
|
|
42
|
+
- Transforms the wide format into long format
|
|
43
|
+
- Creates a `sample` column with the sample names
|
|
44
|
+
- Splits the sample values into separate columns:
|
|
45
|
+
- `sample_gt`: Genotype (e.g., 0/1, 1/1)
|
|
46
|
+
- `sample_dp`: Read depth
|
|
47
|
+
- `sample_gq`: Genotype quality
|
|
48
|
+
- `sample_ad`: Allele depth (takes the second value from the comma-separated `AD` list, e.g. `10` from `5,10`)
|
|
49
|
+
- `sample_vaf`: Variant allele frequency (calculated as sample_ad / sample_dp)
|
|
50
|
+
|
|
51
|
+
### Example
|
|
52
|
+
|
|
53
|
+
**Input:**
|
|
54
|
+
|
|
55
|
+
```tsv
|
|
56
|
+
CHROM POS REF ALT (null) Sample1:GT:Sample1:DP:Sample1:GQ:Sample1:AD Sample2:GT:Sample2:DP:Sample2:GQ:Sample2:AD
|
|
57
|
+
chr1 100 A T DP=30;AF=0.5;AC=2 0/1:15:99:5,10 1/1:18:99:0,18
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
**Output:**
|
|
61
|
+
|
|
62
|
+
```tsv
|
|
63
|
+
CHROM POS REF ALT AC AF DP sample sample_gt sample_dp sample_gq sample_ad sample_vaf
|
|
64
|
+
chr1 100 A T 2 0.5 30 Sample1 0/1 15 99 10 0.6667
|
|
65
|
+
chr1 100 A T 2 0.5 30 Sample2 1/1 18 99 18 1.0
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Notes:
|
|
69
|
+
|
|
70
|
+
- The `sample_ad` column contains the second value from the AD field (e.g., from `5,10` it extracts `10`)
|
|
71
|
+
- The `sample_vaf` column is the variant allele frequency calculated as `sample_ad / sample_dp`
|
|
72
|
+
- By default, output is in TSV format. Use `-f parquet` to output as Parquet files
|
|
73
|
+
- The `-o` option specifies an output prefix (e.g., `-o output` creates `output.tsv` or `output.parquet`)
|
|
74
|
+
|
|
75
|
+
### Pedigree Support
|
|
76
|
+
|
|
77
|
+
You can provide a pedigree file with the `--pedigree` option to add parent genotype information to the output. This enables trio analysis by including the father's and mother's genotypes for each sample.
|
|
78
|
+
|
|
79
|
+
**Pedigree File Format:**
|
|
80
|
+
|
|
81
|
+
The pedigree file should be a tab-separated file with the following columns:
|
|
82
|
+
|
|
83
|
+
- `FID`: Family ID
|
|
84
|
+
- `sample_id`: Sample identifier (matches the sample names in the VCF)
|
|
85
|
+
- `FatherBarcode`: Father's sample identifier (use `0` or `-9` if unknown)
|
|
86
|
+
- `MotherBarcode`: Mother's sample identifier (use `0` or `-9` if unknown)
|
|
87
|
+
- `Sex`: Sex of the sample (optional)
|
|
88
|
+
- `Pheno`: Phenotype information (optional)
|
|
89
|
+
|
|
90
|
+
Example pedigree file:
|
|
91
|
+
|
|
92
|
+
```tsv
|
|
93
|
+
FID sample_id FatherBarcode MotherBarcode Sex Pheno
|
|
94
|
+
FAM1 Child1 Father1 Mother1 1 2
|
|
95
|
+
FAM1 Father1 0 0 1 1
|
|
96
|
+
FAM1 Mother1 0 0 2 1
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
**Output with Pedigree:**
|
|
100
|
+
|
|
101
|
+
When using `--pedigree`, the output will include additional columns for each parent:
|
|
102
|
+
|
|
103
|
+
- `father_gt`, `father_dp`, `father_gq`, `father_ad`, `father_vaf`: Father's genotype information
|
|
104
|
+
- `mother_gt`, `mother_dp`, `mother_gq`, `mother_ad`, `mother_vaf`: Mother's genotype information
|
|
105
|
+
|
|
106
|
+
These columns will contain the parent's genotype data for the same variant, allowing you to analyze inheritance patterns.
|
|
107
|
+
|
|
108
|
+
## Development
|
|
109
|
+
|
|
110
|
+
This project uses:
|
|
111
|
+
|
|
112
|
+
- **UV** for package management
|
|
113
|
+
- **Polars** for fast data processing
|
|
114
|
+
- **Click** for the command-line interface
|
|
115
|
+
|
|
116
|
+
## Testing
|
|
117
|
+
|
|
118
|
+
Test files are available in the `tests/` directory:
|
|
119
|
+
|
|
120
|
+
- `test.tabulated.tsv` - Real bcftools output
|
|
121
|
+
- `test_small.tsv` - Small example for quick testing
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pywombat"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A CLI tool for processing and filtering bcftools tabulated TSV files with pedigree support"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [{ name = "Freddy Cliquet", email = "fcliquet@pasteur.fr" }]
|
|
7
|
+
requires-python = ">=3.12"
|
|
8
|
+
dependencies = ["polars>=0.19.0", "click>=8.1.0", "pyyaml>=6.0"]
|
|
9
|
+
license = { text = "MIT" }
|
|
10
|
+
keywords = ["vcf", "bioinformatics", "genomics", "pedigree", "variant-calling"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 3 - Alpha",
|
|
13
|
+
"Intended Audience :: Science/Research",
|
|
14
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.12",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
Homepage = "https://github.com/bourgeron-lab/pywombat"
|
|
21
|
+
Repository = "https://github.com/bourgeron-lab/pywombat"
|
|
22
|
+
Issues = "https://github.com/bourgeron-lab/pywombat/issues"
|
|
23
|
+
|
|
24
|
+
[project.scripts]
|
|
25
|
+
wombat = "pywombat.cli:cli"
|
|
26
|
+
|
|
27
|
+
[build-system]
|
|
28
|
+
requires = ["hatchling"]
|
|
29
|
+
build-backend = "hatchling.build"
|