afquery 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- afquery-0.1.0/.github/workflows/ci.yml +28 -0
- afquery-0.1.0/.github/workflows/release.yml +138 -0
- afquery-0.1.0/.gitignore +61 -0
- afquery-0.1.0/PKG-INFO +15 -0
- afquery-0.1.0/README.md +355 -0
- afquery-0.1.0/docs/webhook_schema.md +106 -0
- afquery-0.1.0/pyproject.toml +39 -0
- afquery-0.1.0/recipes/afquery/meta.yaml +47 -0
- afquery-0.1.0/resources/normalize_vcf.sh +115 -0
- afquery-0.1.0/src/afquery/__init__.py +5 -0
- afquery-0.1.0/src/afquery/_version.py +34 -0
- afquery-0.1.0/src/afquery/annotate.py +264 -0
- afquery-0.1.0/src/afquery/benchmark.py +181 -0
- afquery-0.1.0/src/afquery/bitmaps.py +38 -0
- afquery-0.1.0/src/afquery/capture.py +94 -0
- afquery-0.1.0/src/afquery/cli.py +474 -0
- afquery-0.1.0/src/afquery/cli.pyr +369 -0
- afquery-0.1.0/src/afquery/constants.py +43 -0
- afquery-0.1.0/src/afquery/database.py +284 -0
- afquery-0.1.0/src/afquery/dump.py +440 -0
- afquery-0.1.0/src/afquery/models.py +71 -0
- afquery-0.1.0/src/afquery/ploidy.py +80 -0
- afquery-0.1.0/src/afquery/preprocess/__init__.py +283 -0
- afquery-0.1.0/src/afquery/preprocess/build.py +601 -0
- afquery-0.1.0/src/afquery/preprocess/compact.py +158 -0
- afquery-0.1.0/src/afquery/preprocess/ingest.py +179 -0
- afquery-0.1.0/src/afquery/preprocess/manifest.py +109 -0
- afquery-0.1.0/src/afquery/preprocess/regions.py +14 -0
- afquery-0.1.0/src/afquery/preprocess/synth.py +92 -0
- afquery-0.1.0/src/afquery/preprocess/update.py +697 -0
- afquery-0.1.0/src/afquery/query.py +373 -0
- afquery-0.1.0/tests/conftest.py +216 -0
- afquery-0.1.0/tests/data/annotate_input.vcf +8 -0
- afquery-0.1.0/tests/data/annotate_multi_bucket.vcf +6 -0
- afquery-0.1.0/tests/data/annotate_multi_chrom.vcf +7 -0
- afquery-0.1.0/tests/data/beds/wes_kit_a.bed +2 -0
- afquery-0.1.0/tests/data/beds/wes_kit_b.bed +1 -0
- afquery-0.1.0/tests/data/expected_results.json +77 -0
- afquery-0.1.0/tests/data/manifest.tsv +11 -0
- afquery-0.1.0/tests/data/vcfs/S00.vcf +13 -0
- afquery-0.1.0/tests/data/vcfs/S01.vcf +8 -0
- afquery-0.1.0/tests/data/vcfs/S02.vcf +10 -0
- afquery-0.1.0/tests/data/vcfs/S03.vcf +6 -0
- afquery-0.1.0/tests/data/vcfs/S04.vcf +6 -0
- afquery-0.1.0/tests/data/vcfs/S05.vcf +8 -0
- afquery-0.1.0/tests/data/vcfs/S06.vcf +5 -0
- afquery-0.1.0/tests/data/vcfs/S07.vcf +6 -0
- afquery-0.1.0/tests/data/vcfs/S08.vcf +5 -0
- afquery-0.1.0/tests/data/vcfs/S09.vcf +5 -0
- afquery-0.1.0/tests/test_annotate.py +319 -0
- afquery-0.1.0/tests/test_batch.py +196 -0
- afquery-0.1.0/tests/test_benchmark.py +104 -0
- afquery-0.1.0/tests/test_bitmaps.py +155 -0
- afquery-0.1.0/tests/test_capture.py +96 -0
- afquery-0.1.0/tests/test_cli.py +284 -0
- afquery-0.1.0/tests/test_compact.py +182 -0
- afquery-0.1.0/tests/test_constants.py +50 -0
- afquery-0.1.0/tests/test_dump.py +387 -0
- afquery-0.1.0/tests/test_haploid_stats.py +144 -0
- afquery-0.1.0/tests/test_info.py +185 -0
- afquery-0.1.0/tests/test_pass_filter.py +193 -0
- afquery-0.1.0/tests/test_ploidy.py +117 -0
- afquery-0.1.0/tests/test_preprocess.py +595 -0
- afquery-0.1.0/tests/test_query.py +146 -0
- afquery-0.1.0/tests/test_sample_filter.py +296 -0
- afquery-0.1.0/tests/test_synth.py +98 -0
- afquery-0.1.0/tests/test_synthetic_stats.py +277 -0
- afquery-0.1.0/tests/test_update.py +396 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: ["master"]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: ["master"]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
with:
|
|
18
|
+
fetch-depth: 0 # hatch-vcs needs full history to resolve version
|
|
19
|
+
|
|
20
|
+
- uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install
|
|
25
|
+
run: pip install -e ".[dev]"
|
|
26
|
+
|
|
27
|
+
- name: Test
|
|
28
|
+
run: pytest --tb=short -q
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v[0-9]*" # v1.2.4, v1.2.4rc1, etc.
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
|
|
10
|
+
# ── 1. Test ──────────────────────────────────────────────────────────
|
|
11
|
+
test:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
with:
|
|
19
|
+
fetch-depth: 0
|
|
20
|
+
- uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
- run: pip install -e ".[dev]"
|
|
24
|
+
- run: pytest --tb=short -q
|
|
25
|
+
|
|
26
|
+
# ── 2. Build ─────────────────────────────────────────────────────────
|
|
27
|
+
build:
|
|
28
|
+
needs: test
|
|
29
|
+
runs-on: ubuntu-latest
|
|
30
|
+
steps:
|
|
31
|
+
- uses: actions/checkout@v4
|
|
32
|
+
with:
|
|
33
|
+
fetch-depth: 0 # required: hatch-vcs reads git tags for version
|
|
34
|
+
- uses: actions/setup-python@v5
|
|
35
|
+
with:
|
|
36
|
+
python-version: "3.11"
|
|
37
|
+
- run: pip install build
|
|
38
|
+
- run: python -m build
|
|
39
|
+
- uses: actions/upload-artifact@v4
|
|
40
|
+
with:
|
|
41
|
+
name: dist
|
|
42
|
+
path: dist/
|
|
43
|
+
|
|
44
|
+
# ── 3. Publish to PyPI (OIDC — always, RC and final) ─────────────────
|
|
45
|
+
publish-pypi:
|
|
46
|
+
needs: build
|
|
47
|
+
runs-on: ubuntu-latest
|
|
48
|
+
environment:
|
|
49
|
+
name: pypi
|
|
50
|
+
url: https://pypi.org/p/afquery
|
|
51
|
+
permissions:
|
|
52
|
+
id-token: write # required for OIDC Trusted Publishing
|
|
53
|
+
steps:
|
|
54
|
+
- uses: actions/download-artifact@v4
|
|
55
|
+
with:
|
|
56
|
+
name: dist
|
|
57
|
+
path: dist/
|
|
58
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
59
|
+
|
|
60
|
+
# ── 4. GitHub Release ────────────────────────────────────────────────
|
|
61
|
+
github-release:
|
|
62
|
+
needs: build
|
|
63
|
+
runs-on: ubuntu-latest
|
|
64
|
+
permissions:
|
|
65
|
+
contents: write
|
|
66
|
+
steps:
|
|
67
|
+
- uses: actions/download-artifact@v4
|
|
68
|
+
with:
|
|
69
|
+
name: dist
|
|
70
|
+
path: dist/
|
|
71
|
+
- uses: softprops/action-gh-release@v2
|
|
72
|
+
with:
|
|
73
|
+
files: dist/*
|
|
74
|
+
generate_release_notes: true
|
|
75
|
+
|
|
76
|
+
# ── 5. Bioconda PR (final releases only — no "rc" in tag) ────────────
|
|
77
|
+
bioconda-pr:
|
|
78
|
+
needs: publish-pypi
|
|
79
|
+
runs-on: ubuntu-latest
|
|
80
|
+
if: ${{ !contains(github.ref_name, 'rc') }} # skip RC tags
|
|
81
|
+
steps:
|
|
82
|
+
- name: Get version from tag
|
|
83
|
+
run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV
|
|
84
|
+
|
|
85
|
+
- name: Fetch SHA256 from PyPI
|
|
86
|
+
run: |
|
|
87
|
+
SHA256=$(curl -s https://pypi.org/pypi/afquery/${VERSION}/json \
|
|
88
|
+
| jq -r '.urls[] | select(.packagetype=="sdist") | .digests.sha256')
|
|
89
|
+
echo "SHA256=${SHA256}" >> $GITHUB_ENV
|
|
90
|
+
|
|
91
|
+
- name: Get fork owner
|
|
92
|
+
env:
|
|
93
|
+
GH_TOKEN: ${{ secrets.GH_PAT }}
|
|
94
|
+
run: echo "FORK_USER=$(gh api user --jq .login)" >> $GITHUB_ENV
|
|
95
|
+
|
|
96
|
+
- name: Sync fork with upstream
|
|
97
|
+
env:
|
|
98
|
+
GH_TOKEN: ${{ secrets.GH_PAT }}
|
|
99
|
+
run: gh repo sync ${FORK_USER}/bioconda-recipes --source bioconda/bioconda-recipes --branch master
|
|
100
|
+
|
|
101
|
+
- name: Checkout afquery repo
|
|
102
|
+
uses: actions/checkout@v4
|
|
103
|
+
with:
|
|
104
|
+
path: afquery-repo
|
|
105
|
+
|
|
106
|
+
- name: Checkout fork of bioconda-recipes
|
|
107
|
+
uses: actions/checkout@v4
|
|
108
|
+
with:
|
|
109
|
+
repository: ${{ env.FORK_USER }}/bioconda-recipes
|
|
110
|
+
token: ${{ secrets.GH_PAT }}
|
|
111
|
+
path: bioconda-recipes
|
|
112
|
+
|
|
113
|
+
- name: Update recipe
|
|
114
|
+
working-directory: bioconda-recipes
|
|
115
|
+
run: |
|
|
116
|
+
mkdir -p recipes/afquery
|
|
117
|
+
cp ../afquery-repo/recipes/afquery/meta.yaml recipes/afquery/meta.yaml
|
|
118
|
+
sed -i "s/^{% set version = .* %}/{% set version = \"${VERSION}\" %}/" recipes/afquery/meta.yaml
|
|
119
|
+
sed -i "s/sha256: .*/sha256: ${SHA256}/" recipes/afquery/meta.yaml
|
|
120
|
+
sed -i "s/^ number: .*/ number: 0/" recipes/afquery/meta.yaml
|
|
121
|
+
|
|
122
|
+
- name: Create pull request
|
|
123
|
+
working-directory: bioconda-recipes
|
|
124
|
+
env:
|
|
125
|
+
GH_TOKEN: ${{ secrets.GH_PAT }}
|
|
126
|
+
run: |
|
|
127
|
+
git config user.name "github-actions[bot]"
|
|
128
|
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
|
129
|
+
git checkout -b afquery-${VERSION}
|
|
130
|
+
git add recipes/afquery/meta.yaml
|
|
131
|
+
git commit -m "Update afquery to ${VERSION}"
|
|
132
|
+
git push origin afquery-${VERSION}
|
|
133
|
+
gh pr create \
|
|
134
|
+
--repo bioconda/bioconda-recipes \
|
|
135
|
+
--title "Update afquery to ${VERSION}" \
|
|
136
|
+
--body "Automated version bump from https://github.com/dlopez-bioinfo/afquery/releases/tag/v${VERSION}" \
|
|
137
|
+
--head "${FORK_USER}:afquery-${VERSION}" \
|
|
138
|
+
--base master
|
afquery-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
MANIFEST
|
|
23
|
+
pip-log.txt
|
|
24
|
+
pip-delete-this-directory.txt
|
|
25
|
+
|
|
26
|
+
# Virtual environments
|
|
27
|
+
venv/
|
|
28
|
+
ENV/
|
|
29
|
+
env/
|
|
30
|
+
.venv
|
|
31
|
+
|
|
32
|
+
# Testing
|
|
33
|
+
.pytest_cache/
|
|
34
|
+
.coverage
|
|
35
|
+
htmlcov/
|
|
36
|
+
.tox/
|
|
37
|
+
.hypothesis/
|
|
38
|
+
|
|
39
|
+
# IDE
|
|
40
|
+
.vscode/
|
|
41
|
+
.idea/
|
|
42
|
+
*.swp
|
|
43
|
+
*.swo
|
|
44
|
+
*~
|
|
45
|
+
.DS_Store
|
|
46
|
+
Thumbs.db
|
|
47
|
+
|
|
48
|
+
# Temporary files
|
|
49
|
+
*.tmp
|
|
50
|
+
*.log
|
|
51
|
+
*.bak
|
|
52
|
+
*~
|
|
53
|
+
|
|
54
|
+
# Auto-generated by hatch-vcs at build time
|
|
55
|
+
src/afquery/_version.py
|
|
56
|
+
|
|
57
|
+
# Database files (generated)
|
|
58
|
+
*.duckdb
|
|
59
|
+
*.duckdb.wal
|
|
60
|
+
*.parquet
|
|
61
|
+
db/*
|
afquery-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: afquery
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Genomic allele frequency query engine with bitmap-encoded genotypes
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: click>=8.1
|
|
7
|
+
Requires-Dist: cyvcf2>=0.30
|
|
8
|
+
Requires-Dist: duckdb>=0.10
|
|
9
|
+
Requires-Dist: pyarrow>=14.0
|
|
10
|
+
Requires-Dist: pyranges<0.2,>=0.1.2
|
|
11
|
+
Requires-Dist: pyroaring>=0.4.8
|
|
12
|
+
Requires-Dist: tqdm>=4.60
|
|
13
|
+
Provides-Extra: dev
|
|
14
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
15
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
afquery-0.1.0/README.md
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
# afquery
|
|
2
|
+
|
|
3
|
+
Genomic allele frequency query engine with bitmap-encoded genotypes. Fast, file-based queries over 10K-50K samples at sub-100ms latency.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Fast point queries**: <100ms cold start, ~10ms warm queries on single positions
|
|
8
|
+
- **Batch queries**: Multi-position queries via SQL IN clauses or temporary tables
|
|
9
|
+
- **Region queries**: Genomic range queries with automatic partitioning
|
|
10
|
+
- **VCF annotation**: Annotate VCF files with computed allele frequencies and sample genotypes
|
|
11
|
+
- **Ploidy-aware**: Correct AN/AC computation for autosomes, chrX, chrY, and chrM
|
|
12
|
+
- **Bitmap compression**: Roaring Bitmaps for efficient genotype storage
|
|
13
|
+
- **In-process**: No server process—queries run locally with DuckDB
|
|
14
|
+
- **Incremental updates**: Add new samples to existing databases
|
|
15
|
+
- **Multiple genome builds**: Support for GRCh37 and GRCh38
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install afquery
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Requires Python 3.10+.
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
### 1. Create a Database
|
|
28
|
+
|
|
29
|
+
First, prepare a manifest TSV with sample metadata:
|
|
30
|
+
|
|
31
|
+
```tsv
|
|
32
|
+
sample_name sex tech_name vcf_path phenotype_codes
|
|
33
|
+
sample_1 male wgs vcfs/sample_1.vcf E11.9,I10
|
|
34
|
+
sample_2 female wes_kit_a vcfs/sample_2.vcf E11.9
|
|
35
|
+
sample_3 male wgs vcfs/sample_3.vcf I10
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
**Key points about the manifest:**
|
|
39
|
+
- `tech_name`: Either `wgs` (case-insensitive) for whole genome, or a custom technology name
|
|
40
|
+
- `vcf_path`: Path to the single-sample VCF file (relative to manifest directory, or absolute)
|
|
41
|
+
- For WES/exome technologies, capture regions are loaded from `--bed-dir/{tech_name}.bed`
|
|
42
|
+
- Example: `tech_name=wes_kit_a` → loads `beds/wes_kit_a.bed`
|
|
43
|
+
|
|
44
|
+
Organize your files:
|
|
45
|
+
```
|
|
46
|
+
project/
|
|
47
|
+
├── manifest.tsv
|
|
48
|
+
├── vcfs/
|
|
49
|
+
│ ├── sample_1.vcf
|
|
50
|
+
│ ├── sample_2.vcf
|
|
51
|
+
│ └── sample_3.vcf
|
|
52
|
+
└── beds/ # Required for non-WGS technologies
|
|
53
|
+
└── wes_kit_a.bed # BED file for WES kit A
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Then preprocess your VCF files:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
afquery preprocess \
|
|
60
|
+
--manifest manifest.tsv \
|
|
61
|
+
--bed-dir ./beds/ \
|
|
62
|
+
--output-dir ./my_db/ \
|
|
63
|
+
--genome-build GRCh38
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
This creates:
|
|
67
|
+
- `my_db/manifest.json` — database metadata
|
|
68
|
+
- `my_db/metadata.sqlite` — samples, technologies, phenotype codes, precomputed bitmaps
|
|
69
|
+
- `my_db/variants/{chrom}.parquet` — variant data with encoded genotypes
|
|
70
|
+
- `my_db/capture/` — capture regions for each technology
|
|
71
|
+
|
|
72
|
+
### 2. Query Allele Frequencies
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# Point query
|
|
76
|
+
afquery query --db my_db --chrom chr1 --pos 1000 --alt G
|
|
77
|
+
|
|
78
|
+
# Batch query (100 positions)
|
|
79
|
+
afquery query-batch --db my_db --positions positions.tsv --phenotype E11.9
|
|
80
|
+
|
|
81
|
+
# Region query
|
|
82
|
+
afquery query --db my_db --chrom chr1 --start 1000 --end 10000 --phenotype E11.9 --sex M
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### 3. Annotate VCF Files
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
afquery annotate \
|
|
89
|
+
--db my_db \
|
|
90
|
+
--vcf input.vcf \
|
|
91
|
+
--output annotated.vcf \
|
|
92
|
+
--phenotype E11.9 \
|
|
93
|
+
--tech WGS
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Adds `AFQUERY_AC`, `AFQUERY_AN`, `AFQUERY_AF` and genotype fields to your VCF.
|
|
97
|
+
|
|
98
|
+
## Python API
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from afquery import Database
|
|
102
|
+
|
|
103
|
+
db = Database("/path/to/db")
|
|
104
|
+
|
|
105
|
+
# Single position query
|
|
106
|
+
# Automatically filters samples by: sex + phenotype codes + capture coverage
|
|
107
|
+
result = db.query(
|
|
108
|
+
chrom="chr1",
|
|
109
|
+
pos=1000,
|
|
110
|
+
alt="G",
|
|
111
|
+
phenotype_codes=["E11.9"],
|
|
112
|
+
sex="both"
|
|
113
|
+
)
|
|
114
|
+
print(f"AC={result.ac}, AN={result.an}, AF={result.af}")
|
|
115
|
+
|
|
116
|
+
# Batch query (multi-variant)
|
|
117
|
+
results = db.query_batch(
|
|
118
|
+
"chr1",
|
|
119
|
+
variants=[(1500, "A", "T"), (3500, "G", "C")],
|
|
120
|
+
phenotype=["E11.9"],
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
# Region query (genomic range)
|
|
124
|
+
results = db.query_region(
|
|
125
|
+
chrom="chr1",
|
|
126
|
+
start=1000,
|
|
127
|
+
end=10000,
|
|
128
|
+
phenotype_codes=["E11.9", "I10"]
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
# Annotate VCF with allele frequencies
|
|
132
|
+
# Note: tech_name filters annotation to samples of that technology
|
|
133
|
+
db.annotate_vcf(
|
|
134
|
+
vcf_path="input.vcf",
|
|
135
|
+
output_path="annotated.vcf",
|
|
136
|
+
phenotype_codes=["E11.9"],
|
|
137
|
+
tech_name="wgs" # Only annotate using WGS samples
|
|
138
|
+
)
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
**How samples are filtered in queries**:
|
|
142
|
+
- Sex filter: `male`, `female`, or `both`
|
|
143
|
+
- Phenotype filter: all codes must match
|
|
144
|
+
- Capture filter: Automatic—only samples whose tech's BED covers the position
|
|
145
|
+
|
|
146
|
+
## Database Structure
|
|
147
|
+
|
|
148
|
+
```
|
|
149
|
+
my_db/
|
|
150
|
+
├── manifest.json # Metadata: genome_build, sample_count, schema_version
|
|
151
|
+
├── metadata.sqlite # SQLite: samples, technologies, phenotype codes, bitmaps
|
|
152
|
+
├── variants/
|
|
153
|
+
│ ├── chr1.parquet
|
|
154
|
+
│ ├── chr2.parquet
|
|
155
|
+
│ └── ...
|
|
156
|
+
└── capture/
|
|
157
|
+
├── tech_0.pickle # WGS capture region (always covered)
|
|
158
|
+
└── tech_1.pickle # WES kit capture region
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Each variant row contains:
|
|
162
|
+
- `pos` — 1-based genomic position
|
|
163
|
+
- `ref` — reference allele
|
|
164
|
+
- `alt` — alternate allele
|
|
165
|
+
- `het_bitmap` — Roaring Bitmap of heterozygous samples
|
|
166
|
+
- `hom_bitmap` — Roaring Bitmap of homozygous samples
|
|
167
|
+
|
|
168
|
+
## How Capture BED Files Are Associated with Samples
|
|
169
|
+
|
|
170
|
+
Samples are linked to capture regions through their **technology**:
|
|
171
|
+
|
|
172
|
+
1. **Manifest specifies technology**: Each sample lists `tech_name` (e.g., `wgs`, `wes_kit_a`)
|
|
173
|
+
2. **Technology maps to BED file**:
|
|
174
|
+
- **WGS**: No BED file needed (always fully covered)
|
|
175
|
+
- **WES/Custom**: BED file loaded from `{bed_dir}/{tech_name}.bed`
|
|
176
|
+
3. **Storage in database**:
|
|
177
|
+
- `metadata.sqlite::technologies` stores tech_id, tech_name, and bed_path
|
|
178
|
+
- `metadata.sqlite::samples` stores sample_id, sample_name, and tech_id (foreign key)
|
|
179
|
+
4. **Query-time filtering**: When querying, samples are filtered by:
|
|
180
|
+
- Sex (male/female/both)
|
|
181
|
+
- Phenotype diagnosis codes
|
|
182
|
+
- Capture region coverage (via tech's BED file)
|
|
183
|
+
|
|
184
|
+
**Example**: If you have samples on two exome kits:
|
|
185
|
+
```tsv
|
|
186
|
+
sample_name sex tech_name vcf_path phenotype_codes
|
|
187
|
+
S001 male exome_v1 vcfs/S001.vcf E11.9
|
|
188
|
+
S002 female exome_v1 vcfs/S002.vcf E11.9
|
|
189
|
+
S003 male exome_v2 vcfs/S003.vcf I10
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
Then provide:
|
|
193
|
+
```
|
|
194
|
+
beds/
|
|
195
|
+
├── exome_v1.bed # Coverage for samples S001, S002
|
|
196
|
+
└── exome_v2.bed # Coverage for sample S003
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
At query time, each sample's eligible regions are determined by its tech's BED file.
|
|
200
|
+
|
|
201
|
+
## Advanced Features
|
|
202
|
+
|
|
203
|
+
### Incremental Updates (add_samples)
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
afquery add-samples \
|
|
207
|
+
--db my_db \
|
|
208
|
+
--manifest new_samples.tsv \
|
|
209
|
+
--vcf-dir ./new_vcfs/
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
Adds new samples without rebuilding the entire database.
|
|
213
|
+
|
|
214
|
+
### Compact Database
|
|
215
|
+
|
|
216
|
+
Remove samples and reclaim disk space:
|
|
217
|
+
|
|
218
|
+
```bash
|
|
219
|
+
afquery compact --db my_db --samples-to-remove sample_1,sample_2
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
### Run Benchmarks
|
|
223
|
+
|
|
224
|
+
```bash
|
|
225
|
+
afquery benchmark --db my_db --n-queries 1000 --query-type point
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### Generate Synthetic Data
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
afquery synth --output synthetic_db/ --n-samples 5000 --n-variants 100000
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## Ploidy Rules
|
|
235
|
+
|
|
236
|
+
AF computation respects chromosome-specific ploidy:
|
|
237
|
+
|
|
238
|
+
| Chromosome | Formula |
|
|
239
|
+
|---|---|
|
|
240
|
+
| Autosomes | `AN = 2 × eligible_samples` |
|
|
241
|
+
| chrM | `AN = 1 × eligible_samples` |
|
|
242
|
+
| chrY | `AN = 1 × eligible_males` |
|
|
243
|
+
| chrX (PAR) | `AN = 2 × eligible_samples` |
|
|
244
|
+
| chrX (non-PAR) | `AN = 2 × eligible_females + 1 × eligible_males` |
|
|
245
|
+
|
|
246
|
+
Where `eligible` = samples matching sex, phenotype, and technology capture filters.
|
|
247
|
+
|
|
248
|
+
## Performance Targets
|
|
249
|
+
|
|
250
|
+
- **Point query (cold)**: <100 ms
|
|
251
|
+
- **Point query (warm)**: ~10 ms
|
|
252
|
+
- **Batch 100 positions**: ~200 ms
|
|
253
|
+
- **VCF annotation (5K variants)**: ~30 s
|
|
254
|
+
- **VCF annotation (5M variants)**: ~30 min
|
|
255
|
+
|
|
256
|
+
## Command Reference
|
|
257
|
+
|
|
258
|
+
```
|
|
259
|
+
afquery query Query single position
|
|
260
|
+
afquery query-batch Batch query multiple positions
|
|
261
|
+
afquery annotate Annotate VCF file
|
|
262
|
+
afquery info Show database info
|
|
263
|
+
afquery preprocess Build database from VCFs
|
|
264
|
+
afquery add-samples Add new samples to database
|
|
265
|
+
afquery compact Remove samples and reclaim space
|
|
266
|
+
afquery synth Generate synthetic test database
|
|
267
|
+
afquery benchmark Run performance benchmarks
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
Run `afquery --help` for full options.
|
|
271
|
+
|
|
272
|
+
## Development
|
|
273
|
+
|
|
274
|
+
### Running Tests
|
|
275
|
+
|
|
276
|
+
```bash
|
|
277
|
+
# All 190 tests
|
|
278
|
+
python3 -m pytest --tb=short -q
|
|
279
|
+
|
|
280
|
+
# Specific test module
|
|
281
|
+
python3 -m pytest tests/test_query.py -v
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
### Key Modules
|
|
285
|
+
|
|
286
|
+
- `afquery.query` — QueryEngine, point/batch/region queries
|
|
287
|
+
- `afquery.annotate` — VCF annotation pipeline
|
|
288
|
+
- `afquery.database` — Database wrapper (public API)
|
|
289
|
+
- `afquery.preprocess` — Manifest parsing, VCF ingestion, Parquet building
|
|
290
|
+
- `afquery.bitmaps` — Roaring Bitmap encoding/decoding
|
|
291
|
+
- `afquery.ploidy` — Chromosome-specific ploidy rules
|
|
292
|
+
- `afquery.models` — Data classes (QueryResult, ParsedSample, etc.)
|
|
293
|
+
|
|
294
|
+
### Architecture
|
|
295
|
+
|
|
296
|
+
See `brain/architecture.md` for detailed system design, data flow, and query algorithm.
|
|
297
|
+
|
|
298
|
+
## Genome Builds
|
|
299
|
+
|
|
300
|
+
- **GRCh37** (hg19) — PAR regions: chrX:1-2649520
|
|
301
|
+
- **GRCh38** (hg38) — PAR regions: chrX:1-3099677
|
|
302
|
+
|
|
303
|
+
## Technologies Supported
|
|
304
|
+
|
|
305
|
+
- **WGS** — Whole genome sequencing (always fully covered, no BED file needed)
|
|
306
|
+
- Manifest: `tech_name = wgs` (case-insensitive)
|
|
307
|
+
- Query-time: All positions in genome considered covered
|
|
308
|
+
|
|
309
|
+
- **WES** — Whole exome sequencing (coverage defined by BED file)
|
|
310
|
+
- Manifest: `tech_name = wes_kit_a` (or any custom name)
|
|
311
|
+
- Preprocessing: Loads `{bed_dir}/wes_kit_a.bed` (0-based half-open BED format)
|
|
312
|
+
- Query-time: Only positions within BED intervals considered covered
|
|
313
|
+
|
|
314
|
+
- **Custom** — Any technology with a BED file (e.g., gene panels, targeted sequencing)
|
|
315
|
+
- Manifest: Use any `tech_name`
|
|
316
|
+
- Preprocessing: Loads `{bed_dir}/{tech_name}.bed`
|
|
317
|
+
- Query-time: Respects BED file coverage
|
|
318
|
+
|
|
319
|
+
## Troubleshooting
|
|
320
|
+
|
|
321
|
+
### ImportError: `cyvcf2`
|
|
322
|
+
|
|
323
|
+
`cyvcf2` import happens inside worker processes during preprocessing. Do not import at module level.
|
|
324
|
+
|
|
325
|
+
### DuckDB Temp Files
|
|
326
|
+
|
|
327
|
+
Uses Parquet format (not Arrow IPC) for compatibility. Set `DUCKDB_TEMP_DIRECTORY` if needed.
|
|
328
|
+
|
|
329
|
+
### Sample IDs
|
|
330
|
+
|
|
331
|
+
Sample IDs are 0-indexed and monotonically increasing. Never reuse removed IDs—use `compact` to reclaim space.
|
|
332
|
+
|
|
333
|
+
## Contributing
|
|
334
|
+
|
|
335
|
+
1. Read `brain/project_state.json` for current phase and test count
|
|
336
|
+
2. Read `brain/architecture.md` for system design
|
|
337
|
+
3. Follow code conventions in `CLAUDE.md`
|
|
338
|
+
4. Update `brain/` docs after architectural changes
|
|
339
|
+
5. Run tests before submitting
|
|
340
|
+
|
|
341
|
+
## License
|
|
342
|
+
|
|
343
|
+
(Add your license here)
|
|
344
|
+
|
|
345
|
+
## Citation
|
|
346
|
+
|
|
347
|
+
If you use afquery in research, please cite:
|
|
348
|
+
|
|
349
|
+
```
|
|
350
|
+
(Citation format to be determined)
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
---
|
|
354
|
+
|
|
355
|
+
**Status**: Phase 5 complete (190 tests passing). Active development.
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# Webhook / API JSON Schema
|
|
2
|
+
|
|
3
|
+
## Batch Query
|
|
4
|
+
|
|
5
|
+
### Request
|
|
6
|
+
|
|
7
|
+
```json
|
|
8
|
+
{
|
|
9
|
+
"chrom": "chr1",
|
|
10
|
+
"positions": [1000, 2000, 3000],
|
|
11
|
+
"phenotype": ["E11.9", "I10"],
|
|
12
|
+
"sex": "both"
|
|
13
|
+
}
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
| Field | Type | Required | Notes |
|
|
17
|
+
|-------------|-----------------|----------|--------------------------------------------|
|
|
18
|
+
| `chrom` | string | yes | Any form accepted: `"1"`, `"chr1"`, `"X"` |
|
|
19
|
+
| `positions` | array of int | yes | 1-based positions; duplicates are ignored |
|
|
20
|
+
| `phenotype` | array of string | yes | phenotype codes; union semantics |
|
|
21
|
+
| `sex` | string | no | `"both"` (default), `"male"`, `"female"` |
|
|
22
|
+
|
|
23
|
+
### Response
|
|
24
|
+
|
|
25
|
+
Array of variant objects, one per variant found (absent variants are omitted). Sorted by `(pos, alt)`.
|
|
26
|
+
|
|
27
|
+
```json
|
|
28
|
+
[
|
|
29
|
+
{
|
|
30
|
+
"chrom": "chr1",
|
|
31
|
+
"pos": 1000,
|
|
32
|
+
"ref": "A",
|
|
33
|
+
"alt": "T",
|
|
34
|
+
"AC": 5,
|
|
35
|
+
"AN": 100,
|
|
36
|
+
"AF": 0.05,
|
|
37
|
+
"n_eligible": 50
|
|
38
|
+
}
|
|
39
|
+
]
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
| Field | Type | Notes |
|
|
43
|
+
|--------------|--------|----------------------------------------------|
|
|
44
|
+
| `chrom` | string | Canonical form (`chr`-prefixed) |
|
|
45
|
+
| `pos` | int | 1-based |
|
|
46
|
+
| `ref` | string | Reference allele |
|
|
47
|
+
| `alt` | string | Alternate allele (one object per alt) |
|
|
48
|
+
| `AC` | int | Allele count in eligible samples |
|
|
49
|
+
| `AN` | int | Allele number (ploidy-aware) |
|
|
50
|
+
| `AF` | float | `AC / AN` |
|
|
51
|
+
| `n_eligible` | int | Number of eligible samples at this position |
|
|
52
|
+
|
|
53
|
+
Positions where `AN == 0` (no eligible samples covered) are silently omitted.
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Single-Position Query
|
|
58
|
+
|
|
59
|
+
Same as batch query with a single-element `positions` array.
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## VCF Annotation
|
|
64
|
+
|
|
65
|
+
### Request
|
|
66
|
+
|
|
67
|
+
`POST /annotate` — multipart form data.
|
|
68
|
+
|
|
69
|
+
| Part | Content-Type | Description |
|
|
70
|
+
|----------|----------------------------|-----------------------------------|
|
|
71
|
+
| `vcf` | `application/octet-stream` | Input VCF file (plain or gzipped) |
|
|
72
|
+
| `params` | `application/json` | JSON object (see below) |
|
|
73
|
+
|
|
74
|
+
`params` object:
|
|
75
|
+
|
|
76
|
+
```json
|
|
77
|
+
{
|
|
78
|
+
"phenotype": ["E11.9"],
|
|
79
|
+
"sex": "both"
|
|
80
|
+
}
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Response
|
|
84
|
+
|
|
85
|
+
Annotated VCF file stream (`Content-Type: text/plain`).
|
|
86
|
+
|
|
87
|
+
Added INFO fields:
|
|
88
|
+
|
|
89
|
+
| Field | Number | Type | Description |
|
|
90
|
+
|------------|--------|---------|--------------------------------------|
|
|
91
|
+
| `AFQUERY_AC` | A | Integer | Allele count per ALT in eligible set |
|
|
92
|
+
| `AFQUERY_AN` | 1 | Integer | Allele number (0 if uncovered) |
|
|
93
|
+
| `AFQUERY_AF` | A | Float | Allele frequency per ALT |
|
|
94
|
+
|
|
95
|
+
**Notes**:
|
|
96
|
+
- `AFQUERY_AN=0` means the position was not covered for the given filters; `AFQUERY_AC` and `AFQUERY_AF` are absent (`.`) in that case.
|
|
97
|
+
- `AFQUERY_AC=0`, `AFQUERY_AF=0.0` with `AFQUERY_AN>0` means the position was covered but the specific allele was not observed in the database.
|
|
98
|
+
- For multi-allelic sites, `AFQUERY_AC` and `AFQUERY_AF` are comma-separated lists aligned to the `ALT` column.
|
|
99
|
+
|
|
100
|
+
### Completion summary (trailing header)
|
|
101
|
+
|
|
102
|
+
After the VCF body, a summary is written to stderr (or returned in a trailing `X-AFQUERY-Stats` response header):
|
|
103
|
+
|
|
104
|
+
```json
|
|
105
|
+
{"n_variants": 1000, "n_annotated": 850, "n_uncovered": 12}
|
|
106
|
+
```
|