afquery 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. afquery-0.1.0/.github/workflows/ci.yml +28 -0
  2. afquery-0.1.0/.github/workflows/release.yml +138 -0
  3. afquery-0.1.0/.gitignore +61 -0
  4. afquery-0.1.0/PKG-INFO +15 -0
  5. afquery-0.1.0/README.md +355 -0
  6. afquery-0.1.0/docs/webhook_schema.md +106 -0
  7. afquery-0.1.0/pyproject.toml +39 -0
  8. afquery-0.1.0/recipes/afquery/meta.yaml +47 -0
  9. afquery-0.1.0/resources/normalize_vcf.sh +115 -0
  10. afquery-0.1.0/src/afquery/__init__.py +5 -0
  11. afquery-0.1.0/src/afquery/_version.py +34 -0
  12. afquery-0.1.0/src/afquery/annotate.py +264 -0
  13. afquery-0.1.0/src/afquery/benchmark.py +181 -0
  14. afquery-0.1.0/src/afquery/bitmaps.py +38 -0
  15. afquery-0.1.0/src/afquery/capture.py +94 -0
  16. afquery-0.1.0/src/afquery/cli.py +474 -0
  17. afquery-0.1.0/src/afquery/cli.pyr +369 -0
  18. afquery-0.1.0/src/afquery/constants.py +43 -0
  19. afquery-0.1.0/src/afquery/database.py +284 -0
  20. afquery-0.1.0/src/afquery/dump.py +440 -0
  21. afquery-0.1.0/src/afquery/models.py +71 -0
  22. afquery-0.1.0/src/afquery/ploidy.py +80 -0
  23. afquery-0.1.0/src/afquery/preprocess/__init__.py +283 -0
  24. afquery-0.1.0/src/afquery/preprocess/build.py +601 -0
  25. afquery-0.1.0/src/afquery/preprocess/compact.py +158 -0
  26. afquery-0.1.0/src/afquery/preprocess/ingest.py +179 -0
  27. afquery-0.1.0/src/afquery/preprocess/manifest.py +109 -0
  28. afquery-0.1.0/src/afquery/preprocess/regions.py +14 -0
  29. afquery-0.1.0/src/afquery/preprocess/synth.py +92 -0
  30. afquery-0.1.0/src/afquery/preprocess/update.py +697 -0
  31. afquery-0.1.0/src/afquery/query.py +373 -0
  32. afquery-0.1.0/tests/conftest.py +216 -0
  33. afquery-0.1.0/tests/data/annotate_input.vcf +8 -0
  34. afquery-0.1.0/tests/data/annotate_multi_bucket.vcf +6 -0
  35. afquery-0.1.0/tests/data/annotate_multi_chrom.vcf +7 -0
  36. afquery-0.1.0/tests/data/beds/wes_kit_a.bed +2 -0
  37. afquery-0.1.0/tests/data/beds/wes_kit_b.bed +1 -0
  38. afquery-0.1.0/tests/data/expected_results.json +77 -0
  39. afquery-0.1.0/tests/data/manifest.tsv +11 -0
  40. afquery-0.1.0/tests/data/vcfs/S00.vcf +13 -0
  41. afquery-0.1.0/tests/data/vcfs/S01.vcf +8 -0
  42. afquery-0.1.0/tests/data/vcfs/S02.vcf +10 -0
  43. afquery-0.1.0/tests/data/vcfs/S03.vcf +6 -0
  44. afquery-0.1.0/tests/data/vcfs/S04.vcf +6 -0
  45. afquery-0.1.0/tests/data/vcfs/S05.vcf +8 -0
  46. afquery-0.1.0/tests/data/vcfs/S06.vcf +5 -0
  47. afquery-0.1.0/tests/data/vcfs/S07.vcf +6 -0
  48. afquery-0.1.0/tests/data/vcfs/S08.vcf +5 -0
  49. afquery-0.1.0/tests/data/vcfs/S09.vcf +5 -0
  50. afquery-0.1.0/tests/test_annotate.py +319 -0
  51. afquery-0.1.0/tests/test_batch.py +196 -0
  52. afquery-0.1.0/tests/test_benchmark.py +104 -0
  53. afquery-0.1.0/tests/test_bitmaps.py +155 -0
  54. afquery-0.1.0/tests/test_capture.py +96 -0
  55. afquery-0.1.0/tests/test_cli.py +284 -0
  56. afquery-0.1.0/tests/test_compact.py +182 -0
  57. afquery-0.1.0/tests/test_constants.py +50 -0
  58. afquery-0.1.0/tests/test_dump.py +387 -0
  59. afquery-0.1.0/tests/test_haploid_stats.py +144 -0
  60. afquery-0.1.0/tests/test_info.py +185 -0
  61. afquery-0.1.0/tests/test_pass_filter.py +193 -0
  62. afquery-0.1.0/tests/test_ploidy.py +117 -0
  63. afquery-0.1.0/tests/test_preprocess.py +595 -0
  64. afquery-0.1.0/tests/test_query.py +146 -0
  65. afquery-0.1.0/tests/test_sample_filter.py +296 -0
  66. afquery-0.1.0/tests/test_synth.py +98 -0
  67. afquery-0.1.0/tests/test_synthetic_stats.py +277 -0
  68. afquery-0.1.0/tests/test_update.py +396 -0
@@ -0,0 +1,28 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: ["master"]
6
+ pull_request:
7
+ branches: ["master"]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ with:
18
+ fetch-depth: 0 # hatch-vcs needs full history to resolve version
19
+
20
+ - uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install
25
+ run: pip install -e ".[dev]"
26
+
27
+ - name: Test
28
+ run: pytest --tb=short -q
@@ -0,0 +1,138 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v[0-9]*" # v1.2.4, v1.2.4rc1, etc.
7
+
8
+ jobs:
9
+
10
+ # ── 1. Test ──────────────────────────────────────────────────────────
11
+ test:
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ matrix:
15
+ python-version: ["3.10", "3.11", "3.12"]
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ with:
19
+ fetch-depth: 0
20
+ - uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+ - run: pip install -e ".[dev]"
24
+ - run: pytest --tb=short -q
25
+
26
+ # ── 2. Build ─────────────────────────────────────────────────────────
27
+ build:
28
+ needs: test
29
+ runs-on: ubuntu-latest
30
+ steps:
31
+ - uses: actions/checkout@v4
32
+ with:
33
+ fetch-depth: 0 # required: hatch-vcs reads git tags for version
34
+ - uses: actions/setup-python@v5
35
+ with:
36
+ python-version: "3.11"
37
+ - run: pip install build
38
+ - run: python -m build
39
+ - uses: actions/upload-artifact@v4
40
+ with:
41
+ name: dist
42
+ path: dist/
43
+
44
+ # ── 3. Publish to PyPI (OIDC — always, RC and final) ─────────────────
45
+ publish-pypi:
46
+ needs: build
47
+ runs-on: ubuntu-latest
48
+ environment:
49
+ name: pypi
50
+ url: https://pypi.org/p/afquery
51
+ permissions:
52
+ id-token: write # required for OIDC Trusted Publishing
53
+ steps:
54
+ - uses: actions/download-artifact@v4
55
+ with:
56
+ name: dist
57
+ path: dist/
58
+ - uses: pypa/gh-action-pypi-publish@release/v1
59
+
60
+ # ── 4. GitHub Release ────────────────────────────────────────────────
61
+ github-release:
62
+ needs: build
63
+ runs-on: ubuntu-latest
64
+ permissions:
65
+ contents: write
66
+ steps:
67
+ - uses: actions/download-artifact@v4
68
+ with:
69
+ name: dist
70
+ path: dist/
71
+ - uses: softprops/action-gh-release@v2
72
+ with:
73
+ files: dist/*
74
+ generate_release_notes: true
75
+
76
+ # ── 5. Bioconda PR (final releases only — no "rc" in tag) ────────────
77
+ bioconda-pr:
78
+ needs: publish-pypi
79
+ runs-on: ubuntu-latest
80
+ if: ${{ !contains(github.ref_name, 'rc') }} # skip RC tags
81
+ steps:
82
+ - name: Get version from tag
83
+ run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV
84
+
85
+ - name: Fetch SHA256 from PyPI
86
+ run: |
87
+ SHA256=$(curl -s https://pypi.org/pypi/afquery/${VERSION}/json \
88
+ | jq -r '.urls[] | select(.packagetype=="sdist") | .digests.sha256')
89
+ echo "SHA256=${SHA256}" >> $GITHUB_ENV
90
+
91
+ - name: Get fork owner
92
+ env:
93
+ GH_TOKEN: ${{ secrets.GH_PAT }}
94
+ run: echo "FORK_USER=$(gh api user --jq .login)" >> $GITHUB_ENV
95
+
96
+ - name: Sync fork with upstream
97
+ env:
98
+ GH_TOKEN: ${{ secrets.GH_PAT }}
99
+ run: gh repo sync ${FORK_USER}/bioconda-recipes --source bioconda/bioconda-recipes --branch master
100
+
101
+ - name: Checkout afquery repo
102
+ uses: actions/checkout@v4
103
+ with:
104
+ path: afquery-repo
105
+
106
+ - name: Checkout fork of bioconda-recipes
107
+ uses: actions/checkout@v4
108
+ with:
109
+ repository: ${{ env.FORK_USER }}/bioconda-recipes
110
+ token: ${{ secrets.GH_PAT }}
111
+ path: bioconda-recipes
112
+
113
+ - name: Update recipe
114
+ working-directory: bioconda-recipes
115
+ run: |
116
+ mkdir -p recipes/afquery
117
+ cp ../afquery-repo/recipes/afquery/meta.yaml recipes/afquery/meta.yaml
118
+ sed -i "s/^{% set version = .* %}/{% set version = \"${VERSION}\" %}/" recipes/afquery/meta.yaml
119
+ sed -i "s/sha256: .*/sha256: ${SHA256}/" recipes/afquery/meta.yaml
120
+ sed -i "s/^ number: .*/ number: 0/" recipes/afquery/meta.yaml
121
+
122
+ - name: Create pull request
123
+ working-directory: bioconda-recipes
124
+ env:
125
+ GH_TOKEN: ${{ secrets.GH_PAT }}
126
+ run: |
127
+ git config user.name "github-actions[bot]"
128
+ git config user.email "github-actions[bot]@users.noreply.github.com"
129
+ git checkout -b afquery-${VERSION}
130
+ git add recipes/afquery/meta.yaml
131
+ git commit -m "Update afquery to ${VERSION}"
132
+ git push origin afquery-${VERSION}
133
+ gh pr create \
134
+ --repo bioconda/bioconda-recipes \
135
+ --title "Update afquery to ${VERSION}" \
136
+ --body "Automated version bump from https://github.com/dlopez-bioinfo/afquery/releases/tag/v${VERSION}" \
137
+ --head "${FORK_USER}:afquery-${VERSION}" \
138
+ --base master
@@ -0,0 +1,61 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ MANIFEST
23
+ pip-log.txt
24
+ pip-delete-this-directory.txt
25
+
26
+ # Virtual environments
27
+ venv/
28
+ ENV/
29
+ env/
30
+ .venv
31
+
32
+ # Testing
33
+ .pytest_cache/
34
+ .coverage
35
+ htmlcov/
36
+ .tox/
37
+ .hypothesis/
38
+
39
+ # IDE
40
+ .vscode/
41
+ .idea/
42
+ *.swp
43
+ *.swo
44
+ *~
45
+ .DS_Store
46
+ Thumbs.db
47
+
48
+ # Temporary files
49
+ *.tmp
50
+ *.log
51
+ *.bak
52
+ *~
53
+
54
+ # Auto-generated by hatch-vcs at build time
55
+ src/afquery/_version.py
56
+
57
+ # Database files (generated)
58
+ *.duckdb
59
+ *.duckdb.wal
60
+ *.parquet
61
+ db/*
afquery-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: afquery
3
+ Version: 0.1.0
4
+ Summary: Genomic allele frequency query engine with bitmap-encoded genotypes
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: click>=8.1
7
+ Requires-Dist: cyvcf2>=0.30
8
+ Requires-Dist: duckdb>=0.10
9
+ Requires-Dist: pyarrow>=14.0
10
+ Requires-Dist: pyranges<0.2,>=0.1.2
11
+ Requires-Dist: pyroaring>=0.4.8
12
+ Requires-Dist: tqdm>=4.60
13
+ Provides-Extra: dev
14
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
15
+ Requires-Dist: pytest>=8.0; extra == 'dev'
@@ -0,0 +1,355 @@
1
+ # afquery
2
+
3
+ Genomic allele frequency query engine with bitmap-encoded genotypes. Fast, file-based queries over 10K-50K samples at sub-100ms latency.
4
+
5
+ ## Features
6
+
7
+ - **Fast point queries**: <100ms cold start, ~10ms warm queries on single positions
8
+ - **Batch queries**: Multi-position queries via SQL IN clauses or temporary tables
9
+ - **Region queries**: Genomic range queries with automatic partitioning
10
+ - **VCF annotation**: Annotate VCF files with computed allele frequencies and sample genotypes
11
+ - **Ploidy-aware**: Correct AN/AC computation for autosomes, chrX, chrY, and chrM
12
+ - **Bitmap compression**: Roaring Bitmaps for efficient genotype storage
13
+ - **In-process**: No server process—queries run locally with DuckDB
14
+ - **Incremental updates**: Add new samples to existing databases
15
+ - **Multiple genome builds**: Support for GRCh37 and GRCh38
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pip install afquery
21
+ ```
22
+
23
+ Requires Python 3.10+.
24
+
25
+ ## Quick Start
26
+
27
+ ### 1. Create a Database
28
+
29
+ First, prepare a manifest TSV with sample metadata:
30
+
31
+ ```tsv
32
+ sample_name sex tech_name vcf_path phenotype_codes
33
+ sample_1 male wgs vcfs/sample_1.vcf E11.9,I10
34
+ sample_2 female wes_kit_a vcfs/sample_2.vcf E11.9
35
+ sample_3 male wgs vcfs/sample_3.vcf I10
36
+ ```
37
+
38
+ **Key points about the manifest:**
39
+ - `tech_name`: Either `wgs` (case-insensitive) for whole genome, or a custom technology name
40
+ - `vcf_path`: Path to the single-sample VCF file (relative to manifest directory, or absolute)
41
+ - For WES/exome technologies, capture regions are loaded from `--bed-dir/{tech_name}.bed`
42
+ - Example: `tech_name=wes_kit_a` → loads `beds/wes_kit_a.bed`
43
+
44
+ Organize your files:
45
+ ```
46
+ project/
47
+ ├── manifest.tsv
48
+ ├── vcfs/
49
+ │ ├── sample_1.vcf
50
+ │ ├── sample_2.vcf
51
+ │ └── sample_3.vcf
52
+ └── beds/ # Required for non-WGS technologies
53
+ └── wes_kit_a.bed # BED file for WES kit A
54
+ ```
55
+
56
+ Then preprocess your VCF files:
57
+
58
+ ```bash
59
+ afquery preprocess \
60
+ --manifest manifest.tsv \
61
+ --bed-dir ./beds/ \
62
+ --output-dir ./my_db/ \
63
+ --genome-build GRCh38
64
+ ```
65
+
66
+ This creates:
67
+ - `my_db/manifest.json` — database metadata
68
+ - `my_db/metadata.sqlite` — samples, technologies, phenotype codes, precomputed bitmaps
69
+ - `my_db/variants/{chrom}.parquet` — variant data with encoded genotypes
70
+ - `my_db/capture/` — capture regions for each technology
71
+
72
+ ### 2. Query Allele Frequencies
73
+
74
+ ```bash
75
+ # Point query
76
+ afquery query --db my_db --chrom chr1 --pos 1000 --alt G
77
+
78
+ # Batch query (100 positions)
79
+ afquery query-batch --db my_db --positions positions.tsv --phenotype E11.9
80
+
81
+ # Region query
82
+ afquery query --db my_db --chrom chr1 --start 1000 --end 10000 --phenotype E11.9 --sex male
83
+ ```
84
+
85
+ ### 3. Annotate VCF Files
86
+
87
+ ```bash
88
+ afquery annotate \
89
+ --db my_db \
90
+ --vcf input.vcf \
91
+ --output annotated.vcf \
92
+ --phenotype E11.9 \
93
+ --tech WGS
94
+ ```
95
+
96
+ Adds `AFQUERY_AC`, `AFQUERY_AN`, `AFQUERY_AF` and genotype fields to your VCF.
97
+
98
+ ## Python API
99
+
100
+ ```python
101
+ from afquery import Database
102
+
103
+ db = Database("/path/to/db")
104
+
105
+ # Single position query
106
+ # Automatically filters samples by: sex + phenotype codes + capture coverage
107
+ result = db.query(
108
+ chrom="chr1",
109
+ pos=1000,
110
+ alt="G",
111
+ phenotype_codes=["E11.9"],
112
+ sex="both"
113
+ )
114
+ print(f"AC={result.ac}, AN={result.an}, AF={result.af}")
115
+
116
+ # Batch query (multi-variant)
117
+ results = db.query_batch(
118
+ "chr1",
119
+ variants=[(1500, "A", "T"), (3500, "G", "C")],
120
+ phenotype=["E11.9"],
121
+ )
122
+
123
+ # Region query (genomic range)
124
+ results = db.query_region(
125
+ chrom="chr1",
126
+ start=1000,
127
+ end=10000,
128
+ phenotype_codes=["E11.9", "I10"]
129
+ )
130
+
131
+ # Annotate VCF with allele frequencies
132
+ # Note: tech_name filters annotation to samples of that technology
133
+ db.annotate_vcf(
134
+ vcf_path="input.vcf",
135
+ output_path="annotated.vcf",
136
+ phenotype_codes=["E11.9"],
137
+ tech_name="wgs" # Only annotate using WGS samples
138
+ )
139
+ ```
140
+
141
+ **How samples are filtered in queries**:
142
+ - Sex filter: `male`, `female`, or `both`
143
+ - phenotype filter: All codes must match
144
+ - Capture filter: Automatic—only samples whose tech's BED covers the position
145
+
146
+ ## Database Structure
147
+
148
+ ```
149
+ my_db/
150
+ ├── manifest.json # Metadata: genome_build, sample_count, schema_version
151
+ ├── metadata.sqlite # SQLite: samples, technologies, phenotype codes, bitmaps
152
+ ├── variants/
153
+ │ ├── chr1.parquet
154
+ │ ├── chr2.parquet
155
+ │ └── ...
156
+ └── capture/
157
+ ├── tech_0.pickle # WGS capture region (always covered)
158
+ └── tech_1.pickle # WES kit capture region
159
+ ```
160
+
161
+ Each variant row contains:
162
+ - `pos` — 1-based genomic position
163
+ - `ref` — reference allele
164
+ - `alt` — alternate allele
165
+ - `het_bitmap` — Roaring Bitmap of heterozygous samples
166
+ - `hom_bitmap` — Roaring Bitmap of homozygous samples
167
+
168
+ ## How Capture BED Files Are Associated with Samples
169
+
170
+ Samples are linked to capture regions through their **technology**:
171
+
172
+ 1. **Manifest specifies technology**: Each sample lists `tech_name` (e.g., `wgs`, `wes_kit_a`)
173
+ 2. **Technology maps to BED file**:
174
+ - **WGS**: No BED file needed (always fully covered)
175
+ - **WES/Custom**: BED file loaded from `{bed_dir}/{tech_name}.bed`
176
+ 3. **Storage in database**:
177
+ - `metadata.sqlite::technologies` stores tech_id, tech_name, and bed_path
178
+ - `metadata.sqlite::samples` stores sample_id, sample_name, and tech_id (foreign key)
179
+ 4. **Query-time filtering**: When querying, samples are filtered by:
180
+ - Sex (male/female/both)
181
+ - phenotype diagnosis codes
182
+ - Capture region coverage (via tech's BED file)
183
+
184
+ **Example**: If you have samples on two exome kits:
185
+ ```tsv
186
+ sample_name sex tech_name vcf_path phenotype_codes
187
+ S001 male exome_v1 vcfs/S001.vcf E11.9
188
+ S002 female exome_v1 vcfs/S002.vcf E11.9
189
+ S003 male exome_v2 vcfs/S003.vcf I10
190
+ ```
191
+
192
+ Then provide:
193
+ ```
194
+ beds/
195
+ ├── exome_v1.bed # Coverage for samples S001, S002
196
+ └── exome_v2.bed # Coverage for sample S003
197
+ ```
198
+
199
+ At query time, each sample's eligible regions are determined by its tech's BED file.
200
+
201
+ ## Advanced Features
202
+
203
+ ### Incremental Updates (add_samples)
204
+
205
+ ```bash
206
+ afquery add-samples \
207
+ --db my_db \
208
+ --manifest new_samples.tsv \
209
+ --vcf-dir ./new_vcfs/
210
+ ```
211
+
212
+ Adds new samples without rebuilding the entire database.
213
+
214
+ ### Compact Database
215
+
216
+ Remove samples and reclaim disk space:
217
+
218
+ ```bash
219
+ afquery compact --db my_db --samples-to-remove sample_1,sample_2
220
+ ```
221
+
222
+ ### Run Benchmarks
223
+
224
+ ```bash
225
+ afquery benchmark --db my_db --n-queries 1000 --query-type point
226
+ ```
227
+
228
+ ### Generate Synthetic Data
229
+
230
+ ```bash
231
+ afquery synth --output synthetic_db/ --n-samples 5000 --n-variants 100000
232
+ ```
233
+
234
+ ## Ploidy Rules
235
+
236
+ AF computation respects chromosome-specific ploidy:
237
+
238
+ | Chromosome | Formula |
239
+ |---|---|
240
+ | Autosomes | `AN = 2 × eligible_samples` |
241
+ | chrM | `AN = 1 × eligible_samples` |
242
+ | chrY | `AN = 1 × eligible_males` |
243
+ | chrX (PAR) | `AN = 2 × eligible_samples` |
244
+ | chrX (non-PAR) | `AN = 2 × eligible_females + 1 × eligible_males` |
245
+
246
+ Where `eligible` = samples matching sex, phenotype, and technology capture filters.
247
+
248
+ ## Performance Targets
249
+
250
+ - **Point query (cold)**: <100 ms
251
+ - **Point query (warm)**: ~10 ms
252
+ - **Batch 100 positions**: ~200 ms
253
+ - **VCF annotation (5K variants)**: ~30 s
254
+ - **VCF annotation (5M variants)**: ~30 min
255
+
256
+ ## Command Reference
257
+
258
+ ```
259
+ afquery query Query single position
260
+ afquery query-batch Batch query multiple positions
261
+ afquery annotate Annotate VCF file
262
+ afquery info Show database info
263
+ afquery preprocess Build database from VCFs
264
+ afquery add-samples Add new samples to database
265
+ afquery compact Remove samples and reclaim space
266
+ afquery synth Generate synthetic test database
267
+ afquery benchmark Run performance benchmarks
268
+ ```
269
+
270
+ Run `afquery --help` for full options.
271
+
272
+ ## Development
273
+
274
+ ### Running Tests
275
+
276
+ ```bash
277
+ # All 190 tests
278
+ python3 -m pytest --tb=short -q
279
+
280
+ # Specific test module
281
+ python3 -m pytest tests/test_query.py -v
282
+ ```
283
+
284
+ ### Key Modules
285
+
286
+ - `afquery.query` — QueryEngine, point/batch/region queries
287
+ - `afquery.annotate` — VCF annotation pipeline
288
+ - `afquery.database` — Database wrapper (public API)
289
+ - `afquery.preprocess` — Manifest parsing, VCF ingestion, Parquet building
290
+ - `afquery.bitmaps` — Roaring Bitmap encoding/decoding
291
+ - `afquery.ploidy` — Chromosome-specific ploidy rules
292
+ - `afquery.models` — Data classes (QueryResult, ParsedSample, etc.)
293
+
294
+ ### Architecture
295
+
296
+ See `brain/architecture.md` for detailed system design, data flow, and query algorithm.
297
+
298
+ ## Genome Builds
299
+
300
+ - **GRCh37** (hg19) — PAR regions: chrX:1-2649520
301
+ - **GRCh38** (hg38) — PAR regions: chrX:1-3099677
302
+
303
+ ## Technologies Supported
304
+
305
+ - **WGS** — Whole genome sequencing (always fully covered, no BED file needed)
306
+ - Manifest: `tech_name = wgs` (case-insensitive)
307
+ - Query-time: All positions in genome considered covered
308
+
309
+ - **WES** — Whole exome sequencing (coverage defined by BED file)
310
+ - Manifest: `tech_name = wes_kit_a` (or any custom name)
311
+ - Preprocessing: Loads `{bed_dir}/wes_kit_a.bed` (0-based half-open BED format)
312
+ - Query-time: Only positions within BED intervals considered covered
313
+
314
+ - **Custom** — Any technology with a BED file (e.g., gene panels, targeted sequencing)
315
+ - Manifest: Use any `tech_name`
316
+ - Preprocessing: Loads `{bed_dir}/{tech_name}.bed`
317
+ - Query-time: Respects BED file coverage
318
+
319
+ ## Troubleshooting
320
+
321
+ ### ImportError: `cyvcf2`
322
+
323
+ `cyvcf2` import happens inside worker processes during preprocessing. Do not import at module level.
324
+
325
+ ### DuckDB Temp Files
326
+
327
+ Uses Parquet format (not Arrow IPC) for compatibility. Set `DUCKDB_TEMP_DIRECTORY` if needed.
328
+
329
+ ### Sample IDs
330
+
331
+ Sample IDs are 0-indexed and monotonically increasing. Never reuse removed IDs—use `compact` to reclaim space.
332
+
333
+ ## Contributing
334
+
335
+ 1. Read `brain/project_state.json` for current phase and test count
336
+ 2. Read `brain/architecture.md` for system design
337
+ 3. Follow code conventions in `CLAUDE.md`
338
+ 4. Update `brain/` docs after architectural changes
339
+ 5. Run tests before submitting
340
+
341
+ ## License
342
+
343
+ (Add your license here)
344
+
345
+ ## Citation
346
+
347
+ If you use afquery in research, please cite:
348
+
349
+ ```
350
+ (Citation format to be determined)
351
+ ```
352
+
353
+ ---
354
+
355
+ **Status**: Phase 5 complete (190 tests passing). Active development.
@@ -0,0 +1,106 @@
1
+ # Webhook / API JSON Schema
2
+
3
+ ## Batch Query
4
+
5
+ ### Request
6
+
7
+ ```json
8
+ {
9
+ "chrom": "chr1",
10
+ "positions": [1000, 2000, 3000],
11
+ "phenotype": ["E11.9", "I10"],
12
+ "sex": "both"
13
+ }
14
+ ```
15
+
16
+ | Field | Type | Required | Notes |
17
+ |-------------|-----------------|----------|--------------------------------------------|
18
+ | `chrom` | string | yes | Any form accepted: `"1"`, `"chr1"`, `"X"` |
19
+ | `positions` | array of int | yes | 1-based positions; duplicates are ignored |
20
+ | `phenotype` | array of string | yes | phenotype codes; union semantics |
21
+ | `sex` | string | no | `"both"` (default), `"male"`, `"female"` |
22
+
23
+ ### Response
24
+
25
+ Array of variant objects, one per variant found (absent variants are omitted). Sorted by `(pos, alt)`.
26
+
27
+ ```json
28
+ [
29
+ {
30
+ "chrom": "chr1",
31
+ "pos": 1000,
32
+ "ref": "A",
33
+ "alt": "T",
34
+ "AC": 5,
35
+ "AN": 100,
36
+ "AF": 0.05,
37
+ "n_eligible": 50
38
+ }
39
+ ]
40
+ ```
41
+
42
+ | Field | Type | Notes |
43
+ |--------------|--------|----------------------------------------------|
44
+ | `chrom` | string | Canonical form (`chr`-prefixed) |
45
+ | `pos` | int | 1-based |
46
+ | `ref` | string | Reference allele |
47
+ | `alt` | string | Alternate allele (one object per alt) |
48
+ | `AC` | int | Allele count in eligible samples |
49
+ | `AN` | int | Allele number (ploidy-aware) |
50
+ | `AF` | float | `AC / AN` |
51
+ | `n_eligible` | int | Number of eligible samples at this position |
52
+
53
+ Positions where `AN == 0` (no eligible samples covered) are silently omitted.
54
+
55
+ ---
56
+
57
+ ## Single-Position Query
58
+
59
+ Same as batch query with a single-element `positions` array.
60
+
61
+ ---
62
+
63
+ ## VCF Annotation
64
+
65
+ ### Request
66
+
67
+ `POST /annotate` — multipart form data.
68
+
69
+ | Part | Content-Type | Description |
70
+ |----------|----------------------------|-----------------------------------|
71
+ | `vcf` | `application/octet-stream` | Input VCF file (plain or gzipped) |
72
+ | `params` | `application/json` | JSON object (see below) |
73
+
74
+ `params` object:
75
+
76
+ ```json
77
+ {
78
+ "phenotype": ["E11.9"],
79
+ "sex": "both"
80
+ }
81
+ ```
82
+
83
+ ### Response
84
+
85
+ Annotated VCF file stream (`Content-Type: text/plain`).
86
+
87
+ Added INFO fields:
88
+
89
+ | Field | Number | Type | Description |
90
+ |------------|--------|---------|--------------------------------------|
91
+ | `AFQUERY_AC` | A | Integer | Allele count per ALT in eligible set |
92
+ | `AFQUERY_AN` | 1 | Integer | Allele number (0 if uncovered) |
93
+ | `AFQUERY_AF` | A | Float | Allele frequency per ALT |
94
+
95
+ **Notes**:
96
+ - `AFQUERY_AN=0` means the position was not covered for the given filters; `AFQUERY_AC` and `AFQUERY_AF` are absent (`.`) in that case.
97
+ - `AFQUERY_AC=0`, `AFQUERY_AF=0.0` with `AFQUERY_AN>0` means the position was covered but the specific allele was not observed in the database.
98
+ - For multi-allelic sites, `AFQUERY_AC` and `AFQUERY_AF` are comma-separated lists aligned to the `ALT` column.
99
+
100
+ ### Completion summary (trailing header)
101
+
102
+ After the VCF body, a summary is written to stderr (or returned in a trailing `X-AFQUERY-Stats` response header):
103
+
104
+ ```json
105
+ {"n_variants": 1000, "n_annotated": 850, "n_uncovered": 12}
106
+ ```