vflank 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vflank-0.1.0/.claude/skills/ddpcr-conventions/SKILL.md +144 -0
- vflank-0.1.0/.dockerignore +28 -0
- vflank-0.1.0/.github/workflows/ci.yml +47 -0
- vflank-0.1.0/.github/workflows/docs.yml +41 -0
- vflank-0.1.0/.github/workflows/release.yml +61 -0
- vflank-0.1.0/.gitignore +39 -0
- vflank-0.1.0/.pre-commit-config.yaml +23 -0
- vflank-0.1.0/CHANGELOG.md +26 -0
- vflank-0.1.0/CLAUDE.md +102 -0
- vflank-0.1.0/CONTRIBUTING.md +38 -0
- vflank-0.1.0/Dockerfile +24 -0
- vflank-0.1.0/LICENSE +201 -0
- vflank-0.1.0/PKG-INFO +154 -0
- vflank-0.1.0/README.md +113 -0
- vflank-0.1.0/docs/ARCHITECTURE.md +109 -0
- vflank-0.1.0/docs/DEVELOPER.md +245 -0
- vflank-0.1.0/docs/changelog.md +1 -0
- vflank-0.1.0/docs/getting-started/installation.md +44 -0
- vflank-0.1.0/docs/getting-started/quickstart.md +58 -0
- vflank-0.1.0/docs/index.md +47 -0
- vflank-0.1.0/docs/reference/api.md +57 -0
- vflank-0.1.0/docs/reference/cli.md +7 -0
- vflank-0.1.0/docs/research/gnomad-api.md +225 -0
- vflank-0.1.0/docs/research/sv-vcf-input.md +123 -0
- vflank-0.1.0/docs/user-guide/fusions.md +64 -0
- vflank-0.1.0/docs/user-guide/masking.md +71 -0
- vflank-0.1.0/docs/user-guide/small-variants.md +66 -0
- vflank-0.1.0/mkdocs.yml +98 -0
- vflank-0.1.0/pyproject.toml +82 -0
- vflank-0.1.0/src/vflank/__init__.py +3 -0
- vflank-0.1.0/src/vflank/cli/__init__.py +1 -0
- vflank-0.1.0/src/vflank/cli/_masking.py +44 -0
- vflank-0.1.0/src/vflank/cli/app.py +45 -0
- vflank-0.1.0/src/vflank/cli/fusion.py +182 -0
- vflank-0.1.0/src/vflank/cli/small.py +463 -0
- vflank-0.1.0/src/vflank/core/__init__.py +1 -0
- vflank-0.1.0/src/vflank/core/chrom.py +120 -0
- vflank-0.1.0/src/vflank/core/flanks.py +99 -0
- vflank-0.1.0/src/vflank/core/fusion.py +105 -0
- vflank-0.1.0/src/vflank/core/popfreq.py +227 -0
- vflank-0.1.0/src/vflank/core/popfreq_api.py +204 -0
- vflank-0.1.0/src/vflank/core/skips.py +27 -0
- vflank-0.1.0/src/vflank/core/variant.py +52 -0
- vflank-0.1.0/src/vflank/errors.py +27 -0
- vflank-0.1.0/src/vflank/io/__init__.py +1 -0
- vflank-0.1.0/src/vflank/io/breakpoints.py +107 -0
- vflank-0.1.0/src/vflank/io/fasta.py +44 -0
- vflank-0.1.0/src/vflank/io/maf.py +133 -0
- vflank-0.1.0/src/vflank/io/reference.py +80 -0
- vflank-0.1.0/src/vflank/io/report.py +35 -0
- vflank-0.1.0/src/vflank/logging.py +57 -0
- vflank-0.1.0/src/vflank/py.typed +0 -0
- vflank-0.1.0/tests/integration/test_fusion_pipeline.py +63 -0
- vflank-0.1.0/tests/integration/test_small_pipeline.py +255 -0
- vflank-0.1.0/tests/unit/test_breakpoints.py +61 -0
- vflank-0.1.0/tests/unit/test_chrom.py +54 -0
- vflank-0.1.0/tests/unit/test_flanks.py +50 -0
- vflank-0.1.0/tests/unit/test_fusion.py +97 -0
- vflank-0.1.0/tests/unit/test_popfreq.py +87 -0
- vflank-0.1.0/tests/unit/test_popfreq_api.py +147 -0
- vflank-0.1.0/tests/unit/test_report.py +25 -0
- vflank-0.1.0/tests/unit/test_skips.py +13 -0
- vflank-0.1.0/tests/unit/test_variant.py +23 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ddpcr-conventions
|
|
3
|
+
description: ddPCR assay-design domain reference for the vflank repo — genomic coordinate systems (1-based MAF vs 0-based pysam), MAF column semantics, gnomAD allele-frequency masking, flank masking modes A–D, heterozygous/consensus handling, and fusion-junction logic. Use when editing or reasoning about flank extraction, coordinate math, SNP/population-frequency masking, BAM consensus, or fusion breakpoints in this codebase.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# ddPCR conventions & domain reference
|
|
7
|
+
|
|
8
|
+
Background knowledge for working on vflank correctly. The code-level conventions
|
|
9
|
+
live in `CLAUDE.md`; this is the *why* and the biology that prevents silent,
|
|
10
|
+
expensive mistakes.
|
|
11
|
+
|
|
12
|
+
## What ddPCR needs from us, and why masking matters
|
|
13
|
+
|
|
14
|
+
Droplet Digital PCR quantifies a target by partitioning DNA into thousands of
|
|
15
|
+
droplets and amplifying with a primer pair + a sequence-specific probe. The
|
|
16
|
+
assay only works if the **primers and probe anneal cleanly to the template that
|
|
17
|
+
is actually present in the patient.**
|
|
18
|
+
|
|
19
|
+
The failure mode vflank exists to prevent: if a primer or probe sits over a
|
|
20
|
+
**polymorphic position** (a common germline SNP, or a patient-private variant),
|
|
21
|
+
then in patients carrying the alternate allele the oligo mismatches → reduced
|
|
22
|
+
binding, allele dropout, or assay failure. So we **mask** positions that vary,
|
|
23
|
+
turning them into `N` (or an IUPAC code) so the downstream designer avoids
|
|
24
|
+
placing an oligo 3′ end there. We mask the **flanks** (primer-landing territory);
|
|
25
|
+
the variant of interest itself is always shown literally as `[REF/ALT]` — that's
|
|
26
|
+
the target, not something to hide.
|
|
27
|
+
|
|
28
|
+
Default flank is ±200 bp, matching ddPCR amplicon scale (~60–200 bp products,
|
|
29
|
+
designed from a wider candidate window).
|
|
30
|
+
|
|
31
|
+
## Coordinate systems — get this wrong and the sequence is silently wrong
|
|
32
|
+
|
|
33
|
+
- **MAF**: 1-based, fully-closed `[start, end]`. A SNP has `start == end`.
|
|
34
|
+
- **pysam** (`FastaFile.fetch`, tabix `fetch`): 0-based, half-open `[start, end)`.
|
|
35
|
+
- **VCF** `POS`: 1-based.
|
|
36
|
+
|
|
37
|
+
Flank extraction (`core/flanks.ReferenceFlankSource`):
|
|
38
|
+
```
|
|
39
|
+
left = fetch(chrom, max(0, start - flank - 1), start - 1) # bases before variant
|
|
40
|
+
right = fetch(chrom, end, end + flank) # bases after variant
|
|
41
|
+
```
|
|
42
|
+
Masking a 1-based VCF position into a flank string:
|
|
43
|
+
```
|
|
44
|
+
idx = pos - region_start_0based - 1
|
|
45
|
+
```
|
|
46
|
+
**pysam never raises on an over-run** — it returns a truncated string at contig
|
|
47
|
+
ends. A flank shorter than requested is a real condition; report it, never drop
|
|
48
|
+
the record silently.
|
|
49
|
+
|
|
50
|
+
## MAF column semantics (TCGA/MSK)
|
|
51
|
+
|
|
52
|
+
Required: `Chromosome`, `Start_Position`, `End_Position`, `Reference_Allele`,
|
|
53
|
+
`Tumor_Seq_Allele2` (the somatic alt — *Allele2*, not Allele1). Metadata used in
|
|
54
|
+
headers: `Hugo_Symbol`, `HGVSp_Short`, `HGVSc`, `Tumor_Sample_Barcode`.
|
|
55
|
+
|
|
56
|
+
Allele encoding: `-` denotes an empty allele (insertion has `Reference_Allele = -`;
|
|
57
|
+
deletion has `Tumor_Seq_Allele2 = -`). Chromosome may arrive as `7`, `chr7`,
|
|
58
|
+
`Chr7`, numeric `23/24/25` (X/Y/MT), or `M/chrM`; `core/chrom.normalise_chrom`
|
|
59
|
+
canonicalises all of these to the bare form.
|
|
60
|
+
|
|
61
|
+
## gnomAD allele-frequency masking
|
|
62
|
+
|
|
63
|
+
We mask a flank position if a gnomAD variant there is a **single-base
|
|
64
|
+
substitution** (REF length 1, every ALT length 1) whose **max AF ≥ threshold**
|
|
65
|
+
(default 0.001 = 0.1%). Indels are deliberately *not* masked (conservative —
|
|
66
|
+
avoids over-masking primer territory).
|
|
67
|
+
|
|
68
|
+
AF fields differ by build/release:
|
|
69
|
+
- **gnomAD v4.1 (hg38)** exposes both `AF` and `AF_grpmax`; we take the max.
|
|
70
|
+
- **gnomAD v2.1.1 (hg19)** has `AF` but **no `AF_grpmax`** — only `AF` is used.
|
|
71
|
+
- Guard against `.` and `NaN` AF tokens (the parser rejects `NaN` with an `f == f` self-equality check, which is false only for NaN).
|
|
72
|
+
|
|
73
|
+
Files are per-chromosome bgzipped VCFs + tabix index; resolution is by known
|
|
74
|
+
filename patterns (`core/popfreq.GNOMAD_PATTERNS`, keyed by build then
|
|
75
|
+
genome/exome). A contig legitimately absent from a VCF (e.g. MT) is logged at
|
|
76
|
+
DEBUG, not failed.
|
|
77
|
+
|
|
78
|
+
### Two backends and `--pop-data` (genome / exome / both)
|
|
79
|
+
|
|
80
|
+
The mask comes from one of two interchangeable sources (same `get_positions`
|
|
81
|
+
interface, selected by `--pop-source`):
|
|
82
|
+
- **`vcf`** (`GnomadStore`) — local per-chromosome VCFs. Reproducible, bulk, HPC.
|
|
83
|
+
- **`api`** (`GnomadApiSource`) — gnomAD GraphQL API, no download, rate-limited
|
|
84
|
+
to ~10 req/IP/60s → small cohorts only. Region cache + throttle + backoff.
|
|
85
|
+
|
|
86
|
+
`--pop-data {genome,exome,both}` (default `genome`): flanks often fall in
|
|
87
|
+
non-coding regions where **only genomes have data**, so genome is the default;
|
|
88
|
+
`exome` adds power in coding regions; `both` masks the **union** (common in
|
|
89
|
+
either cohort). gnomAD v4 (hg38) is the only build with a pooled `joint` set; we
|
|
90
|
+
therefore use the max-AF union for `both` on every build, keeping a single code path.
|
|
91
|
+
|
|
92
|
+
API AF rule (no per-population `af` field): `af = max(seq.af, max(ac/an over
|
|
93
|
+
populations))` across the chosen kinds; SNPs only. Build → dataset:
|
|
94
|
+
hg19 → `gnomad_r2_1`/GRCh37, hg38 → `gnomad_r4`/GRCh38; region query is 1-based
|
|
95
|
+
inclusive. Requesting `--pop-data exome`/`both` on the VCF source without the
|
|
96
|
+
exome files **fails fast** (`GnomadStore.preflight`) — never a silent
|
|
97
|
+
genome-only fallback.
|
|
98
|
+
|
|
99
|
+
## Flank source modes (the `FlankSource` strategy)
|
|
100
|
+
|
|
101
|
+
| Mode | Inputs | Flank source | Masking |
|
|
102
|
+
|------|--------|--------------|---------|
|
|
103
|
+
| A | MAF + FASTA | reference | none |
|
|
104
|
+
| B | + gnomAD | reference | common SNPs → N |
|
|
105
|
+
| C | + sample BAM | patient consensus, reference fallback at low depth | het / low-confidence → N or IUPAC |
|
|
106
|
+
| D | all | patient consensus | gnomAD ∪ observed-het |
|
|
107
|
+
|
|
108
|
+
A and B are implemented (`ReferenceFlankSource`). C/D are the differentiator:
|
|
109
|
+
patient consensus catches **private/rare** variants gnomAD never sees — the ones
|
|
110
|
+
that silently break a primer for one specific patient.
|
|
111
|
+
|
|
112
|
+
### Consensus (modes C/D) plan
|
|
113
|
+
- Build from BAM via `bcftools mpileup → call → consensus --iupac-codes`, or
|
|
114
|
+
pysam pileup. Validate against `samtools consensus` as an oracle.
|
|
115
|
+
- A position is **heterozygous** (→ mask) if the second-most-common allele
|
|
116
|
+
exceeds a ~25–30% allele fraction at sufficient depth; below a min-depth threshold, fall back
|
|
117
|
+
to the reference base and flag it.
|
|
118
|
+
- Indels in the pileup shift coordinates — the hard part. `kindel` is the
|
|
119
|
+
reference implementation for CIGAR-described indel reconciliation, but it is
|
|
120
|
+
haploid/clonal (viral) and does **not** flag heterozygosity, so its majority
|
|
121
|
+
call must be augmented with diploid het detection.
|
|
122
|
+
|
|
123
|
+
## Fusion / structural-variant path (SV)
|
|
124
|
+
|
|
125
|
+
A gene fusion is defined by two breakpoints on (possibly) different chromosomes.
|
|
126
|
+
The chimeric **junction sequence** is the only fusion-specific template, so the
|
|
127
|
+
ddPCR probe must **span the junction**. vflank builds it in `core/fusion.py`:
|
|
128
|
+
fetch each partner's flank, orient per the strand bit (`0`=plus, `1`=minus),
|
|
129
|
+
concatenate across the breakpoint with **no separator** (an early `"-"` join
|
|
130
|
+
corrupted the junction base — never insert a separator character at the junction). Masking is applied in
|
|
131
|
+
genomic space *before* reverse-complement (`revcomp(N)=N`). The corrected
|
|
132
|
+
junction model and the iCallSV `CT`→strand mapping are in
|
|
133
|
+
docs/research/sv-vcf-input.md. Probe design over the junction is delegated to
|
|
134
|
+
Primer3 (not implemented yet).
|
|
135
|
+
|
|
136
|
+
## Downstream emit formats (where vflank stops)
|
|
137
|
+
|
|
138
|
+
- **Olivar** (small-variant amplicons) consumes a FASTA + a SNP CSV with columns
|
|
139
|
+
`START, STOP, FREQ` (1-based). vflank's gnomAD scan and BAM-het detection both
|
|
140
|
+
produce exactly this — `--emit-olivar` is the integration seam. The `FREQ`
|
|
141
|
+
column is where patient-specific risk (from a BAM) gets injected.
|
|
142
|
+
- **Primer3** handles the fusion-junction probe.
|
|
143
|
+
|
|
144
|
+
vflank produces inputs for these tools; it does not design primers itself.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
.git
|
|
2
|
+
.github
|
|
3
|
+
.claude
|
|
4
|
+
docs
|
|
5
|
+
site
|
|
6
|
+
tests
|
|
7
|
+
*.egg-info
|
|
8
|
+
__pycache__
|
|
9
|
+
*.py[cod]
|
|
10
|
+
.pytest_cache
|
|
11
|
+
.mypy_cache
|
|
12
|
+
.ruff_cache
|
|
13
|
+
.venv
|
|
14
|
+
venv
|
|
15
|
+
.coverage
|
|
16
|
+
htmlcov
|
|
17
|
+
# data files
|
|
18
|
+
*.fasta
|
|
19
|
+
*.fa
|
|
20
|
+
*.fai
|
|
21
|
+
*.gzi
|
|
22
|
+
*.bam
|
|
23
|
+
*.bai
|
|
24
|
+
*.vcf
|
|
25
|
+
*.vcf.gz
|
|
26
|
+
*.vcf.bgz
|
|
27
|
+
*.tbi
|
|
28
|
+
*.csi
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
|
|
11
|
+
concurrency:
|
|
12
|
+
group: ci-${{ github.ref }}
|
|
13
|
+
cancel-in-progress: true
|
|
14
|
+
|
|
15
|
+
jobs:
|
|
16
|
+
lint:
|
|
17
|
+
name: Lint & type-check
|
|
18
|
+
runs-on: ubuntu-latest
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
- uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: "3.12"
|
|
24
|
+
cache: pip
|
|
25
|
+
- run: pip install -e ".[dev]"
|
|
26
|
+
- name: Ruff
|
|
27
|
+
run: ruff check src tests
|
|
28
|
+
- name: Mypy
|
|
29
|
+
run: mypy src/vflank/core src/vflank/io
|
|
30
|
+
|
|
31
|
+
test:
|
|
32
|
+
name: Test (${{ matrix.os }}, py${{ matrix.python-version }})
|
|
33
|
+
strategy:
|
|
34
|
+
fail-fast: false
|
|
35
|
+
matrix:
|
|
36
|
+
os: [ubuntu-latest, macos-latest]
|
|
37
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
38
|
+
runs-on: ${{ matrix.os }}
|
|
39
|
+
steps:
|
|
40
|
+
- uses: actions/checkout@v4
|
|
41
|
+
- uses: actions/setup-python@v5
|
|
42
|
+
with:
|
|
43
|
+
python-version: ${{ matrix.python-version }}
|
|
44
|
+
cache: pip
|
|
45
|
+
- run: pip install -e ".[dev]"
|
|
46
|
+
- name: Pytest
|
|
47
|
+
run: pytest --cov=vflank --cov-report=term-missing
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: Docs
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
tags: ["v*"]
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
deploy:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
with:
|
|
18
|
+
fetch-depth: 0 # mike needs full history + the gh-pages branch
|
|
19
|
+
- uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: "3.12"
|
|
22
|
+
cache: pip
|
|
23
|
+
- run: pip install -e ".[docs]"
|
|
24
|
+
- name: Configure git
|
|
25
|
+
run: |
|
|
26
|
+
git config user.name "github-actions[bot]"
|
|
27
|
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
|
28
|
+
git fetch origin gh-pages --depth=1 || true
|
|
29
|
+
|
|
30
|
+
# main -> the rolling "dev" docs
|
|
31
|
+
- name: Deploy dev docs
|
|
32
|
+
if: github.ref == 'refs/heads/main'
|
|
33
|
+
run: mike deploy --push --update-aliases dev
|
|
34
|
+
|
|
35
|
+
# vX.Y.Z tag -> that version, aliased "latest" and set as default
|
|
36
|
+
- name: Deploy release docs
|
|
37
|
+
if: startsWith(github.ref, 'refs/tags/v')
|
|
38
|
+
run: |
|
|
39
|
+
VERSION="${GITHUB_REF_NAME#v}"
|
|
40
|
+
mike deploy --push --update-aliases "$VERSION" latest
|
|
41
|
+
mike set-default --push latest
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
# Fires when a GitHub Release is published (its tag, e.g. v0.1.0, drives the
|
|
4
|
+
# version) or the workflow is dispatched manually. Publishes the package to PyPI and the image to GHCR.
|
|
5
|
+
on:
|
|
6
|
+
release:
|
|
7
|
+
types: [published]
|
|
8
|
+
workflow_dispatch:
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
pypi:
|
|
12
|
+
name: Publish to PyPI
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
permissions:
|
|
15
|
+
contents: read # required for actions/checkout (permissions: block is exhaustive)
|
|
16
|
+
id-token: write # OIDC for PyPI Trusted Publishing (no API token needed)
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
- uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: "3.12"
|
|
22
|
+
- name: Build sdist + wheel
|
|
23
|
+
run: |
|
|
24
|
+
pip install build
|
|
25
|
+
python -m build
|
|
26
|
+
- name: Publish
|
|
27
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
28
|
+
|
|
29
|
+
docker:
|
|
30
|
+
name: Publish image to GHCR
|
|
31
|
+
runs-on: ubuntu-latest
|
|
32
|
+
permissions:
|
|
33
|
+
contents: read
|
|
34
|
+
packages: write
|
|
35
|
+
steps:
|
|
36
|
+
- uses: actions/checkout@v4
|
|
37
|
+
- uses: docker/setup-buildx-action@v3
|
|
38
|
+
- name: Log in to GHCR
|
|
39
|
+
uses: docker/login-action@v3
|
|
40
|
+
with:
|
|
41
|
+
registry: ghcr.io
|
|
42
|
+
username: ${{ github.actor }}
|
|
43
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
44
|
+
- name: Image metadata (tags + labels)
|
|
45
|
+
id: meta
|
|
46
|
+
uses: docker/metadata-action@v5
|
|
47
|
+
with:
|
|
48
|
+
images: ghcr.io/${{ github.repository }} # lowercased automatically
|
|
49
|
+
tags: |
|
|
50
|
+
type=semver,pattern={{version}}
|
|
51
|
+
type=semver,pattern={{major}}.{{minor}}
|
|
52
|
+
type=raw,value=latest
|
|
53
|
+
- name: Build and push
|
|
54
|
+
uses: docker/build-push-action@v6
|
|
55
|
+
with:
|
|
56
|
+
context: .
|
|
57
|
+
push: true
|
|
58
|
+
tags: ${{ steps.meta.outputs.tags }}
|
|
59
|
+
labels: ${{ steps.meta.outputs.labels }}
|
|
60
|
+
cache-from: type=gha
|
|
61
|
+
cache-to: type=gha,mode=max
|
vflank-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.venv/
|
|
9
|
+
venv/
|
|
10
|
+
|
|
11
|
+
# Tooling caches
|
|
12
|
+
.pytest_cache/
|
|
13
|
+
.mypy_cache/
|
|
14
|
+
.ruff_cache/
|
|
15
|
+
.coverage
|
|
16
|
+
htmlcov/
|
|
17
|
+
|
|
18
|
+
# Bioinformatics artifacts (large; never commit)
|
|
19
|
+
*.fasta
|
|
20
|
+
*.fa
|
|
21
|
+
*.fai
|
|
22
|
+
*.gzi
|
|
23
|
+
*.bam
|
|
24
|
+
*.bai
|
|
25
|
+
*.vcf
|
|
26
|
+
*.vcf.gz
|
|
27
|
+
*.vcf.bgz
|
|
28
|
+
*.tbi
|
|
29
|
+
*.csi
|
|
30
|
+
flanking_sequences.fasta
|
|
31
|
+
|
|
32
|
+
# OS / editor
|
|
33
|
+
.DS_Store
|
|
34
|
+
*.swp
|
|
35
|
+
.idea/
|
|
36
|
+
.vscode/
|
|
37
|
+
|
|
38
|
+
# Claude Code local (per-user) settings
|
|
39
|
+
.claude/settings.local.json
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Run: pre-commit install (then hooks run on `git commit`)
|
|
2
|
+
# Or: pre-commit run --all-files
|
|
3
|
+
repos:
|
|
4
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
5
|
+
rev: v0.8.4
|
|
6
|
+
hooks:
|
|
7
|
+
- id: ruff
|
|
8
|
+
args: [--fix]
|
|
9
|
+
- id: ruff-format
|
|
10
|
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
|
11
|
+
rev: v1.13.0
|
|
12
|
+
hooks:
|
|
13
|
+
- id: mypy
|
|
14
|
+
files: ^src/vflank/(core|io)/
|
|
15
|
+
additional_dependencies: [types-setuptools]
|
|
16
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
17
|
+
rev: v5.0.0
|
|
18
|
+
hooks:
|
|
19
|
+
- id: end-of-file-fixer
|
|
20
|
+
- id: trailing-whitespace
|
|
21
|
+
- id: check-yaml
|
|
22
|
+
- id: check-toml
|
|
23
|
+
- id: check-added-large-files
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format is based on
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project
|
|
5
|
+
adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
## [0.1.0] - 2026-06-12
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- Small-variant flank extraction from MAF (`vflank small run`), with `inspect`
|
|
13
|
+
and `list-vcf` helpers.
|
|
14
|
+
- Common-SNP masking from two interchangeable backends (`--pop-source`):
|
|
15
|
+
local gnomAD VCFs and the gnomAD GraphQL API (no download).
|
|
16
|
+
- `--pop-data {genome,exome,both}` for both backends.
|
|
17
|
+
- Per-variant deduplication keyed on `CHR_POS_REF_ALT` (`--dedup/--no-dedup`).
|
|
18
|
+
- Structural-variant junction extraction (`vflank fusion run`) from the simple
|
|
19
|
+
iCallSV/iAnnotateSV breakpoint TSV (columns matched by name), with
|
|
20
|
+
reverse-complement-aware junction construction and optional flank masking.
|
|
21
|
+
- Genome-build guard (hg19/hg38 vs FASTA), flank-truncation detection, and a
|
|
22
|
+
categorised skip summary + optional TSV run report.
|
|
23
|
+
- Documentation site (MkDocs Material) and GitHub Actions CI.
|
|
24
|
+
|
|
25
|
+
[Unreleased]: https://github.com/rhshah/vFlank/compare/v0.1.0...HEAD
|
|
26
|
+
[0.1.0]: https://github.com/rhshah/vFlank/releases/tag/v0.1.0
|
vflank-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# CLAUDE.md — working guide for the vflank repository
|
|
2
|
+
|
|
3
|
+
## What this project is
|
|
4
|
+
|
|
5
|
+
`vflank` is the **variant-aware, masked-flank front-end** of a ddPCR
|
|
6
|
+
assay-design pipeline. It extracts the sequence flanking genomic variants and
|
|
7
|
+
masks positions that would compromise a primer/probe, then emits clean target
|
|
8
|
+
sequences.
|
|
9
|
+
|
|
10
|
+
It is **not** a primer designer. Design is delegated downstream — Olivar
|
|
11
|
+
(small-variant amplicons) and Primer3 (fusion-junction probes) — invoked
|
|
12
|
+
out-of-process. Do not add a primer/probe design algorithm to this package; add
|
|
13
|
+
*emit formats* that feed those tools. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)
|
|
14
|
+
for the full plan, scope boundary, and milestone roadmap.
|
|
15
|
+
|
|
16
|
+
## Repository layout
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
src/vflank/
|
|
20
|
+
├── core/ chrom · variant · flanks · popfreq · popfreq_api · fusion · skips
|
|
21
|
+
├── io/ maf · reference · fasta · breakpoints · report ← file access
|
|
22
|
+
├── cli/ app (root) · small (run/inspect/list-vcf) · fusion (run)
|
|
23
|
+
├── logging.py · errors.py
|
|
24
|
+
tests/ unit/ · integration/
|
|
25
|
+
docs/ARCHITECTURE.md ← design & roadmap
|
|
26
|
+
docs/research/ ← gnomAD-API & SV/VCF design notes
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
The original reference scripts (`get_flanking_sequence.py`,
|
|
30
|
+
`design_fusion_primers.py`, `config_ES_CTDNA_03.cfg`) have been fully ported /
|
|
31
|
+
re-implemented and removed. The conventions extracted from them live in
|
|
32
|
+
[docs/research/](docs/research/) (corrected fusion-junction model, iCallSV
|
|
33
|
+
strand mapping, etc.).
|
|
34
|
+
|
|
35
|
+
## Quality gate — run before declaring any change done
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
python -m ruff check src tests
|
|
39
|
+
python -m mypy src/vflank/core src/vflank/io
|
|
40
|
+
python -m pytest
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
All three must pass. Tests run without installing the package (`pyproject.toml`
|
|
44
|
+
sets `pythonpath = ["src"]`). The dev environment is mambaforge Python 3.10 with
|
|
45
|
+
`typer`, `rich`, `pysam`, `pandas`, `pytest`, `ruff`, `mypy` available.
|
|
46
|
+
|
|
47
|
+
## Working discipline (the bar for this repo)
|
|
48
|
+
|
|
49
|
+
- **Review before and after every change.** Before: read the surrounding code
|
|
50
|
+
and run the gate. After: re-run the gate; re-read the diff for duplication,
|
|
51
|
+
dead code, and unused symbols.
|
|
52
|
+
- **No silent failures.** Every error path must surface — raise a typed
|
|
53
|
+
`VflankError`, or log at an appropriate level, or record and report in the run
|
|
54
|
+
summary. Do not swallow exceptions or return empty results without a log. The
|
|
55
|
+
existing patterns: flank truncation at contig ends is detected and reported;
|
|
56
|
+
contig-absent in a VCF is logged at DEBUG; build mismatch warns.
|
|
57
|
+
- **No dead or duplicated code.** If you remove the last caller of something,
|
|
58
|
+
remove the thing. Don't add an exception class / helper "for later."
|
|
59
|
+
- **Keep the hot kernels pure.** `chrom.normalise_chrom`, `popfreq.parse_common_snp_positions`,
|
|
60
|
+
and `flanks.mask_sequence` are pure functions over plain values so they are
|
|
61
|
+
unit-testable without pysam and can later be ported to Rust. Preserve that.
|
|
62
|
+
- **Update comments, logging, and tests with the code** — not as an afterthought.
|
|
63
|
+
|
|
64
|
+
## Coordinate conventions (the #1 source of bugs — read before editing flanks)
|
|
65
|
+
|
|
66
|
+
- **MAF coordinates are 1-based, fully-closed `[start, end]`.**
|
|
67
|
+
- **pysam (`FastaFile.fetch`, tabix) is 0-based, half-open `[start, end)`.**
|
|
68
|
+
- Flank math (see `core/flanks.ReferenceFlankSource.fetch`):
|
|
69
|
+
- left flank = `fa.fetch(chrom, max(0, start-flank-1), start-1)` — bases *before* the variant
|
|
70
|
+
- right flank = `fa.fetch(chrom, end, end+flank)` — bases *after* the variant
|
|
71
|
+
- the variant interval `[start, end]` itself is excluded from both flanks
|
|
72
|
+
- Masking maps a 1-based VCF position back to a 0-based index *within a flank*:
|
|
73
|
+
`idx = pos - region_start_0based - 1` (see `flanks.mask_sequence`).
|
|
74
|
+
- pysam silently returns a **short string** when a window runs off a contig end —
|
|
75
|
+
always treat a shorter-than-requested flank as a real condition to report.
|
|
76
|
+
|
|
77
|
+
## Domain knowledge
|
|
78
|
+
|
|
79
|
+
For the deeper biology (why masking matters for ddPCR, gnomAD AF semantics across
|
|
80
|
+
builds, masking modes A–D, indel/het caveats), invoke the `ddpcr-conventions`
|
|
81
|
+
skill — it carries the reference detail that doesn't belong in always-on context.
|
|
82
|
+
|
|
83
|
+
## Conventions
|
|
84
|
+
|
|
85
|
+
- **Chromosome handling:** the canonical internal form is the *bare* chromosome
|
|
86
|
+
(`"7"`, `"X"`, `"MT"`). Normalise MAF input with `core/chrom.normalise_chrom`;
|
|
87
|
+
convert to a file's notation only at fetch time via `ReferenceFasta.contig` /
|
|
88
|
+
the gnomAD store. Notation (`chr1` vs `1`) is auto-detected per file.
|
|
89
|
+
- **Output:** two FASTA records per variant — raw and `Masked__…` — with the
|
|
90
|
+
variant shown literally as `[REF/ALT]` between the flanks. Only single-base
|
|
91
|
+
population SNPs are masked.
|
|
92
|
+
- **CLI:** Typer; status/Rich output goes to **stderr** (the `console` in
|
|
93
|
+
`logging.py`); stdout/files are for data. Library/diagnostic messages go
|
|
94
|
+
through the `vflank` logger; the CLI presents the formatted summary.
|
|
95
|
+
- **Errors:** raise `VflankError` subclasses (`errors.py`) for user-facing
|
|
96
|
+
failures; the CLI catches them and prints a clean message.
|
|
97
|
+
|
|
98
|
+
## Git
|
|
99
|
+
|
|
100
|
+
- Work on a branch; the foundation is committed on `main`.
|
|
101
|
+
- End commit messages with: `Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>`
|
|
102
|
+
- Commit/push only when asked.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Contributing to vflank
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in improving vflank! This is the short version; the
|
|
4
|
+
full [Developer Guide](docs/DEVELOPER.md) covers setup, layout, and how to
|
|
5
|
+
extend the package.
|
|
6
|
+
|
|
7
|
+
## Quick start
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
git clone https://github.com/rhshah/vFlank.git
|
|
11
|
+
cd vFlank
|
|
12
|
+
pip install -e ".[dev]"
|
|
13
|
+
python -m pytest # tests resolve `vflank` from src/ automatically
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## The quality gate (run before every PR)
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
python -m ruff check src tests
|
|
20
|
+
python -m mypy src/vflank/core src/vflank/io
|
|
21
|
+
python -m pytest
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
All three must pass; CI runs ruff and mypy on Linux (Python 3.12) and pytest on Linux + macOS across Python 3.10–3.12.
|
|
25
|
+
|
|
26
|
+
## Conventions
|
|
27
|
+
|
|
28
|
+
- Work on a feature branch; keep `core/` pure and I/O-free.
|
|
29
|
+
- No silent failures — surface every error path (raise / log / report).
|
|
30
|
+
- Add or update tests in the matching `tests/` subtree.
|
|
31
|
+
- Match the surrounding style; the coordinate conventions in
|
|
32
|
+
[CLAUDE.md](CLAUDE.md) are the #1 thing to get right when touching flanks.
|
|
33
|
+
- Write commit messages and PR descriptions with a clear, present-tense summary.
|
|
34
|
+
|
|
35
|
+
## Reporting issues
|
|
36
|
+
|
|
37
|
+
Open an issue at https://github.com/rhshah/vFlank/issues with a minimal
|
|
38
|
+
reproduction (a few-line MAF/TSV + the command) where possible.
|
vflank-0.1.0/Dockerfile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# syntax=docker/dockerfile:1
|
|
2
|
+
|
|
3
|
+
# --- Build the wheel ---
|
|
4
|
+
FROM python:3.12-slim AS builder
|
|
5
|
+
WORKDIR /src
|
|
6
|
+
COPY pyproject.toml README.md LICENSE ./
|
|
7
|
+
COPY src ./src
|
|
8
|
+
RUN pip install --no-cache-dir build && python -m build --wheel --outdir /dist
|
|
9
|
+
|
|
10
|
+
# --- Runtime ---
|
|
11
|
+
FROM python:3.12-slim
|
|
12
|
+
LABEL org.opencontainers.image.source="https://github.com/rhshah/vFlank" \
|
|
13
|
+
org.opencontainers.image.description="Variant-aware flanking-sequence extraction and masking for ddPCR assay design" \
|
|
14
|
+
org.opencontainers.image.licenses="Apache-2.0"
|
|
15
|
+
|
|
16
|
+
# pysam ships manylinux wheels with htslib bundled, so no system htslib needed.
|
|
17
|
+
COPY --from=builder /dist/*.whl /tmp/
|
|
18
|
+
RUN pip install --no-cache-dir /tmp/*.whl && rm -rf /tmp/*.whl
|
|
19
|
+
|
|
20
|
+
# Reference FASTAs / VCFs are mounted at runtime, e.g.:
|
|
21
|
+
# docker run --rm -v "$PWD:/data" ghcr.io/rhshah/vflank \
|
|
22
|
+
# small run /data/variants.maf -r /data/GRCh37.fasta -g hg19 -o /data/out.fasta
|
|
23
|
+
ENTRYPOINT ["vflank"]
|
|
24
|
+
CMD ["--help"]
|