vflank 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. vflank-0.1.0/.claude/skills/ddpcr-conventions/SKILL.md +144 -0
  2. vflank-0.1.0/.dockerignore +28 -0
  3. vflank-0.1.0/.github/workflows/ci.yml +47 -0
  4. vflank-0.1.0/.github/workflows/docs.yml +41 -0
  5. vflank-0.1.0/.github/workflows/release.yml +61 -0
  6. vflank-0.1.0/.gitignore +39 -0
  7. vflank-0.1.0/.pre-commit-config.yaml +23 -0
  8. vflank-0.1.0/CHANGELOG.md +26 -0
  9. vflank-0.1.0/CLAUDE.md +102 -0
  10. vflank-0.1.0/CONTRIBUTING.md +38 -0
  11. vflank-0.1.0/Dockerfile +24 -0
  12. vflank-0.1.0/LICENSE +201 -0
  13. vflank-0.1.0/PKG-INFO +154 -0
  14. vflank-0.1.0/README.md +113 -0
  15. vflank-0.1.0/docs/ARCHITECTURE.md +109 -0
  16. vflank-0.1.0/docs/DEVELOPER.md +245 -0
  17. vflank-0.1.0/docs/changelog.md +1 -0
  18. vflank-0.1.0/docs/getting-started/installation.md +44 -0
  19. vflank-0.1.0/docs/getting-started/quickstart.md +58 -0
  20. vflank-0.1.0/docs/index.md +47 -0
  21. vflank-0.1.0/docs/reference/api.md +57 -0
  22. vflank-0.1.0/docs/reference/cli.md +7 -0
  23. vflank-0.1.0/docs/research/gnomad-api.md +225 -0
  24. vflank-0.1.0/docs/research/sv-vcf-input.md +123 -0
  25. vflank-0.1.0/docs/user-guide/fusions.md +64 -0
  26. vflank-0.1.0/docs/user-guide/masking.md +71 -0
  27. vflank-0.1.0/docs/user-guide/small-variants.md +66 -0
  28. vflank-0.1.0/mkdocs.yml +98 -0
  29. vflank-0.1.0/pyproject.toml +82 -0
  30. vflank-0.1.0/src/vflank/__init__.py +3 -0
  31. vflank-0.1.0/src/vflank/cli/__init__.py +1 -0
  32. vflank-0.1.0/src/vflank/cli/_masking.py +44 -0
  33. vflank-0.1.0/src/vflank/cli/app.py +45 -0
  34. vflank-0.1.0/src/vflank/cli/fusion.py +182 -0
  35. vflank-0.1.0/src/vflank/cli/small.py +463 -0
  36. vflank-0.1.0/src/vflank/core/__init__.py +1 -0
  37. vflank-0.1.0/src/vflank/core/chrom.py +120 -0
  38. vflank-0.1.0/src/vflank/core/flanks.py +99 -0
  39. vflank-0.1.0/src/vflank/core/fusion.py +105 -0
  40. vflank-0.1.0/src/vflank/core/popfreq.py +227 -0
  41. vflank-0.1.0/src/vflank/core/popfreq_api.py +204 -0
  42. vflank-0.1.0/src/vflank/core/skips.py +27 -0
  43. vflank-0.1.0/src/vflank/core/variant.py +52 -0
  44. vflank-0.1.0/src/vflank/errors.py +27 -0
  45. vflank-0.1.0/src/vflank/io/__init__.py +1 -0
  46. vflank-0.1.0/src/vflank/io/breakpoints.py +107 -0
  47. vflank-0.1.0/src/vflank/io/fasta.py +44 -0
  48. vflank-0.1.0/src/vflank/io/maf.py +133 -0
  49. vflank-0.1.0/src/vflank/io/reference.py +80 -0
  50. vflank-0.1.0/src/vflank/io/report.py +35 -0
  51. vflank-0.1.0/src/vflank/logging.py +57 -0
  52. vflank-0.1.0/src/vflank/py.typed +0 -0
  53. vflank-0.1.0/tests/integration/test_fusion_pipeline.py +63 -0
  54. vflank-0.1.0/tests/integration/test_small_pipeline.py +255 -0
  55. vflank-0.1.0/tests/unit/test_breakpoints.py +61 -0
  56. vflank-0.1.0/tests/unit/test_chrom.py +54 -0
  57. vflank-0.1.0/tests/unit/test_flanks.py +50 -0
  58. vflank-0.1.0/tests/unit/test_fusion.py +97 -0
  59. vflank-0.1.0/tests/unit/test_popfreq.py +87 -0
  60. vflank-0.1.0/tests/unit/test_popfreq_api.py +147 -0
  61. vflank-0.1.0/tests/unit/test_report.py +25 -0
  62. vflank-0.1.0/tests/unit/test_skips.py +13 -0
  63. vflank-0.1.0/tests/unit/test_variant.py +23 -0
@@ -0,0 +1,144 @@
1
+ ---
2
+ name: ddpcr-conventions
3
+ description: ddPCR assay-design domain reference for the vflank repo — genomic coordinate systems (1-based MAF vs 0-based pysam), MAF column semantics, gnomAD allele-frequency masking, flank masking modes A–D, heterozygous/consensus handling, and fusion-junction logic. Use when editing or reasoning about flank extraction, coordinate math, SNP/population-frequency masking, BAM consensus, or fusion breakpoints in this codebase.
4
+ ---
5
+
6
+ # ddPCR conventions & domain reference
7
+
8
+ Background knowledge for working on vflank correctly. The code-level conventions
9
+ live in `CLAUDE.md`; this is the *why* and the biology that prevents silent,
10
+ expensive mistakes.
11
+
12
+ ## What ddPCR needs from us, and why masking matters
13
+
14
+ Droplet Digital PCR quantifies a target by partitioning DNA into thousands of
15
+ droplets and amplifying with a primer pair + a sequence-specific probe. The
16
+ assay only works if the **primers and probe anneal cleanly to the template that
17
+ is actually present in the patient.**
18
+
19
+ The failure mode vflank exists to prevent: if a primer or probe sits over a
20
+ **polymorphic position** (a common germline SNP, or a patient-private variant),
21
+ then in patients carrying the alternate allele the oligo mismatches → reduced
22
+ binding, allele dropout, or assay failure. So we **mask** positions that vary,
23
+ turning them into `N` (or an IUPAC code) so the downstream designer avoids
24
+ placing an oligo 3′ end there. We mask the **flanks** (primer-landing territory);
25
+ the variant of interest itself is always shown literally as `[REF/ALT]` — that's
26
+ the target, not something to hide.
27
+
28
+ Default flank is ±200 bp, matching ddPCR amplicon scale (~60–200 bp products,
29
+ designed from a wider candidate window).
30
+
31
+ ## Coordinate systems — get this wrong and the sequence is silently wrong
32
+
33
+ - **MAF**: 1-based, fully-closed `[start, end]`. A SNP has `start == end`.
34
+ - **pysam** (`FastaFile.fetch`, tabix `fetch`): 0-based, half-open `[start, end)`.
35
+ - **VCF** `POS`: 1-based.
36
+
37
+ Flank extraction (`core/flanks.ReferenceFlankSource`):
38
+ ```
39
+ left = fetch(chrom, max(0, start - flank - 1), start - 1) # bases before variant
40
+ right = fetch(chrom, end, end + flank) # bases after variant
41
+ ```
42
+ Masking a 1-based VCF position into a flank string:
43
+ ```
44
+ idx = pos - region_start_0based - 1
45
+ ```
46
+ **pysam never raises on an over-run** — it returns a truncated string at contig
47
+ ends. A flank shorter than requested is a real condition; report it, never drop
48
+ the record silently.
49
+
50
+ ## MAF column semantics (TCGA/MSK)
51
+
52
+ Required: `Chromosome`, `Start_Position`, `End_Position`, `Reference_Allele`,
53
+ `Tumor_Seq_Allele2` (the somatic alt — *Allele2*, not Allele1). Metadata used in
54
+ headers: `Hugo_Symbol`, `HGVSp_Short`, `HGVSc`, `Tumor_Sample_Barcode`.
55
+
56
+ Allele encoding: `-` denotes an empty allele (insertion has `Reference_Allele = -`;
57
+ deletion has `Tumor_Seq_Allele2 = -`). Chromosome may arrive as `7`, `chr7`,
58
+ `Chr7`, numeric `23/24/25` (X/Y/MT), or `M/chrM`; `core/chrom.normalise_chrom`
59
+ canonicalises all of these to the bare form.
60
+
61
+ ## gnomAD allele-frequency masking
62
+
63
+ We mask a flank position if a gnomAD variant there is a **single-base
64
+ substitution** (REF length 1, every ALT length 1) whose **max AF ≥ threshold**
65
+ (default 0.001 = 0.1%). Indels are deliberately *not* masked (conservative —
66
+ avoids over-masking primer territory).
67
+
68
+ AF fields differ by build/release:
69
+ - **gnomAD v4.1 (hg38)** exposes both `AF` and `AF_grpmax`; we take the max.
70
+ - **gnomAD v2.1.1 (hg19)** has `AF` but **no `AF_grpmax`** — only `AF` is used.
71
+ - Guard against `.` and `NaN` AF tokens (the parser rejects `NaN` via `f == f`).
72
+
73
+ Files are per-chromosome bgzipped VCFs + tabix index; resolution is by known
74
+ filename patterns (`core/popfreq.GNOMAD_PATTERNS`, keyed by build then
75
+ genome/exome). A contig legitimately absent from a VCF (e.g. MT) is logged at
76
+ DEBUG, not failed.
77
+
78
+ ### Two backends and `--pop-data` (genome / exome / both)
79
+
80
+ The mask comes from one of two interchangeable sources (same `get_positions`
81
+ interface, selected by `--pop-source`):
82
+ - **`vcf`** (`GnomadStore`) — local per-chromosome VCFs. Reproducible, bulk, HPC.
83
+ - **`api`** (`GnomadApiSource`) — gnomAD GraphQL API, no download, rate-limited
84
+ to ~10 req/IP/60s → small cohorts only. Region cache + throttle + backoff.
85
+
86
+ `--pop-data {genome,exome,both}` (default `genome`): flanks often fall in
87
+ non-coding regions where **only genomes have data**, so genome is the default;
88
+ `exome` adds power in coding regions; `both` masks the **union** (common in
89
+ either cohort). gnomAD v4 (hg38) is the only build with a pooled `joint` set, so we
90
+ use the max-AF union for `both` on every build to keep a single code path.
91
+
92
+ API AF rule (no per-population `af` field): `af = max(seq.af, max(ac/an over
93
+ populations))` across the chosen kinds; SNPs only. Build → dataset:
94
+ hg19 → `gnomad_r2_1`/GRCh37, hg38 → `gnomad_r4`/GRCh38; region query is 1-based
95
+ inclusive. Requesting `--pop-data exome`/`both` on the VCF source without the
96
+ exome files **fails fast** (`GnomadStore.preflight`) — never a silent
97
+ genome-only fallback.
98
+
99
+ ## Flank source modes (the `FlankSource` strategy)
100
+
101
+ | Mode | Inputs | Flank source | Masking |
102
+ |------|--------|--------------|---------|
103
+ | A | MAF + FASTA | reference | none |
104
+ | B | + gnomAD | reference | common SNPs → N |
105
+ | C | + sample BAM | patient consensus, reference fallback at low depth | het / low-confidence → N or IUPAC |
106
+ | D | all | patient consensus | gnomAD ∪ observed-het |
107
+
108
+ A and B are implemented (`ReferenceFlankSource`). C/D are the differentiator:
109
+ patient consensus catches **private/rare** variants gnomAD never sees — the ones
110
+ that silently break a primer for one specific patient.
111
+
112
+ ### Consensus (modes C/D) plan
113
+ - Build from BAM via `bcftools mpileup → call → consensus --iupac-codes`, or
114
+ pysam pileup. Validate against `samtools consensus` as an oracle.
115
+ - A position is **heterozygous** (→ mask) if the second-most-common allele
116
+ exceeds ~25–30% with sufficient depth; below a min-depth threshold, fall back
117
+ to the reference base and flag it.
118
+ - Indels in the pileup shift coordinates — the hard part. `kindel` is the
119
+ reference implementation for CIGAR-described indel reconciliation, but it is
120
+ haploid/clonal (viral) and does **not** flag heterozygosity, so its majority
121
+ call must be augmented with diploid het detection.
122
+
123
+ ## Fusion / structural-variant path (SV)
124
+
125
+ A gene fusion is defined by two breakpoints on (possibly) different chromosomes.
126
+ The chimeric **junction sequence** is the only fusion-specific template, so the
127
+ ddPCR probe must **span the junction**. vflank builds it in `core/fusion.py`:
128
+ fetch each partner's flank, orient per the strand bit (`0`=plus, `1`=minus),
129
+ concatenate across the breakpoint with **no separator** (an early `"-"` join
130
+ corrupted the junction base — never insert non-ACGT). Masking is applied in
131
+ genomic space *before* reverse-complement (`revcomp(N)=N`). The corrected
132
+ junction model and the iCallSV `CT`→strand mapping are in
133
+ docs/research/sv-vcf-input.md. Probe design over the junction is delegated to
134
+ Primer3 (not implemented yet).
135
+
136
+ ## Downstream emit formats (where vflank stops)
137
+
138
+ - **Olivar** (small-variant amplicons) consumes a FASTA + a SNP CSV with columns
139
+ `START, STOP, FREQ` (1-based). vflank's gnomAD scan and BAM-het detection both
140
+ produce exactly this — `--emit-olivar` is the integration seam. The `FREQ`
141
+ column is where patient-specific risk (from a BAM) gets injected.
142
+ - **Primer3** handles the fusion-junction probe.
143
+
144
+ vflank produces inputs for these tools; it does not design primers itself.
@@ -0,0 +1,28 @@
1
+ .git
2
+ .github
3
+ .claude
4
+ docs
5
+ site
6
+ tests
7
+ *.egg-info
8
+ __pycache__
9
+ *.py[cod]
10
+ .pytest_cache
11
+ .mypy_cache
12
+ .ruff_cache
13
+ .venv
14
+ venv
15
+ .coverage
16
+ htmlcov
17
+ # data files
18
+ *.fasta
19
+ *.fa
20
+ *.fai
21
+ *.gzi
22
+ *.bam
23
+ *.bai
24
+ *.vcf
25
+ *.vcf.gz
26
+ *.vcf.bgz
27
+ *.tbi
28
+ *.csi
@@ -0,0 +1,47 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ permissions:
9
+ contents: read
10
+
11
+ concurrency:
12
+ group: ci-${{ github.ref }}
13
+ cancel-in-progress: true
14
+
15
+ jobs:
16
+ lint:
17
+ name: Lint & type-check
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ - uses: actions/setup-python@v5
22
+ with:
23
+ python-version: "3.12"
24
+ cache: pip
25
+ - run: pip install -e ".[dev]"
26
+ - name: Ruff
27
+ run: ruff check src tests
28
+ - name: Mypy
29
+ run: mypy src/vflank/core src/vflank/io
30
+
31
+ test:
32
+ name: Test (${{ matrix.os }}, py${{ matrix.python-version }})
33
+ strategy:
34
+ fail-fast: false
35
+ matrix:
36
+ os: [ubuntu-latest, macos-latest]
37
+ python-version: ["3.10", "3.11", "3.12"]
38
+ runs-on: ${{ matrix.os }}
39
+ steps:
40
+ - uses: actions/checkout@v4
41
+ - uses: actions/setup-python@v5
42
+ with:
43
+ python-version: ${{ matrix.python-version }}
44
+ cache: pip
45
+ - run: pip install -e ".[dev]"
46
+ - name: Pytest
47
+ run: pytest --cov=vflank --cov-report=term-missing
@@ -0,0 +1,41 @@
1
+ name: Docs
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ tags: ["v*"]
7
+ workflow_dispatch:
8
+
9
+ permissions:
10
+ contents: write
11
+
12
+ jobs:
13
+ deploy:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ with:
18
+ fetch-depth: 0 # mike needs full history + the gh-pages branch
19
+ - uses: actions/setup-python@v5
20
+ with:
21
+ python-version: "3.12"
22
+ cache: pip
23
+ - run: pip install -e ".[docs]"
24
+ - name: Configure git
25
+ run: |
26
+ git config user.name "github-actions[bot]"
27
+ git config user.email "github-actions[bot]@users.noreply.github.com"
28
+ git fetch origin gh-pages --depth=1 || true
29
+
30
+ # main -> the rolling "dev" docs
31
+ - name: Deploy dev docs
32
+ if: github.ref == 'refs/heads/main'
33
+ run: mike deploy --push --update-aliases dev
34
+
35
+ # vX.Y.Z tag -> that version, aliased "latest" and set as default
36
+ - name: Deploy release docs
37
+ if: startsWith(github.ref, 'refs/tags/v')
38
+ run: |
39
+ VERSION="${GITHUB_REF_NAME#v}"
40
+ mike deploy --push --update-aliases "$VERSION" latest
41
+ mike set-default --push latest
@@ -0,0 +1,61 @@
1
+ name: Release
2
+
3
+ # Fires when a GitHub Release is published (its tag, e.g. v0.1.0, drives the
4
+ # version) or when run manually (workflow_dispatch). Publishes the package to PyPI and the image to GHCR.
5
+ on:
6
+ release:
7
+ types: [published]
8
+ workflow_dispatch:
9
+
10
+ jobs:
11
+ pypi:
12
+ name: Publish to PyPI
13
+ runs-on: ubuntu-latest
14
+ permissions:
15
+ contents: read # required for actions/checkout (permissions: block is exhaustive)
16
+ id-token: write # OIDC for PyPI Trusted Publishing (no API token needed)
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+ - uses: actions/setup-python@v5
20
+ with:
21
+ python-version: "3.12"
22
+ - name: Build sdist + wheel
23
+ run: |
24
+ pip install build
25
+ python -m build
26
+ - name: Publish
27
+ uses: pypa/gh-action-pypi-publish@release/v1
28
+
29
+ docker:
30
+ name: Publish image to GHCR
31
+ runs-on: ubuntu-latest
32
+ permissions:
33
+ contents: read
34
+ packages: write
35
+ steps:
36
+ - uses: actions/checkout@v4
37
+ - uses: docker/setup-buildx-action@v3
38
+ - name: Log in to GHCR
39
+ uses: docker/login-action@v3
40
+ with:
41
+ registry: ghcr.io
42
+ username: ${{ github.actor }}
43
+ password: ${{ secrets.GITHUB_TOKEN }}
44
+ - name: Image metadata (tags + labels)
45
+ id: meta
46
+ uses: docker/metadata-action@v5
47
+ with:
48
+ images: ghcr.io/${{ github.repository }} # lowercased automatically
49
+ tags: |
50
+ type=semver,pattern={{version}}
51
+ type=semver,pattern={{major}}.{{minor}}
52
+ type=raw,value=latest
53
+ - name: Build and push
54
+ uses: docker/build-push-action@v6
55
+ with:
56
+ context: .
57
+ push: true
58
+ tags: ${{ steps.meta.outputs.tags }}
59
+ labels: ${{ steps.meta.outputs.labels }}
60
+ cache-from: type=gha
61
+ cache-to: type=gha,mode=max
@@ -0,0 +1,39 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .venv/
9
+ venv/
10
+
11
+ # Tooling caches
12
+ .pytest_cache/
13
+ .mypy_cache/
14
+ .ruff_cache/
15
+ .coverage
16
+ htmlcov/
17
+
18
+ # Bioinformatics artifacts (large; never commit)
19
+ *.fasta
20
+ *.fa
21
+ *.fai
22
+ *.gzi
23
+ *.bam
24
+ *.bai
25
+ *.vcf
26
+ *.vcf.gz
27
+ *.vcf.bgz
28
+ *.tbi
29
+ *.csi
30
+ flanking_sequences.fasta
31
+
32
+ # OS / editor
33
+ .DS_Store
34
+ *.swp
35
+ .idea/
36
+ .vscode/
37
+
38
+ # Claude Code local (per-user) settings
39
+ .claude/settings.local.json
@@ -0,0 +1,23 @@
1
+ # Run: pre-commit install (then hooks run on `git commit`)
2
+ # Or: pre-commit run --all-files
3
+ repos:
4
+ - repo: https://github.com/astral-sh/ruff-pre-commit
5
+ rev: v0.8.4
6
+ hooks:
7
+ - id: ruff
8
+ args: [--fix]
9
+ - id: ruff-format
10
+ - repo: https://github.com/pre-commit/mirrors-mypy
11
+ rev: v1.13.0
12
+ hooks:
13
+ - id: mypy
14
+ files: ^src/vflank/(core|io)/
15
+ additional_dependencies: [types-setuptools]
16
+ - repo: https://github.com/pre-commit/pre-commit-hooks
17
+ rev: v5.0.0
18
+ hooks:
19
+ - id: end-of-file-fixer
20
+ - id: trailing-whitespace
21
+ - id: check-yaml
22
+ - id: check-toml
23
+ - id: check-added-large-files
@@ -0,0 +1,26 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format is based on
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project
5
+ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [Unreleased]
8
+
9
+ ## [0.1.0] - 2026-06-12
10
+
11
+ ### Added
12
+ - Small-variant flank extraction from MAF (`vflank small run`), with `inspect`
13
+ and `list-vcf` helpers.
14
+ - Common-SNP masking from two interchangeable backends (`--pop-source`):
15
+ local gnomAD VCFs and the gnomAD GraphQL API (no download).
16
+ - `--pop-data {genome,exome,both}` for both backends.
17
+ - Per-variant deduplication keyed on `CHR_POS_REF_ALT` (`--dedup/--no-dedup`).
18
+ - Structural-variant junction extraction (`vflank fusion run`) from the simple
19
+ iCallSV/iAnnotateSV breakpoint TSV (columns matched by name), with
20
+ reverse-complement-aware junction construction and optional flank masking.
21
+ - Genome-build guard (hg19/hg38 vs FASTA), flank-truncation detection, and a
22
+ categorised skip summary + optional TSV run report.
23
+ - Documentation site (MkDocs Material) and GitHub Actions CI.
24
+
25
+ [Unreleased]: https://github.com/rhshah/vFlank/compare/v0.1.0...HEAD
26
+ [0.1.0]: https://github.com/rhshah/vFlank/releases/tag/v0.1.0
vflank-0.1.0/CLAUDE.md ADDED
@@ -0,0 +1,102 @@
1
+ # CLAUDE.md — working guide for the vflank repository
2
+
3
+ ## What this project is
4
+
5
+ `vflank` is the **variant-aware, masked-flank front-end** of a ddPCR
6
+ assay-design pipeline. It extracts the sequence flanking genomic variants and
7
+ masks positions that would compromise a primer/probe, then emits clean target
8
+ sequences.
9
+
10
+ It is **not** a primer designer. Design is delegated downstream — Olivar
11
+ (small-variant amplicons) and Primer3 (fusion-junction probes) — invoked
12
+ out-of-process. Do not add a primer/probe design algorithm to this package; add
13
+ *emit formats* that feed those tools. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)
14
+ for the full plan, scope boundary, and milestone roadmap.
15
+
16
+ ## Repository layout
17
+
18
+ ```
19
+ src/vflank/
20
+ ├── core/ chrom · variant · flanks · popfreq · popfreq_api · fusion · skips
21
+ ├── io/ maf · reference · fasta · breakpoints · report ← file access
22
+ ├── cli/ app (root) · small (run/inspect/list-vcf) · fusion (run)
23
+ └── logging.py · errors.py
24
+ tests/ unit/ · integration/
25
+ docs/ARCHITECTURE.md ← design & roadmap
26
+ docs/research/ ← gnomAD-API & SV/VCF design notes
27
+ ```
28
+
29
+ The original reference scripts (`get_flanking_sequence.py`,
30
+ `design_fusion_primers.py`, `config_ES_CTDNA_03.cfg`) have been fully ported /
31
+ re-implemented and removed. The conventions extracted from them live in
32
+ [docs/research/](docs/research/) (corrected fusion-junction model, iCallSV
33
+ strand mapping, etc.).
34
+
35
+ ## Quality gate — run before declaring any change done
36
+
37
+ ```bash
38
+ python -m ruff check src tests
39
+ python -m mypy src/vflank/core src/vflank/io
40
+ python -m pytest
41
+ ```
42
+
43
+ All three must pass. Tests run without installing the package (`pyproject.toml`
44
+ sets `pythonpath = ["src"]`). The dev environment is mambaforge Python 3.10 with
45
+ `typer`, `rich`, `pysam`, `pandas`, `pytest`, `ruff`, `mypy` available.
46
+
47
+ ## Working discipline (the bar for this repo)
48
+
49
+ - **Review before and after every change.** Before: read the surrounding code
50
+ and run the gate. After: re-run the gate; re-read the diff for duplication,
51
+ dead code, and unused symbols.
52
+ - **No silent failures.** Every error path must surface — raise a typed
53
+ `VflankError`, or log at an appropriate level, or record and report in the run
54
+ summary. Do not swallow exceptions or return empty results without a log. The
55
+ existing patterns: flank truncation at contig ends is detected and reported;
56
+ contig-absent in a VCF is logged at DEBUG; build mismatch warns.
57
+ - **No dead or duplicated code.** If you remove the last caller of something,
58
+ remove the thing. Don't add an exception class / helper "for later."
59
+ - **Keep the hot kernels pure.** `chrom.normalise_chrom`, `popfreq.parse_common_snp_positions`,
60
+ and `flanks.mask_sequence` are pure functions over plain values so they are
61
+ unit-testable without pysam and can later be ported to Rust. Preserve that.
62
+ - **Update comments, logging, and tests with the code** — not as an afterthought.
63
+
64
+ ## Coordinate conventions (the #1 source of bugs — read before editing flanks)
65
+
66
+ - **MAF coordinates are 1-based, fully-closed `[start, end]`.**
67
+ - **pysam (`FastaFile.fetch`, tabix) is 0-based, half-open `[start, end)`.**
68
+ - Flank math (see `core/flanks.ReferenceFlankSource.fetch`):
69
+ - left flank = `fa.fetch(chrom, max(0, start-flank-1), start-1)` — bases *before* the variant
70
+ - right flank = `fa.fetch(chrom, end, end+flank)` — bases *after* the variant
71
+ - the variant interval `[start, end]` itself is excluded from both flanks
72
+ - Masking maps a 1-based VCF position back to a 0-based index *within a flank*:
73
+ `idx = pos - region_start_0based - 1` (see `flanks.mask_sequence`).
74
+ - pysam silently returns a **short string** when a window runs off a contig end —
75
+ always treat a shorter-than-requested flank as a real condition to report.
76
+
77
+ ## Domain knowledge
78
+
79
+ For the deeper biology (why masking matters for ddPCR, gnomAD AF semantics across
80
+ builds, masking modes A–D, indel/het caveats), invoke the `ddpcr-conventions`
81
+ skill — it carries the reference detail that doesn't belong in always-on context.
82
+
83
+ ## Conventions
84
+
85
+ - **Chromosome handling:** the canonical internal form is the *bare* chromosome
86
+ (`"7"`, `"X"`, `"MT"`). Normalise MAF input with `core/chrom.normalise_chrom`;
87
+ convert to a file's notation only at fetch time via `ReferenceFasta.contig` /
88
+ the gnomAD store. Notation (`chr1` vs `1`) is auto-detected per file.
89
+ - **Output:** two FASTA records per variant — raw and `Masked__…` — with the
90
+ variant shown literally as `[REF/ALT]` between the flanks. Only single-base
91
+ population SNPs are masked.
92
+ - **CLI:** Typer; status/Rich output goes to **stderr** (the `console` in
93
+ `logging.py`); stdout/files are for data. Library/diagnostic messages go
94
+ through the `vflank` logger; the CLI presents the formatted summary.
95
+ - **Errors:** raise `VflankError` subclasses (`errors.py`) for user-facing
96
+ failures; the CLI catches them and prints a clean message.
97
+
98
+ ## Git
99
+
100
+ - Work on a branch; the foundation is committed on `main`.
101
+ - End commit messages with: `Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>`
102
+ - Commit/push only when asked.
@@ -0,0 +1,38 @@
1
+ # Contributing to vflank
2
+
3
+ Thanks for your interest in improving vflank! This is the short version; the
4
+ full [Developer Guide](docs/DEVELOPER.md) covers setup, layout, and how to
5
+ extend the package.
6
+
7
+ ## Quick start
8
+
9
+ ```bash
10
+ git clone https://github.com/rhshah/vFlank.git
11
+ cd vFlank
12
+ pip install -e ".[dev]"
13
+ python -m pytest # tests resolve `vflank` from src/ automatically
14
+ ```
15
+
16
+ ## The quality gate (run before every PR)
17
+
18
+ ```bash
19
+ python -m ruff check src tests
20
+ python -m mypy src/vflank/core src/vflank/io
21
+ python -m pytest
22
+ ```
23
+
24
+ All three must pass; CI runs Ruff and mypy on Linux (Python 3.12) and pytest on Linux + macOS across Python 3.10–3.12.
25
+
26
+ ## Conventions
27
+
28
+ - Work on a feature branch; keep `core/` pure and I/O-free.
29
+ - No silent failures — surface every error path (raise / log / report).
30
+ - Add or update tests in the matching `tests/` subtree.
31
+ - Match the surrounding style; the coordinate conventions in
32
+ [CLAUDE.md](CLAUDE.md) are the #1 thing to get right when touching flanks.
33
+ - Start commit messages and PR descriptions with a clear, present-tense summary.
34
+
35
+ ## Reporting issues
36
+
37
+ Open an issue at https://github.com/rhshah/vFlank/issues with a minimal
38
+ reproduction (a few-line MAF/TSV + the command) where possible.
@@ -0,0 +1,24 @@
1
+ # syntax=docker/dockerfile:1
2
+
3
+ # --- Build the wheel ---
4
+ FROM python:3.12-slim AS builder
5
+ WORKDIR /src
6
+ COPY pyproject.toml README.md LICENSE ./
7
+ COPY src ./src
8
+ RUN pip install --no-cache-dir build && python -m build --wheel --outdir /dist
9
+
10
+ # --- Runtime ---
11
+ FROM python:3.12-slim
12
+ LABEL org.opencontainers.image.source="https://github.com/rhshah/vFlank" \
13
+ org.opencontainers.image.description="Variant-aware flanking-sequence extraction and masking for ddPCR assay design" \
14
+ org.opencontainers.image.licenses="Apache-2.0"
15
+
16
+ # pysam ships manylinux wheels with htslib bundled, so no system htslib needed.
17
+ COPY --from=builder /dist/*.whl /tmp/
18
+ RUN pip install --no-cache-dir /tmp/*.whl && rm -rf /tmp/*.whl
19
+
20
+ # Reference FASTAs / VCFs are mounted at runtime, e.g.:
21
+ # docker run --rm -v "$PWD:/data" ghcr.io/rhshah/vflank \
22
+ # small run /data/variants.maf -r /data/GRCh37.fasta -g hg19 -o /data/out.fasta
23
+ ENTRYPOINT ["vflank"]
24
+ CMD ["--help"]