vflank 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vflank/__init__.py +3 -0
- vflank/cli/__init__.py +1 -0
- vflank/cli/_masking.py +44 -0
- vflank/cli/app.py +45 -0
- vflank/cli/fusion.py +182 -0
- vflank/cli/small.py +463 -0
- vflank/core/__init__.py +1 -0
- vflank/core/chrom.py +120 -0
- vflank/core/flanks.py +99 -0
- vflank/core/fusion.py +105 -0
- vflank/core/popfreq.py +227 -0
- vflank/core/popfreq_api.py +204 -0
- vflank/core/skips.py +27 -0
- vflank/core/variant.py +52 -0
- vflank/errors.py +27 -0
- vflank/io/__init__.py +1 -0
- vflank/io/breakpoints.py +107 -0
- vflank/io/fasta.py +44 -0
- vflank/io/maf.py +133 -0
- vflank/io/reference.py +80 -0
- vflank/io/report.py +35 -0
- vflank/logging.py +57 -0
- vflank/py.typed +0 -0
- vflank-0.1.0.dist-info/METADATA +154 -0
- vflank-0.1.0.dist-info/RECORD +28 -0
- vflank-0.1.0.dist-info/WHEEL +4 -0
- vflank-0.1.0.dist-info/entry_points.txt +2 -0
- vflank-0.1.0.dist-info/licenses/LICENSE +201 -0
vflank/__init__.py
ADDED
vflank/cli/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Command-line interface (Typer)."""
|
vflank/cli/_masking.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Shared construction of the population-frequency masking source for CLIs.
|
|
2
|
+
|
|
3
|
+
Both ``small`` and ``fusion`` select a masking backend the same way; this keeps
|
|
4
|
+
that logic in one place.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from collections.abc import Iterable
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from ..core.popfreq import GnomadStore
|
|
13
|
+
from ..core.popfreq_api import GnomadApiSource
|
|
14
|
+
from ..errors import VflankError
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def validate_pop_options(pop_source: str, pop_data: str) -> None:
    """Reject unsupported ``--pop-source`` / ``--pop-data`` values.

    The backend name is checked first, then the data kind.

    Raises:
        VflankError: if ``pop_source`` is neither ``vcf`` nor ``api``, or if
            ``pop_data`` is not one of ``genome``, ``exome`` or ``both``.
    """
    source_is_known = pop_source == "vcf" or pop_source == "api"
    if not source_is_known:
        raise VflankError(f"--pop-source must be 'vcf' or 'api', got '{pop_source}'")

    data_is_known = pop_data == "genome" or pop_data == "exome" or pop_data == "both"
    if not data_is_known:
        raise VflankError(f"--pop-data must be 'genome', 'exome', or 'both', got '{pop_data}'")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def make_pop_source(
    pop_source: str,
    pop_vcf_dir: Path | None,
    genome_build: str,
    pop_data: str,
    chroms: Iterable[str],
) -> GnomadStore | GnomadApiSource | None:
    """Return the population-frequency masking backend selected on the CLI.

    ``pop_source == "api"`` yields a ``GnomadApiSource``. Otherwise a
    ``GnomadStore`` over ``pop_vcf_dir`` is returned, or ``None`` when no
    directory was supplied (masking disabled).

    The VCF store is preflighted against the de-duplicated, sorted ``chroms``
    so that a requested data kind that is entirely missing fails fast instead
    of silently falling back to genome-only data. ``chroms`` is only consumed
    on the VCF path when a directory is given.
    """
    if pop_source == "api":
        return GnomadApiSource(genome_build, pop_data)

    # VCF backend without a directory: nothing to mask against.
    if pop_vcf_dir is None:
        return None

    store = GnomadStore(pop_vcf_dir, genome_build, pop_data)
    unique_chroms = list(set(chroms))
    unique_chroms.sort()
    if unique_chroms:
        store.preflight(unique_chroms)
    return store
|
vflank/cli/app.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""vflank root CLI.
|
|
2
|
+
|
|
3
|
+
vflank small run ... # extract + mask flanks from a MAF
|
|
4
|
+
vflank small inspect ... # preview MAF columns
|
|
5
|
+
vflank small list-vcf ... # verify gnomAD directory coverage
|
|
6
|
+
vflank fusion run ... # build fusion-junction FASTA from a breakpoint TSV
vflank version
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import typer
|
|
12
|
+
|
|
13
|
+
from .. import __version__
|
|
14
|
+
from ..logging import setup_logging
|
|
15
|
+
from . import fusion, small
|
|
16
|
+
|
|
17
|
+
# Root Typer application for the ``vflank`` command.
app = typer.Typer(
    name="vflank",
    no_args_is_help=True,
    add_completion=False,
    help="Variant-aware flanking-sequence extraction and masking for ddPCR assay design.",
)

# Mount one sub-application per variant class.
app.add_typer(small.app, help="Small-variant (SNP/indel) flank extraction.", name="small")
app.add_typer(fusion.app, help="Structural-variant junction extraction.", name="fusion")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@app.callback()
def main(
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable DEBUG logging."),
    quiet: bool = typer.Option(False, "--quiet", "-q", help="Only show warnings and errors."),
    debug: bool = typer.Option(False, "--debug", help="DEBUG logging + rich tracebacks."),
):
    """Global options applied before any subcommand."""
    # --verbose/--debug take precedence over --quiet when both are passed.
    if verbose or debug:
        level = 1
    elif quiet:
        level = -1
    else:
        level = 0
    setup_logging(level, show_tracebacks=debug)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@app.command()
def version() -> None:
    """Print the vflank version."""
    # The version string comes from the package's top-level ``__version__``.
    typer.echo(__version__)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Run the root Typer app when this module is executed directly.
if __name__ == "__main__":
    app()
|
vflank/cli/fusion.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""`vflank fusion` — structural-variant junction extraction (simple TSV input).
|
|
2
|
+
|
|
3
|
+
Reads a breakpoint table (chr1 pos1 str1 chr2 pos2 str2; columns matched by
|
|
4
|
+
name) and writes one FASTA record per fusion: the chimeric junction sequence a
|
|
5
|
+
ddPCR probe spans. VCF/BND input is a later phase.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import time
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import typer
|
|
14
|
+
from rich.table import Table
|
|
15
|
+
|
|
16
|
+
from ..core.chrom import normalise_chrom
|
|
17
|
+
from ..core.fusion import build_junction
|
|
18
|
+
from ..core.popfreq_api import dataset_for_build
|
|
19
|
+
from ..errors import VflankError
|
|
20
|
+
from ..io import breakpoints as bp_io
|
|
21
|
+
from ..io import fasta as fasta_io
|
|
22
|
+
from ..io.breakpoints import SvColumns
|
|
23
|
+
from ..io.reference import ReferenceFasta
|
|
24
|
+
from ..logging import console
|
|
25
|
+
from ._masking import make_pop_source, validate_pop_options
|
|
26
|
+
|
|
27
|
+
# Typer sub-application for the fusion commands registered below via ``@app.command()``.
app = typer.Typer(no_args_is_help=True)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@app.command()
def run(
    sv_file: Path = typer.Argument(
        ..., exists=True, help="Breakpoint TSV (chr1 pos1 str1 chr2 pos2 str2)."
    ),
    ref_genome: Path = typer.Option(
        ..., "--ref-genome", "-r", help="Indexed reference FASTA (.fai required)."
    ),
    genome_build: str = typer.Option("hg19", "--genome-build", "-g", help="hg19 or hg38."),
    flank: int = typer.Option(
        200, "--flank", "-f", min=1, max=10_000,
        help="Bases taken from each partner (junction is up to 2x this).",
    ),
    pop_vcf_dir: Path | None = typer.Option(
        None, "--pop-vcf-dir", "-d",
        help="Directory of gnomAD VCFs to mask junction flanks. Omit to skip masking.",
    ),
    pop_data: str = typer.Option(
        "genome", "--pop-data", help="gnomAD data to mask against: genome, exome, or both."
    ),
    pop_source: str = typer.Option(
        "vcf", "--pop-source", help="Masking backend: vcf or api (no download)."
    ),
    af_threshold: float = typer.Option(
        0.001, "--af-threshold", min=0.0, max=1.0, help="Min population AF to mask a SNP."
    ),
    output: Path = typer.Option(
        Path("fusion_junctions.fasta"), "--output", "-o", help="Output FASTA file."
    ),
    chr1_col: str = typer.Option(SvColumns.chr1, "--chr1-col"),
    pos1_col: str = typer.Option(SvColumns.pos1, "--pos1-col"),
    str1_col: str = typer.Option(SvColumns.str1, "--str1-col"),
    chr2_col: str = typer.Option(SvColumns.chr2, "--chr2-col"),
    pos2_col: str = typer.Option(SvColumns.pos2, "--pos2-col"),
    str2_col: str = typer.Option(SvColumns.str2, "--str2-col"),
    name_col: str = typer.Option(SvColumns.name, "--name-col"),
    sample_col: str = typer.Option(SvColumns.sample, "--sample-col"),
):
    """Build fusion-junction sequences for a breakpoint table and write a FASTA."""
    # Bundle the (possibly overridden) column names for the table reader.
    cols = SvColumns(
        chr1_col, pos1_col, str1_col, chr2_col, pos2_col, str2_col, name_col, sample_col
    )
    try:
        _run(
            sv_file=sv_file,
            ref_genome=ref_genome,
            genome_build=genome_build,
            flank=flank,
            pop_vcf_dir=pop_vcf_dir,
            pop_data=pop_data,
            pop_source=pop_source,
            af_threshold=af_threshold,
            output=output,
            cols=cols,
        )
    except VflankError as err:
        # Known user-facing failures become a one-line message and exit code 1.
        console.print(f"[bold red]ERROR:[/bold red] {err}")
        raise typer.Exit(1) from err
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _run(sv_file, ref_genome, genome_build, flank, pop_vcf_dir, pop_data,
         pop_source, af_threshold, output, cols: SvColumns):
    """Execute ``vflank fusion run``: build junctions and write the FASTA.

    Steps: validate options, load the breakpoint table, open the reference
    (and the optional masking source), build one junction per row, write the
    FASTA, then print a summary. Every successfully built fusion contributes
    two records: the raw junction and a ``Masked__``-prefixed one.

    Args:
        sv_file: Breakpoint table handed to ``bp_io.load_sv_table``.
        ref_genome: Reference FASTA path handed to ``ReferenceFasta``.
        genome_build: ``"hg19"`` or ``"hg38"``.
        flank: Bases requested from each partner; a junction shorter than
            ``2 * flank`` is reported as truncated.
        pop_vcf_dir: Optional gnomAD VCF directory; must be a directory if given.
        pop_data: Data kind forwarded to the masking source.
        pop_source: Masking backend name (``"vcf"`` or ``"api"``).
        af_threshold: Forwarded to ``build_junction``.
        output: Destination FASTA path.
        cols: Column names used to read the breakpoint table.

    Raises:
        VflankError: for an unsupported ``genome_build``, invalid population
            options, or a ``pop_vcf_dir`` that is not a directory. Errors
            raised by the loaders/helpers called here propagate unchanged.
    """
    max_table_rows = 50   # cap on rows shown in the results table
    max_skip_lines = 20   # cap on skip reasons listed

    # perf_counter is monotonic, so the elapsed time cannot go negative or jump
    # if the wall clock is adjusted mid-run.
    t0 = time.perf_counter()
    console.rule("[bold blue]vflank fusion run[/bold blue]")
    if genome_build not in ("hg19", "hg38"):
        raise VflankError(f"--genome-build must be 'hg19' or 'hg38', got '{genome_build}'")
    validate_pop_options(pop_source, pop_data)
    if pop_vcf_dir is not None and not pop_vcf_dir.is_dir():
        raise VflankError(f"--pop-vcf-dir is not a directory: {pop_vcf_dir}")

    console.print(f"[bold]Loading breakpoints:[/bold] {sv_file}")
    df = bp_io.load_sv_table(sv_file, cols)
    console.print(f" {len(df):,} fusion(s)")

    records: list[str] = []
    n_masked_total = 0
    skip_reasons: list[str] = []
    summary_rows: list[dict] = []

    reference = ReferenceFasta(ref_genome)
    gnomad = None
    # Everything that uses the open reference / masking source lives inside the
    # try so both are closed even when preflight or iteration raises.
    try:
        console.print(f"[bold]Reference:[/bold] {ref_genome} [dim]({genome_build})[/dim]")
        build_warn = reference.check_build(genome_build)
        if build_warn:
            console.print(f" [bold yellow]⚠ {build_warn}[/bold yellow]")

        # --- Masking source (optional) — masks the junction flanks ---
        # Only chromosomes that normalise successfully are used for preflight.
        bp_chroms = {
            b
            for col in (cols.chr1, cols.chr2)
            for b, _err in (normalise_chrom(v) for v in df[col].dropna().unique())
            if b
        }
        gnomad = make_pop_source(pop_source, pop_vcf_dir, genome_build, pop_data, bp_chroms)
        if pop_source == "api":
            dataset = dataset_for_build(genome_build)[1]
            console.print(f"[bold]Masking:[/bold] gnomAD API [dim]({pop_data}, {dataset})[/dim]")
        elif gnomad is not None:
            console.print(f"[bold]Masking:[/bold] {pop_vcf_dir} [dim](pop-data={pop_data})[/dim]")
        console.print(f"[bold]Flank:[/bold] {flank} bp/partner (junction ≤ {2 * flank} bp)\n")

        for row_idx, row in df.iterrows():
            fusion, reason = bp_io.parse_fusion_row(row, cols)
            if reason is not None:
                skip_reasons.append(f"row {row_idx} — {reason}")
                continue
            try:
                jr = build_junction(
                    reference, fusion, flank, gnomad=gnomad, af_threshold=af_threshold
                )
            except Exception as exc:  # noqa: BLE001
                # Deliberate best-effort: one bad row must not abort the batch.
                skip_reasons.append(f"row {row_idx} {fusion.name} — junction error: {exc}")
                continue

            header = _junction_header(fusion, jr.junction_index)
            records.append(f">{header}\n{jr.sequence}\n")
            records.append(f">Masked__{header}\n{jr.masked_sequence}\n")
            n_masked_total += jr.n_masked

            summary_rows.append({
                "Name": fusion.name or ".", "BP1": f"{fusion.bp1.chrom}:{fusion.bp1.pos}",
                "BP2": f"{fusion.bp2.chrom}:{fusion.bp2.pos}",
                "Len": len(jr.sequence), "Junction": jr.junction_index,
                "N": jr.n_masked, "Trunc": len(jr.sequence) < 2 * flank,
            })
    finally:
        try:
            reference.close()
        finally:
            if gnomad is not None:
                gnomad.close()

    fasta_io.write_fasta(output, records)
    skipped = len(skip_reasons)

    console.rule("[bold green]Results[/bold green]")
    if summary_rows:
        console.print(_summary_table(summary_rows[:max_table_rows]))
        hidden_rows = len(summary_rows) - max_table_rows
        if hidden_rows > 0:
            console.print(f"[dim]… and {hidden_rows:,} more fusion(s) not shown[/dim]")

    if skip_reasons:
        console.print(f"\n[bold yellow]Skipped {skipped}:[/bold yellow]")
        for reason in skip_reasons[:max_skip_lines]:
            console.print(f" • {reason}")
        hidden_skips = skipped - max_skip_lines
        if hidden_skips > 0:
            console.print(f" [dim]… and {hidden_skips:,} more[/dim]")

    mask_line = f"[bold]Bases masked:[/bold] {n_masked_total:>6,}\n" if n_masked_total else ""
    console.print(
        f"\n[bold]Fusions:[/bold] {len(df):>6,}\n"
        f"[bold]Records:[/bold] {len(records):>6,} [dim](raw + masked per fusion)[/dim]\n"
        f"[bold]Skipped:[/bold] {skipped:>6,}\n"
        + mask_line +
        f"[bold]Output:[/bold] [cyan]{output.resolve()}[/cyan] "
        f"[dim]({time.perf_counter() - t0:.1f}s)[/dim]"
    )


def _junction_header(fusion, junction_index) -> str:
    """Compose the FASTA header (without the leading ``>``) for one fusion."""
    label = fasta_io.safe_header(fusion.name or "fusion")
    bp = (
        f"{fusion.bp1.chrom}_{fusion.bp1.pos}_{fusion.bp1.strand}__"
        f"{fusion.bp2.chrom}_{fusion.bp2.pos}_{fusion.bp2.strand}"
    )
    # The sample prefix is only present when the row carries a sample value.
    prefix = f"{fasta_io.safe_header(fusion.sample)}__" if fusion.sample else ""
    return f"{prefix}{label}__{bp}__j{junction_index}"


def _summary_table(rows) -> Table:
    """Render per-fusion summary dicts as a rich table."""
    table = Table(show_header=True, header_style="bold cyan")
    for col in ("Name", "BP1", "BP2", "Len", "Junction", "N", "Trunc"):
        table.add_column(col)
    for r in rows:
        n_str = f"[yellow]{r['N']}[/yellow]" if r["N"] else "[dim]0[/dim]"
        table.add_row(
            r["Name"], r["BP1"], r["BP2"], str(r["Len"]),
            str(r["Junction"]), n_str, "[yellow]yes[/yellow]" if r["Trunc"] else "no",
        )
    return table
|