vflank 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vflank/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """vflank — variant-aware flanking-sequence extraction and masking for ddPCR assay design."""
2
+
3
+ __version__ = "0.1.0"
vflank/cli/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Command-line interface (Typer)."""
vflank/cli/_masking.py ADDED
@@ -0,0 +1,44 @@
1
+ """Shared construction of the population-frequency masking source for CLIs.
2
+
3
+ Both ``small`` and ``fusion`` select a masking backend the same way; this keeps
4
+ that logic in one place.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from collections.abc import Iterable
10
+ from pathlib import Path
11
+
12
+ from ..core.popfreq import GnomadStore
13
+ from ..core.popfreq_api import GnomadApiSource
14
+ from ..errors import VflankError
15
+
16
+
17
def validate_pop_options(pop_source: str, pop_data: str) -> None:
    """Check the ``--pop-source`` and ``--pop-data`` values before any work starts.

    Args:
        pop_source: Masking backend name; must be ``"vcf"`` or ``"api"``.
        pop_data: gnomAD data kind; must be ``"genome"``, ``"exome"`` or ``"both"``.

    Raises:
        VflankError: If either value is outside its allowed set. The backend
            is checked first, so it is the one reported when both are invalid.
    """
    allowed_backends = ("vcf", "api")
    allowed_data_kinds = ("genome", "exome", "both")

    backend_ok = pop_source in allowed_backends
    if not backend_ok:
        raise VflankError(f"--pop-source must be 'vcf' or 'api', got '{pop_source}'")

    data_kind_ok = pop_data in allowed_data_kinds
    if not data_kind_ok:
        raise VflankError(f"--pop-data must be 'genome', 'exome', or 'both', got '{pop_data}'")
22
+
23
+
24
def make_pop_source(
    pop_source: str,
    pop_vcf_dir: Path | None,
    genome_build: str,
    pop_data: str,
    chroms: Iterable[str],
) -> GnomadStore | GnomadApiSource | None:
    """Construct the population-frequency masking backend selected on the CLI.

    Args:
        pop_source: ``"api"`` selects the API backend; any other value is
            treated as the VCF backend.
        pop_vcf_dir: Directory handed to ``GnomadStore``; ``None`` means the
            VCF backend has nothing to read and no source is built.
        genome_build: Build name forwarded unchanged to the backend.
        pop_data: Data kind forwarded unchanged to the backend.
        chroms: Chromosomes the run will touch; de-duplicated and sorted
            before being passed to ``preflight``.

    Returns:
        A ``GnomadApiSource``, a ``GnomadStore``, or ``None`` when the VCF
        backend is selected without a directory.

    For the VCF backend, ``preflight`` is run up front against ``chroms`` so a
    wholly-absent requested data kind fails fast (no silent genome-only
    fallback). It is skipped when ``chroms`` is empty.
    """
    if pop_source == "api":
        return GnomadApiSource(genome_build, pop_data)

    # VCF backend without a directory: masking is simply disabled.
    if pop_vcf_dir is None:
        return None

    vcf_store = GnomadStore(pop_vcf_dir, genome_build, pop_data)
    unique_chroms = sorted(set(chroms))
    if unique_chroms:
        vcf_store.preflight(unique_chroms)
    return vcf_store
vflank/cli/app.py ADDED
@@ -0,0 +1,45 @@
1
+ """vflank root CLI.
2
+
3
+ vflank small run ... # extract + mask flanks from a MAF
4
+ vflank small inspect ... # preview MAF columns
5
+ vflank small list-vcf ... # verify gnomAD directory coverage
6
+ vflank version
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import typer
12
+
13
+ from .. import __version__
14
+ from ..logging import setup_logging
15
+ from . import fusion, small
16
+
17
# Root Typer application. ``no_args_is_help`` is set so that invoking the
# program with no arguments shows help, and shell-completion installation
# options are switched off via ``add_completion=False``.
app = typer.Typer(
    name="vflank",
    help="Variant-aware flanking-sequence extraction and masking for ddPCR assay design.",
    add_completion=False,
    no_args_is_help=True,
)
# Mount each sub-CLI module's own Typer app as a named command group.
app.add_typer(small.app, name="small", help="Small-variant (SNP/indel) flank extraction.")
app.add_typer(fusion.app, name="fusion", help="Structural-variant junction extraction.")
25
+
26
+
27
@app.callback()
def main(
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable DEBUG logging."),
    quiet: bool = typer.Option(False, "--quiet", "-q", help="Only show warnings and errors."),
    debug: bool = typer.Option(False, "--debug", help="DEBUG logging + rich tracebacks."),
) -> None:
    """Global options applied before any subcommand."""
    # Map the three flags onto the single verbosity integer passed to
    # setup_logging: 1 for --verbose/--debug, -1 for --quiet, 0 otherwise.
    # --verbose/--debug take precedence when combined with --quiet.
    if verbose or debug:
        level = 1
    elif quiet:
        level = -1
    else:
        level = 0

    # Tracebacks are requested only by --debug, not by --verbose.
    setup_logging(level, show_tracebacks=debug)
36
+
37
+
38
@app.command()
def version() -> None:
    """Print the vflank version."""
    # Emits the package-level ``__version__`` string imported at module top.
    typer.echo(__version__)
42
+
43
+
44
# Run the root CLI when this module is executed as a script.
if __name__ == "__main__":
    app()
vflank/cli/fusion.py ADDED
@@ -0,0 +1,182 @@
1
+ """`vflank fusion` — structural-variant junction extraction (simple TSV input).
2
+
3
+ Reads a breakpoint table (chr1 pos1 str1 chr2 pos2 str2; columns matched by
4
+ name) and writes one FASTA record per fusion: the chimeric junction sequence a
5
+ ddPCR probe spans. VCF/BND input is a later phase.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ from pathlib import Path
12
+
13
+ import typer
14
+ from rich.table import Table
15
+
16
+ from ..core.chrom import normalise_chrom
17
+ from ..core.fusion import build_junction
18
+ from ..core.popfreq_api import dataset_for_build
19
+ from ..errors import VflankError
20
+ from ..io import breakpoints as bp_io
21
+ from ..io import fasta as fasta_io
22
+ from ..io.breakpoints import SvColumns
23
+ from ..io.reference import ReferenceFasta
24
+ from ..logging import console
25
+ from ._masking import make_pop_source, validate_pop_options
26
+
27
# Typer application for the fusion commands; ``no_args_is_help`` is enabled so
# invoking the group with no arguments shows help.
app = typer.Typer(no_args_is_help=True)
28
+
29
+
30
@app.command()
def run(
    sv_file: Path = typer.Argument(
        ...,
        exists=True,
        help="Breakpoint TSV (chr1 pos1 str1 chr2 pos2 str2).",
    ),
    ref_genome: Path = typer.Option(
        ...,
        "--ref-genome",
        "-r",
        help="Indexed reference FASTA (.fai required).",
    ),
    genome_build: str = typer.Option("hg19", "--genome-build", "-g", help="hg19 or hg38."),
    flank: int = typer.Option(
        200,
        "--flank",
        "-f",
        min=1,
        max=10_000,
        help="Bases taken from each partner (junction is up to 2x this).",
    ),
    pop_vcf_dir: Path | None = typer.Option(
        None,
        "--pop-vcf-dir",
        "-d",
        help="Directory of gnomAD VCFs to mask junction flanks. Omit to skip masking.",
    ),
    pop_data: str = typer.Option(
        "genome",
        "--pop-data",
        help="gnomAD data to mask against: genome, exome, or both.",
    ),
    pop_source: str = typer.Option(
        "vcf",
        "--pop-source",
        help="Masking backend: vcf or api (no download).",
    ),
    af_threshold: float = typer.Option(
        0.001,
        "--af-threshold",
        min=0.0,
        max=1.0,
        help="Min population AF to mask a SNP.",
    ),
    output: Path = typer.Option(
        Path("fusion_junctions.fasta"),
        "--output",
        "-o",
        help="Output FASTA file.",
    ),
    chr1_col: str = typer.Option(SvColumns.chr1, "--chr1-col"),
    pos1_col: str = typer.Option(SvColumns.pos1, "--pos1-col"),
    str1_col: str = typer.Option(SvColumns.str1, "--str1-col"),
    chr2_col: str = typer.Option(SvColumns.chr2, "--chr2-col"),
    pos2_col: str = typer.Option(SvColumns.pos2, "--pos2-col"),
    str2_col: str = typer.Option(SvColumns.str2, "--str2-col"),
    name_col: str = typer.Option(SvColumns.name, "--name-col"),
    sample_col: str = typer.Option(SvColumns.sample, "--sample-col"),
):
    """Build fusion-junction sequences for a breakpoint table and write a FASTA."""
    # Column-name overrides, in the positional order SvColumns is given here.
    column_names = (
        chr1_col,
        pos1_col,
        str1_col,
        chr2_col,
        pos2_col,
        str2_col,
        name_col,
        sample_col,
    )
    cols = SvColumns(*column_names)

    # The real work lives in _run; this wrapper only converts a VflankError
    # into a printed message plus exit status 1.
    try:
        _run(
            sv_file=sv_file,
            ref_genome=ref_genome,
            genome_build=genome_build,
            flank=flank,
            pop_vcf_dir=pop_vcf_dir,
            pop_data=pop_data,
            pop_source=pop_source,
            af_threshold=af_threshold,
            output=output,
            cols=cols,
        )
    except VflankError as exc:
        console.print(f"[bold red]ERROR:[/bold red] {exc}")
        raise typer.Exit(1) from exc
78
+
79
+
80
def _run(sv_file, ref_genome, genome_build, flank, pop_vcf_dir, pop_data,
         pop_source, af_threshold, output, cols: SvColumns):
    """Run the fusion pipeline: load breakpoints, build junctions, write FASTA.

    Args:
        sv_file: Path of the breakpoint table passed to ``load_sv_table``.
        ref_genome: Path of the reference FASTA opened via ``ReferenceFasta``.
        genome_build: ``"hg19"`` or ``"hg38"``; anything else is rejected.
        flank: Bases requested from each partner; a junction shorter than
            ``2 * flank`` is reported as truncated in the summary table.
        pop_vcf_dir: Optional gnomAD VCF directory; must be a directory if given.
        pop_data: gnomAD data kind (validated by ``validate_pop_options``).
        pop_source: Masking backend (validated by ``validate_pop_options``).
        af_threshold: Forwarded to ``build_junction`` as ``af_threshold``.
        output: Destination path handed to ``write_fasta``.
        cols: Column-name mapping used to read the breakpoint table.

    Raises:
        VflankError: On an unsupported genome build, invalid masking options,
            or a ``--pop-vcf-dir`` that is not a directory. Helpers called
            here may raise it as well.

    Rows that fail to parse, or whose junction cannot be built, are skipped
    and reported rather than aborting the run.
    """
    # Function-scope import: needed only to neutralise square brackets in
    # user-derived text before it is embedded in Rich markup strings.
    from rich.markup import escape

    max_table_rows = 50
    max_skip_lines = 20

    # Monotonic clock, so the reported duration cannot be skewed by
    # wall-clock adjustments during the run.
    t0 = time.perf_counter()
    console.rule("[bold blue]vflank fusion run[/bold blue]")
    if genome_build not in ("hg19", "hg38"):
        raise VflankError(f"--genome-build must be 'hg19' or 'hg38', got '{genome_build}'")
    validate_pop_options(pop_source, pop_data)
    if pop_vcf_dir is not None and not pop_vcf_dir.is_dir():
        raise VflankError(f"--pop-vcf-dir is not a directory: {pop_vcf_dir}")

    console.print(f"[bold]Loading breakpoints:[/bold] {sv_file}")
    df = bp_io.load_sv_table(sv_file, cols)
    console.print(f" {len(df):,} fusion(s)")

    records: list[str] = []
    skipped = 0
    n_masked_total = 0
    skip_reasons: list[str] = []
    summary_rows: list[dict] = []

    reference = ReferenceFasta(ref_genome)
    gnomad = None
    # Everything between opening the reference and closing it sits inside
    # try/finally so the handles are released even if preflight or junction
    # building raises, or the run is interrupted.
    try:
        console.print(f"[bold]Reference:[/bold] {ref_genome} [dim]({genome_build})[/dim]")
        build_warn = reference.check_build(genome_build)
        if build_warn:
            console.print(f" [bold yellow]⚠ {build_warn}[/bold yellow]")

        # --- Masking source (optional) — masks the junction flanks ---
        # Collect every normalisable chromosome named in either partner
        # column; values that normalise to something falsy are dropped.
        bp_chroms = {
            b
            for col in (cols.chr1, cols.chr2)
            for b, _err in (normalise_chrom(v) for v in df[col].dropna().unique())
            if b
        }
        gnomad = make_pop_source(pop_source, pop_vcf_dir, genome_build, pop_data, bp_chroms)
        if pop_source == "api":
            dataset = dataset_for_build(genome_build)[1]
            console.print(f"[bold]Masking:[/bold] gnomAD API [dim]({pop_data}, {dataset})[/dim]")
        elif gnomad is not None:
            console.print(f"[bold]Masking:[/bold] {pop_vcf_dir} [dim](pop-data={pop_data})[/dim]")
        console.print(f"[bold]Flank:[/bold] {flank} bp/partner (junction ≤ {2 * flank} bp)\n")

        for row_idx, row in df.iterrows():
            fusion, reason = bp_io.parse_fusion_row(row, cols)
            if reason is not None:
                skip_reasons.append(f"row {row_idx} — {reason}")
                skipped += 1
                continue
            try:
                jr = build_junction(
                    reference, fusion, flank, gnomad=gnomad, af_threshold=af_threshold
                )
            except Exception as exc:  # noqa: BLE001
                # Deliberate best-effort: one bad row must not abort the batch.
                skip_reasons.append(f"row {row_idx} {fusion.name} — junction error: {exc}")
                skipped += 1
                continue

            label = fasta_io.safe_header(fusion.name or "fusion")
            bp = (
                f"{fusion.bp1.chrom}_{fusion.bp1.pos}_{fusion.bp1.strand}__"
                f"{fusion.bp2.chrom}_{fusion.bp2.pos}_{fusion.bp2.strand}"
            )
            prefix = f"{fasta_io.safe_header(fusion.sample)}__" if fusion.sample else ""
            header = f"{prefix}{label}__{bp}__j{jr.junction_index}"
            # Two records per fusion: the raw junction and its masked twin.
            records.append(f">{header}\n{jr.sequence}\n")
            records.append(f">Masked__{header}\n{jr.masked_sequence}\n")
            n_masked_total += jr.n_masked

            truncated = len(jr.sequence) < 2 * flank
            summary_rows.append({
                "Name": fusion.name or ".",
                "BP1": f"{fusion.bp1.chrom}:{fusion.bp1.pos}",
                "BP2": f"{fusion.bp2.chrom}:{fusion.bp2.pos}",
                "Len": len(jr.sequence),
                "Junction": jr.junction_index,
                "N": jr.n_masked,
                "Trunc": truncated,
            })
    finally:
        # Same close order as before; the nested finally guarantees the
        # masking source is closed even if closing the reference fails.
        try:
            reference.close()
        finally:
            if gnomad is not None:
                gnomad.close()

    fasta_io.write_fasta(output, records)

    console.rule("[bold green]Results[/bold green]")
    if summary_rows:
        table = Table(show_header=True, header_style="bold cyan")
        for col in ("Name", "BP1", "BP2", "Len", "Junction", "N", "Trunc"):
            table.add_column(col)
        for r in summary_rows[:max_table_rows]:
            n_str = f"[yellow]{r['N']}[/yellow]" if r["N"] else "[dim]0[/dim]"
            table.add_row(
                # Names come straight from the input table; escape them so
                # brackets are shown literally instead of parsed as markup.
                escape(str(r["Name"])), r["BP1"], r["BP2"], str(r["Len"]),
                str(r["Junction"]), n_str, "[yellow]yes[/yellow]" if r["Trunc"] else "no",
            )
        console.print(table)
        hidden_rows = len(summary_rows) - max_table_rows
        if hidden_rows > 0:
            # Make the display cap visible instead of silently dropping rows.
            console.print(f"[dim]… {hidden_rows:,} more row(s) not shown[/dim]")

    if skip_reasons:
        console.print(f"\n[bold yellow]Skipped {skipped}:[/bold yellow]")
        for reason in skip_reasons[:max_skip_lines]:
            # Reasons embed exception text and input values; escape them so
            # stray brackets cannot be swallowed or raise a markup error.
            console.print(f" • {escape(reason)}")
        hidden_reasons = len(skip_reasons) - max_skip_lines
        if hidden_reasons > 0:
            console.print(f" [dim]… {hidden_reasons:,} more not shown[/dim]")

    mask_line = f"[bold]Bases masked:[/bold] {n_masked_total:>6,}\n" if n_masked_total else ""
    console.print(
        f"\n[bold]Fusions:[/bold] {len(df):>6,}\n"
        f"[bold]Records:[/bold] {len(records):>6,} [dim](raw + masked per fusion)[/dim]\n"
        f"[bold]Skipped:[/bold] {skipped:>6,}\n"
        + mask_line +
        f"[bold]Output:[/bold] [cyan]{output.resolve()}[/cyan] "
        f"[dim]({time.perf_counter() - t0:.1f}s)[/dim]"
    )