svphaser 2.0.4__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- svphaser/__init__.py +17 -12
- svphaser/_version.py +2 -2
- svphaser/cli.py +20 -38
- svphaser/phasing/_workers.py +347 -41
- svphaser/phasing/algorithms.py +33 -15
- svphaser/phasing/io.py +140 -31
- svphaser/phasing/types.py +12 -5
- svphaser-2.1.0.dist-info/METADATA +231 -0
- svphaser-2.1.0.dist-info/RECORD +16 -0
- {svphaser-2.0.4.dist-info → svphaser-2.1.0.dist-info}/WHEEL +1 -1
- svphaser-2.0.4.dist-info/METADATA +0 -203
- svphaser-2.0.4.dist-info/RECORD +0 -16
- {svphaser-2.0.4.dist-info → svphaser-2.1.0.dist-info}/entry_points.txt +0 -0
- {svphaser-2.0.4.dist-info → svphaser-2.1.0.dist-info}/licenses/LICENSE +0 -0
svphaser/__init__.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
"""Top-level SvPhaser package.
|
|
2
2
|
|
|
3
|
-
Public surface kept tiny:
|
|
4
|
-
|
|
3
|
+
Public surface kept tiny:
|
|
4
|
+
- __version__
|
|
5
|
+
- a convenience `phase()` wrapper around svphaser.phasing.io.phase_vcf()
|
|
6
|
+
|
|
7
|
+
Defaults are chosen to match the recommended SvPhaser settings for long-read SV phasing.
|
|
5
8
|
"""
|
|
6
9
|
|
|
7
10
|
from __future__ import annotations
|
|
@@ -11,7 +14,7 @@ from pathlib import Path
|
|
|
11
14
|
# --------------------------------------------------------------------
|
|
12
15
|
# Robust version lookup:
|
|
13
16
|
# - Prefer installed package metadata (works for wheels and PEP 660 editables)
|
|
14
|
-
# - Fall back to
|
|
17
|
+
# - Fall back to _version.py for raw-source/dev use
|
|
15
18
|
# --------------------------------------------------------------------
|
|
16
19
|
try:
|
|
17
20
|
from importlib.metadata import version as _pkg_version # Python 3.8+
|
|
@@ -19,14 +22,14 @@ try:
|
|
|
19
22
|
__version__ = _pkg_version("svphaser")
|
|
20
23
|
except Exception:
|
|
21
24
|
try:
|
|
22
|
-
from ._version import __version__ #
|
|
25
|
+
from ._version import __version__ # overwritten in builds when using setuptools-scm
|
|
23
26
|
except Exception: # highly defensive
|
|
24
27
|
__version__ = "0+unknown"
|
|
25
28
|
|
|
26
29
|
# Centralized defaults (keep CLI in sync)
|
|
27
30
|
DEFAULT_MIN_SUPPORT: int = 10
|
|
28
|
-
DEFAULT_MAJOR_DELTA: float = 0.
|
|
29
|
-
DEFAULT_EQUAL_DELTA: float = 0.
|
|
31
|
+
DEFAULT_MAJOR_DELTA: float = 0.60
|
|
32
|
+
DEFAULT_EQUAL_DELTA: float = 0.10
|
|
30
33
|
DEFAULT_GQ_BINS: str = "30:High,10:Moderate"
|
|
31
34
|
|
|
32
35
|
|
|
@@ -44,8 +47,10 @@ def phase(
|
|
|
44
47
|
) -> tuple[Path, Path]:
|
|
45
48
|
"""Phase *sv_vcf* using HP-tagged *bam*, writing outputs into *out_dir*.
|
|
46
49
|
|
|
47
|
-
|
|
48
|
-
|
|
50
|
+
Notes
|
|
51
|
+
-----
|
|
52
|
+
- Step B semantics: `min_support` is applied to TOTAL ALT-supporting reads (n1+n2).
|
|
53
|
+
- Near-ties (<= equal_delta) are treated as ambiguous (./.), not homozygous ALT.
|
|
49
54
|
|
|
50
55
|
Returns
|
|
51
56
|
-------
|
|
@@ -66,13 +71,13 @@ def phase(
|
|
|
66
71
|
out_csv = out_dir_p / f"{stem}_phased.csv"
|
|
67
72
|
|
|
68
73
|
phase_vcf(
|
|
69
|
-
sv_vcf,
|
|
70
|
-
bam,
|
|
71
|
-
out_dir=out_dir_p,
|
|
74
|
+
Path(sv_vcf),
|
|
75
|
+
Path(bam),
|
|
76
|
+
out_dir=out_dir_p,
|
|
72
77
|
min_support=min_support,
|
|
73
78
|
major_delta=major_delta,
|
|
74
79
|
equal_delta=equal_delta,
|
|
75
|
-
gq_bins=gq_bins,
|
|
80
|
+
gq_bins=gq_bins,
|
|
76
81
|
threads=threads,
|
|
77
82
|
)
|
|
78
83
|
return out_vcf, out_csv
|
svphaser/_version.py
CHANGED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '2.0
|
|
32
|
-
__version_tuple__ = version_tuple = (2,
|
|
31
|
+
__version__ = version = '2.1.0'
|
|
32
|
+
__version_tuple__ = version_tuple = (2, 1, 0)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
svphaser/cli.py
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
svphaser.cli
|
|
2
|
+
"""svphaser.cli
|
|
4
3
|
============
|
|
5
4
|
Command-line interface for **SvPhaser**.
|
|
6
5
|
|
|
7
6
|
The program writes two files inside **--out-dir** (or the CWD):
|
|
8
7
|
|
|
9
|
-
* ``<stem>_phased.vcf`` (uncompressed; GT/GQ
|
|
10
|
-
* ``<stem>_phased.csv`` (tabular summary
|
|
8
|
+
* ``<stem>_phased.vcf`` (uncompressed; GT/GQ injected; optional INFO=GQBIN)
|
|
9
|
+
* ``<stem>_phased.csv`` (tabular summary incl. n1/n2/gt/gq and optional gq_label)
|
|
11
10
|
"""
|
|
11
|
+
|
|
12
12
|
from __future__ import annotations
|
|
13
13
|
|
|
14
14
|
from pathlib import Path
|
|
@@ -27,7 +27,7 @@ from svphaser import (
|
|
|
27
27
|
app = typer.Typer(add_completion=False, rich_markup_mode="rich")
|
|
28
28
|
|
|
29
29
|
|
|
30
|
-
def _version_callback(value: bool):
|
|
30
|
+
def _version_callback(value: bool) -> None:
|
|
31
31
|
if value:
|
|
32
32
|
typer.echo(__version__)
|
|
33
33
|
raise typer.Exit()
|
|
@@ -44,30 +44,20 @@ def main(
|
|
|
44
44
|
callback=_version_callback,
|
|
45
45
|
),
|
|
46
46
|
] = None
|
|
47
|
-
):
|
|
47
|
+
) -> None:
|
|
48
48
|
"""SvPhaser – Structural-variant phasing from HP-tagged long-read BAMs."""
|
|
49
|
-
# no-op; callback handles --version
|
|
50
49
|
return
|
|
51
50
|
|
|
52
51
|
|
|
53
|
-
# ──────────────────────────────────────────────────────────────────────────
|
|
54
|
-
# phase command
|
|
55
|
-
# ──────────────────────────────────────────────────────────────────────────
|
|
56
52
|
@app.command("phase")
|
|
57
53
|
def phase_cmd(
|
|
58
54
|
sv_vcf: Annotated[
|
|
59
55
|
Path,
|
|
60
|
-
typer.Argument(
|
|
61
|
-
exists=True,
|
|
62
|
-
help="Input *un-phased* SV VCF (.vcf or .vcf.gz)",
|
|
63
|
-
),
|
|
56
|
+
typer.Argument(exists=True, help="Input *un-phased* SV VCF (.vcf or .vcf.gz)"),
|
|
64
57
|
],
|
|
65
58
|
bam: Annotated[
|
|
66
59
|
Path,
|
|
67
|
-
typer.Argument(
|
|
68
|
-
exists=True,
|
|
69
|
-
help="Long-read BAM/CRAM with HP tags",
|
|
70
|
-
),
|
|
60
|
+
typer.Argument(exists=True, help="Long-read BAM/CRAM with HP tags"),
|
|
71
61
|
],
|
|
72
62
|
out_dir: Annotated[
|
|
73
63
|
Path,
|
|
@@ -90,9 +80,8 @@ def phase_cmd(
|
|
|
90
80
|
int,
|
|
91
81
|
typer.Option(
|
|
92
82
|
help=(
|
|
93
|
-
"Minimum
|
|
94
|
-
"
|
|
95
|
-
"are dropped entirely."
|
|
83
|
+
"Minimum TOTAL ALT-supporting reads required to keep an SV (n1+n2). "
|
|
84
|
+
"If (n1+n2) < min_support the SV is dropped (written to *_dropped_svs.csv)."
|
|
96
85
|
),
|
|
97
86
|
show_default=True,
|
|
98
87
|
),
|
|
@@ -100,14 +89,14 @@ def phase_cmd(
|
|
|
100
89
|
major_delta: Annotated[
|
|
101
90
|
float,
|
|
102
91
|
typer.Option(
|
|
103
|
-
help="
|
|
92
|
+
help="max(n1,n2)/N >= this ⇒ strong majority ⇒ GT 1|0 or 0|1",
|
|
104
93
|
show_default=True,
|
|
105
94
|
),
|
|
106
95
|
] = DEFAULT_MAJOR_DELTA,
|
|
107
96
|
equal_delta: Annotated[
|
|
108
97
|
float,
|
|
109
98
|
typer.Option(
|
|
110
|
-
help="|n1−n2|/N
|
|
99
|
+
help="|n1−n2|/N <= this ⇒ near-tie ⇒ GT ./. (ambiguous)",
|
|
111
100
|
show_default=True,
|
|
112
101
|
),
|
|
113
102
|
] = DEFAULT_EQUAL_DELTA,
|
|
@@ -116,9 +105,8 @@ def phase_cmd(
|
|
|
116
105
|
str,
|
|
117
106
|
typer.Option(
|
|
118
107
|
help=(
|
|
119
|
-
"Comma-separated GQ≥threshold:Label definitions "
|
|
120
|
-
"
|
|
121
|
-
"[gq_label] and in the VCF INFO field HP_GQBIN when set."
|
|
108
|
+
"Comma-separated GQ≥threshold:Label definitions (e.g. '30:High,10:Moderate'). "
|
|
109
|
+
"Labels appear in CSV column [gq_label] and in the VCF INFO field GQBIN."
|
|
122
110
|
),
|
|
123
111
|
show_default=True,
|
|
124
112
|
),
|
|
@@ -134,13 +122,11 @@ def phase_cmd(
|
|
|
134
122
|
),
|
|
135
123
|
] = None,
|
|
136
124
|
) -> None:
|
|
137
|
-
"""Phase structural variants using
|
|
138
|
-
# Initialise logging BEFORE we import anything that might log
|
|
125
|
+
"""Phase structural variants using SV-type-aware ALT-support evidence."""
|
|
139
126
|
from svphaser.logging import init as _init_logging
|
|
140
127
|
|
|
141
|
-
_init_logging("INFO")
|
|
128
|
+
_init_logging("INFO")
|
|
142
129
|
|
|
143
|
-
# Resolve output paths
|
|
144
130
|
if not out_dir.exists():
|
|
145
131
|
out_dir.mkdir(parents=True)
|
|
146
132
|
|
|
@@ -153,25 +139,21 @@ def phase_cmd(
|
|
|
153
139
|
out_vcf = out_dir / f"{stem}_phased.vcf"
|
|
154
140
|
out_csv = out_dir / f"{stem}_phased.csv"
|
|
155
141
|
|
|
156
|
-
# Lazy import so `svphaser --help` works without heavy deps
|
|
157
142
|
from svphaser.phasing.io import phase_vcf
|
|
158
143
|
|
|
159
144
|
try:
|
|
160
145
|
phase_vcf(
|
|
161
146
|
sv_vcf,
|
|
162
147
|
bam,
|
|
163
|
-
out_dir=out_dir,
|
|
148
|
+
out_dir=out_dir,
|
|
164
149
|
min_support=min_support,
|
|
165
150
|
major_delta=major_delta,
|
|
166
151
|
equal_delta=equal_delta,
|
|
167
|
-
gq_bins=gq_bins,
|
|
152
|
+
gq_bins=gq_bins,
|
|
168
153
|
threads=threads,
|
|
169
154
|
)
|
|
170
155
|
typer.secho(f"✔ Phased VCF → {out_vcf}", fg=typer.colors.GREEN)
|
|
171
156
|
typer.secho(f"✔ Phased CSV → {out_csv}", fg=typer.colors.GREEN)
|
|
172
|
-
except Exception:
|
|
173
|
-
typer.secho(
|
|
174
|
-
"[SvPhaser] 💥 Unhandled error during phasing",
|
|
175
|
-
fg=typer.colors.RED,
|
|
176
|
-
)
|
|
157
|
+
except Exception:
|
|
158
|
+
typer.secho("[SvPhaser] 💥 Unhandled error during phasing", fg=typer.colors.RED)
|
|
177
159
|
raise
|
svphaser/phasing/_workers.py
CHANGED
|
@@ -1,18 +1,26 @@
|
|
|
1
|
-
"""
|
|
2
|
-
svphaser.phasing._workers
|
|
1
|
+
"""svphaser.phasing._workers
|
|
3
2
|
=========================
|
|
4
|
-
Worker-process code.
|
|
3
|
+
Worker-process code.
|
|
4
|
+
|
|
5
|
+
Step B update (biological correctness):
|
|
6
|
+
- `n1/n2` are now *ALT-supporting read counts* per haplotype,
|
|
7
|
+
not raw overlap/coverage.
|
|
8
|
+
- Evidence is SV-type-aware:
|
|
9
|
+
- DEL: large CIGAR 'D' spanning breakpoints (and optional split-read via SA)
|
|
10
|
+
- INS: large CIGAR 'I' near POS
|
|
11
|
+
- BND: split-read via SA linking to partner chrom:pos
|
|
12
|
+
- INV: split-read via SA to the END breakpoint with strand flip
|
|
5
13
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
3. Counts HP-tagged reads in the long-read BAM/CRAM.
|
|
9
|
-
4. Classifies the haplotype + GQ, adds optional GQ-bin label.
|
|
10
|
-
5. Returns a DataFrame to the parent.
|
|
14
|
+
This is designed to match what IGV "SV-support" reads typically show,
|
|
15
|
+
so counts will be closer to the 5/8 style numbers you observed (instead of 27/30).
|
|
11
16
|
"""
|
|
12
17
|
|
|
13
18
|
from __future__ import annotations
|
|
14
19
|
|
|
20
|
+
import re
|
|
21
|
+
from collections.abc import Iterable
|
|
15
22
|
from pathlib import Path
|
|
23
|
+
from typing import Any
|
|
16
24
|
|
|
17
25
|
import pandas as pd
|
|
18
26
|
import pysam
|
|
@@ -23,35 +31,320 @@ from .types import WorkerOpts
|
|
|
23
31
|
|
|
24
32
|
__all__ = ["_phase_chrom_worker"]
|
|
25
33
|
|
|
34
|
+
# Default breakpoint window (bp) used for evidence gathering.
|
|
35
|
+
DEFAULT_BP_WINDOW = 100
|
|
26
36
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
if read.is_unmapped or read.is_secondary or read.is_supplementary:
|
|
36
|
-
continue
|
|
37
|
-
if not read.has_tag("HP"):
|
|
38
|
-
continue
|
|
39
|
-
tag = read.get_tag("HP")
|
|
40
|
-
if tag == 1:
|
|
41
|
-
hp1 += 1
|
|
42
|
-
elif tag == 2:
|
|
43
|
-
hp2 += 1
|
|
44
|
-
return hp1, hp2
|
|
37
|
+
# Require event size in the read CIGAR to be at least this fraction of SVLEN.
|
|
38
|
+
MIN_CIGAR_FRACTION = 0.30
|
|
39
|
+
|
|
40
|
+
# And at least this many bp (avoid counting tiny indels / alignment noise).
|
|
41
|
+
MIN_CIGAR_BP = 30
|
|
42
|
+
|
|
43
|
+
# ALT for BND often contains partner like "]chr3:198172833]N" or "N]chr5:181462057]".
|
|
44
|
+
_BND_RE = re.compile(r"[\[\]]([^:\[\]]+):(\d+)[\[\]]")
|
|
45
45
|
|
|
46
46
|
|
|
47
47
|
def _has_tabix_index(vcf_path: Path) -> bool:
    """Report whether an index file sits next to *vcf_path*.

    The candidate index names are built by appending ``.tbi`` and then
    ``.csi`` to the file's last suffix (``calls.vcf.gz`` is probed as
    ``calls.vcf.gz.tbi`` / ``calls.vcf.gz.csi``).

    Returns
    -------
    bool
        True as soon as one of the two candidates exists, otherwise False.
    """
    for index_ext in (".tbi", ".csi"):
        candidate = vcf_path.with_suffix(vcf_path.suffix + index_ext)
        if candidate.exists():
            return True
    return False
|
|
53
53
|
|
|
54
54
|
|
|
55
|
+
def _coerce_int(x: Any) -> int | None:
    """Convert a cyvcf2 INFO value to ``int`` if possible.

    Parameters
    ----------
    x
        The raw INFO value. ``None`` is passed through; a list/tuple is
        reduced to its first element (recursively); anything else is handed
        to ``int()``.

    Returns
    -------
    int | None
        The integer value, or ``None`` when the input is ``None``, an empty
        sequence, or cannot be converted.
    """
    if x is None:
        return None
    if isinstance(x, (list, tuple)):
        # Multi-valued field: only the first entry is considered.
        return _coerce_int(x[0]) if x else None
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only the failures int() is documented to raise are treated as
        # "not convertible": TypeError (non-numeric object), ValueError
        # (non-numeric text, NaN) and OverflowError (+/-inf). Anything else
        # is a genuine bug and should propagate instead of being hidden.
        return None
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _svlen_from_record(rec: Variant, pos: int, end: int) -> int:
    """Return a non-negative SV length for *rec*.

    The INFO ``SVLEN`` value is preferred (its sign is discarded). When it is
    absent or not convertible to an integer, the length is approximated from
    the coordinates as ``|end - pos| + 1``.
    """
    declared = _coerce_int(rec.INFO.get("SVLEN"))
    if declared is not None:
        return abs(declared)
    # No usable SVLEN: fall back to the coordinate span (treated as inclusive).
    return abs(end - pos) + 1
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _parse_bnd_partner(alt: str, rec: Variant) -> tuple[str | None, int | None]:
    """Return ``(chr2, pos2)`` for a BND record where it can be determined.

    The bracketed ``chrom:pos`` mate locus in the ALT string is tried first.
    If ALT does not match, INFO ``CHR2`` supplies the chromosome only and the
    position is reported as ``None``. ``(None, None)`` means neither source
    yielded a partner.
    """
    hit = _BND_RE.search(alt)
    if hit is not None:
        mate_chrom, mate_pos = hit.groups()
        return mate_chrom, int(mate_pos)

    # ALT carried no bracketed mate; only the chromosome can come from INFO.
    info_chrom = rec.INFO.get("CHR2")
    if not info_chrom:
        return None, None
    return str(info_chrom), None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _parse_sa_tag(read: pysam.AlignedSegment) -> list[tuple[str, int, str]]:
    """Decode the read's ``SA`` tag into ``(rname, pos1, strand)`` tuples.

    ``pos1`` is kept 1-based, exactly as written in the tag. Entries with
    fewer than three comma-separated fields, or whose position is not an
    integer, are skipped. A read without an ``SA`` tag yields an empty list.
    """
    if not read.has_tag("SA"):
        return []

    parsed: list[tuple[str, int, str]] = []
    # Tag layout: "rname,pos,strand,cigar,mapq,nm;" repeated per alignment.
    for chunk in str(read.get_tag("SA")).split(";"):
        fields = chunk.split(",")
        if len(fields) < 3:
            # Also covers the empty chunk left by the trailing ';'.
            continue
        contig, pos_text, strand = fields[:3]
        try:
            start1 = int(pos_text)
        except ValueError:
            continue
        parsed.append((contig, start1, strand))
    return parsed
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _iter_candidate_reads(
    bam: pysam.AlignmentFile,
    chrom: str,
    regions_1based: list[tuple[int, int]],
) -> Iterable[pysam.AlignedSegment]:
    """Lazily yield alignments fetched from each 1-based inclusive region.

    Regions are visited in the order given and are not merged, so an
    alignment that falls in more than one region is yielded once per region.
    Unmapped and secondary alignments are dropped; supplementary alignments
    are passed through.
    """
    for first1, last1 in regions_1based:
        # 1-based inclusive [first1, last1] is 0-based half-open [first1-1, last1).
        window = (max(0, first1 - 1), max(0, last1))
        yield from (
            aln
            for aln in bam.fetch(chrom, *window)
            if not (aln.is_unmapped or aln.is_secondary)
        )
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _supports_del(
    read: pysam.AlignedSegment,
    *,
    pos0: int,
    end_excl0: int,
    svlen: int,
    bp_window: int,
) -> bool:
    """Decide whether *read* carries evidence for a deletion.

    Parameters
    ----------
    pos0
        0-based start breakpoint.
    end_excl0
        0-based exclusive end breakpoint.
    svlen
        Expected deletion length; scales the minimum CIGAR event size.
    bp_window
        Tolerance (bp) applied to every breakpoint comparison.

    Returns
    -------
    bool
        True if a CIGAR ``D`` of sufficient length has both ends within
        *bp_window* of the breakpoints, or - failing that - if an ``SA``
        entry on the read's own contig starts within *bp_window* of the last
        deleted base. False otherwise, including when the read has no CIGAR.
    """
    cigar = read.cigartuples
    if cigar is None:
        return False

    # Smallest deletion accepted as evidence: a fraction of SVLEN, floored.
    threshold = max(MIN_CIGAR_BP, int(MIN_CIGAR_FRACTION * svlen))

    cursor = read.reference_start
    for op, span in cigar:
        # Only M(0), D(2), N(3), =(7) and X(8) advance the reference cursor.
        if op not in (0, 2, 3, 7, 8):
            continue
        if op == 2 and span >= threshold:
            near_start = abs(cursor - pos0) <= bp_window
            near_end = abs(cursor + span - end_excl0) <= bp_window
            if near_start and near_end:
                return True
        cursor += span

    # Fallback: split-read evidence, i.e. an SA alignment landing at END.
    last_deleted0 = end_excl0 - 1
    return any(
        contig == read.reference_name and abs((start1 - 1) - last_deleted0) <= bp_window
        for contig, start1, _strand in _parse_sa_tag(read)
    )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _supports_ins(
    read: pysam.AlignedSegment,
    *,
    pos0: int,
    svlen: int,
    bp_window: int,
) -> bool:
    """Decide whether *read* carries evidence for an insertion.

    A read supports the insertion when its CIGAR holds an ``I`` operation
    that is at least ``max(MIN_CIGAR_BP, MIN_CIGAR_FRACTION * svlen)`` long
    and whose reference anchor lies within *bp_window* of *pos0* (0-based).
    Reads without a CIGAR never support.
    """
    cigar = read.cigartuples
    if cigar is None:
        return False

    threshold = max(MIN_CIGAR_BP, int(MIN_CIGAR_FRACTION * svlen))

    cursor = read.reference_start
    for op, span in cigar:
        if op == 1:
            # Insertions do not move the cursor; it marks the anchor position.
            if span >= threshold and abs(cursor - pos0) <= bp_window:
                return True
        elif op in (0, 2, 3, 7, 8):
            # Reference-consuming operations: M, D, N, =, X.
            cursor += span
    return False
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _supports_bnd(
    read: pysam.AlignedSegment,
    *,
    pos0: int,
    chr2: str,
    pos2_1based: int,
    bp_window: int,
) -> bool:
    """Decide whether *read* carries split-read evidence for a breakend.

    Parameters
    ----------
    pos0
        0-based position of the local breakpoint.
    chr2, pos2_1based
        Partner locus (contig name and 1-based position).
    bp_window
        Tolerance (bp) for matching the ``SA`` start against the partner
        position; ten times this value is the slack allowed around the read
        when checking that it covers the local breakpoint.

    Returns
    -------
    bool
        True if the alignment covers *pos0* (with slack) and one of its
        ``SA`` entries lies on *chr2* within *bp_window* of the partner
        position.
    """
    slack = 10 * bp_window
    aln_start = read.reference_start
    aln_end = read.reference_end  # one past the last aligned base, or None
    if aln_end is None:
        aln_end = aln_start

    # The alignment must cover POS. Comparing POS with the alignment START
    # alone rejects long reads that begin far upstream and end at the
    # breakpoint, so the whole aligned span is tested instead.
    if pos0 < aln_start - slack or pos0 > aln_end + slack:
        return False

    partner0 = pos2_1based - 1
    for contig, start1, _strand in _parse_sa_tag(read):
        if contig != chr2:
            continue
        if abs((start1 - 1) - partner0) <= bp_window:
            return True
    return False
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _supports_inv(
    read: pysam.AlignedSegment,
    *,
    pos0: int,
    end0: int,
    bp_window: int,
) -> bool:
    """Decide whether *read* carries split-read evidence for an inversion.

    The read supports the inversion when one of its ``SA`` entries is on the
    read's own contig, maps to the opposite strand, and starts within
    *bp_window* of either breakpoint (*pos0* or *end0*, both 0-based).
    """
    own_strand = "-" if read.is_reverse else "+"

    for contig, start1, mate_strand in _parse_sa_tag(read):
        # Evidence needs the same contig and a strand flip.
        if contig != read.reference_name or mate_strand == own_strand:
            continue
        start0 = start1 - 1
        if min(abs(start0 - end0), abs(start0 - pos0)) <= bp_window:
            return True
    return False
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _read_supports_variant(
    read: pysam.AlignedSegment,
    svtype: str,
    *,
    pos0: int,
    end_excl0: int,
    svlen: int,
    bp_window: int,
    chr2: str | None = None,
    pos2: int | None = None,
) -> bool:
    """Dispatch to the evidence test that matches *svtype*.

    ``DEL``, ``INS`` and ``INV`` go to their dedicated helpers. ``BND`` goes
    to the breakend helper only when both *chr2* and *pos2* are known. Any
    other case (unknown type, or BND without a partner locus) falls back to
    "large insertion or large deletion near POS".
    """
    indel_kwargs = {"pos0": pos0, "svlen": svlen, "bp_window": bp_window}

    if svtype == "DEL":
        return _supports_del(read, end_excl0=end_excl0, **indel_kwargs)
    if svtype == "INS":
        return _supports_ins(read, **indel_kwargs)
    if svtype == "INV":
        # The inversion helper takes the inclusive 0-based end.
        return _supports_inv(read, pos0=pos0, end0=end_excl0 - 1, bp_window=bp_window)
    if svtype == "BND" and chr2 and pos2:
        return _supports_bnd(
            read,
            pos0=pos0,
            chr2=str(chr2),
            pos2_1based=int(pos2),
            bp_window=bp_window,
        )

    # Unknown / other SVs: conservative fallback - look for a large indel.
    if _supports_ins(read, **indel_kwargs):
        return True
    return _supports_del(read, end_excl0=end_excl0, **indel_kwargs)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _count_hp_sv_support(
    bam: pysam.AlignmentFile,
    chrom: str,
    rec: Variant,
    *,
    bp_window: int = DEFAULT_BP_WINDOW,
) -> tuple[int, int, int, str]:  # noqa: C901
    """Count ALT-supporting reads per haplotype for one SV record.

    Candidate reads are fetched from a window of +/- *bp_window* around POS
    and, for DEL/INV records whose END differs from POS, around END as well.
    Each query name is counted at most once and is attributed to the first
    ``HP`` tag value seen on any of its alignments.

    Returns
    -------
    tuple[int, int, int, str]
        ``(n1, n2, sv_end, alt)``: supporting reads tagged HP=1, supporting
        reads tagged HP=2, the record end (POS when the record has no end),
        and the comma-joined ALT string (``"<N>"`` when ALT is empty).
    """
    start1 = int(rec.POS)
    raw_end = getattr(rec, "end", None)
    sv_end = start1 if raw_end is None else int(raw_end)
    alt = ",".join(rec.ALT) if rec.ALT else "<N>"
    svtype = str(rec.INFO.get("SVTYPE", "NA"))
    svlen = _svlen_from_record(rec, start1, sv_end)

    def _window(center1: int) -> tuple[int, int]:
        # 1-based inclusive fetch window, clamped at the contig start.
        return max(1, center1 - bp_window), center1 + bp_window

    regions: list[tuple[int, int]] = [_window(start1)]
    if sv_end != start1 and svtype in {"DEL", "INV"}:
        regions.append(_window(sv_end))

    # Only breakends need a partner locus.
    chr2, pos2 = _parse_bnd_partner(alt, rec) if svtype == "BND" else (None, None)

    pos0 = start1 - 1
    end_excl0 = sv_end  # END is used directly as the 0-based exclusive end

    hp_of: dict[str, Any] = {}  # query name -> first HP value seen
    supporters: set[str] = set()  # query names with ALT evidence

    for read in _iter_candidate_reads(bam, chrom, regions):
        name = read.query_name
        if name is None:
            continue

        if hp_of.get(name) is None and read.has_tag("HP"):
            hp_of[name] = read.get_tag("HP")

        # A read already known to support needs no further evaluation.
        if name in supporters:
            continue

        if _read_supports_variant(
            read,
            svtype,
            pos0=pos0,
            end_excl0=end_excl0,
            svlen=svlen,
            bp_window=bp_window,
            chr2=chr2,
            pos2=pos2,
        ):
            supporters.add(name)

    # Supporting reads with no HP tag, or any other HP value, count for neither.
    n1 = sum(1 for name in supporters if hp_of.get(name) == 1)
    n2 = sum(1 for name in supporters if hp_of.get(name) == 2)
    return n1, n2, sv_end, alt
|
|
346
|
+
|
|
347
|
+
|
|
55
348
|
def _phase_chrom_worker(
|
|
56
349
|
chrom: str,
|
|
57
350
|
vcf_path: Path,
|
|
@@ -63,7 +356,7 @@ def _phase_chrom_worker(
|
|
|
63
356
|
|
|
64
357
|
rows: list[dict[str, object]] = []
|
|
65
358
|
|
|
66
|
-
# Try fast random access first, fall back to linear scan if
|
|
359
|
+
# Try fast random access first, fall back to linear scan if needed
|
|
67
360
|
use_region_iter = _has_tabix_index(vcf_path)
|
|
68
361
|
records_iter = (
|
|
69
362
|
rdr(f"{chrom}") if use_region_iter else (rec for rec in rdr if rec.CHROM == chrom)
|
|
@@ -71,26 +364,39 @@ def _phase_chrom_worker(
|
|
|
71
364
|
|
|
72
365
|
for rec in records_iter: # type: ignore[arg-type]
|
|
73
366
|
assert isinstance(rec, Variant)
|
|
74
|
-
sv_end = rec.end if getattr(rec, "end", None) is not None else rec.POS
|
|
75
|
-
n1, n2 = _count_hp_reads(bam, chrom, rec.POS, sv_end)
|
|
76
|
-
|
|
77
|
-
gt, gq = classify_haplotype(
|
|
78
|
-
n1,
|
|
79
|
-
n2,
|
|
80
|
-
min_support=opts.min_support,
|
|
81
|
-
major_delta=opts.major_delta,
|
|
82
|
-
equal_delta=opts.equal_delta,
|
|
83
|
-
)
|
|
84
367
|
|
|
85
|
-
|
|
368
|
+
n1, n2, sv_end, alt = _count_hp_sv_support(bam, chrom, rec)
|
|
369
|
+
|
|
370
|
+
# Respect caller genotype for homozygous ALT: do not infer 1|1 from balance.
|
|
371
|
+
is_hom_alt = False
|
|
372
|
+
if rec.genotypes:
|
|
373
|
+
# cyvcf2: [a1, a2, phased]
|
|
374
|
+
a1, a2 = rec.genotypes[0][0], rec.genotypes[0][1]
|
|
375
|
+
is_hom_alt = a1 == 1 and a2 == 1
|
|
376
|
+
|
|
377
|
+
if is_hom_alt:
|
|
378
|
+
gt, gq = "1|1", 0
|
|
379
|
+
else:
|
|
380
|
+
gt, gq = classify_haplotype(
|
|
381
|
+
n1,
|
|
382
|
+
n2,
|
|
383
|
+
min_support=opts.min_support,
|
|
384
|
+
major_delta=opts.major_delta,
|
|
385
|
+
equal_delta=opts.equal_delta,
|
|
386
|
+
)
|
|
387
|
+
|
|
388
|
+
row: dict[str, object] = dict(
|
|
86
389
|
chrom=chrom,
|
|
87
|
-
pos=rec.POS,
|
|
390
|
+
pos=int(rec.POS),
|
|
391
|
+
end=int(sv_end),
|
|
88
392
|
id=rec.ID or ".",
|
|
393
|
+
alt=alt,
|
|
89
394
|
svtype=rec.INFO.get("SVTYPE", "NA"),
|
|
90
395
|
n1=n1,
|
|
91
396
|
n2=n2,
|
|
92
397
|
gt=gt,
|
|
93
398
|
gq=gq,
|
|
399
|
+
gq_label=None,
|
|
94
400
|
)
|
|
95
401
|
|
|
96
402
|
if opts.gq_bins:
|
|
@@ -99,7 +405,7 @@ def _phase_chrom_worker(
|
|
|
99
405
|
row["gq_label"] = label
|
|
100
406
|
break
|
|
101
407
|
|
|
102
|
-
rows.append(row)
|
|
408
|
+
rows.append(row)
|
|
103
409
|
|
|
104
410
|
rdr.close()
|
|
105
411
|
bam.close()
|