svphaser 2.0.6__py3-none-any.whl → 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
svphaser/__init__.py CHANGED
@@ -1,7 +1,10 @@
1
1
  """Top-level SvPhaser package.
2
2
 
3
- Public surface kept tiny: a version string and a convenience helper
4
- that calls the library’s main phasing routine.
3
+ Public surface kept tiny:
4
+ - __version__
5
+ - a convenience `phase()` wrapper around svphaser.phasing.io.phase_vcf()
6
+
7
+ Defaults are chosen to match the recommended SvPhaser settings for long-read SV phasing.
5
8
  """
6
9
 
7
10
  from __future__ import annotations
@@ -11,7 +14,7 @@ from pathlib import Path
11
14
  # --------------------------------------------------------------------
12
15
  # Robust version lookup:
13
16
  # - Prefer installed package metadata (works for wheels and PEP 660 editables)
14
- # - Fall back to placeholder in _version.py for raw-source/dev use
17
+ # - Fall back to _version.py for raw-source/dev use
15
18
  # --------------------------------------------------------------------
16
19
  try:
17
20
  from importlib.metadata import version as _pkg_version # Python 3.8+
@@ -19,14 +22,14 @@ try:
19
22
  __version__ = _pkg_version("svphaser")
20
23
  except Exception:
21
24
  try:
22
- from ._version import __version__ # "0+unknown" in repo; overwritten in builds
25
+ from ._version import __version__ # overwritten in builds when using setuptools-scm
23
26
  except Exception: # highly defensive
24
27
  __version__ = "0+unknown"
25
28
 
26
29
  # Centralized defaults (keep CLI in sync)
27
30
  DEFAULT_MIN_SUPPORT: int = 10
28
- DEFAULT_MAJOR_DELTA: float = 0.70
29
- DEFAULT_EQUAL_DELTA: float = 0.25
31
+ DEFAULT_MAJOR_DELTA: float = 0.60
32
+ DEFAULT_EQUAL_DELTA: float = 0.10
30
33
  DEFAULT_GQ_BINS: str = "30:High,10:Moderate"
31
34
 
32
35
 
@@ -44,8 +47,10 @@ def phase(
44
47
  ) -> tuple[Path, Path]:
45
48
  """Phase *sv_vcf* using HP-tagged *bam*, writing outputs into *out_dir*.
46
49
 
47
- Thin wrapper around :py:func:`svphaser.phasing.io.phase_vcf` so users/tests
48
- can skip importing submodules.
50
+ Notes
51
+ -----
52
+ - Step B semantics: `min_support` is applied to TOTAL ALT-supporting reads (n1+n2).
53
+ - Near-ties (<= equal_delta) are treated as ambiguous (./.), not homozygous ALT.
49
54
 
50
55
  Returns
51
56
  -------
@@ -66,13 +71,13 @@ def phase(
66
71
  out_csv = out_dir_p / f"{stem}_phased.csv"
67
72
 
68
73
  phase_vcf(
69
- sv_vcf,
70
- bam,
71
- out_dir=out_dir_p, # type: ignore[arg-type]
74
+ Path(sv_vcf),
75
+ Path(bam),
76
+ out_dir=out_dir_p,
72
77
  min_support=min_support,
73
78
  major_delta=major_delta,
74
79
  equal_delta=equal_delta,
75
- gq_bins=gq_bins, # type: ignore[arg-type]
80
+ gq_bins=gq_bins,
76
81
  threads=threads,
77
82
  )
78
83
  return out_vcf, out_csv
svphaser/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '2.0.6'
32
- __version_tuple__ = version_tuple = (2, 0, 6)
31
+ __version__ = version = '2.1.2'
32
+ __version_tuple__ = version_tuple = (2, 1, 2)
33
33
 
34
34
  __commit_id__ = commit_id = None
svphaser/cli.py CHANGED
@@ -1,14 +1,14 @@
1
1
  #!/usr/bin/env python3
2
- """
3
- svphaser.cli
2
+ """svphaser.cli
4
3
  ============
5
4
  Command-line interface for **SvPhaser**.
6
5
 
7
6
  The program writes two files inside **--out-dir** (or the CWD):
8
7
 
9
- * ``<stem>_phased.vcf`` (uncompressed; GT/GQ and optional HP_GQBIN injected)
10
- * ``<stem>_phased.csv`` (tabular summary including gq_label column)
8
+ * ``<stem>_phased.vcf`` (uncompressed; GT/GQ injected; optional INFO=GQBIN)
9
+ * ``<stem>_phased.csv`` (tabular summary incl. n1/n2/gt/gq and optional gq_label)
11
10
  """
11
+
12
12
  from __future__ import annotations
13
13
 
14
14
  from pathlib import Path
@@ -27,7 +27,7 @@ from svphaser import (
27
27
  app = typer.Typer(add_completion=False, rich_markup_mode="rich")
28
28
 
29
29
 
30
- def _version_callback(value: bool):
30
+ def _version_callback(value: bool) -> None:
31
31
  if value:
32
32
  typer.echo(__version__)
33
33
  raise typer.Exit()
@@ -44,30 +44,20 @@ def main(
44
44
  callback=_version_callback,
45
45
  ),
46
46
  ] = None
47
- ):
47
+ ) -> None:
48
48
  """SvPhaser – Structural-variant phasing from HP-tagged long-read BAMs."""
49
- # no-op; callback handles --version
50
49
  return
51
50
 
52
51
 
53
- # ──────────────────────────────────────────────────────────────────────────
54
- # phase command
55
- # ──────────────────────────────────────────────────────────────────────────
56
52
  @app.command("phase")
57
53
  def phase_cmd(
58
54
  sv_vcf: Annotated[
59
55
  Path,
60
- typer.Argument(
61
- exists=True,
62
- help="Input *un-phased* SV VCF (.vcf or .vcf.gz)",
63
- ),
56
+ typer.Argument(exists=True, help="Input *un-phased* SV VCF (.vcf or .vcf.gz)"),
64
57
  ],
65
58
  bam: Annotated[
66
59
  Path,
67
- typer.Argument(
68
- exists=True,
69
- help="Long-read BAM/CRAM with HP tags",
70
- ),
60
+ typer.Argument(exists=True, help="Long-read BAM/CRAM with HP tags"),
71
61
  ],
72
62
  out_dir: Annotated[
73
63
  Path,
@@ -90,9 +80,8 @@ def phase_cmd(
90
80
  int,
91
81
  typer.Option(
92
82
  help=(
93
- "Minimum HP-tagged reads per haplotype. "
94
- "SVs where *both* n1 AND n2 fall below this "
95
- "are dropped entirely."
83
+ "Minimum TOTAL ALT-supporting reads required to keep an SV (n1+n2). "
84
+ "If (n1+n2) < min_support the SV is dropped (written to *_dropped_svs.csv)."
96
85
  ),
97
86
  show_default=True,
98
87
  ),
@@ -100,14 +89,14 @@ def phase_cmd(
100
89
  major_delta: Annotated[
101
90
  float,
102
91
  typer.Option(
103
- help="r >= this ⇒ strong majority ⇒ GT 1|0 or 0|1",
92
+ help="max(n1,n2)/N >= this ⇒ strong majority ⇒ GT 1|0 or 0|1",
104
93
  show_default=True,
105
94
  ),
106
95
  ] = DEFAULT_MAJOR_DELTA,
107
96
  equal_delta: Annotated[
108
97
  float,
109
98
  typer.Option(
110
- help="|n1−n2|/N this ⇒ near-tie ⇒ GT 1|1",
99
+ help="|n1−n2|/N <= this ⇒ near-tie ⇒ GT ./. (ambiguous)",
111
100
  show_default=True,
112
101
  ),
113
102
  ] = DEFAULT_EQUAL_DELTA,
@@ -116,9 +105,8 @@ def phase_cmd(
116
105
  str,
117
106
  typer.Option(
118
107
  help=(
119
- "Comma-separated GQ≥threshold:Label definitions "
120
- "(e.g. '30:High,10:Moderate'). Labels appear in the CSV "
121
- "[gq_label] and in the VCF INFO field HP_GQBIN when set."
108
+ "Comma-separated GQ≥threshold:Label definitions (e.g. '30:High,10:Moderate'). "
109
+ "Labels appear in CSV column [gq_label] and in the VCF INFO field GQBIN."
122
110
  ),
123
111
  show_default=True,
124
112
  ),
@@ -134,13 +122,11 @@ def phase_cmd(
134
122
  ),
135
123
  ] = None,
136
124
  ) -> None:
137
- """Phase structural variants using HP-tagged read evidence."""
138
- # Initialise logging BEFORE we import anything that might log
125
+ """Phase structural variants using SV-type-aware ALT-support evidence."""
139
126
  from svphaser.logging import init as _init_logging
140
127
 
141
- _init_logging("INFO") # or "DEBUG" if you want more detail
128
+ _init_logging("INFO")
142
129
 
143
- # Resolve output paths
144
130
  if not out_dir.exists():
145
131
  out_dir.mkdir(parents=True)
146
132
 
@@ -153,25 +139,21 @@ def phase_cmd(
153
139
  out_vcf = out_dir / f"{stem}_phased.vcf"
154
140
  out_csv = out_dir / f"{stem}_phased.csv"
155
141
 
156
- # Lazy import so `svphaser --help` works without heavy deps
157
142
  from svphaser.phasing.io import phase_vcf
158
143
 
159
144
  try:
160
145
  phase_vcf(
161
146
  sv_vcf,
162
147
  bam,
163
- out_dir=out_dir, # type: ignore[arg-type]
148
+ out_dir=out_dir,
164
149
  min_support=min_support,
165
150
  major_delta=major_delta,
166
151
  equal_delta=equal_delta,
167
- gq_bins=gq_bins, # type: ignore[arg-type]
152
+ gq_bins=gq_bins,
168
153
  threads=threads,
169
154
  )
170
155
  typer.secho(f"✔ Phased VCF → {out_vcf}", fg=typer.colors.GREEN)
171
156
  typer.secho(f"✔ Phased CSV → {out_csv}", fg=typer.colors.GREEN)
172
- except Exception: # pragma: no cover
173
- typer.secho(
174
- "[SvPhaser] 💥 Unhandled error during phasing",
175
- fg=typer.colors.RED,
176
- )
157
+ except Exception:
158
+ typer.secho("[SvPhaser] 💥 Unhandled error during phasing", fg=typer.colors.RED)
177
159
  raise
@@ -1,18 +1,26 @@
1
- """
2
- svphaser.phasing._workers
1
+ """svphaser.phasing._workers
3
2
  =========================
4
- Worker-process code. Each worker:
3
+ Worker-process code.
4
+
5
+ Step B update (biological correctness):
6
+ - `n1/n2` are now *ALT-supporting read counts* per haplotype,
7
+ not raw overlap/coverage.
8
+ - Evidence is SV-type-aware:
9
+ - DEL: large CIGAR 'D' spanning breakpoints (and optional split-read via SA)
10
+ - INS: large CIGAR 'I' near POS
11
+ - BND: split-read via SA linking to partner chrom:pos
12
+ - INV: split-read via SA to the END breakpoint with strand flip
5
13
 
6
- 1. Opens the (possibly un-indexed) SV VCF.
7
- 2. Scans only the records for *its* chromosome.
8
- 3. Counts HP-tagged reads in the long-read BAM/CRAM.
9
- 4. Classifies the haplotype + GQ, adds optional GQ-bin label.
10
- 5. Returns a DataFrame to the parent.
14
+ This is designed to match the reads that IGV typically displays as SV-supporting,
15
+ so per-haplotype counts are usually much lower than raw coverage (e.g. 5/8 rather than 27/30).
11
16
  """
12
17
 
13
18
  from __future__ import annotations
14
19
 
20
+ import re
21
+ from collections.abc import Iterable
15
22
  from pathlib import Path
23
+ from typing import Any
16
24
 
17
25
  import pandas as pd
18
26
  import pysam
@@ -23,35 +31,320 @@ from .types import WorkerOpts
23
31
 
24
32
  __all__ = ["_phase_chrom_worker"]
25
33
 
34
+ # Default breakpoint window (bp) used for evidence gathering.
35
+ DEFAULT_BP_WINDOW = 100
26
36
 
27
- def _count_hp_reads(
28
- bam: pysam.AlignmentFile,
29
- chrom: str,
30
- start: int,
31
- end: int,
32
- ) -> tuple[int, int]:
33
- hp1 = hp2 = 0
34
- for read in bam.fetch(chrom, max(0, start - 1), end + 1):
35
- if read.is_unmapped or read.is_secondary or read.is_supplementary:
36
- continue
37
- if not read.has_tag("HP"):
38
- continue
39
- tag = read.get_tag("HP")
40
- if tag == 1:
41
- hp1 += 1
42
- elif tag == 2:
43
- hp2 += 1
44
- return hp1, hp2
37
+ # Require event size in the read CIGAR to be at least this fraction of SVLEN.
38
+ MIN_CIGAR_FRACTION = 0.30
39
+
40
+ # And at least this many bp (avoid counting tiny indels / alignment noise).
41
+ MIN_CIGAR_BP = 30
42
+
43
+ # ALT for BND often contains partner like "]chr3:198172833]N" or "N]chr5:181462057]".
44
+ _BND_RE = re.compile(r"[\[\]]([^:\[\]]+):(\d+)[\[\]]")
45
45
 
46
46
 
47
47
  def _has_tabix_index(vcf_path: Path) -> bool:
48
- """Return True if <file>.tbi or <file>.csi exists (supports .vcf.gz.{tbi,csi})."""
48
+ """Return True if <file>.tbi or <file>.csi exists."""
49
49
  return (
50
50
  vcf_path.with_suffix(vcf_path.suffix + ".tbi").exists()
51
51
  or vcf_path.with_suffix(vcf_path.suffix + ".csi").exists()
52
52
  )
53
53
 
54
54
 
55
+ def _coerce_int(x: Any) -> int | None:
56
+ """Convert cyvcf2 INFO values to int if possible."""
57
+ if x is None:
58
+ return None
59
+ if isinstance(x, (list, tuple)):
60
+ return _coerce_int(x[0]) if x else None
61
+ try:
62
+ return int(x)
63
+ except Exception:
64
+ return None
65
+
66
+
67
+ def _svlen_from_record(rec: Variant, pos: int, end: int) -> int:
68
+ svlen = _coerce_int(rec.INFO.get("SVLEN"))
69
+ if svlen is None:
70
+ # VCF END is 1-based inclusive for DEL; length approx END-POS+1
71
+ return abs(end - pos) + 1
72
+ return abs(svlen)
73
+
74
+
75
+ def _parse_bnd_partner(alt: str, rec: Variant) -> tuple[str | None, int | None]:
76
+ """Return (chr2, pos2) for BND if possible."""
77
+ m = _BND_RE.search(alt)
78
+ if m:
79
+ return m.group(1), int(m.group(2))
80
+ chr2 = rec.INFO.get("CHR2")
81
+ # Some callers store partner position in INFO; Sniffles2 uses ALT string primarily.
82
+ return (str(chr2), None) if chr2 else (None, None)
83
+
84
+
85
+ def _parse_sa_tag(read: pysam.AlignedSegment) -> list[tuple[str, int, str]]:
86
+ """Parse SA tag into list of (rname, pos1, strand). pos1 is 1-based."""
87
+ if not read.has_tag("SA"):
88
+ return []
89
+ sa_raw = read.get_tag("SA")
90
+ out: list[tuple[str, int, str]] = []
91
+ for entry in str(sa_raw).split(";"):
92
+ if not entry:
93
+ continue
94
+ # rname,pos,strand,cigar,mapq,nm
95
+ parts = entry.split(",")
96
+ if len(parts) < 3:
97
+ continue
98
+ rname = parts[0]
99
+ try:
100
+ pos1 = int(parts[1])
101
+ except ValueError:
102
+ continue
103
+ strand = parts[2]
104
+ out.append((rname, pos1, strand))
105
+ return out
106
+
107
+
108
+ def _iter_candidate_reads(
109
+ bam: pysam.AlignmentFile,
110
+ chrom: str,
111
+ regions_1based: list[tuple[int, int]],
112
+ ) -> Iterable[pysam.AlignedSegment]:
113
+ """Yield reads from multiple 1-based-inclusive regions, allowing overlap."""
114
+ for start1, end1 in regions_1based:
115
+ # pysam fetch: 0-based start, end-exclusive
116
+ start0 = max(0, start1 - 1)
117
+ end0 = max(0, end1) # end1 is inclusive 1-based, so end-exclusive 0-based is end1
118
+ for read in bam.fetch(chrom, start0, end0):
119
+ if read.is_unmapped or read.is_secondary:
120
+ continue
121
+ yield read
122
+
123
+
124
+ def _supports_del(
125
+ read: pysam.AlignedSegment,
126
+ *,
127
+ pos0: int,
128
+ end_excl0: int,
129
+ svlen: int,
130
+ bp_window: int,
131
+ ) -> bool:
132
+ """DEL support: a large CIGAR deletion spanning POS..END (within window).
133
+
134
+ pos0: 0-based start breakpoint
135
+ end_excl0: 0-based exclusive end breakpoint (VCF END maps naturally to this)
136
+ """
137
+ if read.cigartuples is None:
138
+ return False
139
+
140
+ min_len = max(MIN_CIGAR_BP, int(MIN_CIGAR_FRACTION * svlen))
141
+
142
+ ref = read.reference_start
143
+ for op, length in read.cigartuples:
144
+ if op in (0, 7, 8):
145
+ ref += length
146
+ elif op in (2, 3):
147
+ # deletion / ref skip
148
+ if op == 2 and length >= min_len:
149
+ del_start = ref
150
+ del_end = ref + length
151
+ if abs(del_start - pos0) <= bp_window and abs(del_end - end_excl0) <= bp_window:
152
+ return True
153
+ ref += length
154
+ elif op in (1, 4, 5, 6):
155
+ # insertion or clipping: does not consume reference
156
+ continue
157
+
158
+ # Fallback: split-read evidence (SA) hitting END region
159
+ for rname, sa_pos1, _strand in _parse_sa_tag(read):
160
+ if rname != read.reference_name:
161
+ continue
162
+ sa_pos0 = sa_pos1 - 1
163
+ if abs(sa_pos0 - (end_excl0 - 1)) <= bp_window:
164
+ return True
165
+
166
+ return False
167
+
168
+
169
+ def _supports_ins(
170
+ read: pysam.AlignedSegment,
171
+ *,
172
+ pos0: int,
173
+ svlen: int,
174
+ bp_window: int,
175
+ ) -> bool:
176
+ """INS support: a large CIGAR insertion near POS (within window)."""
177
+ if read.cigartuples is None:
178
+ return False
179
+
180
+ min_len = max(MIN_CIGAR_BP, int(MIN_CIGAR_FRACTION * svlen))
181
+
182
+ ref = read.reference_start
183
+ for op, length in read.cigartuples:
184
+ if op in (0, 7, 8):
185
+ ref += length
186
+ elif op == 1:
187
+ if length >= min_len and abs(ref - pos0) <= bp_window:
188
+ return True
189
+ elif op in (2, 3):
190
+ ref += length
191
+ else:
192
+ continue
193
+ return False
194
+
195
+
196
+ def _supports_bnd(
197
+ read: pysam.AlignedSegment,
198
+ *,
199
+ pos0: int,
200
+ chr2: str,
201
+ pos2_1based: int,
202
+ bp_window: int,
203
+ ) -> bool:
204
+ """BND support: SA tag links to partner chrom:pos."""
205
+ # Alignment must start within 10 * bp_window of POS (caller already fetched around POS, but keep conservative)
206
+ if abs(read.reference_start - pos0) > 10 * bp_window:
207
+ return False
208
+
209
+ pos2_0 = pos2_1based - 1
210
+ for rname, sa_pos1, _strand in _parse_sa_tag(read):
211
+ if rname != chr2:
212
+ continue
213
+ if abs((sa_pos1 - 1) - pos2_0) <= bp_window:
214
+ return True
215
+ return False
216
+
217
+
218
+ def _supports_inv(
219
+ read: pysam.AlignedSegment,
220
+ *,
221
+ pos0: int,
222
+ end0: int,
223
+ bp_window: int,
224
+ ) -> bool:
225
+ """INV support: SA to the other breakpoint on same chrom with strand flip."""
226
+ strand_primary = "-" if read.is_reverse else "+"
227
+
228
+ for rname, sa_pos1, sa_strand in _parse_sa_tag(read):
229
+ if rname != read.reference_name:
230
+ continue
231
+ sa_pos0 = sa_pos1 - 1
232
+ if abs(sa_pos0 - end0) <= bp_window and sa_strand != strand_primary:
233
+ return True
234
+ if abs(sa_pos0 - pos0) <= bp_window and sa_strand != strand_primary:
235
+ return True
236
+ return False
237
+
238
+
239
+ def _read_supports_variant(
240
+ read: pysam.AlignedSegment,
241
+ svtype: str,
242
+ *,
243
+ pos0: int,
244
+ end_excl0: int,
245
+ svlen: int,
246
+ bp_window: int,
247
+ chr2: str | None = None,
248
+ pos2: int | None = None,
249
+ ) -> bool:
250
+ """Return True if the read provides evidence for the given SV type.
251
+
252
+ This is a thin wrapper around the type-specific support helpers so the
253
+ main counting routine remains small and easier to read/test.
254
+ """
255
+ if svtype == "DEL":
256
+ return _supports_del(read, pos0=pos0, end_excl0=end_excl0, svlen=svlen, bp_window=bp_window)
257
+ if svtype == "INS":
258
+ return _supports_ins(read, pos0=pos0, svlen=svlen, bp_window=bp_window)
259
+ if svtype == "BND" and chr2 and pos2:
260
+ return _supports_bnd(
261
+ read,
262
+ pos0=pos0,
263
+ chr2=str(chr2),
264
+ pos2_1based=int(pos2),
265
+ bp_window=bp_window,
266
+ )
267
+ if svtype == "INV":
268
+ return _supports_inv(read, pos0=pos0, end0=(end_excl0 - 1), bp_window=bp_window)
269
+
270
+ # Unknown / other SVs: conservative fallback – look for large indel near POS.
271
+ return _supports_ins(read, pos0=pos0, svlen=svlen, bp_window=bp_window) or _supports_del(
272
+ read, pos0=pos0, end_excl0=end_excl0, svlen=svlen, bp_window=bp_window
273
+ )
274
+
275
+
276
+ def _count_hp_sv_support(
277
+ bam: pysam.AlignmentFile,
278
+ chrom: str,
279
+ rec: Variant,
280
+ *,
281
+ bp_window: int = DEFAULT_BP_WINDOW,
282
+ ) -> tuple[int, int, int, str]: # noqa: C901
283
+ """Return (n1, n2, sv_end, alt_str) where n1/n2 are ALT-support counts."""
284
+
285
+ pos1 = int(rec.POS)
286
+ sv_end = int(rec.end) if getattr(rec, "end", None) is not None else pos1
287
+ alt = ",".join(rec.ALT) if rec.ALT else "<N>"
288
+ svtype = str(rec.INFO.get("SVTYPE", "NA"))
289
+
290
+ svlen = _svlen_from_record(rec, pos1, sv_end)
291
+
292
+ # Build regions (1-based inclusive) to fetch candidates.
293
+ regions: list[tuple[int, int]] = [(max(1, pos1 - bp_window), pos1 + bp_window)]
294
+ if svtype in {"DEL", "INV"} and sv_end != pos1:
295
+ regions.append((max(1, sv_end - bp_window), sv_end + bp_window))
296
+
297
+ # For BND we need the partner locus
298
+ chr2 = None
299
+ pos2 = None
300
+ if svtype == "BND":
301
+ chr2, pos2 = _parse_bnd_partner(alt, rec)
302
+
303
+ # Deduplicate by query name: count each read once if it supports ALT.
304
+ # Track hp per read name (in case different segments appear).
305
+ state: dict[str, dict[str, Any]] = {}
306
+
307
+ pos0 = pos1 - 1
308
+ end_excl0 = sv_end # see note in _supports_del: END maps to 0-based exclusive
309
+
310
+ for read in _iter_candidate_reads(bam, chrom, regions):
311
+ qn = read.query_name
312
+ if qn is None:
313
+ continue
314
+ st = state.setdefault(qn, {"hp": None, "support": False})
315
+
316
+ if st["hp"] is None and read.has_tag("HP"):
317
+ st["hp"] = read.get_tag("HP")
318
+
319
+ # If we already know this read supports, we can skip extra work
320
+ if st["support"] is True:
321
+ continue
322
+
323
+ if _read_supports_variant(
324
+ read,
325
+ svtype,
326
+ pos0=pos0,
327
+ end_excl0=end_excl0,
328
+ svlen=svlen,
329
+ bp_window=bp_window,
330
+ chr2=chr2,
331
+ pos2=pos2,
332
+ ):
333
+ st["support"] = True
334
+
335
+ n1 = n2 = 0
336
+ for st in state.values():
337
+ if not st["support"]:
338
+ continue
339
+ hp = st["hp"]
340
+ if hp == 1:
341
+ n1 += 1
342
+ elif hp == 2:
343
+ n2 += 1
344
+
345
+ return n1, n2, sv_end, alt
346
+
347
+
55
348
  def _phase_chrom_worker(
56
349
  chrom: str,
57
350
  vcf_path: Path,
@@ -63,7 +356,7 @@ def _phase_chrom_worker(
63
356
 
64
357
  rows: list[dict[str, object]] = []
65
358
 
66
- # Try fast random access first, fall back to linear scan if that fails
359
+ # Try fast random access first, fall back to linear scan if needed
67
360
  use_region_iter = _has_tabix_index(vcf_path)
68
361
  records_iter = (
69
362
  rdr(f"{chrom}") if use_region_iter else (rec for rec in rdr if rec.CHROM == chrom)
@@ -71,26 +364,39 @@ def _phase_chrom_worker(
71
364
 
72
365
  for rec in records_iter: # type: ignore[arg-type]
73
366
  assert isinstance(rec, Variant)
74
- sv_end = rec.end if getattr(rec, "end", None) is not None else rec.POS
75
- n1, n2 = _count_hp_reads(bam, chrom, rec.POS, sv_end)
76
-
77
- gt, gq = classify_haplotype(
78
- n1,
79
- n2,
80
- min_support=opts.min_support,
81
- major_delta=opts.major_delta,
82
- equal_delta=opts.equal_delta,
83
- )
84
367
 
85
- row = dict(
368
+ n1, n2, sv_end, alt = _count_hp_sv_support(bam, chrom, rec)
369
+
370
+ # Respect caller genotype for homozygous ALT: do not infer 1|1 from balance.
371
+ is_hom_alt = False
372
+ if rec.genotypes:
373
+ # cyvcf2: [a1, a2, phased]
374
+ a1, a2 = rec.genotypes[0][0], rec.genotypes[0][1]
375
+ is_hom_alt = a1 == 1 and a2 == 1
376
+
377
+ if is_hom_alt:
378
+ gt, gq = "1|1", 0
379
+ else:
380
+ gt, gq = classify_haplotype(
381
+ n1,
382
+ n2,
383
+ min_support=opts.min_support,
384
+ major_delta=opts.major_delta,
385
+ equal_delta=opts.equal_delta,
386
+ )
387
+
388
+ row: dict[str, object] = dict(
86
389
  chrom=chrom,
87
- pos=rec.POS, # cyvcf2 POS is already 1-based
390
+ pos=int(rec.POS),
391
+ end=int(sv_end),
88
392
  id=rec.ID or ".",
393
+ alt=alt,
89
394
  svtype=rec.INFO.get("SVTYPE", "NA"),
90
395
  n1=n1,
91
396
  n2=n2,
92
397
  gt=gt,
93
398
  gq=gq,
399
+ gq_label=None,
94
400
  )
95
401
 
96
402
  if opts.gq_bins:
@@ -99,7 +405,7 @@ def _phase_chrom_worker(
99
405
  row["gq_label"] = label
100
406
  break
101
407
 
102
- rows.append(row) # type: ignore[assignment]
408
+ rows.append(row)
103
409
 
104
410
  rdr.close()
105
411
  bam.close()