svphaser 2.0.6__py3-none-any.whl → 2.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- svphaser/__init__.py +17 -12
- svphaser/_version.py +2 -2
- svphaser/cli.py +20 -38
- svphaser/phasing/_workers.py +347 -41
- svphaser/phasing/algorithms.py +22 -5
- svphaser/phasing/io.py +140 -31
- svphaser/phasing/types.py +12 -5
- svphaser-2.1.2.dist-info/METADATA +231 -0
- svphaser-2.1.2.dist-info/RECORD +16 -0
- {svphaser-2.0.6.dist-info → svphaser-2.1.2.dist-info}/WHEEL +1 -1
- svphaser-2.0.6.dist-info/METADATA +0 -203
- svphaser-2.0.6.dist-info/RECORD +0 -16
- {svphaser-2.0.6.dist-info → svphaser-2.1.2.dist-info}/entry_points.txt +0 -0
- {svphaser-2.0.6.dist-info → svphaser-2.1.2.dist-info}/licenses/LICENSE +0 -0
svphaser/phasing/algorithms.py
CHANGED
|
@@ -3,6 +3,12 @@
|
|
|
3
3
|
1) Exact binomial tail for small depth (N ≤ 200).
|
|
4
4
|
2) Continuity-corrected normal approximation for deep coverage (N > 200).
|
|
5
5
|
3) Phred GQ capped at 99.
|
|
6
|
+
|
|
7
|
+
Step B semantics:
|
|
8
|
+
- `min_support` is interpreted as a *total ALT-support* threshold (n1+n2).
|
|
9
|
+
- Near-ties are treated as *ambiguous* (./.), not homozygous ALT.
|
|
10
|
+
Homozygous ALT (1|1) should come from the caller's genotype (input VCF),
|
|
11
|
+
not from a balance test.
|
|
6
12
|
"""
|
|
7
13
|
|
|
8
14
|
from __future__ import annotations
|
|
@@ -45,20 +51,31 @@ def classify_haplotype(
|
|
|
45
51
|
n2: int,
|
|
46
52
|
*,
|
|
47
53
|
min_support: int = 10,
|
|
48
|
-
major_delta: float = 0.
|
|
54
|
+
major_delta: float = 0.60,
|
|
49
55
|
equal_delta: float = 0.10,
|
|
50
56
|
) -> tuple[str, int]:
|
|
57
|
+
"""Classify which haplotype carries the ALT allele.
|
|
58
|
+
|
|
59
|
+
Returns:
|
|
60
|
+
- GT: "1|0" (ALT on hap1) or "0|1" (ALT on hap2) or "./." (ambiguous)
|
|
61
|
+
- GQ: phred-scaled confidence based on haplotype imbalance
|
|
62
|
+
|
|
63
|
+
Notes:
|
|
64
|
+
- `min_support` is applied to total ALT-support reads.
|
|
65
|
+
- Near-ties (<= equal_delta) are treated as ambiguous (./.).
|
|
66
|
+
"""
|
|
67
|
+
|
|
51
68
|
total = n1 + n2
|
|
52
|
-
if
|
|
69
|
+
if total <= 0:
|
|
53
70
|
return "./.", 0
|
|
54
|
-
if total
|
|
71
|
+
if total < min_support:
|
|
55
72
|
return "./.", 0
|
|
56
73
|
|
|
57
74
|
gq = phasing_gq(n1, n2)
|
|
58
75
|
|
|
59
|
-
# 1) near-tie
|
|
76
|
+
# 1) near-tie → ambiguous
|
|
60
77
|
if abs(n1 - n2) / total <= equal_delta:
|
|
61
|
-
return "
|
|
78
|
+
return "./.", gq
|
|
62
79
|
|
|
63
80
|
# 2) strong majority → heterozygous phased
|
|
64
81
|
r1 = n1 / total
|
svphaser/phasing/io.py
CHANGED
|
@@ -1,30 +1,56 @@
|
|
|
1
|
-
"""
|
|
2
|
-
svphaser.phasing.io
|
|
1
|
+
"""svphaser.phasing.io
|
|
3
2
|
===================
|
|
4
3
|
High-level “engine” – orchestrates per-chromosome workers, merges results,
|
|
5
|
-
applies the global
|
|
4
|
+
applies the global support filter, then writes CSV + VCF.
|
|
5
|
+
|
|
6
|
+
Step B update (biological correctness):
|
|
7
|
+
- `min_support` is interpreted as a *total ALT-support* threshold (n1+n2).
|
|
8
|
+
The filter drops an SV only if (n1+n2) < min_support.
|
|
6
9
|
|
|
7
|
-
|
|
8
|
-
|
|
10
|
+
Step A fixes retained:
|
|
11
|
+
- collision-resistant VCF record matching using (CHROM, POS, ID, END, ALT)
|
|
12
|
+
- correct INFO composition (no duplicated keys; proper FLAG handling)
|
|
13
|
+
- typing fixes to satisfy Pylance/Mypy
|
|
9
14
|
"""
|
|
10
15
|
|
|
11
16
|
from __future__ import annotations
|
|
12
17
|
|
|
13
18
|
import logging
|
|
19
|
+
import math
|
|
14
20
|
import multiprocessing as mp
|
|
15
21
|
from pathlib import Path
|
|
22
|
+
from typing import Any, TypedDict
|
|
16
23
|
|
|
17
24
|
import pandas as pd
|
|
18
25
|
from cyvcf2 import Reader
|
|
19
26
|
|
|
20
27
|
from ._workers import _phase_chrom_worker
|
|
21
|
-
from .types import GQBin, WorkerOpts
|
|
28
|
+
from .types import GQBin, SVKey, SVKeyLegacy, WorkerOpts
|
|
22
29
|
|
|
23
30
|
__all__ = ["phase_vcf"]
|
|
24
31
|
|
|
25
32
|
logger = logging.getLogger(__name__)
|
|
26
33
|
|
|
27
34
|
|
|
35
|
+
class VcfRec(TypedDict):
|
|
36
|
+
REF: str
|
|
37
|
+
ALT: str
|
|
38
|
+
QUAL: object
|
|
39
|
+
FILTER: str
|
|
40
|
+
INFO: dict[str, Any]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _is_missing_scalar(x: Any) -> bool:
|
|
44
|
+
"""True for None / NaN / empty string."""
|
|
45
|
+
if x is None:
|
|
46
|
+
return True
|
|
47
|
+
if isinstance(x, float) and math.isnan(x):
|
|
48
|
+
return True
|
|
49
|
+
if isinstance(x, str) and x.strip() == "":
|
|
50
|
+
return True
|
|
51
|
+
return False
|
|
52
|
+
|
|
53
|
+
|
|
28
54
|
def phase_vcf(
|
|
29
55
|
sv_vcf: Path,
|
|
30
56
|
bam: Path,
|
|
@@ -56,7 +82,7 @@ def phase_vcf(
|
|
|
56
82
|
thr_s, lbl = thr_lbl.split(":")
|
|
57
83
|
except ValueError as err:
|
|
58
84
|
raise ValueError(
|
|
59
|
-
f"Invalid gq-bin specifier: '{thr_lbl}'.
|
|
85
|
+
f"Invalid gq-bin specifier: '{thr_lbl}'. Use '30:High,10:Moderate'."
|
|
60
86
|
) from err
|
|
61
87
|
bins.append((int(thr_s), lbl))
|
|
62
88
|
bins.sort(key=lambda x: x[0], reverse=True)
|
|
@@ -89,7 +115,6 @@ def phase_vcf(
|
|
|
89
115
|
ctx = mp.get_context("spawn")
|
|
90
116
|
|
|
91
117
|
if threads == 1:
|
|
92
|
-
# Serial path is handy for debugging
|
|
93
118
|
for args in worker_args:
|
|
94
119
|
df = _phase_chrom_worker(*args)
|
|
95
120
|
dataframes.append(df)
|
|
@@ -102,16 +127,29 @@ def phase_vcf(
|
|
|
102
127
|
chrom = df.iloc[0]["chrom"] if not df.empty else "?"
|
|
103
128
|
logger.info("chr %-6s ✔ phased %5d SVs", chrom, len(df))
|
|
104
129
|
|
|
105
|
-
# 5 ─ Merge & apply *global*
|
|
130
|
+
# 5 ─ Merge & apply *global* support filter (Step B: total ALT-support)
|
|
106
131
|
if dataframes:
|
|
107
132
|
merged = pd.concat(dataframes, ignore_index=True)
|
|
108
133
|
else:
|
|
109
134
|
merged = pd.DataFrame(
|
|
110
|
-
columns=[
|
|
135
|
+
columns=[
|
|
136
|
+
"chrom",
|
|
137
|
+
"pos",
|
|
138
|
+
"end",
|
|
139
|
+
"id",
|
|
140
|
+
"alt",
|
|
141
|
+
"svtype",
|
|
142
|
+
"n1",
|
|
143
|
+
"n2",
|
|
144
|
+
"gt",
|
|
145
|
+
"gq",
|
|
146
|
+
"gq_label",
|
|
147
|
+
]
|
|
111
148
|
)
|
|
112
149
|
|
|
113
150
|
pre = len(merged)
|
|
114
|
-
|
|
151
|
+
total_support = merged["n1"].astype(int) + merged["n2"].astype(int)
|
|
152
|
+
keep = total_support >= int(min_support)
|
|
115
153
|
|
|
116
154
|
stem = sv_vcf.name.removesuffix(".vcf.gz").removesuffix(".vcf")
|
|
117
155
|
|
|
@@ -122,7 +160,7 @@ def phase_vcf(
|
|
|
122
160
|
|
|
123
161
|
kept = merged.loc[keep].reset_index(drop=True)
|
|
124
162
|
if dropped := pre - len(kept):
|
|
125
|
-
logger.info("
|
|
163
|
+
logger.info("Support filter removed %d SVs", dropped)
|
|
126
164
|
|
|
127
165
|
# 6 ─ Write CSV
|
|
128
166
|
out_csv = out_dir / f"{stem}_phased.csv"
|
|
@@ -138,32 +176,54 @@ def phase_vcf(
|
|
|
138
176
|
# ──────────────────────────────────────────────────────────────────────
|
|
139
177
|
# Small helpers to keep complexity down
|
|
140
178
|
# ──────────────────────────────────────────────────────────────────────
|
|
179
|
+
|
|
180
|
+
|
|
141
181
|
def _vcf_info_lookup(
|
|
142
182
|
in_vcf: Path,
|
|
143
|
-
) -> tuple[dict[
|
|
144
|
-
"""Scan input VCF once
|
|
183
|
+
) -> tuple[dict[SVKey, VcfRec], dict[SVKeyLegacy, list[SVKey]], list[str], str]:
|
|
184
|
+
"""Scan input VCF once.
|
|
185
|
+
|
|
186
|
+
Returns:
|
|
187
|
+
- full_lookup: maps (CHROM, POS, ID, END, ALT) -> record components
|
|
188
|
+
- legacy_index: maps (CHROM, POS, ID) -> list of full keys (fallback)
|
|
189
|
+
- raw_header_lines
|
|
190
|
+
- sample_name
|
|
191
|
+
"""
|
|
145
192
|
rdr = Reader(str(in_vcf))
|
|
146
193
|
raw_header_lines = rdr.raw_header.strip().splitlines()
|
|
147
194
|
sample_name = rdr.samples[0] if rdr.samples else "SAMPLE"
|
|
148
195
|
|
|
149
|
-
|
|
196
|
+
full_lookup: dict[SVKey, VcfRec] = {}
|
|
197
|
+
legacy_index: dict[SVKeyLegacy, list[SVKey]] = {}
|
|
198
|
+
|
|
150
199
|
for rec in rdr:
|
|
151
|
-
|
|
152
|
-
|
|
200
|
+
chrom = rec.CHROM
|
|
201
|
+
pos = int(rec.POS)
|
|
202
|
+
vid = rec.ID or "."
|
|
203
|
+
end = int(rec.end) if getattr(rec, "end", None) is not None else int(pos)
|
|
204
|
+
alt = ",".join(rec.ALT) if rec.ALT else "<N>"
|
|
205
|
+
|
|
206
|
+
info_dict: dict[str, Any] = {}
|
|
153
207
|
for k in rec.INFO:
|
|
154
208
|
info_key = k[0] if isinstance(k, tuple) else k
|
|
155
209
|
v = rec.INFO.get(info_key)
|
|
156
210
|
if v is not None:
|
|
157
211
|
info_dict[info_key] = v
|
|
158
|
-
|
|
212
|
+
|
|
213
|
+
fkey: SVKey = (chrom, pos, vid, end, alt)
|
|
214
|
+
lkey: SVKeyLegacy = (chrom, pos, vid)
|
|
215
|
+
|
|
216
|
+
full_lookup[fkey] = {
|
|
159
217
|
"REF": rec.REF,
|
|
160
|
-
"ALT":
|
|
218
|
+
"ALT": alt,
|
|
161
219
|
"QUAL": rec.QUAL if rec.QUAL is not None else ".",
|
|
162
220
|
"FILTER": rec.FILTER if rec.FILTER else "PASS",
|
|
163
221
|
"INFO": info_dict,
|
|
164
222
|
}
|
|
223
|
+
legacy_index.setdefault(lkey, []).append(fkey)
|
|
224
|
+
|
|
165
225
|
rdr.close()
|
|
166
|
-
return
|
|
226
|
+
return full_lookup, legacy_index, raw_header_lines, sample_name
|
|
167
227
|
|
|
168
228
|
|
|
169
229
|
def _write_headers(
|
|
@@ -196,25 +256,62 @@ def _write_headers(
|
|
|
196
256
|
out.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample_name + "\n")
|
|
197
257
|
|
|
198
258
|
|
|
199
|
-
def _compose_info_str(orig_info: dict[str,
|
|
200
|
-
"""Compose
|
|
259
|
+
def _compose_info_str(orig_info: dict[str, Any], svtype: Any, gq_label: Any) -> str:
|
|
260
|
+
"""Compose INFO with SVTYPE first, proper FLAG handling, then GQBIN."""
|
|
201
261
|
items: list[str] = []
|
|
262
|
+
|
|
263
|
+
if svtype:
|
|
264
|
+
items.append(f"SVTYPE={svtype}")
|
|
265
|
+
|
|
202
266
|
for k, v in orig_info.items():
|
|
203
267
|
if k == "SVTYPE":
|
|
204
268
|
continue
|
|
205
|
-
|
|
206
|
-
|
|
269
|
+
if v is None:
|
|
270
|
+
continue
|
|
271
|
+
# cyvcf2 represents INFO flags as boolean True
|
|
207
272
|
if v is True:
|
|
208
|
-
items.append(k)
|
|
273
|
+
items.append(str(k))
|
|
209
274
|
else:
|
|
210
275
|
items.append(f"{k}={v}")
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
if gq_label is not None and pd.notnull(gq_label):
|
|
276
|
+
|
|
277
|
+
if not _is_missing_scalar(gq_label):
|
|
214
278
|
items.append(f"GQBIN={gq_label}")
|
|
279
|
+
|
|
215
280
|
return ";".join(items) if items else "."
|
|
216
281
|
|
|
217
282
|
|
|
283
|
+
def _select_info_record(
|
|
284
|
+
full_lookup: dict[SVKey, VcfRec],
|
|
285
|
+
legacy_index: dict[SVKeyLegacy, list[SVKey]],
|
|
286
|
+
*,
|
|
287
|
+
chrom: str,
|
|
288
|
+
pos: int,
|
|
289
|
+
vid: str,
|
|
290
|
+
end: int | None,
|
|
291
|
+
alt: str | None,
|
|
292
|
+
) -> VcfRec | None:
|
|
293
|
+
"""Pick the best matching input VCF record for this phased row."""
|
|
294
|
+
if end is not None and alt is not None:
|
|
295
|
+
hit = full_lookup.get((chrom, pos, vid, int(end), str(alt)))
|
|
296
|
+
if hit is not None:
|
|
297
|
+
return hit
|
|
298
|
+
|
|
299
|
+
candidates = legacy_index.get((chrom, pos, vid), [])
|
|
300
|
+
if not candidates:
|
|
301
|
+
return None
|
|
302
|
+
|
|
303
|
+
if len(candidates) == 1:
|
|
304
|
+
return full_lookup[candidates[0]]
|
|
305
|
+
|
|
306
|
+
if end is not None:
|
|
307
|
+
end_matches = [k for k in candidates if k[3] == int(end)]
|
|
308
|
+
if len(end_matches) == 1:
|
|
309
|
+
return full_lookup[end_matches[0]]
|
|
310
|
+
|
|
311
|
+
# Still ambiguous: refuse to guess
|
|
312
|
+
return None
|
|
313
|
+
|
|
314
|
+
|
|
218
315
|
def _write_phased_vcf(
|
|
219
316
|
out_vcf: Path,
|
|
220
317
|
in_vcf: Path,
|
|
@@ -223,7 +320,7 @@ def _write_phased_vcf(
|
|
|
223
320
|
gqbin_in_header: bool,
|
|
224
321
|
) -> None:
|
|
225
322
|
"""Write a phased VCF: tab-delimited, compliant, with ensured GT/GQ (and GQBIN if used)."""
|
|
226
|
-
|
|
323
|
+
full_lookup, legacy_index, raw_header_lines, sample_name = _vcf_info_lookup(in_vcf)
|
|
227
324
|
|
|
228
325
|
with open(out_vcf, "w", newline="") as out:
|
|
229
326
|
_write_headers(out, raw_header_lines, sample_name, gqbin_in_header=gqbin_in_header)
|
|
@@ -232,14 +329,26 @@ def _write_phased_vcf(
|
|
|
232
329
|
chrom = str(getattr(row, "chrom", "."))
|
|
233
330
|
pos = int(getattr(row, "pos", 0))
|
|
234
331
|
vid = str(getattr(row, "id", "."))
|
|
332
|
+
|
|
333
|
+
end = getattr(row, "end", None)
|
|
334
|
+
alt = getattr(row, "alt", None)
|
|
335
|
+
|
|
235
336
|
gt = str(getattr(row, "gt", "./."))
|
|
236
337
|
gq = str(getattr(row, "gq", "0"))
|
|
237
338
|
svtype = getattr(row, "svtype", None)
|
|
238
339
|
gq_label = getattr(row, "gq_label", None)
|
|
239
340
|
|
|
240
|
-
info =
|
|
341
|
+
info = _select_info_record(
|
|
342
|
+
full_lookup,
|
|
343
|
+
legacy_index,
|
|
344
|
+
chrom=chrom,
|
|
345
|
+
pos=pos,
|
|
346
|
+
vid=vid,
|
|
347
|
+
end=int(end) if end is not None else None,
|
|
348
|
+
alt=str(alt) if alt is not None else None,
|
|
349
|
+
)
|
|
241
350
|
if info is None:
|
|
242
|
-
logger.warning("Could not
|
|
351
|
+
logger.warning("Could not uniquely match VCF record for %s:%s %s", chrom, pos, vid)
|
|
243
352
|
continue
|
|
244
353
|
|
|
245
354
|
info_str = _compose_info_str(info["INFO"], svtype, gq_label)
|
svphaser/phasing/types.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
"""svphaser.phasing.types
|
|
2
2
|
========================
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
Common type aliases & small data structures.
|
|
4
|
+
|
|
5
|
+
We keep this module light to avoid circular imports.
|
|
5
6
|
"""
|
|
6
7
|
|
|
7
8
|
from __future__ import annotations
|
|
@@ -9,8 +10,14 @@ from __future__ import annotations
|
|
|
9
10
|
from dataclasses import dataclass
|
|
10
11
|
from typing import NamedTuple
|
|
11
12
|
|
|
12
|
-
|
|
13
|
-
|
|
13
|
+
# Legacy key (older writer used this; can collide when ID='.' or same POS repeats)
|
|
14
|
+
SVKeyLegacy = tuple[str, int, str] # (CHROM, POS, ID)
|
|
15
|
+
|
|
16
|
+
# Collision-resistant key for matching phased rows back to original VCF records
|
|
17
|
+
SVKey = tuple[str, int, str, int, str] # (CHROM, POS, ID, END, ALT)
|
|
18
|
+
|
|
19
|
+
# GQ bin spec: (threshold, label)
|
|
20
|
+
GQBin = tuple[int, str] # e.g. (30, "High")
|
|
14
21
|
|
|
15
22
|
|
|
16
23
|
@dataclass(slots=True, frozen=True)
|
|
@@ -20,7 +27,7 @@ class WorkerOpts:
|
|
|
20
27
|
min_support: int
|
|
21
28
|
major_delta: float
|
|
22
29
|
equal_delta: float
|
|
23
|
-
gq_bins: list[GQBin]
|
|
30
|
+
gq_bins: list[GQBin]
|
|
24
31
|
|
|
25
32
|
|
|
26
33
|
class CallTuple(NamedTuple):
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: svphaser
|
|
3
|
+
Version: 2.1.2
|
|
4
|
+
Summary: Structural-variant phasing from HP-tagged long-read BAMs
|
|
5
|
+
Project-URL: Homepage, https://github.com/SFGLab/SvPhaser
|
|
6
|
+
Project-URL: Issues, https://github.com/SFGLab/SvPhaser/issues
|
|
7
|
+
Project-URL: Source, https://github.com/SFGLab/SvPhaser
|
|
8
|
+
Author-email: SvPhaser Team <you@lab.org>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: BAM,ONT,VCF,genomics,long-reads,phasing,structural-variants
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Requires-Dist: cyvcf2>=0.30
|
|
25
|
+
Requires-Dist: pandas>=2.1
|
|
26
|
+
Requires-Dist: pysam>=0.23
|
|
27
|
+
Requires-Dist: typer>=0.14
|
|
28
|
+
Provides-Extra: bench
|
|
29
|
+
Requires-Dist: py-spy>=0.3; extra == 'bench'
|
|
30
|
+
Requires-Dist: pytest-benchmark>=4.0; extra == 'bench'
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: black>=24.3; extra == 'dev'
|
|
33
|
+
Requires-Dist: build>=1.2; extra == 'dev'
|
|
34
|
+
Requires-Dist: hypothesis>=6.90; extra == 'dev'
|
|
35
|
+
Requires-Dist: mypy>=1.8; extra == 'dev'
|
|
36
|
+
Requires-Dist: pandas-stubs>=2.0; extra == 'dev'
|
|
37
|
+
Requires-Dist: pre-commit>=3.6; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest-cov>=5; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest-xdist>=3.5; extra == 'dev'
|
|
40
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
41
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
42
|
+
Requires-Dist: tox>=4.10; extra == 'dev'
|
|
43
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
44
|
+
Provides-Extra: plots
|
|
45
|
+
Requires-Dist: matplotlib>=3.7; extra == 'plots'
|
|
46
|
+
Description-Content-Type: text/markdown
|
|
47
|
+
|
|
48
|
+
# SvPhaser
|
|
49
|
+
|
|
50
|
+
> **Haplotype-aware structural-variant (SV) genotyper for long-read data**
|
|
51
|
+
|
|
52
|
+
[PyPI package](https://pypi.org/project/svphaser/)
|
|
53
|
+
[Supported Python versions](https://pypi.org/project/svphaser/)
|
|
54
|
+
[MIT License](LICENSE)
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
**SvPhaser** phases **pre-called structural variants (SVs)** using **HP-tagged** long-read alignments (PacBio HiFi, ONT Q20+, …).
|
|
59
|
+
|
|
60
|
+
Think of it as *WhatsHap* for insertions/deletions/duplications:
|
|
61
|
+
- **we do not discover SVs**
|
|
62
|
+
- **we assign haplotype genotypes** (`0|1`, `1|0`, `1|1`, or `./.`)
|
|
63
|
+
- and compute a **Genotype Quality (GQ)** score
|
|
64
|
+
|
|
65
|
+
All in a single, embarrassingly-parallel pass over the genome.
|
|
66
|
+
|
|
67
|
+
## Highlights
|
|
68
|
+
|
|
69
|
+
- **Fast per-chromosome multiprocessing** (scale-out on multi-core CPUs).
|
|
70
|
+
- **Deterministic Δ-based decision logic** (no MCMC / HMM).
|
|
71
|
+
- **CLI + Python API**.
|
|
72
|
+
- **Non-destructive VCF augmentation**: injects phasing fields while preserving the original header and records.
|
|
73
|
+
- **Configurable confidence bins** + optional plots.
|
|
74
|
+
|
|
75
|
+
## Installation
|
|
76
|
+
|
|
77
|
+
### From PyPI (recommended)
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Requires Python >= 3.9
|
|
81
|
+
pip install svphaser
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Optional extras (if you use them):
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
pip install "svphaser[plots]"
|
|
88
|
+
pip install "svphaser[bench]"
|
|
89
|
+
pip install "svphaser[dev]"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### From source
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
git clone https://github.com/SFGLab/SvPhaser.git
|
|
96
|
+
cd SvPhaser
|
|
97
|
+
pip install -e .
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Inputs & requirements
|
|
101
|
+
|
|
102
|
+
SvPhaser expects:
|
|
103
|
+
|
|
104
|
+
1. **Unphased SV VCF** (`.vcf` / `.vcf.gz`)
|
|
105
|
+
|
|
106
|
+
* SVs should already be called by your preferred SV caller.
|
|
107
|
+
|
|
108
|
+
2. **HP-tagged BAM** (long-read alignments)
|
|
109
|
+
|
|
110
|
+
* Reads must contain haplotype tags (e.g., `HP`) produced by an upstream phasing pipeline.
|
|
111
|
+
|
|
112
|
+
If your BAM is not HP-tagged, SvPhaser cannot assign haplotypes.
|
|
113
|
+
|
|
114
|
+
## Quick start (CLI)
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
svphaser phase \
|
|
118
|
+
sample_unphased.vcf.gz \
|
|
119
|
+
sample.sorted_phased.bam \
|
|
120
|
+
--out-dir results/ \
|
|
121
|
+
--min-support 10 \
|
|
122
|
+
--major-delta 0.70 \
|
|
123
|
+
--equal-delta 0.25 \
|
|
124
|
+
--gq-bins "30:High,10:Moderate" \
|
|
125
|
+
--threads 32
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Outputs
|
|
129
|
+
|
|
130
|
+
Inside `results/`:
|
|
131
|
+
|
|
132
|
+
* `*_phased.vcf` — your original VCF with additional INFO fields:
|
|
133
|
+
|
|
134
|
+
* `HP_GT` — phased genotype
|
|
135
|
+
* `HP_GQ` — genotype quality score
|
|
136
|
+
* `HP_GQBIN` — confidence bin label (based on your `--gq-bins`)
|
|
137
|
+
* `*_phased.csv` — tidy table for plotting / downstream analysis
|
|
138
|
+
|
|
139
|
+
For algorithmic details, see: **`docs/methodology.md`**.
|
|
140
|
+
|
|
141
|
+
## Python API
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
from pathlib import Path
|
|
145
|
+
from svphaser.phasing.io import phase_vcf
|
|
146
|
+
|
|
147
|
+
phase_vcf(
|
|
148
|
+
Path("sample.vcf.gz"),
|
|
149
|
+
Path("sample.bam"),
|
|
150
|
+
out_dir=Path("results"),
|
|
151
|
+
min_support=10,
|
|
152
|
+
major_delta=0.70,
|
|
153
|
+
equal_delta=0.25,
|
|
154
|
+
gq_bins="30:High,10:Moderate",
|
|
155
|
+
threads=8,
|
|
156
|
+
)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
The phased table can also be loaded from the generated CSV for custom analytics.
|
|
160
|
+
|
|
161
|
+
## Repository structure (high level)
|
|
162
|
+
|
|
163
|
+
```
|
|
164
|
+
SvPhaser/
|
|
165
|
+
├─ src/svphaser/ # importable package
|
|
166
|
+
├─ tests/ # test suite + small fixtures (if present)
|
|
167
|
+
├─ docs/ # methodology + notes
|
|
168
|
+
├─ notebooks/ # experiments / analysis (if present)
|
|
169
|
+
├─ figures/ # plots & diagrams (if present)
|
|
170
|
+
├─ pyproject.toml
|
|
171
|
+
└─ CHANGELOG.md
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## Development
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
git clone https://github.com/SFGLab/SvPhaser.git
|
|
178
|
+
cd SvPhaser
|
|
179
|
+
|
|
180
|
+
python -m venv .venv
|
|
181
|
+
source .venv/bin/activate
|
|
182
|
+
|
|
183
|
+
pip install -e ".[dev]"
|
|
184
|
+
pytest -q
|
|
185
|
+
mypy src/svphaser
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
See `CONTRIBUTING.md` for contribution guidelines.
|
|
189
|
+
|
|
190
|
+
## Citing SvPhaser
|
|
191
|
+
|
|
192
|
+
If SvPhaser contributed to your research, please cite:
|
|
193
|
+
|
|
194
|
+
```bibtex
|
|
195
|
+
@software{svphaser2025,
|
|
196
|
+
author = {Pranjul Mishra and Sachin Gadakh},
|
|
197
|
+
title = {SvPhaser: Haplotype-aware structural-variant genotyping from HP-tagged long-read BAMs},
|
|
198
|
+
version = {2.0.6},
|
|
199
|
+
year = {2025},
|
|
200
|
+
month = nov,
|
|
201
|
+
url = {https://github.com/SFGLab/SvPhaser},
|
|
202
|
+
note = {PyPI: https://pypi.org/project/svphaser/}
|
|
203
|
+
}
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
(If you need maximum rigor for a paper, cite a specific git commit hash too.)
|
|
207
|
+
|
|
208
|
+
## License
|
|
209
|
+
|
|
210
|
+
SvPhaser is released under the **MIT License** — see [LICENSE](LICENSE).
|
|
211
|
+
|
|
212
|
+
## Contact
|
|
213
|
+
|
|
214
|
+
Developed by **Team 5 (BioAI Hackathon)**.
|
|
215
|
+
|
|
216
|
+
* Pranjul Mishra — [pranjul.mishra@proton.me](mailto:pranjul.mishra@proton.me)
|
|
217
|
+
* Sachin Gadakh — [s.gadakh@cent.uw.edu.pl](mailto:s.gadakh@cent.uw.edu.pl)
|
|
218
|
+
|
|
219
|
+
Issues and feature requests: please open a GitHub issue.
|
|
220
|
+
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
### Two hard notes (don’t ignore)
|
|
224
|
+
- If you **don’t actually have CI set up**, don’t show a CI badge. A fake badge is worse than no badge.
|
|
225
|
+
- If your repo layout doesn’t include `notebooks/figures/tests fixtures`, either adjust that tree block or remove it to avoid “template smell.”
|
|
226
|
+
|
|
227
|
+
If you want, paste your **current `.github/workflows` filenames** (or tell me if you have none) and I’ll add the *correct* CI badge line too—without guessing.
|
|
228
|
+
::contentReference[oaicite:1]{index=1}
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
[1]: https://pypi.org/project/svphaser/ "svphaser · PyPI"
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
svphaser/__init__.py,sha256=h4eLAi05OsAqs9Evii526TVr2CUo3dW-iHO-RjXtv_E,2599
|
|
2
|
+
svphaser/__main__.py,sha256=lWe9boXc4JIpJEknv9dXqSsjS1Glk_FnYV9eBzUEzA0,86
|
|
3
|
+
svphaser/_version.py,sha256=1HfU5X4wnJmiuzlejKwSquKM3pc6v_HUoAD1U98eCOw,704
|
|
4
|
+
svphaser/cli.py,sha256=w8LwEYBWwjJWuLA1AE-XuyMVCzgjS2DjBwCBJHjq0EA,4509
|
|
5
|
+
svphaser/logging.py,sha256=yw7Z8az-sZL-x4qhvPmu7aGVLbhSOYiSdrNqCO8bVtw,841
|
|
6
|
+
svphaser/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
svphaser/phasing/__init__.py,sha256=RcXg2EGc7Dyuq8wDP-bJmtVNKs8f3YqByjTel0mMujM,464
|
|
8
|
+
svphaser/phasing/_workers.py,sha256=8RGBuT5ZMxGpnFu18KqJKJmic7gFRAAGI_HtcClle84,12375
|
|
9
|
+
svphaser/phasing/algorithms.py,sha256=Z7ARpvIXXRtu87qxnumNQBy9haWkO_GGJhTPKYG1TDg,2374
|
|
10
|
+
svphaser/phasing/io.py,sha256=wEV8ZsNpCBzwq-bLM0J1alVTARtYWGH7Qhe5PVXkohg,11676
|
|
11
|
+
svphaser/phasing/types.py,sha256=fux_thtaqT9U8DxVreyyz25g12URdm6fUDej2u3O0J0,981
|
|
12
|
+
svphaser-2.1.2.dist-info/METADATA,sha256=6c7h905ltEou3UIt3DxgXFUR6rsFEkT0NOe3Jp1jDoU,6847
|
|
13
|
+
svphaser-2.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
14
|
+
svphaser-2.1.2.dist-info/entry_points.txt,sha256=YFeRJLvcvTc4dazgyVOg3HJ4fNrvBrVDaPX6ULapqeU,46
|
|
15
|
+
svphaser-2.1.2.dist-info/licenses/LICENSE,sha256=qsY5iOcewwIwvhQbj7naSP6tpJAc05Mv0DfhrouPoBU,1102
|
|
16
|
+
svphaser-2.1.2.dist-info/RECORD,,
|