svphaser 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ # Ignore all files in output directories
2
+ output_*/
3
+ output/
4
+ __pycache__/
5
+ # Ignore all CSV and VCF files anywhere (optional, only if you want)
6
+ *.csv
7
+ *.vcf
8
+ # Or, ignore only those in the output directories:
9
+ output_*/**/*.csv
10
+ output_*/**/*.vcf
11
+ output/**/*.csv
12
+ output/**/*.vcf
svphaser-2.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Structural and Functional Genomics Laboratory
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,203 @@
1
+ Metadata-Version: 2.4
2
+ Name: svphaser
3
+ Version: 2.0.1
4
+ Summary: Structural-variant phasing from HP-tagged long-read BAMs
5
+ Project-URL: Homepage, https://github.com/your-org/svphaser
6
+ Project-URL: Issues, https://github.com/your-org/svphaser/issues
7
+ Project-URL: Source, https://github.com/your-org/svphaser
8
+ Author-email: SvPhaser Team <you@lab.org>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: BAM,ONT,VCF,genomics,long-reads,phasing,structural-variants
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
23
+ Requires-Python: >=3.9
24
+ Requires-Dist: cyvcf2>=0.30
25
+ Requires-Dist: pandas>=2.1
26
+ Requires-Dist: pysam>=0.23
27
+ Requires-Dist: typer>=0.14
28
+ Provides-Extra: bench
29
+ Requires-Dist: py-spy>=0.3; extra == 'bench'
30
+ Requires-Dist: pytest-benchmark>=4.0; extra == 'bench'
31
+ Provides-Extra: dev
32
+ Requires-Dist: black>=24.3; extra == 'dev'
33
+ Requires-Dist: build>=1.2; extra == 'dev'
34
+ Requires-Dist: hypothesis>=6.90; extra == 'dev'
35
+ Requires-Dist: mypy>=1.8; extra == 'dev'
36
+ Requires-Dist: pandas-stubs>=2.0; extra == 'dev'
37
+ Requires-Dist: pre-commit>=3.6; extra == 'dev'
38
+ Requires-Dist: pytest-cov>=5; extra == 'dev'
39
+ Requires-Dist: pytest-xdist>=3.5; extra == 'dev'
40
+ Requires-Dist: pytest>=8; extra == 'dev'
41
+ Requires-Dist: ruff>=0.5; extra == 'dev'
42
+ Requires-Dist: tox>=4.10; extra == 'dev'
43
+ Requires-Dist: twine>=5.0; extra == 'dev'
44
+ Provides-Extra: plots
45
+ Requires-Dist: matplotlib>=3.7; extra == 'plots'
46
+ Description-Content-Type: text/markdown
47
+
48
+ # SvPhaser
49
+
50
+ > **Haplotype‑aware structural‑variant genotyper for long‑read data**
51
+
52
+ [![PyPI version](https://img.shields.io/pypi/v/svphaser.svg?logo=pypi)](https://pypi.org/project/svphaser)
53
+ [![Tests](https://img.shields.io/github/actions/workflow/status/your‑org/SvPhaser/ci.yml?label=ci)](https://github.com/your‑org/SvPhaser/actions)
54
+ [![License](https://img.shields.io/github/license/your‑org/SvPhaser.svg)](LICENSE)
55
+
56
+ ---
57
+
58
+ `SvPhaser` phases **pre‑called structural variants (SVs)** using *HP‑tagged* long‑read alignments (PacBio HiFi, ONT Q20+, …). Think of it as *WhatsHap* for insertions/deletions/duplications: we do **not** discover SVs; we assign each variant a haplotype genotype (`0|1`, `1|0`, `1|1`, or `./.`) together with a **Genotype Quality (GQ)** score – all in a single, embarrassingly‑parallel pass over the genome.
59
+
60
+ ## Key highlights
61
+
62
+ * **Fast, per‑chromosome multiprocessing** – linear scale‑out on 32‑core workstations.
63
+ * **Deterministic Δ‑based decision tree** – no MCMC or hidden state machines.
64
+ * **Friendly CLI** (`svphaser phase …`) and importable Python API.
65
+ * **Seamless VCF injection** – adds `HP_GT`, `HP_GQ`, `HP_GQBIN` INFO tags while copying the original header verbatim.
66
+ * **Configurable confidence bins** and publication‑ready plots (see `result_images/`).
67
+
68
+ ---
69
+
70
+ ## Installation
71
+
72
+ ```bash
73
+ # Requires Python ≥3.9
74
+ pip install svphaser # from PyPI
75
+ # or
76
+ pip install git+https://github.com/your-org/SvPhaser.git@v2.0.1
77
+ ```
78
+
79
+ `cyvcf2`, `pysam`, `typer`, and `pandas` are pulled in automatically.
80
+
81
+ ## Quick‑start
82
+
83
+ ```bash
84
+ svphaser phase \
85
+ sample_unphased.vcf.gz \
86
+ sample.sorted_phased.bam \
87
+ --out-dir results/ \
88
+ --min-support 10 \
89
+ --major-delta 0.70 \
90
+ --equal-delta 0.25 \
91
+ --gq-bins "30:High,10:Moderate" \
92
+ --threads 32
93
+ ```
94
+
95
+ Outputs (written inside **`results/`**)
96
+
97
+ ```
98
+ sample_unphased_phased.vcf # original VCF + HP_* INFO fields
99
+ sample_unphased_phased.csv # tidy table for plotting / downstream R
100
+ ```
101
+
102
+ See [`docs/Methodology.md`](docs/Methodology.md) and the flow‑chart below for algorithmic details.
103
+
104
+ ![SvPhaser methodology](docs/result_images/methodology_diagram.png)
105
+
106
+ ## Folder layout
107
+
108
+ ```
109
+ SvPhaser/
110
+ ├─ src/svphaser/ # importable package
111
+ │ ├─ cli.py # Typer entry‑point
112
+ │ ├─ logging.py # unified log setup
113
+ │ └─ phasing/
114
+ │ ├─ algorithms.py # core maths
115
+ │ ├─ io.py # driver & I/O
116
+ │ ├─ _workers.py # per‑chrom processes
117
+ │ └─ types.py # thin dataclasses
118
+ ├─ tests/ # pytest suite + mini data
119
+ ├─ docs/ # extra documentation
120
+ ├─ result_images/ # generated plots & diagrams
121
+ └─ CHANGELOG.md
122
+ ```
123
+
124
+ ## Python usage
125
+
126
+ ```python
127
+ from pathlib import Path
128
+ from svphaser.phasing.io import phase_vcf
129
+
130
+ phase_vcf(
131
+ Path("sample.vcf.gz"),
132
+ Path("sample.bam"),
133
+ out_dir=Path("results"),
134
+ min_support=10,
135
+ major_delta=0.70,
136
+ equal_delta=0.25,
137
+ gq_bins="30:High,10:Moderate",
138
+ threads=8,
139
+ )
140
+ ```
141
+
142
+ The resulting `DataFrame` can be loaded from the CSV for custom analytics.
143
+
144
+
145
+
146
+
147
+ ## Development & contributing
148
+
149
+ 1. Clone and create a virtual env:
150
+
151
+ ```bash
152
+ git clone https://github.com/your-org/SvPhaser.git && cd SvPhaser
153
+ python -m venv .venv && source .venv/bin/activate
154
+ pip install -e .[dev]
155
+ ```
156
+ 2. Run the test‑suite & type checks:
157
+
158
+ ```bash
159
+ pytest -q
160
+ mypy src/svphaser
161
+ black --check src tests
162
+ ```
163
+ 3. Send a PR targeting the **`dev`** branch; one topic per PR.
164
+
165
+ Please read `CONTRIBUTING.md` (to come) for style‑guides and the DCO sign‑off.
166
+
167
+ ## Citing SvPhaser
168
+
169
+ If SvPhaser contributed to your research, please cite:
170
+
171
+ ```bibtex
172
+ @software{svphaser2024,
173
+ author = {Pranjul Mishra and Sachin Gadakh and CeNT Lab},
174
+ title = {SvPhaser: haplotype‑aware SV genotyping},
175
+ version = {0.2.0},
176
+ date = {2024-06-18},
177
+ url = {https://github.com/your‑org/SvPhaser}
178
+ }
179
+ ```
180
+
181
+
182
+
183
+
184
+ ## License
185
+ `SvPhaser` is released under the MIT License – see [`LICENSE`](LICENSE).
186
+
187
+
188
+
189
+
190
+
191
+ ## 📬 Contact
192
+
193
+ Developed by **Team5** (*BioAI Hackathon*) – Sachin Gadakh & Pranjul Mishra.
194
+
195
+ Lead contacts:
196
+ • [pranjul.mishra@proton.me](mailto:pranjul.mishra@proton.me)
197
+ • [s.gadakh@cent.uw.edu.pl](mailto:s.gadakh@cent.uw.edu.pl)
198
+
199
+ Feedback, feature requests and bug reports are all appreciated — feel free to open a GitHub issue or reach out by e‑mail.
200
+
201
+ ---
202
+
203
+ *Happy phasing!*
@@ -0,0 +1,156 @@
1
+ # SvPhaser
2
+
3
+ > **Haplotype‑aware structural‑variant genotyper for long‑read data**
4
+
5
+ [![PyPI version](https://img.shields.io/pypi/v/svphaser.svg?logo=pypi)](https://pypi.org/project/svphaser)
6
+ [![Tests](https://img.shields.io/github/actions/workflow/status/your‑org/SvPhaser/ci.yml?label=ci)](https://github.com/your‑org/SvPhaser/actions)
7
+ [![License](https://img.shields.io/github/license/your‑org/SvPhaser.svg)](LICENSE)
8
+
9
+ ---
10
+
11
+ `SvPhaser` phases **pre‑called structural variants (SVs)** using *HP‑tagged* long‑read alignments (PacBio HiFi, ONT Q20+, …). Think of it as *WhatsHap* for insertions/deletions/duplications: we do **not** discover SVs; we assign each variant a haplotype genotype (`0|1`, `1|0`, `1|1`, or `./.`) together with a **Genotype Quality (GQ)** score – all in a single, embarrassingly‑parallel pass over the genome.
12
+
13
+ ## Key highlights
14
+
15
+ * **Fast, per‑chromosome multiprocessing** – linear scale‑out on 32‑core workstations.
16
+ * **Deterministic Δ‑based decision tree** – no MCMC or hidden state machines.
17
+ * **Friendly CLI** (`svphaser phase …`) and importable Python API.
18
+ * **Seamless VCF injection** – adds `HP_GT`, `HP_GQ`, `HP_GQBIN` INFO tags while copying the original header verbatim.
19
+ * **Configurable confidence bins** and publication‑ready plots (see `result_images/`).
20
+
21
+ ---
22
+
23
+ ## Installation
24
+
25
+ ```bash
26
+ # Requires Python ≥3.9
27
+ pip install svphaser # from PyPI
28
+ # or
29
+ pip install git+https://github.com/your-org/SvPhaser.git@v2.0.1
30
+ ```
31
+
32
+ `cyvcf2`, `pysam`, `typer`, and `pandas` are pulled in automatically.
33
+
34
+ ## Quick‑start
35
+
36
+ ```bash
37
+ svphaser phase \
38
+ sample_unphased.vcf.gz \
39
+ sample.sorted_phased.bam \
40
+ --out-dir results/ \
41
+ --min-support 10 \
42
+ --major-delta 0.70 \
43
+ --equal-delta 0.25 \
44
+ --gq-bins "30:High,10:Moderate" \
45
+ --threads 32
46
+ ```
47
+
48
+ Outputs (written inside **`results/`**)
49
+
50
+ ```
51
+ sample_unphased_phased.vcf # original VCF + HP_* INFO fields
52
+ sample_unphased_phased.csv # tidy table for plotting / downstream R
53
+ ```
54
+
55
+ See [`docs/Methodology.md`](docs/Methodology.md) and the flow‑chart below for algorithmic details.
56
+
57
+ ![SvPhaser methodology](docs/result_images/methodology_diagram.png)
58
+
59
+ ## Folder layout
60
+
61
+ ```
62
+ SvPhaser/
63
+ ├─ src/svphaser/ # importable package
64
+ │ ├─ cli.py # Typer entry‑point
65
+ │ ├─ logging.py # unified log setup
66
+ │ └─ phasing/
67
+ │ ├─ algorithms.py # core maths
68
+ │ ├─ io.py # driver & I/O
69
+ │ ├─ _workers.py # per‑chrom processes
70
+ │ └─ types.py # thin dataclasses
71
+ ├─ tests/ # pytest suite + mini data
72
+ ├─ docs/ # extra documentation
73
+ ├─ result_images/ # generated plots & diagrams
74
+ └─ CHANGELOG.md
75
+ ```
76
+
77
+ ## Python usage
78
+
79
+ ```python
80
+ from pathlib import Path
81
+ from svphaser.phasing.io import phase_vcf
82
+
83
+ phase_vcf(
84
+ Path("sample.vcf.gz"),
85
+ Path("sample.bam"),
86
+ out_dir=Path("results"),
87
+ min_support=10,
88
+ major_delta=0.70,
89
+ equal_delta=0.25,
90
+ gq_bins="30:High,10:Moderate",
91
+ threads=8,
92
+ )
93
+ ```
94
+
95
+ The resulting `DataFrame` can be loaded from the CSV for custom analytics.
96
+
97
+
98
+
99
+
100
+ ## Development & contributing
101
+
102
+ 1. Clone and create a virtual env:
103
+
104
+ ```bash
105
+ git clone https://github.com/your-org/SvPhaser.git && cd SvPhaser
106
+ python -m venv .venv && source .venv/bin/activate
107
+ pip install -e .[dev]
108
+ ```
109
+ 2. Run the test‑suite & type checks:
110
+
111
+ ```bash
112
+ pytest -q
113
+ mypy src/svphaser
114
+ black --check src tests
115
+ ```
116
+ 3. Send a PR targeting the **`dev`** branch; one topic per PR.
117
+
118
+ Please read `CONTRIBUTING.md` (to come) for style‑guides and the DCO sign‑off.
119
+
120
+ ## Citing SvPhaser
121
+
122
+ If SvPhaser contributed to your research, please cite:
123
+
124
+ ```bibtex
125
+ @software{svphaser2024,
126
+ author = {Pranjul Mishra and Sachin Gadakh and CeNT Lab},
127
+ title = {SvPhaser: haplotype‑aware SV genotyping},
128
+ version = {0.2.0},
129
+ date = {2024-06-18},
130
+ url = {https://github.com/your‑org/SvPhaser}
131
+ }
132
+ ```
133
+
134
+
135
+
136
+
137
+ ## License
138
+ `SvPhaser` is released under the MIT License – see [`LICENSE`](LICENSE).
139
+
140
+
141
+
142
+
143
+
144
+ ## 📬 Contact
145
+
146
+ Developed by **Team5** (*BioAI Hackathon*) – Sachin Gadakh & Pranjul Mishra.
147
+
148
+ Lead contacts:
149
+ • [pranjul.mishra@proton.me](mailto:pranjul.mishra@proton.me)
150
+ • [s.gadakh@cent.uw.edu.pl](mailto:s.gadakh@cent.uw.edu.pl)
151
+
152
+ Feedback, feature requests and bug reports are all appreciated — feel free to open a GitHub issue or reach out by e‑mail.
153
+
154
+ ---
155
+
156
+ *Happy phasing!*
@@ -0,0 +1,127 @@
1
+ # -------------------------------------------------------------------
2
+ # SvPhaser • project metadata (PEP-621) with hatch-vcs versioning
3
+ # -------------------------------------------------------------------
4
+ [build-system]
5
+ requires = ["hatchling>=1.24", "hatch-vcs>=0.4"]
6
+ build-backend = "hatchling.build"
7
+
8
+ [project]
9
+ name = "svphaser"
10
+ dynamic = ["version"]
11
+ description = "Structural-variant phasing from HP-tagged long-read BAMs"
12
+ readme = "README.md"
13
+ requires-python = ">=3.9"
14
+ license = { text = "MIT" }
15
+ authors = [{ name = "SvPhaser Team", email = "you@lab.org" }]
16
+ keywords = ["genomics", "structural-variants", "phasing", "long-reads", "ONT", "BAM", "VCF"]
17
+ classifiers = [
18
+ "Development Status :: 4 - Beta",
19
+ "Intended Audience :: Science/Research",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python",
22
+ "Programming Language :: Python :: 3",
23
+ "Programming Language :: Python :: 3.9",
24
+ "Programming Language :: Python :: 3.10",
25
+ "Programming Language :: Python :: 3.11",
26
+ "Programming Language :: Python :: 3.12",
27
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
28
+ "Operating System :: OS Independent",
29
+ ]
30
+
31
+ dependencies = [
32
+ "pysam>=0.23",
33
+ "cyvcf2>=0.30",
34
+ "typer>=0.14",
35
+ "pandas>=2.1",
36
+ ]
37
+
38
+ [project.optional-dependencies]
39
+ plots = ["matplotlib>=3.7"]
40
+ bench = ["pytest-benchmark>=4.0", "py-spy>=0.3"]
41
+ dev = [
42
+ "pytest>=8",
43
+ "pytest-cov>=5",
44
+ "pytest-xdist>=3.5",
45
+ "hypothesis>=6.90",
46
+ "ruff>=0.5",
47
+ "mypy>=1.8",
48
+ "black>=24.3",
49
+ "pre-commit>=3.6",
50
+ "build>=1.2",
51
+ "twine>=5.0",
52
+ "tox>=4.10",
53
+ "pandas-stubs>=2.0",
54
+ ]
55
+
56
+ [project.urls]
57
+ Homepage = "https://github.com/your-org/svphaser"
58
+ Issues = "https://github.com/your-org/svphaser/issues"
59
+ Source = "https://github.com/your-org/svphaser"
60
+
61
+ [project.scripts]
62
+ svphaser = "svphaser.cli:app"
63
+
64
+ # -------------------------------------------------------------------
65
+ # Hatch build config (src/ layout + typed package)
66
+ # -------------------------------------------------------------------
67
+ [tool.hatch.build.targets.wheel]
68
+ packages = ["src/svphaser"]
69
+ include = ["src/svphaser/py.typed"]
70
+
71
+ [tool.hatch.build.targets.sdist]
72
+ include = ["src/**", "README.md", "LICENSE", "pyproject.toml"]
73
+
74
+ # Version from Git tags like v2.0.0 → 2.0.0
75
+ [tool.hatch.version]
76
+ source = "vcs"
77
+
78
+ [tool.hatch.build.hooks.vcs]
79
+ version-file = "src/svphaser/_version.py"
80
+ tag-pattern = "v(?P<version>.+)"
81
+
82
+ # -------------------------------------------------------------------
83
+ # Tooling (ruff / black / mypy / pytest / coverage)
84
+ # -------------------------------------------------------------------
85
+ [tool.black]
86
+ line-length = 100
87
+ target-version = ["py39"]
88
+
89
+ [tool.ruff]
90
+ line-length = 100
91
+ target-version = "py39"
92
+ exclude = [
93
+ "notebooks/**",
94
+ "docs/**/Presentation*/**",
95
+ ]
96
+
97
+ [tool.ruff.lint]
98
+ select = ["E", "F", "W", "I", "UP", "B", "C90"]
99
+ ignore = ["E203"] # black-compatible slicing
100
+
101
+ [tool.ruff.lint.mccabe]
102
+ max-complexity = 12
103
+
104
+ [tool.mypy]
105
+ python_version = "3.9"
106
+ strict = true
107
+ show_error_codes = true
108
+ warn_unreachable = true
109
+ ignore_missing_imports = true
110
+
111
+ [tool.pytest.ini_options]
112
+ minversion = "8.0"
113
+ addopts = "-q --strict-markers --disable-warnings"
114
+ testpaths = ["tests"]
115
+ xfail_strict = true
116
+
117
+ [tool.coverage.run]
118
+ branch = true
119
+ source = ["svphaser"]
120
+
121
+ [tool.coverage.report]
122
+ exclude_lines = [
123
+ "pragma: no cover",
124
+ "if TYPE_CHECKING:",
125
+ "raise NotImplementedError",
126
+ ]
127
+ precision = 1
@@ -0,0 +1,88 @@
1
+ """Top-level SvPhaser package.
2
+
3
+ Public surface kept tiny: a version string and a convenience helper
4
+ that calls the library’s main phasing routine.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+ # --------------------------------------------------------------------
12
+ # Robust version lookup:
13
+ # - Prefer installed package metadata (works for wheels and PEP 660 editables)
14
+ # - Fall back to placeholder in _version.py for raw-source/dev use
15
+ # --------------------------------------------------------------------
16
+ try:
17
+ from importlib.metadata import version as _pkg_version # Python 3.8+
18
+
19
+ __version__ = _pkg_version("svphaser")
20
+ except Exception:
21
+ try:
22
+ from ._version import __version__ # "0+unknown" in repo; overwritten in builds
23
+ except Exception: # highly defensive
24
+ __version__ = "0+unknown"
25
+
26
+ # Centralized defaults (keep CLI in sync)
27
+ DEFAULT_MIN_SUPPORT: int = 10
28
+ DEFAULT_MAJOR_DELTA: float = 0.70
29
+ DEFAULT_EQUAL_DELTA: float = 0.25
30
+ DEFAULT_GQ_BINS: str = "30:High,10:Moderate"
31
+
32
+
33
def phase(
    sv_vcf: Path | str,
    bam: Path | str,
    /,
    *,
    out_dir: Path | str = ".",
    min_support: int = DEFAULT_MIN_SUPPORT,
    major_delta: float = DEFAULT_MAJOR_DELTA,
    equal_delta: float = DEFAULT_EQUAL_DELTA,
    gq_bins: str = DEFAULT_GQ_BINS,
    threads: int | None = None,
) -> tuple[Path, Path]:
    """Phase *sv_vcf* using HP-tagged *bam*, writing outputs into *out_dir*.

    Thin wrapper around :py:func:`svphaser.phasing.io.phase_vcf` so users/tests
    can skip importing submodules.

    Parameters
    ----------
    sv_vcf, bam
        Input paths (positional-only). ``str`` values are converted to
        :class:`~pathlib.Path` before being handed to ``phase_vcf``, whose
        signature is annotated with ``Path``.
    out_dir
        Output directory; created (including parents) if it does not exist.
    min_support, major_delta, equal_delta, gq_bins, threads
        Forwarded unchanged to ``phase_vcf``.

    Returns
    -------
    (out_vcf_path, out_csv_path)
        ``<out_dir>/<stem>_phased.vcf`` and ``<out_dir>/<stem>_phased.csv``,
        where ``<stem>`` is the input file name with a trailing ``.vcf.gz`` or
        ``.vcf`` removed. These paths are computed here; ``phase_vcf`` itself
        returns nothing.
    """
    from .phasing.io import phase_vcf  # local import avoids heavy deps at import-time

    # Normalise every path-like argument once so the callee always sees Path.
    sv_vcf_p = Path(sv_vcf)
    bam_p = Path(bam)
    out_dir_p = Path(out_dir)
    out_dir_p.mkdir(parents=True, exist_ok=True)

    # Strip the VCF extension; ".vcf.gz" must be tried before ".vcf".
    stem = sv_vcf_p.name
    for ext in (".vcf.gz", ".vcf"):
        if stem.endswith(ext):
            stem = stem[: -len(ext)]
            break

    out_vcf = out_dir_p / f"{stem}_phased.vcf"
    out_csv = out_dir_p / f"{stem}_phased.csv"

    phase_vcf(
        sv_vcf_p,
        bam_p,
        out_dir=out_dir_p,
        min_support=min_support,
        major_delta=major_delta,
        equal_delta=equal_delta,
        gq_bins=gq_bins,
        threads=threads,
    )
    return out_vcf, out_csv
79
+
80
+
81
+ __all__ = [
82
+ "phase",
83
+ "__version__",
84
+ "DEFAULT_MIN_SUPPORT",
85
+ "DEFAULT_MAJOR_DELTA",
86
+ "DEFAULT_EQUAL_DELTA",
87
+ "DEFAULT_GQ_BINS",
88
+ ]
@@ -0,0 +1,5 @@
1
+ # src/svphaser/__main__.py
2
+ from .cli import app
3
+
4
+ if __name__ == "__main__":
5
+ app()
@@ -0,0 +1,34 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
+ __all__ = [
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
12
+
13
+ TYPE_CHECKING = False
14
+ if TYPE_CHECKING:
15
+ from typing import Tuple
16
+ from typing import Union
17
+
18
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
20
+ else:
21
+ VERSION_TUPLE = object
22
+ COMMIT_ID = object
23
+
24
+ version: str
25
+ __version__: str
26
+ __version_tuple__: VERSION_TUPLE
27
+ version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
30
+
31
+ __version__ = version = '2.0.1'
32
+ __version_tuple__ = version_tuple = (2, 0, 1)
33
+
34
+ __commit_id__ = commit_id = None
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ svphaser.cli
4
+ ============
5
+ Command-line interface for **SvPhaser**.
6
+
7
+ The program writes two files inside **--out-dir** (or the CWD):
8
+
9
+ * ``<stem>_phased.vcf`` (uncompressed; GT/GQ and optional HP_GQBIN injected)
10
+ * ``<stem>_phased.csv`` (tabular summary including gq_label column)
11
+ """
12
from __future__ import annotations

from pathlib import Path
from typing import Annotated, Optional

import typer

from svphaser import (
    DEFAULT_EQUAL_DELTA,
    DEFAULT_GQ_BINS,
    DEFAULT_MAJOR_DELTA,
    DEFAULT_MIN_SUPPORT,
    __version__,
)
26
+
27
+ app = typer.Typer(add_completion=False, rich_markup_mode="rich")
28
+
29
+
30
+ def _version_callback(value: bool):
31
+ if value:
32
+ typer.echo(__version__)
33
+ raise typer.Exit()
34
+
35
+
36
@app.callback()
def main(
    # Typer resolves this annotation at runtime, where ``bool | None`` cannot be
    # evaluated on Python 3.9 (the minimum supported version), so spell it
    # with Optional.
    version: Annotated[
        Optional[bool],  # noqa: UP007
        typer.Option(
            "--version",
            help="Show SvPhaser version and exit.",
            is_flag=True,
            callback=_version_callback,
            # Handle --version before any other parameter is processed.
            is_eager=True,
        ),
    ] = None,
) -> None:
    """SvPhaser – Structural-variant phasing from HP-tagged long-read BAMs."""
    # no-op; callback handles --version
    return
51
+
52
+
53
+ # ──────────────────────────────────────────────────────────────────────────
54
+ # phase command
55
+ # ──────────────────────────────────────────────────────────────────────────
56
@app.command("phase")
def phase_cmd(
    sv_vcf: Annotated[
        Path,
        typer.Argument(
            exists=True,
            help="Input *un-phased* SV VCF (.vcf or .vcf.gz)",
        ),
    ],
    bam: Annotated[
        Path,
        typer.Argument(
            exists=True,
            help="Long-read BAM/CRAM with HP tags",
        ),
    ],
    out_dir: Annotated[
        Path,
        typer.Option(
            "--out-dir",
            "-o",
            exists=False,
            file_okay=False,
            dir_okay=True,
            writable=True,
            help=(
                "Directory in which to write <stem>_phased.vcf & .csv "
                "(created if missing; defaults to current dir)."
            ),
            show_default=True,
        ),
    ] = Path("."),
    # ---------- thresholds ------------------------------------------------
    min_support: Annotated[
        int,
        typer.Option(
            help=(
                "Minimum HP-tagged reads per haplotype. "
                "SVs where *both* n1 AND n2 fall below this "
                "are dropped entirely."
            ),
            show_default=True,
        ),
    ] = DEFAULT_MIN_SUPPORT,
    major_delta: Annotated[
        float,
        typer.Option(
            help="r >= this ⇒ strong majority ⇒ GT 1|0 or 0|1",
            show_default=True,
        ),
    ] = DEFAULT_MAJOR_DELTA,
    equal_delta: Annotated[
        float,
        typer.Option(
            help="|n1−n2|/N ≤ this ⇒ near-tie ⇒ GT 1|1",
            show_default=True,
        ),
    ] = DEFAULT_EQUAL_DELTA,
    # ---------- confidence bins ------------------------------------------
    gq_bins: Annotated[
        str,
        typer.Option(
            help=(
                "Comma-separated GQ≥threshold:Label definitions "
                "(e.g. '30:High,10:Moderate'). Labels appear in the CSV "
                "[gq_label] and in the VCF INFO field HP_GQBIN when set."
            ),
            show_default=True,
        ),
    ] = DEFAULT_GQ_BINS,
    # ---------- multiprocessing ------------------------------------------
    # Optional[...] rather than ``int | None``: Typer resolves the annotation
    # at runtime, and the union operator on types is unavailable on Python 3.9.
    threads: Annotated[
        Optional[int],  # noqa: UP007
        typer.Option(
            "-t",
            "--threads",
            help="Worker processes to use (defaults to all CPU cores).",
            show_default=True,
        ),
    ] = None,
) -> None:
    """Phase structural variants using HP-tagged read evidence."""
    # Initialise logging BEFORE we import anything that might log
    from svphaser.logging import init as _init_logging

    _init_logging("INFO")  # or "DEBUG" if you want more detail

    # Resolve output paths; exist_ok avoids a check-then-create race.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Derive <stem> by removing the VCF extension (".vcf.gz" first, then ".vcf").
    stem = sv_vcf.name
    for ext in (".vcf.gz", ".vcf"):
        if stem.endswith(ext):
            stem = stem[: -len(ext)]
            break

    out_vcf = out_dir / f"{stem}_phased.vcf"
    out_csv = out_dir / f"{stem}_phased.csv"

    # Lazy import so `svphaser --help` works without heavy deps
    from svphaser.phasing.io import phase_vcf

    try:
        phase_vcf(
            sv_vcf,
            bam,
            out_dir=out_dir,
            min_support=min_support,
            major_delta=major_delta,
            equal_delta=equal_delta,
            gq_bins=gq_bins,
            threads=threads,
        )
        typer.secho(f"✔ Phased VCF → {out_vcf}", fg=typer.colors.GREEN)
        typer.secho(f"✔ Phased CSV → {out_csv}", fg=typer.colors.GREEN)
    except Exception:  # pragma: no cover
        # Announce the failure in colour, then let the original traceback surface.
        typer.secho(
            "[SvPhaser] 💥 Unhandled error during phasing",
            fg=typer.colors.RED,
        )
        raise
@@ -0,0 +1,34 @@
1
+ """
2
+ svphaser.logging
3
+ ================
4
+ One-liner that gives us colour-free, concise log messages on stderr.
5
+
6
+ Call :func:`init` *once*, early in the program, to install the handler.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import sys
13
+ from typing import Literal
14
+
15
+ _Level = Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
16
+
17
+
18
def init(level: _Level | int = "INFO") -> None:
    """Configure root logging to stderr unless the root logger already has handlers.

    Parameters
    ----------
    level
        Level name or numeric level passed to :func:`logging.basicConfig`.

    Repeated calls are harmless: once any handler is attached to the root
    logger the function does nothing (the level is then left untouched).
    """
    if not logging.getLogger().handlers:
        logging.basicConfig(
            format="%(levelname).1s | %(message)s",
            datefmt="%H:%M:%S",
            stream=sys.stderr,
            level=level,
        )
30
+
31
+
32
def get_logger(name: str) -> logging.Logger:
    """Return the :mod:`logging` logger registered under *name*."""
    named_logger = logging.getLogger(name)
    return named_logger
@@ -0,0 +1,20 @@
1
+ # src/svphaser/phasing/__init__.py
2
+ """Public API for svphaser.phasing."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+
8
+ from .algorithms import classify_haplotype, phasing_gq
9
+ from .io import phase_vcf
10
+ from .types import WorkerOpts
11
+
12
+ __all__ = [
13
+ "phase_vcf",
14
+ "classify_haplotype",
15
+ "phasing_gq",
16
+ "WorkerOpts",
17
+ ]
18
+
19
+ # Library logging: don't emit anything unless the app configures it.
20
+ logging.getLogger(__name__).addHandler(logging.NullHandler())
@@ -0,0 +1,106 @@
1
+ """
2
+ svphaser.phasing._workers
3
+ =========================
4
+ Worker-process code. Each worker:
5
+
6
+ 1. Opens the (possibly un-indexed) SV VCF.
7
+ 2. Scans only the records for *its* chromosome.
8
+ 3. Counts HP-tagged reads in the long-read BAM/CRAM.
9
+ 4. Classifies the haplotype + GQ, adds optional GQ-bin label.
10
+ 5. Returns a DataFrame to the parent.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from pathlib import Path
16
+
17
+ import pandas as pd
18
+ import pysam
19
+ from cyvcf2 import Reader, Variant # type: ignore
20
+
21
+ from .algorithms import classify_haplotype
22
+ from .types import WorkerOpts
23
+
24
+ __all__ = ["_phase_chrom_worker"]
25
+
26
+
27
+ def _count_hp_reads(
28
+ bam: pysam.AlignmentFile,
29
+ chrom: str,
30
+ start: int,
31
+ end: int,
32
+ ) -> tuple[int, int]:
33
+ hp1 = hp2 = 0
34
+ for read in bam.fetch(chrom, max(0, start - 1), end + 1):
35
+ if read.is_unmapped or read.is_secondary or read.is_supplementary:
36
+ continue
37
+ if not read.has_tag("HP"):
38
+ continue
39
+ tag = read.get_tag("HP")
40
+ if tag == 1:
41
+ hp1 += 1
42
+ elif tag == 2:
43
+ hp2 += 1
44
+ return hp1, hp2
45
+
46
+
47
+ def _has_tabix_index(vcf_path: Path) -> bool:
48
+ """Return True if <file>.tbi or <file>.csi exists (supports .vcf.gz.{tbi,csi})."""
49
+ return (
50
+ vcf_path.with_suffix(vcf_path.suffix + ".tbi").exists()
51
+ or vcf_path.with_suffix(vcf_path.suffix + ".csi").exists()
52
+ )
53
+
54
+
55
def _phase_chrom_worker(
    chrom: str,
    vcf_path: Path,
    bam_path: Path,
    opts: WorkerOpts,
) -> pd.DataFrame:
    """Phase every SV record on *chrom* and return one row per record.

    The worker opens its own BAM and VCF handles, iterates the VCF records of
    *chrom* (region query when a ``.tbi``/``.csi`` index exists, otherwise a
    linear scan filtered on ``CHROM``), counts HP-tagged reads over each record
    and classifies the haplotype.

    Parameters
    ----------
    chrom
        Chromosome/contig name to process.
    vcf_path, bam_path
        Input files; each is opened inside this function.
    opts
        Thresholds (``min_support``, ``major_delta``, ``equal_delta``) and the
        optional ``gq_bins`` list of ``(threshold, label)`` pairs.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom, pos, id, svtype, n1, n2, gt, gq``; ``gq_label`` is set
        only on rows whose GQ reaches one of the configured bins.
    """
    rows: list[dict[str, object]] = []

    bam = pysam.AlignmentFile(str(bam_path), "rb")
    try:
        rdr = Reader(str(vcf_path))
        try:
            # Region iteration needs an index; without one, scan and filter.
            if _has_tabix_index(vcf_path):
                records_iter = rdr(f"{chrom}")
            else:
                records_iter = (rec for rec in rdr if rec.CHROM == chrom)

            for rec in records_iter:  # type: ignore[arg-type]
                assert isinstance(rec, Variant)
                # Fall back to POS when the record exposes no end coordinate.
                sv_end = rec.end if getattr(rec, "end", None) is not None else rec.POS
                n1, n2 = _count_hp_reads(bam, chrom, rec.POS, sv_end)

                gt, gq = classify_haplotype(
                    n1,
                    n2,
                    min_support=opts.min_support,
                    major_delta=opts.major_delta,
                    equal_delta=opts.equal_delta,
                )

                row: dict[str, object] = dict(
                    chrom=chrom,
                    pos=rec.POS,  # cyvcf2 POS is already 1-based
                    id=rec.ID or ".",
                    svtype=rec.INFO.get("SVTYPE", "NA"),
                    n1=n1,
                    n2=n2,
                    gt=gt,
                    gq=gq,
                )

                if opts.gq_bins:
                    # First bin whose threshold is met wins; this relies on the
                    # bins being ordered from highest to lowest threshold.
                    for thr, label in opts.gq_bins:
                        if gq >= thr:
                            row["gq_label"] = label
                            break

                rows.append(row)
        finally:
            # Release the VCF handle even if a record fails mid-scan.
            rdr.close()
    finally:
        bam.close()
    return pd.DataFrame(rows)
@@ -0,0 +1,71 @@
1
+ """Pure maths for SvPhaser – overflow-safe revision.
2
+
3
+ 1) Exact binomial tail for small depth (N ≤ 200).
4
+ 2) Continuity-corrected normal approximation for deep coverage (N > 200).
5
+ 3) Phred GQ capped at 99.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import math
11
+
12
+ __all__ = ["classify_haplotype", "phasing_gq"]
13
+
14
NORMAL_THRESHOLD = 200  # switch to Gaussian SF above this depth
MAX_GQ = 99


def phasing_gq(n1: int, n2: int) -> int:
    """Return Phred-scaled Genotype Quality, overflow-safe for deep data.

    The error probability is the upper tail ``P(X >= max(n1, n2))`` of a
    Binomial(``n1 + n2``, 0.5) distribution:

    * ``n1 + n2 <= NORMAL_THRESHOLD``: exact tail, computed with integer
      arithmetic and a single division so no float rounding accumulates.
    * ``n1 + n2 > NORMAL_THRESHOLD``: continuity-corrected normal approximation.

    Parameters
    ----------
    n1, n2
        Read counts for haplotype 1 and haplotype 2.

    Returns
    -------
    int
        ``round(-10 * log10(p_err))`` capped at ``MAX_GQ``; 0 when both counts
        are zero.
    """
    total = n1 + n2
    if total == 0:
        return 0

    k = max(n1, n2)

    if total > NORMAL_THRESHOLD:
        mu = total / 2.0
        sigma = math.sqrt(total * 0.25)
        z = (k - 0.5 - mu) / sigma
        p_err = 0.5 * math.erfc(z / math.sqrt(2.0))  # survival function
    else:
        # With p = 0.5 every outcome has probability 2**-total, so the tail is
        # (number of favourable outcomes) / 2**total.
        favourable = sum(math.comb(total, i) for i in range(k, total + 1))
        p_err = favourable / 2**total

    p_err = max(p_err, 1e-300)  # guard log(0)
    gq = int(round(-10.0 * math.log10(p_err)))
    return min(gq, MAX_GQ)
41
+
42
+
43
def classify_haplotype(
    n1: int,
    n2: int,
    *,
    min_support: int = 10,
    major_delta: float = 0.70,
    equal_delta: float = 0.25,
) -> tuple[str, int]:
    """Classify an SV from its per-haplotype read counts.

    Decision order:

    1. No reads at all, or both counts below *min_support* -> ``("./.", 0)``.
    2. ``n1 / N >= major_delta`` -> ``"1|0"``.
    3. ``n2 / N >= major_delta`` -> ``"0|1"``.
    4. ``|n1 - n2| / N <= equal_delta`` -> ``"1|1"``.
    5. Anything else -> ``"./."``.

    For cases 2-5 the returned quality is ``phasing_gq(n1, n2)``.

    Returns
    -------
    (GT, GQ)
    """
    depth = n1 + n2
    if depth == 0 or (n1 < min_support and n2 < min_support):
        return "./.", 0

    quality = phasing_gq(n1, n2)
    frac_hp1 = n1 / depth
    frac_hp2 = n2 / depth

    if frac_hp1 >= major_delta:
        return "1|0", quality
    if frac_hp2 >= major_delta:
        return "0|1", quality
    if abs(n1 - n2) / depth <= equal_delta:
        return "1|1", quality
    return "./.", quality
@@ -0,0 +1,258 @@
1
+ """
2
+ svphaser.phasing.io
3
+ ===================
4
+ High-level “engine” – orchestrates per-chromosome workers, merges results,
5
+ applies the global depth filter, then writes CSV + VCF.
6
+
7
+ Workers receive only simple (pickle-safe) arguments; each worker opens its
8
+ own BAM/VCF to avoid sharing handles between processes.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ import multiprocessing as mp
15
+ from pathlib import Path
16
+
17
+ import pandas as pd
18
+ from cyvcf2 import Reader
19
+
20
+ from ._workers import _phase_chrom_worker
21
+ from .types import GQBin, WorkerOpts
22
+
23
+ __all__ = ["phase_vcf"]
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
def _parse_gq_bins(gq_bins: str) -> list[tuple[int, str]]:
    """Turn ``"30:High,10:Moderate"`` into ``[(30, "High"), (10, "Moderate")]``.

    Blank comma-separated parts are ignored and the result is sorted by
    threshold, highest first.

    Raises:
        ValueError: If a part is not exactly ``<int>:<label>``.
    """
    bins: list[tuple[int, str]] = []
    for part in gq_bins.split(","):
        thr_lbl = part.strip()
        if not thr_lbl:
            continue
        try:
            thr_s, lbl = thr_lbl.split(":")
            # Parsed inside the try so a non-numeric threshold gets the same
            # friendly message as a missing/extra colon.
            threshold = int(thr_s)
        except ValueError as err:
            raise ValueError(
                f"Invalid gq-bin specifier: '{thr_lbl}'. " "Use '30:High,10:Moderate'."
            ) from err
        bins.append((threshold, lbl))
    bins.sort(key=lambda x: x[0], reverse=True)
    return bins


def phase_vcf(
    sv_vcf: Path,
    bam: Path,
    *,
    out_dir: Path,
    min_support: int,
    major_delta: float,
    equal_delta: float,
    gq_bins: str,
    threads: int | None,
) -> None:
    """Phase *sv_vcf* against *bam* and write outputs to *out_dir*.

    Files:
      - *_phased.vcf
      - *_phased.csv
      - *_dropped_svs.csv

    Args:
        sv_vcf: Input structural-variant VCF (``.vcf`` or ``.vcf.gz``).
        bam: Alignment file handed to every worker.
        out_dir: Output directory; created if missing.
        min_support: Rows where both ``n1`` and ``n2`` are below this value are
            moved to the "dropped" CSV.
        major_delta: Forwarded to the workers via ``WorkerOpts``.
        equal_delta: Forwarded to the workers via ``WorkerOpts``.
        gq_bins: Comma-separated ``threshold:label`` pairs; may be empty.
        threads: Worker process count; falsy means "use ``mp.cpu_count()``".

    Raises:
        ValueError: If *gq_bins* contains a malformed specifier.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # 1 ─ Parse --gq-bins → list[(int,label)]
    bins = _parse_gq_bins(gq_bins)

    # 2 ─ Build immutable options holder for workers
    opts = WorkerOpts(
        min_support=min_support,
        major_delta=major_delta,
        equal_delta=equal_delta,
        gq_bins=bins,
    )

    # 3 ─ Discover chromosomes (cheap – no variants parsed yet)
    rdr = Reader(str(sv_vcf))
    try:
        chroms: tuple[str, ...] = tuple(rdr.seqnames)
    finally:
        rdr.close()

    # 4 ─ Launch one worker per chromosome (or ≤threads)
    worker_args: list[tuple[str, Path, Path, WorkerOpts]] = [(c, sv_vcf, bam, opts) for c in chroms]

    threads = threads or mp.cpu_count() or 1
    logger.info("SvPhaser ▶ workers: %d", threads)

    dataframes: list[pd.DataFrame] = []

    if threads == 1:
        # Serial path is handy for debugging
        for args in worker_args:
            df = _phase_chrom_worker(*args)
            dataframes.append(df)
            chrom = df.iloc[0]["chrom"] if not df.empty else "?"
            logger.info("chr %-6s ✔ phased %5d SVs", chrom, len(df))
    else:
        # Use 'fork' when available (fast on Linux); fall back to 'spawn' elsewhere.
        try:
            ctx = mp.get_context("fork")
        except ValueError:
            ctx = mp.get_context("spawn")

        with ctx.Pool(processes=threads) as pool:
            for df in pool.starmap(_phase_chrom_worker, worker_args, chunksize=1):
                dataframes.append(df)
                chrom = df.iloc[0]["chrom"] if not df.empty else "?"
                logger.info("chr %-6s ✔ phased %5d SVs", chrom, len(df))

    # 5 ─ Merge & apply *global* depth filter
    # Column-less empty frames (a chromosome with no SVs) are left out of the
    # concat; if *every* frame is empty, concatenating them would give a frame
    # without an "n1" column and the filter below would raise KeyError.
    populated = [df for df in dataframes if not df.empty]
    if populated:
        merged = pd.concat(populated, ignore_index=True)
    else:
        merged = pd.DataFrame(
            columns=["chrom", "pos", "id", "svtype", "n1", "n2", "gt", "gq", "gq_label"]
        )

    pre = len(merged)
    keep = ~((merged["n1"] < min_support) & (merged["n2"] < min_support))

    stem = sv_vcf.name.removesuffix(".vcf.gz").removesuffix(".vcf")

    # Save dropped SVs for transparency
    dropped_csv = out_dir / f"{stem}_dropped_svs.csv"
    merged.loc[~keep].to_csv(dropped_csv, index=False)
    logger.info("Dropped SVs → %s (%d SVs)", dropped_csv, int((~keep).sum()))

    kept = merged.loc[keep].reset_index(drop=True)
    if dropped := pre - len(kept):
        logger.info("Depth filter removed %d SVs", dropped)

    # 6 ─ Write CSV
    out_csv = out_dir / f"{stem}_phased.csv"
    kept.to_csv(out_csv, index=False)
    logger.info("CSV → %s (%d SVs)", out_csv, len(kept))

    # 7 ─ Write VCF
    out_vcf = out_dir / f"{stem}_phased.vcf"
    _write_phased_vcf(out_vcf, sv_vcf, kept, gqbin_in_header=bool(bins))
    logger.info("VCF → %s", out_vcf)
136
+
137
+
138
+ # ──────────────────────────────────────────────────────────────────────
139
+ # Small helpers to keep complexity down
140
+ # ──────────────────────────────────────────────────────────────────────
141
def _vcf_info_lookup(
    in_vcf: Path,
) -> tuple[dict[tuple[str, int, str], dict[str, object]], list[str], str]:
    """Scan input VCF once: return (lookup, raw_header_lines, sample_name).

    Args:
        in_vcf: Path of the VCF to read.

    Returns:
        A 3-tuple of

        * ``lookup`` – maps ``(CHROM, POS, ID or ".")`` to a dict with the keys
          ``"REF"``, ``"ALT"``, ``"QUAL"``, ``"FILTER"`` and ``"INFO"``.  A
          later record with the same key overwrites an earlier one.
        * ``raw_header_lines`` – the reader's raw header split into lines.
        * ``sample_name`` – the first sample name, or ``"SAMPLE"`` when the
          input declares none.
    """
    rdr = Reader(str(in_vcf))
    # try/finally so the reader handle is released even if parsing a record fails.
    try:
        raw_header_lines = rdr.raw_header.strip().splitlines()
        sample_name = rdr.samples[0] if rdr.samples else "SAMPLE"

        lookup: dict[tuple[str, int, str], dict[str, object]] = {}
        for rec in rdr:
            key = (rec.CHROM, rec.POS, rec.ID or ".")

            info_dict: dict[str, object] = {}
            for k in rec.INFO:
                # Iteration may yield (key, value) tuples or bare keys; accept both.
                info_key = k[0] if isinstance(k, tuple) else k
                v = rec.INFO.get(info_key)
                if v is not None:
                    info_dict[info_key] = v

            lookup[key] = {
                "REF": rec.REF,
                # Only the first ALT allele is kept; "<N>" stands in when there is none.
                "ALT": rec.ALT[0] if rec.ALT else "<N>",
                "QUAL": rec.QUAL if rec.QUAL is not None else ".",
                # A falsy FILTER is written out as PASS.
                "FILTER": rec.FILTER if rec.FILTER else "PASS",
                "INFO": info_dict,
            }
    finally:
        rdr.close()
    return lookup, raw_header_lines, sample_name
167
+
168
+
169
+ def _write_headers(
170
+ out,
171
+ raw_header_lines: list[str],
172
+ sample_name: str,
173
+ *,
174
+ gqbin_in_header: bool,
175
+ ) -> None:
176
+ """Write preserved meta headers + ensure GT/GQ/GQBIN, then the column header."""
177
+ have_gt = any("##FORMAT=<ID=GT" in ln for ln in raw_header_lines)
178
+ have_gq = any("##FORMAT=<ID=GQ" in ln for ln in raw_header_lines)
179
+ have_gqbin = any("##INFO=<ID=GQBIN" in ln for ln in raw_header_lines)
180
+
181
+ for line in raw_header_lines:
182
+ if line.startswith("##"):
183
+ out.write(line.rstrip() + "\n")
184
+
185
+ if not have_gt:
186
+ out.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Phased genotype">\n')
187
+ if not have_gq:
188
+ out.write(
189
+ "##FORMAT=<ID=GQ,Number=1,Type=Integer," 'Description="Genotype Quality (Phred)">\n'
190
+ )
191
+ if gqbin_in_header and not have_gqbin:
192
+ out.write(
193
+ "##INFO=<ID=GQBIN,Number=1,Type=String," 'Description="GQ bin label from SvPhaser">\n'
194
+ )
195
+
196
+ out.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample_name + "\n")
197
+
198
+
199
+ def _compose_info_str(
200
+ orig_info: dict[str, object],
201
+ svtype: object,
202
+ gq_label: object,
203
+ ) -> str:
204
+ """Compose the INFO string with SVTYPE first, original keys (no duplicate), then GQBIN."""
205
+ items: list[str] = []
206
+ for k, v in orig_info.items():
207
+ if k == "SVTYPE":
208
+ continue
209
+ items.append(f"{k}={v}")
210
+ if svtype:
211
+ items.insert(0, f"SVTYPE={svtype}")
212
+ if gq_label is not None and pd.notnull(gq_label):
213
+ items.append(f"GQBIN={gq_label}")
214
+ return ";".join(items) if items else "."
215
+
216
+
217
def _write_phased_vcf(
    out_vcf: Path,
    in_vcf: Path,
    df: pd.DataFrame,
    *,
    gqbin_in_header: bool,
) -> None:
    """Emit the phased calls in *df* as a tab-delimited VCF at *out_vcf*.

    REF/ALT/QUAL/FILTER/INFO are taken from the matching record of *in_vcf*
    (matched on chrom, pos and id); GT and GQ come from *df*.  Rows with no
    matching input record are skipped with a warning.

    Args:
        out_vcf: Destination path; overwritten if it exists.
        in_vcf: Original VCF supplying header lines and per-record fields.
        df: One row per call; the columns ``chrom``, ``pos``, ``id``, ``gt``,
            ``gq``, ``svtype`` and ``gq_label`` are read when present.
        gqbin_in_header: Passed through to the header writer.
    """
    lookup, raw_header_lines, sample_name = _vcf_info_lookup(in_vcf)

    with open(out_vcf, "w", newline="") as handle:
        _write_headers(handle, raw_header_lines, sample_name, gqbin_in_header=gqbin_in_header)

        for call in df.itertuples(index=False):
            chrom = str(getattr(call, "chrom", "."))
            pos = int(getattr(call, "pos", 0))
            vid = str(getattr(call, "id", "."))

            source = lookup.get((chrom, pos, vid))
            if source is None:
                logger.warning("Could not find VCF info for %s:%s %s", chrom, pos, vid)
                continue

            genotype = str(getattr(call, "gt", "./."))
            quality = str(getattr(call, "gq", "0"))
            info_str = _compose_info_str(
                source["INFO"],
                getattr(call, "svtype", None),
                getattr(call, "gq_label", None),
            )

            record = "\t".join(
                (
                    chrom,
                    str(pos),
                    vid,
                    str(source["REF"]),
                    str(source["ALT"]),
                    str(source["QUAL"]),
                    str(source["FILTER"]),
                    info_str,
                    "GT:GQ",
                    f"{genotype}:{quality}",
                )
            )
            handle.write(record + "\n")
@@ -0,0 +1,31 @@
1
+ """svphaser.phasing.types
2
+ ========================
3
+ Central place for common type aliases & lightweight data classes.
4
+ Keeping them here avoids circular imports and MyPy noise.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+ from typing import NamedTuple
11
+
12
+ SVKey = tuple[str, int, str] # (chrom, POS, ID) – ID is "." if empty
13
+ GQBin = tuple[int, str] # (threshold, label), e.g. (30, "High")
14
+
15
+
16
+ @dataclass(slots=True, frozen=True)
17
+ class WorkerOpts:
18
+ """Non-changing knobs passed into every worker."""
19
+
20
+ min_support: int
21
+ major_delta: float
22
+ equal_delta: float
23
+ gq_bins: list[GQBin] # already parsed by cli → phase_vcf
24
+
25
+
26
+ class CallTuple(NamedTuple):
27
+ """Return type per-variant from algorithms.classify_haplotype()."""
28
+
29
+ gt: str
30
+ gq: int
31
+ gq_label: str | None
File without changes