varcode 2.2.1__tar.gz → 2.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {varcode-2.2.1/varcode.egg-info → varcode-2.3.0}/PKG-INFO +1 -1
- varcode-2.3.0/tests/test_genotype.py +266 -0
- varcode-2.3.0/tests/test_genotype_from_vcf.py +248 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/__init__.py +7 -1
- {varcode-2.2.1 → varcode-2.3.0}/varcode/errors.py +5 -0
- varcode-2.3.0/varcode/genotype.py +200 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/variant_collection.py +138 -0
- varcode-2.3.0/varcode/version.py +1 -0
- {varcode-2.2.1 → varcode-2.3.0/varcode.egg-info}/PKG-INFO +1 -1
- {varcode-2.2.1 → varcode-2.3.0}/varcode.egg-info/SOURCES.txt +3 -0
- varcode-2.2.1/varcode/version.py +0 -1
- {varcode-2.2.1 → varcode-2.3.0}/LICENSE +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/MANIFEST.in +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/README.md +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/pyproject.toml +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/requirements.txt +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/setup.cfg +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/__init__.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/benchmark_vcf_load.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/common.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/data.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_cli_effects.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_cli_genes.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_collection_filtering.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_collection_variants_attr_consistency.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_common.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_cosmic_mutations.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_csv_roundtrip.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_dbnsfp_validation.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_effect_annotation_errors.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_effect_classes.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_effect_collection.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_effect_collection_serialization.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_effect_collection_sort_order.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_effects_from_mutagenix_variants.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_exonic_splice_site.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_frameshift_helpers.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_maf.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_mm10_klf6_frameshift.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_mouse.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_mutate.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_no_duplicate_variants.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_premature_stop_short_description.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_problematic_variants.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_reference.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_reference_mismatch_error.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_silent_aa_pos.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_silent_hgvs_description.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_splice_site_effects.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_stop_codon_classification_bugs.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_string_helpers.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_symbolic_alleles.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_timings.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_variant.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_variant_collection.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_vcf.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/tests/test_vcf_output.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/cli/__init__.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/cli/effects_script.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/cli/genes_script.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/cli/logging.conf +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/cli/variant_args.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/cli/version_info.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/common.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/csv_helpers.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/__init__.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/common.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/effect_classes.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/effect_collection.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/effect_helpers.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/effect_ordering.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/effect_prediction.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/effect_prediction_coding.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/effect_prediction_coding_frameshift.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/effect_prediction_coding_in_frame.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/mutate.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/transcript_helpers.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/effects/translate.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/maf.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/nucleotides.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/reference.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/string_helpers.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/ucsc_reference_names.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/util.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/variant.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/vcf.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode/vcf_output.py +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode.egg-info/dependency_links.txt +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode.egg-info/entry_points.txt +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode.egg-info/requires.txt +0 -0
- {varcode-2.2.1 → varcode-2.3.0}/varcode.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""
|
|
14
|
+
Unit tests for the Genotype dataclass and GT parsing (openvax/varcode#267).
|
|
15
|
+
|
|
16
|
+
Tests that touch a real VCF end-to-end live in
|
|
17
|
+
``tests/test_genotype_from_vcf.py``.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import pytest
|
|
21
|
+
|
|
22
|
+
from varcode import Genotype, Zygosity
|
|
23
|
+
from varcode.genotype import parse_gt_string
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ------------------------------------------------------------------
|
|
27
|
+
# parse_gt_string: one low-level function, many cases
|
|
28
|
+
# ------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_parse_gt_diploid_unphased():
    """Slash-delimited diploid calls parse to an allele pair, phased=False."""
    expected_alleles = {
        "0/1": (0, 1),
        "1/0": (1, 0),
        "1/1": (1, 1),
        "0/0": (0, 0),
    }
    for gt_text, alleles in expected_alleles.items():
        assert parse_gt_string(gt_text) == (alleles, False)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_parse_gt_diploid_phased():
    """Pipe-delimited diploid calls keep allele order and report phased=True."""
    for gt_text, alleles in (("0|1", (0, 1)), ("1|0", (1, 0))):
        assert parse_gt_string(gt_text) == (alleles, True)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_parse_gt_multiallelic():
    """Allele indices above 1 are parsed as-is.

    ``1/2`` means one copy of alt #1 and one copy of alt #2.
    """
    cases = (
        ("1/2", ((1, 2), False)),
        ("2|1", ((2, 1), True)),
    )
    for gt_text, expected in cases:
        assert parse_gt_string(gt_text) == expected
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_parse_gt_haploid():
    """A single-allele GT (chrX/Y in males, mitochondrial, ...) gives a 1-tuple."""
    for gt_text, allele in (("1", 1), ("0", 0)):
        assert parse_gt_string(gt_text) == ((allele,), False)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_parse_gt_missing():
    """Every spelling of a missing call parses to ``None`` alleles, unphased."""
    # A diploid no-call keeps its ploidy ...
    assert parse_gt_string("./.") == ((None, None), False)
    # ... while ".", the empty string and None all collapse to one None allele.
    single_missing = ((None,), False)
    for gt_value in (".", "", None):
        assert parse_gt_string(gt_value) == single_missing
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_parse_gt_partial_missing():
    """Half-called genotypes keep ``None`` in the missing haplotype's slot."""
    cases = (
        ("./1", ((None, 1), False)),
        ("0/.", ((0, None), False)),
        (".|1", ((None, 1), True)),
    )
    for gt_text, expected in cases:
        assert parse_gt_string(gt_text) == expected
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_parse_gt_polyploid():
    """Triploid (e.g. trisomy) or higher calls just yield a longer tuple."""
    allele_indices, is_phased = parse_gt_string("0/1/1")
    assert allele_indices == (0, 1, 1)
    assert is_phased is False
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ------------------------------------------------------------------
|
|
77
|
+
# Genotype construction from pyvcf-style sample info dicts
|
|
78
|
+
# ------------------------------------------------------------------
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_genotype_from_sample_info_full():
    """GT, AD, DP and GQ from a sample-info dict map onto Genotype fields."""
    sample_info = {
        "GT": "0/1",
        "AD": [10, 5],
        "DP": 15,
        "GQ": 99,
    }
    call = Genotype.from_sample_info(sample_info)
    assert call.raw_gt == "0/1"
    assert call.alleles == (0, 1)
    assert call.phased is False
    assert call.allele_depths == (10, 5)
    assert call.total_depth == 15
    assert call.genotype_quality == 99
    # No PS key was supplied, so no phase set is expected.
    assert call.phase_set is None
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_genotype_from_sample_info_phased_with_ps():
    """A ``|``-delimited GT with a PS value gives a phased call with that phase set."""
    sample_info = {
        "GT": "0|1",
        "PS": 100,
        "AD": [8, 7],
        "DP": 15,
        "GQ": 99,
    }
    call = Genotype.from_sample_info(sample_info)
    assert call.phased is True
    assert call.phase_set == 100
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def test_genotype_from_sample_info_nocall_handles_none_values():
    """A ``./.`` call whose other FORMAT fields are all ``None`` is accepted.

    Pyvcf returns None for all fields when the call is ./.
    """
    nocall_info = {"GT": "./.", "AD": None, "DP": None, "GQ": None}
    call = Genotype.from_sample_info(nocall_info)
    assert call.alleles == (None, None)
    assert call.allele_depths is None
    assert call.is_missing
    assert not call.is_called
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def test_genotype_from_sample_info_none_input():
    """``None`` in place of the dict gives a missing diploid genotype.

    Covers an entirely absent sample_info dict (e.g. a variant that was not
    constructed from a VCF).
    """
    call = Genotype.from_sample_info(None)
    assert call.is_missing
    assert call.alleles == (None, None)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ------------------------------------------------------------------
|
|
132
|
+
# General predicates (alt-agnostic)
|
|
133
|
+
# ------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def test_is_called_and_is_missing():
    """``is_called`` / ``is_missing`` for full, absent and partial calls."""
    het_call = Genotype.from_sample_info({"GT": "0/1"})
    assert het_call.is_called

    nocall_first = Genotype.from_sample_info({"GT": "./."})
    assert not nocall_first.is_called
    nocall_second = Genotype.from_sample_info({"GT": "./."})
    assert nocall_second.is_missing

    # Partial-missing is still "called" because one allele is known.
    partial_call = Genotype.from_sample_info({"GT": "./1"})
    assert partial_call.is_called
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def test_ploidy():
    """``ploidy`` equals the number of allele slots in the GT string."""
    for gt_text, expected_ploidy in (("0/1", 2), ("1", 1), ("0/1/1", 3)):
        assert Genotype.from_sample_info({"GT": gt_text}).ploidy == expected_ploidy
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def test_is_haploid():
    """``is_haploid`` is truthy for a one-allele call and falsy for a diploid one."""
    single_allele = Genotype.from_sample_info({"GT": "1"})
    assert single_allele.is_haploid
    two_alleles = Genotype.from_sample_info({"GT": "0/1"})
    assert not two_alleles.is_haploid
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# ------------------------------------------------------------------
|
|
156
|
+
# Alt-relative zygosity (the business end of the API)
|
|
157
|
+
# ------------------------------------------------------------------
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def test_zygosity_heterozygous_simple():
    """A 0/1 call is heterozygous for alt 1 and carries one copy of it."""
    het_call = Genotype.from_sample_info({"GT": "0/1"})
    assert het_call.zygosity_for_alt(1) is Zygosity.HETEROZYGOUS
    assert het_call.carries_alt(1)
    assert het_call.copies_of_alt(1) == 1
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def test_zygosity_homozygous_alt_simple():
    """A 1/1 call is homozygous for alt 1 and carries two copies of it."""
    hom_call = Genotype.from_sample_info({"GT": "1/1"})
    assert hom_call.zygosity_for_alt(1) is Zygosity.HOMOZYGOUS
    assert hom_call.carries_alt(1)
    assert hom_call.copies_of_alt(1) == 2
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def test_zygosity_homozygous_ref_is_absent_for_any_alt():
    """A 0/0 call does not carry alt 1, so the alt is reported as ABSENT."""
    ref_call = Genotype.from_sample_info({"GT": "0/0"})
    alt_index = 1
    assert ref_call.zygosity_for_alt(alt_index) is Zygosity.ABSENT
    assert not ref_call.carries_alt(alt_index)
    assert ref_call.copies_of_alt(alt_index) == 0
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def test_zygosity_missing():
    """A ./. call is MISSING for alt 1 and counts zero copies of it."""
    nocall = Genotype.from_sample_info({"GT": "./."})
    alt_index = 1
    assert nocall.zygosity_for_alt(alt_index) is Zygosity.MISSING
    assert not nocall.carries_alt(alt_index)
    assert nocall.copies_of_alt(alt_index) == 0
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def test_zygosity_multiallelic_querying_different_alts():
    """Zygosity of a 1/2 call depends on which alt index is queried.

    GT = 1/2 means one copy of alt #1 and one copy of alt #2, so each of
    those alts is het (one copy of it, one of a different alt), while
    alt #3 is not carried and is therefore absent.
    """
    call = Genotype.from_sample_info({"GT": "1/2"})
    expected_by_alt = (
        (1, Zygosity.HETEROZYGOUS),
        (2, Zygosity.HETEROZYGOUS),
        (3, Zygosity.ABSENT),
    )
    for alt_index, expected in expected_by_alt:
        assert call.zygosity_for_alt(alt_index) is expected
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def test_zygosity_multiallelic_homozygous_for_second_alt():
    """GT = 2/2: both copies are alt #2, so alt 2 is hom and alt 1 is absent."""
    call = Genotype.from_sample_info({"GT": "2/2"})
    for alt_index, expected in ((2, Zygosity.HOMOZYGOUS), (1, Zygosity.ABSENT)):
        assert call.zygosity_for_alt(alt_index) is expected
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def test_zygosity_haploid_single_alt():
    """A haploid alt call (e.g. chrY in a male, GT = 1) is HOMOZYGOUS.

    With a single allele equal to the alt, all copies are alt, so the
    call is classified as homozygous.
    """
    haploid_call = Genotype.from_sample_info({"GT": "1"})
    assert haploid_call.zygosity_for_alt(1) is Zygosity.HOMOZYGOUS
    assert haploid_call.carries_alt(1)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def test_zygosity_partial_call():
    """GT = ./1 is HOMOZYGOUS for alt 1 when only called alleles are counted.

    One allele is missing and the other is alt #1. Of the called alleles
    (just one), all are alt #1, which is the defensible reading.
    """
    partial_call = Genotype.from_sample_info({"GT": "./1"})
    assert partial_call.zygosity_for_alt(1) is Zygosity.HOMOZYGOUS
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
# ------------------------------------------------------------------
|
|
226
|
+
# Per-allele depth lookup
|
|
227
|
+
# ------------------------------------------------------------------
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def test_depth_for_alt():
    """``depth_for_alt`` indexes into AD and gives None past its end."""
    call = Genotype.from_sample_info({"GT": "0/1", "AD": [10, 5]})
    ref_depth = call.depth_for_alt(0)
    assert ref_depth == 10
    first_alt_depth = call.depth_for_alt(1)
    assert first_alt_depth == 5
    # Index 2 is beyond the two AD entries.
    assert call.depth_for_alt(2) is None
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def test_depth_for_alt_returns_none_when_ad_missing():
    """Without an AD entry in the sample info there is no per-allele depth."""
    call_without_ad = Genotype.from_sample_info({"GT": "0/1"})
    depth = call_without_ad.depth_for_alt(1)
    assert depth is None
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def test_depth_for_alt_multiallelic():
    """AD has one entry per allele (ref + each alt); each index is retrievable."""
    allele_depths = [3, 7, 5]
    call = Genotype.from_sample_info({"GT": "1/2", "AD": allele_depths})
    for allele_index, expected_depth in enumerate((3, 7, 5)):
        assert call.depth_for_alt(allele_index) == expected_depth
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# ------------------------------------------------------------------
|
|
251
|
+
# Dataclass-y ergonomics: hashable, equatable, frozen
|
|
252
|
+
# ------------------------------------------------------------------
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def test_genotype_is_frozen():
    """Assigning to a field of a ``Genotype`` must be rejected.

    ``Genotype`` is declared with ``@dataclass(frozen=True)``, so attribute
    assignment raises ``dataclasses.FrozenInstanceError``, which is a
    subclass of ``AttributeError``.
    """
    gt = Genotype.from_sample_info({"GT": "0/1"})
    # Expect AttributeError specifically: a tuple that also lists
    # ``Exception`` would let any unrelated failure satisfy this test.
    with pytest.raises(AttributeError):
        gt.alleles = (1, 1)  # type: ignore
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def test_genotype_equality():
    """Genotypes built from equal sample info are equal; different ones are not."""
    het_info = {"GT": "0/1", "AD": [10, 5], "DP": 15}
    first = Genotype.from_sample_info(dict(het_info))
    second = Genotype.from_sample_info(dict(het_info))
    assert first == second
    hom_alt = Genotype.from_sample_info({"GT": "1/1", "AD": [0, 15], "DP": 15})
    assert first != hom_alt
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""
|
|
14
|
+
End-to-end tests for VariantCollection genotype/zygosity access —
|
|
15
|
+
exercises the full path from VCF → Variant → Genotype (openvax/varcode#267).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import tempfile
|
|
20
|
+
|
|
21
|
+
import pytest
|
|
22
|
+
|
|
23
|
+
from varcode import (
|
|
24
|
+
Genotype,
|
|
25
|
+
SampleNotFoundError,
|
|
26
|
+
Zygosity,
|
|
27
|
+
load_vcf,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
VCF_BODY = """##fileformat=VCFv4.1
|
|
32
|
+
##reference=GRCh38
|
|
33
|
+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
|
34
|
+
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allele depths">
|
|
35
|
+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
|
|
36
|
+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality">
|
|
37
|
+
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
|
|
38
|
+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT tumor normal
|
|
39
|
+
17 43082575 . C T 100 PASS . GT:AD:DP:GQ 0/1:10,5:15:99 0/0:20,0:20:99
|
|
40
|
+
7 117531114 . G T 100 PASS . GT:AD:DP:GQ 1/1:0,20:20:99 0/1:10,10:20:99
|
|
41
|
+
1 100 . A T,G 100 PASS . GT:AD:DP:GQ 1/2:5,5,5:15:99 0/1:10,5,0:15:99
|
|
42
|
+
17 43082576 . C A 100 PASS . GT:AD:DP:GQ:PS 0|1:8,7:15:99:100 ./.:.:.:.:.
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@pytest.fixture
def multi_sample_vcf():
    """Yield the path of a temporary ``.vcf`` file containing ``VCF_BODY``.

    The file is deleted again once the requesting test has finished.
    """
    handle, vcf_path = tempfile.mkstemp(suffix=".vcf")
    # fdopen takes over the descriptor from mkstemp and closes it on exit.
    with os.fdopen(handle, "w") as out:
        out.write(VCF_BODY)
    yield vcf_path
    os.unlink(vcf_path)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# -------------------------------------------------------------------
|
|
56
|
+
# Sample discovery
|
|
57
|
+
# -------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_samples_property_lists_names(multi_sample_vcf):
    """``samples`` lists both sample names of the fixture VCF."""
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    sample_names = collection.samples
    assert sample_names == ["normal", "tumor"]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def test_has_sample_data_true_for_vcf_with_samples(multi_sample_vcf):
    """A collection loaded from a VCF with sample columns reports sample data."""
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    has_data = collection.has_sample_data()
    assert has_data is True
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_has_sample_data_false_for_directly_constructed(multi_sample_vcf):
    """Directly-constructed variants have no sample_info metadata.

    NOTE(review): ``multi_sample_vcf`` is requested but never used here, so
    the temporary VCF is written for nothing; consider dropping the fixture.
    """
    from varcode import Variant, VariantCollection
    direct_variant = Variant("17", 43082575, "C", "T", "GRCh38")
    collection = VariantCollection(variants=[direct_variant])
    assert collection.has_sample_data() is False
    assert collection.samples == []
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# -------------------------------------------------------------------
|
|
81
|
+
# Per-variant genotype access
|
|
82
|
+
# -------------------------------------------------------------------
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def test_genotype_for_heterozygous_sample(multi_sample_vcf):
    """chr17:43082575 C>T: tumor is 0/1 (het), normal is 0/0 (alt absent)."""
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    target = next(
        candidate
        for candidate in collection
        if candidate.start == 43082575 and candidate.alt == "T"
    )

    tumor_call = collection.genotype(target, "tumor")
    assert tumor_call.alleles == (0, 1)
    assert tumor_call.is_called
    assert tumor_call.zygosity_for_alt(1) is Zygosity.HETEROZYGOUS
    assert tumor_call.allele_depths == (10, 5)
    assert tumor_call.genotype_quality == 99

    normal_call = collection.genotype(target, "normal")
    assert normal_call.alleles == (0, 0)
    assert normal_call.zygosity_for_alt(1) is Zygosity.ABSENT
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def test_genotype_for_homozygous_sample(multi_sample_vcf):
    """The tumor's 1/1 call at 7:117531114 is homozygous for alt 1."""
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    target = next(
        candidate for candidate in collection if candidate.start == 117531114
    )
    tumor_call = collection.genotype(target, "tumor")
    assert tumor_call.alleles == (1, 1)
    assert tumor_call.zygosity_for_alt(1) is Zygosity.HOMOZYGOUS
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def test_genotype_phased_call_preserves_phase_info(multi_sample_vcf):
    """The tumor's ``0|1`` call with PS=100 keeps its phasing and phase set."""
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    target = next(
        candidate for candidate in collection if candidate.start == 43082576
    )
    tumor_call = collection.genotype(target, "tumor")
    assert tumor_call.phased is True
    assert tumor_call.phase_set == 100
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_genotype_nocall(multi_sample_vcf):
    """The normal sample's ``./.`` call at 17:43082576 is reported as missing."""
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    target = next(
        candidate for candidate in collection if candidate.start == 43082576
    )
    normal_call = collection.genotype(target, "normal")
    assert normal_call.is_missing
    assert normal_call.zygosity_for_alt(1) is Zygosity.MISSING
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_genotype_unknown_sample_raises(multi_sample_vcf):
    """Asking for a sample name that is not in the VCF raises SampleNotFoundError."""
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    first_variant = collection[0]
    with pytest.raises(SampleNotFoundError):
        collection.genotype(first_variant, "nonexistent_sample")
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_sample_not_found_is_key_error_subclass():
    """Callers who only catch KeyError should still work."""
    is_key_error = issubclass(SampleNotFoundError, KeyError)
    assert is_key_error
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def test_genotype_returns_none_for_variant_without_sample_info():
    """``genotype`` gives None for a variant not loaded from a multi-sample VCF."""
    from varcode import Variant, VariantCollection
    direct_variant = Variant("17", 43082575, "C", "T", "GRCh38")
    collection = VariantCollection(variants=[direct_variant])
    result = collection.genotype(direct_variant, "anyone")
    assert result is None
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# -------------------------------------------------------------------
|
|
148
|
+
# Multi-allelic sites
|
|
149
|
+
# -------------------------------------------------------------------
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def test_multiallelic_row_splits_into_separate_variants_with_own_genotype(
        multi_sample_vcf):
    """chr1:100 A>T,G gives two variants whose zygosity is judged per alt.

    Tumor has GT=1/2 (one T, one G) and normal has GT=0/1 (one ref, one T).
    """
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    a_to_t = next(
        candidate for candidate in collection
        if candidate.start == 100 and candidate.alt == "T"
    )
    a_to_g = next(
        candidate for candidate in collection
        if candidate.start == 100 and candidate.alt == "G"
    )

    # Tumor: relative to each alt it holds one copy of that alt and one
    # copy of a different alt, hence heterozygous for both.
    assert collection.zygosity(a_to_t, "tumor") is Zygosity.HETEROZYGOUS
    assert collection.zygosity(a_to_g, "tumor") is Zygosity.HETEROZYGOUS

    # Normal: het relative to T, and G is absent because it is not carried.
    assert collection.zygosity(a_to_t, "normal") is Zygosity.HETEROZYGOUS
    assert collection.zygosity(a_to_g, "normal") is Zygosity.ABSENT
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# -------------------------------------------------------------------
|
|
171
|
+
# Convenience filters
|
|
172
|
+
# -------------------------------------------------------------------
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def test_for_sample_filters_to_variants_carried_by_that_sample(
        multi_sample_vcf):
    """``for_sample`` keeps only the variants whose alt the sample carries."""
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")

    # Tumor carries every one of the five alt-specific variants:
    #   17:43082575 C>T   0/1 -> het, carried
    #   7:117531114 G>T   1/1 -> hom, carried
    #   1:100 A>T         1/2 -> carries T (alt #1)
    #   1:100 A>G         1/2 -> carries G (alt #2)
    #   17:43082576 C>A   0|1 -> het, carried
    carried_by_tumor = collection.for_sample("tumor")
    assert len(carried_by_tumor) == 5

    # Normal carries only two of them:
    #   17:43082575 C>T   0/0 -> absent
    #   7:117531114 G>T   0/1 -> het, carried
    #   1:100 A>T         0/1 -> carries T
    #   1:100 A>G         0/1 -> absent (normal doesn't have G)
    #   17:43082576 C>A   ./. -> missing, not carried
    carried_by_normal = collection.for_sample("normal")
    assert len(carried_by_normal) == 2
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def test_heterozygous_in_excludes_homozygous_calls(multi_sample_vcf):
    """``heterozygous_in`` keeps the tumor's het calls and drops its hom call.

    Tumor is het at 17:43082575 (0/1), at 1:100 (1/2, het for both T and G)
    and at 17:43082576 (0|1), but not at 7:117531114 (1/1 is hom).
    """
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    het_variants = collection.heterozygous_in("tumor")
    start_positions = [variant.start for variant in het_variants]
    start_positions.sort()
    assert start_positions == [100, 100, 43082575, 43082576]
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def test_homozygous_alt_in(multi_sample_vcf):
    """The tumor's only homozygous-alt variant is the one at 7:117531114."""
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    hom_variants = collection.homozygous_alt_in("tumor")
    assert len(hom_variants) == 1
    only_hom = hom_variants[0]
    assert only_hom.start == 117531114
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def test_for_sample_with_unknown_sample_raises(multi_sample_vcf):
    """``for_sample`` fails fast on typos rather than silently returning empty."""
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    with pytest.raises(SampleNotFoundError):
        collection.for_sample("nonexistent")
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def test_filter_chain_composes(multi_sample_vcf):
    """Cross-sample queries fall out of set operations on the primitives.

    "In tumor but not in normal" gives the somatic candidates:
      tumor carries   17:43082575, 7:117531114, 1:100(T), 1:100(G), 17:43082576
      normal carries  7:117531114, 1:100(T)
      difference      17:43082575, 1:100(G), 17:43082576
    """
    collection = load_vcf(multi_sample_vcf, genome="GRCh38")
    in_tumor = set(collection.for_sample("tumor"))
    in_normal = set(collection.for_sample("normal"))
    somatic_candidates = in_tumor.difference(in_normal)
    assert len(somatic_candidates) == 3
    start_positions = [variant.start for variant in somatic_candidates]
    start_positions.sort()
    assert start_positions == [100, 43082575, 43082576]
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
# -------------------------------------------------------------------
|
|
240
|
+
# Package-level exports
|
|
241
|
+
# -------------------------------------------------------------------
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def test_package_level_exports():
    """The genotype names are reachable as attributes of the ``varcode`` package."""
    import varcode
    exported = (
        ("Genotype", Genotype),
        ("Zygosity", Zygosity),
        ("SampleNotFoundError", SampleNotFoundError),
    )
    for attr_name, imported_object in exported:
        assert getattr(varcode, attr_name) is imported_object
|
|
@@ -11,7 +11,8 @@
|
|
|
11
11
|
# See the License for the specific language governing permissions and
|
|
12
12
|
# limitations under the License.
|
|
13
13
|
|
|
14
|
-
from .errors import ReferenceMismatchError
|
|
14
|
+
from .errors import ReferenceMismatchError, SampleNotFoundError
|
|
15
|
+
from .genotype import Genotype, Zygosity
|
|
15
16
|
from .variant import Variant
|
|
16
17
|
from .variant_collection import VariantCollection
|
|
17
18
|
from .maf import load_maf, load_maf_dataframe
|
|
@@ -33,6 +34,10 @@ __all__ = [
|
|
|
33
34
|
"EffectCollection",
|
|
34
35
|
"VariantCollection",
|
|
35
36
|
|
|
37
|
+
# genotype / zygosity
|
|
38
|
+
"Genotype",
|
|
39
|
+
"Zygosity",
|
|
40
|
+
|
|
36
41
|
# effects
|
|
37
42
|
"effect_priority",
|
|
38
43
|
"top_priority_effect",
|
|
@@ -41,6 +46,7 @@ __all__ = [
|
|
|
41
46
|
|
|
42
47
|
# exceptions
|
|
43
48
|
"ReferenceMismatchError",
|
|
49
|
+
"SampleNotFoundError",
|
|
44
50
|
|
|
45
51
|
# file loading
|
|
46
52
|
"load_maf",
|
|
@@ -18,6 +18,11 @@ Exception types raised by varcode. ``ReferenceMismatchError`` subclasses
|
|
|
18
18
|
"""
|
|
19
19
|
|
|
20
20
|
|
|
21
|
+
class SampleNotFoundError(KeyError):
    """Genotype info was requested for an unknown sample.

    Raised when the named sample isn't present in the VariantCollection's
    source VCF(s). Subclasses ``KeyError``, so ``except KeyError`` handlers
    catch it as well.
    """
|
|
24
|
+
|
|
25
|
+
|
|
21
26
|
class ReferenceMismatchError(ValueError):
|
|
22
27
|
"""Raised when a variant's reported ref allele does not match the
|
|
23
28
|
reference genome at the variant's position.
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
2
|
+
# you may not use this file except in compliance with the License.
|
|
3
|
+
# You may obtain a copy of the License at
|
|
4
|
+
#
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
#
|
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
10
|
+
# See the License for the specific language governing permissions and
|
|
11
|
+
# limitations under the License.
|
|
12
|
+
|
|
13
|
+
"""
|
|
14
|
+
Per-sample genotype representation — see openvax/varcode#267.
|
|
15
|
+
|
|
16
|
+
A ``Genotype`` captures one sample's call at one variant locus: the
|
|
17
|
+
alleles observed, whether the call was phased, and any supporting
|
|
18
|
+
FORMAT fields (AD, DP, GQ, PS) that were present in the VCF. Zygosity
|
|
19
|
+
is computed relative to a specific alt allele index, which matters for
|
|
20
|
+
multi-allelic sites where a sample may carry a different alt from the
|
|
21
|
+
one being queried.
|
|
22
|
+
|
|
23
|
+
This module is intentionally free of circular imports so it can be
|
|
24
|
+
used anywhere in varcode without pulling in the rest of the package.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from dataclasses import dataclass
|
|
28
|
+
from enum import Enum
|
|
29
|
+
from typing import Optional, Tuple
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Zygosity(Enum):
    """Classification of a sample's call with respect to one alt allele.

    Every member is relative to a single queried alt. Two of the states
    are easy to confuse and are deliberately kept apart:

    * ``ABSENT``: a call exists, but the queried alt is not part of it
      (the sample is ref-ref, or carries a *different* alt at a
      multi-allelic site).
    * ``MISSING``: there is no usable call at all (``./.``, or the
      sample wasn't called).
    """

    # Call exists but does not contain the queried alt.
    ABSENT = "absent"
    # Some, but not all, called alleles are the queried alt.
    HETEROZYGOUS = "het"
    # Every called allele is the queried alt.
    HOMOZYGOUS = "hom"
    # No called alleles to classify.
    MISSING = "missing"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def parse_gt_string(gt_str):
    """Parse a VCF ``GT`` string into ``(alleles, phased)``.

    Handles:
      * Standard diploid: ``"0/1"``, ``"1/1"``, ``"1/2"``
      * Phased diploid: ``"0|1"``, ``"1|0"``
      * Haploid (chrY, chrM): ``"1"``, ``"0"``
      * Polyploid: ``"0/1/1"`` → alleles = (0, 1, 1)
      * Mixed separators: ``"0/1|2"`` → alleles = (0, 1, 2), phased
      * Missing: ``"./."``, ``"."``, ``""``, ``None``
      * Partial missing: ``"./1"`` → alleles = (None, 1)

    Parameters
    ----------
    gt_str : str or None
        Raw ``GT`` value for one sample.

    Returns
    -------
    tuple
        ``(alleles, phased)`` where ``alleles`` is a tuple of
        ``Optional[int]`` (``None`` for missing) and ``phased`` is
        ``True`` iff the string used the ``|`` delimiter anywhere.

    Raises
    ------
    ValueError
        If any allele field is neither ``"."`` nor an integer.
    """
    if not gt_str or gt_str == ".":
        return ((None,), False)
    phased = "|" in gt_str
    # Normalise to a single separator before splitting. Splitting on only
    # one of "/" or "|" would leave fragments such as "1/2" behind when a
    # call mixes both, and those cannot be converted to an allele index.
    fields = gt_str.replace("|", "/").split("/")
    try:
        alleles = tuple(
            None if field == "." else int(field)
            for field in fields
        )
    except ValueError:
        # Report the whole GT string rather than the bare int() failure
        # so the offending call is easy to find in the VCF.
        raise ValueError("Invalid GT string: %r" % (gt_str,)) from None
    return alleles, phased
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass(frozen=True)
class Genotype:
    """One sample's genotype at one variant locus.

    The ``alleles`` tuple encodes the observed alleles using VCF GT
    semantics: ``0`` is the reference allele, ``1`` is the first ALT
    listed on the VCF row, ``2`` is the second, and so on. ``None``
    indicates a no-call on that haplotype.

    For varcode's variant-level API, note that ``Variant.alt`` is a
    *specific* alt (a multi-allelic VCF row is split into one Variant
    per alt). When querying zygosity relative to a Variant, use the
    variant's ``alt_allele_index`` from the collection's metadata and
    add 1 to get the GT-encoded index, then call
    :meth:`zygosity_for_alt` or :meth:`carries_alt`.
    """

    # GT string as supplied ("./." when built from absent sample info).
    raw_gt: str
    # GT-encoded allele indices; None marks a no-call on that haplotype.
    alleles: Tuple[Optional[int], ...]
    # Whether the call is phased (from_sample_info derives this from "|").
    phased: bool = False
    # Value of the PS FORMAT field, when present.
    phase_set: Optional[int] = None
    # Values of the AD FORMAT field (ref first, then each alt), when present.
    allele_depths: Optional[Tuple[int, ...]] = None
    # Value of the DP FORMAT field, when present.
    total_depth: Optional[int] = None
    # Value of the GQ FORMAT field, when present.
    genotype_quality: Optional[int] = None

    # ---- construction ------------------------------------------------

    @classmethod
    def from_sample_info(cls, sample_info):
        """Build a Genotype from pyvcf's ``call.data._asdict()`` output.

        Handles the keys varcode normally sees: ``GT``, ``AD``, ``DP``,
        ``GQ``, ``PS``. Missing keys default to ``None``.

        Parameters
        ----------
        sample_info : mapping or None
            Per-sample FORMAT values. ``None`` (or a missing ``GT``) is
            treated as an uncalled diploid genotype, ``./.``.

        Returns
        -------
        Genotype
        """
        if sample_info is None:
            return cls(raw_gt="./.", alleles=(None, None), phased=False)
        gt_str = sample_info.get("GT")
        if gt_str is None:
            gt_str = "./."
        alleles, phased = parse_gt_string(gt_str)
        return cls(
            raw_gt=gt_str,
            alleles=alleles,
            phased=phased,
            phase_set=sample_info.get("PS"),
            allele_depths=cls._normalize_depths(sample_info.get("AD")),
            total_depth=sample_info.get("DP"),
            genotype_quality=sample_info.get("GQ"),
        )

    @staticmethod
    def _normalize_depths(ad):
        """Coerce a raw ``AD`` value into a tuple, or ``None`` if absent."""
        if ad is None:
            return None
        try:
            return tuple(ad)
        except TypeError:
            # NOTE(review): a non-iterable AD is assumed to be a bare
            # scalar handed back by the parser for a single-value field;
            # confirm against the VCF loader. Wrapping it avoids a crash.
            return (ad,)

    # ---- general predicates (alt-agnostic) --------------------------

    @property
    def is_called(self) -> bool:
        """True if at least one allele is non-None."""
        return any(a is not None for a in self.alleles)

    @property
    def is_missing(self) -> bool:
        """True if every allele is a no-call (the inverse of ``is_called``)."""
        return not self.is_called

    @property
    def ploidy(self) -> int:
        """Number of alleles in the call (including missing)."""
        return len(self.alleles)

    @property
    def is_haploid(self) -> bool:
        """True if the call consists of exactly one allele."""
        return self.ploidy == 1

    # ---- alt-relative predicates ------------------------------------

    def carries_alt(self, alt_index: int) -> bool:
        """True if this sample's genotype contains the given alt.

        ``alt_index`` uses VCF GT encoding: ``1`` is the first alt on
        the row, ``2`` is the second, etc. (i.e. one more than
        ``alt_allele_index`` from the VariantCollection metadata).
        """
        return self.copies_of_alt(alt_index) > 0

    def copies_of_alt(self, alt_index: int) -> int:
        """Number of haplotypes carrying the given alt."""
        return sum(
            1 for a in self.alleles
            if a is not None and a == alt_index
        )

    def zygosity_for_alt(self, alt_index: int) -> Zygosity:
        """Classify the sample's zygosity relative to one alt allele.

        Multi-allelic aware: ``GT=1/2`` queried for alt ``1`` returns
        ``HETEROZYGOUS`` (one copy of this alt, one of a different
        alt); queried for alt ``3`` it returns ``ABSENT``.

        No-call haplotypes are ignored, so a partially missing call is
        classified from the called alleles only (``./1`` queried for
        alt ``1`` is ``HOMOZYGOUS``).
        """
        called = [a for a in self.alleles if a is not None]
        if not called:
            return Zygosity.MISSING
        n_copies = sum(1 for a in called if a == alt_index)
        if n_copies == 0:
            return Zygosity.ABSENT
        if n_copies == len(called):
            return Zygosity.HOMOZYGOUS
        return Zygosity.HETEROZYGOUS

    # ---- depth helpers ----------------------------------------------

    def depth_for_alt(self, alt_index: int) -> Optional[int]:
        """Per-allele read depth for a given alt, from the ``AD`` field.

        ``AD`` is indexed with ref at position 0 and alt #1 at
        position 1, etc., so ``alt_index`` should use GT encoding
        (``1`` = first alt).

        Returns ``None`` when no ``AD`` values were recorded or when
        ``alt_index`` falls outside them.
        """
        if self.allele_depths is None:
            return None
        # Reject negative indices explicitly: Python would otherwise wrap
        # around and silently report the depth of a different allele.
        if not 0 <= alt_index < len(self.allele_depths):
            return None
        return self.allele_depths[alt_index]
|
|
@@ -24,6 +24,8 @@ from .csv_helpers import (
|
|
|
24
24
|
warn_on_version_drift,
|
|
25
25
|
write_metadata_header,
|
|
26
26
|
)
|
|
27
|
+
from .errors import SampleNotFoundError
|
|
28
|
+
from .genotype import Genotype, Zygosity
|
|
27
29
|
from .variant import Variant, variant_ascending_position_sort_key
|
|
28
30
|
from .version import __version__ as _varcode_version
|
|
29
31
|
|
|
@@ -485,6 +487,142 @@ class VariantCollection(Collection):
|
|
|
485
487
|
sort_key=sort_key,
|
|
486
488
|
)
|
|
487
489
|
|
|
490
|
+
# ------------------------------------------------------------------
|
|
491
|
+
# Genotype / zygosity access (openvax/varcode#267).
|
|
492
|
+
#
|
|
493
|
+
# The VCF loader already captures per-sample FORMAT fields in
|
|
494
|
+
# self.source_to_metadata_dict[path][variant]['sample_info']. These
|
|
495
|
+
# methods surface that data as structured Genotype objects and
|
|
496
|
+
# provide sample-aware filtering helpers.
|
|
497
|
+
# ------------------------------------------------------------------
|
|
498
|
+
|
|
499
|
+
def _metadata_for(self, variant):
    """Look up ``variant`` in each source's metadata map.

    The maps in ``self.source_to_metadata_dict`` are searched in
    iteration order and the entry from the first one containing the
    variant is returned. ``None`` is returned when no source tracks it.
    """
    return next(
        (
            per_source[variant]
            for per_source in self.source_to_metadata_dict.values()
            if variant in per_source
        ),
        None,
    )
|
|
509
|
+
|
|
510
|
+
@property
def samples(self):
    """Sorted list of every sample name found under the ``sample_info``
    key of any variant's metadata, across all sources. The list is empty
    when no metadata entry carries sample info."""
    names = set()
    for per_source in self.source_to_metadata_dict.values():
        for meta in per_source.values():
            if not meta:
                continue
            per_sample = meta.get("sample_info")
            if not per_sample:
                continue
            names.update(per_sample.keys())
    return sorted(names)
|
|
522
|
+
|
|
523
|
+
def has_sample_data(self):
    """Report whether ``self.samples`` lists at least one sample name."""
    sample_names = self.samples
    return len(sample_names) != 0
|
|
526
|
+
|
|
527
|
+
def genotype(self, variant, sample):
    """Look up one sample's ``Genotype`` at ``variant``.

    Parameters
    ----------
    variant : Variant
    sample : str

    Returns
    -------
    Genotype or None
        ``None`` when the variant has no metadata entry, or its
        metadata has no ``sample_info`` (e.g. the variant was
        constructed directly rather than loaded from a VCF with
        sample columns).

    Raises
    ------
    SampleNotFoundError
        When ``sample_info`` exists for the variant but has no entry
        for ``sample``. Subclass of ``KeyError``.
    """
    meta = self._metadata_for(variant)
    per_sample = None if meta is None else meta.get("sample_info")
    if per_sample is None:
        return None
    if sample in per_sample:
        return Genotype.from_sample_info(per_sample[sample])
    available = sorted(per_sample.keys())
    raise SampleNotFoundError(
        "Sample %r not found in %s. Available samples: %s" % (
            sample, variant, available))
|
|
559
|
+
|
|
560
|
+
def _alt_index_for(self, variant):
    """VCF GT-encoded (1-based) index of the variant's alt on its
    original VCF row.

    Computed as the metadata's ``alt_allele_index`` plus one. When the
    variant has no metadata, or the metadata has no
    ``alt_allele_index``, the index defaults to ``1`` (i.e. the variant
    is treated as the first alt on its row).
    """
    meta = self._metadata_for(variant)
    zero_based = meta.get("alt_allele_index") if meta is not None else None
    return 1 if zero_based is None else zero_based + 1
|
|
574
|
+
|
|
575
|
+
def zygosity(self, variant, sample):
    """Classify ``sample``'s call at ``variant`` as a ``Zygosity``.

    The sample's genotype is evaluated against this variant's own alt
    index, so at a multi-allelic site split into several Variants each
    one answers "does this sample carry *this* alt?". When no genotype
    is available for the variant the result is ``Zygosity.MISSING``.
    """
    call = self.genotype(variant, sample)
    if call is not None:
        return call.zygosity_for_alt(self._alt_index_for(variant))
    return Zygosity.MISSING
|
|
585
|
+
|
|
586
|
+
def for_sample(self, sample):
    """Keep only the variants whose alt ``sample`` carries, whether
    heterozygous or homozygous.

    Handy for multi-sample VCFs, where a given row need not be called
    in every sample.
    """
    def carries_alt(zygosity):
        return zygosity in (Zygosity.HETEROZYGOUS, Zygosity.HOMOZYGOUS)

    return self._filter_by_zygosity(sample, keep=carries_alt)
|
|
596
|
+
|
|
597
|
+
def heterozygous_in(self, sample):
    """Keep only the variants for whose alt ``sample`` is heterozygous."""
    def is_heterozygous(zygosity):
        return zygosity is Zygosity.HETEROZYGOUS

    return self._filter_by_zygosity(sample, keep=is_heterozygous)
|
|
603
|
+
|
|
604
|
+
def homozygous_alt_in(self, sample):
    """Keep only the variants for whose alt ``sample`` is homozygous."""
    def is_homozygous(zygosity):
        return zygosity is Zygosity.HOMOZYGOUS

    return self._filter_by_zygosity(sample, keep=is_homozygous)
|
|
610
|
+
|
|
611
|
+
def _filter_by_zygosity(self, sample, keep):
    """Filter the collection by ``sample``'s zygosity at each variant.

    Parameters
    ----------
    sample : str
        Sample name to evaluate.
    keep : callable
        Predicate taking a ``Zygosity``; variants for which it returns
        a true value are passed through ``self.filter``.

    Raises
    ------
    SampleNotFoundError
        If the collection has sample data but ``sample`` is not among
        the known sample names.
    """
    # Pre-validate the sample once so a typo fails loudly rather
    # than silently returning an empty collection.
    #
    # ``self.samples`` walks every metadata entry, so read it a single
    # time and reuse it for the emptiness check (which is all that
    # ``has_sample_data()`` does), the membership test and the message.
    available = self.samples
    if available and sample not in available:
        raise SampleNotFoundError(
            "Sample %r not found. Available samples: %s" % (
                sample, available))
    return self.filter(
        lambda v: keep(self.zygosity(v, sample))
    )
|
|
621
|
+
|
|
622
|
+
# ------------------------------------------------------------------
|
|
623
|
+
# (end genotype methods)
|
|
624
|
+
# ------------------------------------------------------------------
|
|
625
|
+
|
|
488
626
|
def clone_without_ucsc_data(self):
    """Return a new collection whose elements are the result of calling
    ``clone_without_ucsc_data()`` on each variant in this one.

    NOTE(review): what "UCSC data" covers is decided by the per-variant
    method, which is defined outside this view.
    """
    return self.clone_with_new_elements(
        [variant.clone_without_ucsc_data() for variant in self]
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "2.3.0"
|
|
@@ -23,6 +23,8 @@ tests/test_effect_collection_sort_order.py
|
|
|
23
23
|
tests/test_effects_from_mutagenix_variants.py
|
|
24
24
|
tests/test_exonic_splice_site.py
|
|
25
25
|
tests/test_frameshift_helpers.py
|
|
26
|
+
tests/test_genotype.py
|
|
27
|
+
tests/test_genotype_from_vcf.py
|
|
26
28
|
tests/test_maf.py
|
|
27
29
|
tests/test_mm10_klf6_frameshift.py
|
|
28
30
|
tests/test_mouse.py
|
|
@@ -47,6 +49,7 @@ varcode/__init__.py
|
|
|
47
49
|
varcode/common.py
|
|
48
50
|
varcode/csv_helpers.py
|
|
49
51
|
varcode/errors.py
|
|
52
|
+
varcode/genotype.py
|
|
50
53
|
varcode/maf.py
|
|
51
54
|
varcode/nucleotides.py
|
|
52
55
|
varcode/reference.py
|
varcode-2.2.1/varcode/version.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "2.2.1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|