varcode 2.2.0__tar.gz → 2.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. {varcode-2.2.0/varcode.egg-info → varcode-2.3.0}/PKG-INFO +1 -1
  2. varcode-2.3.0/tests/test_genotype.py +266 -0
  3. varcode-2.3.0/tests/test_genotype_from_vcf.py +248 -0
  4. varcode-2.3.0/tests/test_reference_mismatch_error.py +103 -0
  5. {varcode-2.2.0 → varcode-2.3.0}/varcode/__init__.py +12 -2
  6. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/effect_prediction.py +10 -11
  7. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/effect_prediction_coding.py +9 -7
  8. varcode-2.3.0/varcode/errors.py +74 -0
  9. varcode-2.3.0/varcode/genotype.py +200 -0
  10. {varcode-2.2.0 → varcode-2.3.0}/varcode/variant_collection.py +138 -0
  11. varcode-2.3.0/varcode/version.py +1 -0
  12. {varcode-2.2.0 → varcode-2.3.0/varcode.egg-info}/PKG-INFO +1 -1
  13. {varcode-2.2.0 → varcode-2.3.0}/varcode.egg-info/SOURCES.txt +5 -0
  14. varcode-2.2.0/varcode/version.py +0 -1
  15. {varcode-2.2.0 → varcode-2.3.0}/LICENSE +0 -0
  16. {varcode-2.2.0 → varcode-2.3.0}/MANIFEST.in +0 -0
  17. {varcode-2.2.0 → varcode-2.3.0}/README.md +0 -0
  18. {varcode-2.2.0 → varcode-2.3.0}/pyproject.toml +0 -0
  19. {varcode-2.2.0 → varcode-2.3.0}/requirements.txt +0 -0
  20. {varcode-2.2.0 → varcode-2.3.0}/setup.cfg +0 -0
  21. {varcode-2.2.0 → varcode-2.3.0}/tests/__init__.py +0 -0
  22. {varcode-2.2.0 → varcode-2.3.0}/tests/benchmark_vcf_load.py +0 -0
  23. {varcode-2.2.0 → varcode-2.3.0}/tests/common.py +0 -0
  24. {varcode-2.2.0 → varcode-2.3.0}/tests/data.py +0 -0
  25. {varcode-2.2.0 → varcode-2.3.0}/tests/test_cli_effects.py +0 -0
  26. {varcode-2.2.0 → varcode-2.3.0}/tests/test_cli_genes.py +0 -0
  27. {varcode-2.2.0 → varcode-2.3.0}/tests/test_collection_filtering.py +0 -0
  28. {varcode-2.2.0 → varcode-2.3.0}/tests/test_collection_variants_attr_consistency.py +0 -0
  29. {varcode-2.2.0 → varcode-2.3.0}/tests/test_common.py +0 -0
  30. {varcode-2.2.0 → varcode-2.3.0}/tests/test_cosmic_mutations.py +0 -0
  31. {varcode-2.2.0 → varcode-2.3.0}/tests/test_csv_roundtrip.py +0 -0
  32. {varcode-2.2.0 → varcode-2.3.0}/tests/test_dbnsfp_validation.py +0 -0
  33. {varcode-2.2.0 → varcode-2.3.0}/tests/test_effect_annotation_errors.py +0 -0
  34. {varcode-2.2.0 → varcode-2.3.0}/tests/test_effect_classes.py +0 -0
  35. {varcode-2.2.0 → varcode-2.3.0}/tests/test_effect_collection.py +0 -0
  36. {varcode-2.2.0 → varcode-2.3.0}/tests/test_effect_collection_serialization.py +0 -0
  37. {varcode-2.2.0 → varcode-2.3.0}/tests/test_effect_collection_sort_order.py +0 -0
  38. {varcode-2.2.0 → varcode-2.3.0}/tests/test_effects_from_mutagenix_variants.py +0 -0
  39. {varcode-2.2.0 → varcode-2.3.0}/tests/test_exonic_splice_site.py +0 -0
  40. {varcode-2.2.0 → varcode-2.3.0}/tests/test_frameshift_helpers.py +0 -0
  41. {varcode-2.2.0 → varcode-2.3.0}/tests/test_maf.py +0 -0
  42. {varcode-2.2.0 → varcode-2.3.0}/tests/test_mm10_klf6_frameshift.py +0 -0
  43. {varcode-2.2.0 → varcode-2.3.0}/tests/test_mouse.py +0 -0
  44. {varcode-2.2.0 → varcode-2.3.0}/tests/test_mutate.py +0 -0
  45. {varcode-2.2.0 → varcode-2.3.0}/tests/test_no_duplicate_variants.py +0 -0
  46. {varcode-2.2.0 → varcode-2.3.0}/tests/test_premature_stop_short_description.py +0 -0
  47. {varcode-2.2.0 → varcode-2.3.0}/tests/test_problematic_variants.py +0 -0
  48. {varcode-2.2.0 → varcode-2.3.0}/tests/test_reference.py +0 -0
  49. {varcode-2.2.0 → varcode-2.3.0}/tests/test_silent_aa_pos.py +0 -0
  50. {varcode-2.2.0 → varcode-2.3.0}/tests/test_silent_hgvs_description.py +0 -0
  51. {varcode-2.2.0 → varcode-2.3.0}/tests/test_splice_site_effects.py +0 -0
  52. {varcode-2.2.0 → varcode-2.3.0}/tests/test_stop_codon_classification_bugs.py +0 -0
  53. {varcode-2.2.0 → varcode-2.3.0}/tests/test_string_helpers.py +0 -0
  54. {varcode-2.2.0 → varcode-2.3.0}/tests/test_symbolic_alleles.py +0 -0
  55. {varcode-2.2.0 → varcode-2.3.0}/tests/test_timings.py +0 -0
  56. {varcode-2.2.0 → varcode-2.3.0}/tests/test_variant.py +0 -0
  57. {varcode-2.2.0 → varcode-2.3.0}/tests/test_variant_collection.py +0 -0
  58. {varcode-2.2.0 → varcode-2.3.0}/tests/test_vcf.py +0 -0
  59. {varcode-2.2.0 → varcode-2.3.0}/tests/test_vcf_output.py +0 -0
  60. {varcode-2.2.0 → varcode-2.3.0}/varcode/cli/__init__.py +0 -0
  61. {varcode-2.2.0 → varcode-2.3.0}/varcode/cli/effects_script.py +0 -0
  62. {varcode-2.2.0 → varcode-2.3.0}/varcode/cli/genes_script.py +0 -0
  63. {varcode-2.2.0 → varcode-2.3.0}/varcode/cli/logging.conf +0 -0
  64. {varcode-2.2.0 → varcode-2.3.0}/varcode/cli/variant_args.py +0 -0
  65. {varcode-2.2.0 → varcode-2.3.0}/varcode/cli/version_info.py +0 -0
  66. {varcode-2.2.0 → varcode-2.3.0}/varcode/common.py +0 -0
  67. {varcode-2.2.0 → varcode-2.3.0}/varcode/csv_helpers.py +0 -0
  68. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/__init__.py +0 -0
  69. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/common.py +0 -0
  70. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/effect_classes.py +0 -0
  71. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/effect_collection.py +0 -0
  72. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/effect_helpers.py +0 -0
  73. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/effect_ordering.py +0 -0
  74. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/effect_prediction_coding_frameshift.py +0 -0
  75. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/effect_prediction_coding_in_frame.py +0 -0
  76. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/mutate.py +0 -0
  77. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/transcript_helpers.py +0 -0
  78. {varcode-2.2.0 → varcode-2.3.0}/varcode/effects/translate.py +0 -0
  79. {varcode-2.2.0 → varcode-2.3.0}/varcode/maf.py +0 -0
  80. {varcode-2.2.0 → varcode-2.3.0}/varcode/nucleotides.py +0 -0
  81. {varcode-2.2.0 → varcode-2.3.0}/varcode/reference.py +0 -0
  82. {varcode-2.2.0 → varcode-2.3.0}/varcode/string_helpers.py +0 -0
  83. {varcode-2.2.0 → varcode-2.3.0}/varcode/ucsc_reference_names.py +0 -0
  84. {varcode-2.2.0 → varcode-2.3.0}/varcode/util.py +0 -0
  85. {varcode-2.2.0 → varcode-2.3.0}/varcode/variant.py +0 -0
  86. {varcode-2.2.0 → varcode-2.3.0}/varcode/vcf.py +0 -0
  87. {varcode-2.2.0 → varcode-2.3.0}/varcode/vcf_output.py +0 -0
  88. {varcode-2.2.0 → varcode-2.3.0}/varcode.egg-info/dependency_links.txt +0 -0
  89. {varcode-2.2.0 → varcode-2.3.0}/varcode.egg-info/entry_points.txt +0 -0
  90. {varcode-2.2.0 → varcode-2.3.0}/varcode.egg-info/requires.txt +0 -0
  91. {varcode-2.2.0 → varcode-2.3.0}/varcode.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: varcode
3
- Version: 2.2.0
3
+ Version: 2.3.0
4
4
  Summary: Variant annotation in Python
5
5
  Author-email: Alex Rubinsteyn <alex.rubinsteyn@unc.edu>
6
6
  Project-URL: Homepage, https://github.com/openvax/varcode
@@ -0,0 +1,266 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """
14
+ Unit tests for the Genotype dataclass and GT parsing (openvax/varcode#267).
15
+
16
+ Tests that touch a real VCF end-to-end live in
17
+ ``tests/test_genotype_from_vcf.py``.
18
+ """
19
+
20
+ import pytest
21
+
22
+ from varcode import Genotype, Zygosity
23
+ from varcode.genotype import parse_gt_string
24
+
25
+
26
+ # ------------------------------------------------------------------
27
+ # parse_gt_string: one low-level function, many cases
28
+ # ------------------------------------------------------------------
29
+
30
+
31
+ def test_parse_gt_diploid_unphased():
32
+ assert parse_gt_string("0/1") == ((0, 1), False)
33
+ assert parse_gt_string("1/0") == ((1, 0), False)
34
+ assert parse_gt_string("1/1") == ((1, 1), False)
35
+ assert parse_gt_string("0/0") == ((0, 0), False)
36
+
37
+
38
+ def test_parse_gt_diploid_phased():
39
+ assert parse_gt_string("0|1") == ((0, 1), True)
40
+ assert parse_gt_string("1|0") == ((1, 0), True)
41
+
42
+
43
+ def test_parse_gt_multiallelic():
44
+ # Multi-allelic: 1/2 means one copy of alt #1, one copy of alt #2.
45
+ assert parse_gt_string("1/2") == ((1, 2), False)
46
+ assert parse_gt_string("2|1") == ((2, 1), True)
47
+
48
+
49
+ def test_parse_gt_haploid():
50
+ # Chromosomes X/Y in males, mitochondrial, etc. — single allele.
51
+ assert parse_gt_string("1") == ((1,), False)
52
+ assert parse_gt_string("0") == ((0,), False)
53
+
54
+
55
+ def test_parse_gt_missing():
56
+ assert parse_gt_string("./.") == ((None, None), False)
57
+ assert parse_gt_string(".") == ((None,), False)
58
+ assert parse_gt_string("") == ((None,), False)
59
+ assert parse_gt_string(None) == ((None,), False)
60
+
61
+
62
+ def test_parse_gt_partial_missing():
63
+ # Half-called genotypes — one haplotype missing.
64
+ assert parse_gt_string("./1") == ((None, 1), False)
65
+ assert parse_gt_string("0/.") == ((0, None), False)
66
+ assert parse_gt_string(".|1") == ((None, 1), True)
67
+
68
+
69
+ def test_parse_gt_polyploid():
70
+ # Triploid (e.g. trisomy) or higher — tuple simply grows.
71
+ alleles, phased = parse_gt_string("0/1/1")
72
+ assert alleles == (0, 1, 1)
73
+ assert phased is False
74
+
75
+
76
+ # ------------------------------------------------------------------
77
+ # Genotype construction from pyvcf-style sample info dicts
78
+ # ------------------------------------------------------------------
79
+
80
+
81
+ def test_genotype_from_sample_info_full():
82
+ gt = Genotype.from_sample_info({
83
+ "GT": "0/1",
84
+ "AD": [10, 5],
85
+ "DP": 15,
86
+ "GQ": 99,
87
+ })
88
+ assert gt.raw_gt == "0/1"
89
+ assert gt.alleles == (0, 1)
90
+ assert gt.phased is False
91
+ assert gt.allele_depths == (10, 5)
92
+ assert gt.total_depth == 15
93
+ assert gt.genotype_quality == 99
94
+ assert gt.phase_set is None
95
+
96
+
97
+ def test_genotype_from_sample_info_phased_with_ps():
98
+ gt = Genotype.from_sample_info({
99
+ "GT": "0|1",
100
+ "PS": 100,
101
+ "AD": [8, 7],
102
+ "DP": 15,
103
+ "GQ": 99,
104
+ })
105
+ assert gt.phased is True
106
+ assert gt.phase_set == 100
107
+
108
+
109
+ def test_genotype_from_sample_info_nocall_handles_none_values():
110
+ # Pyvcf returns None for all fields when the call is ./.
111
+ gt = Genotype.from_sample_info({
112
+ "GT": "./.",
113
+ "AD": None,
114
+ "DP": None,
115
+ "GQ": None,
116
+ })
117
+ assert gt.alleles == (None, None)
118
+ assert gt.allele_depths is None
119
+ assert gt.is_missing
120
+ assert not gt.is_called
121
+
122
+
123
+ def test_genotype_from_sample_info_none_input():
124
+ # Entirely missing sample_info dict (e.g. variant not constructed
125
+ # from a VCF).
126
+ gt = Genotype.from_sample_info(None)
127
+ assert gt.is_missing
128
+ assert gt.alleles == (None, None)
129
+
130
+
131
+ # ------------------------------------------------------------------
132
+ # General predicates (alt-agnostic)
133
+ # ------------------------------------------------------------------
134
+
135
+
136
+ def test_is_called_and_is_missing():
137
+ assert Genotype.from_sample_info({"GT": "0/1"}).is_called
138
+ assert not Genotype.from_sample_info({"GT": "./."}).is_called
139
+ assert Genotype.from_sample_info({"GT": "./."}).is_missing
140
+ # Partial-missing is still "called" because one allele is known.
141
+ assert Genotype.from_sample_info({"GT": "./1"}).is_called
142
+
143
+
144
+ def test_ploidy():
145
+ assert Genotype.from_sample_info({"GT": "0/1"}).ploidy == 2
146
+ assert Genotype.from_sample_info({"GT": "1"}).ploidy == 1
147
+ assert Genotype.from_sample_info({"GT": "0/1/1"}).ploidy == 3
148
+
149
+
150
+ def test_is_haploid():
151
+ assert Genotype.from_sample_info({"GT": "1"}).is_haploid
152
+ assert not Genotype.from_sample_info({"GT": "0/1"}).is_haploid
153
+
154
+
155
+ # ------------------------------------------------------------------
156
+ # Alt-relative zygosity (the business end of the API)
157
+ # ------------------------------------------------------------------
158
+
159
+
160
+ def test_zygosity_heterozygous_simple():
161
+ gt = Genotype.from_sample_info({"GT": "0/1"})
162
+ assert gt.zygosity_for_alt(1) is Zygosity.HETEROZYGOUS
163
+ assert gt.carries_alt(1)
164
+ assert gt.copies_of_alt(1) == 1
165
+
166
+
167
+ def test_zygosity_homozygous_alt_simple():
168
+ gt = Genotype.from_sample_info({"GT": "1/1"})
169
+ assert gt.zygosity_for_alt(1) is Zygosity.HOMOZYGOUS
170
+ assert gt.carries_alt(1)
171
+ assert gt.copies_of_alt(1) == 2
172
+
173
+
174
+ def test_zygosity_homozygous_ref_is_absent_for_any_alt():
175
+ gt = Genotype.from_sample_info({"GT": "0/0"})
176
+ # Relative to alt 1: sample doesn't have this alt.
177
+ assert gt.zygosity_for_alt(1) is Zygosity.ABSENT
178
+ assert not gt.carries_alt(1)
179
+ assert gt.copies_of_alt(1) == 0
180
+
181
+
182
+ def test_zygosity_missing():
183
+ gt = Genotype.from_sample_info({"GT": "./."})
184
+ assert gt.zygosity_for_alt(1) is Zygosity.MISSING
185
+ assert not gt.carries_alt(1)
186
+ assert gt.copies_of_alt(1) == 0
187
+
188
+
189
+ def test_zygosity_multiallelic_querying_different_alts():
190
+ # GT = 1/2 means one copy of alt #1, one copy of alt #2.
191
+ # Querying alt 1: het (one copy of this alt, one of a different alt).
192
+ # Querying alt 2: also het.
193
+ # Querying alt 3 (not carried): absent.
194
+ gt = Genotype.from_sample_info({"GT": "1/2"})
195
+ assert gt.zygosity_for_alt(1) is Zygosity.HETEROZYGOUS
196
+ assert gt.zygosity_for_alt(2) is Zygosity.HETEROZYGOUS
197
+ assert gt.zygosity_for_alt(3) is Zygosity.ABSENT
198
+
199
+
200
+ def test_zygosity_multiallelic_homozygous_for_second_alt():
201
+ # GT = 2/2 means both copies are alt #2.
202
+ # Alt 2: hom. Alt 1: absent (sample doesn't have alt 1).
203
+ gt = Genotype.from_sample_info({"GT": "2/2"})
204
+ assert gt.zygosity_for_alt(2) is Zygosity.HOMOZYGOUS
205
+ assert gt.zygosity_for_alt(1) is Zygosity.ABSENT
206
+
207
+
208
+ def test_zygosity_haploid_single_alt():
209
+ # chrY in a male with an alt call: GT = 1.
210
+ gt = Genotype.from_sample_info({"GT": "1"})
211
+ # Single-allele calls with that allele equal to alt: all copies
212
+ # are alt → classify as HOMOZYGOUS.
213
+ assert gt.zygosity_for_alt(1) is Zygosity.HOMOZYGOUS
214
+ assert gt.carries_alt(1)
215
+
216
+
217
+ def test_zygosity_partial_call():
218
+ # GT = ./1: one allele missing, the other is alt #1.
219
+ # Out of the called alleles (just one), all are alt #1 → HOMOZYGOUS.
220
+ # This is the defensible read: we count called alleles only.
221
+ gt = Genotype.from_sample_info({"GT": "./1"})
222
+ assert gt.zygosity_for_alt(1) is Zygosity.HOMOZYGOUS
223
+
224
+
225
+ # ------------------------------------------------------------------
226
+ # Per-allele depth lookup
227
+ # ------------------------------------------------------------------
228
+
229
+
230
+ def test_depth_for_alt():
231
+ gt = Genotype.from_sample_info({"GT": "0/1", "AD": [10, 5]})
232
+ assert gt.depth_for_alt(0) == 10 # ref depth
233
+ assert gt.depth_for_alt(1) == 5 # first alt depth
234
+ assert gt.depth_for_alt(2) is None # out of range
235
+
236
+
237
+ def test_depth_for_alt_returns_none_when_ad_missing():
238
+ gt = Genotype.from_sample_info({"GT": "0/1"})
239
+ assert gt.depth_for_alt(1) is None
240
+
241
+
242
+ def test_depth_for_alt_multiallelic():
243
+ # AD has one entry per allele (ref + each alt).
244
+ gt = Genotype.from_sample_info({"GT": "1/2", "AD": [3, 7, 5]})
245
+ assert gt.depth_for_alt(0) == 3
246
+ assert gt.depth_for_alt(1) == 7
247
+ assert gt.depth_for_alt(2) == 5
248
+
249
+
250
+ # ------------------------------------------------------------------
251
+ # Dataclass-y ergonomics: hashable, equatable, frozen
252
+ # ------------------------------------------------------------------
253
+
254
+
255
+ def test_genotype_is_frozen():
256
+ gt = Genotype.from_sample_info({"GT": "0/1"})
257
+ with pytest.raises((AttributeError, Exception)):
258
+ gt.alleles = (1, 1) # type: ignore
259
+
260
+
261
+ def test_genotype_equality():
262
+ a = Genotype.from_sample_info({"GT": "0/1", "AD": [10, 5], "DP": 15})
263
+ b = Genotype.from_sample_info({"GT": "0/1", "AD": [10, 5], "DP": 15})
264
+ assert a == b
265
+ c = Genotype.from_sample_info({"GT": "1/1", "AD": [0, 15], "DP": 15})
266
+ assert a != c
@@ -0,0 +1,248 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """
14
+ End-to-end tests for VariantCollection genotype/zygosity access —
15
+ exercises the full path from VCF → Variant → Genotype (openvax/varcode#267).
16
+ """
17
+
18
+ import os
19
+ import tempfile
20
+
21
+ import pytest
22
+
23
+ from varcode import (
24
+ Genotype,
25
+ SampleNotFoundError,
26
+ Zygosity,
27
+ load_vcf,
28
+ )
29
+
30
+
31
+ VCF_BODY = """##fileformat=VCFv4.1
32
+ ##reference=GRCh38
33
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
34
+ ##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allele depths">
35
+ ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
36
+ ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality">
37
+ ##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
38
+ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT tumor normal
39
+ 17 43082575 . C T 100 PASS . GT:AD:DP:GQ 0/1:10,5:15:99 0/0:20,0:20:99
40
+ 7 117531114 . G T 100 PASS . GT:AD:DP:GQ 1/1:0,20:20:99 0/1:10,10:20:99
41
+ 1 100 . A T,G 100 PASS . GT:AD:DP:GQ 1/2:5,5,5:15:99 0/1:10,5,0:15:99
42
+ 17 43082576 . C A 100 PASS . GT:AD:DP:GQ:PS 0|1:8,7:15:99:100 ./.:.:.:.:.
43
+ """
44
+
45
+
46
+ @pytest.fixture
47
+ def multi_sample_vcf():
48
+ fd, path = tempfile.mkstemp(suffix=".vcf")
49
+ with os.fdopen(fd, "w") as f:
50
+ f.write(VCF_BODY)
51
+ yield path
52
+ os.unlink(path)
53
+
54
+
55
+ # -------------------------------------------------------------------
56
+ # Sample discovery
57
+ # -------------------------------------------------------------------
58
+
59
+
60
+ def test_samples_property_lists_names(multi_sample_vcf):
61
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
62
+ assert vc.samples == ["normal", "tumor"]
63
+
64
+
65
+ def test_has_sample_data_true_for_vcf_with_samples(multi_sample_vcf):
66
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
67
+ assert vc.has_sample_data() is True
68
+
69
+
70
+ def test_has_sample_data_false_for_directly_constructed(multi_sample_vcf):
71
+ # Directly-constructed variants have no sample_info metadata.
72
+ from varcode import Variant, VariantCollection
73
+ vc = VariantCollection(variants=[
74
+ Variant("17", 43082575, "C", "T", "GRCh38"),
75
+ ])
76
+ assert vc.has_sample_data() is False
77
+ assert vc.samples == []
78
+
79
+
80
+ # -------------------------------------------------------------------
81
+ # Per-variant genotype access
82
+ # -------------------------------------------------------------------
83
+
84
+
85
+ def test_genotype_for_heterozygous_sample(multi_sample_vcf):
86
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
87
+ # chr17:43082575 C>T — tumor 0/1, normal 0/0.
88
+ variant = next(
89
+ v for v in vc if v.start == 43082575 and v.alt == "T"
90
+ )
91
+ tumor_gt = vc.genotype(variant, "tumor")
92
+ assert tumor_gt.alleles == (0, 1)
93
+ assert tumor_gt.is_called
94
+ assert tumor_gt.zygosity_for_alt(1) is Zygosity.HETEROZYGOUS
95
+ assert tumor_gt.allele_depths == (10, 5)
96
+ assert tumor_gt.genotype_quality == 99
97
+
98
+ normal_gt = vc.genotype(variant, "normal")
99
+ assert normal_gt.alleles == (0, 0)
100
+ assert normal_gt.zygosity_for_alt(1) is Zygosity.ABSENT
101
+
102
+
103
+ def test_genotype_for_homozygous_sample(multi_sample_vcf):
104
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
105
+ variant = next(v for v in vc if v.start == 117531114)
106
+ tumor_gt = vc.genotype(variant, "tumor")
107
+ assert tumor_gt.alleles == (1, 1)
108
+ assert tumor_gt.zygosity_for_alt(1) is Zygosity.HOMOZYGOUS
109
+
110
+
111
+ def test_genotype_phased_call_preserves_phase_info(multi_sample_vcf):
112
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
113
+ variant = next(v for v in vc if v.start == 43082576)
114
+ tumor_gt = vc.genotype(variant, "tumor")
115
+ assert tumor_gt.phased is True
116
+ assert tumor_gt.phase_set == 100
117
+
118
+
119
+ def test_genotype_nocall(multi_sample_vcf):
120
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
121
+ variant = next(v for v in vc if v.start == 43082576)
122
+ normal_gt = vc.genotype(variant, "normal")
123
+ assert normal_gt.is_missing
124
+ assert normal_gt.zygosity_for_alt(1) is Zygosity.MISSING
125
+
126
+
127
+ def test_genotype_unknown_sample_raises(multi_sample_vcf):
128
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
129
+ variant = vc[0]
130
+ with pytest.raises(SampleNotFoundError):
131
+ vc.genotype(variant, "nonexistent_sample")
132
+
133
+
134
+ def test_sample_not_found_is_key_error_subclass():
135
+ # Callers who only catch KeyError should still work.
136
+ assert issubclass(SampleNotFoundError, KeyError)
137
+
138
+
139
+ def test_genotype_returns_none_for_variant_without_sample_info():
140
+ # A variant that wasn't loaded from a multi-sample VCF.
141
+ from varcode import Variant, VariantCollection
142
+ v = Variant("17", 43082575, "C", "T", "GRCh38")
143
+ vc = VariantCollection(variants=[v])
144
+ assert vc.genotype(v, "anyone") is None
145
+
146
+
147
+ # -------------------------------------------------------------------
148
+ # Multi-allelic sites
149
+ # -------------------------------------------------------------------
150
+
151
+
152
+ def test_multiallelic_row_splits_into_separate_variants_with_own_genotype(
153
+ multi_sample_vcf):
154
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
155
+ # chr1:100 A>T,G — tumor has GT=1/2 (one T, one G).
156
+ at = next(v for v in vc if v.start == 100 and v.alt == "T")
157
+ ag = next(v for v in vc if v.start == 100 and v.alt == "G")
158
+
159
+ # Relative to each alt, the tumor is heterozygous (carries one
160
+ # copy of this alt and one copy of a different alt).
161
+ assert vc.zygosity(at, "tumor") is Zygosity.HETEROZYGOUS
162
+ assert vc.zygosity(ag, "tumor") is Zygosity.HETEROZYGOUS
163
+
164
+ # Normal has GT=0/1 (one ref, one T). Relative to T: het.
165
+ # Relative to G: absent (normal doesn't have the G alt).
166
+ assert vc.zygosity(at, "normal") is Zygosity.HETEROZYGOUS
167
+ assert vc.zygosity(ag, "normal") is Zygosity.ABSENT
168
+
169
+
170
+ # -------------------------------------------------------------------
171
+ # Convenience filters
172
+ # -------------------------------------------------------------------
173
+
174
+
175
+ def test_for_sample_filters_to_variants_carried_by_that_sample(
176
+ multi_sample_vcf):
177
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
178
+
179
+ tumor_variants = vc.for_sample("tumor")
180
+ # Tumor carries all 5 alts produced by the 4 VCF rows (1:100 A>T,G splits in two).
181
+ # Check concretely:
182
+ # - 17:43082575 C>T: tumor 0/1 -> het, carried
183
+ # - 7:117531114 G>T: tumor 1/1 -> hom, carried
184
+ # - 1:100 A>T: tumor 1/2 -> carries T (alt #1)
185
+ # - 1:100 A>G: tumor 1/2 -> carries G (alt #2)
186
+ # - 17:43082576 C>A: tumor 0|1 -> het, carried
187
+ assert len(tumor_variants) == 5
188
+
189
+ normal_variants = vc.for_sample("normal")
190
+ # - 17:43082575 C>T: normal 0/0 -> absent
191
+ # - 7:117531114 G>T: normal 0/1 -> het, carried
192
+ # - 1:100 A>T: normal 0/1 -> carries T
193
+ # - 1:100 A>G: normal 0/1 -> absent (normal doesn't have G)
194
+ # - 17:43082576 C>A: normal ./. -> missing, not carried
195
+ assert len(normal_variants) == 2
196
+
197
+
198
+ def test_heterozygous_in_excludes_homozygous_calls(multi_sample_vcf):
199
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
200
+ het_in_tumor = vc.heterozygous_in("tumor")
201
+ starts = sorted(v.start for v in het_in_tumor)
202
+ # Tumor is het at:
203
+ # - 17:43082575 (0/1)
204
+ # - 1:100 (1/2 — het for both T and G)
205
+ # - 17:43082576 (0|1)
206
+ # NOT at 7:117531114 (1/1 is hom, not het)
207
+ assert starts == [100, 100, 43082575, 43082576]
208
+
209
+
210
+ def test_homozygous_alt_in(multi_sample_vcf):
211
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
212
+ hom_in_tumor = vc.homozygous_alt_in("tumor")
213
+ assert len(hom_in_tumor) == 1
214
+ assert hom_in_tumor[0].start == 117531114
215
+
216
+
217
+ def test_for_sample_with_unknown_sample_raises(multi_sample_vcf):
218
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
219
+ # Fail fast on typos rather than silently returning empty.
220
+ with pytest.raises(SampleNotFoundError):
221
+ vc.for_sample("nonexistent")
222
+
223
+
224
+ def test_filter_chain_composes(multi_sample_vcf):
225
+ # Cross-sample queries fall out of set operations on the primitives.
226
+ vc = load_vcf(multi_sample_vcf, genome="GRCh38")
227
+ # "In tumor but not in normal" — somatic candidates.
228
+ tumor_set = set(vc.for_sample("tumor"))
229
+ normal_set = set(vc.for_sample("normal"))
230
+ somatic = tumor_set - normal_set
231
+ # Tumor carries: 17:43082575, 7:117531114, 1:100(T), 1:100(G), 17:43082576.
232
+ # Normal carries: 7:117531114, 1:100(T).
233
+ # Somatic = 17:43082575, 1:100(G), 17:43082576.
234
+ assert len(somatic) == 3
235
+ starts = sorted(v.start for v in somatic)
236
+ assert starts == [100, 43082575, 43082576]
237
+
238
+
239
+ # -------------------------------------------------------------------
240
+ # Package-level exports
241
+ # -------------------------------------------------------------------
242
+
243
+
244
+ def test_package_level_exports():
245
+ import varcode
246
+ assert varcode.Genotype is Genotype
247
+ assert varcode.Zygosity is Zygosity
248
+ assert varcode.SampleNotFoundError is SampleNotFoundError
@@ -0,0 +1,103 @@
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """
14
+ Regression tests for https://github.com/openvax/varcode/issues/215
15
+ (and the duplicate symptom in #246).
16
+
17
+ When a variant's ref allele doesn't match the reference genome at the
18
+ variant's position — typically because the variant was called against
19
+ a different build, the ref field was populated with the patient's
20
+ germline allele, or there's strand confusion — varcode should raise a
21
+ dedicated ``ReferenceMismatchError`` with an actionable message, not
22
+ a generic ``ValueError`` or ``AssertionError``.
23
+ """
24
+
25
+ import pytest
26
+
27
+ import varcode
28
+ from varcode import Variant, ReferenceMismatchError
29
+
30
+
31
+ def _construct_mismatching_variant():
32
+ """Construct a variant whose ref doesn't match the GRCh38 genome.
33
+
34
+ Uses CFTR exon 4 (chr7:117530899-117531114 on GRCh38, + strand)
35
+ where the real genome has specific bases. We claim ref='Z'... well,
36
+ varcode rejects unknown nucleotides, so instead we use a valid
37
+ base that doesn't match the genome at that position.
38
+ """
39
+ # chr7:117531114 on GRCh38 is G (last base of CFTR exon 4). Claim
40
+ # the variant has ref='T' (which is wrong). This will fail the
41
+ # transcript-vs-variant ref check.
42
+ return Variant("7", 117531114, "T", "A", "GRCh38")
43
+
44
+
45
+ def test_ref_mismatch_raises_reference_mismatch_error():
46
+ variant = _construct_mismatching_variant()
47
+ with pytest.raises(ReferenceMismatchError):
48
+ variant.effects()
49
+
50
+
51
+ def test_reference_mismatch_error_is_value_error_subclass():
52
+ # Keep the existing contract: callers that catch ValueError still
53
+ # see this. (predict_variant_effect_on_transcript_or_failure relies
54
+ # on this for the Failure-effect fallback path.)
55
+ assert issubclass(ReferenceMismatchError, ValueError)
56
+
57
+
58
+ def test_reference_mismatch_error_message_is_actionable():
59
+ variant = _construct_mismatching_variant()
60
+ try:
61
+ variant.effects()
62
+ except ReferenceMismatchError as e:
63
+ msg = str(e)
64
+ # Names the variant so the user can find it.
65
+ assert "117531114" in msg
66
+ # Shows both the expected (genome) and observed (variant) bases.
67
+ assert "'T'" in msg # variant's claimed ref
68
+ assert "'G'" in msg # actual genome base
69
+ # Suggests the most common causes.
70
+ assert "genome build" in msg or "germline" in msg or "strand" in msg
71
+ # Points at the escape hatch.
72
+ assert "raise_on_error=False" in msg
73
+ else:
74
+ raise AssertionError("Expected ReferenceMismatchError")
75
+
76
+
77
+ def test_reference_mismatch_error_carries_structured_fields():
78
+ variant = _construct_mismatching_variant()
79
+ try:
80
+ variant.effects()
81
+ except ReferenceMismatchError as e:
82
+ assert e.variant == variant
83
+ assert e.transcript is not None
84
+ assert e.expected_ref == "G"
85
+ assert e.observed_ref == "T"
86
+ else:
87
+ raise AssertionError("Expected ReferenceMismatchError")
88
+
89
+
90
+ def test_ref_mismatch_with_raise_on_error_false_returns_failure():
91
+ # When the user opts into error suppression, the mismatch should
92
+ # collapse into a Failure effect (the existing contract).
93
+ from varcode.effects import Failure
94
+ variant = _construct_mismatching_variant()
95
+ effects = variant.effects(raise_on_error=False)
96
+ assert any(isinstance(e, Failure) for e in effects), \
97
+ "Expected at least one Failure effect when raise_on_error=False"
98
+
99
+
100
+ def test_reference_mismatch_error_exposed_at_package_level():
101
+ # Users should be able to catch varcode.ReferenceMismatchError
102
+ # without importing from a submodule.
103
+ assert varcode.ReferenceMismatchError is ReferenceMismatchError
@@ -11,6 +11,8 @@
11
11
  # See the License for the specific language governing permissions and
12
12
  # limitations under the License.
13
13
 
14
+ from .errors import ReferenceMismatchError, SampleNotFoundError
15
+ from .genotype import Genotype, Zygosity
14
16
  from .variant import Variant
15
17
  from .variant_collection import VariantCollection
16
18
  from .maf import load_maf, load_maf_dataframe
@@ -22,22 +24,30 @@ from .effects import (
22
24
  MutationEffect,
23
25
  NonsilentCodingMutation,
24
26
  )
25
- from .version import __version__
27
+ from .version import __version__
26
28
 
27
29
  __all__ = [
28
- "__version__",
30
+ "__version__",
29
31
 
30
32
  # basic classes
31
33
  "Variant",
32
34
  "EffectCollection",
33
35
  "VariantCollection",
34
36
 
37
+ # genotype / zygosity
38
+ "Genotype",
39
+ "Zygosity",
40
+
35
41
  # effects
36
42
  "effect_priority",
37
43
  "top_priority_effect",
38
44
  "MutationEffect",
39
45
  "NonsilentCodingMutation",
40
46
 
47
+ # exceptions
48
+ "ReferenceMismatchError",
49
+ "SampleNotFoundError",
50
+
41
51
  # file loading
42
52
  "load_maf",
43
53
  "load_maf_dataframe",