tskit 1.0.1__cp314-cp314-macosx_10_15_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _tskit.cpython-314-darwin.so +0 -0
- tskit/__init__.py +92 -0
- tskit/__main__.py +4 -0
- tskit/_version.py +4 -0
- tskit/cli.py +273 -0
- tskit/combinatorics.py +1522 -0
- tskit/drawing.py +2809 -0
- tskit/exceptions.py +70 -0
- tskit/genotypes.py +410 -0
- tskit/intervals.py +601 -0
- tskit/jit/__init__.py +0 -0
- tskit/jit/numba.py +674 -0
- tskit/metadata.py +1147 -0
- tskit/provenance.py +150 -0
- tskit/provenance.schema.json +72 -0
- tskit/stats.py +165 -0
- tskit/tables.py +4858 -0
- tskit/text_formats.py +456 -0
- tskit/trees.py +11457 -0
- tskit/util.py +901 -0
- tskit/vcf.py +219 -0
- tskit-1.0.1.dist-info/METADATA +105 -0
- tskit-1.0.1.dist-info/RECORD +27 -0
- tskit-1.0.1.dist-info/WHEEL +5 -0
- tskit-1.0.1.dist-info/entry_points.txt +2 -0
- tskit-1.0.1.dist-info/licenses/LICENSE +21 -0
- tskit-1.0.1.dist-info/top_level.txt +2 -0
tskit/exceptions.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# MIT License
|
|
2
|
+
#
|
|
3
|
+
# Copyright (c) 2018-2021 Tskit Developers
|
|
4
|
+
# Copyright (c) 2017 University of Oxford
|
|
5
|
+
#
|
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
# furnished to do so, subject to the following conditions:
|
|
12
|
+
#
|
|
13
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
# copies or substantial portions of the Software.
|
|
15
|
+
#
|
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
# SOFTWARE.
|
|
23
|
+
"""
|
|
24
|
+
Exceptions defined in tskit.
|
|
25
|
+
"""
|
|
26
|
+
from _tskit import FileFormatError # noqa: F401
|
|
27
|
+
from _tskit import IdentityPairsNotStoredError # noqa: F401
|
|
28
|
+
from _tskit import IdentitySegmentsNotStoredError # noqa: F401
|
|
29
|
+
from _tskit import LibraryError # noqa: F401
|
|
30
|
+
from _tskit import TskitException # noqa: F401
|
|
31
|
+
from _tskit import VersionTooNewError # noqa: F401
|
|
32
|
+
from _tskit import VersionTooOldError # noqa: F401
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class DuplicatePositionsError(TskitException):
    """
    Duplicate positions in the list of sites.

    Derives from :class:`TskitException`, so handlers for that base class
    also catch this error.
    """
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class ProvenanceValidationError(TskitException):
    """
    A JSON document did not validate against the provenance schema.

    Derives from :class:`TskitException`, so handlers for that base class
    also catch this error.
    """
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class MetadataValidationError(TskitException):
    """
    A metadata object did not validate against its metadata schema.

    Derives from :class:`TskitException`, so handlers for that base class
    also catch this error.
    """
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class MetadataSchemaValidationError(TskitException):
    """
    A metadata schema object did not validate against the metaschema.

    This concerns the schema itself being malformed, as opposed to
    :class:`MetadataValidationError`, which concerns a metadata object.
    """
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class MetadataEncodingError(TskitException):
    """
    A metadata object was of a type that could not be encoded.

    Derives from :class:`TskitException`, so handlers for that base class
    also catch this error.
    """
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class ImmutableTableError(ValueError):
    """
    Raised when attempting to modify an immutable table view.

    Use TreeSequence.dump_tables() to get a mutable copy.

    Note that this class derives from the built-in :class:`ValueError`
    rather than from ``TskitException``, so an ``except TskitException``
    handler will not catch it; catch ``ValueError`` or this class instead.
    """
|
tskit/genotypes.py
ADDED
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
#
|
|
2
|
+
# MIT License
|
|
3
|
+
#
|
|
4
|
+
# Copyright (c) 2018-2024 Tskit Developers
|
|
5
|
+
# Copyright (c) 2015-2018 University of Oxford
|
|
6
|
+
#
|
|
7
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
9
|
+
# in the Software without restriction, including without limitation the rights
|
|
10
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
12
|
+
# furnished to do so, subject to the following conditions:
|
|
13
|
+
#
|
|
14
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
15
|
+
# copies or substantial portions of the Software.
|
|
16
|
+
#
|
|
17
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23
|
+
# SOFTWARE.
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import collections
|
|
27
|
+
import logging
|
|
28
|
+
import typing
|
|
29
|
+
|
|
30
|
+
import numpy as np
|
|
31
|
+
|
|
32
|
+
import _tskit
|
|
33
|
+
import tskit
|
|
34
|
+
import tskit.trees as trees
|
|
35
|
+
import tskit.util as util
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Variant:
    """
    A variant in a tree sequence, describing the observed genetic variation
    among the specified nodes (by default, the sample nodes) for a given site.
    A variant consists of (a) a tuple of **alleles** listing the potential
    allelic states which the requested nodes at this site can possess; (b) an
    array of **genotypes** mapping node IDs to the observed alleles; (c) a
    reference to the :class:`Site` at which the Variant has been decoded; and
    (d) an array of **samples** giving the node ID to which each element of the
    genotypes array corresponds.

    After creation a Variant is not yet decoded, and has no genotypes.
    To decode a Variant, call the :meth:`decode` method. The Variant class will then
    use a Tree, internal to the Variant, to seek to the position of the site and
    decode the genotypes at that site. It is therefore much more efficient to visit
    sites in sequential genomic order, either in a forwards or backwards direction,
    than to do so randomly.

    Each element in the ``alleles`` tuple is a string, representing an
    observed allelic state that may be seen at this site. The ``alleles`` tuple,
    which is guaranteed not to contain any duplicates, is generated in one of two
    ways. The first (and default) way is for ``tskit`` to generate the encoding on
    the fly while generating genotypes. In this case, the first element of this
    tuple is guaranteed to be the same as the site's ``ancestral_state`` value.
    Note that allelic values may be listed that are not referred to by any
    samples. For example, if we have a site that is fixed for the derived state
    (i.e., we have a mutation over the tree root), all genotypes will be 1, but
    the alleles list will be equal to ``('0', '1')``. Other than the
    ancestral state being the first allele, the alleles are listed in
    no particular order, and the ordering should not be relied upon
    (but see the notes on missing data below).

    The second way is for the user to define the mapping between
    genotype values and allelic state strings using the
    ``alleles`` parameter to the :meth:`TreeSequence.variants` method.
    In this case, there is no indication of which allele is the ancestral state,
    as the ordering is determined by the user.

    The ``genotypes`` represent the observed allelic states for each requested
    node, such that ``var.alleles[var.genotypes[j]]`` gives the string allele
    for the node at index ``j`` (i.e., for ``variant.samples[j]``). Thus, the
    elements of the genotypes array are
    indexes into the ``alleles`` list. The genotypes are provided in this
    way via a numpy numeric array to enable efficient calculations. To obtain a
    (less efficient) array of allele strings for each node, you can use e.g.
    ``np.asarray(variant.alleles)[variant.genotypes]``.

    When :ref:`missing data<sec_data_model_missing_data>` is present at a given
    site, the property ``has_missing_data`` will be True, at least one element
    of the ``genotypes`` array will be equal to ``tskit.MISSING_DATA``, and the
    last element of the ``alleles`` array will be ``None``. Note that in this
    case ``variant.num_alleles`` will **not** be equal to
    ``len(variant.alleles)``. The rationale for adding ``None`` to the end of
    the ``alleles`` list is to help code that does not handle missing data
    correctly fail early rather than introducing subtle and hard-to-find bugs.
    As ``tskit.MISSING_DATA`` is equal to -1, code that decodes genotypes into
    allelic values without taking missing data into account would otherwise
    incorrectly output the last allele in the list.

    :param TreeSequence tree_sequence: The tree sequence to which this variant
        belongs.
    :param array_like samples: An array of node IDs for which to generate
        genotypes, or ``None`` for all sample nodes. Non-sample nodes may also
        be provided to generate genotypes for internal nodes. Default: ``None``.
    :param bool isolated_as_missing: If True, the genotype value assigned to
        isolated nodes without mutations (samples or non-samples) is
        :data:`.MISSING_DATA` (-1). If False, such nodes will be
        assigned the allele index for the ancestral state.
        Default: True.
    :param tuple alleles: A tuple of strings defining the encoding of
        alleles as integer genotype values. At least one allele must be provided.
        If duplicate alleles are provided, output genotypes will always be
        encoded as the first occurrence of the allele. If None (the default),
        the alleles are encoded as they are encountered during genotype
        generation.

    """

    def __init__(
        self, tree_sequence, samples=None, isolated_as_missing=None, alleles=None
    ):
        # ``None`` is the "use the default" marker, so that callers can
        # forward an unset option without knowing the default value.
        if isolated_as_missing is None:
            isolated_as_missing = True
        self.tree_sequence = tree_sequence
        if samples is not None:
            samples = util.safe_np_int_cast(samples, np.int32)
        self._ll_variant = _tskit.Variant(
            tree_sequence._ll_tree_sequence,
            samples=samples,
            isolated_as_missing=isolated_as_missing,
            alleles=alleles,
        )

    def _check_decoded(self):
        """
        Raise a ``ValueError`` if this variant has not been decoded at a site.
        """
        if self._ll_variant.site_id == tskit.NULL:
            raise ValueError(
                "This variant has not yet been decoded at a specific site, "
                "call Variant.decode to set the site."
            )

    @property
    def site(self) -> trees.Site:
        """
        The Site object for the site at which this variant has been decoded.
        """
        self._check_decoded()
        return self.tree_sequence.site(self._ll_variant.site_id)

    @property
    def alleles(self) -> tuple[str | None, ...]:
        """
        A tuple of the allelic values which nodes can possess at the current
        site. Unless an encoding of alleles is specified when creating this
        variant instance, the first element of this tuple is always the site's
        ancestral state.
        """
        return self._ll_variant.alleles

    @property
    def samples(self) -> np.ndarray:
        """
        A numpy array of the node ids whose genotypes will be returned
        by the :attr:`genotypes` property.
        """
        return self._ll_variant.samples

    @property
    def genotypes(self) -> np.ndarray:
        """
        An array of indexes into the list ``alleles``, giving the
        state of each requested node at the current site.
        """
        self._check_decoded()
        return self._ll_variant.genotypes

    @property
    def isolated_as_missing(self) -> bool:
        """
        True if isolated nodes are decoded to missing data. If False, isolated
        nodes are decoded to the ancestral state.
        """
        return self._ll_variant.isolated_as_missing

    @property
    def has_missing_data(self) -> bool:
        """
        True if there is missing data for any of the
        requested nodes at the current site.
        """
        # Missing data is signalled by a trailing ``None`` in the alleles
        # tuple; the length guard keeps this safe for an empty tuple.
        alleles = self._ll_variant.alleles
        return len(alleles) > 0 and alleles[-1] is None

    @property
    def num_missing(self) -> int:
        """
        The number of requested nodes with missing data at this site.
        """
        # Compare against MISSING_DATA (the genotype marker documented on this
        # class and used by ``counts``) and return a plain ``int`` as annotated.
        return int(np.sum(self.genotypes == tskit.MISSING_DATA))

    @property
    def num_alleles(self) -> int:
        """
        The number of distinct alleles at this site. Note that this may
        not be the same as the number of distinct values in the genotypes
        array: firstly missing data is not counted as an allele, and secondly,
        the site may contain mutations to alternative allele states (which are
        counted in the number of alleles) without the mutation being inherited
        by any of the requested nodes.
        """
        # ``has_missing_data`` is a bool, so this drops the trailing ``None``
        # entry from the count when it is present.
        return len(self.alleles) - self.has_missing_data

    # Deprecated alias to avoid breaking existing code; equivalent to
    # ``variant.site.position``.
    @property
    def position(self) -> float:
        return self.site.position

    # Deprecated alias to avoid breaking existing code. Returns the low-level
    # site ID directly and, unlike ``site``, does not check that the variant
    # has been decoded.
    @property
    def index(self) -> int:
        return self._ll_variant.site_id

    # We need a custom eq for the numpy array. Because ``__eq__`` is defined
    # without ``__hash__``, instances are unhashable.
    def __eq__(self, other) -> bool:
        # Two variants that have not been decoded (site_id == NULL) never
        # compare equal, not even a variant with itself.
        return (
            isinstance(other, Variant)
            and self.tree_sequence == other.tree_sequence
            and self._ll_variant.site_id == other._ll_variant.site_id
            and self._ll_variant.site_id != tskit.NULL
            and self._ll_variant.alleles == other._ll_variant.alleles
            and np.array_equal(self._ll_variant.genotypes, other._ll_variant.genotypes)
        )

    def decode(self, site_id) -> None:
        """
        Decode the variant at the given site, setting the site ID, genotypes and
        alleles to those of the site and samples of this Variant.

        :param int site_id: The ID of the site to decode. This must be a valid site ID.
        """
        self._ll_variant.decode(site_id)

    def copy(self) -> Variant:
        """
        Create a copy of this Variant. Note that calling :meth:`decode` on the
        copy will fail as it does not take a copy of the internal tree.

        :return: The copy of this Variant.
        """
        # Bypass __init__ so that no new low-level variant is constructed;
        # the copy shares the tree sequence and gets a restricted low-level copy.
        variant_copy = Variant.__new__(Variant)
        variant_copy.tree_sequence = self.tree_sequence
        variant_copy._ll_variant = self._ll_variant.restricted_copy()
        return variant_copy

    def states(self, missing_data_string=None) -> np.ndarray:
        """
        Returns the allelic states at this site as an array of strings.

        .. warning::
            Using this method is inefficient compared to working with the
            underlying integer representation of genotypes as returned by
            the :attr:`~Variant.genotypes` property.

        :param str missing_data_string: A string that will be used to represent missing
            data. If any normal allele is equal to this string, an error is raised.
            Default: ``None``, treated as ``'N'``.
        :return: A numpy array of strings with one element per entry of
            :attr:`~Variant.genotypes` (i.e. per requested node).
        :raises ValueError: If ``missing_data_string`` is not a string, if the
            variant has not been decoded, or if the site has missing data and
            an existing allele equals ``missing_data_string``.
        """
        if missing_data_string is None:
            missing_data_string = "N"
        elif not isinstance(missing_data_string, str):
            # Must explicitly test here, otherwise we output a numpy object array
            raise ValueError("Missing data string is not a string")
        # Check the decode state up front so that an undecoded variant gives
        # the descriptive ValueError rather than failing on ``alleles[-1]``.
        self._check_decoded()
        alleles = self.alleles
        if alleles and alleles[-1] is None:
            if missing_data_string in alleles:
                raise ValueError(
                    "An existing allele is equal to the "
                    f"missing data string '{missing_data_string}'"
                )
            # Missing genotypes are -1, which indexes the last element, so
            # swapping the trailing None for the string decodes them directly.
            alleles = alleles[:-1] + (missing_data_string,)
        return np.array(alleles)[self.genotypes]

    def counts(self) -> typing.Counter[str | None]:
        """
        Returns a :class:`python:collections.Counter` object providing counts for each
        possible :attr:`allele <Variant.alleles>` at this site: i.e. the number of
        samples possessing that allele among the set of samples specified when creating
        this Variant (by default, this is all the sample nodes in the tree sequence).
        Missing data is represented by an allelic state of ``None``.

        If the same allele string occurs at several indexes of ``alleles``
        (possible with a user-specified encoding), the counts for those indexes
        are summed under the single allele key.

        :return: A counter of the number of samples associated with each allele.
        :raises ValueError: If the variant has not been decoded.
        """
        # Check the decode state up front so that an undecoded variant gives
        # the descriptive ValueError rather than failing on ``alleles[-1]``.
        self._check_decoded()
        # Read each property once; every access goes through the low-level object.
        alleles = self.alleles
        genotypes = self.genotypes
        counts = collections.Counter()
        if alleles and alleles[-1] is None:
            # we have to treat the last element of the alleles tuple as special:
            # it is the None marker, whose genotype value is MISSING_DATA
            counts[None] = np.sum(genotypes == tskit.MISSING_DATA)
            for i, allele in enumerate(alleles[:-1]):
                # Accumulate rather than assign, so that a duplicated allele
                # does not have its count overwritten by a later index.
                counts[allele] += np.sum(genotypes == i)
        else:
            # No missing data, so every genotype is a non-negative index.
            bincounts = np.bincount(genotypes, minlength=len(alleles))
            for i, allele in enumerate(alleles):
                counts[allele] += bincounts[i]
        return counts

    def frequencies(self, remove_missing=None) -> dict[str | None, float]:
        """
        Return a dictionary mapping each possible :attr:`allele <Variant.alleles>`
        at this site to the frequency of that allele: i.e. the number of samples
        with that allele divided by the total number of samples, among the set of
        samples specified when creating this Variant (by default, this is all the
        sample nodes in the tree sequence). Note, therefore, that if a restricted set
        of samples was specified on creation, the allele frequencies returned here
        will *not* be the global allele frequencies in the whole tree sequence.

        :param bool remove_missing: If True, only samples with non-missing data will
            be counted in the total number of samples used to calculate the frequency,
            and no information on the frequency of missing data is returned. Otherwise
            (default), samples with missing data are included when calculating
            frequencies.
        :return: A dictionary mapping allelic states to the frequency of each allele
            among the samples. When missing data is present and ``remove_missing``
            is not set, the key ``None`` maps to the frequency of missing data.
            If the total number of samples counted is zero, a warning is logged
            and every frequency is ``nan``.
        """
        if remove_missing is None:
            remove_missing = False
        total = len(self.samples)
        if remove_missing:
            total -= self.num_missing
        if total == 0:
            logging.warning(
                "No non-missing samples at this site, frequencies undefined"
            )
        return {
            allele: count / total if total > 0 else np.nan
            for allele, count in self.counts().items()
            if not (allele is None and remove_missing)
        }

    def __str__(self) -> str:
        """
        Return a plain text summary of the contents of a variant.
        """
        try:
            site_id = util.format_number(self.site.id, sep=",")
            site_position = util.format_number(self.site.position, sep=",")
            counts = self.counts()
            freqs = self.frequencies()
            samples = util.format_number(len(self.samples), sep=",")
            num_alleles = util.format_number(self.num_alleles, sep=",")
            rows = (
                [
                    ["Site id", f"{site_id}"],
                    ["Site position", f"{site_position}"],
                    ["Number of samples", f"{samples}"],
                    ["Number of alleles", f"{num_alleles}"],
                ]
                + [
                    [
                        f"Samples with allele "
                        f"""{'missing' if k is None else "'" + k + "'"}""",
                        f"{util.format_number(counts[k], sep=',')} "
                        f"({util.format_number(freqs[k] * 100, 2, sep=',')}%)",
                    ]
                    for k in self.alleles
                ]
                + [
                    ["Has missing data", str(self.has_missing_data)],
                    ["Isolated as missing", str(bool(self.isolated_as_missing))],
                ]
            )
        except ValueError as err:
            # An undecoded variant raises ValueError from ``self.site``; show
            # that message as the table content instead of propagating it.
            rows = [[str(err), ""]]
        return util.unicode_table(rows, title="Variant")

    def _repr_html_(self) -> str:
        """
        Return an html summary of a variant. Called by Jupyter notebooks
        to render a Variant.
        """
        return util.variant_html(self)

    def __repr__(self):
        # NOTE: ``self.site`` and ``self.genotypes`` check the decode state,
        # so repr() of a variant that has not been decoded raises ValueError.
        d = {
            "site": self.site,
            "samples": self.samples,
            "alleles": self.alleles,
            "genotypes": self.genotypes,
            "has_missing_data": self.has_missing_data,
            "isolated_as_missing": self.isolated_as_missing,
        }
        return f"Variant({repr(d)})"
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
#
|
|
391
|
+
# Miscellaneous auxiliary functions.
|
|
392
|
+
#
|
|
393
|
+
def allele_remap(alleles_from, alleles_to):
    """
    Return an index map from the elements of one list (``alleles_from``)
    to the elements of another list (``alleles_to``).

    Element ``i`` of the result is the index in ``alleles_to`` of the first
    entry equal to ``alleles_from[i]``. Entries of ``alleles_from`` that do
    not occur in ``alleles_to`` are given indices beyond the end of
    ``alleles_to``: ``len(alleles_to)`` for the first such entry,
    ``len(alleles_to) + 1`` for the next, and so on (one new index per
    unmatched entry, even when unmatched entries repeat).

    Both inputs are converted to numpy unicode arrays before comparison and
    the result is a ``uint32`` array of the same shape as the converted
    ``alleles_from``.
    """
    targets = np.array(alleles_to, dtype="U")
    sources = np.array(alleles_from, dtype="U")
    index_map = np.empty_like(sources, dtype="uint32")
    next_unmatched = len(targets)
    for position, allele in enumerate(sources):
        matches = np.nonzero(targets == allele)[0]
        if matches.size > 0:
            # Use the index of the first matching element.
            index_map[position] = matches[0]
        else:
            index_map[position] = next_unmatched
            next_unmatched += 1
    return index_map
|