tskit 1.0.1__cp314-cp314-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tskit/exceptions.py ADDED
@@ -0,0 +1,70 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2018-2021 Tskit Developers
4
+ # Copyright (c) 2017 University of Oxford
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in all
14
+ # copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ # SOFTWARE.
23
+ """
24
+ Exceptions defined in tskit.
25
+ """
26
+ from _tskit import FileFormatError # noqa: F401
27
+ from _tskit import IdentityPairsNotStoredError # noqa: F401
28
+ from _tskit import IdentitySegmentsNotStoredError # noqa: F401
29
+ from _tskit import LibraryError # noqa: F401
30
+ from _tskit import TskitException # noqa: F401
31
+ from _tskit import VersionTooNewError # noqa: F401
32
+ from _tskit import VersionTooOldError # noqa: F401
33
+
34
+
35
class DuplicatePositionsError(TskitException):
    """
    Error indicating that the list of sites contains duplicate positions.
    """
39
+
40
+
41
class ProvenanceValidationError(TskitException):
    """
    Error indicating that a JSON document failed validation against the
    provenance schema.
    """
45
+
46
+
47
class MetadataValidationError(TskitException):
    """
    A metadata object did not validate against the metadata schema.
    """
51
+
52
+
53
class MetadataSchemaValidationError(TskitException):
    """
    Error indicating that a metadata schema object failed validation
    against the metaschema.
    """
57
+
58
+
59
class MetadataEncodingError(TskitException):
    """
    Error indicating that a metadata object has a type which could not
    be encoded.
    """
63
+
64
+
65
class ImmutableTableError(ValueError):
    """
    Error signalling an attempt to modify an immutable table view.

    To obtain tables that can be modified, call
    ``TreeSequence.dump_tables()``, which gives a mutable copy.
    """
tskit/genotypes.py ADDED
@@ -0,0 +1,410 @@
1
+ #
2
+ # MIT License
3
+ #
4
+ # Copyright (c) 2018-2024 Tskit Developers
5
+ # Copyright (c) 2015-2018 University of Oxford
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ from __future__ import annotations
25
+
26
+ import collections
27
+ import logging
28
+ import typing
29
+
30
+ import numpy as np
31
+
32
+ import _tskit
33
+ import tskit
34
+ import tskit.trees as trees
35
+ import tskit.util as util
36
+
37
+
38
class Variant:
    """
    A variant in a tree sequence, describing the observed genetic variation
    among the specified nodes (by default, the sample nodes) for a given site.
    A variant consists of (a) a tuple of **alleles** listing the potential
    allelic states which the requested nodes at this site can possess; (b) an
    array of **genotypes** mapping node IDs to the observed alleles; (c) a
    reference to the :class:`Site` at which the Variant has been decoded; and
    (d) an array of **samples** giving the node ID to which each element of the
    genotypes array corresponds.

    After creation a Variant is not yet decoded, and has no genotypes.
    To decode a Variant, call the :meth:`decode` method. The Variant class will then
    use a Tree, internal to the Variant, to seek to the position of the site and
    decode the genotypes at that site. It is therefore much more efficient to visit
    sites in sequential genomic order, either in a forwards or backwards direction,
    than to do so randomly.

    Each element in the ``alleles`` tuple is a string, representing an
    observed allelic state that may be seen at this site. The ``alleles`` tuple,
    which is guaranteed not to contain any duplicates, is generated in one of two
    ways. The first (and default) way is for ``tskit`` to generate the encoding on
    the fly while generating genotypes. In this case, the first element of this
    tuple is guaranteed to be the same as the site's ``ancestral_state`` value.
    Note that allelic values may be listed that are not referred to by any
    samples. For example, if we have a site that is fixed for the derived state
    (i.e., we have a mutation over the tree root), all genotypes will be 1, but
    the alleles list will be equal to ``('0', '1')``. Other than the
    ancestral state being the first allele, the alleles are listed in
    no particular order, and the ordering should not be relied upon
    (but see the notes on missing data below).

    The second way is for the user to define the mapping between
    genotype values and allelic state strings using the
    ``alleles`` parameter to the :meth:`TreeSequence.variants` method.
    In this case, there is no indication of which allele is the ancestral state,
    as the ordering is determined by the user.

    The ``genotypes`` represent the observed allelic states for each requested
    node, such that ``var.alleles[var.genotypes[j]]`` gives the string allele
    for the node at index ``j`` (i.e., for ``variant.samples[j]``). Thus, the
    elements of the genotypes array are
    indexes into the ``alleles`` list. The genotypes are provided in this
    way via a numpy numeric array to enable efficient calculations. To obtain a
    (less efficient) array of allele strings for each node, you can use e.g.
    ``np.asarray(variant.alleles)[variant.genotypes]``.

    When :ref:`missing data<sec_data_model_missing_data>` is present at a given
    site, the property ``has_missing_data`` will be True, at least one element
    of the ``genotypes`` array will be equal to ``tskit.MISSING_DATA``, and the
    last element of the ``alleles`` array will be ``None``. Note that in this
    case ``variant.num_alleles`` will **not** be equal to
    ``len(variant.alleles)``. The rationale for adding ``None`` to the end of
    the ``alleles`` list is to help code that does not handle missing data
    correctly fail early rather than introducing subtle and hard-to-find bugs.
    As ``tskit.MISSING_DATA`` is equal to -1, code that decodes genotypes into
    allelic values without taking missing data into account would otherwise
    incorrectly output the last allele in the list.

    :param TreeSequence tree_sequence: The tree sequence to which this variant
        belongs.
    :param array_like samples: An array of node IDs for which to generate
        genotypes, or ``None`` for all sample nodes. Non-sample nodes may also
        be provided to generate genotypes for internal nodes. Default: ``None``.
    :param bool isolated_as_missing: If True, the genotype value assigned to
        isolated nodes without mutations (samples or non-samples) is
        :data:`.MISSING_DATA` (-1). If False, such nodes will be
        assigned the allele index for the ancestral state.
        Default: True.
    :param tuple alleles: A tuple of strings defining the encoding of
        alleles as integer genotype values. At least one allele must be provided.
        If duplicate alleles are provided, output genotypes will always be
        encoded as the first occurrence of the allele. If None (the default),
        the alleles are encoded as they are encountered during genotype
        generation.

    """

    def __init__(
        self, tree_sequence, samples=None, isolated_as_missing=None, alleles=None
    ):
        # None is the "use the default" sentinel; the default is True.
        if isolated_as_missing is None:
            isolated_as_missing = True
        self.tree_sequence = tree_sequence
        if samples is not None:
            # The low-level Variant is handed node IDs as an int32 array.
            samples = util.safe_np_int_cast(samples, np.int32)
        self._ll_variant = _tskit.Variant(
            tree_sequence._ll_tree_sequence,
            samples=samples,
            isolated_as_missing=isolated_as_missing,
            alleles=alleles,
        )

    def _check_decoded(self):
        """
        Raise a ValueError if this variant has not been decoded at a site,
        i.e. if the low-level site ID is still ``tskit.NULL``.
        """
        if self._ll_variant.site_id == tskit.NULL:
            raise ValueError(
                "This variant has not yet been decoded at a specific site, "
                "call Variant.decode to set the site."
            )

    @property
    def site(self) -> trees.Site:
        """
        The Site object for the site at which this variant has been decoded.
        """
        self._check_decoded()
        return self.tree_sequence.site(self._ll_variant.site_id)

    @property
    def alleles(self) -> tuple[str | None, ...]:
        """
        A tuple of the allelic values which nodes can possess at the current
        site. Unless an encoding of alleles is specified when creating this
        variant instance, the first element of this tuple is always the site's
        ancestral state.
        """
        return self._ll_variant.alleles

    @property
    def samples(self) -> np.ndarray:
        """
        A numpy array of the node ids whose genotypes will be returned
        by the :attr:`genotypes` property.
        """
        return self._ll_variant.samples

    @property
    def genotypes(self) -> np.ndarray:
        """
        An array of indexes into the list ``alleles``, giving the
        state of each requested node at the current site.
        """
        self._check_decoded()
        return self._ll_variant.genotypes

    @property
    def isolated_as_missing(self) -> bool:
        """
        True if isolated nodes are decoded to missing data. If False, isolated
        nodes are decoded to the ancestral state.
        """
        return self._ll_variant.isolated_as_missing

    @property
    def has_missing_data(self) -> bool:
        """
        True if there is missing data for any of the
        requested nodes at the current site.
        """
        alleles = self._ll_variant.alleles
        # Missing data is flagged by a trailing None in the alleles tuple.
        # The length check keeps this safe when the tuple is empty.
        return len(alleles) > 0 and alleles[-1] is None

    @property
    def num_missing(self) -> int:
        """
        The number of requested nodes with missing data at this site.
        """
        # Missing genotypes are encoded as tskit.MISSING_DATA, the same
        # constant that counts() tests against.
        return np.sum(self.genotypes == tskit.MISSING_DATA)

    @property
    def num_alleles(self) -> int:
        """
        The number of distinct alleles at this site. Note that this may
        not be the same as the number of distinct values in the genotypes
        array: firstly missing data is not counted as an allele, and secondly,
        the site may contain mutations to alternative allele states (which are
        counted in the number of alleles) without the mutation being inherited
        by any of the requested nodes.
        """
        # has_missing_data is a bool, so this subtracts 1 for the trailing
        # None placeholder when missing data is present.
        return len(self.alleles) - self.has_missing_data

    # Deprecated alias to avoid breaking existing code.
    # Equivalent to ``self.site.position``.
    @property
    def position(self) -> float:
        return self.site.position

    # Deprecated alias to avoid breaking existing code.
    # Returns the low-level site ID directly, without a decoded check.
    @property
    def index(self) -> int:
        return self._ll_variant.site_id

    # We need a custom eq for the numpy array
    def __eq__(self, other) -> bool:
        # Two variants are equal only if both are decoded at the same site of
        # equal tree sequences with identical alleles and genotypes. A variant
        # that has not been decoded (site_id == NULL) never compares equal.
        return (
            isinstance(other, Variant)
            and self.tree_sequence == other.tree_sequence
            and self._ll_variant.site_id == other._ll_variant.site_id
            and self._ll_variant.site_id != tskit.NULL
            and self._ll_variant.alleles == other._ll_variant.alleles
            and np.array_equal(self._ll_variant.genotypes, other._ll_variant.genotypes)
        )

    def decode(self, site_id) -> None:
        """
        Decode the variant at the given site, setting the site ID, genotypes and
        alleles to those of the site and samples of this Variant.

        :param int site_id: The ID of the site to decode. This must be a valid site ID.
        """
        self._ll_variant.decode(site_id)

    def copy(self) -> Variant:
        """
        Create a copy of this Variant. Note that calling :meth:`decode` on the
        copy will fail as it does not take a copy of the internal tree.

        :return: The copy of this Variant.
        """
        # Bypass __init__ so that no new low-level Variant is constructed;
        # the copy wraps a restricted copy of the existing one instead.
        variant_copy = Variant.__new__(Variant)
        variant_copy.tree_sequence = self.tree_sequence
        variant_copy._ll_variant = self._ll_variant.restricted_copy()
        return variant_copy

    def states(self, missing_data_string=None) -> np.ndarray:
        """
        Returns the allelic states at this site as an array of strings.

        .. warning::
            Using this method is inefficient compared to working with the
            underlying integer representation of genotypes as returned by
            the :attr:`~Variant.genotypes` property.

        :param str missing_data_string: A string that will be used to represent missing
            data. If the site has missing data and an existing allele is equal
            to this string, an error is raised.
            Default: `None`, treated as `'N'`.
        :return: A numpy array of strings with one element per requested node,
            i.e. the same length as :attr:`~Variant.genotypes`.
        :raises ValueError: If ``missing_data_string`` is not a string, if the
            variant has not been decoded, or if the missing data string
            clashes with an existing allele.
        """
        if missing_data_string is None:
            missing_data_string = "N"
        elif not isinstance(missing_data_string, str):
            # Must explicitly test here, otherwise we output a numpy object array
            raise ValueError("Missing data string is not a string")
        # Fail with the standard "not decoded" error before indexing the
        # alleles tuple.
        self._check_decoded()
        alleles = self.alleles
        if alleles[-1] is None:
            if missing_data_string in alleles:
                raise ValueError(
                    "An existing allele is equal to the "
                    f"missing data string '{missing_data_string}'"
                )
            # Replace the trailing None so that genotype -1 (missing data)
            # indexes the missing data string.
            alleles = alleles[:-1] + (missing_data_string,)
        return np.array(alleles)[self.genotypes]

    def counts(self) -> typing.Counter[str | None]:
        """
        Returns a :class:`python:collections.Counter` object providing counts for each
        possible :attr:`allele <Variant.alleles>` at this site: i.e. the number of
        samples possessing that allele among the set of samples specified when creating
        this Variant (by default, this is all the sample nodes in the tree sequence).
        Missing data is represented by an allelic state of ``None``.

        :return: A counter of the number of samples associated with each allele.
        :raises ValueError: If the variant has not been decoded.
        """
        # Fail with the standard "not decoded" error before indexing the
        # alleles tuple.
        self._check_decoded()
        # Read each property once rather than on every loop iteration.
        alleles = self.alleles
        genotypes = self.genotypes
        counts = collections.Counter()
        if alleles[-1] is None:
            # we have to treat the last element of the genotypes array as special
            counts[None] = np.sum(genotypes == tskit.MISSING_DATA)
            for i, allele in enumerate(alleles[:-1]):
                counts[allele] = np.sum(genotypes == i)
        else:
            # minlength ensures alleles carried by no node get a zero count.
            bincounts = np.bincount(genotypes, minlength=self.num_alleles)
            for i, allele in enumerate(alleles):
                counts[allele] = bincounts[i]
        return counts

    def frequencies(self, remove_missing=None) -> dict[str | None, float]:
        """
        Return a dictionary mapping each possible :attr:`allele <Variant.alleles>`
        at this site to the frequency of that allele: i.e. the number of samples
        with that allele divided by the total number of samples, among the set of
        samples specified when creating this Variant (by default, this is all the
        sample nodes in the tree sequence). Note, therefore, that if a restricted set
        of samples was specified on creation, the allele frequencies returned here
        will *not* be the global allele frequencies in the whole tree sequence.

        :param bool remove_missing: If True, only samples with non-missing data will
            be counted in the total number of samples used to calculate the frequency,
            and no information on the frequency of missing data is returned. Otherwise
            (default), samples with missing data are included when calculating
            frequencies.
        :return: A dictionary mapping allelic states to the frequency of each allele
            among the samples. If the total number of samples counted is zero,
            every frequency is NaN.
        """
        if remove_missing is None:
            remove_missing = False
        total = len(self.samples)
        if remove_missing:
            total -= self.num_missing
        if total == 0:
            logging.warning(
                "No non-missing samples at this site, frequencies undefined"
            )
        return {
            allele: count / total if total > 0 else np.nan
            for allele, count in self.counts().items()
            # Drop the None (missing data) entry when missing data is excluded.
            if not (allele is None and remove_missing)
        }

    def __str__(self) -> str:
        """
        Return a plain text summary of the contents of a variant.
        """
        try:
            site_id = util.format_number(self.site.id, sep=",")
            site_position = util.format_number(self.site.position, sep=",")
            counts = self.counts()
            freqs = self.frequencies()
            samples = util.format_number(len(self.samples), sep=",")
            num_alleles = util.format_number(self.num_alleles, sep=",")
            rows = (
                [
                    ["Site id", f"{site_id}"],
                    ["Site position", f"{site_position}"],
                    ["Number of samples", f"{samples}"],
                    ["Number of alleles", f"{num_alleles}"],
                ]
                + [
                    [
                        f"Samples with allele "
                        f"""{'missing' if k is None else "'" + k + "'"}""",
                        f"{util.format_number(counts[k], sep=',')} "
                        f"({util.format_number(freqs[k] * 100, 2, sep=',')}%)",
                    ]
                    for k in self.alleles
                ]
                + [
                    ["Has missing data", str(self.has_missing_data)],
                    ["Isolated as missing", str(bool(self.isolated_as_missing))],
                ]
            )
        except ValueError as err:
            # E.g. the variant has not been decoded yet: show the error
            # message as the only row rather than propagating it.
            rows = [[str(err), ""]]
        return util.unicode_table(rows, title="Variant")

    def _repr_html_(self) -> str:
        """
        Return an html summary of a variant. Called by Jupyter notebooks
        to render a Variant.
        """
        return util.variant_html(self)

    def __repr__(self):
        # Accessing self.site and self.genotypes raises ValueError if the
        # variant has not been decoded.
        d = {
            "site": self.site,
            "samples": self.samples,
            "alleles": self.alleles,
            "genotypes": self.genotypes,
            "has_missing_data": self.has_missing_data,
            "isolated_as_missing": self.isolated_as_missing,
        }
        return f"Variant({repr(d)})"
388
+
389
+
390
+ #
391
+ # Miscellaneous auxiliary methods.
392
+ #
393
def allele_remap(alleles_from, alleles_to):
    """
    Build an index map from the elements of ``alleles_from`` to the
    elements of ``alleles_to``.

    Element ``i`` of the result is the index of the first entry in
    ``alleles_to`` equal to ``alleles_from[i]``. Entries of ``alleles_from``
    with no match in ``alleles_to`` are given indices beyond the end of
    ``alleles_to``: the first unmatched entry gets ``len(alleles_to)``, and
    each later unmatched entry gets the next integer.

    Both inputs are converted to numpy unicode arrays before comparison.
    The result is a ``uint32`` array shaped like the converted
    ``alleles_from``.
    """
    target = np.array(alleles_to, dtype="U")
    source = np.array(alleles_from, dtype="U")
    remap = np.empty_like(source, dtype="uint32")
    next_unmatched = len(target)
    for pos, allele in enumerate(source):
        hits = np.where(target == allele)[0]
        if len(hits) > 0:
            # Use the index of the first matching element.
            remap[pos] = hits[0]
        else:
            remap[pos] = next_unmatched
            next_unmatched += 1
    return remap