tskit 1.0.1__cp314-cp314-macosx_10_15_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _tskit.cpython-314-darwin.so +0 -0
- tskit/__init__.py +92 -0
- tskit/__main__.py +4 -0
- tskit/_version.py +4 -0
- tskit/cli.py +273 -0
- tskit/combinatorics.py +1522 -0
- tskit/drawing.py +2809 -0
- tskit/exceptions.py +70 -0
- tskit/genotypes.py +410 -0
- tskit/intervals.py +601 -0
- tskit/jit/__init__.py +0 -0
- tskit/jit/numba.py +674 -0
- tskit/metadata.py +1147 -0
- tskit/provenance.py +150 -0
- tskit/provenance.schema.json +72 -0
- tskit/stats.py +165 -0
- tskit/tables.py +4858 -0
- tskit/text_formats.py +456 -0
- tskit/trees.py +11457 -0
- tskit/util.py +901 -0
- tskit/vcf.py +219 -0
- tskit-1.0.1.dist-info/METADATA +105 -0
- tskit-1.0.1.dist-info/RECORD +27 -0
- tskit-1.0.1.dist-info/WHEEL +5 -0
- tskit-1.0.1.dist-info/entry_points.txt +2 -0
- tskit-1.0.1.dist-info/licenses/LICENSE +21 -0
- tskit-1.0.1.dist-info/top_level.txt +2 -0
tskit/provenance.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# MIT License
|
|
2
|
+
#
|
|
3
|
+
# Copyright (c) 2018-2024 Tskit Developers
|
|
4
|
+
# Copyright (c) 2016-2017 University of Oxford
|
|
5
|
+
#
|
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
# furnished to do so, subject to the following conditions:
|
|
12
|
+
#
|
|
13
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
# copies or substantial portions of the Software.
|
|
15
|
+
#
|
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
# SOFTWARE.
|
|
23
|
+
"""
|
|
24
|
+
Common provenance methods used to determine the state and versions
|
|
25
|
+
of various dependencies and the OS.
|
|
26
|
+
"""
|
|
27
|
+
import json
|
|
28
|
+
import os.path
|
|
29
|
+
import platform
|
|
30
|
+
import sys
|
|
31
|
+
import time
|
|
32
|
+
|
|
33
|
+
try:
|
|
34
|
+
import resource
|
|
35
|
+
except ImportError:
|
|
36
|
+
resource = None # resource.getrusage absent on windows
|
|
37
|
+
|
|
38
|
+
import jsonschema
|
|
39
|
+
|
|
40
|
+
import _tskit
|
|
41
|
+
import tskit.exceptions as exceptions
|
|
42
|
+
from . import _version
|
|
43
|
+
|
|
44
|
+
__version__ = _version.tskit_version
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# NOTE: the APIs here are all preliminary. We should have a class that encapsulates
|
|
48
|
+
# all of the required functionality, including parsing and printing out provenance
|
|
49
|
+
# records. This will replace the current functions.
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_environment(extra_libs=None, include_tskit=True):
    """
    Builds a dictionary describing the environment tskit is currently
    running in: operating system details, the Python interpreter, and the
    versions of the libraries in use.

    This API is tentative and will change in the future when a more
    comprehensive provenance API is implemented.

    :param dict extra_libs: Optional additional entries merged into the
        ``libraries`` section after the built-in ones, so a matching key
        replaces the built-in entry.
    :param bool include_tskit: If True (the default) a ``tskit`` entry
        holding this package's version is added to ``libraries``.
    :return: A dict with the keys ``os``, ``python`` and ``libraries``.
    :rtype: dict
    """
    os_details = {
        "system": platform.system(),
        "node": platform.node(),
        "release": platform.release(),
        "version": platform.version(),
        "machine": platform.machine(),
    }
    python_details = {
        "implementation": platform.python_implementation(),
        "version": platform.python_version(),
    }
    # The kastore version comes back as a sequence of components; join them
    # into a dotted version string.
    kastore_version = ".".join(
        str(component) for component in _tskit.get_kastore_version()
    )
    libraries = {"kastore": {"version": kastore_version}}
    if include_tskit:
        libraries["tskit"] = {"version": __version__}
    if extra_libs is not None:
        libraries.update(extra_libs)
    return {"os": os_details, "python": python_details, "libraries": libraries}
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def get_resources(start_time):
    """
    Returns a dict describing the resources used by the current process.

    :param float start_time: Reference timestamp subtracted from
        ``time.time()`` to obtain the elapsed wall-clock time (presumably a
        value previously obtained from ``time.time()``).
    :return: A dict with ``elapsed_time``, ``user_time`` and ``sys_time``
        (CPU times include those of child processes) and, when the
        ``resource`` module is available, ``max_memory``.
    :rtype: dict
    """
    cpu_times = os.times()
    elapsed = time.time() - start_time
    usage = {
        "elapsed_time": elapsed,
        "user_time": cpu_times.user + cpu_times.children_user,
        "sys_time": cpu_times.system + cpu_times.children_system,
    }
    if resource is None:
        # Don't report max memory on Windows, we would need an external dep
        # like psutil
        return usage

    peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform != "darwin":
        # Linux, freeBSD et al reports in KiB, not bytes
        peak_rss *= 1024
    usage["max_memory"] = peak_rss
    return usage
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def get_provenance_dict(parameters=None):
    """
    Returns a dictionary encoding an execution of tskit conforming to the
    provenance schema.

    :param dict parameters: The parameters of the operation being recorded.
        If None (the default) an empty dict is recorded: the provenance
        schema requires ``parameters`` to be a JSON object, so emitting
        ``None`` (JSON ``null``) would make the document fail validation.
    :return: A dict with the keys ``schema_version``, ``software``,
        ``parameters`` and ``environment``.
    :rtype: dict
    """
    document = {
        "schema_version": "1.0.0",
        "software": {"name": "tskit", "version": __version__},
        "parameters": {} if parameters is None else parameters,
        # tskit is already described under "software", so it is left out of
        # the environment's library list.
        "environment": get_environment(include_tskit=False),
    }
    return document
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Cache the schema
|
|
114
|
+
_schema = None
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def get_schema():
    """
    Returns the tskit :ref:`provenance schema <sec_provenance>` as a dict.

    The schema is read from ``provenance.schema.json`` in this module's
    directory on the first call and cached in the module-level ``_schema``
    variable. Every call returns an independent deep copy, so callers may
    modify the result (including nested values) without affecting the cache.

    :return: The provenance schema.
    :rtype: dict
    """
    global _schema
    # Imported locally so that this function is self-contained.
    import copy

    if _schema is None:
        base = os.path.dirname(__file__)
        schema_file = os.path.join(base, "provenance.schema.json")
        # Read as UTF-8 explicitly rather than relying on the locale's
        # default encoding.
        with open(schema_file, encoding="utf-8") as f:
            _schema = json.load(f)
    # A shallow copy would still share the nested dicts and lists with the
    # cache, so hand out a deep copy.
    return copy.deepcopy(_schema)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def validate_provenance(provenance):
    """
    Validates the specified dict-like object against the tskit
    :ref:`provenance schema <sec_provenance>`. If the input does
    not represent a valid instance of the schema an exception is
    raised.

    :param dict provenance: The dictionary representing a JSON document
        to be validated against the schema.
    :raises ProvenanceValidationError: if the provenance document is not a
        valid instance of the schema. The underlying ``jsonschema`` error is
        available as ``__cause__``.
    """
    schema = get_schema()
    try:
        jsonschema.validate(provenance, schema)
    except jsonschema.exceptions.ValidationError as ve:
        # Pass the validator's explanation along so that str(error) says what
        # is wrong instead of being empty.
        # NOTE(review): assumes ProvenanceValidationError takes a message
        # argument like a standard Exception - confirm in tskit.exceptions.
        raise exceptions.ProvenanceValidationError(ve.message) from ve
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"title": "tskit provenance",
|
|
5
|
+
"description": "The combination of software, parameters and environment that produced a tree sequence",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"required": ["schema_version", "software", "parameters", "environment"],
|
|
8
|
+
"properties": {
|
|
9
|
+
"schema_version": {
|
|
10
|
+
"description": "The version of this schema used.",
|
|
11
|
+
"type": "string",
|
|
12
|
+
"minLength": 1
|
|
13
|
+
},
|
|
14
|
+
"software": {
|
|
15
|
+
"description": "The primary software used to produce the tree sequence.",
|
|
16
|
+
"type": "object",
|
|
17
|
+
"required": ["name", "version"],
|
|
18
|
+
"properties": {
|
|
19
|
+
"name": {
|
|
20
|
+
"description": "The name of the primary software.",
|
|
21
|
+
"type": "string",
|
|
22
|
+
"minLength": 1
|
|
23
|
+
},
|
|
24
|
+
"version": {
|
|
25
|
+
"description": "The version of primary software.",
|
|
26
|
+
"type": "string",
|
|
27
|
+
"minLength": 1
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
"parameters": {
|
|
32
|
+
"description": "The parameters used to produce the tree sequence.",
|
|
33
|
+
"type": "object"
|
|
34
|
+
},
|
|
35
|
+
"environment": {
|
|
36
|
+
"description": "The computational environment within which the primary software ran.",
|
|
37
|
+
"type": "object",
|
|
38
|
+
"properties": {
|
|
39
|
+
"os": {
|
|
40
|
+
"description": "Operating system.",
|
|
41
|
+
"type": "object"
|
|
42
|
+
},
|
|
43
|
+
"libraries": {
|
|
44
|
+
"description": "Details of libraries the primary software linked against.",
|
|
45
|
+
"type": "object"
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
},
|
|
49
|
+
"resources": {
|
|
50
|
+
"description": "Resources used by this operation.",
|
|
51
|
+
"type": "object",
|
|
52
|
+
"properties": {
|
|
53
|
+
"elapsed_time": {
|
|
54
|
+
"description": "Wall clock time used in seconds.",
|
|
55
|
+
"type": "number"
|
|
56
|
+
},
|
|
57
|
+
"user_time": {
|
|
58
|
+
"description": "User time used in seconds.",
|
|
59
|
+
"type": "number"
|
|
60
|
+
},
|
|
61
|
+
"sys_time": {
|
|
62
|
+
"description": "System time used in seconds.",
|
|
63
|
+
"type": "number"
|
|
64
|
+
},
|
|
65
|
+
"max_memory": {
|
|
66
|
+
"description": "Maximum memory used in bytes.",
|
|
67
|
+
"type": "number"
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
tskit/stats.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# MIT License
|
|
2
|
+
#
|
|
3
|
+
# Copyright (c) 2018-2024 Tskit Developers
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
# in the Software without restriction, including without limitation the rights
|
|
8
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
# furnished to do so, subject to the following conditions:
|
|
11
|
+
#
|
|
12
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
# copies or substantial portions of the Software.
|
|
14
|
+
#
|
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
# SOFTWARE.
|
|
22
|
+
"""
|
|
23
|
+
Module responsible for computing various statistics on tree sequences.
|
|
24
|
+
"""
|
|
25
|
+
import sys
|
|
26
|
+
import threading
|
|
27
|
+
|
|
28
|
+
import numpy as np
|
|
29
|
+
|
|
30
|
+
import _tskit
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class LdCalculator:
    """
    Computes `linkage disequilibrium
    <https://en.wikipedia.org/wiki/Linkage_disequilibrium>`_ coefficients
    for pairs of sites in a :class:`TreeSequence`.

    .. note:: This interface is deprecated and a replacement is planned.
        Please see https://github.com/tskit-dev/tskit/issues/1900 for
        more information. Note also that the current implementation is
        quite limited (see warning below).

    .. warning:: Sites with more than one mutation are not currently
        supported. Using this class on such a tree sequence will raise a
        LibraryError with an "Only infinite sites mutations supported"
        message.

        Silent mutations are also not supported and will result in a
        LibraryError.

    :param TreeSequence tree_sequence: The tree sequence of interest.
    """

    def __init__(self, tree_sequence):
        # Every call into the low-level calculator is made while holding
        # this lock, so only one method runs on the low-level objects at a
        # time.
        self._instance_lock = threading.Lock()
        self._tree_sequence = tree_sequence
        ll_tree_sequence = tree_sequence.get_ll_tree_sequence()
        self._ll_ld_calculator = _tskit.LdCalculator(ll_tree_sequence)

    def get_r2(self, a, b):
        # Deprecated alias for r2(a, b)
        return self.r2(a, b)

    def r2(self, a, b):
        """
        Computes the :math:`r^2` statistic for a single pair of sites,
        identified by their indexes. This is *not* an efficient way of
        obtaining large numbers of pairwise LD values; use
        :meth:`.r2_array` or :meth:`.r2_matrix` for that purpose.

        :param int a: The index of the first site.
        :param int b: The index of the second site.
        :return: The value of :math:`r^2` between the sites at indexes
            ``a`` and ``b``.
        :rtype: float
        """
        with self._instance_lock:
            value = self._ll_ld_calculator.get_r2(a, b)
        return value

    def get_r2_array(self, a, direction=1, max_mutations=None, max_distance=None):
        # Deprecated alias for r2_array
        forwarded = {
            "direction": direction,
            "max_mutations": max_mutations,
            "max_distance": max_distance,
        }
        return self.r2_array(a, **forwarded)

    def r2_array(
        self, a, direction=1, max_mutations=None, max_distance=None, max_sites=None
    ):
        """
        Computes the :math:`r^2` statistic between the focal site at index
        :math:`a` and a run of neighbouring sites. Starting from the focal
        site, adjacent sites are visited (forwards or backwards) until one
        of the following happens: the maximum number of other sites has
        been considered (``max_sites``), the maximum distance in sequence
        coordinates has been reached (``max_distance``), or the start/end
        of the sequence has been reached. The value of :math:`r^2` between
        :math:`a` and each visited site :math:`b` is stored at the
        corresponding index of the returned array.

        If the returned array is :math:`x` and ``direction`` is
        :data:`tskit.FORWARD`, then :math:`x[0]` is the value for
        :math:`a` and :math:`a + 1`, :math:`x[1]` the value for :math:`a`
        and :math:`a + 2`, etc. If ``direction`` is :data:`tskit.REVERSE`,
        then :math:`x[0]` is the value for :math:`a` and :math:`a - 1`,
        :math:`x[1]` the value for :math:`a` and :math:`a - 2`, etc.

        :param int a: The index of the focal site.
        :param int direction: The direction in which to travel when
            examining other sites. Must be either
            :data:`tskit.FORWARD` or :data:`tskit.REVERSE`. Defaults
            to :data:`tskit.FORWARD`.
        :param int max_sites: The maximum number of sites to return
            :math:`r^2` values for. Defaults to as many sites as
            possible.
        :param int max_mutations: Deprecated synonym for max_sites.
        :param float max_distance: The maximum absolute distance between
            the focal site and those for which :math:`r^2` values
            are returned.
        :return: An array of double precision floating point values
            representing the :math:`r^2` values for sites in the
            specified direction.
        :rtype: numpy.ndarray
        :raises ValueError: if both ``max_mutations`` and ``max_sites``
            are specified.
        """
        if max_mutations is not None:
            if max_sites is not None:
                raise ValueError("max_mutations is a deprecated synonym for max_sites")
            max_sites = max_mutations
        # The low-level call is given -1 when no site limit was requested
        # and the largest finite float when no distance limit was requested.
        site_limit = max_sites if max_sites is not None else -1
        distance_limit = (
            max_distance if max_distance is not None else sys.float_info.max
        )
        with self._instance_lock:
            values = self._ll_ld_calculator.get_r2_array(
                a,
                direction=direction,
                max_sites=site_limit,
                max_distance=distance_limit,
            )
        return values

    def get_r2_matrix(self):
        # Deprecated alias for r2_matrix
        return self.r2_matrix()

    def r2_matrix(self):
        """
        Builds the complete :math:`m \\times m` matrix of pairwise
        :math:`r^2` values for a tree sequence with :math:`m` sites.

        :return: A 2 dimensional square array of double precision
            floating point values representing the :math:`r^2` values for
            all pairs of sites.
        :rtype: numpy.ndarray
        """
        num_sites = self._tree_sequence.num_sites
        # Start from all ones: the diagonal entries are never overwritten.
        matrix = np.ones((num_sites, num_sites), dtype=float)
        for focal in range(num_sites - 1):
            following = focal + 1
            # Forward values from the focal site to every later site fill
            # the row to the right of the diagonal and, by symmetry, the
            # column below it.
            forward_values = self.r2_array(focal)
            matrix[focal, following:] = forward_values
            matrix[following:, focal] = forward_values
        return matrix
|