tskit 1.0.1__cp314-cp314-macosx_10_15_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _tskit.cpython-314-darwin.so +0 -0
- tskit/__init__.py +92 -0
- tskit/__main__.py +4 -0
- tskit/_version.py +4 -0
- tskit/cli.py +273 -0
- tskit/combinatorics.py +1522 -0
- tskit/drawing.py +2809 -0
- tskit/exceptions.py +70 -0
- tskit/genotypes.py +410 -0
- tskit/intervals.py +601 -0
- tskit/jit/__init__.py +0 -0
- tskit/jit/numba.py +674 -0
- tskit/metadata.py +1147 -0
- tskit/provenance.py +150 -0
- tskit/provenance.schema.json +72 -0
- tskit/stats.py +165 -0
- tskit/tables.py +4858 -0
- tskit/text_formats.py +456 -0
- tskit/trees.py +11457 -0
- tskit/util.py +901 -0
- tskit/vcf.py +219 -0
- tskit-1.0.1.dist-info/METADATA +105 -0
- tskit-1.0.1.dist-info/RECORD +27 -0
- tskit-1.0.1.dist-info/WHEEL +5 -0
- tskit-1.0.1.dist-info/entry_points.txt +2 -0
- tskit-1.0.1.dist-info/licenses/LICENSE +21 -0
- tskit-1.0.1.dist-info/top_level.txt +2 -0
tskit/intervals.py
ADDED
|
@@ -0,0 +1,601 @@
|
|
|
1
|
+
# MIT License
|
|
2
|
+
#
|
|
3
|
+
# Copyright (c) 2023-2025 Tskit Developers
|
|
4
|
+
# Copyright (C) 2020-2021 University of Oxford
|
|
5
|
+
#
|
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
# furnished to do so, subject to the following conditions:
|
|
12
|
+
#
|
|
13
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
# copies or substantial portions of the Software.
|
|
15
|
+
#
|
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
# SOFTWARE.
|
|
23
|
+
#
|
|
24
|
+
"""
|
|
25
|
+
Utilities for working with intervals and interval maps.
|
|
26
|
+
"""
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import collections.abc
|
|
30
|
+
import numbers
|
|
31
|
+
|
|
32
|
+
import numpy as np
|
|
33
|
+
|
|
34
|
+
import tskit
|
|
35
|
+
import tskit.util as util
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class RateMap(collections.abc.Mapping):
    """
    A class mapping a non-negative rate value to a set of non-overlapping intervals
    along the genome. Intervals for which the rate is unknown (i.e., missing data)
    are encoded by NaN values in the ``rate`` array.

    :param list position: A list of :math:`n+1` positions, starting at 0, and ending
        in the sequence length over which the RateMap will apply.
    :param list rate: A list of :math:`n` non-negative rates that apply between each
        position. Intervals with missing data are encoded by NaN values.
    """

    # The args are marked keyword only to give us some flexibility in how we
    # create this class in the future.
    def __init__(
        self,
        *,
        position,
        rate,
    ):
        # Making the arrays read-only guarantees rate and cumulative mass stay in sync
        # We prevent the arrays themselves being overwritten by making self.position,
        # etc properties.

        # TODO we always coerce the position type to float here, but we may not
        # want to do this. int32 is a perfectly good choice a lot of the time.
        self._position = np.array(position, dtype=float)
        self._position.flags.writeable = False
        self._rate = np.array(rate, dtype=float)
        self._rate.flags.writeable = False
        size = len(self._position)
        if size < 2:
            raise ValueError("Must have at least two positions")
        if len(self._rate) != size - 1:
            raise ValueError(
                "Rate array must have one less entry than the position array"
            )
        if self._position[0] != 0:
            raise ValueError("First position must be zero")

        span = self.span
        if np.any(span <= 0):
            # +1 because span[k] describes the step from position[k] to
            # position[k + 1]; we report the index of the offending position.
            bad_pos = np.where(span <= 0)[0] + 1
            raise ValueError(
                f"Position values not strictly increasing at indexes {bad_pos}"
            )
        # NaN compares False with "< 0", so missing intervals pass this check.
        if np.any(self._rate < 0):
            bad_rates = np.where(self._rate < 0)[0]
            raise ValueError(f"Rate values negative at indexes {bad_rates}")
        self._missing = np.isnan(self.rate)
        # Stored as a builtin int so that the ``-> int`` properties derived
        # from it return what they are annotated to return.
        self._num_missing_intervals = int(np.sum(self._missing))
        if self._num_missing_intervals == len(self.rate):
            raise ValueError("All intervals are missing data")
        # We don't expose the cumulative mass array as a part of the array
        # API as it's not quite as obvious how it lines up for each interval.
        # It's really the sum of the mass up to but not including the current
        # interval, which is a bit confusing. Probably best to just leave
        # it as a function, so that people can sample at regular positions
        # along the genome anyway, emphasising that it's a continuous function,
        # not a step function like the other interval attributes.
        # nancumsum treats the NaN mass of missing intervals as zero.
        self._cumulative_mass = np.insert(np.nancumsum(self.mass), 0, 0)
        assert self._cumulative_mass[0] == 0
        self._cumulative_mass.flags.writeable = False

    @property
    def left(self):
        """
        The left position of each interval (inclusive).
        """
        return self._position[:-1]

    @property
    def right(self):
        """
        The right position of each interval (exclusive).
        """
        return self._position[1:]

    @property
    def mid(self):
        """
        Returns the midpoint of each interval.
        """
        mid = self.left + self.span / 2
        mid.flags.writeable = False
        return mid

    @property
    def span(self):
        """
        Returns the span (i.e., ``right - left``) of each of the intervals.
        """
        span = self.right - self.left
        span.flags.writeable = False
        return span

    @property
    def position(self):
        """
        The breakpoint positions between intervals. This is equal to the
        :attr:`~.RateMap.left` array with the :attr:`sequence_length`
        appended.
        """
        return self._position

    @property
    def rate(self):
        """
        The rate associated with each interval. Missing data is encoded
        by NaN values.
        """
        return self._rate

    @property
    def mass(self):
        r"""
        The "mass" of each interval, defined as the :attr:`~.RateMap.rate`
        :math:`\times` :attr:`~.RateMap.span`. This is NaN for intervals
        containing missing data.
        """
        return self._rate * self.span

    @property
    def missing(self):
        """
        A boolean array encoding whether each interval contains missing data.
        Equivalent to ``np.isnan(rate_map.rate)``
        """
        return self._missing

    @property
    def non_missing(self):
        """
        A boolean array encoding whether each interval contains non-missing data.
        Equivalent to ``np.logical_not(np.isnan(rate_map.rate))``
        """
        return ~self._missing

    #
    # Interval counts
    #

    @property
    def num_intervals(self) -> int:
        """
        The total number of intervals in this map. Equal to
        :attr:`~.RateMap.num_missing_intervals` +
        :attr:`~.RateMap.num_non_missing_intervals`.
        """
        return len(self._rate)

    @property
    def num_missing_intervals(self) -> int:
        """
        Returns the number of missing intervals, i.e., those in which the
        :attr:`~.RateMap.rate` value is a NaN.
        """
        return self._num_missing_intervals

    @property
    def num_non_missing_intervals(self) -> int:
        """
        The number of non missing intervals, i.e., those in which the
        :attr:`~.RateMap.rate` value is not a NaN.
        """
        return self.num_intervals - self.num_missing_intervals

    @property
    def sequence_length(self):
        """
        The sequence length covered by this map
        """
        return self.position[-1]

    @property
    def total_mass(self):
        """
        The cumulative total mass over the entire map.
        """
        return self._cumulative_mass[-1]

    @property
    def mean_rate(self):
        """
        The mean rate over this map weighted by the span covered by each rate.
        Unknown intervals are excluded.
        """
        total_span = np.sum(self.span[self.non_missing])
        return self.total_mass / total_span

    def get_rate(self, x):
        """
        Return the rate at the specified list of positions.

        .. note:: This function will return a NaN value for any positions
            that contain missing data.

        :param numpy.ndarray x: The positions for which to return values.
        :return: An array of rates, the same length as ``x``.
        :rtype: numpy.ndarray
        :raises ValueError: if any position is less than 0 or greater than or
            equal to the sequence length.
        """
        # side="right" maps a position equal to a breakpoint onto the interval
        # that starts there, matching the left-inclusive convention.
        loc = np.searchsorted(self.position, x, side="right") - 1
        if np.any(loc < 0) or np.any(loc >= len(self.rate)):
            raise ValueError("position out of bounds")
        return self.rate[loc]

    def get_cumulative_mass(self, x):
        """
        Return the cumulative mass of the map up to (but not including) a
        given point for a list of positions along the map. This is equal to
        the integral of the rate from 0 to the point.

        :param numpy.ndarray x: The positions for which to return values.

        :return: An array of cumulative mass values, the same length as ``x``
        :rtype: numpy.ndarray
        :raises ValueError: if any position is less than 0 or greater than the
            sequence length.
        """
        x = np.array(x)
        if np.any(x < 0) or np.any(x > self.sequence_length):
            raise ValueError(f"Cannot have positions < 0 or > {self.sequence_length}")
        return np.interp(x, self.position, self._cumulative_mass)

    def find_index(self, x: float) -> int:
        """
        Returns the index of the interval that the specified position falls within,
        such that ``rate_map.left[index] <= x < rate_map.right[index]``.

        :param float x: The position to search.
        :return: The index of the interval containing this point.
        :rtype: int
        :raises KeyError: if the position is not contained in any of the intervals.
        """
        # Written as a negated chained comparison so that NaN (for which every
        # comparison is False) is rejected with a KeyError rather than falling
        # through and indexing past the end of the position array.
        if not (0 <= x < self.sequence_length):
            raise KeyError(f"Position {x} out of bounds")
        index = np.searchsorted(self.position, x, side="left")
        # side="left" lands on the breakpoint itself when x equals one, and on
        # the next breakpoint otherwise; step back in the latter case.
        if x < self.position[index]:
            index -= 1
        assert self.left[index] <= x < self.right[index]
        return index

    def missing_intervals(self):
        """
        Returns the left and right coordinates of the intervals containing
        missing data in this map as a 2D numpy array
        with shape (:attr:`~.RateMap.num_missing_intervals`, 2). Each row
        of this returned array is therefore a ``left``, ``right`` tuple
        corresponding to the coordinates of the missing intervals.

        :return: A numpy array of the coordinates of intervals containing
            missing data.
        :rtype: numpy.ndarray
        """
        out = np.empty((self.num_missing_intervals, 2))
        out[:, 0] = self.left[self.missing]
        out[:, 1] = self.right[self.missing]
        return out

    def asdict(self):
        """
        Returns a dictionary with the keys ``"position"`` and ``"rate"`` mapped
        to the corresponding (read-only) arrays of this map. These are the
        keyword arguments accepted by the constructor.
        """
        return {"position": self.position, "rate": self.rate}

    #
    # Dunder methods. We implement the Mapping protocol via __iter__, __len__
    # and __getitem__. We have some extra semantics for __getitem__, providing
    # slice notation.
    #

    def __iter__(self):
        # The clinching argument for using mid here is that if we used
        # left instead we would have
        # RateMap([0, 1], [0.1]) == RateMap([0, 100], [0.1])
        # by the inherited definition of equality since the dictionary items
        # would be equal.
        # Similarly, we only return the midpoints of known intervals
        # because NaN values are not equal, and we would need to do
        # something to work around this. It seems reasonable that
        # this high-level operation returns the *known* values only
        # anyway.
        yield from self.mid[self.non_missing]

    def __len__(self):
        # Consistent with __iter__: only non-missing intervals are counted.
        return int(np.sum(self.non_missing))

    def __getitem__(self, key):
        """
        Returns the rate at position ``key`` if it is a number, or a new
        RateMap restricted to ``key.start:key.stop`` if it is a slice.

        :raises TypeError: if a slice with a step is provided.
        :raises KeyError: if the position is out of bounds, lies within a
            missing interval, or the key is neither a number nor a slice.
        """
        if isinstance(key, slice):
            if key.step is not None:
                raise TypeError("Only interval slicing is supported")
            return self.slice(key.start, key.stop)
        if isinstance(key, numbers.Number):
            index = self.find_index(key)
            if np.isnan(self.rate[index]):
                # To be consistent with the __iter__ definition above we
                # don't consider these missing positions to be "in" the map.
                raise KeyError(f"Position {key} is within a missing interval")
            return self.rate[index]
        # TODO we could implement numpy array indexing here and call
        # to get_rate. Note we'd need to take care that we return a keyerror
        # if the returned array contains any nans though.
        raise KeyError(f"Key {key} not in map")

    def _text_header_and_rows(self, limit=None):
        """
        Returns the column headers and formatted rows used by the text and
        HTML representations. The row indexes to show come from
        ``util.truncate_rows``; an index of -1 is rendered as a
        ``__skipped__`` placeholder carrying ``num_rows - limit``.
        """
        headers = ("left", "right", "mid", "span", "rate")
        num_rows = len(self.left)
        rows = []
        row_indexes = util.truncate_rows(num_rows, limit)
        # mid and span are recomputed on every property access, so fetch each
        # array once rather than once per row.
        left = self.left
        right = self.right
        mid = self.mid
        span = self.span
        rate = self.rate
        for j in row_indexes:
            if j == -1:
                rows.append(f"__skipped__{num_rows - limit}")
            else:
                rows.append(
                    [
                        f"{left[j]:.10g}",
                        f"{right[j]:.10g}",
                        f"{mid[j]:.10g}",
                        f"{span[j]:.10g}",
                        f"{rate[j]:.2g}",
                    ]
                )
        return headers, rows

    def __str__(self):
        header, rows = self._text_header_and_rows(
            limit=tskit._print_options["max_lines"]
        )
        table = util.unicode_table(
            rows=rows,
            header=header,
            column_alignments="<<>>>",
        )
        return table

    def _repr_html_(self):
        header, rows = self._text_header_and_rows(
            limit=tskit._print_options["max_lines"]
        )
        return util.html_table(rows, header=header)

    def __repr__(self):
        return f"RateMap(position={repr(self.position)}, rate={repr(self.rate)})"

    #
    # Methods for building rate maps.
    #

    def copy(self) -> RateMap:
        """
        Returns a deep copy of this RateMap.
        """
        # We take read-only copies of the arrays in the constructor anyway, so
        # no need for copying.
        return RateMap(position=self.position, rate=self.rate)

    def slice(self, left=None, right=None, *, trim=False) -> RateMap:  # noqa: A003
        """
        Returns a subset of this rate map in the specified interval.

        :param float left: The left coordinate (inclusive) of the region to keep.
            If ``None``, defaults to 0.
        :param float right: The right coordinate (exclusive) of the region to keep.
            If ``None``, defaults to the sequence length.
        :param bool trim: If True, remove the flanking regions such that the
            sequence length of the new rate map is ``right`` - ``left``. If ``False``
            (default), do not change the coordinate system and mark the flanking
            regions as "unknown".
        :return: A new RateMap instance
        :rtype: RateMap
        :raises KeyError: unless ``0 <= left < right <= sequence_length``.
        """
        left = 0 if left is None else left
        right = self.sequence_length if right is None else right
        if not (0 <= left < right <= self.sequence_length):
            raise KeyError(f"Invalid slice: left={left}, right={right}")

        # position[i:j] is the run of breakpoints that brackets [left, right);
        # its two end values are overwritten with left and right below.
        i = self.find_index(left)
        j = i + np.searchsorted(self.position[i:], right, side="right")
        if right > self.position[j - 1]:
            j += 1

        position = self.position[i:j].copy()
        rate = self.rate[i : j - 1].copy()
        position[0] = left
        position[-1] = right

        if trim:
            # Return trimmed map with changed coords
            return RateMap(position=position - left, rate=rate)

        # Need to check regions before & after sliced region are filled out:
        if left != 0:
            if np.isnan(rate[0]):
                position[0] = 0  # Extend
            else:
                rate = np.insert(rate, 0, np.nan)  # Prepend
                position = np.insert(position, 0, 0)
        if right != self.position[-1]:
            if np.isnan(rate[-1]):
                position[-1] = self.sequence_length  # Extend
            else:
                rate = np.append(rate, np.nan)  # Append
                position = np.append(position, self.position[-1])
        return RateMap(position=position, rate=rate)

    @staticmethod
    def uniform(sequence_length, rate) -> RateMap:
        """
        Create a uniform rate map

        :param float sequence_length: The right coordinate of the single
            interval, which starts at 0.
        :param float rate: The rate that applies over the whole interval.
        :return: A RateMap with one interval ``[0, sequence_length)``.
        :rtype: RateMap
        """
        return RateMap(position=[0, sequence_length], rate=[rate])

    @staticmethod
    def read_hapmap(
        fileobj,
        sequence_length=None,
        *,
        has_header=True,
        position_col=None,
        rate_col=None,
        map_col=None,
    ):
        # Black barfs with an INTERNAL_ERROR trying to reformat this docstring,
        # so we explicitly disable reformatting here.
        # fmt: off
        """
        Parses the specified file in HapMap format and returns a :class:`.RateMap`.
        HapMap files must be white-space-delimited, and by default are assumed to
        contain a single header line (which is ignored). Each subsequent line
        then contains a physical position (in base pairs) and either a genetic
        map position (in centiMorgans) or a recombination rate (in centiMorgans
        per megabase). The value in the rate column in a given line gives the
        constant rate between the physical position in that line (inclusive) and the
        physical position on the next line (exclusive).
        By default, the second column of the file is taken
        as the physical position and the fourth column is taken as the genetic
        position, as seen in the following sample of the format::

            Chromosome  Position(bp)  Rate(cM/Mb)  Map(cM)
            chr10       48232         0.1614       0.002664
            chr10       48486         0.1589       0.002705
            chr10       50009         0.159        0.002947
            chr10       52147         0.1574       0.003287
            ...
            chr10       133762002     3.358        181.129345
            chr10       133766368     0.000        181.144008

        In the example above, the first row has a nonzero genetic map position
        (last column, cM), implying a nonzero recombination rate before that
        position, that is assumed to extend to the start of the chromosome
        (at position 0 bp). However, if the first line has a nonzero bp position
        (second column) and a zero genetic map position (last column, cM),
        then the recombination rate before that position is *unknown*, producing
        :ref:`missing data <sec_rate_maps_missing>`.

        .. note::
            The rows are all assumed to come from the same contig, and the
            first column is currently ignored. Therefore if you have a single
            file containing several contigs or chromosomes, you must split
            it up into multiple files, and pass each one separately to this
            function.

        :param str fileobj: Filename or file to read. This is passed directly
            to :func:`numpy.loadtxt`, so if the filename extension is .gz or .bz2,
            the file is decompressed first
        :param float sequence_length: The total length of the map. If ``None``,
            then assume it is the last physical position listed in the file.
            Otherwise it must be greater than or equal to the last physical
            position in the file, and the region between the last physical position
            and the sequence_length is marked as unknown (i.e., its rate is NaN).
        :param bool has_header: If True (default), assume the file has a header row
            and ignore the first line of the file.
        :param int position_col: The zero-based index of the column in the file
            specifying the physical position in base pairs. If ``None`` (default)
            assume an index of 1 (i.e. the second column).
        :param int rate_col: The zero-based index of the column in the file
            specifying the rate in cM/Mb. If ``None`` (default) do not use the rate
            column, but calculate rates using the genetic map positions, as
            specified in ``map_col``. If the rate column is used, the
            interval from 0 to first physical position in the file is marked as
            unknown, and the last value in the rate column must be zero.
        :param int map_col: The zero-based index of the column in the file
            specifying the genetic map position in centiMorgans. If ``None``
            (default), assume an index of 3 (i.e. the fourth column). If the first
            genetic position is 0 the interval from position 0 to the first
            physical position in the file is marked as unknown. Otherwise, act
            as if an additional row, specifying physical position 0 and genetic
            position 0, exists at the start of the file.
        :return: A RateMap object.
        :rtype: RateMap
        :raises ValueError: if both ``rate_col`` and ``map_col`` are given, if
            ``position_col`` coincides with the rate or map column, if the last
            rate is nonzero, if the file has no rows, if the map distance at
            physical position 0 is nonzero, or if ``sequence_length`` is less
            than the last physical position.
        """
        # fmt: on
        column_defs = {}  # column definitions passed to np.loadtxt
        if rate_col is None and map_col is None:
            # Default to map_col
            map_col = 3
        elif rate_col is not None and map_col is not None:
            raise ValueError("Cannot specify both rate_col and map_col")
        if map_col is not None:
            column_defs[map_col] = ("map", float)
        else:
            column_defs[rate_col] = ("rate", float)
        position_col = 1 if position_col is None else position_col
        if position_col in column_defs:
            raise ValueError(
                "Cannot specify the same columns for position_col and "
                "rate_col or map_col"
            )
        column_defs[position_col] = ("pos", int)

        column_names = [c[0] for c in column_defs.values()]
        column_data = np.loadtxt(
            fileobj,
            skiprows=1 if has_header else 0,
            dtype=list(column_defs.values()),
            usecols=list(column_defs.keys()),
            unpack=True,
        )
        data = dict(zip(column_names, column_data))

        if "map" not in data:
            assert "rate" in data
            if data["rate"][-1] != 0:
                raise ValueError("The last entry in the 'rate' column must be zero")
            # Integrate the cM/Mb rates over the physical distances to obtain
            # cumulative genetic positions, then convert cM to Morgans.
            pos_Mb = data["pos"] / 1e6
            map_pos = np.cumsum(data["rate"][:-1] * np.diff(pos_Mb))
            data["map"] = np.insert(map_pos, 0, 0) / 100
        else:
            data["map"] /= 100  # Convert centiMorgans to Morgans
        if len(data["map"]) == 0:
            raise ValueError("Empty hapmap file")

        # TO DO: read in chrom name from col 0 and poss set as .name
        # attribute on the RateMap

        physical_positions = data["pos"]
        genetic_positions = data["map"]
        start = physical_positions[0]
        end = physical_positions[-1]

        if genetic_positions[0] > 0 and start == 0:
            raise ValueError(
                "The map distance at the start of the chromosome must be zero"
            )
        if start > 0:
            physical_positions = np.insert(physical_positions, 0, 0)
            if genetic_positions[0] > 0:
                # Exception for a map that starts > 0cM: include the start rate
                # in the mean
                start = 0
            genetic_positions = np.insert(genetic_positions, 0, 0)

        if sequence_length is not None:
            if sequence_length < end:
                raise ValueError(
                    "The sequence_length cannot be less than the last physical position "
                    f" ({physical_positions[-1]})"
                )
            if sequence_length > end:
                physical_positions = np.append(physical_positions, sequence_length)
                genetic_positions = np.append(genetic_positions, genetic_positions[-1])

        assert genetic_positions[0] == 0
        rate = np.diff(genetic_positions) / np.diff(physical_positions)
        # A nonzero start here means the leading interval was added with no
        # genetic distance information, so its rate is unknown.
        if start != 0:
            rate[0] = np.nan
        # Likewise the trailing interval added to reach sequence_length.
        if end != physical_positions[-1]:
            rate[-1] = np.nan
        return RateMap(position=physical_positions, rate=rate)
|
tskit/jit/__init__.py
ADDED
|
File without changes
|