tskit 1.0.1__cp314-cp314-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tskit/intervals.py ADDED
@@ -0,0 +1,601 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023-2025 Tskit Developers
4
+ # Copyright (C) 2020-2021 University of Oxford
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in all
14
+ # copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ # SOFTWARE.
23
+ #
24
+ """
25
+ Utilities for working with intervals and interval maps.
26
+ """
27
+ from __future__ import annotations
28
+
29
+ import collections.abc
30
+ import numbers
31
+
32
+ import numpy as np
33
+
34
+ import tskit
35
+ import tskit.util as util
36
+
37
+
38
class RateMap(collections.abc.Mapping):
    """
    A class mapping a non-negative rate value to a set of non-overlapping intervals
    along the genome. Intervals for which the rate is unknown (i.e., missing data)
    are encoded by NaN values in the ``rate`` array.

    :param list position: A list of :math:`n+1` positions, starting at 0, and ending
        in the sequence length over which the RateMap will apply.
    :param list rate: A list of :math:`n` positive rates that apply between each
        position. Intervals with missing data are encoded by NaN values.
    """

    # The args are marked keyword only to give us some flexibility in how we
    # create this class in the future.
    def __init__(
        self,
        *,
        position,
        rate,
    ):
        # Making the arrays read-only guarantees rate and cumulative mass stay in sync
        # We prevent the arrays themselves being overwritten by making self.position,
        # etc properties.

        # TODO we always coerce the position type to float here, but we may not
        # want to do this. int32 is a perfectly good choice a lot of the time.
        self._position = np.array(position, dtype=float)
        self._position.flags.writeable = False
        self._rate = np.array(rate, dtype=float)
        self._rate.flags.writeable = False
        size = len(self._position)
        if size < 2:
            raise ValueError("Must have at least two positions")
        if len(self._rate) != size - 1:
            raise ValueError(
                "Rate array must have one less entry than the position array"
            )
        if self._position[0] != 0:
            raise ValueError("First position must be zero")

        span = self.span
        if np.any(span <= 0):
            bad_pos = np.where(span <= 0)[0] + 1
            raise ValueError(
                f"Position values not strictly increasing at indexes {bad_pos}"
            )
        if np.any(self._rate < 0):
            bad_rates = np.where(self._rate < 0)[0]
            raise ValueError(f"Rate values negative at indexes {bad_rates}")
        self._missing = np.isnan(self.rate)
        self._num_missing_intervals = np.sum(self._missing)
        if self._num_missing_intervals == len(self.rate):
            raise ValueError("All intervals are missing data")
        # We don't expose the cumulative mass array as a part of the array
        # API as it's not quite as obvious how it lines up for each interval.
        # It's really the sum of the mass up to but not including the current
        # interval, which is a bit confusing. Probably best to just leave
        # it as a function, so that people can sample at regular positions
        # along the genome anyway, emphasising that it's a continuous function,
        # not a step function like the other interval attributes.
        # nancumsum treats the NaN mass of missing intervals as zero.
        self._cumulative_mass = np.insert(np.nancumsum(self.mass), 0, 0)
        assert self._cumulative_mass[0] == 0
        self._cumulative_mass.flags.writeable = False

    @property
    def left(self):
        """
        The left position of each interval (inclusive).
        """
        return self._position[:-1]

    @property
    def right(self):
        """
        The right position of each interval (exclusive).
        """
        return self._position[1:]

    @property
    def mid(self):
        """
        Returns the midpoint of each interval.
        """
        mid = self.left + self.span / 2
        mid.flags.writeable = False
        return mid

    @property
    def span(self):
        """
        Returns the span (i.e., ``right - left``) of each of the intervals.
        """
        span = self.right - self.left
        span.flags.writeable = False
        return span

    @property
    def position(self):
        """
        The breakpoint positions between intervals. This is equal to the
        :attr:`~.RateMap.left` array with the :attr:`sequence_length`
        appended.
        """
        return self._position

    @property
    def rate(self):
        """
        The rate associated with each interval. Missing data is encoded
        by NaN values.
        """
        return self._rate

    @property
    def mass(self):
        r"""
        The "mass" of each interval, defined as the :attr:`~.RateMap.rate`
        :math:`\times` :attr:`~.RateMap.span`. This is NaN for intervals
        containing missing data.
        """
        return self._rate * self.span

    @property
    def missing(self):
        """
        A boolean array encoding whether each interval contains missing data.
        Equivalent to ``np.isnan(rate_map.rate)``
        """
        return self._missing

    @property
    def non_missing(self):
        """
        A boolean array encoding whether each interval contains non-missing data.
        Equivalent to ``np.logical_not(np.isnan(rate_map.rate))``
        """
        return ~self._missing

    #
    # Interval counts
    #

    @property
    def num_intervals(self) -> int:
        """
        The total number of intervals in this map. Equal to
        :attr:`~.RateMap.num_missing_intervals` +
        :attr:`~.RateMap.num_non_missing_intervals`.
        """
        return len(self._rate)

    @property
    def num_missing_intervals(self) -> int:
        """
        Returns the number of missing intervals, i.e., those in which the
        :attr:`~.RateMap.rate` value is a NaN.
        """
        return self._num_missing_intervals

    @property
    def num_non_missing_intervals(self) -> int:
        """
        The number of non missing intervals, i.e., those in which the
        :attr:`~.RateMap.rate` value is not a NaN.
        """
        return self.num_intervals - self.num_missing_intervals

    @property
    def sequence_length(self):
        """
        The sequence length covered by this map
        """
        return self.position[-1]

    @property
    def total_mass(self):
        """
        The cumulative total mass over the entire map.
        """
        return self._cumulative_mass[-1]

    @property
    def mean_rate(self):
        """
        The mean rate over this map weighted by the span covered by each rate.
        Unknown intervals are excluded.
        """
        total_span = np.sum(self.span[self.non_missing])
        return self.total_mass / total_span

    def get_rate(self, x):
        """
        Return the rate at the specified list of positions.

        .. note:: This function will return a NaN value for any positions
            that contain missing data.

        :param numpy.ndarray x: The positions for which to return values.
        :return: An array of rates, the same length as ``x``.
        :rtype: numpy.ndarray
        :raises ValueError: if any position is less than 0 or is greater than
            or equal to the sequence length.
        """
        loc = np.searchsorted(self.position, x, side="right") - 1
        if np.any(loc < 0) or np.any(loc >= len(self.rate)):
            raise ValueError("position out of bounds")
        return self.rate[loc]

    def get_cumulative_mass(self, x):
        """
        Return the cumulative mass of the map up to (but not including) a
        given point for a list of positions along the map. This is equal to
        the integral of the rate from 0 to the point.

        :param numpy.ndarray x: The positions for which to return values.

        :return: An array of cumulative mass values, the same length as ``x``
        :rtype: numpy.ndarray
        :raises ValueError: if any position is less than 0 or greater than
            the sequence length.
        """
        x = np.array(x)
        if np.any(x < 0) or np.any(x > self.sequence_length):
            raise ValueError(f"Cannot have positions < 0 or > {self.sequence_length}")
        return np.interp(x, self.position, self._cumulative_mass)

    def find_index(self, x: float) -> int:
        """
        Returns the index of the interval that the specified position falls within,
        such that ``rate_map.left[index] <= x < rate_map.right[index]``.

        :param float x: The position to search.
        :return: The index of the interval containing this point.
        :rtype: int
        :raises KeyError: if the position is not contained in any of the intervals.
        """
        # Written as a negated chained comparison so that NaN, for which every
        # comparison is False, is rejected with a KeyError rather than falling
        # through to an out-of-range index below.
        if not (0 <= x < self.sequence_length):
            raise KeyError(f"Position {x} out of bounds")
        index = np.searchsorted(self.position, x, side="left")
        # side="left" returns the index of x itself when x is a breakpoint,
        # otherwise the index of the next breakpoint to the right.
        if x < self.position[index]:
            index -= 1
        assert self.left[index] <= x < self.right[index]
        return index

    def missing_intervals(self):
        """
        Returns the left and right coordinates of the intervals containing
        missing data in this map as a 2D numpy array
        with shape (:attr:`~.RateMap.num_missing_intervals`, 2). Each row
        of this returned array is therefore a ``left``, ``right`` tuple
        corresponding to the coordinates of the missing intervals.

        :return: A numpy array of the coordinates of intervals containing
            missing data.
        :rtype: numpy.ndarray
        """
        out = np.empty((self.num_missing_intervals, 2))
        out[:, 0] = self.left[self.missing]
        out[:, 1] = self.right[self.missing]
        return out

    def asdict(self):
        """
        Returns a dictionary holding the (read-only) ``position`` and ``rate``
        arrays, keyed by the corresponding constructor argument names.
        """
        return {"position": self.position, "rate": self.rate}

    #
    # Dunder methods. We implement the Mapping protocol via __iter__, __len__
    # and __getitem__. We have some extra semantics for __getitem__, providing
    # slice notation.
    #

    def __iter__(self):
        # The clinching argument for using mid here is that if we used
        # left instead we would have
        # RateMap([0, 1], [0.1]) == RateMap([0, 100], [0.1])
        # by the inherited definition of equality since the dictionary items
        # would be equal.
        # Similarly, we only return the midpoints of known intervals
        # because NaN values are not equal, and we would need to do
        # something to work around this. It seems reasonable that
        # this high-level operation returns the *known* values only
        # anyway.
        yield from self.mid[self.non_missing]

    def __len__(self):
        # Convert the numpy integer to a plain int.
        return int(np.sum(self.non_missing))

    def __getitem__(self, key):
        if isinstance(key, slice):
            if key.step is not None:
                raise TypeError("Only interval slicing is supported")
            return self.slice(key.start, key.stop)
        if isinstance(key, numbers.Number):
            index = self.find_index(key)
            if np.isnan(self.rate[index]):
                # To be consistent with the __iter__ definition above we
                # don't consider these missing positions to be "in" the map.
                raise KeyError(f"Position {key} is within a missing interval")
            return self.rate[index]
        # TODO we could implement numpy array indexing here and call
        # to get_rate. Note we'd need to take care that we return a keyerror
        # if the returned array contains any nans though.
        raise KeyError(f"Key {key} not in map")

    def _text_header_and_rows(self, limit=None):
        # Build the column headers and formatted rows shared by __str__ and
        # _repr_html_. An index of -1 from util.truncate_rows marks the place
        # where rows were elided.
        headers = ("left", "right", "mid", "span", "rate")
        num_rows = len(self.left)
        rows = []
        row_indexes = util.truncate_rows(num_rows, limit)
        for j in row_indexes:
            if j == -1:
                rows.append(f"__skipped__{num_rows - limit}")
            else:
                rows.append(
                    [
                        f"{self.left[j]:.10g}",
                        f"{self.right[j]:.10g}",
                        f"{self.mid[j]:.10g}",
                        f"{self.span[j]:.10g}",
                        f"{self.rate[j]:.2g}",
                    ]
                )
        return headers, rows

    def __str__(self):
        header, rows = self._text_header_and_rows(
            limit=tskit._print_options["max_lines"]
        )
        table = util.unicode_table(
            rows=rows,
            header=header,
            column_alignments="<<>>>",
        )
        return table

    def _repr_html_(self):
        header, rows = self._text_header_and_rows(
            limit=tskit._print_options["max_lines"]
        )
        return util.html_table(rows, header=header)

    def __repr__(self):
        return f"RateMap(position={repr(self.position)}, rate={repr(self.rate)})"

    #
    # Methods for building rate maps.
    #

    def copy(self) -> "RateMap":
        """
        Returns a deep copy of this RateMap.
        """
        # We take read-only copies of the arrays in the constructor anyway, so
        # no need for copying.
        return RateMap(position=self.position, rate=self.rate)

    def slice(self, left=None, right=None, *, trim=False) -> "RateMap":  # noqa: A003
        """
        Returns a subset of this rate map in the specified interval.

        :param float left: The left coordinate (inclusive) of the region to keep.
            If ``None``, defaults to 0.
        :param float right: The right coordinate (exclusive) of the region to keep.
            If ``None``, defaults to the sequence length.
        :param bool trim: If True, remove the flanking regions such that the
            sequence length of the new rate map is ``right`` - ``left``. If ``False``
            (default), do not change the coordinate system and mark the flanking
            regions as "unknown".
        :return: A new RateMap instance
        :rtype: RateMap
        :raises KeyError: if ``0 <= left < right <= sequence_length`` does not hold.
        """
        left = 0 if left is None else left
        right = self.sequence_length if right is None else right
        if not (0 <= left < right <= self.sequence_length):
            raise KeyError(f"Invalid slice: left={left}, right={right}")

        # position[i:j] spans every breakpoint needed to cover [left, right);
        # its two end values are then overwritten with left and right.
        i = self.find_index(left)
        j = i + np.searchsorted(self.position[i:], right, side="right")
        if right > self.position[j - 1]:
            j += 1

        position = self.position[i:j].copy()
        rate = self.rate[i : j - 1].copy()
        position[0] = left
        position[-1] = right

        if trim:
            # Return trimmed map with changed coords
            return RateMap(position=position - left, rate=rate)

        # Need to check regions before & after sliced region are filled out:
        if left != 0:
            if np.isnan(rate[0]):
                position[0] = 0  # Extend
            else:
                rate = np.insert(rate, 0, np.nan)  # Prepend
                position = np.insert(position, 0, 0)
        if right != self.position[-1]:
            if np.isnan(rate[-1]):
                position[-1] = self.sequence_length  # Extend
            else:
                rate = np.append(rate, np.nan)  # Append
                position = np.append(position, self.position[-1])
        return RateMap(position=position, rate=rate)

    @staticmethod
    def uniform(sequence_length, rate) -> "RateMap":
        """
        Create a uniform rate map
        """
        return RateMap(position=[0, sequence_length], rate=[rate])

    @staticmethod
    def read_hapmap(
        fileobj,
        sequence_length=None,
        *,
        has_header=True,
        position_col=None,
        rate_col=None,
        map_col=None,
    ):
        # Black barfs with an INTERNAL_ERROR trying to reformat this docstring,
        # so we explicitly disable reformatting here.
        # fmt: off
        """
        Parses the specified file in HapMap format and returns a :class:`.RateMap`.
        HapMap files must be white-space-delimited, and by default are assumed to
        contain a single header line (which is ignored). Each subsequent line
        then contains a physical position (in base pairs) and either a genetic
        map position (in centiMorgans) or a recombination rate (in centiMorgans
        per megabase). The value in the rate column in a given line gives the
        constant rate between the physical position in that line (inclusive) and the
        physical position on the next line (exclusive).
        By default, the second column of the file is taken
        as the physical position and the fourth column is taken as the genetic
        position, as seen in the following sample of the format::

            Chromosome  Position(bp)  Rate(cM/Mb)  Map(cM)
            chr10       48232         0.1614       0.002664
            chr10       48486         0.1589       0.002705
            chr10       50009         0.159        0.002947
            chr10       52147         0.1574       0.003287
            ...
            chr10       133762002     3.358        181.129345
            chr10       133766368     0.000        181.144008

        In the example above, the first row has a nonzero genetic map position
        (last column, cM), implying a nonzero recombination rate before that
        position, that is assumed to extend to the start of the chromosome
        (at position 0 bp). However, if the first line has a nonzero bp position
        (second column) and a zero genetic map position (last column, cM),
        then the recombination rate before that position is *unknown*, producing
        :ref:`missing data <sec_rate_maps_missing>`.

        .. note::
            The rows are all assumed to come from the same contig, and the
            first column is currently ignored. Therefore if you have a single
            file containing several contigs or chromosomes, you must split
            it up into multiple files, and pass each one separately to this
            function.

        :param str fileobj: Filename or file to read. This is passed directly
            to :func:`numpy.loadtxt`, so if the filename extension is .gz or .bz2,
            the file is decompressed first
        :param float sequence_length: The total length of the map. If ``None``,
            then assume it is the last physical position listed in the file.
            Otherwise it must be greater than or equal to the last physical
            position in the file, and the region between the last physical position
            and the sequence_length is marked as unknown (i.e. its rate is NaN).
        :param bool has_header: If True (default), assume the file has a header row
            and ignore the first line of the file.
        :param int position_col: The zero-based index of the column in the file
            specifying the physical position in base pairs. If ``None`` (default)
            assume an index of 1 (i.e. the second column).
        :param int rate_col: The zero-based index of the column in the file
            specifying the rate in cM/Mb. If ``None`` (default) do not use the rate
            column, but calculate rates using the genetic map positions, as
            specified in ``map_col``. If the rate column is used, the
            interval from 0 to first physical position in the file is marked as
            unknown, and the last value in the rate column must be zero.
        :param int map_col: The zero-based index of the column in the file
            specifying the genetic map position in centiMorgans. If ``None``
            (default), assume an index of 3 (i.e. the fourth column). If the first
            genetic position is 0 the interval from position 0 to the first
            physical position in the file is marked as unknown. Otherwise, act
            as if an additional row, specifying physical position 0 and genetic
            position 0, exists at the start of the file.
        :return: A RateMap object.
        :rtype: RateMap
        :raises ValueError: if the column arguments are inconsistent, the file
            contains no data rows, the last rate is nonzero (when ``rate_col``
            is used), the genetic position at physical position 0 is nonzero,
            or ``sequence_length`` is less than the last physical position.
        """
        # fmt: on
        column_defs = {}  # column definitions passed to np.loadtxt
        if rate_col is None and map_col is None:
            # Default to map_col
            map_col = 3
        elif rate_col is not None and map_col is not None:
            raise ValueError("Cannot specify both rate_col and map_col")
        if map_col is not None:
            column_defs[map_col] = ("map", float)
        else:
            column_defs[rate_col] = ("rate", float)
        position_col = 1 if position_col is None else position_col
        if position_col in column_defs:
            raise ValueError(
                "Cannot specify the same columns for position_col and "
                "rate_col or map_col"
            )
        column_defs[position_col] = ("pos", int)

        column_names = [c[0] for c in column_defs.values()]
        column_data = np.loadtxt(
            fileobj,
            skiprows=1 if has_header else 0,
            dtype=list(column_defs.values()),
            usecols=list(column_defs.keys()),
            unpack=True,
        )
        # A file with a single data row may come back from np.loadtxt as 0-d
        # columns; atleast_1d keeps the indexing below valid in that case and
        # is a no-op otherwise.
        data = {
            name: np.atleast_1d(column)
            for name, column in zip(column_names, column_data)
        }
        # Check for an empty file before any column is indexed, so that the
        # caller gets a ValueError rather than an IndexError.
        if len(data["pos"]) == 0:
            raise ValueError("Empty hapmap file")

        if "map" not in data:
            assert "rate" in data
            if data["rate"][-1] != 0:
                raise ValueError("The last entry in the 'rate' column must be zero")
            # Integrate the cM/Mb rates over the Mb distances to get cumulative
            # map positions in cM, then convert to Morgans.
            pos_Mb = data["pos"] / 1e6
            map_pos = np.cumsum(data["rate"][:-1] * np.diff(pos_Mb))
            data["map"] = np.insert(map_pos, 0, 0) / 100
        else:
            data["map"] /= 100  # Convert centiMorgans to Morgans

        # TO DO: read in chrom name from col 0 and poss set as .name
        # attribute on the RateMap

        physical_positions = data["pos"]
        genetic_positions = data["map"]
        start = physical_positions[0]
        end = physical_positions[-1]

        # "!= 0" (rather than "> 0") also rejects negative or NaN values here
        # instead of letting them reach the assertion further down.
        if genetic_positions[0] != 0 and start == 0:
            raise ValueError(
                "The map distance at the start of the chromosome must be zero"
            )
        if start > 0:
            physical_positions = np.insert(physical_positions, 0, 0)
            if genetic_positions[0] > 0:
                # Exception for a map that starts > 0cM: include the start rate
                # in the mean
                start = 0
            genetic_positions = np.insert(genetic_positions, 0, 0)

        if sequence_length is not None:
            if sequence_length < end:
                raise ValueError(
                    "The sequence_length cannot be less than the last physical position "
                    f" ({physical_positions[-1]})"
                )
            if sequence_length > end:
                physical_positions = np.append(physical_positions, sequence_length)
                genetic_positions = np.append(genetic_positions, genetic_positions[-1])

        assert genetic_positions[0] == 0
        rate = np.diff(genetic_positions) / np.diff(physical_positions)
        # Flanking regions that the file does not describe are missing data.
        if start != 0:
            rate[0] = np.nan
        if end != physical_positions[-1]:
            rate[-1] = np.nan
        return RateMap(position=physical_positions, rate=rate)
tskit/jit/__init__.py ADDED
File without changes