tskit 1.0.0b3__cp313-cp313-win_amd64.whl → 1.0.1__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _tskit.cp313-win_amd64.pyd +0 -0
- tskit/_version.py +1 -1
- tskit/drawing.py +2 -4
- tskit/genotypes.py +23 -20
- tskit/metadata.py +1 -1
- tskit/tables.py +51 -26
- tskit/text_formats.py +4 -0
- tskit/trees.py +413 -245
- tskit/util.py +6 -7
- {tskit-1.0.0b3.dist-info → tskit-1.0.1.dist-info}/METADATA +8 -8
- tskit-1.0.1.dist-info/RECORD +27 -0
- {tskit-1.0.0b3.dist-info → tskit-1.0.1.dist-info}/WHEEL +1 -1
- tskit-1.0.0b3.dist-info/RECORD +0 -27
- {tskit-1.0.0b3.dist-info → tskit-1.0.1.dist-info}/entry_points.txt +0 -0
- {tskit-1.0.0b3.dist-info → tskit-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {tskit-1.0.0b3.dist-info → tskit-1.0.1.dist-info}/top_level.txt +0 -0
tskit/trees.py
CHANGED
|
@@ -370,7 +370,11 @@ class Site(util.Dataclass):
|
|
|
370
370
|
mutations: np.ndarray
|
|
371
371
|
"""
|
|
372
372
|
The list of mutations at this site. Mutations within a site are returned in the
|
|
373
|
-
|
|
373
|
+
|
|
374
|
+
order they are specified in the underlying :class:`MutationTable`. For canonical
|
|
375
|
+
(i.e., valid) tables, this means ancestral mutations precede their descendants, so
|
|
376
|
+
older mutations (as defined by the canonical mutation ordering; see
|
|
377
|
+
:ref:`sec_mutation_requirements`) appear before younger ones.
|
|
374
378
|
"""
|
|
375
379
|
metadata: bytes | dict | None
|
|
376
380
|
"""
|
|
@@ -571,8 +575,8 @@ class Migration(util.Dataclass):
|
|
|
571
575
|
"""
|
|
572
576
|
id: int # noqa A003
|
|
573
577
|
"""
|
|
574
|
-
The integer ID of this
|
|
575
|
-
:attr:`TreeSequence.
|
|
578
|
+
The integer ID of this migration. Varies from 0 to
|
|
579
|
+
:attr:`TreeSequence.num_migrations` - 1.
|
|
576
580
|
"""
|
|
577
581
|
|
|
578
582
|
|
|
@@ -770,7 +774,7 @@ class Tree:
|
|
|
770
774
|
calling the :meth:`TreeSequence.trees` iterator.
|
|
771
775
|
|
|
772
776
|
:return: The root threshold.
|
|
773
|
-
:rtype:
|
|
777
|
+
:rtype: int
|
|
774
778
|
"""
|
|
775
779
|
return self._ll_tree.get_root_threshold()
|
|
776
780
|
|
|
@@ -881,7 +885,8 @@ class Tree:
|
|
|
881
885
|
|
|
882
886
|
:param float position: The position along the sequence length to
|
|
883
887
|
seek to.
|
|
884
|
-
:raises ValueError: If
|
|
888
|
+
:raises ValueError: If ``position`` is less than 0 or ``position`` is greater
|
|
889
|
+
than or equal to
|
|
885
890
|
:attr:`TreeSequence.sequence_length`.
|
|
886
891
|
"""
|
|
887
892
|
if position < 0 or position >= self.tree_sequence.sequence_length:
|
|
@@ -918,7 +923,7 @@ class Tree:
|
|
|
918
923
|
the interval :math:`[0, \\text{span})` and the :attr:`~Tree.tree_sequence`
|
|
919
924
|
from which the tree is taken will have its
|
|
920
925
|
:attr:`~tskit.TreeSequence.sequence_length` equal to ``span``.
|
|
921
|
-
:param
|
|
926
|
+
:param float branch_length: The minimum length of a branch in this tree.
|
|
922
927
|
:raises ValueError: If the given rank is out of bounds for trees
|
|
923
928
|
with ``num_leaves`` leaves.
|
|
924
929
|
"""
|
|
@@ -3593,7 +3598,7 @@ def parse_nodes(source, strict=True, encoding="utf8", base64_metadata=True, tabl
|
|
|
3593
3598
|
return table
|
|
3594
3599
|
|
|
3595
3600
|
|
|
3596
|
-
def parse_edges(source, strict=True, table=None):
|
|
3601
|
+
def parse_edges(source, strict=True, table=None, encoding="utf8", base64_metadata=True):
|
|
3597
3602
|
"""
|
|
3598
3603
|
Parse the specified file-like object containing a whitespace delimited
|
|
3599
3604
|
description of an edge table and return the corresponding :class:`EdgeTable`
|
|
@@ -3609,6 +3614,9 @@ def parse_edges(source, strict=True, table=None):
|
|
|
3609
3614
|
False, a relaxed whitespace splitting algorithm is used.
|
|
3610
3615
|
:param EdgeTable table: If specified, write the edges into this table. If
|
|
3611
3616
|
not, create a new :class:`EdgeTable` instance and return.
|
|
3617
|
+
:param str encoding: Encoding used for text representation.
|
|
3618
|
+
:param bool base64_metadata: If True, metadata is encoded using Base64
|
|
3619
|
+
encoding; otherwise, as plain text.
|
|
3612
3620
|
"""
|
|
3613
3621
|
sep = None
|
|
3614
3622
|
if strict:
|
|
@@ -3620,6 +3628,12 @@ def parse_edges(source, strict=True, table=None):
|
|
|
3620
3628
|
right_index = header.index("right")
|
|
3621
3629
|
parent_index = header.index("parent")
|
|
3622
3630
|
children_index = header.index("child")
|
|
3631
|
+
metadata_index = None
|
|
3632
|
+
try:
|
|
3633
|
+
metadata_index = header.index("metadata")
|
|
3634
|
+
except ValueError:
|
|
3635
|
+
pass
|
|
3636
|
+
default_metadata = b""
|
|
3623
3637
|
for line in source:
|
|
3624
3638
|
tokens = line.rstrip("\n").split(sep)
|
|
3625
3639
|
if len(tokens) >= 4:
|
|
@@ -3627,8 +3641,19 @@ def parse_edges(source, strict=True, table=None):
|
|
|
3627
3641
|
right = float(tokens[right_index])
|
|
3628
3642
|
parent = int(tokens[parent_index])
|
|
3629
3643
|
children = tuple(map(int, tokens[children_index].split(",")))
|
|
3644
|
+
metadata = default_metadata
|
|
3645
|
+
if metadata_index is not None and metadata_index < len(tokens):
|
|
3646
|
+
metadata = tokens[metadata_index].encode(encoding)
|
|
3647
|
+
if base64_metadata:
|
|
3648
|
+
metadata = base64.b64decode(metadata)
|
|
3630
3649
|
for child in children:
|
|
3631
|
-
table.add_row(
|
|
3650
|
+
table.add_row(
|
|
3651
|
+
left=left,
|
|
3652
|
+
right=right,
|
|
3653
|
+
parent=parent,
|
|
3654
|
+
child=child,
|
|
3655
|
+
metadata=metadata,
|
|
3656
|
+
)
|
|
3632
3657
|
return table
|
|
3633
3658
|
|
|
3634
3659
|
|
|
@@ -4368,6 +4393,22 @@ class TreeSequence:
|
|
|
4368
4393
|
self._ll_tree_sequence.dump_tables(ll_tables)
|
|
4369
4394
|
return tables.TableCollection(ll_tables=ll_tables)
|
|
4370
4395
|
|
|
4396
|
+
def link_ancestors(self, samples, ancestors):
|
|
4397
|
+
"""
|
|
4398
|
+
Equivalent to :meth:`TableCollection.link_ancestors`; see that method for full
|
|
4399
|
+
documentation and parameter semantics.
|
|
4400
|
+
|
|
4401
|
+
:param list[int] samples: Node IDs to retain as samples.
|
|
4402
|
+
:param list[int] ancestors: Node IDs to treat as ancestors.
|
|
4403
|
+
:return: An :class:`tables.EdgeTable` containing the genealogical links between
|
|
4404
|
+
the supplied ``samples`` and ``ancestors``.
|
|
4405
|
+
:rtype: tables.EdgeTable
|
|
4406
|
+
"""
|
|
4407
|
+
samples = util.safe_np_int_cast(samples, np.int32)
|
|
4408
|
+
ancestors = util.safe_np_int_cast(ancestors, np.int32)
|
|
4409
|
+
ll_edge_table = self._ll_tree_sequence.link_ancestors(samples, ancestors)
|
|
4410
|
+
return tables.EdgeTable(ll_table=ll_edge_table)
|
|
4411
|
+
|
|
4371
4412
|
def dump_text(
|
|
4372
4413
|
self,
|
|
4373
4414
|
nodes=None,
|
|
@@ -4767,7 +4808,8 @@ class TreeSequence:
|
|
|
4767
4808
|
Returns an iterable sequence of all the :ref:`nodes <sec_node_table_definition>`
|
|
4768
4809
|
in this tree sequence.
|
|
4769
4810
|
|
|
4770
|
-
.. note::
|
|
4811
|
+
.. note::
|
|
4812
|
+
Although node ids are commonly ordered by node time, this is not a
|
|
4771
4813
|
formal tree sequence requirement. If you wish to iterate over nodes in
|
|
4772
4814
|
time order, you should therefore use ``order="timeasc"`` (and wrap the
|
|
4773
4815
|
resulting sequence in the standard Python :func:`python:reversed` function
|
|
@@ -5321,13 +5363,13 @@ class TreeSequence:
|
|
|
5321
5363
|
Returns an iterator over the strings of haplotypes that result from
|
|
5322
5364
|
the trees and mutations in this tree sequence. Each haplotype string
|
|
5323
5365
|
is guaranteed to be of the same length. A tree sequence with
|
|
5324
|
-
:math:`n`
|
|
5325
|
-
``right`` will return a total
|
|
5326
|
-
strings of :math:`s` alleles concatenated together, where an allele
|
|
5366
|
+
:math:`n` requested nodes (default: the number of sample nodes) and with
|
|
5367
|
+
:math:`s` sites lying between ``left`` and ``right`` will return a total
|
|
5368
|
+
of :math:`n` strings of :math:`s` alleles concatenated together, where an allele
|
|
5327
5369
|
consists of a single ascii character (tree sequences that include alleles
|
|
5328
5370
|
which are not a single character in length, or where the character is
|
|
5329
5371
|
non-ascii, will raise an error). The first string returned is the
|
|
5330
|
-
haplotype for the first requested
|
|
5372
|
+
haplotype for the first requested node, and so on.
|
|
5331
5373
|
|
|
5332
5374
|
The alleles at each site must be represented by single byte characters,
|
|
5333
5375
|
(i.e., variants must be single nucleotide polymorphisms, or SNPs), hence
|
|
@@ -5336,8 +5378,8 @@ class TreeSequence:
|
|
|
5336
5378
|
haplotype ``h``, the value of ``h[j]`` will therefore be the observed
|
|
5337
5379
|
allelic state at site ``j``.
|
|
5338
5380
|
|
|
5339
|
-
If ``isolated_as_missing`` is True (the default), isolated
|
|
5340
|
-
mutations directly above them will be treated as
|
|
5381
|
+
If ``isolated_as_missing`` is True (the default), isolated nodes without
|
|
5382
|
+
mutations directly above them (whether samples or non-samples) will be treated as
|
|
5341
5383
|
:ref:`missing data<sec_data_model_missing_data>` and will be
|
|
5342
5384
|
represented in the string by the ``missing_data_character``. If
|
|
5343
5385
|
instead it is set to False, missing data will be assigned the ancestral state
|
|
@@ -5346,8 +5388,10 @@ class TreeSequence:
|
|
|
5346
5388
|
behaviour in versions prior to 0.2.0. Prior to 0.3.0 the `impute_missing_data`
|
|
5347
5389
|
argument controlled this behaviour.
|
|
5348
5390
|
|
|
5391
|
+
It is also possible to provide **non-sample** nodes via the ``samples``
|
|
5392
|
+
argument if you wish to output haplotypes for (e.g.) internal nodes.
|
|
5349
5393
|
See also the :meth:`.variants` iterator for site-centric access
|
|
5350
|
-
to
|
|
5394
|
+
to genotypes for the requested nodes.
|
|
5351
5395
|
|
|
5352
5396
|
.. warning::
|
|
5353
5397
|
For large datasets, this method can consume a **very large** amount of
|
|
@@ -5365,9 +5409,10 @@ class TreeSequence:
|
|
|
5365
5409
|
be used to represent missing data.
|
|
5366
5410
|
If any normal allele contains this character, an error is raised.
|
|
5367
5411
|
Default: 'N'.
|
|
5368
|
-
:param list[int] samples: The
|
|
5369
|
-
``None`` (default), return haplotypes for all the
|
|
5370
|
-
sequence, in the order given by the :meth:`.samples` method.
|
|
5412
|
+
:param list[int] samples: The node IDs for which to output haplotypes. If
|
|
5413
|
+
``None`` (default), return haplotypes for all the sample nodes in the tree
|
|
5414
|
+
sequence, in the order given by the :meth:`.samples` method. Non-sample
|
|
5415
|
+
nodes may also be provided.
|
|
5371
5416
|
:param int left: Haplotype strings will start with the first site at or after
|
|
5372
5417
|
this genomic position. If ``None`` (default) start at the first site.
|
|
5373
5418
|
:param int right: Haplotype strings will end with the last site before this
|
|
@@ -5438,9 +5483,13 @@ class TreeSequence:
|
|
|
5438
5483
|
generated; output order of genotypes in the returned variants
|
|
5439
5484
|
corresponds to the order of the samples in this list. It is also
|
|
5440
5485
|
possible to provide **non-sample** nodes as an argument here, if you
|
|
5441
|
-
wish to generate genotypes for (e.g.) internal nodes.
|
|
5442
|
-
|
|
5443
|
-
|
|
5486
|
+
wish to generate genotypes for (e.g.) internal nodes. Missingness is
|
|
5487
|
+
detected for any requested node (sample or non-sample) when
|
|
5488
|
+
``isolated_as_missing`` is True: if a node is isolated at a site (i.e.,
|
|
5489
|
+
has no parent and no children in the marginal tree) and has no mutation
|
|
5490
|
+
above it at that site, its genotype will be reported as
|
|
5491
|
+
:data:`MISSING_DATA` (-1). If ``isolated_as_missing`` is False, such
|
|
5492
|
+
nodes are assigned the site's ancestral allele index.
|
|
5444
5493
|
|
|
5445
5494
|
If isolated samples are present at a given site without mutations above them,
|
|
5446
5495
|
they are interpreted by default as
|
|
@@ -5530,19 +5579,23 @@ class TreeSequence:
|
|
|
5530
5579
|
"""
|
|
5531
5580
|
Returns an :math:`m \\times n` numpy array of the genotypes in this
|
|
5532
5581
|
tree sequence, where :math:`m` is the number of sites and :math:`n`
|
|
5533
|
-
the number of
|
|
5534
|
-
|
|
5535
|
-
|
|
5536
|
-
|
|
5537
|
-
|
|
5538
|
-
|
|
5539
|
-
|
|
5540
|
-
|
|
5541
|
-
|
|
5542
|
-
|
|
5543
|
-
|
|
5544
|
-
|
|
5545
|
-
|
|
5582
|
+
is the number of requested nodes (default: the number of sample nodes).
|
|
5583
|
+
The genotypes are the indexes into the array of ``alleles``, as
|
|
5584
|
+
described for the :class:`Variant` class.
|
|
5585
|
+
|
|
5586
|
+
It is possible to provide **non-sample** nodes via the ``samples``
|
|
5587
|
+
argument if you wish to generate genotypes for (e.g.) internal nodes.
|
|
5588
|
+
Missingness is detected for any requested node (sample or non-sample)
|
|
5589
|
+
when ``isolated_as_missing`` is True: if a node is isolated at a site
|
|
5590
|
+
(i.e., has no parent and no children in the marginal tree) and has no
|
|
5591
|
+
mutation above it at that site, its genotype will be reported as
|
|
5592
|
+
:data:`MISSING_DATA` (-1).
|
|
5593
|
+
|
|
5594
|
+
Such nodes are treated as missing data by default. If
|
|
5595
|
+
``isolated_as_missing`` is set to False, they will not be treated as
|
|
5596
|
+
missing, and will instead be assigned the ancestral state. This was the
|
|
5597
|
+
default behaviour in versions prior to 0.2.0. Prior to 0.3.0 the
|
|
5598
|
+
``impute_missing_data`` argument controlled this behaviour.
|
|
5546
5599
|
|
|
5547
5600
|
.. warning::
|
|
5548
5601
|
This method can consume a **very large** amount of memory! If
|
|
@@ -5550,10 +5603,12 @@ class TreeSequence:
|
|
|
5550
5603
|
access them sequentially using the :meth:`.variants` iterator.
|
|
5551
5604
|
|
|
5552
5605
|
:param array_like samples: An array of node IDs for which to generate
|
|
5553
|
-
genotypes
|
|
5606
|
+
genotypes. If ``None`` (default), generate genotypes for all sample
|
|
5607
|
+
nodes. Non-sample nodes may also be provided, in which case genotypes
|
|
5608
|
+
will be generated for those nodes too.
|
|
5554
5609
|
:param bool isolated_as_missing: If True, the genotype value assigned to
|
|
5555
|
-
|
|
5556
|
-
:data:`.MISSING_DATA` (-1). If False,
|
|
5610
|
+
isolated nodes without mutations (samples or non-samples) is
|
|
5611
|
+
:data:`.MISSING_DATA` (-1). If False, such nodes will be
|
|
5557
5612
|
assigned the allele index for the ancestral state.
|
|
5558
5613
|
Default: True.
|
|
5559
5614
|
:param tuple alleles: A tuple of strings describing the encoding of
|
|
@@ -5602,21 +5657,24 @@ class TreeSequence:
|
|
|
5602
5657
|
*,
|
|
5603
5658
|
reference_sequence=None,
|
|
5604
5659
|
missing_data_character=None,
|
|
5660
|
+
isolated_as_missing=None,
|
|
5605
5661
|
samples=None,
|
|
5606
5662
|
left=None,
|
|
5607
5663
|
right=None,
|
|
5608
5664
|
):
|
|
5609
5665
|
"""
|
|
5610
5666
|
Returns an iterator over the full sequence alignments for the defined samples
|
|
5611
|
-
in this tree sequence. Each alignment ``a`` is a string of length
|
|
5612
|
-
the first character is the genomic sequence at the ``start``
|
|
5613
|
-
genome (defaulting to 0) and the last character is the
|
|
5614
|
-
position before the ``stop`` value (defaulting to the
|
|
5615
|
-
of this tree sequence, which must have
|
|
5616
|
-
By default ``L`` is therefore equal
|
|
5617
|
-
and ``a[j]`` is the nucleotide value at
|
|
5618
|
-
|
|
5619
|
-
|
|
5667
|
+
in this tree sequence. Each yielded alignment ``a`` is a string of length
|
|
5668
|
+
``L`` where the first character is the genomic sequence at the ``start``
|
|
5669
|
+
position in the genome (defaulting to 0) and the last character is the
|
|
5670
|
+
genomic sequence one position before the ``stop`` value (defaulting to the
|
|
5671
|
+
:attr:`.sequence_length` of this tree sequence, which must have
|
|
5672
|
+
:attr:`.discrete_genome` equal to True). By default ``L`` is therefore equal
|
|
5673
|
+
to the :attr:`.sequence_length`, and ``a[j]`` is the nucleotide value at
|
|
5674
|
+
genomic position ``j``.
|
|
5675
|
+
|
|
5676
|
+
.. note::
|
|
5677
|
+
This is inherently a **zero-based** representation of the sequence
|
|
5620
5678
|
coordinate space. Care will be needed when interacting with other
|
|
5621
5679
|
libraries and upstream coordinate spaces.
|
|
5622
5680
|
|
|
@@ -5665,31 +5723,44 @@ class TreeSequence:
|
|
|
5665
5723
|
single byte characters, (i.e., variants must be single nucleotide
|
|
5666
5724
|
polymorphisms, or SNPs).
|
|
5667
5725
|
|
|
5668
|
-
|
|
5669
|
-
|
|
5670
|
-
|
|
5671
|
-
|
|
5672
|
-
|
|
5726
|
+
Missing data handling
|
|
5727
|
+
|
|
5728
|
+
- If ``isolated_as_missing=True`` (default), nodes that are isolated
|
|
5729
|
+
(no parent and no children) are rendered as the missing character across
|
|
5730
|
+
each tree interval. At site positions, the per-site allele overrides the
|
|
5731
|
+
missing character; if a genotype is missing (``-1``), the missing
|
|
5732
|
+
character is retained.
|
|
5733
|
+
- If ``isolated_as_missing=False``, no missing overlay is applied. At sites,
|
|
5734
|
+
genotypes are decoded as usual; at non-sites, bases come from the
|
|
5735
|
+
reference sequence.
|
|
5673
5736
|
|
|
5674
5737
|
See also the :meth:`.variants` iterator for site-centric access
|
|
5675
5738
|
to sample genotypes and :meth:`.haplotypes` for access to sample sequences
|
|
5676
5739
|
at just the sites in the tree sequence.
|
|
5677
5740
|
|
|
5678
5741
|
:param str reference_sequence: The reference sequence to fill in
|
|
5679
|
-
gaps between sites in the alignments.
|
|
5742
|
+
gaps between sites in the alignments. If provided, it must be a
|
|
5743
|
+
string of length equal to :attr:`.sequence_length`; the sequence is
|
|
5744
|
+
sliced internally to the requested ``[left, right)`` interval.
|
|
5680
5745
|
:param str missing_data_character: A single ascii character that will
|
|
5681
5746
|
be used to represent missing data.
|
|
5682
5747
|
If any normal allele contains this character, an error is raised.
|
|
5683
5748
|
Default: 'N'.
|
|
5684
|
-
:param
|
|
5685
|
-
|
|
5686
|
-
|
|
5749
|
+
:param bool isolated_as_missing: If True, treat isolated nodes as missing
|
|
5750
|
+
across the covered tree intervals (see above). If None (default), this
|
|
5751
|
+
is treated as True.
|
|
5752
|
+
:param list[int] samples: The nodes for which to output alignments. If
|
|
5753
|
+
``None`` (default), return alignments for all sample nodes in the order
|
|
5754
|
+
given by the :meth:`.samples` method. Non-sample nodes are also supported
|
|
5755
|
+
and will be decoded at sites in the same way as samples.
|
|
5687
5756
|
:param int left: Alignments will start at this genomic position. If ``None``
|
|
5688
5757
|
(default) alignments start at 0.
|
|
5689
|
-
:param int right: Alignments will stop before this genomic position.
|
|
5690
|
-
(default) alignments will continue until the end of the
|
|
5758
|
+
:param int right: Alignments will stop before this genomic position.
|
|
5759
|
+
If ``None`` (default) alignments will continue until the end of the
|
|
5760
|
+
tree sequence.
|
|
5691
5761
|
:return: An iterator over the alignment strings for specified samples in
|
|
5692
|
-
this tree sequence, in the order given in ``samples``.
|
|
5762
|
+
this tree sequence, in the order given in ``samples``. Each string has
|
|
5763
|
+
length ``L = right - left``.
|
|
5693
5764
|
:rtype: collections.abc.Iterable
|
|
5694
5765
|
:raises ValueError: if any genome coordinate in this tree sequence is not
|
|
5695
5766
|
discrete, or if the ``reference_sequence`` is not of the correct length.
|
|
@@ -5703,60 +5774,53 @@ class TreeSequence:
|
|
|
5703
5774
|
"N" if missing_data_character is None else missing_data_character
|
|
5704
5775
|
)
|
|
5705
5776
|
|
|
5706
|
-
|
|
5707
|
-
|
|
5708
|
-
if reference_sequence is None:
|
|
5709
|
-
if self.has_reference_sequence():
|
|
5710
|
-
# This may be inefficient - see #1989. However, since we're
|
|
5711
|
-
# n copies of the reference sequence anyway, this is a relatively
|
|
5712
|
-
# minor tweak. We may also want to recode the below not to use direct
|
|
5713
|
-
# access to the .data attribute, e.g. if we allow reference sequences
|
|
5714
|
-
# to start at non-zero positions
|
|
5715
|
-
reference_sequence = self.reference_sequence.data[
|
|
5716
|
-
interval.left : interval.right
|
|
5717
|
-
]
|
|
5718
|
-
else:
|
|
5719
|
-
reference_sequence = missing_data_character * L
|
|
5777
|
+
if isolated_as_missing is None:
|
|
5778
|
+
isolated_as_missing = True
|
|
5720
5779
|
|
|
5721
|
-
if len(
|
|
5722
|
-
|
|
5723
|
-
|
|
5724
|
-
|
|
5725
|
-
|
|
5726
|
-
|
|
5780
|
+
if len(missing_data_character) != 1:
|
|
5781
|
+
raise TypeError("missing_data_character must be a single character")
|
|
5782
|
+
|
|
5783
|
+
# Determine the reference sequence for the whole tree sequence
|
|
5784
|
+
full_ref = None
|
|
5785
|
+
if reference_sequence is not None:
|
|
5786
|
+
full_ref = reference_sequence
|
|
5787
|
+
elif self.has_reference_sequence():
|
|
5788
|
+
# This may be inefficient - see #1989. However, since we're
|
|
5789
|
+
# making n copies of the reference sequence anyway, this is a relatively
|
|
5790
|
+
# minor tweak. We may also want to recode the below not to use direct
|
|
5791
|
+
# access to the .data attribute, e.g. if we allow reference sequences
|
|
5792
|
+
# to start at non-zero positions
|
|
5793
|
+
full_ref = self.reference_sequence.data
|
|
5794
|
+
|
|
5795
|
+
if full_ref is None:
|
|
5796
|
+
full_ref = missing_data_character * int(self.sequence_length)
|
|
5797
|
+
else:
|
|
5798
|
+
if len(full_ref) != int(self.sequence_length):
|
|
5727
5799
|
raise ValueError(
|
|
5728
|
-
"The reference sequence
|
|
5800
|
+
"The reference sequence must be equal to the tree sequence length"
|
|
5729
5801
|
)
|
|
5730
|
-
|
|
5731
|
-
|
|
5732
|
-
|
|
5733
|
-
|
|
5734
|
-
|
|
5735
|
-
|
|
5736
|
-
|
|
5737
|
-
|
|
5738
|
-
|
|
5739
|
-
|
|
5740
|
-
|
|
5741
|
-
|
|
5742
|
-
|
|
5743
|
-
|
|
5744
|
-
|
|
5745
|
-
|
|
5746
|
-
"The current implementation may also incorrectly identify an "
|
|
5747
|
-
"input tree sequence has having missing data."
|
|
5748
|
-
)
|
|
5749
|
-
H, (first_site_id, last_site_id) = self._haplotypes_array(
|
|
5750
|
-
interval=interval,
|
|
5751
|
-
missing_data_character=missing_data_character,
|
|
5752
|
-
samples=samples,
|
|
5802
|
+
|
|
5803
|
+
try:
|
|
5804
|
+
ref_bytes = full_ref.encode("ascii")
|
|
5805
|
+
missing_data_character.encode("ascii")
|
|
5806
|
+
except UnicodeEncodeError:
|
|
5807
|
+
raise
|
|
5808
|
+
|
|
5809
|
+
sample_ids = self.samples() if samples is None else list(samples)
|
|
5810
|
+
|
|
5811
|
+
flat = self._ll_tree_sequence.decode_alignments(
|
|
5812
|
+
ref_bytes,
|
|
5813
|
+
sample_ids,
|
|
5814
|
+
int(interval.left),
|
|
5815
|
+
int(interval.right),
|
|
5816
|
+
missing_data_character,
|
|
5817
|
+
bool(isolated_as_missing),
|
|
5753
5818
|
)
|
|
5754
|
-
|
|
5755
|
-
|
|
5756
|
-
|
|
5757
|
-
|
|
5758
|
-
|
|
5759
|
-
yield a.tobytes().decode("ascii")
|
|
5819
|
+
|
|
5820
|
+
span = int(interval.span)
|
|
5821
|
+
for j in range(len(sample_ids)):
|
|
5822
|
+
offset = j * span
|
|
5823
|
+
yield flat[offset : offset + span].decode("ascii")
|
|
5760
5824
|
|
|
5761
5825
|
@property
|
|
5762
5826
|
def individuals_population(self):
|
|
@@ -6469,6 +6533,9 @@ class TreeSequence:
|
|
|
6469
6533
|
samples = self._ll_tree_sequence.get_samples()
|
|
6470
6534
|
keep = np.full(shape=samples.shape, fill_value=True)
|
|
6471
6535
|
if population is not None:
|
|
6536
|
+
if not isinstance(population, numbers.Integral):
|
|
6537
|
+
raise ValueError("`population` must be an integer ID")
|
|
6538
|
+
population = int(population)
|
|
6472
6539
|
sample_population = self.nodes_population[samples]
|
|
6473
6540
|
keep = np.logical_and(keep, sample_population == population)
|
|
6474
6541
|
if time is not None:
|
|
@@ -6581,13 +6648,13 @@ class TreeSequence:
|
|
|
6581
6648
|
to the sites in the tree sequence object.
|
|
6582
6649
|
|
|
6583
6650
|
.. note::
|
|
6584
|
-
|
|
6585
|
-
|
|
6586
|
-
|
|
6587
|
-
|
|
6588
|
-
|
|
6589
|
-
|
|
6590
|
-
|
|
6651
|
+
Older code often uses the ``ploidy=2`` argument, because old
|
|
6652
|
+
versions of msprime did not output individual data. Specifying
|
|
6653
|
+
individuals in the tree sequence is more robust, and since tree
|
|
6654
|
+
sequences now typically contain individuals (e.g., as produced by
|
|
6655
|
+
``msprime.sim_ancestry()``), this is not necessary, and the
|
|
6656
|
+
``ploidy`` argument can safely be removed as part of the process
|
|
6657
|
+
of updating from the msprime 0.x legacy API.
|
|
6591
6658
|
|
|
6592
6659
|
:param io.IOBase output: The file-like object to write the VCF output.
|
|
6593
6660
|
:param int ploidy: The ploidy of the individuals to be written to
|
|
@@ -6672,6 +6739,7 @@ class TreeSequence:
|
|
|
6672
6739
|
wrap_width=60,
|
|
6673
6740
|
reference_sequence=None,
|
|
6674
6741
|
missing_data_character=None,
|
|
6742
|
+
isolated_as_missing=None,
|
|
6675
6743
|
):
|
|
6676
6744
|
"""
|
|
6677
6745
|
Writes the :meth:`.alignments` for this tree sequence to file in
|
|
@@ -6696,12 +6764,6 @@ class TreeSequence:
|
|
|
6696
6764
|
|
|
6697
6765
|
ts.write_fasta("output.fa")
|
|
6698
6766
|
|
|
6699
|
-
.. warning:: :ref:`Missing data<sec_data_model_missing_data>` is not
|
|
6700
|
-
currently supported by this method and it will raise a ValueError
|
|
6701
|
-
if called on tree sequences containing isolated samples.
|
|
6702
|
-
See https://github.com/tskit-dev/tskit/issues/1896 for more
|
|
6703
|
-
information.
|
|
6704
|
-
|
|
6705
6767
|
:param file_or_path: The file object or path to write the output.
|
|
6706
6768
|
Paths can be either strings or :class:`python:pathlib.Path` objects.
|
|
6707
6769
|
:param int wrap_width: The number of sequence
|
|
@@ -6710,6 +6772,7 @@ class TreeSequence:
|
|
|
6710
6772
|
(Default=60).
|
|
6711
6773
|
:param str reference_sequence: As for the :meth:`.alignments` method.
|
|
6712
6774
|
:param str missing_data_character: As for the :meth:`.alignments` method.
|
|
6775
|
+
:param bool isolated_as_missing: As for the :meth:`.alignments` method.
|
|
6713
6776
|
"""
|
|
6714
6777
|
text_formats.write_fasta(
|
|
6715
6778
|
self,
|
|
@@ -6717,6 +6780,7 @@ class TreeSequence:
|
|
|
6717
6780
|
wrap_width=wrap_width,
|
|
6718
6781
|
reference_sequence=reference_sequence,
|
|
6719
6782
|
missing_data_character=missing_data_character,
|
|
6783
|
+
isolated_as_missing=isolated_as_missing,
|
|
6720
6784
|
)
|
|
6721
6785
|
|
|
6722
6786
|
def as_fasta(self, **kwargs):
|
|
@@ -6740,6 +6804,7 @@ class TreeSequence:
|
|
|
6740
6804
|
include_alignments=None,
|
|
6741
6805
|
reference_sequence=None,
|
|
6742
6806
|
missing_data_character=None,
|
|
6807
|
+
isolated_as_missing=None,
|
|
6743
6808
|
):
|
|
6744
6809
|
"""
|
|
6745
6810
|
Returns a `nexus encoding <https://en.wikipedia.org/wiki/Nexus_file>`_
|
|
@@ -6823,10 +6888,7 @@ class TreeSequence:
|
|
|
6823
6888
|
as our convention of using trees with multiple roots
|
|
6824
6889
|
is not often supported by newick parsers. Thus, the method
|
|
6825
6890
|
will raise a ValueError if we try to output trees with
|
|
6826
|
-
multiple roots.
|
|
6827
|
-
is not currently supported for alignment data.
|
|
6828
|
-
See https://github.com/tskit-dev/tskit/issues/1896 for more
|
|
6829
|
-
information.
|
|
6891
|
+
multiple roots.
|
|
6830
6892
|
|
|
6831
6893
|
.. seealso:: See also the :meth:`.as_nexus` method which will
|
|
6832
6894
|
return this nexus representation as a string.
|
|
@@ -6841,6 +6903,7 @@ class TreeSequence:
|
|
|
6841
6903
|
:param str reference_sequence: As for the :meth:`.alignments` method.
|
|
6842
6904
|
:param str missing_data_character: As for the :meth:`.alignments` method,
|
|
6843
6905
|
but defaults to "?".
|
|
6906
|
+
:param bool isolated_as_missing: As for the :meth:`.alignments` method.
|
|
6844
6907
|
:return: A nexus representation of this :class:`TreeSequence`
|
|
6845
6908
|
:rtype: str
|
|
6846
6909
|
"""
|
|
@@ -6852,6 +6915,7 @@ class TreeSequence:
|
|
|
6852
6915
|
include_alignments=include_alignments,
|
|
6853
6916
|
reference_sequence=reference_sequence,
|
|
6854
6917
|
missing_data_character=missing_data_character,
|
|
6918
|
+
isolated_as_missing=isolated_as_missing,
|
|
6855
6919
|
)
|
|
6856
6920
|
|
|
6857
6921
|
def as_nexus(self, **kwargs):
|
|
@@ -7198,19 +7262,32 @@ class TreeSequence:
|
|
|
7198
7262
|
self, *args, node_mappings=None, record_provenance=True, add_populations=None
|
|
7199
7263
|
):
|
|
7200
7264
|
r"""
|
|
7201
|
-
Concatenate a set of tree sequences to the right of this one, by
|
|
7202
|
-
|
|
7203
|
-
|
|
7204
|
-
|
|
7205
|
-
|
|
7265
|
+
Concatenate a set of tree sequences to the right of this one, by shifting
|
|
7266
|
+
their coordinate systems and adding all edges, sites, mutations, and
|
|
7267
|
+
any additional nodes, individuals, or populations needed for these.
|
|
7268
|
+
Concretely, to concatenate an ``other`` tree sequence to ``self``, the value
|
|
7269
|
+
of ``self.sequence_length`` is added to all genomic coordinates in ``other``,
|
|
7270
|
+
and then the concatenated tree sequence will contain all edges, sites, and
|
|
7271
|
+
mutations in both. Which nodes in ``other`` are treated as "new", and hence
|
|
7272
|
+
added as well, is controlled by ``node_mappings``. Any individuals to which
|
|
7273
|
+
new nodes belong are added as well.
|
|
7274
|
+
|
|
7275
|
+
The method uses :meth:`.shift` followed by :meth:`.union`, with
|
|
7276
|
+
``all_mutations=True``, ``all_edges=True``, and ``check_shared_equality=False``.
|
|
7277
|
+
|
|
7278
|
+
By default, the samples in current and input tree sequences are assumed to
|
|
7279
|
+
refer to the same nodes, and are matched based on the numerical order of
|
|
7280
|
+
sample node IDs; all other nodes are assumed to be new. This can be
|
|
7281
|
+
changed by providing explicit ``node_mappings`` for each input tree sequence
|
|
7282
|
+
(see below).
|
|
7206
7283
|
|
|
7207
7284
|
.. note::
|
|
7208
|
-
To add gaps between the concatenated
|
|
7209
|
-
to remove gaps, use :meth:`trim` before concatenating.
|
|
7285
|
+
To add gaps between the concatenated tree sequences, use :meth:`shift`
|
|
7286
|
+
or to remove gaps, use :meth:`trim` before concatenating.
|
|
7210
7287
|
|
|
7211
7288
|
:param TreeSequence \*args: A list of other tree sequences to append to
|
|
7212
7289
|
the right of this one.
|
|
7213
|
-
:param Union[list, None] node_mappings:
|
|
7290
|
+
:param Union[list, None] node_mappings: A list of node mappings for each
|
|
7214
7291
|
input tree sequence in ``args``. Each should either be an array of
|
|
7215
7292
|
integers of the same length as the number of nodes in the equivalent
|
|
7216
7293
|
input tree sequence (see :meth:`~TreeSequence.union` for details), or
|
|
@@ -7252,6 +7329,8 @@ class TreeSequence:
|
|
|
7252
7329
|
other_tables,
|
|
7253
7330
|
node_mapping=node_mapping,
|
|
7254
7331
|
check_shared_equality=False, # Else checks fail with internal samples
|
|
7332
|
+
all_mutations=True,
|
|
7333
|
+
all_edges=True,
|
|
7255
7334
|
record_provenance=False,
|
|
7256
7335
|
add_populations=add_populations,
|
|
7257
7336
|
)
|
|
@@ -7340,7 +7419,7 @@ class TreeSequence:
|
|
|
7340
7419
|
is its associated ``time`` value, or the time of its node if the
|
|
7341
7420
|
mutation's time was marked as unknown (:data:`UNKNOWN_TIME`).
|
|
7342
7421
|
|
|
7343
|
-
Migrations are not supported, and a LibraryError will be
|
|
7422
|
+
Migrations are not supported, and a LibraryError will be raised if
|
|
7344
7423
|
called on a tree sequence containing migration information.
|
|
7345
7424
|
|
|
7346
7425
|
.. seealso:: This method is implemented using the :meth:`.split_edges`
|
|
@@ -7376,7 +7455,9 @@ class TreeSequence:
|
|
|
7376
7455
|
`n` to `c` are extended, and the span of the edge from `p` to `c` is
|
|
7377
7456
|
reduced. Thus, the ancestral haplotype represented by `n` is extended
|
|
7378
7457
|
to a longer span of the genome. However, any edges whose child node is
|
|
7379
|
-
a sample are not modified.
|
|
7458
|
+
a sample are not modified. See
|
|
7459
|
+
`Fritze et al. (2025) <https://doi.org/10.1093/genetics/iyaf198>`_
|
|
7460
|
+
for more details.
|
|
7380
7461
|
|
|
7381
7462
|
Since some edges may be removed entirely, this process usually reduces
|
|
7382
7463
|
the number of edges in the tree sequence.
|
|
@@ -7399,15 +7480,15 @@ class TreeSequence:
|
|
|
7399
7480
|
known mutation times. See :meth:`.impute_unknown_mutations_time` if
|
|
7400
7481
|
mutation times are not known.
|
|
7401
7482
|
|
|
7402
|
-
|
|
7403
|
-
|
|
7404
|
-
|
|
7405
|
-
|
|
7406
|
-
|
|
7483
|
+
.. note::
|
|
7484
|
+
The method will not affect the marginal trees (so, if the original tree
|
|
7485
|
+
sequence was simplified, then following up with `simplify` will recover
|
|
7486
|
+
the original tree sequence, possibly with edges in a different order).
|
|
7487
|
+
It will also not affect the genotype matrix, or any of the tables other
|
|
7488
|
+
than the edge table or the node column in the mutation table.
|
|
7407
7489
|
|
|
7408
|
-
:param int
|
|
7490
|
+
:param int max_iter: The maximum number of iterations over the tree
|
|
7409
7491
|
sequence. Defaults to 10.
|
|
7410
|
-
|
|
7411
7492
|
:return: A new tree sequence with unary nodes extended.
|
|
7412
7493
|
:rtype: tskit.TreeSequence
|
|
7413
7494
|
"""
|
|
@@ -7432,11 +7513,15 @@ class TreeSequence:
|
|
|
7432
7513
|
the ancestry of these nodes - for that, see :meth:`.simplify`.
|
|
7433
7514
|
|
|
7434
7515
|
This has the side effect that it may change the order of the nodes,
|
|
7435
|
-
|
|
7436
|
-
in the new tree sequence will be in the order provided in ``nodes
|
|
7437
|
-
|
|
7438
|
-
|
|
7439
|
-
|
|
7516
|
+
populations, individuals, and migrations in the tree sequence. Nodes
|
|
7517
|
+
in the new tree sequence will be in the order provided in ``nodes``.
|
|
7518
|
+
Populations will be ordered in ascending order of the lowest ID of
|
|
7519
|
+
the nodes that refer to them. Individuals will not only be ordered
|
|
7520
|
+
so that :attr:`~Individual.parents` come before children (see
|
|
7521
|
+
:meth:`~TableCollection.sort_individuals`) but in addition
|
|
7522
|
+
will be secondarily sorted in ascending order of the lowest ID of
|
|
7523
|
+
their referring nodes. (However, ``reorder_populations`` may be set
|
|
7524
|
+
to ``False`` to keep the population table unchanged.)
|
|
7440
7525
|
|
|
7441
7526
|
By default, the method removes all individuals and populations not
|
|
7442
7527
|
referenced by any nodes, and all sites not referenced by any mutations.
|
|
@@ -7480,6 +7565,9 @@ class TreeSequence:
|
|
|
7480
7565
|
check_shared_equality=True,
|
|
7481
7566
|
add_populations=True,
|
|
7482
7567
|
record_provenance=True,
|
|
7568
|
+
*,
|
|
7569
|
+
all_edges=False,
|
|
7570
|
+
all_mutations=False,
|
|
7483
7571
|
):
|
|
7484
7572
|
"""
|
|
7485
7573
|
Returns an expanded tree sequence which contains the node-wise union of
|
|
@@ -7495,8 +7583,8 @@ class TreeSequence:
|
|
|
7495
7583
|
1. Individuals whose nodes are new to ``self``.
|
|
7496
7584
|
2. Edges whose parent or child are new to ``self``.
|
|
7497
7585
|
3. Mutations whose nodes are new to ``self``.
|
|
7498
|
-
4. Sites
|
|
7499
|
-
added mutation.
|
|
7586
|
+
4. Sites whose positions are not present in the site positions in
|
|
7587
|
+
``self``, if the site contains a newly added mutation.
|
|
7500
7588
|
|
|
7501
7589
|
This can be thought of as a "node-wise" union: for instance, it can not
|
|
7502
7590
|
be used to add new edges between two nodes already in ``self`` or new
|
|
@@ -7513,17 +7601,47 @@ class TreeSequence:
|
|
|
7513
7601
|
nodes are in entirely new populations, then you must set up the
|
|
7514
7602
|
population table first, and then union with ``add_populations=False``.
|
|
7515
7603
|
|
|
7516
|
-
|
|
7517
|
-
|
|
7518
|
-
|
|
7604
|
+
This method makes sense if the "shared" portions of the tree sequences
|
|
7605
|
+
are equal; the option ``check_shared_equality`` performs a consistency
|
|
7606
|
+
check that this is true. If this check is disabled, it is very easy to
|
|
7607
|
+
produce nonsensical results via subtle inconsistencies.
|
|
7608
|
+
|
|
7609
|
+
The behavior above can be changed by ``all_edges`` and ``all_mutations``.
|
|
7610
|
+
If ``all_edges`` is True, then all edges in ``other`` are added to
|
|
7611
|
+
``self``, instead of only edges adjacent to added nodes. If
|
|
7612
|
+
``all_mutations`` is True, then similarly all mutations in ``other``
|
|
7613
|
+
are added (not just those on added nodes); furthermore, all sites
|
|
7614
|
+
at positions without a site already present are added to ``self``.
|
|
7615
|
+
The intended use case for these options is a "disjoint" union,
|
|
7616
|
+
where for instance the two tree sequences contain information about
|
|
7617
|
+
disjoint segments of the genome (see :meth:`.concatenate`).
|
|
7618
|
+
For some such applications it may be necessary to set
|
|
7619
|
+
``check_shared_equality=False``: for instance, if ``other`` has
|
|
7620
|
+
an identical copy of the node table but no edges, then
|
|
7621
|
+
``all_mutations=True, check_shared_equality=False`` can be used
|
|
7622
|
+
to add mutations to ``self``.
|
|
7519
7623
|
|
|
7520
|
-
|
|
7521
|
-
|
|
7522
|
-
|
|
7624
|
+
.. warning::
|
|
7625
|
+
If an equivalent node is specified in ``other``, the
|
|
7626
|
+
version in ``self`` is used without checking the node
|
|
7627
|
+
properties are the same. Similarly, if the same site position
|
|
7628
|
+
is present in both ``self`` and ``other``, the version in
|
|
7629
|
+
``self`` is used without checking that site properties are
|
|
7630
|
+
the same. In these cases metadata and e.g. node times or ancestral
|
|
7631
|
+
states in ``other`` are simply ignored.
|
|
7632
|
+
|
|
7633
|
+
.. note::
|
|
7634
|
+
This operation also sorts the resulting tables, so the resulting
|
|
7635
|
+
tree sequence may not be equal to ``self`` even if nothing new
|
|
7636
|
+
was added (although it would differ only in ordering of the tables).
|
|
7523
7637
|
|
|
7524
|
-
:param
|
|
7638
|
+
:param TreeSequence other: Another tree sequence.
|
|
7525
7639
|
:param list node_mapping: An array of node IDs that relate nodes in
|
|
7526
7640
|
``other`` to nodes in ``self``.
|
|
7641
|
+
:param bool all_edges: If True, then all edges in ``other`` are added
|
|
7642
|
+
to ``self``.
|
|
7643
|
+
:param bool all_mutations: If True, then all mutations and sites in
|
|
7644
|
+
``other`` are added to ``self``.
|
|
7527
7645
|
:param bool check_shared_equality: If True, the shared portions of the
|
|
7528
7646
|
tree sequences will be checked for equality. It does so by
|
|
7529
7647
|
running :meth:`TreeSequence.subset` on both ``self`` and ``other``
|
|
@@ -7533,6 +7651,11 @@ class TreeSequence:
|
|
|
7533
7651
|
assigned new population IDs.
|
|
7534
7652
|
:param bool record_provenance: Whether to record a provenance entry
|
|
7535
7653
|
in the provenance table for this operation.
|
|
7654
|
+
:return: The union of the two tree sequences.
|
|
7655
|
+
:rtype: tskit.TreeSequence
|
|
7656
|
+
:raises: **tskit.LibraryError** -- If the resulting tree sequence is invalid
|
|
7657
|
+
(for instance, a node is specified to have two distinct
|
|
7658
|
+
parents on the same interval)
|
|
7536
7659
|
"""
|
|
7537
7660
|
tables = self.dump_tables()
|
|
7538
7661
|
other_tables = other.dump_tables()
|
|
@@ -7542,6 +7665,8 @@ class TreeSequence:
|
|
|
7542
7665
|
check_shared_equality=check_shared_equality,
|
|
7543
7666
|
add_populations=add_populations,
|
|
7544
7667
|
record_provenance=record_provenance,
|
|
7668
|
+
all_edges=all_edges,
|
|
7669
|
+
all_mutations=all_mutations,
|
|
7545
7670
|
)
|
|
7546
7671
|
return tables.tree_sequence()
|
|
7547
7672
|
|
|
@@ -8611,52 +8736,6 @@ class TreeSequence:
|
|
|
8611
8736
|
sizes = np.array(sizes, dtype=size_dtype)
|
|
8612
8737
|
return flat, sizes
|
|
8613
8738
|
|
|
8614
|
-
# def divergence_matrix(self, sample_sets, windows=None, mode="site"):
|
|
8615
|
-
# """
|
|
8616
|
-
# Finds the mean divergence between pairs of samples from each set of
|
|
8617
|
-
# samples and in each window. Returns a numpy array indexed by (window,
|
|
8618
|
-
# sample_set, sample_set). Diagonal entries are corrected so that the
|
|
8619
|
-
# value gives the mean divergence for *distinct* samples, but it is not
|
|
8620
|
-
# checked whether the sample_sets are disjoint (so offdiagonals are not
|
|
8621
|
-
# corrected). For this reason, if an element of `sample_sets` has only
|
|
8622
|
-
# one element, the corresponding diagonal will be NaN.
|
|
8623
|
-
|
|
8624
|
-
# The mean divergence between two samples is defined to be the mean: (as
|
|
8625
|
-
# a TreeStat) length of all edges separating them in the tree, or (as a
|
|
8626
|
-
# SiteStat) density of segregating sites, at a uniformly chosen position
|
|
8627
|
-
# on the genome.
|
|
8628
|
-
|
|
8629
|
-
# :param list sample_sets: A list of sets of IDs of samples.
|
|
8630
|
-
# :param iterable windows: The breakpoints of the windows (including start
|
|
8631
|
-
# and end, so has one more entry than number of windows).
|
|
8632
|
-
# :return: A list of the upper triangle of mean TMRCA values in row-major
|
|
8633
|
-
# order, including the diagonal.
|
|
8634
|
-
# """
|
|
8635
|
-
# ns = len(sample_sets)
|
|
8636
|
-
# indexes = [(i, j) for i in range(ns) for j in range(i, ns)]
|
|
8637
|
-
# x = self.divergence(sample_sets, indexes, windows, mode=mode)
|
|
8638
|
-
# nw = len(windows) - 1
|
|
8639
|
-
# A = np.ones((nw, ns, ns), dtype=float)
|
|
8640
|
-
# for w in range(nw):
|
|
8641
|
-
# k = 0
|
|
8642
|
-
# for i in range(ns):
|
|
8643
|
-
# for j in range(i, ns):
|
|
8644
|
-
# A[w, i, j] = A[w, j, i] = x[w][k]
|
|
8645
|
-
# k += 1
|
|
8646
|
-
# return A
|
|
8647
|
-
# NOTE: see older definition of divmat here, which may be useful when documenting
|
|
8648
|
-
# this function. See https://github.com/tskit-dev/tskit/issues/2781
|
|
8649
|
-
|
|
8650
|
-
# NOTE for documentation of sample_sets. We *must* use samples currently because
|
|
8651
|
-
# the normalisation for non-sample nodes is tricky. Do we normalise by the
|
|
8652
|
-
# total span of the ts where the node is 'present' in the tree? We avoid this
|
|
8653
|
-
# by insisting on sample nodes.
|
|
8654
|
-
|
|
8655
|
-
# NOTE for documentation of num_threads. Need to explain that the
|
|
8656
|
-
# its best to think of as the number of background *worker* threads.
|
|
8657
|
-
# default is to run without any worker threads. If you want to run
|
|
8658
|
-
# with all the cores on the machine, use num_threads=os.cpu_count().
|
|
8659
|
-
|
|
8660
8739
|
def divergence_matrix(
|
|
8661
8740
|
self,
|
|
8662
8741
|
sample_sets=None,
|
|
@@ -8666,6 +8745,41 @@ class TreeSequence:
|
|
|
8666
8745
|
mode=None,
|
|
8667
8746
|
span_normalise=True,
|
|
8668
8747
|
):
|
|
8748
|
+
"""
|
|
8749
|
+
Finds the matrix of pairwise :meth:`.divergence` values between groups
|
|
8750
|
+
of sample nodes. Returns a numpy array indexed by (window,
|
|
8751
|
+
sample_set, sample_set): the [k,i,j]th value of the result gives the
|
|
8752
|
+
mean divergence between pairs of samples from the i-th and j-th
|
|
8753
|
+
sample sets in the k-th window. As for :meth:`.divergence`,
|
|
8754
|
+
diagonal entries are corrected so that the
|
|
8755
|
+
value gives the mean divergence for *distinct* samples,
|
|
8756
|
+
and so diagonal entries are given by the :meth:`.diversity` of that
|
|
8757
|
+
sample set. For this reason, if an element of `sample_sets` has only
|
|
8758
|
+
one element, the corresponding :meth:`.diversity` will be NaN.
|
|
8759
|
+
However, this method will place a value of 0 in the diagonal instead of NaN
|
|
8760
|
+
in such cases; otherwise, this is equivalent to computing values with
|
|
8761
|
+
:meth:`.divergence`.
|
|
8762
|
+
However, this is (usually) more efficient than computing many
|
|
8763
|
+
pairwise values using the `indexes` argument to :meth:`.divergence`,
|
|
8764
|
+
so see :meth:`.divergence` for a description of what exactly is computed.
|
|
8765
|
+
|
|
8766
|
+
:param list sample_sets: A list of sets of IDs of samples.
|
|
8767
|
+
:param list windows: The breakpoints of the windows (including start
|
|
8768
|
+
and end, so has one more entry than number of windows).
|
|
8769
|
+
:param str mode: A string giving the "type" of the statistic to be computed
|
|
8770
|
+
(defaults to "site"; the other option is "branch").
|
|
8771
|
+
:return: An array indexed by (window, sample_set, sample_set), or if windows is
|
|
8772
|
+
`None`, an array indexed by (sample_set, sample_set).
|
|
8773
|
+
"""
|
|
8774
|
+
# NOTE for documentation of sample_sets. We *must* use samples currently because
|
|
8775
|
+
# the normalisation for non-sample nodes is tricky. Do we normalise by the
|
|
8776
|
+
# total span of the ts where the node is 'present' in the tree? We avoid this
|
|
8777
|
+
# by insisting on sample nodes.
|
|
8778
|
+
|
|
8779
|
+
# NOTE for documentation of num_threads. Need to explain that
|
|
8780
|
+
# it's best thought of as the number of background *worker* threads. The
|
|
8781
|
+
# default is to run without any worker threads. If you want to run
|
|
8782
|
+
# with all the cores on the machine, use num_threads=os.cpu_count().
|
|
8669
8783
|
windows_specified = windows is not None
|
|
8670
8784
|
windows = self.parse_windows(windows)
|
|
8671
8785
|
mode = "site" if mode is None else mode
|
|
@@ -8873,7 +8987,16 @@ class TreeSequence:
|
|
|
8873
8987
|
"""
|
|
8874
8988
|
Computes the full matrix of pairwise genetic relatedness values
|
|
8875
8989
|
between (and within) pairs of sets of nodes from ``sample_sets``.
|
|
8876
|
-
|
|
8990
|
+
Returns a numpy array indexed by (window, sample_set, sample_set):
|
|
8991
|
+
the [k,i,j]th value of the result gives the
|
|
8992
|
+
genetic relatedness between pairs of samples from the i-th and j-th
|
|
8993
|
+
sample sets in the k-th window.
|
|
8994
|
+
This is (usually) more efficient than computing many pairwise
|
|
8995
|
+
values using the `indexes` argument to :meth:`.genetic_relatedness`.
|
|
8996
|
+
Specifically, this computes :meth:`.genetic_relatedness` with
|
|
8997
|
+
``centre=True`` and ``proportion=False`` (with caveats, see below).
|
|
8998
|
+
|
|
8999
|
+
*Warning:* in some cases, this does not compute exactly the same thing as
|
|
8877
9000
|
:meth:`.genetic_relatedness`: see below for more details.
|
|
8878
9001
|
|
|
8879
9002
|
If `mode="branch"`, then the value obtained is the same as that from
|
|
@@ -8881,29 +9004,35 @@ class TreeSequence:
|
|
|
8881
9004
|
`proportion=False`. The same is true if `mode="site"` and all sites have
|
|
8882
9005
|
at most one mutation.
|
|
8883
9006
|
|
|
8884
|
-
However, if some sites have more than one mutation, the value may differ
|
|
9007
|
+
However, if some sites have more than one mutation, the value may differ
|
|
9008
|
+
from that given by :meth:`.genetic_relatedness`, although if the proportion
|
|
9009
|
+
of such sites is small, the difference will be small.
|
|
8885
9010
|
The reason is that this function (for efficiency) computes relatedness
|
|
8886
|
-
using :meth:`.
|
|
9011
|
+
using :meth:`.divergence_matrix` and the following relationship.
|
|
8887
9012
|
"Relatedness" measures the number of *shared* alleles (or branches),
|
|
8888
9013
|
while "divergence" measures the number of *non-shared* alleles (or branches).
|
|
8889
9014
|
Let :math:`T_i` be the total distance from sample :math:`i` up to the root;
|
|
8890
|
-
then if :math:`D_{ij}` is the divergence between :math:`i` and
|
|
8891
|
-
and :math:`R_{ij}` is the relatedness between :math:`i`
|
|
8892
|
-
:math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
|
|
9015
|
+
then if :math:`D_{ij}` is the branch-mode divergence between :math:`i` and
|
|
9016
|
+
:math:`j` and :math:`R_{ij}` is the branch-mode relatedness between :math:`i`
|
|
9017
|
+
and :math:`j`, then :math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
|
|
8893
9018
|
So, for any samples :math:`I`, :math:`J`, :math:`S`, :math:`T`
|
|
8894
9019
|
(that may now be random choices),
|
|
8895
9020
|
:math:`R_{IJ}-R_{IS}-R_{JT}+R_{ST} = (D_{IJ}-D_{IS}-D_{JT}+D_{ST})/ (-2)`.
|
|
8896
|
-
|
|
8897
|
-
|
|
8898
|
-
|
|
9021
|
+
This is exactly what we want for (centered) relatedness.
|
|
9022
|
+
However, this relationship does not necessarily hold for `mode="site"`:
|
|
9023
|
+
it does hold if we can treat "number of differing alleles" as distances
|
|
9024
|
+
on the tree, but this is not necessarily the case in the presence of
|
|
9025
|
+
multiple mutations.
|
|
8899
9026
|
|
|
8900
|
-
Another
|
|
9027
|
+
Another note regarding the above relationship between :math:`R` and :math:`D`
|
|
8901
9028
|
is that :meth:`.divergence` of a sample set to itself does not include
|
|
8902
9029
|
the "self" comparisons (so as to provide an unbiased estimator of a
|
|
8903
9030
|
population quantity), while the usual definition of genetic relatedness
|
|
8904
9031
|
*does* include such comparisons (to provide, for instance, an appropriate
|
|
8905
9032
|
value for prospective results beginning with only a given set of
|
|
8906
|
-
individuals).
|
|
9033
|
+
individuals). So, diagonal entries in the relatedness matrix returned here
|
|
9034
|
+
are obtained from :meth:`divergence_matrix` after first correcting
|
|
9035
|
+
diagonals to include these "self" comparisons.
|
|
8907
9036
|
|
|
8908
9037
|
:param list sample_sets: A list of lists of Node IDs, specifying the
|
|
8909
9038
|
groups of nodes to compute the statistic with.
|
|
@@ -8912,11 +9041,35 @@ class TreeSequence:
|
|
|
8912
9041
|
:param str mode: A string giving the "type" of the statistic to be computed
|
|
8913
9042
|
(defaults to "site").
|
|
8914
9043
|
:param bool span_normalise: Whether to divide the result by the span of the
|
|
8915
|
-
window (defaults to True).
|
|
8916
|
-
:return:
|
|
8917
|
-
|
|
8918
|
-
|
|
8919
|
-
|
|
9044
|
+
window (defaults to True).
|
|
9045
|
+
:return: An array indexed by (window, sample_set, sample_set), or if windows is
|
|
9046
|
+
`None`, an array indexed by (sample_set, sample_set).
|
|
9047
|
+
"""
|
|
9048
|
+
# Further notes on the relationship between relatedness (R)
|
|
9049
|
+
# and divergence (D) in mode="site":
|
|
9050
|
+
# The summary function for divergence is "p (1-q)",
|
|
9051
|
+
# where p and q are the allele frequencies in the two sample sets;
|
|
9052
|
+
# while for relatedness it is "pq". Summing across *all* alleles,
|
|
9053
|
+
# we get that relatedness plus divergence is
|
|
9054
|
+
# p1 (1-q1) + p1 q1 + ... + pk (1-qk) + pk qk = p1 + ... + pk = 1 .
|
|
9055
|
+
# This implies that
|
|
9056
|
+
# ts.divergence(..., span_normalise=False)
|
|
9057
|
+
# + ts.genetic_relatedness(..., span_normalise=False, centre=False,
|
|
9058
|
+
# proportion=False, polarised=False)
|
|
9059
|
+
# == ts.num_sites
|
|
9060
|
+
# This could be the basis for a similar relationship between R and D.
|
|
9061
|
+
# However, that relationship holds only with polarised=False, which is not
|
|
9062
|
+
# the default, or what this function does (for good reason).
|
|
9063
|
+
# So, without setting polarised=False, we have that for samples i and j,
|
|
9064
|
+
# divergence plus relatedness is equal to (something like)
|
|
9065
|
+
# the total number of sites at which both i and j are ancestral;
|
|
9066
|
+
# this depends on the samples and so does not cancel out of the centred
|
|
9067
|
+
# version. We could work through these relationships to figure out what exactly
|
|
9068
|
+
# the difference between genetic_relatedness_matrix(mode="site") and
|
|
9069
|
+
# genetic_relatedness(mode="site") is, in the general case of multiple
|
|
9070
|
+
# mutations... but that would be confusing, probably not that useful,
|
|
9071
|
+
# and the short version of all this is that "it's complicated".
|
|
9072
|
+
|
|
8920
9073
|
D = self.divergence_matrix(
|
|
8921
9074
|
sample_sets,
|
|
8922
9075
|
windows=windows,
|
|
@@ -9088,6 +9241,7 @@ class TreeSequence:
|
|
|
9088
9241
|
mode=mode,
|
|
9089
9242
|
centre=False,
|
|
9090
9243
|
nodes=indices,
|
|
9244
|
+
span_normalise=False, # <- non-default!
|
|
9091
9245
|
)[0]
|
|
9092
9246
|
x = x - x.mean(axis=0) if centre else x
|
|
9093
9247
|
|
|
@@ -9118,6 +9272,7 @@ class TreeSequence:
|
|
|
9118
9272
|
mode=mode,
|
|
9119
9273
|
centre=False,
|
|
9120
9274
|
nodes=samples,
|
|
9275
|
+
span_normalise=False, # <- non-default!
|
|
9121
9276
|
)[0]
|
|
9122
9277
|
|
|
9123
9278
|
def bincount_fn(w):
|
|
@@ -9148,23 +9303,28 @@ class TreeSequence:
|
|
|
9148
9303
|
eigenvectors of the genetic relatedness matrix, which are obtained by a
|
|
9149
9304
|
randomized singular value decomposition (rSVD) algorithm.
|
|
9150
9305
|
|
|
9151
|
-
Concretely,
|
|
9152
|
-
|
|
9153
|
-
:
|
|
9154
|
-
between sample :math:`i` and sample :math:`j
|
|
9155
|
-
|
|
9306
|
+
Concretely, take :math:`M` as the matrix of non-span-normalised
|
|
9307
|
+
genetic relatedness values, for instance obtained by
|
|
9308
|
+
setting :math:`M_{ij}` to be the :meth:`~.TreeSequence.genetic_relatedness`
|
|
9309
|
+
between sample :math:`i` and sample :math:`j` with the specified ``mode``,
|
|
9310
|
+
``proportion=False`` and ``span_normalise=False``. Then by default this
|
|
9311
|
+
returns the top ``num_components`` eigenvectors of :math:`M`, so that
|
|
9156
9312
|
``output.factors[i,k]`` is the position of sample `i` on the `k` th PC.
|
|
9157
|
-
If ``samples`` or ``individuals`` are provided, then this does the same
|
|
9158
|
-
except with :math:`M_{ij}` either the relatedness between
|
|
9159
|
-
and ``samples[j]`` or the
|
|
9160
|
-
respectively.
|
|
9313
|
+
If ``samples`` or ``individuals`` are provided, then this does the same
|
|
9314
|
+
thing, except with :math:`M_{ij}` either the relatedness between
|
|
9315
|
+
``samples[i]`` and ``samples[j]`` or the average relatedness between the
|
|
9316
|
+
nodes of ``individuals[i]`` and ``individuals[j]``, respectively.
|
|
9317
|
+
Factors are normalized to have norm 1, i.e.,
|
|
9318
|
+
``(output.factors[:,k] ** 2).sum() == 1`` for any ``k``.
|
|
9161
9319
|
|
|
9162
9320
|
The parameters ``centre`` and ``mode`` are passed to
|
|
9163
|
-
:meth
|
|
9164
|
-
|
|
9165
|
-
If ``
|
|
9166
|
-
|
|
9167
|
-
|
|
9321
|
+
:meth:`~.TreeSequence.genetic_relatedness`: the default ``centre=True`` results
|
|
9322
|
+
in factors whose elements sum to zero; ``mode`` currently only supports the
|
|
9323
|
+
``"branch"`` setting. If ``windows`` are provided then PCA is carried out
|
|
9324
|
+
separately in each genomic window. If ``time_windows`` is provided, then genetic
|
|
9325
|
+
relatedness is measured using only ancestral material within the given time
|
|
9326
|
+
window (see :meth:`decapitate <.TreeSequence.decapitate>` for how this is
|
|
9327
|
+
defined).
|
|
9168
9328
|
|
|
9169
9329
|
So that the method scales to large tree sequences, the underlying method
|
|
9170
9330
|
relies on a randomized SVD algorithm, using
|
|
@@ -9840,7 +10000,7 @@ class TreeSequence:
|
|
|
9840
10000
|
b = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) - (n + 2) / (h * n) + g / h**2
|
|
9841
10001
|
c = h**2 + g
|
|
9842
10002
|
|
|
9843
|
-
What is computed for diversity and
|
|
10003
|
+
What is computed for diversity and segregating sites depends on ``mode``;
|
|
9844
10004
|
see those functions for more details.
|
|
9845
10005
|
|
|
9846
10006
|
:param list sample_sets: A list of lists of Node IDs, specifying the
|
|
@@ -9903,6 +10063,11 @@ class TreeSequence:
|
|
|
9903
10063
|
What is computed for diversity and divergence depends on ``mode``;
|
|
9904
10064
|
see those functions for more details.
|
|
9905
10065
|
|
|
10066
|
+
For ``mode='site'``, this definition of Fst appears as equation (6) in
|
|
10067
|
+
`Slatkin (1991) <https://doi.org/10.1017/S0016672300029827>`_, and
|
|
10068
|
+
is also found as equation (9) in
|
|
10069
|
+
`Nei (1973) <https://doi.org/10.1073/pnas.70.12.3321>`_.
|
|
10070
|
+
|
|
9906
10071
|
:param list sample_sets: A list of lists of Node IDs, specifying the
|
|
9907
10072
|
groups of nodes to compute the statistic with.
|
|
9908
10073
|
:param list indexes: A list of 2-tuples.
|
|
@@ -10324,7 +10489,8 @@ class TreeSequence:
|
|
|
10324
10489
|
|
|
10325
10490
|
For a precise mathematical definition of GNN, see https://doi.org/10.1101/458067
|
|
10326
10491
|
|
|
10327
|
-
.. note::
|
|
10492
|
+
.. note::
|
|
10493
|
+
The reference sets need not include all the samples, hence the most
|
|
10328
10494
|
recent common ancestral node of the reference sets, :math:`a`, need not be
|
|
10329
10495
|
the immediate ancestor of the focal node. If the reference sets only comprise
|
|
10330
10496
|
sequences from relatively distant individuals, the GNN statistic may end up
|
|
@@ -10436,7 +10602,7 @@ class TreeSequence:
|
|
|
10436
10602
|
represented by the tree sequence.
|
|
10437
10603
|
|
|
10438
10604
|
:param list within: A list of node IDs defining set of nodes that
|
|
10439
|
-
we
|
|
10605
|
+
we find IBD segments for. If not specified, this defaults to
|
|
10440
10606
|
all samples in the tree sequence.
|
|
10441
10607
|
:param list[list] between: A list of lists of sample node IDs. Given
|
|
10442
10608
|
two sample sets A and B, only IBD segments will be returned such
|
|
@@ -10451,7 +10617,7 @@ class TreeSequence:
|
|
|
10451
10617
|
segment) is greater than this value will be included. (Default=0)
|
|
10452
10618
|
:param bool store_pairs: If True store information separately for each
|
|
10453
10619
|
pair of samples ``(a, b)`` that are found to be IBD. Otherwise
|
|
10454
|
-
store summary information about all sample
|
|
10620
|
+
store summary information about all sample pairs. (Default=False)
|
|
10455
10621
|
:param bool store_segments: If True store each IBD segment
|
|
10456
10622
|
``(left, right, c)`` and associate it with the corresponding
|
|
10457
10623
|
sample pair ``(a, b)``. If True, implies ``store_pairs``.
|
|
@@ -10882,7 +11048,7 @@ class TreeSequence:
|
|
|
10882
11048
|
mapping is created by first checking if the tree sequence contains individuals.
|
|
10883
11049
|
If it does, the mapping is created using the individuals in the tree sequence.
|
|
10884
11050
|
By default only the sample nodes of the individuals are included in the mapping,
|
|
10885
|
-
unless
|
|
11051
|
+
unless ``include_non_sample_nodes`` is set to True, in which case all nodes
|
|
10886
11052
|
belonging to the individuals are included. Any individuals without any nodes
|
|
10887
11053
|
will have no nodes in their row of the mapping, being essentially of zero ploidy.
|
|
10888
11054
|
If no individuals are present, the mapping is created using only the sample nodes
|
|
@@ -10890,20 +11056,22 @@ class TreeSequence:
|
|
|
10890
11056
|
|
|
10891
11057
|
As the tskit data model allows non-integer positions, site positions and contig
|
|
10892
11058
|
length are transformed to integer values suitable for VCF output. The
|
|
10893
|
-
transformation is done using the
|
|
11059
|
+
transformation is done using the ``position_transform`` function, which must
|
|
10894
11060
|
return an integer numpy array the same dimension as the input. By default,
|
|
10895
11061
|
this is set to ``numpy.round()`` which will round values to the nearest integer.
|
|
10896
11062
|
|
|
10897
|
-
If neither
|
|
10898
|
-
individual names are set to "tsk_{individual_id}" for each individual. If
|
|
10899
|
-
no individuals are present, the individual names are set to "tsk_{i}" with
|
|
10900
|
-
|
|
11063
|
+
If neither ``name_metadata_key`` nor ``individual_names`` is specified, the
|
|
11064
|
+
individual names are set to ``"tsk_{individual_id}"`` for each individual. If
|
|
11065
|
+
no individuals are present, the individual names are set to ``"tsk_{i}"`` with
|
|
11066
|
+
``0 <= i < num_sample_nodes/ploidy``.
|
|
10901
11067
|
|
|
10902
|
-
A
|
|
11068
|
+
A warning is emitted if any sample nodes do not have an individual ID.
|
|
10903
11069
|
|
|
10904
11070
|
:param list individuals: Specific individual IDs to include in the VCF. If not
|
|
10905
11071
|
specified and the tree sequence contains individuals, all individuals are
|
|
10906
|
-
included at least one
|
|
11072
|
+
included that are associated with at least one sample node (or at least one of
|
|
11073
|
+
any node if ``include_non_sample_nodes`` is True), and the mapping arrays
|
|
11074
|
+
will be in ascending order of the ID of the individual in the tree sequence.
|
|
10907
11075
|
:param int ploidy: The ploidy, or number of nodes per individual. Only used when
|
|
10908
11076
|
the tree sequence does not contain individuals. Cannot be used if the tree
|
|
10909
11077
|
sequence contains individuals. Defaults to 1 if not specified.
|