tskit 1.0.0b2__cp313-cp313-win_amd64.whl → 1.0.1__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _tskit.cp313-win_amd64.pyd +0 -0
- tskit/_version.py +1 -1
- tskit/combinatorics.py +6 -0
- tskit/drawing.py +2 -4
- tskit/exceptions.py +8 -0
- tskit/genotypes.py +23 -20
- tskit/metadata.py +105 -1
- tskit/tables.py +1043 -493
- tskit/text_formats.py +4 -0
- tskit/trees.py +440 -261
- tskit/util.py +6 -7
- {tskit-1.0.0b2.dist-info → tskit-1.0.1.dist-info}/METADATA +8 -8
- tskit-1.0.1.dist-info/RECORD +27 -0
- {tskit-1.0.0b2.dist-info → tskit-1.0.1.dist-info}/WHEEL +1 -1
- tskit-1.0.0b2.dist-info/RECORD +0 -27
- {tskit-1.0.0b2.dist-info → tskit-1.0.1.dist-info}/entry_points.txt +0 -0
- {tskit-1.0.0b2.dist-info → tskit-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {tskit-1.0.0b2.dist-info → tskit-1.0.1.dist-info}/top_level.txt +0 -0
tskit/trees.py
CHANGED
|
@@ -370,7 +370,11 @@ class Site(util.Dataclass):
|
|
|
370
370
|
mutations: np.ndarray
|
|
371
371
|
"""
|
|
372
372
|
The list of mutations at this site. Mutations within a site are returned in the
|
|
373
|
-
|
|
373
|
+
|
|
374
|
+
order they are specified in the underlying :class:`MutationTable`. For canonical
|
|
375
|
+
(i.e., valid) tables, this means ancestral mutations precede their descendants, so
|
|
376
|
+
older mutations (as defined by the canonical mutation ordering; see
|
|
377
|
+
:ref:`sec_mutation_requirements`) appear before younger ones.
|
|
374
378
|
"""
|
|
375
379
|
metadata: bytes | dict | None
|
|
376
380
|
"""
|
|
@@ -571,8 +575,8 @@ class Migration(util.Dataclass):
|
|
|
571
575
|
"""
|
|
572
576
|
id: int # noqa A003
|
|
573
577
|
"""
|
|
574
|
-
The integer ID of this
|
|
575
|
-
:attr:`TreeSequence.
|
|
578
|
+
The integer ID of this migration. Varies from 0 to
|
|
579
|
+
:attr:`TreeSequence.num_migrations` - 1.
|
|
576
580
|
"""
|
|
577
581
|
|
|
578
582
|
|
|
@@ -770,7 +774,7 @@ class Tree:
|
|
|
770
774
|
calling the :meth:`TreeSequence.trees` iterator.
|
|
771
775
|
|
|
772
776
|
:return: The root threshold.
|
|
773
|
-
:rtype:
|
|
777
|
+
:rtype: int
|
|
774
778
|
"""
|
|
775
779
|
return self._ll_tree.get_root_threshold()
|
|
776
780
|
|
|
@@ -881,7 +885,8 @@ class Tree:
|
|
|
881
885
|
|
|
882
886
|
:param float position: The position along the sequence length to
|
|
883
887
|
seek to.
|
|
884
|
-
:raises ValueError: If
|
|
888
|
+
:raises ValueError: If ``position`` is less than 0 or ``position`` is greater
|
|
889
|
+
than or equal to
|
|
885
890
|
:attr:`TreeSequence.sequence_length`.
|
|
886
891
|
"""
|
|
887
892
|
if position < 0 or position >= self.tree_sequence.sequence_length:
|
|
@@ -918,7 +923,7 @@ class Tree:
|
|
|
918
923
|
the interval :math:`[0, \\text{span})` and the :attr:`~Tree.tree_sequence`
|
|
919
924
|
from which the tree is taken will have its
|
|
920
925
|
:attr:`~tskit.TreeSequence.sequence_length` equal to ``span``.
|
|
921
|
-
:param
|
|
926
|
+
:param float branch_length: The minimum length of a branch in this tree.
|
|
922
927
|
:raises ValueError: If the given rank is out of bounds for trees
|
|
923
928
|
with ``num_leaves`` leaves.
|
|
924
929
|
"""
|
|
@@ -3593,7 +3598,7 @@ def parse_nodes(source, strict=True, encoding="utf8", base64_metadata=True, tabl
|
|
|
3593
3598
|
return table
|
|
3594
3599
|
|
|
3595
3600
|
|
|
3596
|
-
def parse_edges(source, strict=True, table=None):
|
|
3601
|
+
def parse_edges(source, strict=True, table=None, encoding="utf8", base64_metadata=True):
|
|
3597
3602
|
"""
|
|
3598
3603
|
Parse the specified file-like object containing a whitespace delimited
|
|
3599
3604
|
description of an edge table and returns the corresponding :class:`EdgeTable`
|
|
@@ -3609,6 +3614,9 @@ def parse_edges(source, strict=True, table=None):
|
|
|
3609
3614
|
False, a relaxed whitespace splitting algorithm is used.
|
|
3610
3615
|
:param EdgeTable table: If specified, write the edges into this table. If
|
|
3611
3616
|
not, create a new :class:`EdgeTable` instance and return.
|
|
3617
|
+
:param str encoding: Encoding used for text representation.
|
|
3618
|
+
:param bool base64_metadata: If True, metadata is encoded using Base64
|
|
3619
|
+
encoding; otherwise, as plain text.
|
|
3612
3620
|
"""
|
|
3613
3621
|
sep = None
|
|
3614
3622
|
if strict:
|
|
@@ -3620,6 +3628,12 @@ def parse_edges(source, strict=True, table=None):
|
|
|
3620
3628
|
right_index = header.index("right")
|
|
3621
3629
|
parent_index = header.index("parent")
|
|
3622
3630
|
children_index = header.index("child")
|
|
3631
|
+
metadata_index = None
|
|
3632
|
+
try:
|
|
3633
|
+
metadata_index = header.index("metadata")
|
|
3634
|
+
except ValueError:
|
|
3635
|
+
pass
|
|
3636
|
+
default_metadata = b""
|
|
3623
3637
|
for line in source:
|
|
3624
3638
|
tokens = line.rstrip("\n").split(sep)
|
|
3625
3639
|
if len(tokens) >= 4:
|
|
@@ -3627,8 +3641,19 @@ def parse_edges(source, strict=True, table=None):
|
|
|
3627
3641
|
right = float(tokens[right_index])
|
|
3628
3642
|
parent = int(tokens[parent_index])
|
|
3629
3643
|
children = tuple(map(int, tokens[children_index].split(",")))
|
|
3644
|
+
metadata = default_metadata
|
|
3645
|
+
if metadata_index is not None and metadata_index < len(tokens):
|
|
3646
|
+
metadata = tokens[metadata_index].encode(encoding)
|
|
3647
|
+
if base64_metadata:
|
|
3648
|
+
metadata = base64.b64decode(metadata)
|
|
3630
3649
|
for child in children:
|
|
3631
|
-
table.add_row(
|
|
3650
|
+
table.add_row(
|
|
3651
|
+
left=left,
|
|
3652
|
+
right=right,
|
|
3653
|
+
parent=parent,
|
|
3654
|
+
child=child,
|
|
3655
|
+
metadata=metadata,
|
|
3656
|
+
)
|
|
3632
3657
|
return table
|
|
3633
3658
|
|
|
3634
3659
|
|
|
@@ -4136,6 +4161,7 @@ class TreeSequence:
|
|
|
4136
4161
|
|
|
4137
4162
|
def __init__(self, ll_tree_sequence):
|
|
4138
4163
|
self._ll_tree_sequence = ll_tree_sequence
|
|
4164
|
+
self._immutable_tables = None
|
|
4139
4165
|
metadata_schema_strings = self._ll_tree_sequence.get_table_metadata_schemas()
|
|
4140
4166
|
metadata_schema_instances = {
|
|
4141
4167
|
name: metadata_module.parse_metadata_schema(
|
|
@@ -4321,21 +4347,29 @@ class TreeSequence:
|
|
|
4321
4347
|
@property
|
|
4322
4348
|
def tables(self):
|
|
4323
4349
|
"""
|
|
4324
|
-
Returns the
|
|
4325
|
-
sequence, intended for read-only access. See :meth:`.dump_tables` if you wish
|
|
4326
|
-
to modify the tables.
|
|
4350
|
+
Returns an immutable view of the tables underlying this tree sequence.
|
|
4327
4351
|
|
|
4328
|
-
|
|
4329
|
-
|
|
4330
|
-
**view** in the future. Thus, if the tables will subsequently be
|
|
4331
|
-
updated, please use the :meth:`.dump_tables` method instead as
|
|
4332
|
-
this will always return a new copy of the TableCollection.
|
|
4352
|
+
This view shares the same data as the TreeSequence (zero-copy).
|
|
4353
|
+
Use :meth:`.dump_tables` for a modifiable copy.
|
|
4333
4354
|
|
|
4334
|
-
|
|
4335
|
-
|
|
4336
|
-
|
|
4355
|
+
Note that if tskit was built with Numpy 1, this method acts as
|
|
4356
|
+
:meth:`.dump_tables` and returns a mutable TableCollection.
|
|
4357
|
+
|
|
4358
|
+
:return: An immutable view of the TableCollection underlying this tree sequence.
|
|
4337
4359
|
"""
|
|
4338
|
-
|
|
4360
|
+
if not _tskit.HAS_NUMPY_2:
|
|
4361
|
+
warnings.warn(
|
|
4362
|
+
"Immutable table views require tskit to be built against NumPy 2.0 or "
|
|
4363
|
+
"newer. Falling back to returning a mutable TableCollection.",
|
|
4364
|
+
UserWarning,
|
|
4365
|
+
stacklevel=2,
|
|
4366
|
+
)
|
|
4367
|
+
return self.dump_tables()
|
|
4368
|
+
if self._immutable_tables is None:
|
|
4369
|
+
self._immutable_tables = tables.ImmutableTableCollection(
|
|
4370
|
+
self._ll_tree_sequence
|
|
4371
|
+
)
|
|
4372
|
+
return self._immutable_tables
|
|
4339
4373
|
|
|
4340
4374
|
@property
|
|
4341
4375
|
def nbytes(self):
|
|
@@ -4359,6 +4393,22 @@ class TreeSequence:
|
|
|
4359
4393
|
self._ll_tree_sequence.dump_tables(ll_tables)
|
|
4360
4394
|
return tables.TableCollection(ll_tables=ll_tables)
|
|
4361
4395
|
|
|
4396
|
+
def link_ancestors(self, samples, ancestors):
|
|
4397
|
+
"""
|
|
4398
|
+
Equivalent to :meth:`TableCollection.link_ancestors`; see that method for full
|
|
4399
|
+
documentation and parameter semantics.
|
|
4400
|
+
|
|
4401
|
+
:param list[int] samples: Node IDs to retain as samples.
|
|
4402
|
+
:param list[int] ancestors: Node IDs to treat as ancestors.
|
|
4403
|
+
:return: An :class:`tables.EdgeTable` containing the genealogical links between
|
|
4404
|
+
the supplied ``samples`` and ``ancestors``.
|
|
4405
|
+
:rtype: tables.EdgeTable
|
|
4406
|
+
"""
|
|
4407
|
+
samples = util.safe_np_int_cast(samples, np.int32)
|
|
4408
|
+
ancestors = util.safe_np_int_cast(ancestors, np.int32)
|
|
4409
|
+
ll_edge_table = self._ll_tree_sequence.link_ancestors(samples, ancestors)
|
|
4410
|
+
return tables.EdgeTable(ll_table=ll_edge_table)
|
|
4411
|
+
|
|
4362
4412
|
def dump_text(
|
|
4363
4413
|
self,
|
|
4364
4414
|
nodes=None,
|
|
@@ -4758,7 +4808,8 @@ class TreeSequence:
|
|
|
4758
4808
|
Returns an iterable sequence of all the :ref:`nodes <sec_node_table_definition>`
|
|
4759
4809
|
in this tree sequence.
|
|
4760
4810
|
|
|
4761
|
-
.. note::
|
|
4811
|
+
.. note::
|
|
4812
|
+
Although node ids are commonly ordered by node time, this is not a
|
|
4762
4813
|
formal tree sequence requirement. If you wish to iterate over nodes in
|
|
4763
4814
|
time order, you should therefore use ``order="timeasc"`` (and wrap the
|
|
4764
4815
|
resulting sequence in the standard Python :func:`python:reversed` function
|
|
@@ -5312,13 +5363,13 @@ class TreeSequence:
|
|
|
5312
5363
|
Returns an iterator over the strings of haplotypes that result from
|
|
5313
5364
|
the trees and mutations in this tree sequence. Each haplotype string
|
|
5314
5365
|
is guaranteed to be of the same length. A tree sequence with
|
|
5315
|
-
:math:`n`
|
|
5316
|
-
``right`` will return a total
|
|
5317
|
-
strings of :math:`s` alleles concatenated together, where an allele
|
|
5366
|
+
:math:`n` requested nodes (default: the number of sample nodes) and with
|
|
5367
|
+
:math:`s` sites lying between ``left`` and ``right`` will return a total
|
|
5368
|
+
of :math:`n` strings of :math:`s` alleles concatenated together, where an allele
|
|
5318
5369
|
consists of a single ascii character (tree sequences that include alleles
|
|
5319
5370
|
which are not a single character in length, or where the character is
|
|
5320
5371
|
non-ascii, will raise an error). The first string returned is the
|
|
5321
|
-
haplotype for the first requested
|
|
5372
|
+
haplotype for the first requested node, and so on.
|
|
5322
5373
|
|
|
5323
5374
|
The alleles at each site must be represented by single byte characters,
|
|
5324
5375
|
(i.e., variants must be single nucleotide polymorphisms, or SNPs), hence
|
|
@@ -5327,8 +5378,8 @@ class TreeSequence:
|
|
|
5327
5378
|
haplotype ``h``, the value of ``h[j]`` will therefore be the observed
|
|
5328
5379
|
allelic state at site ``j``.
|
|
5329
5380
|
|
|
5330
|
-
If ``isolated_as_missing`` is True (the default), isolated
|
|
5331
|
-
mutations directly above them will be treated as
|
|
5381
|
+
If ``isolated_as_missing`` is True (the default), isolated nodes without
|
|
5382
|
+
mutations directly above them (whether samples or non-samples) will be treated as
|
|
5332
5383
|
:ref:`missing data<sec_data_model_missing_data>` and will be
|
|
5333
5384
|
represented in the string by the ``missing_data_character``. If
|
|
5334
5385
|
instead it is set to False, missing data will be assigned the ancestral state
|
|
@@ -5337,8 +5388,10 @@ class TreeSequence:
|
|
|
5337
5388
|
behaviour in versions prior to 0.2.0. Prior to 0.3.0 the `impute_missing_data`
|
|
5338
5389
|
argument controlled this behaviour.
|
|
5339
5390
|
|
|
5391
|
+
It is also possible to provide **non-sample** nodes via the ``samples``
|
|
5392
|
+
argument if you wish to output haplotypes for (e.g.) internal nodes.
|
|
5340
5393
|
See also the :meth:`.variants` iterator for site-centric access
|
|
5341
|
-
to
|
|
5394
|
+
to genotypes for the requested nodes.
|
|
5342
5395
|
|
|
5343
5396
|
.. warning::
|
|
5344
5397
|
For large datasets, this method can consume a **very large** amount of
|
|
@@ -5356,9 +5409,10 @@ class TreeSequence:
|
|
|
5356
5409
|
be used to represent missing data.
|
|
5357
5410
|
If any normal allele contains this character, an error is raised.
|
|
5358
5411
|
Default: 'N'.
|
|
5359
|
-
:param list[int] samples: The
|
|
5360
|
-
``None`` (default), return haplotypes for all the
|
|
5361
|
-
sequence, in the order given by the :meth:`.samples` method.
|
|
5412
|
+
:param list[int] samples: The node IDs for which to output haplotypes. If
|
|
5413
|
+
``None`` (default), return haplotypes for all the sample nodes in the tree
|
|
5414
|
+
sequence, in the order given by the :meth:`.samples` method. Non-sample
|
|
5415
|
+
nodes may also be provided.
|
|
5362
5416
|
:param int left: Haplotype strings will start with the first site at or after
|
|
5363
5417
|
this genomic position. If ``None`` (default) start at the first site.
|
|
5364
5418
|
:param int right: Haplotype strings will end with the last site before this
|
|
@@ -5429,9 +5483,13 @@ class TreeSequence:
|
|
|
5429
5483
|
generated; output order of genotypes in the returned variants
|
|
5430
5484
|
corresponds to the order of the samples in this list. It is also
|
|
5431
5485
|
possible to provide **non-sample** nodes as an argument here, if you
|
|
5432
|
-
wish to generate genotypes for (e.g.) internal nodes.
|
|
5433
|
-
|
|
5434
|
-
|
|
5486
|
+
wish to generate genotypes for (e.g.) internal nodes. Missingness is
|
|
5487
|
+
detected for any requested node (sample or non-sample) when
|
|
5488
|
+
``isolated_as_missing`` is True: if a node is isolated at a site (i.e.,
|
|
5489
|
+
has no parent and no children in the marginal tree) and has no mutation
|
|
5490
|
+
above it at that site, its genotype will be reported as
|
|
5491
|
+
:data:`MISSING_DATA` (-1). If ``isolated_as_missing`` is False, such
|
|
5492
|
+
nodes are assigned the site's ancestral allele index.
|
|
5435
5493
|
|
|
5436
5494
|
If isolated samples are present at a given site without mutations above them,
|
|
5437
5495
|
they are interpreted by default as
|
|
@@ -5521,19 +5579,23 @@ class TreeSequence:
|
|
|
5521
5579
|
"""
|
|
5522
5580
|
Returns an :math:`m \\times n` numpy array of the genotypes in this
|
|
5523
5581
|
tree sequence, where :math:`m` is the number of sites and :math:`n`
|
|
5524
|
-
the number of
|
|
5525
|
-
|
|
5526
|
-
|
|
5527
|
-
|
|
5528
|
-
|
|
5529
|
-
|
|
5530
|
-
|
|
5531
|
-
|
|
5532
|
-
|
|
5533
|
-
|
|
5534
|
-
|
|
5535
|
-
|
|
5536
|
-
|
|
5582
|
+
is the number of requested nodes (default: the number of sample nodes).
|
|
5583
|
+
The genotypes are the indexes into the array of ``alleles``, as
|
|
5584
|
+
described for the :class:`Variant` class.
|
|
5585
|
+
|
|
5586
|
+
It is possible to provide **non-sample** nodes via the ``samples``
|
|
5587
|
+
argument if you wish to generate genotypes for (e.g.) internal nodes.
|
|
5588
|
+
Missingness is detected for any requested node (sample or non-sample)
|
|
5589
|
+
when ``isolated_as_missing`` is True: if a node is isolated at a site
|
|
5590
|
+
(i.e., has no parent and no children in the marginal tree) and has no
|
|
5591
|
+
mutation above it at that site, its genotype will be reported as
|
|
5592
|
+
:data:`MISSING_DATA` (-1).
|
|
5593
|
+
|
|
5594
|
+
Such nodes are treated as missing data by default. If
|
|
5595
|
+
``isolated_as_missing`` is set to False, they will not be treated as
|
|
5596
|
+
missing, and will instead be assigned the ancestral state. This was the
|
|
5597
|
+
default behaviour in versions prior to 0.2.0. Prior to 0.3.0 the
|
|
5598
|
+
``impute_missing_data`` argument controlled this behaviour.
|
|
5537
5599
|
|
|
5538
5600
|
.. warning::
|
|
5539
5601
|
This method can consume a **very large** amount of memory! If
|
|
@@ -5541,10 +5603,12 @@ class TreeSequence:
|
|
|
5541
5603
|
access them sequentially using the :meth:`.variants` iterator.
|
|
5542
5604
|
|
|
5543
5605
|
:param array_like samples: An array of node IDs for which to generate
|
|
5544
|
-
genotypes
|
|
5606
|
+
genotypes. If ``None`` (default), generate genotypes for all sample
|
|
5607
|
+
nodes. Non-sample nodes may also be provided, in which case genotypes
|
|
5608
|
+
will be generated for those nodes too.
|
|
5545
5609
|
:param bool isolated_as_missing: If True, the genotype value assigned to
|
|
5546
|
-
|
|
5547
|
-
:data:`.MISSING_DATA` (-1). If False,
|
|
5610
|
+
isolated nodes without mutations (samples or non-samples) is
|
|
5611
|
+
:data:`.MISSING_DATA` (-1). If False, such nodes will be
|
|
5548
5612
|
assigned the allele index for the ancestral state.
|
|
5549
5613
|
Default: True.
|
|
5550
5614
|
:param tuple alleles: A tuple of strings describing the encoding of
|
|
@@ -5593,21 +5657,24 @@ class TreeSequence:
|
|
|
5593
5657
|
*,
|
|
5594
5658
|
reference_sequence=None,
|
|
5595
5659
|
missing_data_character=None,
|
|
5660
|
+
isolated_as_missing=None,
|
|
5596
5661
|
samples=None,
|
|
5597
5662
|
left=None,
|
|
5598
5663
|
right=None,
|
|
5599
5664
|
):
|
|
5600
5665
|
"""
|
|
5601
5666
|
Returns an iterator over the full sequence alignments for the defined samples
|
|
5602
|
-
in this tree sequence. Each alignment ``a`` is a string of length
|
|
5603
|
-
the first character is the genomic sequence at the ``start``
|
|
5604
|
-
genome (defaulting to 0) and the last character is the
|
|
5605
|
-
position before the ``stop`` value (defaulting to the
|
|
5606
|
-
of this tree sequence, which must have
|
|
5607
|
-
By default ``L`` is therefore equal
|
|
5608
|
-
and ``a[j]`` is the nucleotide value at
|
|
5609
|
-
|
|
5610
|
-
|
|
5667
|
+
in this tree sequence. Each yielded alignment ``a`` is a string of length
|
|
5668
|
+
``L`` where the first character is the genomic sequence at the ``start``
|
|
5669
|
+
position in the genome (defaulting to 0) and the last character is the
|
|
5670
|
+
genomic sequence one position before the ``stop`` value (defaulting to the
|
|
5671
|
+
:attr:`.sequence_length` of this tree sequence, which must have
|
|
5672
|
+
:attr:`.discrete_genome` equal to True). By default ``L`` is therefore equal
|
|
5673
|
+
to the :attr:`.sequence_length`, and ``a[j]`` is the nucleotide value at
|
|
5674
|
+
genomic position ``j``.
|
|
5675
|
+
|
|
5676
|
+
.. note::
|
|
5677
|
+
This is inherently a **zero-based** representation of the sequence
|
|
5611
5678
|
coordinate space. Care will be needed when interacting with other
|
|
5612
5679
|
libraries and upstream coordinate spaces.
|
|
5613
5680
|
|
|
@@ -5656,31 +5723,44 @@ class TreeSequence:
|
|
|
5656
5723
|
single byte characters, (i.e., variants must be single nucleotide
|
|
5657
5724
|
polymorphisms, or SNPs).
|
|
5658
5725
|
|
|
5659
|
-
|
|
5660
|
-
|
|
5661
|
-
|
|
5662
|
-
|
|
5663
|
-
|
|
5726
|
+
Missing data handling
|
|
5727
|
+
|
|
5728
|
+
- If ``isolated_as_missing=True`` (default), nodes that are isolated
|
|
5729
|
+
(no parent and no children) are rendered as the missing character across
|
|
5730
|
+
each tree interval. At site positions, the per-site allele overrides the
|
|
5731
|
+
missing character; if a genotype is missing (``-1``), the missing
|
|
5732
|
+
character is retained.
|
|
5733
|
+
- If ``isolated_as_missing=False``, no missing overlay is applied. At sites,
|
|
5734
|
+
genotypes are decoded as usual; at non-sites, bases come from the
|
|
5735
|
+
reference sequence.
|
|
5664
5736
|
|
|
5665
5737
|
See also the :meth:`.variants` iterator for site-centric access
|
|
5666
5738
|
to sample genotypes and :meth:`.haplotypes` for access to sample sequences
|
|
5667
5739
|
at just the sites in the tree sequence.
|
|
5668
5740
|
|
|
5669
5741
|
:param str reference_sequence: The reference sequence to fill in
|
|
5670
|
-
gaps between sites in the alignments.
|
|
5742
|
+
gaps between sites in the alignments. If provided, it must be a
|
|
5743
|
+
string of length equal to :attr:`.sequence_length`; the sequence is
|
|
5744
|
+
sliced internally to the requested ``[left, right)`` interval.
|
|
5671
5745
|
:param str missing_data_character: A single ascii character that will
|
|
5672
5746
|
be used to represent missing data.
|
|
5673
5747
|
If any normal allele contains this character, an error is raised.
|
|
5674
5748
|
Default: 'N'.
|
|
5675
|
-
:param
|
|
5676
|
-
|
|
5677
|
-
|
|
5749
|
+
:param bool isolated_as_missing: If True, treat isolated nodes as missing
|
|
5750
|
+
across the covered tree intervals (see above). If None (default), this
|
|
5751
|
+
is treated as True.
|
|
5752
|
+
:param list[int] samples: The nodes for which to output alignments. If
|
|
5753
|
+
``None`` (default), return alignments for all sample nodes in the order
|
|
5754
|
+
given by the :meth:`.samples` method. Non-sample nodes are also supported
|
|
5755
|
+
and will be decoded at sites in the same way as samples.
|
|
5678
5756
|
:param int left: Alignments will start at this genomic position. If ``None``
|
|
5679
5757
|
(default) alignments start at 0.
|
|
5680
|
-
:param int right: Alignments will stop before this genomic position.
|
|
5681
|
-
(default) alignments will continue until the end of the
|
|
5758
|
+
:param int right: Alignments will stop before this genomic position.
|
|
5759
|
+
If ``None`` (default) alignments will continue until the end of the
|
|
5760
|
+
tree sequence.
|
|
5682
5761
|
:return: An iterator over the alignment strings for specified samples in
|
|
5683
|
-
this tree sequence, in the order given in ``samples``.
|
|
5762
|
+
this tree sequence, in the order given in ``samples``. Each string has
|
|
5763
|
+
length ``L = right - left``.
|
|
5684
5764
|
:rtype: collections.abc.Iterable
|
|
5685
5765
|
:raises ValueError: if any genome coordinate in this tree sequence is not
|
|
5686
5766
|
discrete, or if the ``reference_sequence`` is not of the correct length.
|
|
@@ -5694,60 +5774,53 @@ class TreeSequence:
|
|
|
5694
5774
|
"N" if missing_data_character is None else missing_data_character
|
|
5695
5775
|
)
|
|
5696
5776
|
|
|
5697
|
-
|
|
5698
|
-
|
|
5699
|
-
if reference_sequence is None:
|
|
5700
|
-
if self.has_reference_sequence():
|
|
5701
|
-
# This may be inefficient - see #1989. However, since we're
|
|
5702
|
-
# n copies of the reference sequence anyway, this is a relatively
|
|
5703
|
-
# minor tweak. We may also want to recode the below not to use direct
|
|
5704
|
-
# access to the .data attribute, e.g. if we allow reference sequences
|
|
5705
|
-
# to start at non-zero positions
|
|
5706
|
-
reference_sequence = self.reference_sequence.data[
|
|
5707
|
-
interval.left : interval.right
|
|
5708
|
-
]
|
|
5709
|
-
else:
|
|
5710
|
-
reference_sequence = missing_data_character * L
|
|
5777
|
+
if isolated_as_missing is None:
|
|
5778
|
+
isolated_as_missing = True
|
|
5711
5779
|
|
|
5712
|
-
if len(
|
|
5713
|
-
|
|
5714
|
-
|
|
5715
|
-
|
|
5716
|
-
|
|
5717
|
-
|
|
5780
|
+
if len(missing_data_character) != 1:
|
|
5781
|
+
raise TypeError("missing_data_character must be a single character")
|
|
5782
|
+
|
|
5783
|
+
# Determine the reference sequence for the whole tree sequence
|
|
5784
|
+
full_ref = None
|
|
5785
|
+
if reference_sequence is not None:
|
|
5786
|
+
full_ref = reference_sequence
|
|
5787
|
+
elif self.has_reference_sequence():
|
|
5788
|
+
# This may be inefficient - see #1989. However, since we're
|
|
5789
|
+
# making n copies of the reference sequence anyway, this is a relatively
|
|
5790
|
+
# minor tweak. We may also want to recode the below not to use direct
|
|
5791
|
+
# access to the .data attribute, e.g. if we allow reference sequences
|
|
5792
|
+
# to start at non-zero positions
|
|
5793
|
+
full_ref = self.reference_sequence.data
|
|
5794
|
+
|
|
5795
|
+
if full_ref is None:
|
|
5796
|
+
full_ref = missing_data_character * int(self.sequence_length)
|
|
5797
|
+
else:
|
|
5798
|
+
if len(full_ref) != int(self.sequence_length):
|
|
5718
5799
|
raise ValueError(
|
|
5719
|
-
"The reference sequence
|
|
5800
|
+
"The reference sequence must be equal to the tree sequence length"
|
|
5720
5801
|
)
|
|
5721
|
-
|
|
5722
|
-
|
|
5723
|
-
|
|
5724
|
-
|
|
5725
|
-
|
|
5726
|
-
|
|
5727
|
-
|
|
5728
|
-
|
|
5729
|
-
|
|
5730
|
-
|
|
5731
|
-
|
|
5732
|
-
|
|
5733
|
-
|
|
5734
|
-
|
|
5735
|
-
|
|
5736
|
-
|
|
5737
|
-
"The current implementation may also incorrectly identify an "
|
|
5738
|
-
"input tree sequence has having missing data."
|
|
5739
|
-
)
|
|
5740
|
-
H, (first_site_id, last_site_id) = self._haplotypes_array(
|
|
5741
|
-
interval=interval,
|
|
5742
|
-
missing_data_character=missing_data_character,
|
|
5743
|
-
samples=samples,
|
|
5802
|
+
|
|
5803
|
+
try:
|
|
5804
|
+
ref_bytes = full_ref.encode("ascii")
|
|
5805
|
+
missing_data_character.encode("ascii")
|
|
5806
|
+
except UnicodeEncodeError:
|
|
5807
|
+
raise
|
|
5808
|
+
|
|
5809
|
+
sample_ids = self.samples() if samples is None else list(samples)
|
|
5810
|
+
|
|
5811
|
+
flat = self._ll_tree_sequence.decode_alignments(
|
|
5812
|
+
ref_bytes,
|
|
5813
|
+
sample_ids,
|
|
5814
|
+
int(interval.left),
|
|
5815
|
+
int(interval.right),
|
|
5816
|
+
missing_data_character,
|
|
5817
|
+
bool(isolated_as_missing),
|
|
5744
5818
|
)
|
|
5745
|
-
|
|
5746
|
-
|
|
5747
|
-
|
|
5748
|
-
|
|
5749
|
-
|
|
5750
|
-
yield a.tobytes().decode("ascii")
|
|
5819
|
+
|
|
5820
|
+
span = int(interval.span)
|
|
5821
|
+
for j in range(len(sample_ids)):
|
|
5822
|
+
offset = j * span
|
|
5823
|
+
yield flat[offset : offset + span].decode("ascii")
|
|
5751
5824
|
|
|
5752
5825
|
@property
|
|
5753
5826
|
def individuals_population(self):
|
|
@@ -5978,7 +6051,9 @@ class TreeSequence:
|
|
|
5978
6051
|
"The sites_ancestral_state property requires numpy 2.0 or later."
|
|
5979
6052
|
)
|
|
5980
6053
|
if self._sites_ancestral_state is None:
|
|
5981
|
-
self._sites_ancestral_state =
|
|
6054
|
+
self._sites_ancestral_state = (
|
|
6055
|
+
self._ll_tree_sequence.sites_ancestral_state_string
|
|
6056
|
+
)
|
|
5982
6057
|
return self._sites_ancestral_state
|
|
5983
6058
|
|
|
5984
6059
|
@property
|
|
@@ -6050,7 +6125,7 @@ class TreeSequence:
|
|
|
6050
6125
|
)
|
|
6051
6126
|
if self._mutations_derived_state is None:
|
|
6052
6127
|
self._mutations_derived_state = (
|
|
6053
|
-
self._ll_tree_sequence.
|
|
6128
|
+
self._ll_tree_sequence.mutations_derived_state_string
|
|
6054
6129
|
)
|
|
6055
6130
|
return self._mutations_derived_state
|
|
6056
6131
|
|
|
@@ -6098,7 +6173,7 @@ class TreeSequence:
|
|
|
6098
6173
|
)
|
|
6099
6174
|
if self._mutations_inherited_state is None:
|
|
6100
6175
|
self._mutations_inherited_state = (
|
|
6101
|
-
self._ll_tree_sequence.
|
|
6176
|
+
self._ll_tree_sequence.mutations_inherited_state_string
|
|
6102
6177
|
)
|
|
6103
6178
|
return self._mutations_inherited_state
|
|
6104
6179
|
|
|
@@ -6458,6 +6533,9 @@ class TreeSequence:
|
|
|
6458
6533
|
samples = self._ll_tree_sequence.get_samples()
|
|
6459
6534
|
keep = np.full(shape=samples.shape, fill_value=True)
|
|
6460
6535
|
if population is not None:
|
|
6536
|
+
if not isinstance(population, numbers.Integral):
|
|
6537
|
+
raise ValueError("`population` must be an integer ID")
|
|
6538
|
+
population = int(population)
|
|
6461
6539
|
sample_population = self.nodes_population[samples]
|
|
6462
6540
|
keep = np.logical_and(keep, sample_population == population)
|
|
6463
6541
|
if time is not None:
|
|
@@ -6570,13 +6648,13 @@ class TreeSequence:
|
|
|
6570
6648
|
to the sites in the tree sequence object.
|
|
6571
6649
|
|
|
6572
6650
|
.. note::
|
|
6573
|
-
|
|
6574
|
-
|
|
6575
|
-
|
|
6576
|
-
|
|
6577
|
-
|
|
6578
|
-
|
|
6579
|
-
|
|
6651
|
+
Older code often uses the ``ploidy=2`` argument, because old
|
|
6652
|
+
versions of msprime did not output individual data. Specifying
|
|
6653
|
+
individuals in the tree sequence is more robust, and since tree
|
|
6654
|
+
sequences now typically contain individuals (e.g., as produced by
|
|
6655
|
+
``msprime.sim_ancestry()``), this is not necessary, and the
|
|
6656
|
+
``ploidy`` argument can safely be removed as part of the process
|
|
6657
|
+
of updating from the msprime 0.x legacy API.
|
|
6580
6658
|
|
|
6581
6659
|
:param io.IOBase output: The file-like object to write the VCF output.
|
|
6582
6660
|
:param int ploidy: The ploidy of the individuals to be written to
|
|
@@ -6661,6 +6739,7 @@ class TreeSequence:
|
|
|
6661
6739
|
wrap_width=60,
|
|
6662
6740
|
reference_sequence=None,
|
|
6663
6741
|
missing_data_character=None,
|
|
6742
|
+
isolated_as_missing=None,
|
|
6664
6743
|
):
|
|
6665
6744
|
"""
|
|
6666
6745
|
Writes the :meth:`.alignments` for this tree sequence to file in
|
|
@@ -6685,12 +6764,6 @@ class TreeSequence:
|
|
|
6685
6764
|
|
|
6686
6765
|
ts.write_fasta("output.fa")
|
|
6687
6766
|
|
|
6688
|
-
.. warning:: :ref:`Missing data<sec_data_model_missing_data>` is not
|
|
6689
|
-
currently supported by this method and it will raise a ValueError
|
|
6690
|
-
if called on tree sequences containing isolated samples.
|
|
6691
|
-
See https://github.com/tskit-dev/tskit/issues/1896 for more
|
|
6692
|
-
information.
|
|
6693
|
-
|
|
6694
6767
|
:param file_or_path: The file object or path to write the output.
|
|
6695
6768
|
Paths can be either strings or :class:`python:pathlib.Path` objects.
|
|
6696
6769
|
:param int wrap_width: The number of sequence
|
|
@@ -6699,6 +6772,7 @@ class TreeSequence:
|
|
|
6699
6772
|
(Default=60).
|
|
6700
6773
|
:param str reference_sequence: As for the :meth:`.alignments` method.
|
|
6701
6774
|
:param str missing_data_character: As for the :meth:`.alignments` method.
|
|
6775
|
+
:param bool isolated_as_missing: As for the :meth:`.alignments` method.
|
|
6702
6776
|
"""
|
|
6703
6777
|
text_formats.write_fasta(
|
|
6704
6778
|
self,
|
|
@@ -6706,6 +6780,7 @@ class TreeSequence:
|
|
|
6706
6780
|
wrap_width=wrap_width,
|
|
6707
6781
|
reference_sequence=reference_sequence,
|
|
6708
6782
|
missing_data_character=missing_data_character,
|
|
6783
|
+
isolated_as_missing=isolated_as_missing,
|
|
6709
6784
|
)
|
|
6710
6785
|
|
|
6711
6786
|
def as_fasta(self, **kwargs):
|
|
@@ -6729,6 +6804,7 @@ class TreeSequence:
|
|
|
6729
6804
|
include_alignments=None,
|
|
6730
6805
|
reference_sequence=None,
|
|
6731
6806
|
missing_data_character=None,
|
|
6807
|
+
isolated_as_missing=None,
|
|
6732
6808
|
):
|
|
6733
6809
|
"""
|
|
6734
6810
|
Returns a `nexus encoding <https://en.wikipedia.org/wiki/Nexus_file>`_
|
|
@@ -6812,10 +6888,7 @@ class TreeSequence:
|
|
|
6812
6888
|
as our convention of using trees with multiple roots
|
|
6813
6889
|
is not often supported by newick parsers. Thus, the method
|
|
6814
6890
|
will raise a ValueError if we try to output trees with
|
|
6815
|
-
multiple roots.
|
|
6816
|
-
is not currently supported for alignment data.
|
|
6817
|
-
See https://github.com/tskit-dev/tskit/issues/1896 for more
|
|
6818
|
-
information.
|
|
6891
|
+
multiple roots.
|
|
6819
6892
|
|
|
6820
6893
|
.. seealso:: See also the :meth:`.as_nexus` method which will
|
|
6821
6894
|
return this nexus representation as a string.
|
|
@@ -6830,6 +6903,7 @@ class TreeSequence:
|
|
|
6830
6903
|
:param str reference_sequence: As for the :meth:`.alignments` method.
|
|
6831
6904
|
:param str missing_data_character: As for the :meth:`.alignments` method,
|
|
6832
6905
|
but defaults to "?".
|
|
6906
|
+
:param bool isolated_as_missing: As for the :meth:`.alignments` method.
|
|
6833
6907
|
:return: A nexus representation of this :class:`TreeSequence`
|
|
6834
6908
|
:rtype: str
|
|
6835
6909
|
"""
|
|
@@ -6841,6 +6915,7 @@ class TreeSequence:
|
|
|
6841
6915
|
include_alignments=include_alignments,
|
|
6842
6916
|
reference_sequence=reference_sequence,
|
|
6843
6917
|
missing_data_character=missing_data_character,
|
|
6918
|
+
isolated_as_missing=isolated_as_missing,
|
|
6844
6919
|
)
|
|
6845
6920
|
|
|
6846
6921
|
def as_nexus(self, **kwargs):
|
|
@@ -7187,19 +7262,32 @@ class TreeSequence:
|
|
|
7187
7262
|
self, *args, node_mappings=None, record_provenance=True, add_populations=None
|
|
7188
7263
|
):
|
|
7189
7264
|
r"""
|
|
7190
|
-
Concatenate a set of tree sequences to the right of this one, by
|
|
7191
|
-
|
|
7192
|
-
|
|
7193
|
-
|
|
7194
|
-
|
|
7265
|
+
Concatenate a set of tree sequences to the right of this one, by shifting
|
|
7266
|
+
their coordinate systems and adding all edges, sites, mutations, and
|
|
7267
|
+
any additional nodes, individuals, or populations needed for these.
|
|
7268
|
+
Concretely, to concatenate an ``other`` tree sequence to ``self``, the value
|
|
7269
|
+
of ``self.sequence_length`` is added to all genomic coordinates in ``other``,
|
|
7270
|
+
and then the concatenated tree sequence will contain all edges, sites, and
|
|
7271
|
+
mutations in both. Which nodes in ``other`` are treated as "new", and hence
|
|
7272
|
+
added as well, is controlled by ``node_mappings``. Any individuals to which
|
|
7273
|
+
new nodes belong are added as well.
|
|
7274
|
+
|
|
7275
|
+
The method uses :meth:`.shift` followed by :meth:`.union`, with
|
|
7276
|
+
``all_mutations=True``, ``all_edges=True``, and ``check_shared_equality=False``.
|
|
7277
|
+
|
|
7278
|
+
By default, the samples in current and input tree sequences are assumed to
|
|
7279
|
+
refer to the same nodes, and are matched based on the numerical order of
|
|
7280
|
+
sample node IDs; all other nodes are assumed to be new. This can be
|
|
7281
|
+
changed by providing explicit ``node_mappings`` for each input tree sequence
|
|
7282
|
+
(see below).
|
|
7195
7283
|
|
|
7196
7284
|
.. note::
|
|
7197
|
-
To add gaps between the concatenated
|
|
7198
|
-
to remove gaps, use :meth:`trim` before concatenating.
|
|
7285
|
+
To add gaps between the concatenated tree sequences, use :meth:`shift`
|
|
7286
|
+
or to remove gaps, use :meth:`trim` before concatenating.
|
|
7199
7287
|
|
|
7200
7288
|
:param TreeSequence \*args: A list of other tree sequences to append to
|
|
7201
7289
|
the right of this one.
|
|
7202
|
-
:param Union[list, None] node_mappings:
|
|
7290
|
+
:param Union[list, None] node_mappings: A list of node mappings for each
|
|
7203
7291
|
input tree sequence in ``args``. Each should either be an array of
|
|
7204
7292
|
integers of the same length as the number of nodes in the equivalent
|
|
7205
7293
|
input tree sequence (see :meth:`~TreeSequence.union` for details), or
|
|
@@ -7241,6 +7329,8 @@ class TreeSequence:
|
|
|
7241
7329
|
other_tables,
|
|
7242
7330
|
node_mapping=node_mapping,
|
|
7243
7331
|
check_shared_equality=False, # Else checks fail with internal samples
|
|
7332
|
+
all_mutations=True,
|
|
7333
|
+
all_edges=True,
|
|
7244
7334
|
record_provenance=False,
|
|
7245
7335
|
add_populations=add_populations,
|
|
7246
7336
|
)
|
|
@@ -7329,7 +7419,7 @@ class TreeSequence:
|
|
|
7329
7419
|
is its associated ``time`` value, or the time of its node if the
|
|
7330
7420
|
mutation's time was marked as unknown (:data:`UNKNOWN_TIME`).
|
|
7331
7421
|
|
|
7332
|
-
Migrations are not supported, and a LibraryError will be
|
|
7422
|
+
Migrations are not supported, and a LibraryError will be raised if
|
|
7333
7423
|
called on a tree sequence containing migration information.
|
|
7334
7424
|
|
|
7335
7425
|
.. seealso:: This method is implemented using the :meth:`.split_edges`
|
|
@@ -7365,7 +7455,9 @@ class TreeSequence:
|
|
|
7365
7455
|
`n` to `c` are extended, and the span of the edge from `p` to `c` is
|
|
7366
7456
|
reduced. Thus, the ancestral haplotype represented by `n` is extended
|
|
7367
7457
|
to a longer span of the genome. However, any edges whose child node is
|
|
7368
|
-
a sample are not modified.
|
|
7458
|
+
a sample are not modified. See
|
|
7459
|
+
`Fritze et al. (2025) <https://doi.org/10.1093/genetics/iyaf198>`_
|
|
7460
|
+
for more details.
|
|
7369
7461
|
|
|
7370
7462
|
Since some edges may be removed entirely, this process usually reduces
|
|
7371
7463
|
the number of edges in the tree sequence.
|
|
@@ -7388,15 +7480,15 @@ class TreeSequence:
|
|
|
7388
7480
|
known mutation times. See :meth:`.impute_unknown_mutations_time` if
|
|
7389
7481
|
mutation times are not known.
|
|
7390
7482
|
|
|
7391
|
-
|
|
7392
|
-
|
|
7393
|
-
|
|
7394
|
-
|
|
7395
|
-
|
|
7483
|
+
.. note::
|
|
7484
|
+
The method will not affect the marginal trees (so, if the original tree
|
|
7485
|
+
sequence was simplified, then following up with `simplify` will recover
|
|
7486
|
+
the original tree sequence, possibly with edges in a different order).
|
|
7487
|
+
It will also not affect the genotype matrix, or any of the tables other
|
|
7488
|
+
than the edge table or the node column in the mutation table.
|
|
7396
7489
|
|
|
7397
|
-
:param int
|
|
7490
|
+
:param int max_iter: The maximum number of iterations over the tree
|
|
7398
7491
|
sequence. Defaults to 10.
|
|
7399
|
-
|
|
7400
7492
|
:return: A new tree sequence with unary nodes extended.
|
|
7401
7493
|
:rtype: tskit.TreeSequence
|
|
7402
7494
|
"""
|
|
@@ -7421,11 +7513,15 @@ class TreeSequence:
|
|
|
7421
7513
|
the ancestry of these nodes - for that, see :meth:`.simplify`.
|
|
7422
7514
|
|
|
7423
7515
|
This has the side effect that it may change the order of the nodes,
|
|
7424
|
-
|
|
7425
|
-
in the new tree sequence will be in the order provided in ``nodes
|
|
7426
|
-
|
|
7427
|
-
|
|
7428
|
-
|
|
7516
|
+
populations, individuals, and migrations in the tree sequence. Nodes
|
|
7517
|
+
in the new tree sequence will be in the order provided in ``nodes``.
|
|
7518
|
+
Populations will be ordered in ascending order of the lowest ID of
|
|
7519
|
+
the nodes that refer to them. Individuals will not only be ordered
|
|
7520
|
+
so that :attr:`~Individual.parents` come before children (see
|
|
7521
|
+
:meth:`~TableCollection.sort_individuals`) but in addition
|
|
7522
|
+
will be secondarily sorted in ascending order of the lowest ID of
|
|
7523
|
+
their referring nodes. (However, ``reorder_populations`` may be set
|
|
7524
|
+
to ``False`` to keep the population table unchanged.)
|
|
7429
7525
|
|
|
7430
7526
|
By default, the method removes all individuals and populations not
|
|
7431
7527
|
referenced by any nodes, and all sites not referenced by any mutations.
|
|
@@ -7469,6 +7565,9 @@ class TreeSequence:
|
|
|
7469
7565
|
check_shared_equality=True,
|
|
7470
7566
|
add_populations=True,
|
|
7471
7567
|
record_provenance=True,
|
|
7568
|
+
*,
|
|
7569
|
+
all_edges=False,
|
|
7570
|
+
all_mutations=False,
|
|
7472
7571
|
):
|
|
7473
7572
|
"""
|
|
7474
7573
|
Returns an expanded tree sequence which contains the node-wise union of
|
|
@@ -7484,8 +7583,8 @@ class TreeSequence:
|
|
|
7484
7583
|
1. Individuals whose nodes are new to ``self``.
|
|
7485
7584
|
2. Edges whose parent or child are new to ``self``.
|
|
7486
7585
|
3. Mutations whose nodes are new to ``self``.
|
|
7487
|
-
4. Sites
|
|
7488
|
-
added mutation.
|
|
7586
|
+
4. Sites whose positions are not present in the site positions in
|
|
7587
|
+
``self``, if the site contains a newly added mutation.
|
|
7489
7588
|
|
|
7490
7589
|
This can be thought of as a "node-wise" union: for instance, it can not
|
|
7491
7590
|
be used to add new edges between two nodes already in ``self`` or new
|
|
@@ -7502,17 +7601,47 @@ class TreeSequence:
|
|
|
7502
7601
|
nodes are in entirely new populations, then you must set up the
|
|
7503
7602
|
population table first, and then union with ``add_populations=False``.
|
|
7504
7603
|
|
|
7505
|
-
|
|
7506
|
-
|
|
7507
|
-
|
|
7604
|
+
This method makes sense if the "shared" portions of the tree sequences
|
|
7605
|
+
are equal; the option ``check_shared_equality`` performs a consistency
|
|
7606
|
+
check that this is true. If this check is disabled, it is very easy to
|
|
7607
|
+
produce nonsensical results via subtle inconsistencies.
|
|
7608
|
+
|
|
7609
|
+
The behavior above can be changed by ``all_edges`` and ``all_mutations``.
|
|
7610
|
+
If ``all_edges`` is True, then all edges in ``other`` are added to
|
|
7611
|
+
``self``, instead of only edges adjacent to added nodes. If
|
|
7612
|
+
``all_mutations`` is True, then similarly all mutations in ``other``
|
|
7613
|
+
are added (not just those on added nodes); furthermore, all sites
|
|
7614
|
+
at positions without a site already present are added to ``self``.
|
|
7615
|
+
The intended use case for these options is a "disjoint" union,
|
|
7616
|
+
where for instance the two tree sequences contain information about
|
|
7617
|
+
disjoint segments of the genome (see :meth:`.concatenate`).
|
|
7618
|
+
For some such applications it may be necessary to set
|
|
7619
|
+
``check_shared_equality=False``: for instance, if ``other`` has
|
|
7620
|
+
an identical copy of the node table but no edges, then
|
|
7621
|
+
``all_mutations=True, check_shared_equality=False`` can be used
|
|
7622
|
+
to add mutations to ``self``.
|
|
7508
7623
|
|
|
7509
|
-
|
|
7510
|
-
|
|
7511
|
-
|
|
7624
|
+
.. warning::
|
|
7625
|
+
If an equivalent node is specified in ``other``, the
|
|
7626
|
+
version in ``self`` is used without checking the node
|
|
7627
|
+
properties are the same. Similarly, if the same site position
|
|
7628
|
+
is present in both ``self`` and ``other``, the version in
|
|
7629
|
+
``self`` is used without checking that site properties are
|
|
7630
|
+
the same. In these cases metadata and e.g. node times or ancestral
|
|
7631
|
+
states in ``other`` are simply ignored.
|
|
7512
7632
|
|
|
7513
|
-
|
|
7633
|
+
.. note::
|
|
7634
|
+
This operation also sorts the resulting tables, so the resulting
|
|
7635
|
+
tree sequence may not be equal to ``self`` even if nothing new
|
|
7636
|
+
was added (although it would differ only in ordering of the tables).
|
|
7637
|
+
|
|
7638
|
+
:param TreeSequence other: Another tree sequence.
|
|
7514
7639
|
:param list node_mapping: An array of node IDs that relate nodes in
|
|
7515
7640
|
``other`` to nodes in ``self``.
|
|
7641
|
+
:param bool all_edges: If True, then all edges in ``other`` are added
|
|
7642
|
+
to ``self``.
|
|
7643
|
+
:param bool all_mutations: If True, then all mutations and sites in
|
|
7644
|
+
``other`` are added to ``self``.
|
|
7516
7645
|
:param bool check_shared_equality: If True, the shared portions of the
|
|
7517
7646
|
tree sequences will be checked for equality. It does so by
|
|
7518
7647
|
running :meth:`TreeSequence.subset` on both ``self`` and ``other``
|
|
@@ -7522,6 +7651,11 @@ class TreeSequence:
|
|
|
7522
7651
|
assigned new population IDs.
|
|
7523
7652
|
:param bool record_provenance: Whether to record a provenance entry
|
|
7524
7653
|
in the provenance table for this operation.
|
|
7654
|
+
:return: The union of the two tree sequences.
|
|
7655
|
+
:rtype: tskit.TreeSequence
|
|
7656
|
+
:raises: **tskit.LibraryError** -- If the resulting tree sequence is invalid
|
|
7657
|
+
(for instance, a node is specified to have two distinct
|
|
7658
|
+
parents on the same interval)
|
|
7525
7659
|
"""
|
|
7526
7660
|
tables = self.dump_tables()
|
|
7527
7661
|
other_tables = other.dump_tables()
|
|
@@ -7531,6 +7665,8 @@ class TreeSequence:
|
|
|
7531
7665
|
check_shared_equality=check_shared_equality,
|
|
7532
7666
|
add_populations=add_populations,
|
|
7533
7667
|
record_provenance=record_provenance,
|
|
7668
|
+
all_edges=all_edges,
|
|
7669
|
+
all_mutations=all_mutations,
|
|
7534
7670
|
)
|
|
7535
7671
|
return tables.tree_sequence()
|
|
7536
7672
|
|
|
@@ -8600,52 +8736,6 @@ class TreeSequence:
|
|
|
8600
8736
|
sizes = np.array(sizes, dtype=size_dtype)
|
|
8601
8737
|
return flat, sizes
|
|
8602
8738
|
|
|
8603
|
-
# def divergence_matrix(self, sample_sets, windows=None, mode="site"):
|
|
8604
|
-
# """
|
|
8605
|
-
# Finds the mean divergence between pairs of samples from each set of
|
|
8606
|
-
# samples and in each window. Returns a numpy array indexed by (window,
|
|
8607
|
-
# sample_set, sample_set). Diagonal entries are corrected so that the
|
|
8608
|
-
# value gives the mean divergence for *distinct* samples, but it is not
|
|
8609
|
-
# checked whether the sample_sets are disjoint (so offdiagonals are not
|
|
8610
|
-
# corrected). For this reason, if an element of `sample_sets` has only
|
|
8611
|
-
# one element, the corresponding diagonal will be NaN.
|
|
8612
|
-
|
|
8613
|
-
# The mean divergence between two samples is defined to be the mean: (as
|
|
8614
|
-
# a TreeStat) length of all edges separating them in the tree, or (as a
|
|
8615
|
-
# SiteStat) density of segregating sites, at a uniformly chosen position
|
|
8616
|
-
# on the genome.
|
|
8617
|
-
|
|
8618
|
-
# :param list sample_sets: A list of sets of IDs of samples.
|
|
8619
|
-
# :param iterable windows: The breakpoints of the windows (including start
|
|
8620
|
-
# and end, so has one more entry than number of windows).
|
|
8621
|
-
# :return: A list of the upper triangle of mean TMRCA values in row-major
|
|
8622
|
-
# order, including the diagonal.
|
|
8623
|
-
# """
|
|
8624
|
-
# ns = len(sample_sets)
|
|
8625
|
-
# indexes = [(i, j) for i in range(ns) for j in range(i, ns)]
|
|
8626
|
-
# x = self.divergence(sample_sets, indexes, windows, mode=mode)
|
|
8627
|
-
# nw = len(windows) - 1
|
|
8628
|
-
# A = np.ones((nw, ns, ns), dtype=float)
|
|
8629
|
-
# for w in range(nw):
|
|
8630
|
-
# k = 0
|
|
8631
|
-
# for i in range(ns):
|
|
8632
|
-
# for j in range(i, ns):
|
|
8633
|
-
# A[w, i, j] = A[w, j, i] = x[w][k]
|
|
8634
|
-
# k += 1
|
|
8635
|
-
# return A
|
|
8636
|
-
# NOTE: see older definition of divmat here, which may be useful when documenting
|
|
8637
|
-
# this function. See https://github.com/tskit-dev/tskit/issues/2781
|
|
8638
|
-
|
|
8639
|
-
# NOTE for documentation of sample_sets. We *must* use samples currently because
|
|
8640
|
-
# the normalisation for non-sample nodes is tricky. Do we normalise by the
|
|
8641
|
-
# total span of the ts where the node is 'present' in the tree? We avoid this
|
|
8642
|
-
# by insisting on sample nodes.
|
|
8643
|
-
|
|
8644
|
-
# NOTE for documentation of num_threads. Need to explain that the
|
|
8645
|
-
# its best to think of as the number of background *worker* threads.
|
|
8646
|
-
# default is to run without any worker threads. If you want to run
|
|
8647
|
-
# with all the cores on the machine, use num_threads=os.cpu_count().
|
|
8648
|
-
|
|
8649
8739
|
def divergence_matrix(
|
|
8650
8740
|
self,
|
|
8651
8741
|
sample_sets=None,
|
|
@@ -8655,6 +8745,41 @@ class TreeSequence:
|
|
|
8655
8745
|
mode=None,
|
|
8656
8746
|
span_normalise=True,
|
|
8657
8747
|
):
|
|
8748
|
+
"""
|
|
8749
|
+
Finds the matrix of pairwise :meth:`.divergence` values between groups
|
|
8750
|
+
of sample nodes. Returns a numpy array indexed by (window,
|
|
8751
|
+
sample_set, sample_set): the [k,i,j]th value of the result gives the
|
|
8752
|
+
mean divergence between pairs of samples from the i-th and j-th
|
|
8753
|
+
sample sets in the k-th window. As for :meth:`.divergence`,
|
|
8754
|
+
diagonal entries are corrected so that the
|
|
8755
|
+
value gives the mean divergence for *distinct* samples,
|
|
8756
|
+
and so diagonal entries are given by the :meth:`.diversity` of that
|
|
8757
|
+
sample set. For this reason, if an element of `sample_sets` has only
|
|
8758
|
+
one element, the corresponding :meth:`.diversity` will be NaN.
|
|
8759
|
+
However, this method will place a value of 0 in the diagonal instead of NaN
|
|
8760
|
+
in such cases; otherwise, this is equivalent to computing values with
|
|
8761
|
+
:meth:`.divergence`.
|
|
8762
|
+
However, this is (usually) more efficient than computing many
|
|
8763
|
+
pairwise values using the `indexes` argument to :meth:`.divergence`,
|
|
8764
|
+
so see :meth:`.divergence` for a description of what exactly is computed.
|
|
8765
|
+
|
|
8766
|
+
:param list sample_sets: A list of sets of IDs of samples.
|
|
8767
|
+
:param list windows: The breakpoints of the windows (including start
|
|
8768
|
+
and end, so has one more entry than number of windows).
|
|
8769
|
+
:param str mode: A string giving the "type" of the statistic to be computed
|
|
8770
|
+
(defaults to "site"; the other option is "branch").
|
|
8771
|
+
:return: An array indexed by (window, sample_set, sample_set), or if windows is
|
|
8772
|
+
`None`, an array indexed by (sample_set, sample_set).
|
|
8773
|
+
"""
|
|
8774
|
+
# NOTE for documentation of sample_sets. We *must* use samples currently because
|
|
8775
|
+
# the normalisation for non-sample nodes is tricky. Do we normalise by the
|
|
8776
|
+
# total span of the ts where the node is 'present' in the tree? We avoid this
|
|
8777
|
+
# by insisting on sample nodes.
|
|
8778
|
+
|
|
8779
|
+
# NOTE for documentation of num_threads. Need to explain that
|
|
8780
|
+
# it's best to think of it as the number of background *worker* threads. The
|
|
8781
|
+
# default is to run without any worker threads. If you want to run
|
|
8782
|
+
# with all the cores on the machine, use num_threads=os.cpu_count().
|
|
8658
8783
|
windows_specified = windows is not None
|
|
8659
8784
|
windows = self.parse_windows(windows)
|
|
8660
8785
|
mode = "site" if mode is None else mode
|
|
@@ -8862,7 +8987,16 @@ class TreeSequence:
|
|
|
8862
8987
|
"""
|
|
8863
8988
|
Computes the full matrix of pairwise genetic relatedness values
|
|
8864
8989
|
between (and within) pairs of sets of nodes from ``sample_sets``.
|
|
8865
|
-
|
|
8990
|
+
Returns a numpy array indexed by (window, sample_set, sample_set):
|
|
8991
|
+
the [k,i,j]th value of the result gives the
|
|
8992
|
+
genetic relatedness between pairs of samples from the i-th and j-th
|
|
8993
|
+
sample sets in the k-th window.
|
|
8994
|
+
This is (usually) more efficient than computing many pairwise
|
|
8995
|
+
values using the `indexes` argument to :meth:`.genetic_relatedness`.
|
|
8996
|
+
Specifically, this computes :meth:`.genetic_relatedness` with
|
|
8997
|
+
``centre=True`` and ``proportion=False`` (with caveats, see below).
|
|
8998
|
+
|
|
8999
|
+
*Warning:* in some cases, this does not compute exactly the same thing as
|
|
8866
9000
|
:meth:`.genetic_relatedness`: see below for more details.
|
|
8867
9001
|
|
|
8868
9002
|
If `mode="branch"`, then the value obtained is the same as that from
|
|
@@ -8870,29 +9004,35 @@ class TreeSequence:
|
|
|
8870
9004
|
`proportion=False`. The same is true if `mode="site"` and all sites have
|
|
8871
9005
|
at most one mutation.
|
|
8872
9006
|
|
|
8873
|
-
However, if some sites have more than one mutation, the value may differ
|
|
9007
|
+
However, if some sites have more than one mutation, the value may differ
|
|
9008
|
+
from that given by :meth:`.genetic_relatedness`, although if the proportion
|
|
9009
|
+
of such sites is small, the difference will be small.
|
|
8874
9010
|
The reason is that this function (for efficiency) computes relatedness
|
|
8875
|
-
using :meth:`.
|
|
9011
|
+
using :meth:`.divergence_matrix` and the following relationship.
|
|
8876
9012
|
"Relatedness" measures the number of *shared* alleles (or branches),
|
|
8877
9013
|
while "divergence" measures the number of *non-shared* alleles (or branches).
|
|
8878
9014
|
Let :math:`T_i` be the total distance from sample :math:`i` up to the root;
|
|
8879
|
-
then if :math:`D_{ij}` is the divergence between :math:`i` and
|
|
8880
|
-
and :math:`R_{ij}` is the relatedness between :math:`i`
|
|
8881
|
-
:math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
|
|
9015
|
+
then if :math:`D_{ij}` is the branch-mode divergence between :math:`i` and
|
|
9016
|
+
:math:`j` and :math:`R_{ij}` is the branch-mode relatedness between :math:`i`
|
|
9017
|
+
and :math:`j`, then :math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
|
|
8882
9018
|
So, for any samples :math:`I`, :math:`J`, :math:`S`, :math:`T`
|
|
8883
9019
|
(that may now be random choices),
|
|
8884
9020
|
:math:`R_{IJ}-R_{IS}-R_{JT}+R_{ST} = (D_{IJ}-D_{IS}-D_{JT}+D_{ST})/ (-2)`.
|
|
8885
|
-
|
|
8886
|
-
|
|
8887
|
-
|
|
9021
|
+
This is exactly what we want for (centered) relatedness.
|
|
9022
|
+
However, this relationship does not necessarily hold for `mode="site"`:
|
|
9023
|
+
it does hold if we can treat "number of differing alleles" as distances
|
|
9024
|
+
on the tree, but this is not necessarily the case in the presence of
|
|
9025
|
+
multiple mutations.
|
|
8888
9026
|
|
|
8889
|
-
Another
|
|
9027
|
+
Another note regarding the above relationship between :math:`R` and :math:`D`
|
|
8890
9028
|
is that :meth:`.divergence` of a sample set to itself does not include
|
|
8891
9029
|
the "self" comparisons (so as to provide an unbiased estimator of a
|
|
8892
9030
|
population quantity), while the usual definition of genetic relatedness
|
|
8893
9031
|
*does* include such comparisons (to provide, for instance, an appropriate
|
|
8894
9032
|
value for prospective results beginning with only a given set of
|
|
8895
|
-
individuals).
|
|
9033
|
+
individuals). So, diagonal entries in the relatedness matrix returned here
|
|
9034
|
+
are obtained from :meth:`divergence_matrix` after first correcting
|
|
9035
|
+
diagonals to include these "self" comparisons.
|
|
8896
9036
|
|
|
8897
9037
|
:param list sample_sets: A list of lists of Node IDs, specifying the
|
|
8898
9038
|
groups of nodes to compute the statistic with.
|
|
@@ -8901,11 +9041,35 @@ class TreeSequence:
|
|
|
8901
9041
|
:param str mode: A string giving the "type" of the statistic to be computed
|
|
8902
9042
|
(defaults to "site").
|
|
8903
9043
|
:param bool span_normalise: Whether to divide the result by the span of the
|
|
8904
|
-
window (defaults to True).
|
|
8905
|
-
:return:
|
|
8906
|
-
|
|
8907
|
-
|
|
8908
|
-
|
|
9044
|
+
window (defaults to True).
|
|
9045
|
+
:return: An array indexed by (window, sample_set, sample_set), or if windows is
|
|
9046
|
+
`None`, an array indexed by (sample_set, sample_set).
|
|
9047
|
+
"""
|
|
9048
|
+
# Further notes on the relationship between relatedness (R)
|
|
9049
|
+
# and divergence (D) in mode="site":
|
|
9050
|
+
# The summary function for divergence is "p (1-q)",
|
|
9051
|
+
# where p and q are the allele frequencies in the two sample sets;
|
|
9052
|
+
# while for relatedness it is "pq". Summing across *all* alleles,
|
|
9053
|
+
# we get that relatedness plus divergence is
|
|
9054
|
+
# p1 (1-q1) + p1 q1 + ... + pk (1-qk) + pk qk = p1 + ... + pk = 1 .
|
|
9055
|
+
# This implies that
|
|
9056
|
+
# ts.divergence(..., span_normalise=False)
|
|
9057
|
+
# + ts.genetic_relatedness(..., span_normalise=False, centre=False,
|
|
9058
|
+
# proportion=False, polarised=False)
|
|
9059
|
+
# == ts.num_sites
|
|
9060
|
+
# This could be the basis for a similar relationship between R and D.
|
|
9061
|
+
# However, that relationship holds only with polarised=False, which is not
|
|
9062
|
+
# the default, or what this function does (for good reason).
|
|
9063
|
+
# So, without setting polarised=False, we have that for samples i and j,
|
|
9064
|
+
# divergence plus relatedness is equal to (something like)
|
|
9065
|
+
# the total number of sites at which both i and j are ancestral;
|
|
9066
|
+
# this depends on the samples and so does not cancel out of the centred
|
|
9067
|
+
# version. We could work through these relationships to figure out what exactly
|
|
9068
|
+
# the difference between genetic_relatedness_matrix(mode="site") and
|
|
9069
|
+
# genetic_relatedness(mode="site") is, in the general case of multiple
|
|
9070
|
+
# mutations... but that would be confusing, probably not that useful,
|
|
9071
|
+
# and the short version of all this is that "it's complicated".
|
|
9072
|
+
|
|
8909
9073
|
D = self.divergence_matrix(
|
|
8910
9074
|
sample_sets,
|
|
8911
9075
|
windows=windows,
|
|
@@ -9077,6 +9241,7 @@ class TreeSequence:
|
|
|
9077
9241
|
mode=mode,
|
|
9078
9242
|
centre=False,
|
|
9079
9243
|
nodes=indices,
|
|
9244
|
+
span_normalise=False, # <- non-default!
|
|
9080
9245
|
)[0]
|
|
9081
9246
|
x = x - x.mean(axis=0) if centre else x
|
|
9082
9247
|
|
|
@@ -9107,6 +9272,7 @@ class TreeSequence:
|
|
|
9107
9272
|
mode=mode,
|
|
9108
9273
|
centre=False,
|
|
9109
9274
|
nodes=samples,
|
|
9275
|
+
span_normalise=False, # <- non-default!
|
|
9110
9276
|
)[0]
|
|
9111
9277
|
|
|
9112
9278
|
def bincount_fn(w):
|
|
@@ -9137,23 +9303,28 @@ class TreeSequence:
|
|
|
9137
9303
|
eigenvectors of the genetic relatedness matrix, which are obtained by a
|
|
9138
9304
|
randomized singular value decomposition (rSVD) algorithm.
|
|
9139
9305
|
|
|
9140
|
-
Concretely,
|
|
9141
|
-
|
|
9142
|
-
:
|
|
9143
|
-
between sample :math:`i` and sample :math:`j
|
|
9144
|
-
|
|
9306
|
+
Concretely, take :math:`M` as the matrix of non-span-normalised
|
|
9307
|
+
genetic relatedness values, for instance obtained by
|
|
9308
|
+
setting :math:`M_{ij}` to be the :meth:`~.TreeSequence.genetic_relatedness`
|
|
9309
|
+
between sample :math:`i` and sample :math:`j` with the specified ``mode``,
|
|
9310
|
+
``proportion=False`` and ``span_normalise=False``. Then by default this
|
|
9311
|
+
returns the top ``num_components`` eigenvectors of :math:`M`, so that
|
|
9145
9312
|
``output.factors[i,k]`` is the position of sample `i` on the `k` th PC.
|
|
9146
|
-
If ``samples`` or ``individuals`` are provided, then this does the same
|
|
9147
|
-
except with :math:`M_{ij}` either the relatedness between
|
|
9148
|
-
and ``samples[j]`` or the
|
|
9149
|
-
respectively.
|
|
9313
|
+
If ``samples`` or ``individuals`` are provided, then this does the same
|
|
9314
|
+
thing, except with :math:`M_{ij}` either the relatedness between
|
|
9315
|
+
``samples[i]`` and ``samples[j]`` or the average relatedness between the
|
|
9316
|
+
nodes of ``individuals[i]`` and ``individuals[j]``, respectively.
|
|
9317
|
+
Factors are normalized to have norm 1, i.e.,
|
|
9318
|
+
``(output.factors[:,k] ** 2).sum() == 1`` for any ``k``.
|
|
9150
9319
|
|
|
9151
9320
|
The parameters ``centre`` and ``mode`` are passed to
|
|
9152
|
-
:meth
|
|
9153
|
-
|
|
9154
|
-
If ``
|
|
9155
|
-
|
|
9156
|
-
|
|
9321
|
+
:meth:`~.TreeSequence.genetic_relatedness`: the default ``centre=True`` results
|
|
9322
|
+
in factors whose elements sum to zero; ``mode`` currently only supports the
|
|
9323
|
+
``"branch"`` setting. If ``windows`` are provided then PCA is carried out
|
|
9324
|
+
separately in each genomic window. If ``time_windows`` is provided, then genetic
|
|
9325
|
+
relatedness is measured using only ancestral material within the given time
|
|
9326
|
+
window (see :meth:`decapitate <.TreeSequence.decapitate>` for how this is
|
|
9327
|
+
defined).
|
|
9157
9328
|
|
|
9158
9329
|
So that the method scales to large tree sequences, the underlying method
|
|
9159
9330
|
relies on a randomized SVD algorithm, using
|
|
@@ -9829,7 +10000,7 @@ class TreeSequence:
|
|
|
9829
10000
|
b = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) - (n + 2) / (h * n) + g / h**2
|
|
9830
10001
|
c = h**2 + g
|
|
9831
10002
|
|
|
9832
|
-
What is computed for diversity and
|
|
10003
|
+
What is computed for diversity and segregating sites depends on ``mode``;
|
|
9833
10004
|
see those functions for more details.
|
|
9834
10005
|
|
|
9835
10006
|
:param list sample_sets: A list of lists of Node IDs, specifying the
|
|
@@ -9892,6 +10063,11 @@ class TreeSequence:
|
|
|
9892
10063
|
What is computed for diversity and divergence depends on ``mode``;
|
|
9893
10064
|
see those functions for more details.
|
|
9894
10065
|
|
|
10066
|
+
For ``mode='site'``, this definition of Fst appears as equation (6) in
|
|
10067
|
+
`Slatkin (1991) <https://doi.org/10.1017/S0016672300029827>`_, and
|
|
10068
|
+
is also found as equation (9) in
|
|
10069
|
+
`Nei (1973) <https://doi.org/10.1073/pnas.70.12.3321>`_.
|
|
10070
|
+
|
|
9895
10071
|
:param list sample_sets: A list of lists of Node IDs, specifying the
|
|
9896
10072
|
groups of nodes to compute the statistic with.
|
|
9897
10073
|
:param list indexes: A list of 2-tuples.
|
|
@@ -10313,7 +10489,8 @@ class TreeSequence:
|
|
|
10313
10489
|
|
|
10314
10490
|
For a precise mathematical definition of GNN, see https://doi.org/10.1101/458067
|
|
10315
10491
|
|
|
10316
|
-
.. note::
|
|
10492
|
+
.. note::
|
|
10493
|
+
The reference sets need not include all the samples, hence the most
|
|
10317
10494
|
recent common ancestral node of the reference sets, :math:`a`, need not be
|
|
10318
10495
|
the immediate ancestor of the focal node. If the reference sets only comprise
|
|
10319
10496
|
sequences from relatively distant individuals, the GNN statistic may end up
|
|
@@ -10425,7 +10602,7 @@ class TreeSequence:
|
|
|
10425
10602
|
represented by the tree sequence.
|
|
10426
10603
|
|
|
10427
10604
|
:param list within: A list of node IDs defining set of nodes that
|
|
10428
|
-
we
|
|
10605
|
+
we find IBD segments for. If not specified, this defaults to
|
|
10429
10606
|
all samples in the tree sequence.
|
|
10430
10607
|
:param list[list] between: A list of lists of sample node IDs. Given
|
|
10431
10608
|
two sample sets A and B, only IBD segments will be returned such
|
|
@@ -10440,7 +10617,7 @@ class TreeSequence:
|
|
|
10440
10617
|
segment) is greater than this value will be included. (Default=0)
|
|
10441
10618
|
:param bool store_pairs: If True store information separately for each
|
|
10442
10619
|
pair of samples ``(a, b)`` that are found to be IBD. Otherwise
|
|
10443
|
-
store summary information about all sample
|
|
10620
|
+
store summary information about all sample pairs. (Default=False)
|
|
10444
10621
|
:param bool store_segments: If True store each IBD segment
|
|
10445
10622
|
``(left, right, c)`` and associate it with the corresponding
|
|
10446
10623
|
sample pair ``(a, b)``. If True, implies ``store_pairs``.
|
|
@@ -10449,7 +10626,7 @@ class TreeSequence:
|
|
|
10449
10626
|
IBD information.
|
|
10450
10627
|
:rtype: IdentitySegments
|
|
10451
10628
|
"""
|
|
10452
|
-
return self.
|
|
10629
|
+
return self.dump_tables().ibd_segments(
|
|
10453
10630
|
within=within,
|
|
10454
10631
|
between=between,
|
|
10455
10632
|
max_time=max_time,
|
|
@@ -10871,7 +11048,7 @@ class TreeSequence:
|
|
|
10871
11048
|
mapping is created by first checking if the tree sequence contains individuals.
|
|
10872
11049
|
If it does, the mapping is created using the individuals in the tree sequence.
|
|
10873
11050
|
By default only the sample nodes of the individuals are included in the mapping,
|
|
10874
|
-
unless
|
|
11051
|
+
unless ``include_non_sample_nodes`` is set to True, in which case all nodes
|
|
10875
11052
|
belonging to the individuals are included. Any individuals without any nodes
|
|
10876
11053
|
will have no nodes in their row of the mapping, being essentially of zero ploidy.
|
|
10877
11054
|
If no individuals are present, the mapping is created using only the sample nodes
|
|
@@ -10879,20 +11056,22 @@ class TreeSequence:
|
|
|
10879
11056
|
|
|
10880
11057
|
As the tskit data model allows non-integer positions, site positions and contig
|
|
10881
11058
|
length are transformed to integer values suitable for VCF output. The
|
|
10882
|
-
transformation is done using the
|
|
11059
|
+
transformation is done using the ``position_transform`` function, which must
|
|
10883
11060
|
return an integer numpy array of the same dimension as the input. By default,
|
|
10884
11061
|
this is set to ``numpy.round()`` which will round values to the nearest integer.
|
|
10885
11062
|
|
|
10886
|
-
If neither
|
|
10887
|
-
individual names are set to "tsk_{individual_id}" for each individual. If
|
|
10888
|
-
no individuals are present, the individual names are set to "tsk_{i}" with
|
|
10889
|
-
|
|
11063
|
+
If neither ``name_metadata_key`` nor ``individual_names`` is specified, the
|
|
11064
|
+
individual names are set to ``"tsk_{individual_id}"`` for each individual. If
|
|
11065
|
+
no individuals are present, the individual names are set to ``"tsk_{i}"`` with
|
|
11066
|
+
``0 <= i < num_sample_nodes/ploidy``.
|
|
10890
11067
|
|
|
10891
|
-
A
|
|
11068
|
+
A warning is emitted if any sample nodes do not have an individual ID.
|
|
10892
11069
|
|
|
10893
11070
|
:param list individuals: Specific individual IDs to include in the VCF. If not
|
|
10894
11071
|
specified and the tree sequence contains individuals, all individuals are
|
|
10895
|
-
included at least one
|
|
11072
|
+
included that are associated with at least one sample node (or at least one of
|
|
11073
|
+
any node if ``include_non_sample_nodes`` is True), and the mapping arrays
|
|
11074
|
+
will be in ascending order of the ID of the individual in the tree sequence.
|
|
10896
11075
|
:param int ploidy: The ploidy, or number of nodes per individual. Only used when
|
|
10897
11076
|
the tree sequence does not contain individuals. Cannot be used if the tree
|
|
10898
11077
|
sequence contains individuals. Defaults to 1 if not specified.
|