tskit 1.0.0b2__cp313-cp313-win_amd64.whl → 1.0.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tskit/trees.py CHANGED
@@ -370,7 +370,11 @@ class Site(util.Dataclass):
370
370
  mutations: np.ndarray
371
371
  """
372
372
  The list of mutations at this site. Mutations within a site are returned in the
373
- order they are specified in the underlying :class:`MutationTable`.
373
+
374
+ order they are specified in the underlying :class:`MutationTable`. For canonical
375
+ (i.e., valid) tables, this means ancestral mutations precede their descendants, so
376
+ older mutations (as defined by the canonical mutation ordering; see
377
+ :ref:`sec_mutation_requirements`) appear before younger ones.
374
378
  """
375
379
  metadata: bytes | dict | None
376
380
  """
@@ -571,8 +575,8 @@ class Migration(util.Dataclass):
571
575
  """
572
576
  id: int # noqa A003
573
577
  """
574
- The integer ID of this mutation. Varies from 0 to
575
- :attr:`TreeSequence.num_mutations` - 1.
578
+ The integer ID of this migration. Varies from 0 to
579
+ :attr:`TreeSequence.num_migrations` - 1.
576
580
  """
577
581
 
578
582
 
@@ -770,7 +774,7 @@ class Tree:
770
774
  calling the :meth:`TreeSequence.trees` iterator.
771
775
 
772
776
  :return: The root threshold.
773
- :rtype: :class:`TreeSequence`
777
+ :rtype: int
774
778
  """
775
779
  return self._ll_tree.get_root_threshold()
776
780
 
@@ -881,7 +885,8 @@ class Tree:
881
885
 
882
886
  :param float position: The position along the sequence length to
883
887
  seek to.
884
- :raises ValueError: If 0 < position or position >=
888
+ :raises ValueError: If ``position`` is less than 0 or ``position`` is greater
889
+ than or equal to
885
890
  :attr:`TreeSequence.sequence_length`.
886
891
  """
887
892
  if position < 0 or position >= self.tree_sequence.sequence_length:
@@ -918,7 +923,7 @@ class Tree:
918
923
  the interval :math:`[0, \\text{span})` and the :attr:`~Tree.tree_sequence`
919
924
  from which the tree is taken will have its
920
925
  :attr:`~tskit.TreeSequence.sequence_length` equal to ``span``.
921
- :param: float branch_length: The minimum length of a branch in this tree.
926
+ :param float branch_length: The minimum length of a branch in this tree.
922
927
  :raises ValueError: If the given rank is out of bounds for trees
923
928
  with ``num_leaves`` leaves.
924
929
  """
@@ -3593,7 +3598,7 @@ def parse_nodes(source, strict=True, encoding="utf8", base64_metadata=True, tabl
3593
3598
  return table
3594
3599
 
3595
3600
 
3596
- def parse_edges(source, strict=True, table=None):
3601
+ def parse_edges(source, strict=True, table=None, encoding="utf8", base64_metadata=True):
3597
3602
  """
3598
3603
  Parse the specified file-like object containing a whitespace delimited
3599
3604
  description of an edge table and returns the corresponding :class:`EdgeTable`
@@ -3609,6 +3614,9 @@ def parse_edges(source, strict=True, table=None):
3609
3614
  False, a relaxed whitespace splitting algorithm is used.
3610
3615
  :param EdgeTable table: If specified, write the edges into this table. If
3611
3616
  not, create a new :class:`EdgeTable` instance and return.
3617
+ :param str encoding: Encoding used for text representation.
3618
+ :param bool base64_metadata: If True, metadata is encoded using Base64
3619
+ encoding; otherwise, as plain text.
3612
3620
  """
3613
3621
  sep = None
3614
3622
  if strict:
@@ -3620,6 +3628,12 @@ def parse_edges(source, strict=True, table=None):
3620
3628
  right_index = header.index("right")
3621
3629
  parent_index = header.index("parent")
3622
3630
  children_index = header.index("child")
3631
+ metadata_index = None
3632
+ try:
3633
+ metadata_index = header.index("metadata")
3634
+ except ValueError:
3635
+ pass
3636
+ default_metadata = b""
3623
3637
  for line in source:
3624
3638
  tokens = line.rstrip("\n").split(sep)
3625
3639
  if len(tokens) >= 4:
@@ -3627,8 +3641,19 @@ def parse_edges(source, strict=True, table=None):
3627
3641
  right = float(tokens[right_index])
3628
3642
  parent = int(tokens[parent_index])
3629
3643
  children = tuple(map(int, tokens[children_index].split(",")))
3644
+ metadata = default_metadata
3645
+ if metadata_index is not None and metadata_index < len(tokens):
3646
+ metadata = tokens[metadata_index].encode(encoding)
3647
+ if base64_metadata:
3648
+ metadata = base64.b64decode(metadata)
3630
3649
  for child in children:
3631
- table.add_row(left=left, right=right, parent=parent, child=child)
3650
+ table.add_row(
3651
+ left=left,
3652
+ right=right,
3653
+ parent=parent,
3654
+ child=child,
3655
+ metadata=metadata,
3656
+ )
3632
3657
  return table
3633
3658
 
3634
3659
 
@@ -4136,6 +4161,7 @@ class TreeSequence:
4136
4161
 
4137
4162
  def __init__(self, ll_tree_sequence):
4138
4163
  self._ll_tree_sequence = ll_tree_sequence
4164
+ self._immutable_tables = None
4139
4165
  metadata_schema_strings = self._ll_tree_sequence.get_table_metadata_schemas()
4140
4166
  metadata_schema_instances = {
4141
4167
  name: metadata_module.parse_metadata_schema(
@@ -4321,21 +4347,29 @@ class TreeSequence:
4321
4347
  @property
4322
4348
  def tables(self):
4323
4349
  """
4324
- Returns the :class:`tables<TableCollection>` underlying this tree
4325
- sequence, intended for read-only access. See :meth:`.dump_tables` if you wish
4326
- to modify the tables.
4350
+ Returns an immutable view of the tables underlying this tree sequence.
4327
4351
 
4328
- .. warning:: This property currently returns a copy of the tables
4329
- underlying a tree sequence but it may return a read-only
4330
- **view** in the future. Thus, if the tables will subsequently be
4331
- updated, please use the :meth:`.dump_tables` method instead as
4332
- this will always return a new copy of the TableCollection.
4352
+ This view shares the same data as the TreeSequence (zero-copy).
4353
+ Use :meth:`.dump_tables` for a modifiable copy.
4333
4354
 
4334
- :return: A :class:`TableCollection` containing all a copy of the
4335
- tables underlying this tree sequence.
4336
- :rtype: TableCollection
4355
+ Note that if tskit was built with Numpy 1, this method acts as
4356
+ :meth:`.dump_tables` and returns a mutable TableCollection.
4357
+
4358
+ :return: An immutable view of the TableCollection underlying this tree sequence.
4337
4359
  """
4338
- return self.dump_tables()
4360
+ if not _tskit.HAS_NUMPY_2:
4361
+ warnings.warn(
4362
+ "Immutable table views require tskit to be built against NumPy 2.0 or "
4363
+ "newer. Falling back to returning a mutable TableCollection.",
4364
+ UserWarning,
4365
+ stacklevel=2,
4366
+ )
4367
+ return self.dump_tables()
4368
+ if self._immutable_tables is None:
4369
+ self._immutable_tables = tables.ImmutableTableCollection(
4370
+ self._ll_tree_sequence
4371
+ )
4372
+ return self._immutable_tables
4339
4373
 
4340
4374
  @property
4341
4375
  def nbytes(self):
@@ -4359,6 +4393,22 @@ class TreeSequence:
4359
4393
  self._ll_tree_sequence.dump_tables(ll_tables)
4360
4394
  return tables.TableCollection(ll_tables=ll_tables)
4361
4395
 
4396
+ def link_ancestors(self, samples, ancestors):
4397
+ """
4398
+ Equivalent to :meth:`TableCollection.link_ancestors`; see that method for full
4399
+ documentation and parameter semantics.
4400
+
4401
+ :param list[int] samples: Node IDs to retain as samples.
4402
+ :param list[int] ancestors: Node IDs to treat as ancestors.
4403
+ :return: An :class:`tables.EdgeTable` containing the genealogical links between
4404
+ the supplied ``samples`` and ``ancestors``.
4405
+ :rtype: tables.EdgeTable
4406
+ """
4407
+ samples = util.safe_np_int_cast(samples, np.int32)
4408
+ ancestors = util.safe_np_int_cast(ancestors, np.int32)
4409
+ ll_edge_table = self._ll_tree_sequence.link_ancestors(samples, ancestors)
4410
+ return tables.EdgeTable(ll_table=ll_edge_table)
4411
+
4362
4412
  def dump_text(
4363
4413
  self,
4364
4414
  nodes=None,
@@ -4758,7 +4808,8 @@ class TreeSequence:
4758
4808
  Returns an iterable sequence of all the :ref:`nodes <sec_node_table_definition>`
4759
4809
  in this tree sequence.
4760
4810
 
4761
- .. note:: Although node ids are commonly ordered by node time, this is not a
4811
+ .. note::
4812
+ Although node ids are commonly ordered by node time, this is not a
4762
4813
  formal tree sequence requirement. If you wish to iterate over nodes in
4763
4814
  time order, you should therefore use ``order="timeasc"`` (and wrap the
4764
4815
  resulting sequence in the standard Python :func:`python:reversed` function
@@ -5312,13 +5363,13 @@ class TreeSequence:
5312
5363
  Returns an iterator over the strings of haplotypes that result from
5313
5364
  the trees and mutations in this tree sequence. Each haplotype string
5314
5365
  is guaranteed to be of the same length. A tree sequence with
5315
- :math:`n` samples and with :math:`s` sites lying between ``left`` and
5316
- ``right`` will return a total of :math:`n`
5317
- strings of :math:`s` alleles concatenated together, where an allele
5366
+ :math:`n` requested nodes (default: the number of sample nodes) and with
5367
+ :math:`s` sites lying between ``left`` and ``right`` will return a total
5368
+ of :math:`n` strings of :math:`s` alleles concatenated together, where an allele
5318
5369
  consists of a single ascii character (tree sequences that include alleles
5319
5370
  which are not a single character in length, or where the character is
5320
5371
  non-ascii, will raise an error). The first string returned is the
5321
- haplotype for the first requested sample, and so on.
5372
+ haplotype for the first requested node, and so on.
5322
5373
 
5323
5374
  The alleles at each site must be represented by single byte characters,
5324
5375
  (i.e., variants must be single nucleotide polymorphisms, or SNPs), hence
@@ -5327,8 +5378,8 @@ class TreeSequence:
5327
5378
  haplotype ``h``, the value of ``h[j]`` will therefore be the observed
5328
5379
  allelic state at site ``j``.
5329
5380
 
5330
- If ``isolated_as_missing`` is True (the default), isolated samples without
5331
- mutations directly above them will be treated as
5381
+ If ``isolated_as_missing`` is True (the default), isolated nodes without
5382
+ mutations directly above them (whether samples or non-samples) will be treated as
5332
5383
  :ref:`missing data<sec_data_model_missing_data>` and will be
5333
5384
  represented in the string by the ``missing_data_character``. If
5334
5385
  instead it is set to False, missing data will be assigned the ancestral state
@@ -5337,8 +5388,10 @@ class TreeSequence:
5337
5388
  behaviour in versions prior to 0.2.0. Prior to 0.3.0 the `impute_missing_data`
5338
5389
  argument controlled this behaviour.
5339
5390
 
5391
+ It is also possible to provide **non-sample** nodes via the ``samples``
5392
+ argument if you wish to output haplotypes for (e.g.) internal nodes.
5340
5393
  See also the :meth:`.variants` iterator for site-centric access
5341
- to sample genotypes.
5394
+ to genotypes for the requested nodes.
5342
5395
 
5343
5396
  .. warning::
5344
5397
  For large datasets, this method can consume a **very large** amount of
@@ -5356,9 +5409,10 @@ class TreeSequence:
5356
5409
  be used to represent missing data.
5357
5410
  If any normal allele contains this character, an error is raised.
5358
5411
  Default: 'N'.
5359
- :param list[int] samples: The samples for which to output haplotypes. If
5360
- ``None`` (default), return haplotypes for all the samples in the tree
5361
- sequence, in the order given by the :meth:`.samples` method.
5412
+ :param list[int] samples: The node IDs for which to output haplotypes. If
5413
+ ``None`` (default), return haplotypes for all the sample nodes in the tree
5414
+ sequence, in the order given by the :meth:`.samples` method. Non-sample
5415
+ nodes may also be provided.
5362
5416
  :param int left: Haplotype strings will start with the first site at or after
5363
5417
  this genomic position. If ``None`` (default) start at the first site.
5364
5418
  :param int right: Haplotype strings will end with the last site before this
@@ -5429,9 +5483,13 @@ class TreeSequence:
5429
5483
  generated; output order of genotypes in the returned variants
5430
5484
  corresponds to the order of the samples in this list. It is also
5431
5485
  possible to provide **non-sample** nodes as an argument here, if you
5432
- wish to generate genotypes for (e.g.) internal nodes. However,
5433
- ``isolated_as_missing`` must be False in this case, as it is not
5434
- possible to detect missing data for non-sample nodes.
5486
+ wish to generate genotypes for (e.g.) internal nodes. Missingness is
5487
+ detected for any requested node (sample or non-sample) when
5488
+ ``isolated_as_missing`` is True: if a node is isolated at a site (i.e.,
5489
+ has no parent and no children in the marginal tree) and has no mutation
5490
+ above it at that site, its genotype will be reported as
5491
+ :data:`MISSING_DATA` (-1). If ``isolated_as_missing`` is False, such
5492
+ nodes are assigned the site's ancestral allele index.
5435
5493
 
5436
5494
  If isolated samples are present at a given site without mutations above them,
5437
5495
  they are interpreted by default as
@@ -5521,19 +5579,23 @@ class TreeSequence:
5521
5579
  """
5522
5580
  Returns an :math:`m \\times n` numpy array of the genotypes in this
5523
5581
  tree sequence, where :math:`m` is the number of sites and :math:`n`
5524
- the number of samples. The genotypes are the indexes into the array
5525
- of ``alleles``, as described for the :class:`Variant` class.
5526
-
5527
- If isolated samples are present at a given site without mutations above them,
5528
- they will be interpreted as :ref:`missing data<sec_data_model_missing_data>`
5529
- the genotypes array will contain a special value :data:`MISSING_DATA`
5530
- (-1) to identify these missing samples.
5531
-
5532
- Such samples are treated as missing data by default, but if
5533
- ``isolated_as_missing`` is set to to False, they will not be treated as missing,
5534
- and so assigned the ancestral state. This was the default behaviour in
5535
- versions prior to 0.2.0. Prior to 0.3.0 the `impute_missing_data`
5536
- argument controlled this behaviour.
5582
+ is the number of requested nodes (default: the number of sample nodes).
5583
+ The genotypes are the indexes into the array of ``alleles``, as
5584
+ described for the :class:`Variant` class.
5585
+
5586
+ It is possible to provide **non-sample** nodes via the ``samples``
5587
+ argument if you wish to generate genotypes for (e.g.) internal nodes.
5588
+ Missingness is detected for any requested node (sample or non-sample)
5589
+ when ``isolated_as_missing`` is True: if a node is isolated at a site
5590
+ (i.e., has no parent and no children in the marginal tree) and has no
5591
+ mutation above it at that site, its genotype will be reported as
5592
+ :data:`MISSING_DATA` (-1).
5593
+
5594
+ Such nodes are treated as missing data by default. If
5595
+ ``isolated_as_missing`` is set to False, they will not be treated as
5596
+ missing, and will instead be assigned the ancestral state. This was the
5597
+ default behaviour in versions prior to 0.2.0. Prior to 0.3.0 the
5598
+ ``impute_missing_data`` argument controlled this behaviour.
5537
5599
 
5538
5600
  .. warning::
5539
5601
  This method can consume a **very large** amount of memory! If
@@ -5541,10 +5603,12 @@ class TreeSequence:
5541
5603
  access them sequentially using the :meth:`.variants` iterator.
5542
5604
 
5543
5605
  :param array_like samples: An array of node IDs for which to generate
5544
- genotypes, or None for all sample nodes. Default: None.
5606
+ genotypes. If ``None`` (default), generate genotypes for all sample
5607
+ nodes. Non-sample nodes may also be provided, in which case genotypes
5608
+ will be generated for those nodes too.
5545
5609
  :param bool isolated_as_missing: If True, the genotype value assigned to
5546
- missing samples (i.e., isolated samples without mutations) is
5547
- :data:`.MISSING_DATA` (-1). If False, missing samples will be
5610
+ isolated nodes without mutations (samples or non-samples) is
5611
+ :data:`.MISSING_DATA` (-1). If False, such nodes will be
5548
5612
  assigned the allele index for the ancestral state.
5549
5613
  Default: True.
5550
5614
  :param tuple alleles: A tuple of strings describing the encoding of
@@ -5593,21 +5657,24 @@ class TreeSequence:
5593
5657
  *,
5594
5658
  reference_sequence=None,
5595
5659
  missing_data_character=None,
5660
+ isolated_as_missing=None,
5596
5661
  samples=None,
5597
5662
  left=None,
5598
5663
  right=None,
5599
5664
  ):
5600
5665
  """
5601
5666
  Returns an iterator over the full sequence alignments for the defined samples
5602
- in this tree sequence. Each alignment ``a`` is a string of length ``L`` where
5603
- the first character is the genomic sequence at the ``start`` position in the
5604
- genome (defaulting to 0) and the last character is the genomic sequence one
5605
- position before the ``stop`` value (defaulting to the :attr:`.sequence_length`
5606
- of this tree sequence, which must have :attr:`.discrete_genome` equal to True).
5607
- By default ``L`` is therefore equal to the :attr:`.sequence_length`,
5608
- and ``a[j]`` is the nucleotide value at genomic position ``j``.
5609
-
5610
- .. note:: This is inherently a **zero-based** representation of the sequence
5667
+ in this tree sequence. Each yielded alignment ``a`` is a string of length
5668
+ ``L`` where the first character is the genomic sequence at the ``start``
5669
+ position in the genome (defaulting to 0) and the last character is the
5670
+ genomic sequence one position before the ``stop`` value (defaulting to the
5671
+ :attr:`.sequence_length` of this tree sequence, which must have
5672
+ :attr:`.discrete_genome` equal to True). By default ``L`` is therefore equal
5673
+ to the :attr:`.sequence_length`, and ``a[j]`` is the nucleotide value at
5674
+ genomic position ``j``.
5675
+
5676
+ .. note::
5677
+ This is inherently a **zero-based** representation of the sequence
5611
5678
  coordinate space. Care will be needed when interacting with other
5612
5679
  libraries and upstream coordinate spaces.
5613
5680
 
@@ -5656,31 +5723,44 @@ class TreeSequence:
5656
5723
  single byte characters, (i.e., variants must be single nucleotide
5657
5724
  polymorphisms, or SNPs).
5658
5725
 
5659
- .. warning:: :ref:`Missing data<sec_data_model_missing_data>` is not
5660
- currently supported by this method and it will raise a ValueError
5661
- if called on tree sequences containing isolated samples.
5662
- See https://github.com/tskit-dev/tskit/issues/1896 for more
5663
- information.
5726
+ Missing data handling
5727
+
5728
+ - If ``isolated_as_missing=True`` (default), nodes that are isolated
5729
+ (no parent and no children) are rendered as the missing character across
5730
+ each tree interval. At site positions, the per-site allele overrides the
5731
+ missing character; if a genotype is missing (``-1``), the missing
5732
+ character is retained.
5733
+ - If ``isolated_as_missing=False``, no missing overlay is applied. At sites,
5734
+ genotypes are decoded as usual; at non-sites, bases come from the
5735
+ reference sequence.
5664
5736
 
5665
5737
  See also the :meth:`.variants` iterator for site-centric access
5666
5738
  to sample genotypes and :meth:`.haplotypes` for access to sample sequences
5667
5739
  at just the sites in the tree sequence.
5668
5740
 
5669
5741
  :param str reference_sequence: The reference sequence to fill in
5670
- gaps between sites in the alignments.
5742
+ gaps between sites in the alignments. If provided, it must be a
5743
+ string of length equal to :attr:`.sequence_length`; the sequence is
5744
+ sliced internally to the requested ``[left, right)`` interval.
5671
5745
  :param str missing_data_character: A single ascii character that will
5672
5746
  be used to represent missing data.
5673
5747
  If any normal allele contains this character, an error is raised.
5674
5748
  Default: 'N'.
5675
- :param list[int] samples: The samples for which to output alignments. If
5676
- ``None`` (default), return alignments for all the samples in the tree
5677
- sequence, in the order given by the :meth:`.samples` method.
5749
+ :param bool isolated_as_missing: If True, treat isolated nodes as missing
5750
+ across the covered tree intervals (see above). If None (default), this
5751
+ is treated as True.
5752
+ :param list[int] samples: The nodes for which to output alignments. If
5753
+ ``None`` (default), return alignments for all sample nodes in the order
5754
+ given by the :meth:`.samples` method. Non-sample nodes are also supported
5755
+ and will be decoded at sites in the same way as samples.
5678
5756
  :param int left: Alignments will start at this genomic position. If ``None``
5679
5757
  (default) alignments start at 0.
5680
- :param int right: Alignments will stop before this genomic position. If ``None``
5681
- (default) alignments will continue until the end of the tree sequence.
5758
+ :param int right: Alignments will stop before this genomic position.
5759
+ If ``None`` (default) alignments will continue until the end of the
5760
+ tree sequence.
5682
5761
  :return: An iterator over the alignment strings for specified samples in
5683
- this tree sequence, in the order given in ``samples``.
5762
+ this tree sequence, in the order given in ``samples``. Each string has
5763
+ length ``L = right - left``.
5684
5764
  :rtype: collections.abc.Iterable
5685
5765
  :raises ValueError: if any genome coordinate in this tree sequence is not
5686
5766
  discrete, or if the ``reference_sequence`` is not of the correct length.
@@ -5694,60 +5774,53 @@ class TreeSequence:
5694
5774
  "N" if missing_data_character is None else missing_data_character
5695
5775
  )
5696
5776
 
5697
- L = interval.span
5698
- a = np.empty(L, dtype=np.int8)
5699
- if reference_sequence is None:
5700
- if self.has_reference_sequence():
5701
- # This may be inefficient - see #1989. However, since we're
5702
- # n copies of the reference sequence anyway, this is a relatively
5703
- # minor tweak. We may also want to recode the below not to use direct
5704
- # access to the .data attribute, e.g. if we allow reference sequences
5705
- # to start at non-zero positions
5706
- reference_sequence = self.reference_sequence.data[
5707
- interval.left : interval.right
5708
- ]
5709
- else:
5710
- reference_sequence = missing_data_character * L
5777
+ if isolated_as_missing is None:
5778
+ isolated_as_missing = True
5711
5779
 
5712
- if len(reference_sequence) != L:
5713
- if interval.right == int(self.sequence_length):
5714
- raise ValueError(
5715
- "The reference sequence is shorter than the tree sequence length"
5716
- )
5717
- else:
5780
+ if len(missing_data_character) != 1:
5781
+ raise TypeError("missing_data_character must be a single character")
5782
+
5783
+ # Determine the reference sequence for the whole tree sequence
5784
+ full_ref = None
5785
+ if reference_sequence is not None:
5786
+ full_ref = reference_sequence
5787
+ elif self.has_reference_sequence():
5788
+ # This may be inefficient - see #1989. However, since we're
5789
+ # making n copies of the reference sequence anyway, this is a relatively
5790
+ # minor tweak. We may also want to recode the below not to use direct
5791
+ # access to the .data attribute, e.g. if we allow reference sequences
5792
+ # to start at non-zero positions
5793
+ full_ref = self.reference_sequence.data
5794
+
5795
+ if full_ref is None:
5796
+ full_ref = missing_data_character * int(self.sequence_length)
5797
+ else:
5798
+ if len(full_ref) != int(self.sequence_length):
5718
5799
  raise ValueError(
5719
- "The reference sequence ends before the requested stop position"
5800
+ "The reference sequence must be equal to the tree sequence length"
5720
5801
  )
5721
- ref_bytes = reference_sequence.encode("ascii")
5722
- a[:] = np.frombuffer(ref_bytes, dtype=np.int8)
5723
-
5724
- # To do this properly we'll have to detect the missing data as
5725
- # part of a full implementation of alignments in C. The current
5726
- # definition might not be calling some degenerate cases correctly;
5727
- # see https://github.com/tskit-dev/tskit/issues/1908
5728
- #
5729
- # Note also that this will call the presence of missing data
5730
- # incorrectly if have a sample isolated over the region (a, b],
5731
- # and if we have sites at each position from a to b, and at
5732
- # each site there is a mutation over the isolated sample.
5733
- if any(tree._has_isolated_samples() for tree in self.trees()):
5734
- raise ValueError(
5735
- "Missing data not currently supported in alignments; see "
5736
- "https://github.com/tskit-dev/tskit/issues/1896 for details."
5737
- "The current implementation may also incorrectly identify an "
5738
- "input tree sequence has having missing data."
5739
- )
5740
- H, (first_site_id, last_site_id) = self._haplotypes_array(
5741
- interval=interval,
5742
- missing_data_character=missing_data_character,
5743
- samples=samples,
5802
+
5803
+ try:
5804
+ ref_bytes = full_ref.encode("ascii")
5805
+ missing_data_character.encode("ascii")
5806
+ except UnicodeEncodeError:
5807
+ raise
5808
+
5809
+ sample_ids = self.samples() if samples is None else list(samples)
5810
+
5811
+ flat = self._ll_tree_sequence.decode_alignments(
5812
+ ref_bytes,
5813
+ sample_ids,
5814
+ int(interval.left),
5815
+ int(interval.right),
5816
+ missing_data_character,
5817
+ bool(isolated_as_missing),
5744
5818
  )
5745
- site_pos = self.sites_position.astype(np.int64)[
5746
- first_site_id : last_site_id + 1
5747
- ]
5748
- for h in H:
5749
- a[site_pos - interval.left] = h
5750
- yield a.tobytes().decode("ascii")
5819
+
5820
+ span = int(interval.span)
5821
+ for j in range(len(sample_ids)):
5822
+ offset = j * span
5823
+ yield flat[offset : offset + span].decode("ascii")
5751
5824
 
5752
5825
  @property
5753
5826
  def individuals_population(self):
@@ -5978,7 +6051,9 @@ class TreeSequence:
5978
6051
  "The sites_ancestral_state property requires numpy 2.0 or later."
5979
6052
  )
5980
6053
  if self._sites_ancestral_state is None:
5981
- self._sites_ancestral_state = self._ll_tree_sequence.sites_ancestral_state
6054
+ self._sites_ancestral_state = (
6055
+ self._ll_tree_sequence.sites_ancestral_state_string
6056
+ )
5982
6057
  return self._sites_ancestral_state
5983
6058
 
5984
6059
  @property
@@ -6050,7 +6125,7 @@ class TreeSequence:
6050
6125
  )
6051
6126
  if self._mutations_derived_state is None:
6052
6127
  self._mutations_derived_state = (
6053
- self._ll_tree_sequence.mutations_derived_state
6128
+ self._ll_tree_sequence.mutations_derived_state_string
6054
6129
  )
6055
6130
  return self._mutations_derived_state
6056
6131
 
@@ -6098,7 +6173,7 @@ class TreeSequence:
6098
6173
  )
6099
6174
  if self._mutations_inherited_state is None:
6100
6175
  self._mutations_inherited_state = (
6101
- self._ll_tree_sequence.mutations_inherited_state
6176
+ self._ll_tree_sequence.mutations_inherited_state_string
6102
6177
  )
6103
6178
  return self._mutations_inherited_state
6104
6179
 
@@ -6458,6 +6533,9 @@ class TreeSequence:
6458
6533
  samples = self._ll_tree_sequence.get_samples()
6459
6534
  keep = np.full(shape=samples.shape, fill_value=True)
6460
6535
  if population is not None:
6536
+ if not isinstance(population, numbers.Integral):
6537
+ raise ValueError("`population` must be an integer ID")
6538
+ population = int(population)
6461
6539
  sample_population = self.nodes_population[samples]
6462
6540
  keep = np.logical_and(keep, sample_population == population)
6463
6541
  if time is not None:
@@ -6570,13 +6648,13 @@ class TreeSequence:
6570
6648
  to the sites in the tree sequence object.
6571
6649
 
6572
6650
  .. note::
6573
- Older code often uses the ``ploidy=2`` argument, because old
6574
- versions of msprime did not output individual data. Specifying
6575
- individuals in the tree sequence is more robust, and since tree
6576
- sequences now typically contain individuals (e.g., as produced by
6577
- ``msprime.sim_ancestry( )``), this is not necessary, and the
6578
- ``ploidy`` argument can safely be removed as part of the process
6579
- of updating from the msprime 0.x legacy API.
6651
+ Older code often uses the ``ploidy=2`` argument, because old
6652
+ versions of msprime did not output individual data. Specifying
6653
+ individuals in the tree sequence is more robust, and since tree
6654
+ sequences now typically contain individuals (e.g., as produced by
6655
+ ``msprime.sim_ancestry( )``), this is not necessary, and the
6656
+ ``ploidy`` argument can safely be removed as part of the process
6657
+ of updating from the msprime 0.x legacy API.
6580
6658
 
6581
6659
  :param io.IOBase output: The file-like object to write the VCF output.
6582
6660
  :param int ploidy: The ploidy of the individuals to be written to
@@ -6661,6 +6739,7 @@ class TreeSequence:
6661
6739
  wrap_width=60,
6662
6740
  reference_sequence=None,
6663
6741
  missing_data_character=None,
6742
+ isolated_as_missing=None,
6664
6743
  ):
6665
6744
  """
6666
6745
  Writes the :meth:`.alignments` for this tree sequence to file in
@@ -6685,12 +6764,6 @@ class TreeSequence:
6685
6764
 
6686
6765
  ts.write_fasta("output.fa")
6687
6766
 
6688
- .. warning:: :ref:`Missing data<sec_data_model_missing_data>` is not
6689
- currently supported by this method and it will raise a ValueError
6690
- if called on tree sequences containing isolated samples.
6691
- See https://github.com/tskit-dev/tskit/issues/1896 for more
6692
- information.
6693
-
6694
6767
  :param file_or_path: The file object or path to write the output.
6695
6768
  Paths can be either strings or :class:`python:pathlib.Path` objects.
6696
6769
  :param int wrap_width: The number of sequence
@@ -6699,6 +6772,7 @@ class TreeSequence:
6699
6772
  (Default=60).
6700
6773
  :param str reference_sequence: As for the :meth:`.alignments` method.
6701
6774
  :param str missing_data_character: As for the :meth:`.alignments` method.
6775
+ :param bool isolated_as_missing: As for the :meth:`.alignments` method.
6702
6776
  """
6703
6777
  text_formats.write_fasta(
6704
6778
  self,
@@ -6706,6 +6780,7 @@ class TreeSequence:
6706
6780
  wrap_width=wrap_width,
6707
6781
  reference_sequence=reference_sequence,
6708
6782
  missing_data_character=missing_data_character,
6783
+ isolated_as_missing=isolated_as_missing,
6709
6784
  )
6710
6785
 
6711
6786
  def as_fasta(self, **kwargs):
@@ -6729,6 +6804,7 @@ class TreeSequence:
6729
6804
  include_alignments=None,
6730
6805
  reference_sequence=None,
6731
6806
  missing_data_character=None,
6807
+ isolated_as_missing=None,
6732
6808
  ):
6733
6809
  """
6734
6810
  Returns a `nexus encoding <https://en.wikipedia.org/wiki/Nexus_file>`_
@@ -6812,10 +6888,7 @@ class TreeSequence:
6812
6888
  as our convention of using trees with multiple roots
6813
6889
  is not often supported by newick parsers. Thus, the method
6814
6890
  will raise a ValueError if we try to output trees with
6815
- multiple roots. Additionally, missing data
6816
- is not currently supported for alignment data.
6817
- See https://github.com/tskit-dev/tskit/issues/1896 for more
6818
- information.
6891
+ multiple roots.
6819
6892
 
6820
6893
  .. seealso:: See also the :meth:`.as_nexus` method which will
6821
6894
  return this nexus representation as a string.
@@ -6830,6 +6903,7 @@ class TreeSequence:
6830
6903
  :param str reference_sequence: As for the :meth:`.alignments` method.
6831
6904
  :param str missing_data_character: As for the :meth:`.alignments` method,
6832
6905
  but defaults to "?".
6906
+ :param bool isolated_as_missing: As for the :meth:`.alignments` method.
6833
6907
  :return: A nexus representation of this :class:`TreeSequence`
6834
6908
  :rtype: str
6835
6909
  """
@@ -6841,6 +6915,7 @@ class TreeSequence:
6841
6915
  include_alignments=include_alignments,
6842
6916
  reference_sequence=reference_sequence,
6843
6917
  missing_data_character=missing_data_character,
6918
+ isolated_as_missing=isolated_as_missing,
6844
6919
  )
6845
6920
 
6846
6921
  def as_nexus(self, **kwargs):
@@ -7187,19 +7262,32 @@ class TreeSequence:
7187
7262
  self, *args, node_mappings=None, record_provenance=True, add_populations=None
7188
7263
  ):
7189
7264
  r"""
7190
- Concatenate a set of tree sequences to the right of this one, by repeatedly
7191
- calling :meth:`~TreeSequence.union` with an (optional)
7192
- node mapping for each of the ``others``. If any node mapping is ``None``
7193
- only map the sample nodes between the input tree sequence and this one,
7194
- based on the numerical order of sample node IDs.
7265
+ Concatenate a set of tree sequences to the right of this one, by shifting
7266
+ their coordinate systems and adding all edges, sites, mutations, and
7267
+ any additional nodes, individuals, or populations needed for these.
7268
+ Concretely, to concatenate an ``other`` tree sequence to ``self``, the value
7269
+ of ``self.sequence_length`` is added to all genomic coordinates in ``other``,
7270
+ and then the concatenated tree sequence will contain all edges, sites, and
7271
+ mutations in both. Which nodes in ``other`` are treated as "new", and hence
7272
+ added as well, is controlled by ``node_mappings``. Any individuals to which
7273
+ new nodes belong are added as well.
7274
+
7275
+ The method uses :meth:`.shift` followed by :meth:`.union`, with
7276
+ ``all_mutations=True``, ``all_edges=True``, and ``check_shared_equality=False``.
7277
+
7278
+ By default, the samples in current and input tree sequences are assumed to
7279
+ refer to the same nodes, and are matched based on the numerical order of
7280
+ sample node IDs; all other nodes are assumed to be new. This can be
7281
+ changed by providing explicit ``node_mappings`` for each input tree sequence
7282
+ (see below).
7195
7283
 
7196
7284
  .. note::
7197
- To add gaps between the concatenated tables, use :meth:`shift` or
7198
- to remove gaps, use :meth:`trim` before concatenating.
7285
+ To add gaps between the concatenated tree sequences, use :meth:`shift`
7286
+ or to remove gaps, use :meth:`trim` before concatenating.
7199
7287
 
7200
7288
  :param TreeSequence \*args: A list of other tree sequences to append to
7201
7289
  the right of this one.
7202
- :param Union[list, None] node_mappings: An list of node mappings for each
7290
+ :param Union[list, None] node_mappings: A list of node mappings for each
7203
7291
  input tree sequence in ``args``. Each should either be an array of
7204
7292
  integers of the same length as the number of nodes in the equivalent
7205
7293
  input tree sequence (see :meth:`~TreeSequence.union` for details), or
@@ -7241,6 +7329,8 @@ class TreeSequence:
7241
7329
  other_tables,
7242
7330
  node_mapping=node_mapping,
7243
7331
  check_shared_equality=False, # Else checks fail with internal samples
7332
+ all_mutations=True,
7333
+ all_edges=True,
7244
7334
  record_provenance=False,
7245
7335
  add_populations=add_populations,
7246
7336
  )
@@ -7329,7 +7419,7 @@ class TreeSequence:
7329
7419
  is its associated ``time`` value, or the time of its node if the
7330
7420
  mutation's time was marked as unknown (:data:`UNKNOWN_TIME`).
7331
7421
 
7332
- Migrations are not supported, and a LibraryError will be raise if
7422
+ Migrations are not supported, and a LibraryError will be raised if
7333
7423
  called on a tree sequence containing migration information.
7334
7424
 
7335
7425
  .. seealso:: This method is implemented using the :meth:`.split_edges`
@@ -7365,7 +7455,9 @@ class TreeSequence:
7365
7455
  `n` to `c` are extended, and the span of the edge from `p` to `c` is
7366
7456
  reduced. Thus, the ancestral haplotype represented by `n` is extended
7367
7457
  to a longer span of the genome. However, any edges whose child node is
7368
- a sample are not modified.
7458
+ a sample are not modified. See
7459
+ `Fritze et al. (2025) <https://doi.org/10.1093/genetics/iyaf198>`_
7460
+ for more details.
7369
7461
 
7370
7462
  Since some edges may be removed entirely, this process usually reduces
7371
7463
  the number of edges in the tree sequence.
@@ -7388,15 +7480,15 @@ class TreeSequence:
7388
7480
  known mutation times. See :meth:`.impute_unknown_mutations_time` if
7389
7481
  mutation times are not known.
7390
7482
 
7391
- The method will not affect the marginal trees (so, if the original tree
7392
- sequence was simplified, then following up with `simplify` will recover
7393
- the original tree sequence, possibly with edges in a different order).
7394
- It will also not affect the genotype matrix, or any of the tables other
7395
- than the edge table or the node column in the mutation table.
7483
+ .. note::
7484
+ The method will not affect the marginal trees (so, if the original tree
7485
+ sequence was simplified, then following up with `simplify` will recover
7486
+ the original tree sequence, possibly with edges in a different order).
7487
+ It will also not affect the genotype matrix, or any of the tables other
7488
+ than the edge table or the node column in the mutation table.
7396
7489
 
7397
- :param int max_iters: The maximum number of iterations over the tree
7490
+ :param int max_iter: The maximum number of iterations over the tree
7398
7491
  sequence. Defaults to 10.
7399
-
7400
7492
  :return: A new tree sequence with unary nodes extended.
7401
7493
  :rtype: tskit.TreeSequence
7402
7494
  """
@@ -7421,11 +7513,15 @@ class TreeSequence:
7421
7513
  the ancestry of these nodes - for that, see :meth:`.simplify`.
7422
7514
 
7423
7515
  This has the side effect that it may change the order of the nodes,
7424
- individuals, populations, and migrations in the tree sequence: the nodes
7425
- in the new tree sequence will be in the order provided in ``nodes``, and
7426
- both individuals and populations will be ordered by the earliest retained
7427
- node that refers to them. (However, ``reorder_populations`` may be set to
7428
- False to keep the population table unchanged.)
7516
+ populations, individuals, and migrations in the tree sequence. Nodes
7517
+ in the new tree sequence will be in the order provided in ``nodes``.
7518
+ Populations will be ordered in ascending order of the lowest ID of
7519
+ the nodes that refer to them. Individuals will not only be ordered
7520
+ so that :attr:`~Individual.parents` come before children (see
7521
+ :meth:`~TableCollection.sort_individuals`) but in addition
7522
+ will be secondarily sorted in ascending order of the lowest ID of
7523
+ their referring nodes. (However, ``reorder_populations`` may be set
7524
+ to ``False`` to keep the population table unchanged.)
7429
7525
 
7430
7526
  By default, the method removes all individuals and populations not
7431
7527
  referenced by any nodes, and all sites not referenced by any mutations.
@@ -7469,6 +7565,9 @@ class TreeSequence:
7469
7565
  check_shared_equality=True,
7470
7566
  add_populations=True,
7471
7567
  record_provenance=True,
7568
+ *,
7569
+ all_edges=False,
7570
+ all_mutations=False,
7472
7571
  ):
7473
7572
  """
7474
7573
  Returns an expanded tree sequence which contains the node-wise union of
@@ -7484,8 +7583,8 @@ class TreeSequence:
7484
7583
  1. Individuals whose nodes are new to ``self``.
7485
7584
  2. Edges whose parent or child are new to ``self``.
7486
7585
  3. Mutations whose nodes are new to ``self``.
7487
- 4. Sites which were not present in ``self``, if the site contains a newly
7488
- added mutation.
7586
+ 4. Sites whose positions are not present in the site positions in
7587
+ ``self``, if the site contains a newly added mutation.
7489
7588
 
7490
7589
  This can be thought of as a "node-wise" union: for instance, it can not
7491
7590
  be used to add new edges between two nodes already in ``self`` or new
@@ -7502,17 +7601,47 @@ class TreeSequence:
7502
7601
  nodes are in entirely new populations, then you must set up the
7503
7602
  population table first, and then union with ``add_populations=False``.
7504
7603
 
7505
- If the resulting tree sequence is invalid (for instance, a node is
7506
- specified to have two distinct parents on the same interval),
7507
- an error will be raised.
7604
+ This method makes sense if the "shared" portions of the tree sequences
7605
+ are equal; the option ``check_shared_equality`` performs a consistency
7606
+ check that this is true. If this check is disabled, it is very easy to
7607
+ produce nonsensical results via subtle inconsistencies.
7608
+
7609
+ The behavior above can be changed by ``all_edges`` and ``all_mutations``.
7610
+ If ``all_edges`` is True, then all edges in ``other`` are added to
7611
+ ``self``, instead of only edges adjacent to added nodes. If
7612
+ ``all_mutations`` is True, then similarly all mutations in ``other``
7613
+ are added (not just those on added nodes); furthermore, all sites
7614
+ at positions without a site already present are added to ``self``.
7615
+ The intended use case for these options is a "disjoint" union,
7616
+ where for instance the two tree sequences contain information about
7617
+ disjoint segments of the genome (see :meth:`.concatenate`).
7618
+ For some such applications it may be necessary to set
7619
+ ``check_shared_equality=False``: for instance, if ``other`` has
7620
+ an identical copy of the node table but no edges, then
7621
+ ``all_mutations=True, check_shared_equality=False`` can be used
7622
+ to add mutations to ``self``.
7508
7623
 
7509
- Note that this operation also sorts the resulting tables, so the
7510
- resulting tree sequence may not be equal to ``self`` even if nothing
7511
- new was added (although it would differ only in ordering of the tables).
7624
+ .. warning::
7625
+ If an equivalent node is specified in ``other``, the
7626
+ version in ``self`` is used without checking the node
7627
+ properties are the same. Similarly, if the same site position
7628
+ is present in both ``self`` and ``other``, the version in
7629
+ ``self`` is used without checking that site properties are
7630
+ the same. In these cases metadata and e.g. node times or ancestral
7631
+ states in ``other`` are simply ignored.
7512
7632
 
7513
- :param TableCollection other: Another table collection.
7633
+ .. note::
7634
+ This operation also sorts the resulting tables, so the resulting
7635
+ tree sequence may not be equal to ``self`` even if nothing new
7636
+ was added (although it would differ only in ordering of the tables).
7637
+
7638
+ :param TreeSequence other: Another tree sequence.
7514
7639
  :param list node_mapping: An array of node IDs that relate nodes in
7515
7640
  ``other`` to nodes in ``self``.
7641
+ :param bool all_edges: If True, then all edges in ``other`` are added
7642
+ to ``self``.
7643
+ :param bool all_mutations: If True, then all mutations and sites in
7644
+ ``other`` are added to ``self``.
7516
7645
  :param bool check_shared_equality: If True, the shared portions of the
7517
7646
  tree sequences will be checked for equality. It does so by
7518
7647
  running :meth:`TreeSequence.subset` on both ``self`` and ``other``
@@ -7522,6 +7651,11 @@ class TreeSequence:
7522
7651
  assigned new population IDs.
7523
7652
  :param bool record_provenance: Whether to record a provenance entry
7524
7653
  in the provenance table for this operation.
7654
+ :return: The union of the two tree sequences.
7655
+ :rtype: tskit.TreeSequence
7656
+ :raises: **tskit.LibraryError** -- If the resulting tree sequence is invalid
7657
+ (for instance, a node is specified to have two distinct
7658
+ parents on the same interval)
7525
7659
  """
7526
7660
  tables = self.dump_tables()
7527
7661
  other_tables = other.dump_tables()
@@ -7531,6 +7665,8 @@ class TreeSequence:
7531
7665
  check_shared_equality=check_shared_equality,
7532
7666
  add_populations=add_populations,
7533
7667
  record_provenance=record_provenance,
7668
+ all_edges=all_edges,
7669
+ all_mutations=all_mutations,
7534
7670
  )
7535
7671
  return tables.tree_sequence()
7536
7672
 
@@ -8600,52 +8736,6 @@ class TreeSequence:
8600
8736
  sizes = np.array(sizes, dtype=size_dtype)
8601
8737
  return flat, sizes
8602
8738
 
8603
- # def divergence_matrix(self, sample_sets, windows=None, mode="site"):
8604
- # """
8605
- # Finds the mean divergence between pairs of samples from each set of
8606
- # samples and in each window. Returns a numpy array indexed by (window,
8607
- # sample_set, sample_set). Diagonal entries are corrected so that the
8608
- # value gives the mean divergence for *distinct* samples, but it is not
8609
- # checked whether the sample_sets are disjoint (so offdiagonals are not
8610
- # corrected). For this reason, if an element of `sample_sets` has only
8611
- # one element, the corresponding diagonal will be NaN.
8612
-
8613
- # The mean divergence between two samples is defined to be the mean: (as
8614
- # a TreeStat) length of all edges separating them in the tree, or (as a
8615
- # SiteStat) density of segregating sites, at a uniformly chosen position
8616
- # on the genome.
8617
-
8618
- # :param list sample_sets: A list of sets of IDs of samples.
8619
- # :param iterable windows: The breakpoints of the windows (including start
8620
- # and end, so has one more entry than number of windows).
8621
- # :return: A list of the upper triangle of mean TMRCA values in row-major
8622
- # order, including the diagonal.
8623
- # """
8624
- # ns = len(sample_sets)
8625
- # indexes = [(i, j) for i in range(ns) for j in range(i, ns)]
8626
- # x = self.divergence(sample_sets, indexes, windows, mode=mode)
8627
- # nw = len(windows) - 1
8628
- # A = np.ones((nw, ns, ns), dtype=float)
8629
- # for w in range(nw):
8630
- # k = 0
8631
- # for i in range(ns):
8632
- # for j in range(i, ns):
8633
- # A[w, i, j] = A[w, j, i] = x[w][k]
8634
- # k += 1
8635
- # return A
8636
- # NOTE: see older definition of divmat here, which may be useful when documenting
8637
- # this function. See https://github.com/tskit-dev/tskit/issues/2781
8638
-
8639
- # NOTE for documentation of sample_sets. We *must* use samples currently because
8640
- # the normalisation for non-sample nodes is tricky. Do we normalise by the
8641
- # total span of the ts where the node is 'present' in the tree? We avoid this
8642
- # by insisting on sample nodes.
8643
-
8644
- # NOTE for documentation of num_threads. Need to explain that the
8645
- # its best to think of as the number of background *worker* threads.
8646
- # default is to run without any worker threads. If you want to run
8647
- # with all the cores on the machine, use num_threads=os.cpu_count().
8648
-
8649
8739
  def divergence_matrix(
8650
8740
  self,
8651
8741
  sample_sets=None,
@@ -8655,6 +8745,41 @@ class TreeSequence:
8655
8745
  mode=None,
8656
8746
  span_normalise=True,
8657
8747
  ):
8748
+ """
8749
+ Finds the matrix of pairwise :meth:`.divergence` values between groups
8750
+ of sample nodes. Returns a numpy array indexed by (window,
8751
+ sample_set, sample_set): the [k,i,j]th value of the result gives the
8752
+ mean divergence between pairs of samples from the i-th and j-th
8753
+ sample sets in the k-th window. As for :meth:`.divergence`,
8754
+ diagonal entries are corrected so that the
8755
+ value gives the mean divergence for *distinct* samples,
8756
+ and so diagonal entries are given by the :meth:`.diversity` of that
8757
+ sample set. For this reason, if an element of `sample_sets` has only
8758
+ one element, the corresponding :meth:`.diversity` will be NaN.
8759
+ However, this method will place a value of 0 in the diagonal instead of NaN
8760
+ in such cases; otherwise, this is equivalent to computing values with
8761
+ :meth:`.divergence`.
8762
+ However, this is (usually) more efficient than computing many
8763
+ pairwise values using the `indexes` argument to :meth:`.divergence`,
8764
+ so see :meth:`.divergence` for a description of what exactly is computed.
8765
+
8766
+ :param list sample_sets: A list of sets of IDs of samples.
8767
+ :param list windows: The breakpoints of the windows (including start
8768
+ and end, so has one more entry than number of windows).
8769
+ :param str mode: A string giving the "type" of the statistic to be computed
8770
+ (defaults to "site"; the other option is "branch").
8771
+ :return: An array indexed by (window, sample_set, sample_set), or if windows is
8772
+ `None`, an array indexed by (sample_set, sample_set).
8773
+ """
8774
+ # NOTE for documentation of sample_sets. We *must* use samples currently because
8775
+ # the normalisation for non-sample nodes is tricky. Do we normalise by the
8776
+ # total span of the ts where the node is 'present' in the tree? We avoid this
8777
+ # by insisting on sample nodes.
8778
+
8779
+ # NOTE for documentation of num_threads. Need to explain that
8780
+ # it's best to think of it as the number of background *worker* threads.
8781
+ # default is to run without any worker threads. If you want to run
8782
+ # with all the cores on the machine, use num_threads=os.cpu_count().
8658
8783
  windows_specified = windows is not None
8659
8784
  windows = self.parse_windows(windows)
8660
8785
  mode = "site" if mode is None else mode
@@ -8862,7 +8987,16 @@ class TreeSequence:
8862
8987
  """
8863
8988
  Computes the full matrix of pairwise genetic relatedness values
8864
8989
  between (and within) pairs of sets of nodes from ``sample_sets``.
8865
- *Warning:* this does not compute exactly the same thing as
8990
+ Returns a numpy array indexed by (window, sample_set, sample_set):
8991
+ the [k,i,j]th value of the result gives the
8992
+ genetic relatedness between pairs of samples from the i-th and j-th
8993
+ sample sets in the k-th window.
8994
+ This is (usually) more efficient than computing many pairwise
8995
+ values using the `indexes` argument to :meth:`.genetic_relatedness`.
8996
+ Specifically, this computes :meth:`.genetic_relatedness` with
8997
+ ``centre=True`` and ``proportion=False`` (with caveats, see below).
8998
+
8999
+ *Warning:* in some cases, this does not compute exactly the same thing as
8866
9000
  :meth:`.genetic_relatedness`: see below for more details.
8867
9001
 
8868
9002
  If `mode="branch"`, then the value obtained is the same as that from
@@ -8870,29 +9004,35 @@ class TreeSequence:
8870
9004
  `proportion=False`. The same is true if `mode="site"` and all sites have
8871
9005
  at most one mutation.
8872
9006
 
8873
- However, if some sites have more than one mutation, the value may differ.
9007
+ However, if some sites have more than one mutation, the value may differ
9008
+ from that given by :meth:`.genetic_relatedness`, although if the proportion
9009
+ of such sites is small, the difference will be small.
8874
9010
  The reason is that this function (for efficiency) computes relatedness
8875
- using :meth:`.divergence` and the following relationship.
9011
+ using :meth:`.divergence_matrix` and the following relationship.
8876
9012
  "Relatedness" measures the number of *shared* alleles (or branches),
8877
9013
  while "divergence" measures the number of *non-shared* alleles (or branches).
8878
9014
  Let :math:`T_i` be the total distance from sample :math:`i` up to the root;
8879
- then if :math:`D_{ij}` is the divergence between :math:`i` and :math:`j`
8880
- and :math:`R_{ij}` is the relatedness between :math:`i` and :math:`j`, then
8881
- :math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
9015
+ then if :math:`D_{ij}` is the branch-mode divergence between :math:`i` and
9016
+ :math:`j` and :math:`R_{ij}` is the branch-mode relatedness between :math:`i`
9017
+ and :math:`j`, then :math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
8882
9018
  So, for any samples :math:`I`, :math:`J`, :math:`S`, :math:`T`
8883
9019
  (that may now be random choices),
8884
9020
  :math:`R_{IJ}-R_{IS}-R_{JT}+R_{ST} = (D_{IJ}-D_{IS}-D_{JT}+D_{ST})/ (-2)`.
8885
- Note, however, that this relationship only holds for `mode="site"`
8886
- if we can treat "number of differing alleles" as distances on the tree;
8887
- this is not necessarily the case in the presence of multiple mutations.
9021
+ This is exactly what we want for (centered) relatedness.
9022
+ However, this relationship does not necessarily hold for `mode="site"`:
9023
+ it does hold if we can treat "number of differing alleles" as distances
9024
+ on the tree, but this is not necessarily the case in the presence of
9025
+ multiple mutations.
8888
9026
 
8889
- Another caveat in the above relationship between :math:`R` and :math:`D`
9027
+ Another note regarding the above relationship between :math:`R` and :math:`D`
8890
9028
  is that :meth:`.divergence` of a sample set to itself does not include
8891
9029
  the "self" comparisons (so as to provide an unbiased estimator of a
8892
9030
  population quantity), while the usual definition of genetic relatedness
8893
9031
  *does* include such comparisons (to provide, for instance, an appropriate
8894
9032
  value for prospective results beginning with only a given set of
8895
- individuals).
9033
+ individuals). So, diagonal entries in the relatedness matrix returned here
9034
+ are obtained from :meth:`divergence_matrix` after first correcting
9035
+ diagonals to include these "self" comparisons.
8896
9036
 
8897
9037
  :param list sample_sets: A list of lists of Node IDs, specifying the
8898
9038
  groups of nodes to compute the statistic with.
@@ -8901,11 +9041,35 @@ class TreeSequence:
8901
9041
  :param str mode: A string giving the "type" of the statistic to be computed
8902
9042
  (defaults to "site").
8903
9043
  :param bool span_normalise: Whether to divide the result by the span of the
8904
- window (defaults to True). Has no effect if ``proportion`` is True.
8905
- :return: A ndarray with shape equal to (num windows, num statistics).
8906
- If there is one pair of sample sets and windows=None, a numpy scalar is
8907
- returned.
8908
- """
9044
+ window (defaults to True).
9045
+ :return: An array indexed by (window, sample_set, sample_set), or if windows is
9046
+ `None`, an array indexed by (sample_set, sample_set).
9047
+ """
9048
+ # Further notes on the relationship between relatedness (R)
9049
+ # and divergence (D) in mode="site":
9050
+ # The summary function for divergence is "p (1-q)",
9051
+ # where p and q are the allele frequencies in the two sample sets;
9052
+ # while for relatedness it is "pq". Summing across *all* alleles,
9053
+ # we get that relatedness plus divergence is
9054
+ # p1 (1-q1) + p1 q1 + ... + pk (1-qk) + pk qk = p1 + ... + pk = 1 .
9055
+ # This implies that
9056
+ # ts.divergence(..., span_normalise=False)
9057
+ # + ts.genetic_relatedness(..., span_normalise=False, centre=False,
9058
+ # proportion=False, polarised=False)
9059
+ # == ts.num_sites
9060
+ # This could be the basis for a similar relationship between R and D.
9061
+ # However, that relationship holds only with polarised=False, which is not
9062
+ # the default, or what this function does (for good reason).
9063
+ # So, without setting polarised=False, we have that for samples i and j,
9064
+ # divergence plus relatedness is equal to (something like)
9065
+ # the total number of sites at which both i and j are ancestral;
9066
+ # this depends on the samples and so does not cancel out of the centred
9067
+ # version. We could work through these relationships to figure out what exactly
9068
+ # the difference between genetic_relatedness_matrix(mode="site") and
9069
+ # genetic_relatedness(mode="site") is, in the general case of multiple
9070
+ # mutations... but that would be confusing, probably not that useful,
9071
+ # and the short version of all this is that "it's complicated".
9072
+
8909
9073
  D = self.divergence_matrix(
8910
9074
  sample_sets,
8911
9075
  windows=windows,
@@ -9077,6 +9241,7 @@ class TreeSequence:
9077
9241
  mode=mode,
9078
9242
  centre=False,
9079
9243
  nodes=indices,
9244
+ span_normalise=False, # <- non-default!
9080
9245
  )[0]
9081
9246
  x = x - x.mean(axis=0) if centre else x
9082
9247
 
@@ -9107,6 +9272,7 @@ class TreeSequence:
9107
9272
  mode=mode,
9108
9273
  centre=False,
9109
9274
  nodes=samples,
9275
+ span_normalise=False, # <- non-default!
9110
9276
  )[0]
9111
9277
 
9112
9278
  def bincount_fn(w):
@@ -9137,23 +9303,28 @@ class TreeSequence:
9137
9303
  eigenvectors of the genetic relatedness matrix, which are obtained by a
9138
9304
  randomized singular value decomposition (rSVD) algorithm.
9139
9305
 
9140
- Concretely, if :math:`M` is the matrix of genetic relatedness values, with
9141
- :math:`M_{ij}` the output of
9142
- :meth:`genetic_relatedness <.TreeSequence.genetic_relatedness>`
9143
- between sample :math:`i` and sample :math:`j`, then by default this returns
9144
- the top ``num_components`` eigenvectors of :math:`M`, so that
9306
+ Concretely, take :math:`M` as the matrix of non-span-normalised
9307
+ genetic relatedness values, for instance obtained by
9308
+ setting :math:`M_{ij}` to be the :meth:`~.TreeSequence.genetic_relatedness`
9309
+ between sample :math:`i` and sample :math:`j` with the specified ``mode``,
9310
+ ``proportion=False`` and ``span_normalise=False``. Then by default this
9311
+ returns the top ``num_components`` eigenvectors of :math:`M`, so that
9145
9312
  ``output.factors[i,k]`` is the position of sample `i` on the `k` th PC.
9146
- If ``samples`` or ``individuals`` are provided, then this does the same thing,
9147
- except with :math:`M_{ij}` either the relatedness between ``samples[i]``
9148
- and ``samples[j]`` or the nodes of ``individuals[i]`` and ``individuals[j]``,
9149
- respectively.
9313
+ If ``samples`` or ``individuals`` are provided, then this does the same
9314
+ thing, except with :math:`M_{ij}` either the relatedness between
9315
+ ``samples[i]`` and ``samples[j]`` or the average relatedness between the
9316
+ nodes of ``individuals[i]`` and ``individuals[j]``, respectively.
9317
+ Factors are normalized to have norm 1, i.e.,
9318
+ ``(output.factors[:,k] ** 2).sum() == 1`` for any ``k``.
9150
9319
 
9151
9320
  The parameters ``centre`` and ``mode`` are passed to
9152
- :meth:`genetic_relatedness <.TreeSequence.genetic_relatedness>`;
9153
- if ``windows`` are provided then PCA is carried out separately in each window.
9154
- If ``time_windows`` is provided, then genetic relatedness is measured using only
9155
- ancestral material within the given time window (see
9156
- :meth:`decapitate <.TreeSequence.decapitate>` for how this is defined).
9321
+ :meth:`~.TreeSequence.genetic_relatedness`: the default ``centre=True`` results
9322
+ in factors whose elements sum to zero; ``mode`` currently only supports the
9323
+ ``"branch"`` setting. If ``windows`` are provided then PCA is carried out
9324
+ separately in each genomic window. If ``time_windows`` is provided, then genetic
9325
+ relatedness is measured using only ancestral material within the given time
9326
+ window (see :meth:`decapitate <.TreeSequence.decapitate>` for how this is
9327
+ defined).
9157
9328
 
9158
9329
  So that the method scales to large tree sequences, the underlying method
9159
9330
  relies on a randomized SVD algorithm, using
@@ -9829,7 +10000,7 @@ class TreeSequence:
9829
10000
  b = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) - (n + 2) / (h * n) + g / h**2
9830
10001
  c = h**2 + g
9831
10002
 
9832
- What is computed for diversity and divergence depends on ``mode``;
10003
+ What is computed for diversity and segregating sites depends on ``mode``;
9833
10004
  see those functions for more details.
9834
10005
 
9835
10006
  :param list sample_sets: A list of lists of Node IDs, specifying the
@@ -9892,6 +10063,11 @@ class TreeSequence:
9892
10063
  What is computed for diversity and divergence depends on ``mode``;
9893
10064
  see those functions for more details.
9894
10065
 
10066
+ For ``mode='site'``, this definition of Fst appears as equation (6) in
10067
+ `Slatkin (1991) <https://doi.org/10.1017/S0016672300029827>`_, and
10068
+ is also found as equation (9) in
10069
+ `Nei (1973) <https://doi.org/10.1073/pnas.70.12.3321>`_.
10070
+
9895
10071
  :param list sample_sets: A list of lists of Node IDs, specifying the
9896
10072
  groups of nodes to compute the statistic with.
9897
10073
  :param list indexes: A list of 2-tuples.
@@ -10313,7 +10489,8 @@ class TreeSequence:
10313
10489
 
10314
10490
  For a precise mathematical definition of GNN, see https://doi.org/10.1101/458067
10315
10491
 
10316
- .. note:: The reference sets need not include all the samples, hence the most
10492
+ .. note::
10493
+ The reference sets need not include all the samples, hence the most
10317
10494
  recent common ancestral node of the reference sets, :math:`a`, need not be
10318
10495
  the immediate ancestor of the focal node. If the reference sets only comprise
10319
10496
  sequences from relatively distant individuals, the GNN statistic may end up
@@ -10425,7 +10602,7 @@ class TreeSequence:
10425
10602
  represented by the tree sequence.
10426
10603
 
10427
10604
  :param list within: A list of node IDs defining the set of nodes that
10428
- we finding IBD segments for. If not specified, this defaults to
10605
+ we find IBD segments for. If not specified, this defaults to
10429
10606
  all samples in the tree sequence.
10430
10607
  :param list[list] between: A list of lists of sample node IDs. Given
10431
10608
  two sample sets A and B, only IBD segments will be returned such
@@ -10440,7 +10617,7 @@ class TreeSequence:
10440
10617
  segment) is greater than this value will be included. (Default=0)
10441
10618
  :param bool store_pairs: If True store information separately for each
10442
10619
  pair of samples ``(a, b)`` that are found to be IBD. Otherwise
10443
- store summary information about all sample apirs. (Default=False)
10620
+ store summary information about all sample pairs. (Default=False)
10444
10621
  :param bool store_segments: If True store each IBD segment
10445
10622
  ``(left, right, c)`` and associate it with the corresponding
10446
10623
  sample pair ``(a, b)``. If True, implies ``store_pairs``.
@@ -10449,7 +10626,7 @@ class TreeSequence:
10449
10626
  IBD information.
10450
10627
  :rtype: IdentitySegments
10451
10628
  """
10452
- return self.tables.ibd_segments(
10629
+ return self.dump_tables().ibd_segments(
10453
10630
  within=within,
10454
10631
  between=between,
10455
10632
  max_time=max_time,
@@ -10871,7 +11048,7 @@ class TreeSequence:
10871
11048
  mapping is created by first checking if the tree sequence contains individuals.
10872
11049
  If it does, the mapping is created using the individuals in the tree sequence.
10873
11050
  By default only the sample nodes of the individuals are included in the mapping,
10874
- unless `include_non_sample_nodes` is set to True, in which case all nodes
11051
+ unless ``include_non_sample_nodes`` is set to True, in which case all nodes
10875
11052
  belonging to the individuals are included. Any individuals without any nodes
10876
11053
  will have no nodes in their row of the mapping, being essentially of zero ploidy.
10877
11054
  If no individuals are present, the mapping is created using only the sample nodes
@@ -10879,20 +11056,22 @@ class TreeSequence:
10879
11056
 
10880
11057
  As the tskit data model allows non-integer positions, site positions and contig
10881
11058
  length are transformed to integer values suitable for VCF output. The
10882
- transformation is done using the `position_transform` function, which must
11059
+ transformation is done using the ``position_transform`` function, which must
10883
11060
  return an integer numpy array the same dimension as the input. By default,
10884
11061
  this is set to ``numpy.round()`` which will round values to the nearest integer.
10885
11062
 
10886
- If neither `name_metadata_key` nor `individual_names` is not specified, the
10887
- individual names are set to "tsk_{individual_id}" for each individual. If
10888
- no individuals are present, the individual names are set to "tsk_{i}" with
10889
- `0 <= i < num_sample_nodes/ploidy`.
11063
+ If neither ``name_metadata_key`` nor ``individual_names`` is specified, the
11064
+ individual names are set to ``"tsk_{individual_id}"`` for each individual. If
11065
+ no individuals are present, the individual names are set to ``"tsk_{i}"`` with
11066
+ ``0 <= i < num_sample_nodes/ploidy``.
10890
11067
 
10891
- A Warning are emmitted if any sample nodes do not have an individual ID.
11068
+ A warning is emitted if any sample nodes do not have an individual ID.
10892
11069
 
10893
11070
  :param list individuals: Specific individual IDs to include in the VCF. If not
10894
11071
  specified and the tree sequence contains individuals, all individuals are
10895
- included at least one node.
11072
+ included that are associated with at least one sample node (or at least one of
11073
+ any node if ``include_non_sample_nodes`` is True), and the mapping arrays
11074
+ will be in ascending order of the ID of the individual in the tree sequence.
10896
11075
  :param int ploidy: The ploidy, or number of nodes per individual. Only used when
10897
11076
  the tree sequence does not contain individuals. Cannot be used if the tree
10898
11077
  sequence contains individuals. Defaults to 1 if not specified.