tskit 1.0.0b3__cp313-cp313-win_amd64.whl → 1.0.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tskit/trees.py CHANGED
@@ -370,7 +370,11 @@ class Site(util.Dataclass):
370
370
  mutations: np.ndarray
371
371
  """
372
372
  The list of mutations at this site. Mutations within a site are returned in the
373
- order they are specified in the underlying :class:`MutationTable`.
373
+
374
+ order they are specified in the underlying :class:`MutationTable`. For canonical
375
+ (i.e., valid) tables, this means ancestral mutations precede their descendants, so
376
+ older mutations (as defined by the canonical mutation ordering; see
377
+ :ref:`sec_mutation_requirements`) appear before younger ones.
374
378
  """
375
379
  metadata: bytes | dict | None
376
380
  """
@@ -571,8 +575,8 @@ class Migration(util.Dataclass):
571
575
  """
572
576
  id: int # noqa A003
573
577
  """
574
- The integer ID of this mutation. Varies from 0 to
575
- :attr:`TreeSequence.num_mutations` - 1.
578
+ The integer ID of this migration. Varies from 0 to
579
+ :attr:`TreeSequence.num_migrations` - 1.
576
580
  """
577
581
 
578
582
 
@@ -770,7 +774,7 @@ class Tree:
770
774
  calling the :meth:`TreeSequence.trees` iterator.
771
775
 
772
776
  :return: The root threshold.
773
- :rtype: :class:`TreeSequence`
777
+ :rtype: int
774
778
  """
775
779
  return self._ll_tree.get_root_threshold()
776
780
 
@@ -881,7 +885,8 @@ class Tree:
881
885
 
882
886
  :param float position: The position along the sequence length to
883
887
  seek to.
884
- :raises ValueError: If 0 < position or position >=
888
+ :raises ValueError: If ``position`` is less than 0 or ``position`` is greater
889
+ than or equal to
885
890
  :attr:`TreeSequence.sequence_length`.
886
891
  """
887
892
  if position < 0 or position >= self.tree_sequence.sequence_length:
@@ -918,7 +923,7 @@ class Tree:
918
923
  the interval :math:`[0, \\text{span})` and the :attr:`~Tree.tree_sequence`
919
924
  from which the tree is taken will have its
920
925
  :attr:`~tskit.TreeSequence.sequence_length` equal to ``span``.
921
- :param: float branch_length: The minimum length of a branch in this tree.
926
+ :param float branch_length: The minimum length of a branch in this tree.
922
927
  :raises ValueError: If the given rank is out of bounds for trees
923
928
  with ``num_leaves`` leaves.
924
929
  """
@@ -3593,7 +3598,7 @@ def parse_nodes(source, strict=True, encoding="utf8", base64_metadata=True, tabl
3593
3598
  return table
3594
3599
 
3595
3600
 
3596
- def parse_edges(source, strict=True, table=None):
3601
+ def parse_edges(source, strict=True, table=None, encoding="utf8", base64_metadata=True):
3597
3602
  """
3598
3603
  Parse the specified file-like object containing a whitespace delimited
3599
3604
  description of an edge table and returns the corresponding :class:`EdgeTable`
@@ -3609,6 +3614,9 @@ def parse_edges(source, strict=True, table=None):
3609
3614
  False, a relaxed whitespace splitting algorithm is used.
3610
3615
  :param EdgeTable table: If specified, write the edges into this table. If
3611
3616
  not, create a new :class:`EdgeTable` instance and return.
3617
+ :param str encoding: Encoding used for text representation.
3618
+ :param bool base64_metadata: If True, metadata is encoded using Base64
3619
+ encoding; otherwise, as plain text.
3612
3620
  """
3613
3621
  sep = None
3614
3622
  if strict:
@@ -3620,6 +3628,12 @@ def parse_edges(source, strict=True, table=None):
3620
3628
  right_index = header.index("right")
3621
3629
  parent_index = header.index("parent")
3622
3630
  children_index = header.index("child")
3631
+ metadata_index = None
3632
+ try:
3633
+ metadata_index = header.index("metadata")
3634
+ except ValueError:
3635
+ pass
3636
+ default_metadata = b""
3623
3637
  for line in source:
3624
3638
  tokens = line.rstrip("\n").split(sep)
3625
3639
  if len(tokens) >= 4:
@@ -3627,8 +3641,19 @@ def parse_edges(source, strict=True, table=None):
3627
3641
  right = float(tokens[right_index])
3628
3642
  parent = int(tokens[parent_index])
3629
3643
  children = tuple(map(int, tokens[children_index].split(",")))
3644
+ metadata = default_metadata
3645
+ if metadata_index is not None and metadata_index < len(tokens):
3646
+ metadata = tokens[metadata_index].encode(encoding)
3647
+ if base64_metadata:
3648
+ metadata = base64.b64decode(metadata)
3630
3649
  for child in children:
3631
- table.add_row(left=left, right=right, parent=parent, child=child)
3650
+ table.add_row(
3651
+ left=left,
3652
+ right=right,
3653
+ parent=parent,
3654
+ child=child,
3655
+ metadata=metadata,
3656
+ )
3632
3657
  return table
3633
3658
 
3634
3659
 
@@ -4368,6 +4393,22 @@ class TreeSequence:
4368
4393
  self._ll_tree_sequence.dump_tables(ll_tables)
4369
4394
  return tables.TableCollection(ll_tables=ll_tables)
4370
4395
 
4396
+ def link_ancestors(self, samples, ancestors):
4397
+ """
4398
+ Equivalent to :meth:`TableCollection.link_ancestors`; see that method for full
4399
+ documentation and parameter semantics.
4400
+
4401
+ :param list[int] samples: Node IDs to retain as samples.
4402
+ :param list[int] ancestors: Node IDs to treat as ancestors.
4403
+ :return: An :class:`tables.EdgeTable` containing the genealogical links between
4404
+ the supplied ``samples`` and ``ancestors``.
4405
+ :rtype: tables.EdgeTable
4406
+ """
4407
+ samples = util.safe_np_int_cast(samples, np.int32)
4408
+ ancestors = util.safe_np_int_cast(ancestors, np.int32)
4409
+ ll_edge_table = self._ll_tree_sequence.link_ancestors(samples, ancestors)
4410
+ return tables.EdgeTable(ll_table=ll_edge_table)
4411
+
4371
4412
  def dump_text(
4372
4413
  self,
4373
4414
  nodes=None,
@@ -4767,7 +4808,8 @@ class TreeSequence:
4767
4808
  Returns an iterable sequence of all the :ref:`nodes <sec_node_table_definition>`
4768
4809
  in this tree sequence.
4769
4810
 
4770
- .. note:: Although node ids are commonly ordered by node time, this is not a
4811
+ .. note::
4812
+ Although node ids are commonly ordered by node time, this is not a
4771
4813
  formal tree sequence requirement. If you wish to iterate over nodes in
4772
4814
  time order, you should therefore use ``order="timeasc"`` (and wrap the
4773
4815
  resulting sequence in the standard Python :func:`python:reversed` function
@@ -5321,13 +5363,13 @@ class TreeSequence:
5321
5363
  Returns an iterator over the strings of haplotypes that result from
5322
5364
  the trees and mutations in this tree sequence. Each haplotype string
5323
5365
  is guaranteed to be of the same length. A tree sequence with
5324
- :math:`n` samples and with :math:`s` sites lying between ``left`` and
5325
- ``right`` will return a total of :math:`n`
5326
- strings of :math:`s` alleles concatenated together, where an allele
5366
+ :math:`n` requested nodes (default: the number of sample nodes) and with
5367
+ :math:`s` sites lying between ``left`` and ``right`` will return a total
5368
+ of :math:`n` strings of :math:`s` alleles concatenated together, where an allele
5327
5369
  consists of a single ascii character (tree sequences that include alleles
5328
5370
  which are not a single character in length, or where the character is
5329
5371
  non-ascii, will raise an error). The first string returned is the
5330
- haplotype for the first requested sample, and so on.
5372
+ haplotype for the first requested node, and so on.
5331
5373
 
5332
5374
  The alleles at each site must be represented by single byte characters,
5333
5375
  (i.e., variants must be single nucleotide polymorphisms, or SNPs), hence
@@ -5336,8 +5378,8 @@ class TreeSequence:
5336
5378
  haplotype ``h``, the value of ``h[j]`` will therefore be the observed
5337
5379
  allelic state at site ``j``.
5338
5380
 
5339
- If ``isolated_as_missing`` is True (the default), isolated samples without
5340
- mutations directly above them will be treated as
5381
+ If ``isolated_as_missing`` is True (the default), isolated nodes without
5382
+ mutations directly above them (whether samples or non-samples) will be treated as
5341
5383
  :ref:`missing data<sec_data_model_missing_data>` and will be
5342
5384
  represented in the string by the ``missing_data_character``. If
5343
5385
  instead it is set to False, missing data will be assigned the ancestral state
@@ -5346,8 +5388,10 @@ class TreeSequence:
5346
5388
  behaviour in versions prior to 0.2.0. Prior to 0.3.0 the ``impute_missing_data``
5347
5389
  argument controlled this behaviour.
5348
5390
 
5391
+ It is also possible to provide **non-sample** nodes via the ``samples``
5392
+ argument if you wish to output haplotypes for (e.g.) internal nodes.
5349
5393
  See also the :meth:`.variants` iterator for site-centric access
5350
- to sample genotypes.
5394
+ to genotypes for the requested nodes.
5351
5395
 
5352
5396
  .. warning::
5353
5397
  For large datasets, this method can consume a **very large** amount of
@@ -5365,9 +5409,10 @@ class TreeSequence:
5365
5409
  be used to represent missing data.
5366
5410
  If any normal allele contains this character, an error is raised.
5367
5411
  Default: 'N'.
5368
- :param list[int] samples: The samples for which to output haplotypes. If
5369
- ``None`` (default), return haplotypes for all the samples in the tree
5370
- sequence, in the order given by the :meth:`.samples` method.
5412
+ :param list[int] samples: The node IDs for which to output haplotypes. If
5413
+ ``None`` (default), return haplotypes for all the sample nodes in the tree
5414
+ sequence, in the order given by the :meth:`.samples` method. Non-sample
5415
+ nodes may also be provided.
5371
5416
  :param int left: Haplotype strings will start with the first site at or after
5372
5417
  this genomic position. If ``None`` (default) start at the first site.
5373
5418
  :param int right: Haplotype strings will end with the last site before this
@@ -5438,9 +5483,13 @@ class TreeSequence:
5438
5483
  generated; output order of genotypes in the returned variants
5439
5484
  corresponds to the order of the samples in this list. It is also
5440
5485
  possible to provide **non-sample** nodes as an argument here, if you
5441
- wish to generate genotypes for (e.g.) internal nodes. However,
5442
- ``isolated_as_missing`` must be False in this case, as it is not
5443
- possible to detect missing data for non-sample nodes.
5486
+ wish to generate genotypes for (e.g.) internal nodes. Missingness is
5487
+ detected for any requested node (sample or non-sample) when
5488
+ ``isolated_as_missing`` is True: if a node is isolated at a site (i.e.,
5489
+ has no parent and no children in the marginal tree) and has no mutation
5490
+ above it at that site, its genotype will be reported as
5491
+ :data:`MISSING_DATA` (-1). If ``isolated_as_missing`` is False, such
5492
+ nodes are assigned the site's ancestral allele index.
5444
5493
 
5445
5494
  If isolated samples are present at a given site without mutations above them,
5446
5495
  they are interpreted by default as
@@ -5530,19 +5579,23 @@ class TreeSequence:
5530
5579
  """
5531
5580
  Returns an :math:`m \\times n` numpy array of the genotypes in this
5532
5581
  tree sequence, where :math:`m` is the number of sites and :math:`n`
5533
- the number of samples. The genotypes are the indexes into the array
5534
- of ``alleles``, as described for the :class:`Variant` class.
5535
-
5536
- If isolated samples are present at a given site without mutations above them,
5537
- they will be interpreted as :ref:`missing data<sec_data_model_missing_data>`
5538
- the genotypes array will contain a special value :data:`MISSING_DATA`
5539
- (-1) to identify these missing samples.
5540
-
5541
- Such samples are treated as missing data by default, but if
5542
- ``isolated_as_missing`` is set to to False, they will not be treated as missing,
5543
- and so assigned the ancestral state. This was the default behaviour in
5544
- versions prior to 0.2.0. Prior to 0.3.0 the `impute_missing_data`
5545
- argument controlled this behaviour.
5582
+ is the number of requested nodes (default: the number of sample nodes).
5583
+ The genotypes are the indexes into the array of ``alleles``, as
5584
+ described for the :class:`Variant` class.
5585
+
5586
+ It is possible to provide **non-sample** nodes via the ``samples``
5587
+ argument if you wish to generate genotypes for (e.g.) internal nodes.
5588
+ Missingness is detected for any requested node (sample or non-sample)
5589
+ when ``isolated_as_missing`` is True: if a node is isolated at a site
5590
+ (i.e., has no parent and no children in the marginal tree) and has no
5591
+ mutation above it at that site, its genotype will be reported as
5592
+ :data:`MISSING_DATA` (-1).
5593
+
5594
+ Such nodes are treated as missing data by default. If
5595
+ ``isolated_as_missing`` is set to False, they will not be treated as
5596
+ missing, and will instead be assigned the ancestral state. This was the
5597
+ default behaviour in versions prior to 0.2.0. Prior to 0.3.0 the
5598
+ ``impute_missing_data`` argument controlled this behaviour.
5546
5599
 
5547
5600
  .. warning::
5548
5601
  This method can consume a **very large** amount of memory! If
@@ -5550,10 +5603,12 @@ class TreeSequence:
5550
5603
  access them sequentially using the :meth:`.variants` iterator.
5551
5604
 
5552
5605
  :param array_like samples: An array of node IDs for which to generate
5553
- genotypes, or None for all sample nodes. Default: None.
5606
+ genotypes. If ``None`` (default), generate genotypes for all sample
5607
+ nodes. Non-sample nodes may also be provided, in which case genotypes
5608
+ will be generated for those nodes too.
5554
5609
  :param bool isolated_as_missing: If True, the genotype value assigned to
5555
- missing samples (i.e., isolated samples without mutations) is
5556
- :data:`.MISSING_DATA` (-1). If False, missing samples will be
5610
+ isolated nodes without mutations (samples or non-samples) is
5611
+ :data:`.MISSING_DATA` (-1). If False, such nodes will be
5557
5612
  assigned the allele index for the ancestral state.
5558
5613
  Default: True.
5559
5614
  :param tuple alleles: A tuple of strings describing the encoding of
@@ -5602,21 +5657,24 @@ class TreeSequence:
5602
5657
  *,
5603
5658
  reference_sequence=None,
5604
5659
  missing_data_character=None,
5660
+ isolated_as_missing=None,
5605
5661
  samples=None,
5606
5662
  left=None,
5607
5663
  right=None,
5608
5664
  ):
5609
5665
  """
5610
5666
  Returns an iterator over the full sequence alignments for the defined samples
5611
- in this tree sequence. Each alignment ``a`` is a string of length ``L`` where
5612
- the first character is the genomic sequence at the ``start`` position in the
5613
- genome (defaulting to 0) and the last character is the genomic sequence one
5614
- position before the ``stop`` value (defaulting to the :attr:`.sequence_length`
5615
- of this tree sequence, which must have :attr:`.discrete_genome` equal to True).
5616
- By default ``L`` is therefore equal to the :attr:`.sequence_length`,
5617
- and ``a[j]`` is the nucleotide value at genomic position ``j``.
5618
-
5619
- .. note:: This is inherently a **zero-based** representation of the sequence
5667
+ in this tree sequence. Each yielded alignment ``a`` is a string of length
5668
+ ``L`` where the first character is the genomic sequence at the ``left``
5669
+ position in the genome (defaulting to 0) and the last character is the
5670
+ genomic sequence one position before the ``right`` value (defaulting to the
5671
+ :attr:`.sequence_length` of this tree sequence, which must have
5672
+ :attr:`.discrete_genome` equal to True). By default ``L`` is therefore equal
5673
+ to the :attr:`.sequence_length`, and ``a[j]`` is the nucleotide value at
5674
+ genomic position ``j``.
5675
+
5676
+ .. note::
5677
+ This is inherently a **zero-based** representation of the sequence
5620
5678
  coordinate space. Care will be needed when interacting with other
5621
5679
  libraries and upstream coordinate spaces.
5622
5680
 
@@ -5665,31 +5723,44 @@ class TreeSequence:
5665
5723
  single byte characters, (i.e., variants must be single nucleotide
5666
5724
  polymorphisms, or SNPs).
5667
5725
 
5668
- .. warning:: :ref:`Missing data<sec_data_model_missing_data>` is not
5669
- currently supported by this method and it will raise a ValueError
5670
- if called on tree sequences containing isolated samples.
5671
- See https://github.com/tskit-dev/tskit/issues/1896 for more
5672
- information.
5726
+ Missing data handling
5727
+
5728
+ - If ``isolated_as_missing=True`` (default), nodes that are isolated
5729
+ (no parent and no children) are rendered as the missing character across
5730
+ each tree interval. At site positions, the per-site allele overrides the
5731
+ missing character; if a genotype is missing (``-1``), the missing
5732
+ character is retained.
5733
+ - If ``isolated_as_missing=False``, no missing overlay is applied. At sites,
5734
+ genotypes are decoded as usual; at non-sites, bases come from the
5735
+ reference sequence.
5673
5736
 
5674
5737
  See also the :meth:`.variants` iterator for site-centric access
5675
5738
  to sample genotypes and :meth:`.haplotypes` for access to sample sequences
5676
5739
  at just the sites in the tree sequence.
5677
5740
 
5678
5741
  :param str reference_sequence: The reference sequence to fill in
5679
- gaps between sites in the alignments.
5742
+ gaps between sites in the alignments. If provided, it must be a
5743
+ string of length equal to :attr:`.sequence_length`; the sequence is
5744
+ sliced internally to the requested ``[left, right)`` interval.
5680
5745
  :param str missing_data_character: A single ascii character that will
5681
5746
  be used to represent missing data.
5682
5747
  If any normal allele contains this character, an error is raised.
5683
5748
  Default: 'N'.
5684
- :param list[int] samples: The samples for which to output alignments. If
5685
- ``None`` (default), return alignments for all the samples in the tree
5686
- sequence, in the order given by the :meth:`.samples` method.
5749
+ :param bool isolated_as_missing: If True, treat isolated nodes as missing
5750
+ across the covered tree intervals (see above). If None (default), this
5751
+ is treated as True.
5752
+ :param list[int] samples: The nodes for which to output alignments. If
5753
+ ``None`` (default), return alignments for all sample nodes in the order
5754
+ given by the :meth:`.samples` method. Non-sample nodes are also supported
5755
+ and will be decoded at sites in the same way as samples.
5687
5756
  :param int left: Alignments will start at this genomic position. If ``None``
5688
5757
  (default) alignments start at 0.
5689
- :param int right: Alignments will stop before this genomic position. If ``None``
5690
- (default) alignments will continue until the end of the tree sequence.
5758
+ :param int right: Alignments will stop before this genomic position.
5759
+ If ``None`` (default) alignments will continue until the end of the
5760
+ tree sequence.
5691
5761
  :return: An iterator over the alignment strings for specified samples in
5692
- this tree sequence, in the order given in ``samples``.
5762
+ this tree sequence, in the order given in ``samples``. Each string has
5763
+ length ``L = right - left``.
5693
5764
  :rtype: collections.abc.Iterable
5694
5765
  :raises ValueError: if any genome coordinate in this tree sequence is not
5695
5766
  discrete, or if the ``reference_sequence`` is not of the correct length.
@@ -5703,60 +5774,53 @@ class TreeSequence:
5703
5774
  "N" if missing_data_character is None else missing_data_character
5704
5775
  )
5705
5776
 
5706
- L = interval.span
5707
- a = np.empty(L, dtype=np.int8)
5708
- if reference_sequence is None:
5709
- if self.has_reference_sequence():
5710
- # This may be inefficient - see #1989. However, since we're
5711
- # n copies of the reference sequence anyway, this is a relatively
5712
- # minor tweak. We may also want to recode the below not to use direct
5713
- # access to the .data attribute, e.g. if we allow reference sequences
5714
- # to start at non-zero positions
5715
- reference_sequence = self.reference_sequence.data[
5716
- interval.left : interval.right
5717
- ]
5718
- else:
5719
- reference_sequence = missing_data_character * L
5777
+ if isolated_as_missing is None:
5778
+ isolated_as_missing = True
5720
5779
 
5721
- if len(reference_sequence) != L:
5722
- if interval.right == int(self.sequence_length):
5723
- raise ValueError(
5724
- "The reference sequence is shorter than the tree sequence length"
5725
- )
5726
- else:
5780
+ if len(missing_data_character) != 1:
5781
+ raise TypeError("missing_data_character must be a single character")
5782
+
5783
+ # Determine the reference sequence for the whole tree sequence
5784
+ full_ref = None
5785
+ if reference_sequence is not None:
5786
+ full_ref = reference_sequence
5787
+ elif self.has_reference_sequence():
5788
+ # This may be inefficient - see #1989. However, since we're
5789
+ # making n copies of the reference sequence anyway, this is a relatively
5790
+ # minor tweak. We may also want to recode the below not to use direct
5791
+ # access to the .data attribute, e.g. if we allow reference sequences
5792
+ # to start at non-zero positions
5793
+ full_ref = self.reference_sequence.data
5794
+
5795
+ if full_ref is None:
5796
+ full_ref = missing_data_character * int(self.sequence_length)
5797
+ else:
5798
+ if len(full_ref) != int(self.sequence_length):
5727
5799
  raise ValueError(
5728
- "The reference sequence ends before the requested stop position"
5800
+ "The reference sequence must be equal to the tree sequence length"
5729
5801
  )
5730
- ref_bytes = reference_sequence.encode("ascii")
5731
- a[:] = np.frombuffer(ref_bytes, dtype=np.int8)
5732
-
5733
- # To do this properly we'll have to detect the missing data as
5734
- # part of a full implementation of alignments in C. The current
5735
- # definition might not be calling some degenerate cases correctly;
5736
- # see https://github.com/tskit-dev/tskit/issues/1908
5737
- #
5738
- # Note also that this will call the presence of missing data
5739
- # incorrectly if have a sample isolated over the region (a, b],
5740
- # and if we have sites at each position from a to b, and at
5741
- # each site there is a mutation over the isolated sample.
5742
- if any(tree._has_isolated_samples() for tree in self.trees()):
5743
- raise ValueError(
5744
- "Missing data not currently supported in alignments; see "
5745
- "https://github.com/tskit-dev/tskit/issues/1896 for details."
5746
- "The current implementation may also incorrectly identify an "
5747
- "input tree sequence has having missing data."
5748
- )
5749
- H, (first_site_id, last_site_id) = self._haplotypes_array(
5750
- interval=interval,
5751
- missing_data_character=missing_data_character,
5752
- samples=samples,
5802
+
5803
+ try:
5804
+ ref_bytes = full_ref.encode("ascii")
5805
+ missing_data_character.encode("ascii")
5806
+ except UnicodeEncodeError:
5807
+ raise
5808
+
5809
+ sample_ids = self.samples() if samples is None else list(samples)
5810
+
5811
+ flat = self._ll_tree_sequence.decode_alignments(
5812
+ ref_bytes,
5813
+ sample_ids,
5814
+ int(interval.left),
5815
+ int(interval.right),
5816
+ missing_data_character,
5817
+ bool(isolated_as_missing),
5753
5818
  )
5754
- site_pos = self.sites_position.astype(np.int64)[
5755
- first_site_id : last_site_id + 1
5756
- ]
5757
- for h in H:
5758
- a[site_pos - interval.left] = h
5759
- yield a.tobytes().decode("ascii")
5819
+
5820
+ span = int(interval.span)
5821
+ for j in range(len(sample_ids)):
5822
+ offset = j * span
5823
+ yield flat[offset : offset + span].decode("ascii")
5760
5824
 
5761
5825
  @property
5762
5826
  def individuals_population(self):
@@ -6469,6 +6533,9 @@ class TreeSequence:
6469
6533
  samples = self._ll_tree_sequence.get_samples()
6470
6534
  keep = np.full(shape=samples.shape, fill_value=True)
6471
6535
  if population is not None:
6536
+ if not isinstance(population, numbers.Integral):
6537
+ raise ValueError("`population` must be an integer ID")
6538
+ population = int(population)
6472
6539
  sample_population = self.nodes_population[samples]
6473
6540
  keep = np.logical_and(keep, sample_population == population)
6474
6541
  if time is not None:
@@ -6581,13 +6648,13 @@ class TreeSequence:
6581
6648
  to the sites in the tree sequence object.
6582
6649
 
6583
6650
  .. note::
6584
- Older code often uses the ``ploidy=2`` argument, because old
6585
- versions of msprime did not output individual data. Specifying
6586
- individuals in the tree sequence is more robust, and since tree
6587
- sequences now typically contain individuals (e.g., as produced by
6588
- ``msprime.sim_ancestry( )``), this is not necessary, and the
6589
- ``ploidy`` argument can safely be removed as part of the process
6590
- of updating from the msprime 0.x legacy API.
6651
+ Older code often uses the ``ploidy=2`` argument, because old
6652
+ versions of msprime did not output individual data. Specifying
6653
+ individuals in the tree sequence is more robust, and since tree
6654
+ sequences now typically contain individuals (e.g., as produced by
6655
+ ``msprime.sim_ancestry( )``), this is not necessary, and the
6656
+ ``ploidy`` argument can safely be removed as part of the process
6657
+ of updating from the msprime 0.x legacy API.
6591
6658
 
6592
6659
  :param io.IOBase output: The file-like object to write the VCF output.
6593
6660
  :param int ploidy: The ploidy of the individuals to be written to
@@ -6672,6 +6739,7 @@ class TreeSequence:
6672
6739
  wrap_width=60,
6673
6740
  reference_sequence=None,
6674
6741
  missing_data_character=None,
6742
+ isolated_as_missing=None,
6675
6743
  ):
6676
6744
  """
6677
6745
  Writes the :meth:`.alignments` for this tree sequence to file in
@@ -6696,12 +6764,6 @@ class TreeSequence:
6696
6764
 
6697
6765
  ts.write_fasta("output.fa")
6698
6766
 
6699
- .. warning:: :ref:`Missing data<sec_data_model_missing_data>` is not
6700
- currently supported by this method and it will raise a ValueError
6701
- if called on tree sequences containing isolated samples.
6702
- See https://github.com/tskit-dev/tskit/issues/1896 for more
6703
- information.
6704
-
6705
6767
  :param file_or_path: The file object or path to write the output.
6706
6768
  Paths can be either strings or :class:`python:pathlib.Path` objects.
6707
6769
  :param int wrap_width: The number of sequence
@@ -6710,6 +6772,7 @@ class TreeSequence:
6710
6772
  (Default=60).
6711
6773
  :param str reference_sequence: As for the :meth:`.alignments` method.
6712
6774
  :param str missing_data_character: As for the :meth:`.alignments` method.
6775
+ :param bool isolated_as_missing: As for the :meth:`.alignments` method.
6713
6776
  """
6714
6777
  text_formats.write_fasta(
6715
6778
  self,
@@ -6717,6 +6780,7 @@ class TreeSequence:
6717
6780
  wrap_width=wrap_width,
6718
6781
  reference_sequence=reference_sequence,
6719
6782
  missing_data_character=missing_data_character,
6783
+ isolated_as_missing=isolated_as_missing,
6720
6784
  )
6721
6785
 
6722
6786
  def as_fasta(self, **kwargs):
@@ -6740,6 +6804,7 @@ class TreeSequence:
6740
6804
  include_alignments=None,
6741
6805
  reference_sequence=None,
6742
6806
  missing_data_character=None,
6807
+ isolated_as_missing=None,
6743
6808
  ):
6744
6809
  """
6745
6810
  Returns a `nexus encoding <https://en.wikipedia.org/wiki/Nexus_file>`_
@@ -6823,10 +6888,7 @@ class TreeSequence:
6823
6888
  as our convention of using trees with multiple roots
6824
6889
  is not often supported by newick parsers. Thus, the method
6825
6890
  will raise a ValueError if we try to output trees with
6826
- multiple roots. Additionally, missing data
6827
- is not currently supported for alignment data.
6828
- See https://github.com/tskit-dev/tskit/issues/1896 for more
6829
- information.
6891
+ multiple roots.
6830
6892
 
6831
6893
  .. seealso:: See also the :meth:`.as_nexus` method which will
6832
6894
  return this nexus representation as a string.
@@ -6841,6 +6903,7 @@ class TreeSequence:
6841
6903
  :param str reference_sequence: As for the :meth:`.alignments` method.
6842
6904
  :param str missing_data_character: As for the :meth:`.alignments` method,
6843
6905
  but defaults to "?".
6906
+ :param bool isolated_as_missing: As for the :meth:`.alignments` method.
6844
6907
  :return: A nexus representation of this :class:`TreeSequence`
6845
6908
  :rtype: str
6846
6909
  """
@@ -6852,6 +6915,7 @@ class TreeSequence:
6852
6915
  include_alignments=include_alignments,
6853
6916
  reference_sequence=reference_sequence,
6854
6917
  missing_data_character=missing_data_character,
6918
+ isolated_as_missing=isolated_as_missing,
6855
6919
  )
6856
6920
 
6857
6921
  def as_nexus(self, **kwargs):
@@ -7198,19 +7262,32 @@ class TreeSequence:
7198
7262
  self, *args, node_mappings=None, record_provenance=True, add_populations=None
7199
7263
  ):
7200
7264
  r"""
7201
- Concatenate a set of tree sequences to the right of this one, by repeatedly
7202
- calling :meth:`~TreeSequence.union` with an (optional)
7203
- node mapping for each of the ``others``. If any node mapping is ``None``
7204
- only map the sample nodes between the input tree sequence and this one,
7205
- based on the numerical order of sample node IDs.
7265
+ Concatenate a set of tree sequences to the right of this one, by shifting
7266
+ their coordinate systems and adding all edges, sites, mutations, and
7267
+ any additional nodes, individuals, or populations needed for these.
7268
+ Concretely, to concatenate an ``other`` tree sequence to ``self``, the value
7269
+ of ``self.sequence_length`` is added to all genomic coordinates in ``other``,
7270
+ and then the concatenated tree sequence will contain all edges, sites, and
7271
+ mutations in both. Which nodes in ``other`` are treated as "new", and hence
7272
+ added as well, is controlled by ``node_mappings``. Any individuals to which
7273
+ new nodes belong are added as well.
7274
+
7275
+ The method uses :meth:`.shift` followed by :meth:`.union`, with
7276
+ ``all_mutations=True``, ``all_edges=True``, and ``check_shared_equality=False``.
7277
+
7278
+ By default, the samples in current and input tree sequences are assumed to
7279
+ refer to the same nodes, and are matched based on the numerical order of
7280
+ sample node IDs; all other nodes are assumed to be new. This can be
7281
+ changed by providing explicit ``node_mappings`` for each input tree sequence
7282
+ (see below).
7206
7283
 
7207
7284
  .. note::
7208
- To add gaps between the concatenated tables, use :meth:`shift` or
7209
- to remove gaps, use :meth:`trim` before concatenating.
7285
+ To add gaps between the concatenated tree sequences, use :meth:`shift`
7286
+ or to remove gaps, use :meth:`trim` before concatenating.
7210
7287
 
7211
7288
  :param TreeSequence \*args: A list of other tree sequences to append to
7212
7289
  the right of this one.
7213
- :param Union[list, None] node_mappings: An list of node mappings for each
7290
+ :param Union[list, None] node_mappings: A list of node mappings for each
7214
7291
  input tree sequence in ``args``. Each should either be an array of
7215
7292
  integers of the same length as the number of nodes in the equivalent
7216
7293
  input tree sequence (see :meth:`~TreeSequence.union` for details), or
@@ -7252,6 +7329,8 @@ class TreeSequence:
7252
7329
  other_tables,
7253
7330
  node_mapping=node_mapping,
7254
7331
  check_shared_equality=False, # Else checks fail with internal samples
7332
+ all_mutations=True,
7333
+ all_edges=True,
7255
7334
  record_provenance=False,
7256
7335
  add_populations=add_populations,
7257
7336
  )
@@ -7340,7 +7419,7 @@ class TreeSequence:
7340
7419
  is its associated ``time`` value, or the time of its node if the
7341
7420
  mutation's time was marked as unknown (:data:`UNKNOWN_TIME`).
7342
7421
 
7343
- Migrations are not supported, and a LibraryError will be raise if
7422
+ Migrations are not supported, and a LibraryError will be raised if
7344
7423
  called on a tree sequence containing migration information.
7345
7424
 
7346
7425
  .. seealso:: This method is implemented using the :meth:`.split_edges`
@@ -7376,7 +7455,9 @@ class TreeSequence:
7376
7455
  `n` to `c` are extended, and the span of the edge from `p` to `c` is
7377
7456
  reduced. Thus, the ancestral haplotype represented by `n` is extended
7378
7457
  to a longer span of the genome. However, any edges whose child node is
7379
- a sample are not modified.
7458
+ a sample are not modified. See
7459
+ `Fritze et al. (2025) <https://doi.org/10.1093/genetics/iyaf198>`_
7460
+ for more details.
7380
7461
 
7381
7462
  Since some edges may be removed entirely, this process usually reduces
7382
7463
  the number of edges in the tree sequence.
@@ -7399,15 +7480,15 @@ class TreeSequence:
7399
7480
  known mutation times. See :meth:`.impute_unknown_mutations_time` if
7400
7481
  mutation times are not known.
7401
7482
 
7402
- The method will not affect the marginal trees (so, if the original tree
7403
- sequence was simplified, then following up with `simplify` will recover
7404
- the original tree sequence, possibly with edges in a different order).
7405
- It will also not affect the genotype matrix, or any of the tables other
7406
- than the edge table or the node column in the mutation table.
7483
+ .. note::
7484
+ The method will not affect the marginal trees (so, if the original tree
7485
+ sequence was simplified, then following up with `simplify` will recover
7486
+ the original tree sequence, possibly with edges in a different order).
7487
+ It will also not affect the genotype matrix, or any of the tables other
7488
+ than the edge table or the node column in the mutation table.
7407
7489
 
7408
- :param int max_iters: The maximum number of iterations over the tree
7490
+ :param int max_iter: The maximum number of iterations over the tree
7409
7491
  sequence. Defaults to 10.
7410
-
7411
7492
  :return: A new tree sequence with unary nodes extended.
7412
7493
  :rtype: tskit.TreeSequence
7413
7494
  """
@@ -7432,11 +7513,15 @@ class TreeSequence:
7432
7513
  the ancestry of these nodes - for that, see :meth:`.simplify`.
7433
7514
 
7434
7515
  This has the side effect that it may change the order of the nodes,
7435
- individuals, populations, and migrations in the tree sequence: the nodes
7436
- in the new tree sequence will be in the order provided in ``nodes``, and
7437
- both individuals and populations will be ordered by the earliest retained
7438
- node that refers to them. (However, ``reorder_populations`` may be set to
7439
- False to keep the population table unchanged.)
7516
+ populations, individuals, and migrations in the tree sequence. Nodes
7517
+ in the new tree sequence will be in the order provided in ``nodes``.
7518
+ Populations will be ordered in ascending order of the lowest ID of
7519
+ the nodes that refer to them. Individuals will be not only ordered
7520
+ so that :attr:`~Individual.parents` come before children (see
7521
+ :meth:`~TableCollection.sort_individuals`) but in addition
7522
+ will be secondarily sorted in ascending order of the lowest ID of
7523
+ their referring nodes. (However, ``reorder_populations`` may be set
7524
+ to ``False`` to keep the population table unchanged.)
7440
7525
 
7441
7526
  By default, the method removes all individuals and populations not
7442
7527
  referenced by any nodes, and all sites not referenced by any mutations.
@@ -7480,6 +7565,9 @@ class TreeSequence:
7480
7565
  check_shared_equality=True,
7481
7566
  add_populations=True,
7482
7567
  record_provenance=True,
7568
+ *,
7569
+ all_edges=False,
7570
+ all_mutations=False,
7483
7571
  ):
7484
7572
  """
7485
7573
  Returns an expanded tree sequence which contains the node-wise union of
@@ -7495,8 +7583,8 @@ class TreeSequence:
7495
7583
  1. Individuals whose nodes are new to ``self``.
7496
7584
  2. Edges whose parent or child are new to ``self``.
7497
7585
  3. Mutations whose nodes are new to ``self``.
7498
- 4. Sites which were not present in ``self``, if the site contains a newly
7499
- added mutation.
7586
+ 4. Sites whose positions are not present in the site positions in
7587
+ ``self``, if the site contains a newly added mutation.
7500
7588
 
7501
7589
  This can be thought of as a "node-wise" union: for instance, it can not
7502
7590
  be used to add new edges between two nodes already in ``self`` or new
@@ -7513,17 +7601,47 @@ class TreeSequence:
7513
7601
  nodes are in entirely new populations, then you must set up the
7514
7602
  population table first, and then union with ``add_populations=False``.
7515
7603
 
7516
- If the resulting tree sequence is invalid (for instance, a node is
7517
- specified to have two distinct parents on the same interval),
7518
- an error will be raised.
7604
+ This method makes sense if the "shared" portions of the tree sequences
7605
+ are equal; the option ``check_shared_equality`` performs a consistency
7606
+ check that this is true. If this check is disabled, it is very easy to
7607
+ produce nonsensical results via subtle inconsistencies.
7608
+
7609
+ The behavior above can be changed by ``all_edges`` and ``all_mutations``.
7610
+ If ``all_edges`` is True, then all edges in ``other`` are added to
7611
+ ``self``, instead of only edges adjacent to added nodes. If
7612
+ ``all_mutations`` is True, then similarly all mutations in ``other``
7613
+ are added (not just those on added nodes); furthermore, all sites
7614
+ at positions without a site already present are added to ``self``.
7615
+ The intended use case for these options is a "disjoint" union,
7616
+ where for instance the two tree sequences contain information about
7617
+ disjoint segments of the genome (see :meth:`.concatenate`).
7618
+ For some such applications it may be necessary to set
7619
+ ``check_shared_equality=False``: for instance, if ``other`` has
7620
+ an identical copy of the node table but no edges, then
7621
+ ``all_mutations=True, check_shared_equality=False`` can be used
7622
+ to add mutations to ``self``.
7519
7623
 
7520
- Note that this operation also sorts the resulting tables, so the
7521
- resulting tree sequence may not be equal to ``self`` even if nothing
7522
- new was added (although it would differ only in ordering of the tables).
7624
+ .. warning::
7625
+ If an equivalent node is specified in ``other``, the
7626
+ version in ``self`` is used without checking the node
7627
+ properties are the same. Similarly, if the same site position
7628
+ is present in both ``self`` and ``other``, the version in
7629
+ ``self`` is used without checking that site properties are
7630
+ the same. In these cases metadata and e.g. node times or ancestral
7631
+ states in ``other`` are simply ignored.
7632
+
7633
+ .. note::
7634
+ This operation also sorts the resulting tables, so the resulting
7635
+ tree sequence may not be equal to ``self`` even if nothing new
7636
+ was added (although it would differ only in ordering of the tables).
7523
7637
 
7524
- :param TableCollection other: Another table collection.
7638
+ :param TreeSequence other: Another tree sequence.
7525
7639
  :param list node_mapping: An array of node IDs that relate nodes in
7526
7640
  ``other`` to nodes in ``self``.
7641
+ :param bool all_edges: If True, then all edges in ``other`` are added
7642
+ to ``self``.
7643
+ :param bool all_mutations: If True, then all mutations and sites in
7644
+ ``other`` are added to ``self``.
7527
7645
  :param bool check_shared_equality: If True, the shared portions of the
7528
7646
  tree sequences will be checked for equality. It does so by
7529
7647
  running :meth:`TreeSequence.subset` on both ``self`` and ``other``
@@ -7533,6 +7651,11 @@ class TreeSequence:
7533
7651
  assigned new population IDs.
7534
7652
  :param bool record_provenance: Whether to record a provenance entry
7535
7653
  in the provenance table for this operation.
7654
+ :return: The union of the two tree sequences.
7655
+ :rtype: tskit.TreeSequence
7656
+ :raises: **tskit.LibraryError** -- If the resulting tree sequence is invalid
7657
+ (for instance, a node is specified to have two distinct
7658
+ parents on the same interval)
7536
7659
  """
7537
7660
  tables = self.dump_tables()
7538
7661
  other_tables = other.dump_tables()
@@ -7542,6 +7665,8 @@ class TreeSequence:
7542
7665
  check_shared_equality=check_shared_equality,
7543
7666
  add_populations=add_populations,
7544
7667
  record_provenance=record_provenance,
7668
+ all_edges=all_edges,
7669
+ all_mutations=all_mutations,
7545
7670
  )
7546
7671
  return tables.tree_sequence()
7547
7672
 
@@ -8611,52 +8736,6 @@ class TreeSequence:
8611
8736
  sizes = np.array(sizes, dtype=size_dtype)
8612
8737
  return flat, sizes
8613
8738
 
8614
- # def divergence_matrix(self, sample_sets, windows=None, mode="site"):
8615
- # """
8616
- # Finds the mean divergence between pairs of samples from each set of
8617
- # samples and in each window. Returns a numpy array indexed by (window,
8618
- # sample_set, sample_set). Diagonal entries are corrected so that the
8619
- # value gives the mean divergence for *distinct* samples, but it is not
8620
- # checked whether the sample_sets are disjoint (so offdiagonals are not
8621
- # corrected). For this reason, if an element of `sample_sets` has only
8622
- # one element, the corresponding diagonal will be NaN.
8623
-
8624
- # The mean divergence between two samples is defined to be the mean: (as
8625
- # a TreeStat) length of all edges separating them in the tree, or (as a
8626
- # SiteStat) density of segregating sites, at a uniformly chosen position
8627
- # on the genome.
8628
-
8629
- # :param list sample_sets: A list of sets of IDs of samples.
8630
- # :param iterable windows: The breakpoints of the windows (including start
8631
- # and end, so has one more entry than number of windows).
8632
- # :return: A list of the upper triangle of mean TMRCA values in row-major
8633
- # order, including the diagonal.
8634
- # """
8635
- # ns = len(sample_sets)
8636
- # indexes = [(i, j) for i in range(ns) for j in range(i, ns)]
8637
- # x = self.divergence(sample_sets, indexes, windows, mode=mode)
8638
- # nw = len(windows) - 1
8639
- # A = np.ones((nw, ns, ns), dtype=float)
8640
- # for w in range(nw):
8641
- # k = 0
8642
- # for i in range(ns):
8643
- # for j in range(i, ns):
8644
- # A[w, i, j] = A[w, j, i] = x[w][k]
8645
- # k += 1
8646
- # return A
8647
- # NOTE: see older definition of divmat here, which may be useful when documenting
8648
- # this function. See https://github.com/tskit-dev/tskit/issues/2781
8649
-
8650
- # NOTE for documentation of sample_sets. We *must* use samples currently because
8651
- # the normalisation for non-sample nodes is tricky. Do we normalise by the
8652
- # total span of the ts where the node is 'present' in the tree? We avoid this
8653
- # by insisting on sample nodes.
8654
-
8655
- # NOTE for documentation of num_threads. Need to explain that the
8656
- # its best to think of as the number of background *worker* threads.
8657
- # default is to run without any worker threads. If you want to run
8658
- # with all the cores on the machine, use num_threads=os.cpu_count().
8659
-
8660
8739
  def divergence_matrix(
8661
8740
  self,
8662
8741
  sample_sets=None,
@@ -8666,6 +8745,41 @@ class TreeSequence:
8666
8745
  mode=None,
8667
8746
  span_normalise=True,
8668
8747
  ):
8748
+ """
8749
+ Finds the matrix of pairwise :meth:`.divergence` values between groups
8750
+ of sample nodes. Returns a numpy array indexed by (window,
8751
+ sample_set, sample_set): the [k,i,j]th value of the result gives the
8752
+ mean divergence between pairs of samples from the i-th and j-th
8753
+ sample sets in the k-th window. As for :meth:`.divergence`,
8754
+ diagonal entries are corrected so that the
8755
+ value gives the mean divergence for *distinct* samples,
8756
+ and so diagonal entries are given by the :meth:`.diversity` of that
8757
+ sample set. For this reason, if an element of `sample_sets` has only
8758
+ one element, the corresponding :meth:`.diversity` will be NaN.
8759
+ However, this method will place a value of 0 in the diagonal instead of NaN
8760
+ in such cases; otherwise, this is equivalent to computing values with
8761
+ :meth:`.divergence`.
8762
+ However, this is (usually) more efficient than computing many
8763
+ pairwise values using the `indexes` argument to :meth:`.divergence`,
8764
+ so see :meth:`.divergence` for a description of what exactly is computed.
8765
+
8766
+ :param list sample_sets: A list of sets of IDs of samples.
8767
+ :param list windows: The breakpoints of the windows (including start
8768
+ and end, so has one more entry than number of windows).
8769
+ :param str mode: A string giving the "type" of the statistic to be computed
8770
+ (defaults to "site"; the other option is "branch").
8771
+ :return: An array indexed by (window, sample_set, sample_set), or if windows is
8772
+ `None`, an array indexed by (sample_set, sample_set).
8773
+ """
8774
+ # NOTE for documentation of sample_sets. We *must* use samples currently because
8775
+ # the normalisation for non-sample nodes is tricky. Do we normalise by the
8776
+ # total span of the ts where the node is 'present' in the tree? We avoid this
8777
+ # by insisting on sample nodes.
8778
+
8779
+ # NOTE for documentation of num_threads. Need to explain that
8780
+ # it is best thought of as the number of background *worker* threads. The
8781
+ # default is to run without any worker threads. If you want to run
8782
+ # with all the cores on the machine, use num_threads=os.cpu_count().
8669
8783
  windows_specified = windows is not None
8670
8784
  windows = self.parse_windows(windows)
8671
8785
  mode = "site" if mode is None else mode
@@ -8873,7 +8987,16 @@ class TreeSequence:
8873
8987
  """
8874
8988
  Computes the full matrix of pairwise genetic relatedness values
8875
8989
  between (and within) pairs of sets of nodes from ``sample_sets``.
8876
- *Warning:* this does not compute exactly the same thing as
8990
+ Returns a numpy array indexed by (window, sample_set, sample_set):
8991
+ the [k,i,j]th value of the result gives the
8992
+ genetic relatedness between pairs of samples from the i-th and j-th
8993
+ sample sets in the k-th window.
8994
+ This is (usually) more efficient than computing many pairwise
8995
+ values using the `indexes` argument to :meth:`.genetic_relatedness`.
8996
+ Specifically, this computes :meth:`.genetic_relatedness` with
8997
+ ``centre=True`` and ``proportion=False`` (with caveats, see below).
8998
+
8999
+ *Warning:* in some cases, this does not compute exactly the same thing as
8877
9000
  :meth:`.genetic_relatedness`: see below for more details.
8878
9001
 
8879
9002
  If `mode="branch"`, then the value obtained is the same as that from
@@ -8881,29 +9004,35 @@ class TreeSequence:
8881
9004
  `proportion=False`. The same is true if `mode="site"` and all sites have
8882
9005
  at most one mutation.
8883
9006
 
8884
- However, if some sites have more than one mutation, the value may differ.
9007
+ However, if some sites have more than one mutation, the value may differ
9008
+ from that given by :meth:`.genetic_relatedness`, although if the proportion
9009
+ of such sites is small, the difference will be small.
8885
9010
  The reason is that this function (for efficiency) computes relatedness
8886
- using :meth:`.divergence` and the following relationship.
9011
+ using :meth:`.divergence_matrix` and the following relationship.
8887
9012
  "Relatedness" measures the number of *shared* alleles (or branches),
8888
9013
  while "divergence" measures the number of *non-shared* alleles (or branches).
8889
9014
  Let :math:`T_i` be the total distance from sample :math:`i` up to the root;
8890
- then if :math:`D_{ij}` is the divergence between :math:`i` and :math:`j`
8891
- and :math:`R_{ij}` is the relatedness between :math:`i` and :math:`j`, then
8892
- :math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
9015
+ then if :math:`D_{ij}` is the branch-mode divergence between :math:`i` and
9016
+ :math:`j` and :math:`R_{ij}` is the branch-mode relatedness between :math:`i`
9017
+ and :math:`j`, then :math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
8893
9018
  So, for any samples :math:`I`, :math:`J`, :math:`S`, :math:`T`
8894
9019
  (that may now be random choices),
8895
9020
  :math:`R_{IJ}-R_{IS}-R_{JT}+R_{ST} = (D_{IJ}-D_{IS}-D_{JT}+D_{ST})/ (-2)`.
8896
- Note, however, that this relationship only holds for `mode="site"`
8897
- if we can treat "number of differing alleles" as distances on the tree;
8898
- this is not necessarily the case in the presence of multiple mutations.
9021
+ This is exactly what we want for (centered) relatedness.
9022
+ However, this relationship does not necessarily hold for `mode="site"`:
9023
+ it does hold if we can treat "number of differing alleles" as distances
9024
+ on the tree, but this is not necessarily the case in the presence of
9025
+ multiple mutations.
8899
9026
 
8900
- Another caveat in the above relationship between :math:`R` and :math:`D`
9027
+ Another note regarding the above relationship between :math:`R` and :math:`D`
8901
9028
  is that :meth:`.divergence` of a sample set to itself does not include
8902
9029
  the "self" comparisons (so as to provide an unbiased estimator of a
8903
9030
  population quantity), while the usual definition of genetic relatedness
8904
9031
  *does* include such comparisons (to provide, for instance, an appropriate
8905
9032
  value for prospective results beginning with only a given set of
8906
- individuals).
9033
+ individuals). So, diagonal entries in the relatedness matrix returned here
9034
+ are obtained from :meth:`divergence_matrix` after first correcting
9035
+ diagonals to include these "self" comparisons.
8907
9036
 
8908
9037
  :param list sample_sets: A list of lists of Node IDs, specifying the
8909
9038
  groups of nodes to compute the statistic with.
@@ -8912,11 +9041,35 @@ class TreeSequence:
8912
9041
  :param str mode: A string giving the "type" of the statistic to be computed
8913
9042
  (defaults to "site").
8914
9043
  :param bool span_normalise: Whether to divide the result by the span of the
8915
- window (defaults to True). Has no effect if ``proportion`` is True.
8916
- :return: A ndarray with shape equal to (num windows, num statistics).
8917
- If there is one pair of sample sets and windows=None, a numpy scalar is
8918
- returned.
8919
- """
9044
+ window (defaults to True).
9045
+ :return: An array indexed by (window, sample_set, sample_set), or if windows is
9046
+ `None`, an array indexed by (sample_set, sample_set).
9047
+ """
9048
+ # Further notes on the relationship between relatedness (R)
9049
+ # and divergence (D) in mode="site":
9050
+ # The summary function for divergence is "p (1-q)",
9051
+ # where p and q are the allele frequencies in the two sample sets;
9052
+ # while for relatedness it is "pq". Summing across *all* alleles,
9053
+ # we get that relatedness plus divergence is
9054
+ # p1 (1-q1) + p1 q1 + ... + pk (1-qk) + pk qk = p1 + ... + pk = 1 .
9055
+ # This implies that
9056
+ # ts.divergence(..., span_normalise=False)
9057
+ # + ts.genetic_relatedness(..., span_normalise=False, centre=False,
9058
+ # proportion=False, polarised=False)
9059
+ # == ts.num_sites
9060
+ # This could be the basis for a similar relationship between R and D.
9061
+ # However, that relationship holds only with polarised=False, which is not
9062
+ # the default, or what this function does (for good reason).
9063
+ # So, without setting polarised=False, we have that for samples i and j,
9064
+ # divergence plus relatedness is equal to (something like)
9065
+ # the total number of sites at which both i and j are ancestral;
9066
+ # this depends on the samples and so does not cancel out of the centred
9067
+ # version. We could work through these relationships to figure out what exactly
9068
+ # the difference between genetic_relatedness_matrix(mode="site") and
9069
+ # genetic_relatedness(mode="site") is, in the general case of multiple
9070
+ # mutations... but that would be confusing, probably not that useful,
9071
+ # and the short version of all this is that "it's complicated".
9072
+
8920
9073
  D = self.divergence_matrix(
8921
9074
  sample_sets,
8922
9075
  windows=windows,
@@ -9088,6 +9241,7 @@ class TreeSequence:
9088
9241
  mode=mode,
9089
9242
  centre=False,
9090
9243
  nodes=indices,
9244
+ span_normalise=False, # <- non-default!
9091
9245
  )[0]
9092
9246
  x = x - x.mean(axis=0) if centre else x
9093
9247
 
@@ -9118,6 +9272,7 @@ class TreeSequence:
9118
9272
  mode=mode,
9119
9273
  centre=False,
9120
9274
  nodes=samples,
9275
+ span_normalise=False, # <- non-default!
9121
9276
  )[0]
9122
9277
 
9123
9278
  def bincount_fn(w):
@@ -9148,23 +9303,28 @@ class TreeSequence:
9148
9303
  eigenvectors of the genetic relatedness matrix, which are obtained by a
9149
9304
  randomized singular value decomposition (rSVD) algorithm.
9150
9305
 
9151
- Concretely, if :math:`M` is the matrix of genetic relatedness values, with
9152
- :math:`M_{ij}` the output of
9153
- :meth:`genetic_relatedness <.TreeSequence.genetic_relatedness>`
9154
- between sample :math:`i` and sample :math:`j`, then by default this returns
9155
- the top ``num_components`` eigenvectors of :math:`M`, so that
9306
+ Concretely, take :math:`M` as the matrix of non-span-normalised
9307
+ genetic relatedness values, for instance obtained by
9308
+ setting :math:`M_{ij}` to be the :meth:`~.TreeSequence.genetic_relatedness`
9309
+ between sample :math:`i` and sample :math:`j` with the specified ``mode``,
9310
+ ``proportion=False`` and ``span_normalise=False``. Then by default this
9311
+ returns the top ``num_components`` eigenvectors of :math:`M`, so that
9156
9312
  ``output.factors[i,k]`` is the position of sample `i` on the `k` th PC.
9157
- If ``samples`` or ``individuals`` are provided, then this does the same thing,
9158
- except with :math:`M_{ij}` either the relatedness between ``samples[i]``
9159
- and ``samples[j]`` or the nodes of ``individuals[i]`` and ``individuals[j]``,
9160
- respectively.
9313
+ If ``samples`` or ``individuals`` are provided, then this does the same
9314
+ thing, except with :math:`M_{ij}` either the relatedness between
9315
+ ``samples[i]`` and ``samples[j]`` or the average relatedness between the
9316
+ nodes of ``individuals[i]`` and ``individuals[j]``, respectively.
9317
+ Factors are normalized to have norm 1, i.e.,
9318
+ ``(output.factors[:,k] ** 2).sum() == 1`` for any ``k``.
9161
9319
 
9162
9320
  The parameters ``centre`` and ``mode`` are passed to
9163
- :meth:`genetic_relatedness <.TreeSequence.genetic_relatedness>`;
9164
- if ``windows`` are provided then PCA is carried out separately in each window.
9165
- If ``time_windows`` is provided, then genetic relatedness is measured using only
9166
- ancestral material within the given time window (see
9167
- :meth:`decapitate <.TreeSequence.decapitate>` for how this is defined).
9321
+ :meth:`~.TreeSequence.genetic_relatedness`: the default ``centre=True`` results
9322
+ in factors whose elements sum to zero; ``mode`` currently only supports the
9323
+ ``"branch"`` setting. If ``windows`` are provided then PCA is carried out
9324
+ separately in each genomic window. If ``time_windows`` is provided, then genetic
9325
+ relatedness is measured using only ancestral material within the given time
9326
+ window (see :meth:`decapitate <.TreeSequence.decapitate>` for how this is
9327
+ defined).
9168
9328
 
9169
9329
  So that the method scales to large tree sequences, the underlying method
9170
9330
  relies on a randomized SVD algorithm, using
@@ -9840,7 +10000,7 @@ class TreeSequence:
9840
10000
  b = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) - (n + 2) / (h * n) + g / h**2
9841
10001
  c = h**2 + g
9842
10002
 
9843
- What is computed for diversity and divergence depends on ``mode``;
10003
+ What is computed for diversity and segregating sites depends on ``mode``;
9844
10004
  see those functions for more details.
9845
10005
 
9846
10006
  :param list sample_sets: A list of lists of Node IDs, specifying the
@@ -9903,6 +10063,11 @@ class TreeSequence:
9903
10063
  What is computed for diversity and divergence depends on ``mode``;
9904
10064
  see those functions for more details.
9905
10065
 
10066
+ For ``mode='site'``, this definition of Fst appears as equation (6) in
10067
+ `Slatkin (1991) <https://doi.org/10.1017/S0016672300029827>`_, and
10068
+ is also found as equation (9) in
10069
+ `Nei (1973) <https://doi.org/10.1073/pnas.70.12.3321>`_.
10070
+
9906
10071
  :param list sample_sets: A list of lists of Node IDs, specifying the
9907
10072
  groups of nodes to compute the statistic with.
9908
10073
  :param list indexes: A list of 2-tuples.
@@ -10324,7 +10489,8 @@ class TreeSequence:
10324
10489
 
10325
10490
  For a precise mathematical definition of GNN, see https://doi.org/10.1101/458067
10326
10491
 
10327
- .. note:: The reference sets need not include all the samples, hence the most
10492
+ .. note::
10493
+ The reference sets need not include all the samples, hence the most
10328
10494
  recent common ancestral node of the reference sets, :math:`a`, need not be
10329
10495
  the immediate ancestor of the focal node. If the reference sets only comprise
10330
10496
  sequences from relatively distant individuals, the GNN statistic may end up
@@ -10436,7 +10602,7 @@ class TreeSequence:
10436
10602
  represented by the tree sequence.
10437
10603
 
10438
10604
  :param list within: A list of node IDs defining set of nodes that
10439
- we finding IBD segments for. If not specified, this defaults to
10605
+ we find IBD segments for. If not specified, this defaults to
10440
10606
  all samples in the tree sequence.
10441
10607
  :param list[list] between: A list of lists of sample node IDs. Given
10442
10608
  two sample sets A and B, only IBD segments will be returned such
@@ -10451,7 +10617,7 @@ class TreeSequence:
10451
10617
  segment) is greater than this value will be included. (Default=0)
10452
10618
  :param bool store_pairs: If True store information separately for each
10453
10619
  pair of samples ``(a, b)`` that are found to be IBD. Otherwise
10454
- store summary information about all sample apirs. (Default=False)
10620
+ store summary information about all sample pairs. (Default=False)
10455
10621
  :param bool store_segments: If True store each IBD segment
10456
10622
  ``(left, right, c)`` and associate it with the corresponding
10457
10623
  sample pair ``(a, b)``. If True, implies ``store_pairs``.
@@ -10882,7 +11048,7 @@ class TreeSequence:
10882
11048
  mapping is created by first checking if the tree sequence contains individuals.
10883
11049
  If it does, the mapping is created using the individuals in the tree sequence.
10884
11050
  By default only the sample nodes of the individuals are included in the mapping,
10885
- unless `include_non_sample_nodes` is set to True, in which case all nodes
11051
+ unless ``include_non_sample_nodes`` is set to True, in which case all nodes
10886
11052
  belonging to the individuals are included. Any individuals without any nodes
10887
11053
  will have no nodes in their row of the mapping, being essentially of zero ploidy.
10888
11054
  If no individuals are present, the mapping is created using only the sample nodes
@@ -10890,20 +11056,22 @@ class TreeSequence:
10890
11056
 
10891
11057
  As the tskit data model allows non-integer positions, site positions and contig
10892
11058
  length are transformed to integer values suitable for VCF output. The
10893
- transformation is done using the `position_transform` function, which must
11059
+ transformation is done using the ``position_transform`` function, which must
10894
11060
  return an integer numpy array the same dimension as the input. By default,
10895
11061
  this is set to ``numpy.round()`` which will round values to the nearest integer.
10896
11062
 
10897
- If neither `name_metadata_key` nor `individual_names` is not specified, the
10898
- individual names are set to "tsk_{individual_id}" for each individual. If
10899
- no individuals are present, the individual names are set to "tsk_{i}" with
10900
- `0 <= i < num_sample_nodes/ploidy`.
11063
+ If neither ``name_metadata_key`` nor ``individual_names`` is specified, the
11064
+ individual names are set to ``"tsk_{individual_id}"`` for each individual. If
11065
+ no individuals are present, the individual names are set to ``"tsk_{i}"`` with
11066
+ ``0 <= i < num_sample_nodes/ploidy``.
10901
11067
 
10902
- A Warning are emmitted if any sample nodes do not have an individual ID.
11068
+ A warning is emitted if any sample nodes do not have an individual ID.
10903
11069
 
10904
11070
  :param list individuals: Specific individual IDs to include in the VCF. If not
10905
11071
  specified and the tree sequence contains individuals, all individuals are
10906
- included at least one node.
11072
+ included that are associated with at least one sample node (or at least one
11073
+ of any node if ``include_non_sample_nodes`` is True), and the mapping arrays
11074
+ will be in ascending order of the ID of the individual in the tree sequence.
10907
11075
  :param int ploidy: The ploidy, or number of nodes per individual. Only used when
10908
11076
  the tree sequence does not contain individuals. Cannot be used if the tree
10909
11077
  sequence contains individuals. Defaults to 1 if not specified.