PyPI - tskit - Versions diffs - 1.0.0b3__cp313-cp313-win_amd64.whl → 1.0.1__cp313-cp313-win_amd64.whl - Mend

tskit 1.0.0b3__cp313-cp313-win_amd64.whl → 1.0.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

_tskit.cp313-win_amd64.pyd +0 -0
tskit/_version.py +1 -1
tskit/drawing.py +2 -4
tskit/genotypes.py +23 -20
tskit/metadata.py +1 -1
tskit/tables.py +51 -26
tskit/text_formats.py +4 -0
tskit/trees.py +413 -245
tskit/util.py +6 -7
{tskit-1.0.0b3.dist-info → tskit-1.0.1.dist-info}/METADATA +8 -8
tskit-1.0.1.dist-info/RECORD +27 -0
{tskit-1.0.0b3.dist-info → tskit-1.0.1.dist-info}/WHEEL +1 -1
tskit-1.0.0b3.dist-info/RECORD +0 -27
{tskit-1.0.0b3.dist-info → tskit-1.0.1.dist-info}/entry_points.txt +0 -0
{tskit-1.0.0b3.dist-info → tskit-1.0.1.dist-info}/licenses/LICENSE +0 -0
{tskit-1.0.0b3.dist-info → tskit-1.0.1.dist-info}/top_level.txt +0 -0

tskit/trees.py CHANGED Viewed

@@ -370,7 +370,11 @@ class Site(util.Dataclass):
     mutations: np.ndarray
     """
     The list of mutations at this site. Mutations within a site are returned in the
-    order they are specified in the underlying :class:`MutationTable`.
+    order they are specified in the underlying :class:`MutationTable`. For canonical
+    (i.e., valid) tables, this means ancestral mutations precede their descendants, so
+    older mutations (as defined by the canonical mutation ordering; see
+    :ref:`sec_mutation_requirements`) appear before younger ones.
     """
     metadata: bytes | dict | None
     """
@@ -571,8 +575,8 @@ class Migration(util.Dataclass):
     """
     id: int  # noqa A003
     """
-    The integer ID of this mutation. Varies from 0 to
-    :attr:`TreeSequence.num_mutations` - 1.
+    The integer ID of this migration. Varies from 0 to
+    :attr:`TreeSequence.num_migrations` - 1.
     """
@@ -770,7 +774,7 @@ class Tree:
         calling the :meth:`TreeSequence.trees` iterator.
         :return: The root threshold.
-        :rtype: :class:`TreeSequence`
+        :rtype: int
         """
         return self._ll_tree.get_root_threshold()
@@ -881,7 +885,8 @@ class Tree:
         :param float position: The position along the sequence length to
             seek to.
-        :raises ValueError: If 0 < position or position >=
+        :raises ValueError: If ``position`` is less than 0 or ``position`` is greater
+            than or equal to
             :attr:`TreeSequence.sequence_length`.
         """
         if position < 0 or position >= self.tree_sequence.sequence_length:
@@ -918,7 +923,7 @@ class Tree:
             the interval :math:`[0, \\text{span})` and the :attr:`~Tree.tree_sequence`
             from which the tree is taken will have its
             :attr:`~tskit.TreeSequence.sequence_length` equal to ``span``.
-        :param: float branch_length: The minimum length of a branch in this tree.
+        :param float branch_length: The minimum length of a branch in this tree.
         :raises ValueError: If the given rank is out of bounds for trees
             with ``num_leaves`` leaves.
         """
@@ -3593,7 +3598,7 @@ def parse_nodes(source, strict=True, encoding="utf8", base64_metadata=True, tabl
     return table
-def parse_edges(source, strict=True, table=None):
+def parse_edges(source, strict=True, table=None, encoding="utf8", base64_metadata=True):
     """
     Parse the specified file-like object containing a whitespace delimited
     description of an edge table and returns the corresponding :class:`EdgeTable`
@@ -3609,6 +3614,9 @@ def parse_edges(source, strict=True, table=None):
         False, a relaxed whitespace splitting algorithm is used.
     :param EdgeTable table: If specified, write the edges into this table. If
         not, create a new :class:`EdgeTable` instance and return.
+    :param str encoding: Encoding used for text representation.
+    :param bool base64_metadata: If True, metadata is encoded using Base64
+        encoding; otherwise, as plain text.
     """
     sep = None
     if strict:
@@ -3620,6 +3628,12 @@ def parse_edges(source, strict=True, table=None):
     right_index = header.index("right")
     parent_index = header.index("parent")
     children_index = header.index("child")
+    metadata_index = None
+    try:
+        metadata_index = header.index("metadata")
+    except ValueError:
+        pass
+    default_metadata = b""
     for line in source:
         tokens = line.rstrip("\n").split(sep)
         if len(tokens) >= 4:
@@ -3627,8 +3641,19 @@ def parse_edges(source, strict=True, table=None):
             right = float(tokens[right_index])
             parent = int(tokens[parent_index])
             children = tuple(map(int, tokens[children_index].split(",")))
+            metadata = default_metadata
+            if metadata_index is not None and metadata_index < len(tokens):
+                metadata = tokens[metadata_index].encode(encoding)
+                if base64_metadata:
+                    metadata = base64.b64decode(metadata)
             for child in children:
-                table.add_row(left=left, right=right, parent=parent, child=child)
+                table.add_row(
+                    left=left,
+                    right=right,
+                    parent=parent,
+                    child=child,
+                    metadata=metadata,
+                )
     return table
@@ -4368,6 +4393,22 @@ class TreeSequence:
         self._ll_tree_sequence.dump_tables(ll_tables)
         return tables.TableCollection(ll_tables=ll_tables)
+    def link_ancestors(self, samples, ancestors):
+        """
+        Equivalent to :meth:`TableCollection.link_ancestors`; see that method for full
+        documentation and parameter semantics.
+        :param list[int] samples: Node IDs to retain as samples.
+        :param list[int] ancestors: Node IDs to treat as ancestors.
+        :return: An :class:`tables.EdgeTable` containing the genealogical links between
+            the supplied ``samples`` and ``ancestors``.
+        :rtype: tables.EdgeTable
+        """
+        samples = util.safe_np_int_cast(samples, np.int32)
+        ancestors = util.safe_np_int_cast(ancestors, np.int32)
+        ll_edge_table = self._ll_tree_sequence.link_ancestors(samples, ancestors)
+        return tables.EdgeTable(ll_table=ll_edge_table)
     def dump_text(
         self,
         nodes=None,
@@ -4767,7 +4808,8 @@ class TreeSequence:
         Returns an iterable sequence of all the :ref:`nodes <sec_node_table_definition>`
         in this tree sequence.
-        .. note:: Although node ids are commonly ordered by node time, this is not a
+        .. note::
+            Although node ids are commonly ordered by node time, this is not a
             formal tree sequence requirement. If you wish to iterate over nodes in
             time order, you should therefore use ``order="timeasc"`` (and wrap the
             resulting sequence in the standard Python :func:`python:reversed` function
@@ -5321,13 +5363,13 @@ class TreeSequence:
         Returns an iterator over the strings of haplotypes that result from
         the trees and mutations in this tree sequence. Each haplotype string
         is guaranteed to be of the same length. A tree sequence with
-        :math:`n` samples and with :math:`s` sites lying between ``left`` and
-        ``right`` will return a total of :math:`n`
-        strings of :math:`s` alleles concatenated together, where an allele
+        :math:`n` requested nodes (default: the number of sample nodes) and with
+        :math:`s` sites lying between ``left`` and ``right`` will return a total
+        of :math:`n` strings of :math:`s` alleles concatenated together, where an allele
         consists of a single ascii character (tree sequences that include alleles
         which are not a single character in length, or where the character is
         non-ascii, will raise an error). The first string returned is the
-        haplotype for the first requested sample, and so on.
+        haplotype for the first requested node, and so on.
         The alleles at each site must be represented by single byte characters,
         (i.e., variants must be single nucleotide polymorphisms, or SNPs), hence
@@ -5336,8 +5378,8 @@ class TreeSequence:
         haplotype ``h``, the value of ``h[j]`` will therefore be the observed
         allelic state at site ``j``.
-        If ``isolated_as_missing`` is True (the default), isolated samples without
-        mutations directly above them will be treated as
+        If ``isolated_as_missing`` is True (the default), isolated nodes without
+        mutations directly above them (whether samples or non-samples) will be treated as
         :ref:`missing data<sec_data_model_missing_data>` and will be
         represented in the string by the ``missing_data_character``. If
         instead it is set to False, missing data will be assigned the ancestral state
@@ -5346,8 +5388,10 @@ class TreeSequence:
         behaviour in versions prior to 0.2.0. Prior to 0.3.0 the `impute_missing_data`
         argument controlled this behaviour.
+        It is also possible to provide **non-sample** nodes via the ``samples``
+        argument if you wish to output haplotypes for (e.g.) internal nodes.
         See also the :meth:`.variants` iterator for site-centric access
-        to sample genotypes.
+        to genotypes for the requested nodes.
         .. warning::
             For large datasets, this method can consume a **very large** amount of
@@ -5365,9 +5409,10 @@ class TreeSequence:
             be used to represent missing data.
             If any normal allele contains this character, an error is raised.
             Default: 'N'.
-        :param list[int] samples: The samples for which to output haplotypes. If
-            ``None`` (default), return haplotypes for all the samples in the tree
-            sequence, in the order given by the :meth:`.samples` method.
+        :param list[int] samples: The node IDs for which to output haplotypes. If
+            ``None`` (default), return haplotypes for all the sample nodes in the tree
+            sequence, in the order given by the :meth:`.samples` method. Non-sample
+            nodes may also be provided.
         :param int left: Haplotype strings will start with the first site at or after
             this genomic position. If ``None`` (default) start at the first site.
         :param int right: Haplotype strings will end with the last site before this
@@ -5438,9 +5483,13 @@ class TreeSequence:
         generated; output order of genotypes in the returned variants
         corresponds to the order of the samples in this list. It is also
         possible to provide **non-sample** nodes as an argument here, if you
-        wish to generate genotypes for (e.g.) internal nodes. However,
-        ``isolated_as_missing`` must be False in this case, as it is not
-        possible to detect missing data for non-sample nodes.
+        wish to generate genotypes for (e.g.) internal nodes. Missingness is
+        detected for any requested node (sample or non-sample) when
+        ``isolated_as_missing`` is True: if a node is isolated at a site (i.e.,
+        has no parent and no children in the marginal tree) and has no mutation
+        above it at that site, its genotype will be reported as
+        :data:`MISSING_DATA` (-1). If ``isolated_as_missing`` is False, such
+        nodes are assigned the site's ancestral allele index.
         If isolated samples are present at a given site without mutations above them,
         they are interpreted by default as
@@ -5530,19 +5579,23 @@ class TreeSequence:
         """
         Returns an :math:`m \\times n` numpy array of the genotypes in this
         tree sequence, where :math:`m` is the number of sites and :math:`n`
-        the number of samples. The genotypes are the indexes into the array
-        of ``alleles``, as described for the :class:`Variant` class.
-        If isolated samples are present at a given site without mutations above them,
-        they will be interpreted as :ref:`missing data<sec_data_model_missing_data>`
-        the genotypes array will contain a special value :data:`MISSING_DATA`
-        (-1) to identify these missing samples.
-        Such samples are treated as missing data by default, but if
-        ``isolated_as_missing`` is set to to False, they will not be treated as missing,
-        and so assigned the ancestral state. This was the default behaviour in
-        versions prior to 0.2.0. Prior to 0.3.0 the `impute_missing_data`
-        argument controlled this behaviour.
+        is the number of requested nodes (default: the number of sample nodes).
+        The genotypes are the indexes into the array of ``alleles``, as
+        described for the :class:`Variant` class.
+        It is possible to provide **non-sample** nodes via the ``samples``
+        argument if you wish to generate genotypes for (e.g.) internal nodes.
+        Missingness is detected for any requested node (sample or non-sample)
+        when ``isolated_as_missing`` is True: if a node is isolated at a site
+        (i.e., has no parent and no children in the marginal tree) and has no
+        mutation above it at that site, its genotype will be reported as
+        :data:`MISSING_DATA` (-1).
+        Such nodes are treated as missing data by default. If
+        ``isolated_as_missing`` is set to False, they will not be treated as
+        missing, and will instead be assigned the ancestral state. This was the
+        default behaviour in versions prior to 0.2.0. Prior to 0.3.0 the
+        ``impute_missing_data`` argument controlled this behaviour.
         .. warning::
             This method can consume a **very large** amount of memory! If
@@ -5550,10 +5603,12 @@ class TreeSequence:
             access them sequentially using the :meth:`.variants` iterator.
         :param array_like samples: An array of node IDs for which to generate
-            genotypes, or None for all sample nodes. Default: None.
+            genotypes. If ``None`` (default), generate genotypes for all sample
+            nodes. Non-sample nodes may also be provided, in which case genotypes
+            will be generated for those nodes too.
         :param bool isolated_as_missing: If True, the genotype value assigned to
-            missing samples (i.e., isolated samples without mutations) is
-            :data:`.MISSING_DATA` (-1). If False, missing samples will be
+            isolated nodes without mutations (samples or non-samples) is
+            :data:`.MISSING_DATA` (-1). If False, such nodes will be
             assigned the allele index for the ancestral state.
             Default: True.
         :param tuple alleles: A tuple of strings describing the encoding of
@@ -5602,21 +5657,24 @@ class TreeSequence:
         *,
         reference_sequence=None,
         missing_data_character=None,
+        isolated_as_missing=None,
         samples=None,
         left=None,
         right=None,
     ):
         """
         Returns an iterator over the full sequence alignments for the defined samples
-        in this tree sequence. Each alignment ``a`` is a string of length ``L`` where
-        the first character is the genomic sequence at the ``start`` position in the
-        genome (defaulting to 0) and the last character is the genomic sequence one
-        position before the ``stop`` value (defaulting to the :attr:`.sequence_length`
-        of this tree sequence, which must have :attr:`.discrete_genome` equal to True).
-        By default ``L`` is therefore equal to the :attr:`.sequence_length`,
-        and ``a[j]`` is the nucleotide value at genomic position ``j``.
-        .. note:: This is inherently a **zero-based** representation of the sequence
+        in this tree sequence. Each yielded alignment ``a`` is a string of length
+        ``L`` where the first character is the genomic sequence at the ``left``
+        position in the genome (defaulting to 0) and the last character is the
+        genomic sequence one position before the ``right`` value (defaulting to the
+        :attr:`.sequence_length` of this tree sequence, which must have
+        :attr:`.discrete_genome` equal to True). By default ``L`` is therefore equal
+        to the :attr:`.sequence_length`, and ``a[j]`` is the nucleotide value at
+        genomic position ``j``.
+        .. note::
+            This is inherently a **zero-based** representation of the sequence
             coordinate space. Care will be needed when interacting with other
             libraries and upstream coordinate spaces.
@@ -5665,31 +5723,44 @@ class TreeSequence:
            single byte characters, (i.e., variants must be single nucleotide
            polymorphisms, or SNPs).
-        .. warning:: :ref:`Missing data<sec_data_model_missing_data>` is not
-           currently supported by this method and it will raise a ValueError
-           if called on tree sequences containing isolated samples.
-           See https://github.com/tskit-dev/tskit/issues/1896 for more
-           information.
+        Missing data handling
+        - If ``isolated_as_missing=True`` (default), nodes that are isolated
+          (no parent and no children) are rendered as the missing character across
+          each tree interval. At site positions, the per-site allele overrides the
+          missing character; if a genotype is missing (``-1``), the missing
+          character is retained.
+        - If ``isolated_as_missing=False``, no missing overlay is applied. At sites,
+          genotypes are decoded as usual; at non-sites, bases come from the
+          reference sequence.
         See also the :meth:`.variants` iterator for site-centric access
         to sample genotypes and :meth:`.haplotypes` for access to sample sequences
         at just the sites in the tree sequence.
         :param str reference_sequence: The reference sequence to fill in
-            gaps between sites in the alignments.
+            gaps between sites in the alignments. If provided, it must be a
+            string of length equal to :attr:`.sequence_length`; the sequence is
+            sliced internally to the requested ``[left, right)`` interval.
         :param str missing_data_character: A single ascii character that will
             be used to represent missing data.
             If any normal allele contains this character, an error is raised.
             Default: 'N'.
-        :param list[int] samples: The samples for which to output alignments. If
-            ``None`` (default), return alignments for all the samples in the tree
-            sequence, in the order given by the :meth:`.samples` method.
+        :param bool isolated_as_missing: If True, treat isolated nodes as missing
+            across the covered tree intervals (see above). If None (default), this
+            is treated as True.
+        :param list[int] samples: The nodes for which to output alignments. If
+            ``None`` (default), return alignments for all sample nodes in the order
+            given by the :meth:`.samples` method. Non-sample nodes are also supported
+            and will be decoded at sites in the same way as samples.
         :param int left: Alignments will start at this genomic position. If ``None``
             (default) alignments start at 0.
-        :param int right: Alignments will stop before this genomic position. If ``None``
-            (default) alignments will continue until the end of the tree sequence.
+        :param int right: Alignments will stop before this genomic position.
+            If ``None`` (default) alignments will continue until the end of the
+            tree sequence.
         :return: An iterator over the alignment strings for specified samples in
-            this tree sequence, in the order given in ``samples``.
+            this tree sequence, in the order given in ``samples``. Each string has
+            length ``L = right - left``.
         :rtype: collections.abc.Iterable
         :raises ValueError: if any genome coordinate in this tree sequence is not
             discrete, or if the ``reference_sequence`` is not of the correct length.
@@ -5703,60 +5774,53 @@ class TreeSequence:
             "N" if missing_data_character is None else missing_data_character
         )
-        L = interval.span
-        a = np.empty(L, dtype=np.int8)
-        if reference_sequence is None:
-            if self.has_reference_sequence():
-                # This may be inefficient - see #1989. However, since we're
-                # n copies of the reference sequence anyway, this is a relatively
-                # minor tweak. We may also want to recode the below not to use direct
-                # access to the .data attribute, e.g. if we allow reference sequences
-                # to start at non-zero positions
-                reference_sequence = self.reference_sequence.data[
-                    interval.left : interval.right
-                ]
-            else:
-                reference_sequence = missing_data_character * L
+        if isolated_as_missing is None:
+            isolated_as_missing = True
-        if len(reference_sequence) != L:
-            if interval.right == int(self.sequence_length):
-                raise ValueError(
-                    "The reference sequence is shorter than the tree sequence length"
-                )
-            else:
+        if len(missing_data_character) != 1:
+            raise TypeError("missing_data_character must be a single character")
+        # Determine the reference sequence for the whole tree sequence
+        full_ref = None
+        if reference_sequence is not None:
+            full_ref = reference_sequence
+        elif self.has_reference_sequence():
+            # This may be inefficient - see #1989. However, since we're making
+            # n copies of the reference sequence anyway, this is a relatively
+            # minor tweak. We may also want to recode the below not to use direct
+            # access to the .data attribute, e.g. if we allow reference sequences
+            # to start at non-zero positions
+            full_ref = self.reference_sequence.data
+        if full_ref is None:
+            full_ref = missing_data_character * int(self.sequence_length)
+        else:
+            if len(full_ref) != int(self.sequence_length):
                 raise ValueError(
-                    "The reference sequence ends before the requested stop position"
+                    "The reference sequence must be equal to the tree sequence length"
                 )
-        ref_bytes = reference_sequence.encode("ascii")
-        a[:] = np.frombuffer(ref_bytes, dtype=np.int8)
-        # To do this properly we'll have to detect the missing data as
-        # part of a full implementation of alignments in C. The current
-        # definition might not be calling some degenerate cases correctly;
-        # see https://github.com/tskit-dev/tskit/issues/1908
-        #
-        # Note also that this will call the presence of missing data
-        # incorrectly if have a sample isolated over the region (a, b],
-        # and if we have sites at each position from a to b, and at
-        # each site there is a mutation over the isolated sample.
-        if any(tree._has_isolated_samples() for tree in self.trees()):
-            raise ValueError(
-                "Missing data not currently supported in alignments; see "
-                "https://github.com/tskit-dev/tskit/issues/1896 for details."
-                "The current implementation may also incorrectly identify an "
-                "input tree sequence has having missing data."
-            )
-        H, (first_site_id, last_site_id) = self._haplotypes_array(
-            interval=interval,
-            missing_data_character=missing_data_character,
-            samples=samples,
+        try:
+            ref_bytes = full_ref.encode("ascii")
+            missing_data_character.encode("ascii")
+        except UnicodeEncodeError:
+            raise
+        sample_ids = self.samples() if samples is None else list(samples)
+        flat = self._ll_tree_sequence.decode_alignments(
+            ref_bytes,
+            sample_ids,
+            int(interval.left),
+            int(interval.right),
+            missing_data_character,
+            bool(isolated_as_missing),
         )
-        site_pos = self.sites_position.astype(np.int64)[
-            first_site_id : last_site_id + 1
-        ]
-        for h in H:
-            a[site_pos - interval.left] = h
-            yield a.tobytes().decode("ascii")
+        span = int(interval.span)
+        for j in range(len(sample_ids)):
+            offset = j * span
+            yield flat[offset : offset + span].decode("ascii")
     @property
     def individuals_population(self):
@@ -6469,6 +6533,9 @@ class TreeSequence:
         samples = self._ll_tree_sequence.get_samples()
         keep = np.full(shape=samples.shape, fill_value=True)
         if population is not None:
+            if not isinstance(population, numbers.Integral):
+                raise ValueError("`population` must be an integer ID")
+            population = int(population)
             sample_population = self.nodes_population[samples]
             keep = np.logical_and(keep, sample_population == population)
         if time is not None:
@@ -6581,13 +6648,13 @@ class TreeSequence:
         to the sites in the tree sequence object.
         .. note::
-           Older code often uses the ``ploidy=2`` argument, because old
-           versions of msprime did not output individual data. Specifying
-           individuals in the tree sequence is more robust, and since tree
-           sequences now  typically contain individuals (e.g., as produced by
-           ``msprime.sim_ancestry( )``), this is not necessary, and the
-           ``ploidy`` argument can safely be removed as part of the process
-           of updating from the msprime 0.x legacy API.
+            Older code often uses the ``ploidy=2`` argument, because old
+            versions of msprime did not output individual data. Specifying
+            individuals in the tree sequence is more robust, and since tree
+            sequences now typically contain individuals (e.g., as produced by
+            ``msprime.sim_ancestry( )``), this is not necessary, and the
+            ``ploidy`` argument can safely be removed as part of the process
+            of updating from the msprime 0.x legacy API.
         :param io.IOBase output: The file-like object to write the VCF output.
         :param int ploidy: The ploidy of the individuals to be written to
@@ -6672,6 +6739,7 @@ class TreeSequence:
         wrap_width=60,
         reference_sequence=None,
         missing_data_character=None,
+        isolated_as_missing=None,
     ):
         """
         Writes the :meth:`.alignments` for this tree sequence to file in
@@ -6696,12 +6764,6 @@ class TreeSequence:
             ts.write_fasta("output.fa")
-        .. warning:: :ref:`Missing data<sec_data_model_missing_data>` is not
-            currently supported by this method and it will raise a ValueError
-            if called on tree sequences containing isolated samples.
-            See https://github.com/tskit-dev/tskit/issues/1896 for more
-            information.
         :param file_or_path: The file object or path to write the output.
             Paths can be either strings or :class:`python:pathlib.Path` objects.
         :param int wrap_width: The number of sequence
@@ -6710,6 +6772,7 @@ class TreeSequence:
             (Default=60).
         :param str reference_sequence: As for the :meth:`.alignments` method.
         :param str missing_data_character: As for the :meth:`.alignments` method.
+        :param bool isolated_as_missing: As for the :meth:`.alignments` method.
         """
         text_formats.write_fasta(
             self,
@@ -6717,6 +6780,7 @@ class TreeSequence:
             wrap_width=wrap_width,
             reference_sequence=reference_sequence,
             missing_data_character=missing_data_character,
+            isolated_as_missing=isolated_as_missing,
         )
     def as_fasta(self, **kwargs):
@@ -6740,6 +6804,7 @@ class TreeSequence:
         include_alignments=None,
         reference_sequence=None,
         missing_data_character=None,
+        isolated_as_missing=None,
     ):
         """
         Returns a `nexus encoding <https://en.wikipedia.org/wiki/Nexus_file>`_
@@ -6823,10 +6888,7 @@ class TreeSequence:
             as our convention of using trees with multiple roots
             is not often supported by newick parsers. Thus, the method
             will raise a ValueError if we try to output trees with
-            multiple roots. Additionally, missing data
-            is not currently supported for alignment data.
-            See https://github.com/tskit-dev/tskit/issues/1896 for more
-            information.
+            multiple roots.
         .. seealso:: See also the :meth:`.as_nexus` method which will
             return this nexus representation as a string.
@@ -6841,6 +6903,7 @@ class TreeSequence:
         :param str reference_sequence: As for the :meth:`.alignments` method.
         :param str missing_data_character: As for the :meth:`.alignments` method,
             but defaults to "?".
+        :param bool isolated_as_missing: As for the :meth:`.alignments` method.
         :return: A nexus representation of this :class:`TreeSequence`
         :rtype: str
         """
@@ -6852,6 +6915,7 @@ class TreeSequence:
             include_alignments=include_alignments,
             reference_sequence=reference_sequence,
             missing_data_character=missing_data_character,
+            isolated_as_missing=isolated_as_missing,
         )
     def as_nexus(self, **kwargs):
@@ -7198,19 +7262,32 @@ class TreeSequence:
         self, *args, node_mappings=None, record_provenance=True, add_populations=None
     ):
         r"""
-        Concatenate a set of tree sequences to the right of this one, by repeatedly
-        calling :meth:`~TreeSequence.union` with an (optional)
-        node mapping for each of the ``others``. If any node mapping is ``None``
-        only map the sample nodes between the input tree sequence and this one,
-        based on the numerical order of sample node IDs.
+        Concatenate a set of tree sequences to the right of this one, by shifting
+        their coordinate systems and adding all edges, sites, mutations, and
+        any additional nodes, individuals, or populations needed for these.
+        Concretely, to concatenate an ``other`` tree sequence to ``self``, the value
+        of ``self.sequence_length`` is added to all genomic coordinates in ``other``,
+        and then the concatenated tree sequence will contain all edges, sites, and
+        mutations in both. Which nodes in ``other`` are treated as "new", and hence
+        added as well, is controlled by ``node_mappings``. Any individuals to which
+        new nodes belong are added as well.
+        The method uses :meth:`.shift` followed by :meth:`.union`, with
+        ``all_mutations=True``, ``all_edges=True``, and ``check_shared_equality=False``.
+        By default, the samples in current and input tree sequences are assumed to
+        refer to the same nodes, and are matched based on the numerical order of
+        sample node IDs; all other nodes are assumed to be new. This can be
+        changed by providing explicit ``node_mappings`` for each input tree sequence
+        (see below).
         .. note::
-            To add gaps between the concatenated tables, use :meth:`shift` or
-            to remove gaps, use :meth:`trim` before concatenating.
+            To add gaps between the concatenated tree sequences, use :meth:`shift`
+            or to remove gaps, use :meth:`trim` before concatenating.
         :param TreeSequence \*args: A list of other tree sequences to append to
             the right of this one.
-        :param Union[list, None] node_mappings: An list of node mappings for each
+        :param Union[list, None] node_mappings: A list of node mappings for each
             input tree sequence in ``args``. Each should either be an array of
             integers of the same length as the number of nodes in the equivalent
             input tree sequence (see :meth:`~TreeSequence.union` for details), or
@@ -7252,6 +7329,8 @@ class TreeSequence:
                 other_tables,
                 node_mapping=node_mapping,
                 check_shared_equality=False,  # Else checks fail with internal samples
+                all_mutations=True,
+                all_edges=True,
                 record_provenance=False,
                 add_populations=add_populations,
             )
@@ -7340,7 +7419,7 @@ class TreeSequence:
         is its associated ``time`` value, or the time of its node if the
         mutation's time was marked as unknown (:data:`UNKNOWN_TIME`).
-        Migrations are not supported, and a LibraryError will be raise if
+        Migrations are not supported, and a LibraryError will be raised if
         called on a tree sequence containing migration information.
         .. seealso:: This method is implemented using the :meth:`.split_edges`
@@ -7376,7 +7455,9 @@ class TreeSequence:
         `n` to `c` are extended, and the span of the edge from `p` to `c` is
         reduced. Thus, the ancestral haplotype represented by `n` is extended
         to a longer span of the genome. However, any edges whose child node is
-        a sample are not modified.
+        a sample are not modified. See
+        `Fritze et al. (2025) <https://doi.org/10.1093/genetics/iyaf198>`_
+        for more details.
         Since some edges may be removed entirely, this process usually reduces
         the number of edges in the tree sequence.
@@ -7399,15 +7480,15 @@ class TreeSequence:
         known mutation times.  See :meth:`.impute_unknown_mutations_time` if
         mutation times are not known.
-        The method will not affect the marginal trees (so, if the original tree
-        sequence was simplified, then following up with `simplify` will recover
-        the original tree sequence, possibly with edges in a different order).
-        It will also not affect the genotype matrix, or any of the tables other
-        than the edge table or the node column in the mutation table.
+        .. note::
+            The method will not affect the marginal trees (so, if the original tree
+            sequence was simplified, then following up with `simplify` will recover
+            the original tree sequence, possibly with edges in a different order).
+            It will also not affect the genotype matrix, or any of the tables other
+            than the edge table or the node column in the mutation table.
-        :param int max_iters: The maximum number of iterations over the tree
+        :param int max_iter: The maximum number of iterations over the tree
             sequence. Defaults to 10.
         :return: A new tree sequence with unary nodes extended.
         :rtype: tskit.TreeSequence
         """
@@ -7432,11 +7513,15 @@ class TreeSequence:
         the ancestry of these nodes - for that, see :meth:`.simplify`.
         This has the side effect that it may change the order of the nodes,
-        individuals, populations, and migrations in the tree sequence: the nodes
-        in the new tree sequence will be in the order provided in ``nodes``, and
-        both individuals and populations will be ordered by the earliest retained
-        node that refers to them. (However, ``reorder_populations`` may be set to
-        False to keep the population table unchanged.)
+        populations, individuals, and migrations in the tree sequence. Nodes
+        in the new tree sequence will be in the order provided in ``nodes``.
+        Populations will be ordered in ascending order of the lowest ID of
+        the nodes that refer to them. Individuals will be not only ordered
+        so that :attr:`~Individual.parents` come before children (see
+        :meth:`~TableCollection.sort_individuals`) but in addition
+        will be secondarily sorted in ascending order of the lowest ID of
+        their referring nodes. (However, ``reorder_populations`` may be set
+        to ``False`` to keep the population table unchanged.)
         By default, the method removes all individuals and populations not
         referenced by any nodes, and all sites not referenced by any mutations.
@@ -7480,6 +7565,9 @@ class TreeSequence:
         check_shared_equality=True,
         add_populations=True,
         record_provenance=True,
+        *,
+        all_edges=False,
+        all_mutations=False,
     ):
         """
         Returns an expanded tree sequence which contains the node-wise union of
@@ -7495,8 +7583,8 @@ class TreeSequence:
         1. Individuals whose nodes are new to ``self``.
         2. Edges whose parent or child are new to ``self``.
         3. Mutations whose nodes are new to ``self``.
-        4. Sites which were not present in ``self``, if the site contains a newly
-           added mutation.
+        4. Sites whose positions are not present in the site positions in
+           ``self``, if the site contains a newly added mutation.
         This can be thought of as a "node-wise" union: for instance, it can not
         be used to add new edges between two nodes already in ``self`` or new
@@ -7513,17 +7601,47 @@ class TreeSequence:
         nodes are in entirely new populations, then you must set up the
         population table first, and then union with ``add_populations=False``.
-        If the resulting tree sequence is invalid (for instance, a node is
-        specified to have two distinct parents on the same interval),
-        an error will be raised.
+        This method makes sense if the "shared" portions of the tree sequences
+        are equal; the option ``check_shared_equality`` performs a consistency
+        check that this is true. If this check is disabled, it is very easy to
+        produce nonsensical results via subtle inconsistencies.
+        The behavior above can be changed by ``all_edges`` and ``all_mutations``.
+        If ``all_edges`` is True, then all edges in ``other`` are added to
+        ``self``, instead of only edges adjacent to added nodes. If
+        ``all_mutations`` is True, then similarly all mutations in ``other``
+        are added (not just those on added nodes); furthermore, all sites
+        at positions without a site already present are added to ``self``.
+        The intended use case for these options is a "disjoint" union,
+        where for instance the two tree sequences contain information about
+        disjoint segments of the genome (see :meth:`.concatenate`).
+        For some such applications it may be necessary to set
+        ``check_shared_equality=False``: for instance, if ``other`` has
+        an identical copy of the node table but no edges, then
+        ``all_mutations=True, check_shared_equality=False`` can be used
+        to add mutations to ``self``.
-        Note that this operation also sorts the resulting tables, so the
-        resulting tree sequence may not be equal to ``self`` even if nothing
-        new was added (although it would differ only in ordering of the tables).
+        .. warning::
+            If an equivalent node is specified in ``other``, the
+            version in ``self`` is used without checking the node
+            properties are the same. Similarly, if the same site position
+            is present in both ``self`` and ``other``, the version in
+            ``self`` is used without checking that site properties are
+            the same. In these cases metadata and e.g. node times or ancestral
+            states in ``other`` are simply ignored.
+        .. note::
+            This operation also sorts the resulting tables, so the resulting
+            tree sequence may not be equal to ``self`` even if nothing new
+            was added (although it would differ only in ordering of the tables).
-        :param TableCollection other: Another table collection.
+        :param TreeSequence other: Another tree sequence.
         :param list node_mapping: An array of node IDs that relate nodes in
             ``other`` to nodes in ``self``.
+        :param bool all_edges: If True, then all edges in ``other`` are added
+            to ``self``.
+        :param bool all_mutations: If True, then all mutations and sites in
+            ``other`` are added to ``self``.
         :param bool check_shared_equality: If True, the shared portions of the
             tree sequences will be checked for equality. It does so by
             running :meth:`TreeSequence.subset` on both ``self`` and ``other``
@@ -7533,6 +7651,11 @@ class TreeSequence:
             assigned new population IDs.
         :param bool record_provenance: Whether to record a provenance entry
             in the provenance table for this operation.
+        :return: The union of the two tree sequences.
+        :rtype: tskit.TreeSequence
+        :raises: **tskit.LibraryError** -- If the resulting tree sequence is invalid
+            (for instance, a node is specified to have two distinct
+            parents on the same interval)
         """
         tables = self.dump_tables()
         other_tables = other.dump_tables()
@@ -7542,6 +7665,8 @@ class TreeSequence:
             check_shared_equality=check_shared_equality,
             add_populations=add_populations,
             record_provenance=record_provenance,
+            all_edges=all_edges,
+            all_mutations=all_mutations,
         )
         return tables.tree_sequence()
@@ -8611,52 +8736,6 @@ class TreeSequence:
             sizes = np.array(sizes, dtype=size_dtype)
         return flat, sizes
-    # def divergence_matrix(self, sample_sets, windows=None, mode="site"):
-    #     """
-    #     Finds the mean divergence  between pairs of samples from each set of
-    #     samples and in each window. Returns a numpy array indexed by (window,
-    #     sample_set, sample_set).  Diagonal entries are corrected so that the
-    #     value gives the mean divergence for *distinct* samples, but it is not
-    #     checked whether the sample_sets are disjoint (so offdiagonals are not
-    #     corrected).  For this reason, if an element of `sample_sets` has only
-    #     one element, the corresponding diagonal will be NaN.
-    #     The mean divergence between two samples is defined to be the mean: (as
-    #     a TreeStat) length of all edges separating them in the tree, or (as a
-    #     SiteStat) density of segregating sites, at a uniformly chosen position
-    #     on the genome.
-    #     :param list sample_sets: A list of sets of IDs of samples.
-    #     :param iterable windows: The breakpoints of the windows (including start
-    #         and end, so has one more entry than number of windows).
-    #     :return: A list of the upper triangle of mean TMRCA values in row-major
-    #         order, including the diagonal.
-    #     """
-    #     ns = len(sample_sets)
-    #     indexes = [(i, j) for i in range(ns) for j in range(i, ns)]
-    #     x = self.divergence(sample_sets, indexes, windows, mode=mode)
-    #     nw = len(windows) - 1
-    #     A = np.ones((nw, ns, ns), dtype=float)
-    #     for w in range(nw):
-    #         k = 0
-    #         for i in range(ns):
-    #             for j in range(i, ns):
-    #                 A[w, i, j] = A[w, j, i] = x[w][k]
-    #                 k += 1
-    #     return A
-    # NOTE: see older definition of divmat here, which may be useful when documenting
-    # this function. See https://github.com/tskit-dev/tskit/issues/2781
-    # NOTE for documentation of sample_sets. We *must* use samples currently because
-    # the normalisation for non-sample nodes is tricky. Do we normalise by the
-    # total span of the ts where the node is 'present' in the tree? We avoid this
-    # by insisting on sample nodes.
-    # NOTE for documentation of num_threads. Need to explain that the
-    # its best to think of as the number of background *worker* threads.
-    # default is to run without any worker threads. If you want to run
-    # with all the cores on the machine, use num_threads=os.cpu_count().
     def divergence_matrix(
         self,
         sample_sets=None,
@@ -8666,6 +8745,41 @@ class TreeSequence:
         mode=None,
         span_normalise=True,
     ):
+        """
+        Finds the matrix of pairwise :meth:`.divergence` values between groups
+        of sample nodes. Returns a numpy array indexed by (window,
+        sample_set, sample_set): the [k,i,j]th value of the result gives the
+        mean divergence between pairs of samples from the i-th and j-th
+        sample sets in the k-th window. As for :meth:`.divergence`,
+        diagonal entries are corrected so that the
+        value gives the mean divergence for *distinct* samples,
+        and so diagonal entries are given by the :meth:`.diversity` of that
+        sample set.  For this reason, if an element of `sample_sets` has only
+        one element, the corresponding :meth:`.diversity` will be NaN.
+        However, this method will place a value of 0 in the diagonal instead of NaN
+        in such cases; otherwise, this is equivalent to computing values with
+        :meth:`.divergence`.
+        However, this is (usually) more efficient than computing many
+        pairwise values using the `indexes` argument to :meth:`.divergence`,
+        so see :meth:`.divergence` for a description of what exactly is computed.
+        :param list sample_sets: A list of sets of IDs of samples.
+        :param list windows: The breakpoints of the windows (including start
+            and end, so has one more entry than number of windows).
+        :param str mode: A string giving the "type" of the statistic to be computed
+            (defaults to "site"; the other option is "branch").
+        :return: An array indexed by (window, sample_set, sample_set), or if windows is
+            `None`, an array indexed by (sample_set, sample_set).
+        """
+        # NOTE for documentation of sample_sets. We *must* use samples currently because
+        # the normalisation for non-sample nodes is tricky. Do we normalise by the
+        # total span of the ts where the node is 'present' in the tree? We avoid this
+        # by insisting on sample nodes.
+        # NOTE for documentation of num_threads. Need to explain that
+        # it's best to think of this as the number of background *worker* threads.
+        # default is to run without any worker threads. If you want to run
+        # with all the cores on the machine, use num_threads=os.cpu_count().
         windows_specified = windows is not None
         windows = self.parse_windows(windows)
         mode = "site" if mode is None else mode
@@ -8873,7 +8987,16 @@ class TreeSequence:
         """
         Computes the full matrix of pairwise genetic relatedness values
         between (and within) pairs of sets of nodes from ``sample_sets``.
-        *Warning:* this does not compute exactly the same thing as
+        Returns a numpy array indexed by (window, sample_set, sample_set):
+        the [k,i,j]th value of the result gives the
+        genetic relatedness between pairs of samples from the i-th and j-th
+        sample sets in the k-th window.
+        This is (usually) more efficient than computing many pairwise
+        values using the `indexes` argument to :meth:`.genetic_relatedness`.
+        Specifically, this computes :meth:`.genetic_relatedness` with
+        ``centre=True`` and ``proportion=False`` (with caveats, see below).
+        *Warning:* in some cases, this does not compute exactly the same thing as
         :meth:`.genetic_relatedness`: see below for more details.
         If `mode="branch"`, then the value obtained is the same as that from
@@ -8881,29 +9004,35 @@ class TreeSequence:
         `proportion=False`. The same is true if `mode="site"` and all sites have
         at most one mutation.
-        However, if some sites have more than one mutation, the value may differ.
+        However, if some sites have more than one mutation, the value may differ
+        from that given by :meth:`.genetic_relatedness`, although if the proportion
+        of such sites is small, the difference will be small.
         The reason is that this function (for efficiency) computes relatedness
-        using :meth:`.divergence` and the following relationship.
+        using :meth:`.divergence_matrix` and the following relationship.
         "Relatedness" measures the number of *shared* alleles (or branches),
         while "divergence" measures the number of *non-shared* alleles (or branches).
         Let :math:`T_i` be the total distance from sample :math:`i` up to the root;
-        then if :math:`D_{ij}` is the divergence between :math:`i` and :math:`j`
-        and :math:`R_{ij}` is the relatedness between :math:`i` and :math:`j`, then
-        :math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
+        then if :math:`D_{ij}` is the branch-mode divergence between :math:`i` and
+        :math:`j` and :math:`R_{ij}` is the branch-mode relatedness between :math:`i`
+        and :math:`j`, then :math:`T_i + T_j = D_{ij} + 2 R_{ij}.`
         So, for any samples :math:`I`, :math:`J`, :math:`S`, :math:`T`
         (that may now be random choices),
         :math:`R_{IJ}-R_{IS}-R_{JT}+R_{ST} = (D_{IJ}-D_{IS}-D_{JT}+D_{ST})/ (-2)`.
-        Note, however, that this relationship only holds for `mode="site"`
-        if we can treat "number of differing alleles" as distances on the tree;
-        this is not necessarily the case in the presence of multiple mutations.
+        This is exactly what we want for (centered) relatedness.
+        However, this relationship does not necessarily hold for `mode="site"`:
+        it does hold if we can treat "number of differing alleles" as distances
+        on the tree, but this is not necessarily the case in the presence of
+        multiple mutations.
-        Another caveat in the above relationship between :math:`R` and :math:`D`
+        Another note regarding the above relationship between :math:`R` and :math:`D`
         is that :meth:`.divergence` of a sample set to itself does not include
         the "self" comparisons (so as to provide an unbiased estimator of a
         population quantity), while the usual definition of genetic relatedness
         *does* include such comparisons (to provide, for instance, an appropriate
         value for prospective results beginning with only a given set of
-        individuals).
+        individuals). So, diagonal entries in the relatedness matrix returned here
+        are obtained from :meth:`divergence_matrix` after first correcting
+        diagonals to include these "self" comparisons.
         :param list sample_sets: A list of lists of Node IDs, specifying the
             groups of nodes to compute the statistic with.
@@ -8912,11 +9041,35 @@ class TreeSequence:
         :param str mode: A string giving the "type" of the statistic to be computed
             (defaults to "site").
         :param bool span_normalise: Whether to divide the result by the span of the
-            window (defaults to True). Has no effect if ``proportion`` is True.
-        :return: A ndarray with shape equal to (num windows, num statistics).
-            If there is one pair of sample sets and windows=None, a numpy scalar is
-            returned.
-        """
+            window (defaults to True).
+        :return: An array indexed by (window, sample_set, sample_set), or if windows is
+            `None`, an array indexed by (sample_set, sample_set).
+        """
+        # Further notes on the relationship between relatedness (R)
+        # and divergence (D) in mode="site":
+        # The summary function for divergence is "p (1-q)",
+        # where p and q are the allele frequencies in the two sample sets;
+        # while for relatedness it is "pq". Summing across *all* alleles,
+        # we get that relatedness plus divergence is
+        # p1 (1-q1) + p1 q1 + ... + pk (1-qk) + pk qk = p1 + ... + pk = 1 .
+        # This implies that
+        # ts.divergence(..., span_normalise=False)
+        # + ts.genetic_relatedness(..., span_normalise=False, centre=False,
+        #       proportion=False, polarised=False)
+        # == ts.num_sites
+        # This could be the basis for a similar relationship between R and D.
+        # However, that relationship holds only with polarised=False, which is not
+        # the default, or what this function does (for good reason).
+        # So, without setting polarised=False, we have that for samples i and j,
+        # divergence plus relatedness is equal to (something like)
+        # the total number of sites at which both i and j are ancestral;
+        # this depends on the samples and so does not cancel out of the centred
+        # version. We could work through these relationships to figure out what exactly
+        # the difference between genetic_relatedness_matrix(mode="site") and
+        # genetic_relatedness(mode="site") is, in the general case of multiple
+        # mutations... but that would be confusing, probably not that useful,
+        # and the short version of all this is that "it's complicated".
         D = self.divergence_matrix(
             sample_sets,
             windows=windows,
@@ -9088,6 +9241,7 @@ class TreeSequence:
             mode=mode,
             centre=False,
             nodes=indices,
+            span_normalise=False,  # <- non-default!
         )[0]
         x = x - x.mean(axis=0) if centre else x
@@ -9118,6 +9272,7 @@ class TreeSequence:
             mode=mode,
             centre=False,
             nodes=samples,
+            span_normalise=False,  # <- non-default!
         )[0]
         def bincount_fn(w):
@@ -9148,23 +9303,28 @@ class TreeSequence:
         eigenvectors of the genetic relatedness matrix, which are obtained by a
         randomized singular value decomposition (rSVD) algorithm.
-        Concretely, if :math:`M` is the matrix of genetic relatedness values, with
-        :math:`M_{ij}` the output of
-        :meth:`genetic_relatedness <.TreeSequence.genetic_relatedness>`
-        between sample :math:`i` and sample :math:`j`, then by default this returns
-        the top ``num_components`` eigenvectors of :math:`M`, so that
+        Concretely, take :math:`M` as the matrix of non-span-normalised
+        genetic relatedness values, for instance obtained by
+        setting :math:`M_{ij}` to be the :meth:`~.TreeSequence.genetic_relatedness`
+        between sample :math:`i` and sample :math:`j` with the specified ``mode``,
+        ``proportion=False`` and ``span_normalise=False``. Then by default this
+        returns the top ``num_components`` eigenvectors of :math:`M`, so that
         ``output.factors[i,k]`` is the position of sample `i` on the `k` th PC.
-        If ``samples`` or ``individuals`` are provided, then this does the same thing,
-        except with :math:`M_{ij}` either the relatedness between ``samples[i]``
-        and ``samples[j]`` or the nodes of ``individuals[i]`` and ``individuals[j]``,
-        respectively.
+        If ``samples`` or ``individuals`` are provided, then this does the same
+        thing, except with :math:`M_{ij}` either the relatedness between
+        ``samples[i]`` and ``samples[j]`` or the average relatedness between the
+        nodes of ``individuals[i]`` and ``individuals[j]``, respectively.
+        Factors are normalized to have norm 1, i.e.,
+        ``(output.factors[:,k] ** 2).sum() == 1`` for any ``k``.
         The parameters ``centre`` and ``mode`` are passed to
-        :meth:`genetic_relatedness <.TreeSequence.genetic_relatedness>`;
-        if ``windows`` are provided then PCA is carried out separately in each window.
-        If ``time_windows`` is provided, then genetic relatedness is measured using only
-        ancestral material within the given time window (see
-        :meth:`decapitate <.TreeSequence.decapitate>` for how this is defined).
+        :meth:`~.TreeSequence.genetic_relatedness`: the default ``centre=True`` results
+        in factors whose elements sum to zero; ``mode`` currently only supports the
+        ``"branch"`` setting. If ``windows`` are provided then PCA is carried out
+        separately in each genomic window. If ``time_windows`` is provided, then genetic
+        relatedness is measured using only ancestral material within the given time
+        window (see :meth:`decapitate <.TreeSequence.decapitate>` for how this is
+        defined).
         So that the method scales to large tree sequences, the underlying method
         relies on a randomized SVD algorithm, using
@@ -9840,7 +10000,7 @@ class TreeSequence:
             b = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) - (n + 2) / (h * n) + g / h**2
             c = h**2 + g
-        What is computed for diversity and divergence depends on ``mode``;
+        What is computed for diversity and segregating sites depends on ``mode``;
         see those functions for more details.
         :param list sample_sets: A list of lists of Node IDs, specifying the
@@ -9903,6 +10063,11 @@ class TreeSequence:
         What is computed for diversity and divergence depends on ``mode``;
         see those functions for more details.
+        For ``mode='site'``, this definition of Fst appears as equation (6) in
+        `Slatkin (1991) <https://doi.org/10.1017/S0016672300029827>`_, and
+        is also found as equation (9) in
+        `Nei (1973) <https://doi.org/10.1073/pnas.70.12.3321>`_.
         :param list sample_sets: A list of lists of Node IDs, specifying the
             groups of nodes to compute the statistic with.
         :param list indexes: A list of 2-tuples.
@@ -10324,7 +10489,8 @@ class TreeSequence:
         For a precise mathematical definition of GNN, see https://doi.org/10.1101/458067
-        .. note:: The reference sets need not include all the samples, hence the most
+        .. note::
+            The reference sets need not include all the samples, hence the most
             recent common ancestral node of the reference sets, :math:`a`, need not be
             the immediate ancestor of the focal node. If the reference sets only comprise
             sequences from relatively distant individuals, the GNN statistic may end up
@@ -10436,7 +10602,7 @@ class TreeSequence:
         represented by the tree sequence.
         :param list within: A list of node IDs defining set of nodes that
-            we finding IBD segments for. If not specified, this defaults to
+            we find IBD segments for. If not specified, this defaults to
             all samples in the tree sequence.
         :param list[list] between: A list of lists of sample node IDs. Given
             two sample sets A and B, only IBD segments will be returned such
@@ -10451,7 +10617,7 @@ class TreeSequence:
             segment) is greater than this value will be included. (Default=0)
         :param bool store_pairs: If True store information separately for each
             pair of samples ``(a, b)`` that are found to be IBD. Otherwise
-            store summary information about all sample apirs. (Default=False)
+            store summary information about all sample pairs. (Default=False)
         :param bool store_segments: If True store each IBD segment
             ``(left, right, c)`` and associate it with the corresponding
             sample pair ``(a, b)``. If True, implies ``store_pairs``.
@@ -10882,7 +11048,7 @@ class TreeSequence:
         mapping is created by first checking if the tree sequence contains individuals.
         If it does, the mapping is created using the individuals in the tree sequence.
         By default only the sample nodes of the individuals are included in the mapping,
-        unless `include_non_sample_nodes` is set to True, in which case all nodes
+        unless ``include_non_sample_nodes`` is set to True, in which case all nodes
         belonging to the individuals are included. Any individuals without any nodes
         will have no nodes in their row of the mapping, being essentially of zero ploidy.
         If no individuals are present, the mapping is created using only the sample nodes
@@ -10890,20 +11056,22 @@ class TreeSequence:
         As the tskit data model allows non-integer positions, site positions and contig
         length are transformed to integer values suitable for VCF output. The
-        transformation is done using the `position_transform` function, which must
+        transformation is done using the ``position_transform`` function, which must
         return an integer numpy array the same dimension as the input. By default,
         this is set to ``numpy.round()`` which will round values to the nearest integer.
-        If neither `name_metadata_key` nor `individual_names` is not specified, the
-        individual names are set to "tsk_{individual_id}" for each individual. If
-        no individuals are present, the individual names are set to "tsk_{i}" with
-        `0 <= i < num_sample_nodes/ploidy`.
+        If neither ``name_metadata_key`` nor ``individual_names`` is specified, the
+        individual names are set to ``"tsk_{individual_id}"`` for each individual. If
+        no individuals are present, the individual names are set to ``"tsk_{i}"`` with
+        ``0 <= i < num_sample_nodes/ploidy``.
-        A Warning are emmitted if any sample nodes do not have an individual ID.
+        A warning is emitted if any sample nodes do not have an individual ID.
         :param list individuals: Specific individual IDs to include in the VCF. If not
             specified and the tree sequence contains individuals, all individuals are
-            included at least one node.
+            included that are associated with at least one sample node (or at least one of
+            any node if ``include_non_sample_nodes`` is True), and the mapping arrays
+            will be in ascending order of the ID of the individual in the tree sequence.
         :param int ploidy: The ploidy, or number of nodes per individual. Only used when
             the tree sequence does not contain individuals. Cannot be used if the tree
             sequence contains individuals. Defaults to 1 if not specified.