ruby-minigraph 0.0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/ext/Rakefile +56 -0
- data/ext/cmappy/cmappy.c +7 -0
- data/ext/cmappy/cmappy.h +8 -0
- data/ext/minigraph/LICENSE.txt +23 -0
- data/ext/minigraph/Makefile +66 -0
- data/ext/minigraph/NEWS.md +317 -0
- data/ext/minigraph/README.md +207 -0
- data/ext/minigraph/algo.c +194 -0
- data/ext/minigraph/algo.h +33 -0
- data/ext/minigraph/asm-call.c +147 -0
- data/ext/minigraph/bseq.c +133 -0
- data/ext/minigraph/bseq.h +76 -0
- data/ext/minigraph/cal_cov.c +139 -0
- data/ext/minigraph/doc/example1.png +0 -0
- data/ext/minigraph/doc/example2.png +0 -0
- data/ext/minigraph/doc/examples.graffle +0 -0
- data/ext/minigraph/format.c +241 -0
- data/ext/minigraph/galign.c +140 -0
- data/ext/minigraph/gchain1.c +532 -0
- data/ext/minigraph/gcmisc.c +223 -0
- data/ext/minigraph/gfa-aug.c +260 -0
- data/ext/minigraph/gfa-base.c +526 -0
- data/ext/minigraph/gfa-bbl.c +372 -0
- data/ext/minigraph/gfa-ed.c +617 -0
- data/ext/minigraph/gfa-io.c +395 -0
- data/ext/minigraph/gfa-priv.h +154 -0
- data/ext/minigraph/gfa.h +166 -0
- data/ext/minigraph/ggen.c +182 -0
- data/ext/minigraph/ggen.h +21 -0
- data/ext/minigraph/ggsimple.c +570 -0
- data/ext/minigraph/gmap.c +211 -0
- data/ext/minigraph/index.c +230 -0
- data/ext/minigraph/kalloc.c +224 -0
- data/ext/minigraph/kalloc.h +82 -0
- data/ext/minigraph/kavl.h +414 -0
- data/ext/minigraph/kdq.h +134 -0
- data/ext/minigraph/ketopt.h +116 -0
- data/ext/minigraph/khashl.h +348 -0
- data/ext/minigraph/krmq.h +474 -0
- data/ext/minigraph/kseq.h +256 -0
- data/ext/minigraph/ksort.h +164 -0
- data/ext/minigraph/kstring.h +165 -0
- data/ext/minigraph/kthread.c +159 -0
- data/ext/minigraph/kthread.h +15 -0
- data/ext/minigraph/kvec-km.h +105 -0
- data/ext/minigraph/kvec.h +110 -0
- data/ext/minigraph/lchain.c +441 -0
- data/ext/minigraph/main.c +301 -0
- data/ext/minigraph/map-algo.c +500 -0
- data/ext/minigraph/mgpriv.h +128 -0
- data/ext/minigraph/minigraph.1 +359 -0
- data/ext/minigraph/minigraph.h +176 -0
- data/ext/minigraph/miniwfa.c +834 -0
- data/ext/minigraph/miniwfa.h +95 -0
- data/ext/minigraph/misc/mgutils.js +1451 -0
- data/ext/minigraph/misc.c +12 -0
- data/ext/minigraph/options.c +134 -0
- data/ext/minigraph/shortk.c +251 -0
- data/ext/minigraph/sketch.c +109 -0
- data/ext/minigraph/sys.c +147 -0
- data/ext/minigraph/sys.h +20 -0
- data/ext/minigraph/test/MT-chimp.fa +277 -0
- data/ext/minigraph/test/MT-human.fa +239 -0
- data/ext/minigraph/test/MT-orangA.fa +276 -0
- data/ext/minigraph/test/MT.gfa +19 -0
- data/ext/minigraph/tex/Makefile +13 -0
- data/ext/minigraph/tex/minigraph.bib +676 -0
- data/ext/minigraph/tex/minigraph.tex +986 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
- data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
- data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
- data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
- data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
- data/ext/minigraph/tex/plots/bedutils.js +367 -0
- data/ext/minigraph/tex/plots/chr-plot.js +130 -0
- data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
- data/ext/minigraph.patch +21 -0
- data/lib/minigraph/ffi/constants.rb +230 -0
- data/lib/minigraph/ffi/functions.rb +70 -0
- data/lib/minigraph/ffi/mappy.rb +8 -0
- data/lib/minigraph/ffi.rb +27 -0
- data/lib/minigraph/version.rb +5 -0
- data/lib/minigraph.rb +72 -0
- metadata +159 -0
@@ -0,0 +1,986 @@
|
|
1
|
+
%% BioMed_Central_Tex_Template_v1.06
|
2
|
+
|
3
|
+
\documentclass[twocolumn]{bmcart}
|
4
|
+
|
5
|
+
%%% Load packages
|
6
|
+
\usepackage{amsthm,amsmath}
|
7
|
+
\RequirePackage{hyperref}
|
8
|
+
\usepackage[utf8]{inputenc} %unicode support
|
9
|
+
|
10
|
+
\usepackage{graphicx}
|
11
|
+
%\def\includegraphic{}
|
12
|
+
%\def\includegraphics{}
|
13
|
+
|
14
|
+
%%% Put your definitions there:
|
15
|
+
\startlocaldefs
|
16
|
+
\endlocaldefs
|
17
|
+
|
18
|
+
|
19
|
+
%%% Begin ...
|
20
|
+
\begin{document}
|
21
|
+
|
22
|
+
%%% Start of article front matter
|
23
|
+
\begin{frontmatter}
|
24
|
+
|
25
|
+
\begin{fmbox}
|
26
|
+
\dochead{Method}
|
27
|
+
|
28
|
+
\title{The design and construction of reference pangenome graphs with minigraph}
|
29
|
+
|
30
|
+
\author[
|
31
|
+
addressref={aff1,aff2}, % id's of addresses, e.g. {aff1,aff2}
|
32
|
+
corref={aff1}, % id of corresponding address, if any
|
33
|
+
email={hli@ds.dfci.harvard.edu} % email address
|
34
|
+
]{\inits{HL}\fnm{Heng} \snm{Li}}
|
35
|
+
\author[
|
36
|
+
addressref={aff1,aff2},
|
37
|
+
]{\inits{XF}\fnm{Xiaowen} \snm{Feng}}
|
38
|
+
\author[
|
39
|
+
addressref={aff2},
|
40
|
+
]{\inits{CC}\fnm{Chong} \snm{Chu}}
|
41
|
+
|
42
|
+
\address[id=aff1]{% % unique id
|
43
|
+
\orgname{Department of Data Sciences, Dana-Farber Cancer Institute}, % university, etc
|
44
|
+
\city{Boston, MA 02215}, % city
|
45
|
+
\cny{USA} % country
|
46
|
+
}
|
47
|
+
\address[id=aff2]{%
|
48
|
+
\orgname{Department of Biomedical Informatics, Harvard Medical School},
|
49
|
+
\city{Boston, MA 02215},
|
50
|
+
\cny{USA}
|
51
|
+
}
|
52
|
+
|
53
|
+
\begin{abstractbox}
|
54
|
+
|
55
|
+
\begin{abstract} % abstract
|
56
|
+
The recent advances in sequencing technologies enable the assembly of
|
57
|
+
individual genomes to the quality of the reference genome. How to integrate
|
58
|
+
multiple genomes from the same species and make the integrated representation
|
59
|
+
accessible to biologists remains an open challenge. Here, we propose a
|
60
|
+
graph-based data model and associated formats to represent multiple genomes
|
61
|
+
while preserving the coordinate of the linear reference genome. We implement
|
62
|
+
our ideas in the minigraph toolkit and demonstrate that we can efficiently
|
63
|
+
construct a pangenome graph and compactly encode tens of thousands of
|
64
|
+
structural variants missing from the current reference genome.
|
65
|
+
\end{abstract}
|
66
|
+
|
67
|
+
\begin{keyword}
|
68
|
+
\kwd{bioinformatics}
|
69
|
+
\kwd{genomics}
|
70
|
+
\kwd{pangenome}
|
71
|
+
\end{keyword}
|
72
|
+
|
73
|
+
\end{abstractbox}
|
74
|
+
|
75
|
+
\end{fmbox}
|
76
|
+
|
77
|
+
\end{frontmatter}
|
78
|
+
|
79
|
+
%%
|
80
|
+
\section*{Background}
|
81
|
+
|
82
|
+
The human reference genome is a fundamental resource for human genetics and
|
83
|
+
biomedical research. The primary sequences of the reference genome
|
84
|
+
GRCh38~\cite{Schneider:2017aa} are a mosaic of haplotypes with each haplotype segment derived
|
85
|
+
from a single human individual. They cannot represent the genetic diversity in
|
86
|
+
human populations and as a result, each individual may carry thousands of large
|
87
|
+
germline variants absent from the reference genome~\cite{Huddleston:2017aa}.
|
88
|
+
Some of these variants are likely associated with phenotype~\cite{Eichler_2010}
|
89
|
+
but are often missed or misinterpreted when we map sequence data to GRCh38, in
|
90
|
+
particular with short reads~\cite{Li:2018aa}. This under-representation of
|
91
|
+
genetic diversity may become a limiting factor in our understanding of genetic
|
92
|
+
variations.
|
93
|
+
|
94
|
+
Meanwhile, the advances in long-read sequencing technologies make it possible
|
95
|
+
to assemble a human individual to a quality comparable to
|
96
|
+
GRCh38~\cite{Schneider:2017aa,Wenger_2019}. There are already a dozen
|
97
|
+
high-quality human assemblies available in GenBank~\cite{Audano:2019aa}.
|
98
|
+
Properly integrating these genomes into a reference \emph{pangenome}, which
|
99
|
+
refers to a collection of genomes~\cite{cpgc:2016aa}, would potentially address
|
100
|
+
the issues with a single linear reference.
|
101
|
+
|
102
|
+
A straightforward way to represent a pangenome is to store unaligned genomes
|
103
|
+
in a full-text index that compresses redundancies in sequences identical
|
104
|
+
between individuals~\cite{Makinen:2010aa,Liu_2016,Boucher_2019}. We may
|
105
|
+
retrieve individual genomes from the index, inspect the k-mer spectrum and test
|
106
|
+
the presence of k-mers using standard techniques. In principle, it is also
|
107
|
+
possible to apply canonical read alignment algorithms to map sequences to
|
108
|
+
the collection, but in practice, the redundant hits to multiple genomes will
|
109
|
+
confuse downstream mapping-based analyses~\cite{NA2016159}. It is not clear how
|
110
|
+
to resolve these multiple mappings.
|
111
|
+
|
112
|
+
The other class of methods encodes multiple genomes into a sequence graph,
|
113
|
+
usually by collapsing identical or similar sequences between genomes onto a
|
114
|
+
single representative sequence. This results in a \emph{pangenome graph}. A
|
115
|
+
pangenome graph is a powerful tool to identify the core genome, the part of a
|
116
|
+
genome or gene set that is shared across the majority of the strains or related species
|
117
|
+
in a clade~\cite{Vernikos:2015aa}. A common way to construct a basic pangenome
|
118
|
+
graph is to generate a compacted de Bruijn graph
|
119
|
+
(cDBG)~\cite{Marcus:2014xy,Baier_2015,Beller:2016ab,Chikhi:2015aa,Minkin_2016,Chikhi_2016,almodaresi_et_al:LIPIcs:2017:7657}
|
120
|
+
from a set of genomes. Basic cDBG does not keep sample information.
|
121
|
+
\cite{Iqbal:2012aa} proposed colored cDBG with each color representing a sample
|
122
|
+
or a population. Colored cDBG can be constructed
|
123
|
+
efficiently~\cite{Muggli_2019,Holley695338}. However, a colored cDBG discards
|
124
|
+
the chromosomal coordinate and thus disallows the mapping of genomic features.
|
125
|
+
It often includes connections absent from the input genomes and thus encodes
|
126
|
+
more sequences than the input. A colored cDBG cannot serve as a
|
127
|
+
\emph{reference} pangenome graph, either. deBGA~\cite{Liu:2016ac} addresses
|
128
|
+
the issue by labeling each unitig with its possibly multiple locations in the
|
129
|
+
input genome(s). Pufferfish~\cite{Almodaresi:2018aa} further reduces its space
|
130
|
+
requirement. Nonetheless, given hundreds of human genomes, there will be many
|
131
|
+
more vertices in the graph and most vertices are associated with hundreds of
|
132
|
+
labels. Whether deBGA and pufferfish can scale to such datasets remains an open
|
133
|
+
question. GBWT~\cite{Sir_n_2019} provides another practical solution to storage
|
134
|
+
and indexing, but no existing tools can practically construct a cDBG for many
|
135
|
+
human genomes in the GBWT representation.
|
136
|
+
|
137
|
+
In addition to cDBG, we can derive a reference pangenome
|
138
|
+
graph from a single linear multi-sequence alignment (MSA)~\cite{Dilthey_2015,Dilthey_2019}.
|
139
|
+
It has been used for HLA typing but is not applicable to whole chromosomes when
|
140
|
+
they cannot be included in a single linear MSA. The third and possibly the most
|
141
|
+
popular approach to reference graph generation is to call variants from other
|
142
|
+
sources and then incorporate these variants, often in the VCF format~\cite{Danecek:2011qy}, into
|
143
|
+
the reference genome as alternative
|
144
|
+
paths~\cite{Eggertsson:2017aa,Rakocevic_2019,Sibbesen:2018aa,Biederstedt:2018aa,Eggertsson_2019}.
|
145
|
+
However, because VCF does not define coordinates on insertions, this approach
|
146
|
+
cannot properly encode variations on long insertions and is therefore limited
|
147
|
+
to simple variations. There are no satisfactory solutions to the construction
|
148
|
+
of reference pangenome graphs.
|
149
|
+
|
150
|
+
In this article, we introduce the reference Graphical Fragment Assembly (rGFA)
|
151
|
+
format to model reference pangenome graphs. We propose and demonstrate an
|
152
|
+
incremental procedure to construct graphs under this model. The resulting
|
153
|
+
graphs encode structural variations (SVs) of length 100bp or longer without haplotype
|
154
|
+
information. Our implementation, minigraph~\cite{Li_minigraph:2020aa}
|
155
|
+
(\href{https://github.com/lh3/minigraph}{https://github.com/lh3/minigraph}),
|
156
|
+
can construct a pangenome graph from twenty human assemblies in three hours.
|
157
|
+
|
158
|
+
\section*{Results}
|
159
|
+
|
160
|
+
We will first describe a data model for reference pangenome graphs, which
|
161
|
+
establishes the foundation of this article. We will then present a new
|
162
|
+
sequence-to-graph mapper, minigraph, and show how this mapper incrementally
|
163
|
+
constructs a pangenome graph. We will demonstrate the utility of pangenome
|
164
|
+
graphs with a human graph generated from twenty human haplotypes and a primate
|
165
|
+
graph generated from four species.
|
166
|
+
|
167
|
+
\subsection*{Modeling reference pangenome graphs}
|
168
|
+
|
169
|
+
\subsubsection*{Sequence graphs}
|
170
|
+
|
171
|
+
There are several equivalent ways to define a sequence graph. In this article,
|
172
|
+
a \emph{sequence graph} $G(V,E)$ is a bidirected graph. Each vertex $v\in V$ is
|
173
|
+
associated with a DNA sequence; each edge $e\in E$ has two directions, one for
|
174
|
+
each endpoint, which leads to four types of edges: forward-forward,
|
175
|
+
reverse-forward, forward-reverse and reverse-reverse. The directions on an edge
|
176
|
+
dictate how a sequence is spelled from a walk/path in the graph. Common
|
177
|
+
assembly graphs, such as the overlap graph, string graph and de Bruijn graph
|
178
|
+
can all be formulated as sequence graphs.
|
179
|
+
|
180
|
+
\begin{figure}[t]
|
181
|
+
\includegraphics[width=.47\textwidth]{Fig1}
|
182
|
+
\caption{\csentence{Example rGFA and GAF formats.} {\bf (a)} Example rGFA
|
183
|
+
format. rGFA-specific tags include SN, name of the stable sequence from which
|
184
|
+
the vertex is derived; SO, offset on the stable sequence; SR, rank: 0 if the
|
185
|
+
vertex or edge is on the linear reference; $>$0 for non-reference. {\bf (b)}
|
186
|
+
Corresponding sequence graph. Each thick arrow represents an oriented DNA
|
187
|
+
sequence. {\bf (c)} Example GAF format, using the segment coordinate, for
|
188
|
+
reads ``${\tt GTGGCT}$'' and ``${\tt CGTTTCC}$'' mapped to the graph. {\bf
|
189
|
+
(d)} Equivalent GAF format using the stable coordinate.}\label{fig:rgfa}
|
190
|
+
\end{figure}
|
191
|
+
|
192
|
+
The Graphical Fragment Assembly (GFA) format~\cite{Li:2016aa} describes
|
193
|
+
sequence graphs. The core of GFA is defined by the following grammar:
|
194
|
+
|
195
|
+
{\footnotesize
|
196
|
+
\begin{verbatim}
|
197
|
+
|
198
|
+
<GFA> <- (<segment> | <link>)+
|
199
|
+
<segment> <- `S' <segId> <segSeq>
|
200
|
+
<link> <- `L' <segId> [+-] <segId> [+-] <cigar>
|
201
|
+
|
202
|
+
\end{verbatim}}
|
203
|
+
|
204
|
+
{\flushleft
|
205
|
+
A line starting with letter ``${\tt S}$'' corresponds to a vertex and a line
|
206
|
+
starting with ``${\tt L}$'' corresponds
|
207
|
+
to a bidirected edge. In a de Bruijn graph, we often attach sequences to edges
|
208
|
+
instead of vertices~\cite{Pevzner:2001vn,Gnerre:2011ys}. To avoid confusion, in this
|
209
|
+
article, we also call a vertex a \emph{segment} and call an edge a
|
210
|
+
\emph{link}, following the GFA terminology. Fig.~\ref{fig:rgfa}a shows an
|
211
|
+
example GFA that encodes Fig.~\ref{fig:rgfa}b.
|
212
|
+
}
|
213
|
+
|
214
|
+
A sequence graph in the GFA format natively defines a \emph{segment coordinate}
|
215
|
+
system where each base in the graph is uniquely indexed by a
|
216
|
+
2-tuple $({\rm segId},{\rm segOffset})$. For example, in
|
217
|
+
Fig.~\ref{fig:rgfa}a, the base at position $({\rm s2},2)$ is ``{\tt G}''.
|
218
|
+
A major problem with this coordinate is that it is decoupled from linear
|
219
|
+
annotations and is sensitive to graph transformations. For example, if we split
|
220
|
+
a segment into two connected segments, the set of sequences spelled from the graph
|
221
|
+
remains the same, but the segment coordinates will be changed. Due to the
|
222
|
+
instability of the segment coordinate, a basic sequence graph is inadequate for a
|
223
|
+
reference graph.
|
224
|
+
|
225
|
+
\subsubsection*{Reference pangenome graphs}
|
226
|
+
|
227
|
+
We propose the reference GFA (rGFA) format to encode reference pangenome graphs.
|
228
|
+
rGFA is an extension to GFA with three additional tags that indicate the origin
|
229
|
+
of a segment from linear genomes (Fig.~\ref{fig:rgfa}a). This simple addition
|
230
|
+
gives us a unique stable coordinate system as an extension to the linear
|
231
|
+
reference coordinate (e.g. GRCh38). We can pinpoint a position such as
|
232
|
+
``{\sf chr1:9}'' in the graph and map existing annotations onto the graph. We can
|
233
|
+
also report a path or walk in the stable coordinate. For example, path
|
234
|
+
``{\sf s1$\to$s2$\to$s3}'' unambiguously corresponds to ``{\sf
|
235
|
+
chr1:0-5$\to$chr1:5-8$\to$chr1:8-12}'' or simply ``{\sf chr1:0-12}'' if we
|
236
|
+
merge adjacent coordinates; similarly, ``{\sf s1$\to$s2$\to$s5$\to$s6}''
|
237
|
+
corresponds to ``{\sf chr1:0-8$\to$foo:8-16}''. We will formally describe the
|
238
|
+
path format when introducing the GAF format in the next section.
|
239
|
+
|
240
|
+
In rGFA, each segment is associated with one origin. This apparently trivial
|
241
|
+
requirement in fact imposes a strong restriction on the types of graphs rGFA
|
242
|
+
can encode: it forbids the collapse of different regions from one sequence,
|
243
|
+
which would often happen in a cDBG. We consider this restriction an
|
244
|
+
advantage of rGFA because it requires the graph to have a ``linear'' flavor
|
245
|
+
intuitively and simplifies the data structure to store the graph.
|
246
|
+
|
247
|
+
For simplicity, rGFA disallows overlaps between edges and forbids multiple
|
248
|
+
edges (more than one edge between the same pair of vertices). These two
|
249
|
+
restrictions help to avoid ambiguity and reduce the complexity in
|
250
|
+
implementation. They are not strictly necessary in theory.
|
251
|
+
|
252
|
+
\subsubsection*{The Graphical mApping Format (GAF)}
|
253
|
+
|
254
|
+
\begin{table}[tb]
|
255
|
+
\caption{The Graphical mApping Format (GAF)}\label{tab:gaf}
|
256
|
+
\begin{tabular}{rcp{6cm}}
|
257
|
+
\hline
|
258
|
+
Col & Type & Description \\ \hline
|
259
|
+
1 & string & Query sequence name \\
|
260
|
+
2 & int & Query sequence length \\
|
261
|
+
3 & int & Query start coordinate (0-based; closed) \\
|
262
|
+
4 & int & Query end coordinate (0-based; open) \\
|
263
|
+
5 & char & Strand relative to col. 6 \\
|
264
|
+
6 & string & Graph path matching regular expression \texttt{/([><][\char94\char92s><]+(:\char92d+-\char92d+)?)+\char124([\char94\char92s><]+)/}\\
|
265
|
+
7 & int & Path sequence length \\
|
266
|
+
8 & int & Path start coordinate \\
|
267
|
+
9 & int & Path end coordinate \\
|
268
|
+
10 & int & Number of matching bases in the mapping \\
|
269
|
+
11 & int & Number of bases, including gaps, in the mapping \\
|
270
|
+
12 & int & Mapping quality (0--255 with 255 for missing) \\ \hline
|
271
|
+
\end{tabular}
|
272
|
+
\end{table}
|
273
|
+
|
274
|
+
As there are no text formats for sequence-to-graph alignment, we propose a new
|
275
|
+
Graphical mApping Format (GAF) by extending the Pairwise mApping Format
|
276
|
+
(PAF)~\cite{Li:2016aa}. GAF is TAB-delimited with each column defined in
|
277
|
+
Table~\ref{tab:gaf}. Column 6 encodes a path on the graph. It follows the
|
278
|
+
formal grammar below:
|
279
|
+
|
280
|
+
{\footnotesize
|
281
|
+
\begin{verbatim}
|
282
|
+
|
283
|
+
<path> <- <stableId> | <orientIntv>+
|
284
|
+
<orientIntv> <- (`>' | `<') (<segId> | <stableIntv>)
|
285
|
+
<stableIntv> <- <stableId> `:' <start> `-' <end>
|
286
|
+
|
287
|
+
\end{verbatim}}
|
288
|
+
|
289
|
+
{\flushleft
|
290
|
+
In this grammar, {\tt <segId>} is a segment identifier on an S-line in rGFA;
|
291
|
+
{\tt <stableId>} is a stable sequence name at the {\tt SN} tag on the
|
292
|
+
corresponding S-line. Column 6 can be either a path in the segment coordinate
|
293
|
+
(Fig.~\ref{fig:rgfa}c) or an equivalent path in the stable coordinate
|
294
|
+
(Fig.~\ref{fig:rgfa}d). We can merge adjacent stable coordinates if the two
|
295
|
+
segments originate from the same stable sequence and the end offset of the
|
296
|
+
first segment is equal to the start offset of the second segment. For example,
|
297
|
+
``{\tt >chr1:0-5>chr1:5-8}'' can be simplified to ``{\tt >chr1:0-8}''.
|
298
|
+
Furthermore, if a path in column 6 is derived from one reference sequence, we
|
299
|
+
recommend replacing it with the entire reference path on the forward
|
300
|
+
orientation (e.g. see ``read1'' in Fig.~\ref{fig:rgfa}d). With this convention,
|
301
|
+
a GAF line is reduced to PAF for a sequence mapped to a reference sequence.
|
302
|
+
Similar to PAF, GAF also allows optional tags in the SAM-like format. Base
|
303
|
+
alignment is kept at the {\tt cg} tag.}
|
304
|
+
|
305
|
+
Minigraph produces GAF in both the segment and the stable coordinate.
|
306
|
+
GraphAligner~\cite{Rautiainen810812} produces GAF in the segment coordinate
|
307
|
+
only, which can be converted to the stable coordinate.
|
308
|
+
|
309
|
+
\begin{figure}[t]
|
310
|
+
\includegraphics[width=.47\textwidth]{Fig2}
|
311
|
+
\caption{\csentence{Minigraph algorithms.} {\bf (a)} Diagram of the minigraph
|
312
|
+
mapping algorithm. Minigraph seeds alignments with minimizers, finds good
|
313
|
+
enough linear chains, connects them in the graph and seeks the most weighted
|
314
|
+
path as a graph chain. {\bf (b)} Diagram of incremental graph construction. A
|
315
|
+
graph is iteratively constructed by mapping each assembly to an existing
|
316
|
+
graph and augmenting the graph with long poorly mapped sequences in the
|
317
|
+
assembly.}\label{fig:mg}
|
318
|
+
\end{figure}
|
319
|
+
|
320
|
+
\subsection*{Sequence-to-graph mapping}
|
321
|
+
|
322
|
+
Our incremental graph construction algorithm relies on genome-to-graph
|
323
|
+
alignment (Fig.~\ref{fig:mg}b). As existing sequence-to-graph
|
324
|
+
aligners~\cite{Rautiainen810812,Garrison:2018aa} do not work with
|
325
|
+
chromosome-long query sequences, we adapted minimap2~\cite{Li:2018ab} for our
|
326
|
+
purpose and implemented minigraph (Fig.~\ref{fig:mg}a). Briefly, minigraph uses
|
327
|
+
a minimap2-like algorithm to find local hits to segments in the graph, ignoring
|
328
|
+
the graph topology. It then chains these local hits if they are connected on
|
329
|
+
the graph, possibly through cycles. This gives the approximate mapping locations. Minigraph does not
|
330
|
+
perform base-level alignment. This is because the graph we construct encodes
|
331
|
+
SVs and rarely contains paths similar at the base level. The best mapping is
|
332
|
+
often clear without base alignment.
|
333
|
+
|
334
|
+
\begin{table}[b]
|
335
|
+
\caption{Performance of sequence-to-graph mapping}\label{tab:mgvga}
|
336
|
+
\begin{tabular}{lrr}
|
337
|
+
\hline
|
338
|
+
& minigraph & GraphAligner \\
|
339
|
+
\hline
|
340
|
+
Indexing time (wall-clock sec) & 100 & 589 \\
|
341
|
+
Mapping time (wall-clock sec) & 79 & 140 \\
|
342
|
+
Peak RAM (GB) & 19.5 & 27.2 \\
|
343
|
+
Percent unmapped reads & 0.5\% & 0\% \\
|
344
|
+
Percent wrong mappings & 1.7\% & 4.6\% \\
|
345
|
+
\hline
|
346
|
+
\end{tabular}
|
347
|
+
\end{table}
|
348
|
+
|
349
|
+
To evaluate the accuracy of minigraph mapping, we simulated PacBio reads from
|
350
|
+
GRCh38 with PBSIM~\cite{Ono:2013aa} and mapped them to the graph we constructed
|
351
|
+
in the next section. Table~\ref{tab:mgvga} compares the performance of
|
352
|
+
minigraph and GraphAligner~\cite{Rautiainen810812} v1.0.10 on 68,857 simulated
|
353
|
+
reads mapped over 8 CPU threads. {\color{black} The N50 read length is 15kb.
|
354
|
+
9,862 reads are mapped across two or more segments by GraphAligner. Note that
|
355
|
+
both minigraph and GraphAligner ignore the stable coordinates during mapping.
|
356
|
+
All segments, originating either from GRCh38 or from individual genomes, are
|
357
|
+
treated equally. As a result, while we simulated reads from GRCh38, we are also
|
358
|
+
evaluating how well mappers work with complex SVs present in any input
|
359
|
+
samples.}
|
360
|
+
|
361
|
+
On this dataset, minigraph
|
362
|
+
is faster than GraphAligner and uses less memory, partly because minigraph does
|
363
|
+
not perform base alignment.
|
364
|
+
As is shown in Table~\ref{tab:mgvga}, minigraph is more accurate than
|
365
|
+
GraphAligner. This is counter-intuitive given that GraphAligner does base
|
366
|
+
alignment. Close inspection reveals that most mismapped reads by minigraph are
|
367
|
+
mapped to the correct genomic loci but wrong graph paths. On the contrary, most
|
368
|
+
mismapped reads by GraphAligner are mapped to wrong genomic loci. This suggests
|
369
|
+
minigraph is better at finding approximate mapping locations but GraphAligner
|
370
|
+
is better at disambiguating similar graph paths. Combining the strength of
|
371
|
+
both could lead to a better graph mapper. We do plan to implement base-level
|
372
|
+
alignment in minigraph in the future.
|
373
|
+
|
374
|
+
We have also tried vg v1.21.0~\cite{Garrison:2018aa}. It indexed the same graph in 14.7 wall-clock
|
375
|
+
hours and mapped the simulated reads in 1.8 hours over 8 threads, tens of times
|
376
|
+
slower than minigraph and GraphAligner. However, no reads are mapped in the
|
377
|
+
output. We have not been able to make vg work with our data.
|
378
|
+
|
379
|
+
\subsection*{Generating pangenome graphs}
|
380
|
+
|
381
|
+
Fig.~\ref{fig:mg}b shows how minigraph constructs a pangenome graph (see
|
382
|
+
Methods for details). This procedure is similar to multiple sequence alignment
|
383
|
+
via partial order graph~\cite{Lee_2002} except that minigraph works with cyclic
|
384
|
+
graphs and ignores small variants. Minigraph only considers SVs of
|
385
|
+
100bp--100kb in length and ignores SVs in alignments shorter than 100kb.
|
386
|
+
For each input assembly, it filters out regions covered by two or more primary
|
387
|
+
alignments longer than 20kb in the assembly. This filter avoids paralogous
|
388
|
+
regions in a sample and guarantees that graphs generated by minigraph can be
|
389
|
+
modeled by rGFA.
|
390
|
+
|
391
|
+
As a sanity check, we compared minigraph to dipcall
|
392
|
+
(\href{https://github.com/lh3/dipcall}{https://github.com/lh3/dipcall}) on
|
393
|
+
calling SVs 100bp or longer from a synthetic diploid sample composed of CHM1
|
394
|
+
and CHM13~\cite{Li:2018aa}. Given two SV callsets $A$ and $B$, we say a call in
|
395
|
+
$A$ is \emph{missed} in callset $B$ if there are no calls in $B$ within 1000bp
|
396
|
+
from the call in $A$. With this criterion, 2.7\% of 14,792 SVs called by
|
397
|
+
dipcall are missed by minigraph; 6.0\% of 14,932 minigraph SVs are missed by
|
398
|
+
dipcall. We manually inspected tens of differences in
|
399
|
+
IGV~\cite{Robinson:2011aa} and identified two causes. First, an INDEL longer
|
400
|
+
than 100bp called by one caller may be split into two shorter INDELs by the
|
401
|
+
other caller. There is often more than one smaller SV around a missed SV
|
402
|
+
call. Second, dipcall skips regions involving a high density of SNPs or involving
|
403
|
+
both long insertions and long deletions, but minigraph connects these events
|
404
|
+
and calls SVs in such regions. It tends to call more SVs. Overall, we believe
|
405
|
+
minigraph and dipcall found similar sets of SVs.
|
406
|
+
|
407
|
+
\begin{table}[tb]
|
408
|
+
\caption{Assemblies used for graph construction}\label{tab:asm}
|
409
|
+
\begin{tabular}{llll}
|
410
|
+
\hline
|
411
|
+
Name & Species & Population & Accession/Source \\ \hline
|
412
|
+
CHM1 & Human & N/A & GCA\_001297185.1 \\
|
413
|
+
CHM13 & Human & N/A & GCA\_000983455.1 \\
|
414
|
+
NA12878 & Human & European & \cite{Garg810341}, phased \\
|
415
|
+
NA24385 & Human & Jewish & \cite{Garg810341}, phased \\
|
416
|
+
PGP1 & Human & N/A & \cite{Garg810341}, phased \\
|
417
|
+
NA19240 & Human & African & GCA\_001524155.4 \\
|
418
|
+
HG00514 & Human & East Asian & GCA\_002180035.3 \\
|
419
|
+
HG01352 & Human & American & GCA\_002209525.2 \\
|
420
|
+
NA19434 & Human & African & GCA\_002872155.1 \\
|
421
|
+
HG02818 & Human & African & GCA\_003574075.1 \\
|
422
|
+
HG03486 & Human & African & GCA\_003086635.1 \\
|
423
|
+
HG03807 & Human & South Asian& GCA\_003601015.1 \\
|
424
|
+
HG00733 & Human & American & GCA\_002208065.1 \\
|
425
|
+
HG02059 & Human & East Asian & GCA\_003070785.1 \\
|
426
|
+
HG00268 & Human & European & GCA\_008065235.1 \\
|
427
|
+
HG04217 & Human & South Asian& GCA\_007821485.1 \\
|
428
|
+
AK1 & Human & East Asian & GCA\_001750385.1 \\
|
429
|
+
Clint & Chimpanzee & & GCA\_002880755.3 \\
|
430
|
+
Susie & Gorilla & & GCA\_900006655.3 \\
|
431
|
+
Kamilah & Gorilla & & GCA\_008122165.1 \\
|
432
|
+
Susie & Orangutan & & GCA\_002880775.3 \\
|
433
|
+
\hline
|
434
|
+
\end{tabular}
|
435
|
+
\end{table}
|
436
|
+
|
437
|
+
\begin{figure*}[htbp]
|
438
|
+
\includegraphics[width=.95\textwidth]{Fig3}
|
439
|
+
\caption{\csentence{Characteristics of the human and the great ape graphs.} {\bf
|
440
|
+
(a)} Human variations stratified by repeat class and by the number of
|
441
|
+
alleles of each variation. The repeat annotation was obtained from the
|
442
|
+
longest allele of each variation. VNTR: variable-number tandem repeat, a
|
443
|
+
tandem repeat with the unit motif length $\ge$7bp. STR: short tandem repeat,
|
444
|
+
a tandem repeat with the unit motif length $\le$6bp. LCR: low-complexity
|
445
|
+
regions. Mixed-inter.: a variation involving $\ge$2 types of interspersed
|
446
|
+
repeats. {\bf (b)} Great ape variations stratified by repeat class and by the
|
447
|
+
number of alleles. {\bf (c)} Human biallelic variations stratified by repeat
|
448
|
+
class and by insertion to/deletion from GRCh38. Both alleles are required to
|
449
|
+
be covered in all assemblies. {\bf (d)} Human-specific biallelic variations
|
450
|
+
stratified by repeat class and by insertion to/deletion from GRCh38. Red bars
|
451
|
+
correspond to insertions to the human lineage. {\bf (e)} Distribution of
|
452
|
+
different types of human variations along chromosomes. {\bf (f)} Boxplot of
|
453
|
+
the longest allele length in each repeat class. Outliers are omitted for the
|
454
|
+
clarity of the figure.}\label{fig:anno}
|
455
|
+
\end{figure*}
|
456
|
+
|
457
|
+
\subsection*{A human pangenome graph}
|
458
|
+
|
459
|
+
Starting with GRCh38, we constructed a human pangenome graph from 20 human
|
460
|
+
haplotypes or haplotype-collapsed assemblies (Table~\ref{tab:asm}). It took
|
461
|
+
minigraph 2.7 wall-clock hours over 24 CPU threads to generate this graph. The
|
462
|
+
peak memory is 98.1GB. The resulting graph consists of 148,618 segments and
|
463
|
+
214,995 links. It contains 37,332 variations, where a \emph{variation}
|
464
|
+
denotes a minimal subgraph that has a single source and a single sink with both
|
465
|
+
segments coming from GRCh38. A path through the bubble between the source and
|
466
|
+
the sink represents an \emph{allele}.
|
467
|
+
|
468
|
+
Variations in the human graph are enriched with Alus and VNTRs
|
469
|
+
(Fig.~\ref{fig:anno}a). While interspersed repeats are about evenly distributed
|
470
|
+
along chromosomes except in the pseudoautosomal regions (Fig.~\ref{fig:anno}e),
|
471
|
+
VNTRs are enriched towards telomeres~\cite{Audano:2019aa}. It is worth noting
|
472
|
+
the density of minisatellites is also higher in subtelomeres. If we normalize
|
473
|
+
the density of VNTRs in the pangenome graph by the density of minisatellites in
|
474
|
+
GRCh38, the enrichment of VNTRs towards telomeres is still visible but becomes
|
475
|
+
less prominent. At the same time, repeat-less variations are also enriched
|
476
|
+
towards the ends of chromosomes (green areas in Fig.~\ref{fig:anno}e),
|
477
|
+
suggesting subtelomeres tend to harbor SVs anyway. We also
|
478
|
+
identified 85 processed pseudogenes among these variations.
|
479
|
+
|
480
|
+
\begin{figure}
|
481
|
+
\includegraphics[width=.46\textwidth]{igv-edit.png}
|
482
|
+
\caption{\csentence{IGV screenshot of a region enriched with long insertions.}
|
483
|
+
Numbers on wide purple bars indicate insertion lengths. CLR: PacBio noisy
|
484
|
+
continuous long reads. HiFi: PacBio high-fidelity reads.}\label{fig:igv}
|
485
|
+
\end{figure}
|
486
|
+
|
487
|
+
Another noticeable feature of VNTRs is that over half of VNTR variations are
|
488
|
+
multiallelic (Fig.~\ref{fig:anno}a). Fig.~\ref{fig:igv} shows a multiallelic
|
489
|
+
region composed of VNTRs. We can see many insertions of different lengths. The
|
490
|
+
two different NA12878 assemblies also disagree with each other, which we often
|
491
|
+
see around other VNTR loci in NA12878 as well. We have not inspected raw reads
|
492
|
+
in this particular example, but we tend to believe the disagreement is caused
|
493
|
+
by local misassemblies rather than somatic mutations. In addition, due to the
|
494
|
+
multiallelic nature of such VNTRs, the two haplotypes in a human individual are
|
495
|
+
often different. Assemblies mixing the two haplotypes (aka collapsed
|
496
|
+
assemblies) may have more trouble in these regions. Multiallelic VNTRs are
|
497
|
+
hard to assemble correctly.
|
498
|
+
|
499
|
+
Multiallelic VNTRs are also hard to align and to call. In Fig.~\ref{fig:igv},
|
500
|
+
the insertion positions are often different, which could be caused by a few
|
501
|
+
mutations or sequencing errors. A naive alignment-based SV caller would call a
|
502
|
+
dozen low-frequency insertions in this region, which does not reflect these
|
503
|
+
correlated events. Without base-level alignment, minigraph may
|
504
|
+
have more trouble obtaining the optimal alignment in these complex VNTR
|
505
|
+
regions. Improved data quality, assembly algorithms and graph mapping
|
506
|
+
algorithms are required to investigate VNTR regions in detail.
|
507
|
+
|
508
|
+
\subsection*{A great ape pangenome graph}
|
509
|
+
|
510
|
+
We also constructed a great ape pangenome graph from GRCh38, one chimpanzee,
|
511
|
+
two gorillas and one orangutan (Table~\ref{tab:asm}). This graph contains
|
512
|
+
206,452 variations, over four times more than the human graph. About half of
|
513
|
+
variations originate from orangutan, the species most distant from human.
|
514
|
+
|
515
|
+
In the great ape graph, the L1-to-Alu ratio is close to 1:1, much higher than
|
516
|
+
the ratio in the human graph (Fig.~\ref{fig:anno}b vs Fig.~\ref{fig:anno}a).
|
517
|
+
This is perhaps correlated with the elevated L1 activity in great
|
518
|
+
apes~\cite{Mathews:2003aa}. Of retrotransposon-related variations specific to
|
519
|
+
the human lineage, the overwhelming majority are insertions
|
520
|
+
(Fig.~\ref{fig:anno}d), which is expected as transpositions lead to insertions
|
521
|
+
only. Most human-specific Alu deletions are incomplete and involve ancient Alu
|
522
|
+
subfamilies. They are likely genomic deletions that happen to hit Alus. In
|
523
|
+
contrast, the majority of ``partial-repeats'' are deletions from the human
|
524
|
+
lineage. Two thirds of autosomal insertions in this category are segmental
|
525
|
+
duplications in GRCh38. In all, minigraph is an efficient tool to study closely
|
526
|
+
related species.
|
527
|
+
|
528
|
+
\subsection*{Blacklist regions from human pangenome graphs}
|
529
|
+
|
530
|
+
The human pangenome graph effectively encodes SVs $\ge$100bp
|
531
|
+
in 20 genomes. These large-scale variations could be a frequent source of
|
532
|
+
technical artifacts in variant calling with short reads. To test this
|
533
|
+
hypothesis, we compared short-read SNP calls with vs without regions around SVs
|
534
|
+
in the pangenome graph.
|
535
|
+
|
536
|
+
We constructed a human pangenome graph excluding CHM1 and CHM13, the two
|
537
|
+
samples used in the SynDip benchmark~\cite{Li:2018aa}, and generated regions
|
538
|
+
around variations (see Methods), which we call \emph{blacklist regions},
|
539
|
+
following the rationale in~\cite{Amemiya:2019aa}. Blacklist regions total
|
540
|
+
29.2Mb in length, intersecting 0.7\% of confident regions in
|
541
|
+
SynDip~\cite{Li:2018aa}; 0.7\% of truth SNPs are contained in blacklist regions
|
542
|
+
-- true SNPs are not enriched in blacklist regions.
|
543
|
+
|
544
|
+
We mapped short reads used in~\cite{Li:2018aa} with minimap2 and called
|
545
|
+
variants with GATK v4.1.2~\cite{Depristo:2011vn}. This callset
|
546
|
+
contains 32,879 false positive SNPs, 21\% of which fall in blacklist regions --
|
547
|
+
false SNP calls are highly enriched in this $<$1\% region of human genome. This
|
548
|
+
confirms a noticeable fraction of false SNP calls using short reads are
|
549
|
+
caused by misalignment involving SVs.
|
550
|
+
|
551
|
+
\section*{Discussion}
|
552
|
+
|
553
|
+
Based on the GFA assembly format~\cite{Li:2016aa}, we proposed the rGFA format,
|
554
|
+
which defines a data model for reference pangenome graphs at the same time.
|
555
|
+
rGFA takes a linear reference genome as the backbone and maintains the
|
556
|
+
conceptual ``linearity'' of input genomes.
|
557
|
+
|
558
|
+
rGFA is not the only pangenome graph model. Vg~\cite{Garrison:2018aa}
|
559
|
+
encodes a stable sequence with a path through the sequence graph~\cite{10.12688/f1000research.19630.1}. A segment
|
560
|
+
in the graph may occur on multiple paths, or occur multiple times on one path
|
561
|
+
if there are cycles in the graph. This way, vg allows different regions in one
|
562
|
+
chromosome to be collapsed into one segment. We call such a graph a collapsed graph. rGFA
|
563
|
+
cannot encode a collapsed graph. The vg model is thus more general.
|
564
|
+
|
565
|
+
In our view, however, the reference pangenome graph should not be a collapsed
|
566
|
+
graph. In a collapsed graph, the definition of orthology is not clear because
|
567
|
+
multiple sequences from the same sample may go through the same segment.
|
568
|
+
Without the concept of orthology, we cannot define variations, either. In
|
569
|
+
addition, due to the one-to-many relationship between segments and the
|
570
|
+
reference genome, it is intricate to derive the stable coordinate of a path in
|
571
|
+
a collapsed graph. For example, suppose segment {\sf s1} corresponds to two
|
572
|
+
regions {\sf chr1:100-200} and {\sf chr1:500-600}. To convert a path {\sf
|
573
|
+
s2$\to$s1$\to$s3} to the stable coordinate, we have to inspect adjacent
|
574
|
+
segments to tell which {\sf s1} corresponds to; this becomes more challenging
|
575
|
+
when {\sf s2} and {\sf s3} represent multiple regions in the reference genome.
|
576
|
+
In contrast, rGFA inherently forbids a collapsed graph and avoids the potential
|
577
|
+
issues above. This makes rGFA simpler than vg's path model and easier to work
|
578
|
+
with.
|
579
|
+
|
580
|
+
To demonstrate practical applications of rGFA, we developed minigraph to
|
581
|
+
incrementally generate pangenome graphs. It can generate a graph from 20
|
582
|
+
genomes in three hours and can scale to hundreds of genomes in future. A
|
583
|
+
limitation of minigraph is that it does not perform base alignment and may be
|
584
|
+
confused by similar paths in the graph. {\color{black} Unfortunately, base-level
|
585
|
+
sequence-to-graph alignment is not a fully solved problem. Partial-order graph
|
586
|
+
alignment~\cite{Lee_2002} and PaSGAL~\cite{DBLP:conf/ipps/JainMZDA19} only work
|
587
|
+
with directed acyclic graphs (DAGs). Vg~\cite{Garrison:2018aa} uses a heuristic
|
588
|
+
to unroll cycles but it is exponential in time in the worst case and for DAGs,
|
589
|
+
its exact mode is tens of times slower than PaSGAL. Antipov et
|
590
|
+
al.~\cite{Antipov:2016aa} proved that alignment against cyclic graphs can be
|
591
|
+
done in polynomial time. GraphAligner~\cite{Rautiainen810812} implements a
|
592
|
+
fast quadratic algorithm for computing edit distance~\cite{Rautiainen_2019}.
|
593
|
+
However, edit distance based alignment disallows long INDELs and is often
|
594
|
+
inadequate for accurate variant calling. Jain et al.~\cite{Jain_2020} recently
|
595
|
+
proposed a quadratic algorithm for alignment with affine gap penalty but the
|
596
|
+
authors focused on the theoretical analysis only. To the best of our knowledge,
|
597
|
+
no tools can efficiently perform sequence-to-graph alignment under affine gap
|
598
|
+
cost. We plan to learn from the existing algorithms and implement fast base
|
599
|
+
alignment in minigraph in future. This may take significant effort.}
|
600
|
+
|
601
|
+
Another limitation of minigraph is
|
602
|
+
that it is unable to align sequences against a graph encoding all small variants.
|
603
|
+
Such a graph will be composed of millions of short segments. Not
|
604
|
+
indexing minimizers across segments, minigraph will fail to seed the initial
|
605
|
+
linear chains. This limitation can only be resolved by completely changing the
|
606
|
+
minigraph mapping algorithm. Nonetheless, small variants are easier to
|
607
|
+
analyze with the standard methods. Incorporating these variants unnecessarily
|
608
|
+
enlarges the graph, complicates implementations, increases the rate of false
|
609
|
+
mappings~\cite{Pritt_2018} and reduces the performance of common tasks. There
|
610
|
+
is also no known algorithm that can construct such a complex graph for hundreds
|
611
|
+
of human genomes.
|
612
|
+
|
613
|
+
Minigraph does not keep track of the sample information as of now. To address
|
614
|
+
this issue, we are considering implementing colored rGFA, similar to colored de
|
615
|
+
Bruijn graphs~\cite{Iqbal:2012aa}. In a colored rGFA, a color represents one
|
616
|
+
sample. Each segment or link is associated with one or multiple colors,
|
617
|
+
indicating the sources of the segment or the link. Colors can be stored in an
|
618
|
+
rGFA tag or in a separate segment/link-by-sample binary
|
619
|
+
matrix~\cite{Holley695338}. The matrix representation may be more compact given
|
620
|
+
a large number of samples.
|
621
|
+
|
622
|
+
We have shown minigraph can be a fast and powerful research tool to summarize
|
623
|
+
SVs at the population scale and to study the evolution of closely related
|
624
|
+
species. A more practical question is how a reference pangenome graph may
|
625
|
+
influence routine data analysis. Here is our limited view.
|
626
|
+
|
627
|
+
We think a critical role a reference graph plays is that it extends the
|
628
|
+
coordinate system of a linear reference genome. This allows us to annotate
|
629
|
+
variations in highly diverse regions such as the human HLA and KIR regions. The
|
630
|
+
existing pipelines largely ignore these variations because most of them cannot
|
631
|
+
be encoded in the primary assembly of GRCh38.
|
632
|
+
|
633
|
+
The extended graph coordinate system further helps to consistently represent
|
634
|
+
complex SVs. Given multiple samples, the current practice is to call SVs from
|
635
|
+
individual samples and then merge them. Two subtly different SVs, especially
|
636
|
+
long insertions, may be called at two distinct locations and treated as
|
637
|
+
separate events. With the minigraph procedure, the two SVs are likely to
|
638
|
+
be aligned together as long as they are similar to each other and are
|
639
|
+
sufficiently different from the reference allele. To some extent, minigraph is
|
640
|
+
performing multiple sequence alignment with partial order
|
641
|
+
alignment~\cite{Lee_2002}. This procedure is more robust to different
|
642
|
+
representations of the same SV than naive merging. When we refer to a SNP, we often use its
|
643
|
+
chromosomal coordinate such as ``chr1:12345''. We rarely do so for SVs because
|
644
|
+
their positions are sensitive to alignment and SV callers. The more consistent
|
645
|
+
SV representation implied by a pangenome graph will help to alleviate the issue
|
646
|
+
and subsequently facilitate the genotyping of
|
647
|
+
SVs~\cite{Hickey_2020,Eggertsson_2019,Chen_2019}.
|
648
|
+
|
649
|
+
While we believe a reference pangenome graph will make complex variations more
|
650
|
+
accessible to geneticists and biologists, we suspect a great majority of
|
651
|
+
biomedical researchers will still rely on a linear reference genome due to the
|
652
|
+
conceptual simplicity of linear genomes and the mature tool chains developed over
|
653
|
+
decades. Many analyses such as SNP calling in well behaved regions do not
|
654
|
+
benefit much from a pangenome representation, either. Nonetheless, a pangenome
|
655
|
+
reference still helps applications based on linear references. With a graph
|
656
|
+
reference, we may blacklist regions enriched with SVs that lead to small variant
|
657
|
+
calling errors. We may potentially generate ``decoy'' sequences that are
|
658
|
+
missing from the primary assembly to attract falsely mapped reads away. We may
|
659
|
+
perform read alignment against a graph, project the alignment to the linear
|
660
|
+
coordinate and finish the rest of analyses in the linear space. We anticipate a
|
661
|
+
pangenome reference to supplement the linear reference, not to replace it.
|
662
|
+
|
663
|
+
\section*{Conclusions}
|
664
|
+
|
665
|
+
Complex human sequence variations are like genomic dark matter: they are
|
666
|
+
pervasive in our genomes but are often opaque to the assay with the existing
|
667
|
+
tools. We envision a pangenome graph reference will become an effective
|
668
|
+
means to the study of these complex variations. We proposed a data model (rGFA),
|
669
|
+
designed formats (rGFA and GAF) and developed companion tools (minigraph and
|
670
|
+
gfatools) to demonstrate the feasibility of our vision. Our work is still
|
671
|
+
preliminary but it is likely to set a starting point to the development of the
|
672
|
+
next-generation graph-based tools, which may ultimately help us to understand
|
673
|
+
our genomes better.
|
674
|
+
|
675
|
+
\section*{Methods}
|
676
|
+
|
677
|
+
\subsection*{The minigraph mapping algorithm}
|
678
|
+
|
679
|
+
\subsubsection*{Seeding and linear chaining}
|
680
|
+
Similar to minimap2, minigraph uses minimizers on segments as seeds. It also
|
681
|
+
applies a similar chaining algorithm but with different scoring and with a new
|
682
|
+
heuristic to speed up chaining over long distances. For the completeness of
|
683
|
+
this article, we will describe part of the minimap2 chaining algorithm here.
|
684
|
+
|
685
|
+
\paragraph*{Minimap2-like chaining}
|
686
|
+
Formally, an \emph{anchor} is a 3-tuple $(x,y,w)$, representing a closed
|
687
|
+
interval $[x-w+1,x]$ on a segment in the reference graph matching an interval
|
688
|
+
$[y-w+1,y]$ on the query. Given a list of anchors sorted by $x$, let $f(i)$ be
|
689
|
+
the maximal chaining score up to the $i$-th anchor in the list. $f(i)$ can be
|
690
|
+
computed by:
|
691
|
+
\begin{equation}\label{eq:dp}
|
692
|
+
f(i)=\max\big\{\max_{i>j\ge1}\{f(j)+\alpha(j,i)-\beta(j,i)\},w_i\big\}
|
693
|
+
\end{equation}
|
694
|
+
where $\alpha(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is
|
695
|
+
the number of matching bases between anchor $i$ and $j$.
|
696
|
+
$\beta(j,i)$ is the gap penalty. Let $g_{ji}=|(y_i-y_j)-(x_i-x_j)|$
|
697
|
+
be the gap length and $d_{ji}=\min\{y_i-y_j,x_i-x_j\}$ be the smaller distance
|
698
|
+
between the two anchors. Minigraph uses the following gap cost:
|
699
|
+
$$
|
700
|
+
\beta(j,i)=\left\{\begin{array}{ll}
|
701
|
+
\infty & (g_{ji}>G) \\
|
702
|
+
c_1\cdot g_{ji} + c_2\cdot d_{ji} + \log_2{g_{ji}} & (0<g_{ji}\le G) \\
|
703
|
+
0 & (g_{ji}=0)\\
|
704
|
+
\end{array}\right.
|
705
|
+
$$
|
706
|
+
where $G=100000$ in the graph construction mode, $c_1=e^{-dw}$ and
|
707
|
+
$c_2=0.05\cdot e^{-dw}$. By default, $d=0.01$ is the expected per-base sequence
|
708
|
+
divergence and $w=19$ is the minimizer length. In comparison, minimap2 applies
|
709
|
+
$G=5000$, $c_1=0.19$ and $c_2=0$. Minigraph allows much larger gaps between
|
710
|
+
minimizers and more heavily penalizes gaps.
|
711
|
+
|
712
|
+
Solving Eq.~\ref{eq:dp} leads to an $O(n^2)$ algorithm where $n$ is the number
|
713
|
+
of anchors. This algorithm is slow for large $n$. Minimap2 introduces
|
714
|
+
heuristics to speed up the computation by approximating this equation. It works
|
715
|
+
well for minimap2 that only allows small gaps and has base-level alignment as a
|
716
|
+
fix to chaining errors. However, as minigraph intends to chain much longer
|
717
|
+
gaps, the minimap2 algorithm occasionally misses the optimal alignment in long
|
718
|
+
segmental duplications and produces false variations. Minigraph introduces a
|
719
|
+
new heuristic to speed up chaining.
|
720
|
+
|
721
|
+
\begin{figure}[tb]
|
722
|
+
\centering
|
723
|
+
\includegraphics[width=.36\textwidth]{Fig5}
|
724
|
+
\caption{\csentence{Implementing 1-dimension Range-Min-Query (RMQ).} Given a
|
725
|
+
set of 2-tuples, a binary search tree is built for the first values in the
|
726
|
+
tuples. Each node $p$ in the tree is associated with a pointer. The pointer
|
727
|
+
points to the node that is in the subtree descended from $p$ and has the
|
728
|
+
minimal second value. In this example, ${\rm RMQ}(20,50)=14$.}\label{fig:rmq}
|
729
|
+
\end{figure}
|
730
|
+
|
731
|
+
\paragraph*{Dynamic 1-dimension Range-Min-Query}
|
732
|
+
Before we move onto the minigraph solution, we will first introduce
|
733
|
+
Range-Min-Query (RMQ). Given a set of 2-tuples $\{(y_i,s_i)\}$, ${\rm
|
734
|
+
RMQ}(a,b)$ returns the minimum $s_j$ among $\{s_j:a\le y_j\le b\}$.
|
735
|
+
We implemented 1-dimension RMQ with a modified AVL tree, a type of balanced
|
736
|
+
binary search tree (Fig.~\ref{fig:rmq}). When performing ${\rm RMQ}(a,b)$,
|
737
|
+
we first find the smallest and the largest nodes within interval $[a,b]$ using
|
738
|
+
the standard algorithm. In this example, the two nodes are (21,32) and (45,21),
|
739
|
+
respectively. We then traverse the path between the two nodes to find the
|
740
|
+
minimum. With a balanced tree structure, we do not need to descend into
|
741
|
+
subtrees. The time complexity is $O(\log m)$, where $m$ is the number of nodes
|
742
|
+
in the tree. We can insert nodes to or delete nodes from the tree while
|
743
|
+
maintaining the property of the tree. This achieves dynamic RMQ.
|
744
|
+
|
745
|
+
\paragraph*{Chaining with a linear gap cost function}
|
746
|
+
A linear gap cost takes the form of
|
747
|
+
$\beta'(j,i)=c_1[(y_i-y_j)+(x_i-x_j)]$. Given a list of anchors
|
748
|
+
$(x_i,y_i,w_i)$ sorted by position $x_i$, let
|
749
|
+
\begin{equation}\label{eq:dp2}
|
750
|
+
f'(i)=\max_{\substack{\text{$i>j\ge1$}\\ \text{$x_i-G\le x_j\le x_i-w_i$}\\ \text{$y_i-G\le y_j\le y_i-w_i$}}}\big\{f'(j)+w_j-\beta'(j,i)\big\}
|
751
|
+
\end{equation}
|
752
|
+
We can find the optimal $f'(i)$ in $O(n\log n)$ time with
|
753
|
+
RMQ~\cite{DBLP:conf/wabi/AbouelhodaO03,Otto:2011aa}. To see that, define
|
754
|
+
$$h'(j)=f'(j)+w_j+c_1(y_j+x_j)$$
|
755
|
+
The following condition
|
756
|
+
$$f'(j)+w_j-\beta'(j,i)>f'(k)+w_k-\beta'(k,i)$$
|
757
|
+
is equivalent to $h'(j)>h'(k)$, independent of $i$. If we maintain ${\rm
|
758
|
+
RMQ}_i$ as the binary tree that keeps $\{(y_j,-h'(j)):j<i,x_i-G\le x_j\le x_i-w_i\}$, we have
|
759
|
+
$$
|
760
|
+
f'(i)=-{\rm RMQ}_i(y_i-G,y_i-w_i)-c_1(x_i+y_i)
|
761
|
+
$$
|
762
|
+
This solves Eq.~\ref{eq:dp2} in $O(n\log n)$ time.
|
763
|
+
|
764
|
+
\paragraph*{Minigraph linear chaining}
|
765
|
+
While chaining with a linear gap cost function can be solved efficiently, we
|
766
|
+
prefer the more realistic cost function used in minimap2. In practical
|
767
|
+
implementation, when we come to anchor $i$, we find the optimal predecessor $j_*$
|
768
|
+
under the desired gap cost $\beta(j,i)$ for anchors $\{j:j<i,x_i-G'\le
|
769
|
+
x_j<x_i,y_i-G'\le y_j<y_i\}$, where $G'<G$ is set to 10000 by default.
|
770
|
+
Meanwhile, we use the RMQ-based algorithm to identify the anchor $j'_{*}$ optimal
|
771
|
+
under the linear gap cost $\beta'(j,i)$. We choose $j'_*$ as the optimal
|
772
|
+
predecessor if
|
773
|
+
$$
|
774
|
+
f(j_*)+\alpha(j_*,i)-\beta(j_*,i)<f(j'_*)+\alpha(j'_*,i)-\beta(j'_*,i)
|
775
|
+
$$
|
776
|
+
This may occasionally happen around long segmental duplications when the
|
777
|
+
minimap2 heuristic misses the optimal solution. Effectively, minigraph does
|
778
|
+
thorough search in a small window and approximate search in a large window
|
779
|
+
using a faster but less sophisticated gap cost function.
|
780
|
+
|
781
|
+
\subsubsection*{Graph chaining}
|
782
|
+
|
783
|
+
Minigraph generates a set of linear chains $\{L_i\}$ with the procedure above
|
784
|
+
that completely ignores the graph topology. It then applies another round of
|
785
|
+
chaining that takes the topology into account.
|
786
|
+
|
787
|
+
We say linear chain $L_i$ \emph{precedes} $L_j$, written as $L_i\prec L_j$, if
|
788
|
+
(1) the ending coordinate of $L_i$ on the query sequence is smaller than the
|
789
|
+
ending coordinate of $L_j$, and (2) there is a walk from $L_i$ to $L_j$ in the
|
790
|
+
graph. If there are multiple walks from $L_i$ to $L_j$, minigraph enumerates
|
791
|
+
the shortest 16 walks and chooses the walk with its length being the closest to
|
792
|
+
the query distance between $L_i$ and $L_j$.
|
793
|
+
|
794
|
+
Given a list of linear chains sorted by their ending coordinates on the query
|
795
|
+
sequence, let $g(i)$ be the optimal graph chaining score up to linear chain
|
796
|
+
$L_i$. We can compute $g(i)$ with another dynamic programming:
|
797
|
+
$$
|
798
|
+
g(i)=\max\big\{\max_{L_j\prec L_i}\{g(j)+\omega(L_i)-\beta(j,i)\},\omega(L_i)\big\}
|
799
|
+
$$
|
800
|
+
where $\beta(j,i)$ is the weight between $L_i$ and $L_j$. As minigraph does not
|
801
|
+
perform base-level alignment, $\beta(j,i)$ is the same as the gap penalty
|
802
|
+
function used for linear chaining. $\omega(L_i)$ is the optimal score of $L_i$
|
803
|
+
computed during linear chaining.
|
804
|
+
|
805
|
+
The procedure above has two limitations. First, when computing the weight
|
806
|
+
between $L_i$ and $L_j$, minigraph largely ignores base sequences and only considers
|
807
|
+
the distance between them on both the query and the graph. When there are
|
808
|
+
multiple walks of similar lengths between $L_i$ and $L_j$, minigraph may miss the
|
809
|
+
graph chain that leads to the best base alignment. Although we added a
|
810
|
+
heuristic by considering 17-mer matches between the query and the graph paths,
|
811
|
+
we found this heuristic is not reliable in complex regions. Second, minigraph only
|
812
|
+
enumerates the shortest 16 walks. In complex subgraphs, the optimal walk from
|
813
|
+
$L_i$ to $L_j$ may not be among them. We plan to implement base
|
814
|
+
alignment to address the limitations. We may use the current minigraph algorithm
|
815
|
+
for easy cases and apply the more expensive base alignment when the current
|
816
|
+
algorithm potentially fails.
|
817
|
+
|
818
|
+
The graph chaining algorithm results in one or multiple graph chains. A
|
819
|
+
\emph{graph chain} is a list of anchors $(s_i,x_i,y_i,w_i)$, where
|
820
|
+
$[x_i-w_i+1,x_i]$ on segment $s_i$ in the graph matches $[y_i-w_i+1,y_i]$ on
|
821
|
+
the query sequence. A graph chain satisfies the following conditions: if $i<j$,
|
822
|
+
$y_i<y_j$; if $i<j$ and $s_i=s_j$, we have $x_i<x_j$; if $s_i\not=s_{i+1}$, the
|
823
|
+
two segments are adjacent on the graph. It is an extension to linear chains.
|
824
|
+
|
825
|
+
\subsection*{The minigraph graph generation algorithm}
|
826
|
+
|
827
|
+
Using the minimap2 algorithm~\cite{Li:2018ab}, minigraph identifies a set of
|
828
|
+
\emph{primary chains} that do not greatly overlap with each other on the query
|
829
|
+
sequence. A region on the query is considered to be \emph{orthogonal} to the
|
830
|
+
reference if the region is contained in a primary chain longer than 100kb and
|
831
|
+
it is not intersecting other primary chains longer than 20kb.
|
832
|
+
|
833
|
+
Minigraph scans primary chains in orthogonal regions and identifies subregions
|
834
|
+
where the query subsequences significantly differ from the corresponding
|
835
|
+
reference subsequences. To achieve that, minigraph computes a score $h_i$ for
|
836
|
+
each adjacent pair of anchors $(s_i,x_i,y_i,w_i)$ and
|
837
|
+
$(s_{i+1},x_{i+1},y_{i+1},w_{i+1})$. Let $d^x_i$ be the distance between the
|
838
|
+
two anchors on the graph and $d^y_i=y_{i+1}-y_i$ be the distance on the query
|
839
|
+
sequence. $h_i$ is computed as
|
840
|
+
\begin{equation}\label{eq:hi}
|
841
|
+
h_i=\left\{\begin{array}{ll}
|
842
|
+
-10 & \mbox{if $d^x_i=d^y_i\le w_{i+1}$} \\
|
843
|
+
\eta\cdot\max\{d^x_i,d^y_i\} & \mbox{otherwise}\\
|
844
|
+
\end{array}\right.
|
845
|
+
\end{equation}
|
846
|
+
where $\eta$ is the density of anchors averaged across all primary graph
|
847
|
+
chains. Define $H(i,j)=\sum_{k=i}^j h_k$. A highly divergent region between the
|
848
|
+
query and the graph will be associated with a large $H(i,j)$. Minigraph uses
|
849
|
+
the Ruzzo-Tompa algorithm~\cite{DBLP:conf/ismb/RuzzoT99} to identify all
|
850
|
+
maximal scoring intervals on list $(h_i)$, which correspond to divergent
|
851
|
+
regions. In each identified divergent region, minigraph performs base
|
852
|
+
alignment~\cite{Suzuki:2018aa,Li:2018ab} between the query and the graph
|
853
|
+
sequences and retains a region if it involves an INDEL $\ge$100bp in length or
|
854
|
+
a $\ge$100bp region with base-level identity below 80\%. In Eq.~\ref{eq:hi},
|
855
|
+
-10 is an insensitive parameter due to the downstream filtering. In the end,
|
856
|
+
minigraph augments the existing graph with identified variations
|
857
|
+
(Fig.~\ref{fig:mg}b).
|
858
|
+
|
859
|
+
\subsection*{Annotating variations}
|
860
|
+
|
861
|
+
We applied RepeatMasker~\cite{Tarailo-Graovac:2009aa} v1.332 to classify
|
862
|
+
interspersed repeats in the longest allele sequence of each variation.
|
863
|
+
RepeatMasker is unable to annotate VNTRs with long motifs. It also often
|
864
|
+
interprets VNTRs as impure STRs. Therefore, we did not use the RepeatMasker
|
865
|
+
VNTR or STR annotations directly. Instead, we combined RepeatMasker and
|
866
|
+
SDUST~\cite{Morgulis:2006aa} results to collect low-complexity regions (LCRs).
|
867
|
+
We identified pure tandem repeats composed of a motif occurring twice or more
|
868
|
+
(implemented in
|
869
|
+
\href{https://github.com/lh3/etrf}{https://github.com/lh3/etrf}). An LCR is
|
870
|
+
classified as VNTR if 70\% of the LCR is VNTR; similarly, an LCR is classified
|
871
|
+
as STR if 70\% is STR; the rest are classified as ``Other-LCR'' in
|
872
|
+
Fig.~\ref{fig:anno}. The annotation script is available in the minigraph GitHub
|
873
|
+
repository.
|
874
|
+
|
875
|
+
\subsection*{Creating blacklist regions}
|
876
|
+
|
877
|
+
For each variation in the graph, we extend its genomic interval on GRCh38 by
|
878
|
+
50bp from each end. We name this set of intervals as $I_0$. We align sequences
|
879
|
+
inserted to GRCh38 against GRCh38 with ``minimap2 -cxasm20 -r2k'' and filter
|
880
|
+
out alignments with mapping quality below 5. Let $I(a,b)$ be the set of GRCh38
|
881
|
+
intervals that are contained in alignments with identity between $a$ and $b$.
|
882
|
+
The blacklist regions are computed by $I_0\cup I(0,0.99)\setminus I(0.998,1)$,
|
883
|
+
where ``$\cup$'' denotes the interval union operation and ``$\setminus$''
|
884
|
+
denotes interval subtraction.
|
885
|
+
|
886
|
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
887
|
+
%% %%
|
888
|
+
%% Backmatter begins here %%
|
889
|
+
%% %%
|
890
|
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
891
|
+
|
892
|
+
\begin{backmatter}
|
893
|
+
|
894
|
+
\section*{Competing interests}
|
895
|
+
The authors declare that they have no competing interests.
|
896
|
+
|
897
|
+
\section*{Ethical Approval}
|
898
|
+
Ethical approval was not needed for this study.
|
899
|
+
|
900
|
+
\section*{Author's contributions}
|
901
|
+
HL conceived the project, developed minigraph and drafted the manuscript.
|
902
|
+
XF did the pseudogene analysis. CC helped with RepeatMasker annotation.
|
903
|
+
All authors helped to revise the manuscript.
|
904
|
+
|
905
|
+
\section*{Acknowledgements}
|
906
|
+
We are grateful to Benedict Paten and Erik Garrison for discussions on
|
907
|
+
pangenome graphs. We thank minigraph users who have suggested features and
|
908
|
+
helped to fix various issues.
|
909
|
+
|
910
|
+
\section*{Funding}
|
911
|
+
This work is supported by National Institutes of Health (NIH) grants
|
912
|
+
U01HG010961 and R01HG010040.
|
913
|
+
|
914
|
+
\section*{Availability of data and materials}
|
915
|
+
Minigraph is openly available at
|
916
|
+
\href{https://github.com/lh3/minigraph}{https://github.com/lh3/minigraph}.
|
917
|
+
This repository also includes the script to convert from the segment coordinate
|
918
|
+
to the stable coordinate, to annotate variations and to generate blacklist
|
919
|
+
regions from the graph. The companion gfatools is available at
|
920
|
+
\href{https://github.com/lh3/gfatools}{https://github.com/lh3/gfatools}. The
|
921
|
+
human and the great ape graphs are hosted at
|
922
|
+
\href{ftp://ftp.dfci.harvard.edu/pub/hli/minigraph/}{ftp://ftp.dfci.harvard.edu/pub/hli/minigraph/}.
|
923
|
+
The NA12878, NA24385 and PGP1 phased assemblies were downloaded from
|
924
|
+
\href{ftp://ftp.dfci.harvard.edu/pub/hli/whdenovo/}{ftp://ftp.dfci.harvard.edu/pub/hli/whdenovo/}.
|
925
|
+
Assemblies generated by McDonnell Genome Institute include
|
926
|
+
GCA\_001524155.4 for NA19240, GCA\_002180035.3 for HG00514, GCA\_002209525.2
|
927
|
+
for HG01352, GCA\_002872155.1 for NA19434, GCA\_003574075.1 for HG02818,
|
928
|
+
GCA\_003086635.1 for HG03486, GCA\_003601015.1
|
929
|
+
for HG03807, GCA\_002208065.1 for HG00733, GCA\_003070785.1 for HG02059,
|
930
|
+
GCA\_008065235.1 for HG00268 and GCA\_007821485.1 for HG04217. Other assemblies
|
931
|
+
are available from GenBank under accession GCA\_001297185.1 for
|
932
|
+
CHM1~\cite{Huddleston:2017aa}, GCA\_000983455.1 for
|
933
|
+
CHM13~\cite{Huddleston:2017aa}, GCA\_001750385.1 for AK1~\cite{Seo:2016aa},
|
934
|
+
GCA\_002880755.3 for chimpanzee Clint~\cite{Kronenberg:2018aa},
|
935
|
+
GCA\_900006655.3 for gorilla Susie~\cite{Gordon:2016kq}, GCA\_008122165.1 for
|
936
|
+
gorilla Kamilah~\cite{Kronenberg:2018aa} and GCA\_002880775.3 for orangutan
|
937
|
+
Susie~\cite{Kronenberg:2018aa}.
|
938
|
+
|
939
|
+
|
940
|
+
\bibliographystyle{bmc-mathphys}
|
941
|
+
\bibliography{minigraph}
|
942
|
+
|
943
|
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
944
|
+
%% %%
|
945
|
+
%% Figures %%
|
946
|
+
%% %%
|
947
|
+
%% NB: this is for captions and %%
|
948
|
+
%% Titles. All graphics must be %%
|
949
|
+
%% submitted separately and NOT %%
|
950
|
+
%% included in the Tex document %%
|
951
|
+
%% %%
|
952
|
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
953
|
+
|
954
|
+
%\section*{Figures}
|
955
|
+
|
956
|
+
%\begin{figure}[h!]
|
957
|
+
% \caption{\csentence{Sample figure title.}
|
958
|
+
% Figure legend text.}
|
959
|
+
% \end{figure}
|
960
|
+
|
961
|
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
962
|
+
%% %%
|
963
|
+
%% Tables %%
|
964
|
+
%% %%
|
965
|
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
966
|
+
|
967
|
+
%\section*{Tables}
|
968
|
+
|
969
|
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
970
|
+
%% %%
|
971
|
+
%% Additional Files %%
|
972
|
+
%% %%
|
973
|
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
974
|
+
|
975
|
+
%\section*{Additional Files}
|
976
|
+
% \subsection*{Additional file 1 --- Sample additional file title}
|
977
|
+
% Additional file descriptions text (including details of how to
|
978
|
+
% view the file, if it is in a non-standard format or the file extension). This might
|
979
|
+
% refer to a multi-page table or a figure.
|
980
|
+
|
981
|
+
% \subsection*{Additional file 2 --- Sample additional file title}
|
982
|
+
% Additional file descriptions text.
|
983
|
+
|
984
|
+
|
985
|
+
\end{backmatter}
|
986
|
+
\end{document}
|