ruby-minigraph 0.0.20.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,986 @@
1
+ %% BioMed_Central_Tex_Template_v1.06
2
+
3
+ \documentclass[twocolumn]{bmcart}
4
+
5
+ %%% Load packages
6
+ \usepackage{amsthm,amsmath}
7
+ \RequirePackage{hyperref}
8
+ \usepackage[utf8]{inputenc} %unicode support
9
+
10
+ \usepackage{graphicx}
11
+ %\def\includegraphic{}
12
+ %\def\includegraphics{}
13
+
14
+ %%% Put your definitions there:
15
+ \startlocaldefs
16
+ \endlocaldefs
17
+
18
+
19
+ %%% Begin ...
20
+ \begin{document}
21
+
22
+ %%% Start of article front matter
23
+ \begin{frontmatter}
24
+
25
+ \begin{fmbox}
26
+ \dochead{Method}
27
+
28
+ \title{The design and construction of reference pangenome graphs with minigraph}
29
+
30
+ \author[
31
+ addressref={aff1,aff2}, % id's of addresses, e.g. {aff1,aff2}
32
+ corref={aff1}, % id of corresponding address, if any
33
+ email={hli@ds.dfci.harvard.edu} % email address
34
+ ]{\inits{HL}\fnm{Heng} \snm{Li}}
35
+ \author[
36
+ addressref={aff1,aff2},
37
+ ]{\inits{XF}\fnm{Xiaowen} \snm{Feng}}
38
+ \author[
39
+ addressref={aff2},
40
+ ]{\inits{CC}\fnm{Chong} \snm{Chu}}
41
+
42
+ \address[id=aff1]{% % unique id
43
+ \orgname{Department of Data Sciences, Dana-Farber Cancer Institute}, % university, etc
44
+ \city{Boston, MA 02215}, % city
45
+ \cny{USA} % country
46
+ }
47
+ \address[id=aff2]{%
48
+ \orgname{Department of Biomedical Informatics, Harvard Medical School},
49
+ \city{Boston, MA 02215},
50
+ \cny{USA}
51
+ }
52
+
53
+ \begin{abstractbox}
54
+
55
+ \begin{abstract} % abstract
56
+ The recent advances in sequencing technologies enable the assembly of
57
+ individual genomes to the quality of the reference genome. How to integrate
58
+ multiple genomes from the same species and make the integrated representation
59
+ accessible to biologists remains an open challenge. Here, we propose a
60
+ graph-based data model and associated formats to represent multiple genomes
61
+ while preserving the coordinate of the linear reference genome. We implement
62
+ our ideas in the minigraph toolkit and demonstrate that we can efficiently
63
+ construct a pangenome graph and compactly encode tens of thousands of
64
+ structural variants missing from the current reference genome.
65
+ \end{abstract}
66
+
67
+ \begin{keyword}
68
+ \kwd{bioinformatics}
69
+ \kwd{genomics}
70
+ \kwd{pangenome}
71
+ \end{keyword}
72
+
73
+ \end{abstractbox}
74
+
75
+ \end{fmbox}
76
+
77
+ \end{frontmatter}
78
+
79
+ %%
80
+ \section*{Background}
81
+
82
+ The human reference genome is a fundamental resource for human genetics and
83
+ biomedical research. The primary sequences of the reference genome
84
+ GRCh38~\cite{Schneider:2017aa} are a mosaic of haplotypes with each haplotype segment derived
85
+ from a single human individual. They cannot represent the genetic diversity in
86
+ human populations and as a result, each individual may carry thousands of large
87
+ germline variants absent from the reference genome~\cite{Huddleston:2017aa}.
88
+ Some of these variants are likely associated with phenotype~\cite{Eichler_2010}
89
+ but are often missed or misinterpreted when we map sequence data to GRCh38, in
90
+ particular with short reads~\cite{Li:2018aa}. This under-representation of
91
+ genetic diversity may become a limiting factor in our understanding of genetic
92
+ variations.
93
+
94
+ Meanwhile, the advances in long-read sequencing technologies make it possible
95
+ to assemble a human individual to a quality comparable to
96
+ GRCh38~\cite{Schneider:2017aa,Wenger_2019}. There are already a dozen
97
+ high-quality human assemblies available in GenBank~\cite{Audano:2019aa}.
98
+ Properly integrating these genomes into a reference \emph{pangenome}, which
99
+ refers to a collection of genomes~\cite{cpgc:2016aa}, would potentially address
100
+ the issues with a single linear reference.
101
+
102
+ A straightforward way to represent a pangenome is to store unaligned genomes
103
+ in a full-text index that compresses redundancies in sequences identical
104
+ between individuals~\cite{Makinen:2010aa,Liu_2016,Boucher_2019}. We may
105
+ retrieve individual genomes from the index, inspect the k-mer spectrum and test
106
+ the presence of k-mers using standard techniques. In principle, it is also
107
+ possible to apply canonical read alignment algorithms to map sequences to
108
+ the collection, but in practice, the redundant hits to multiple genomes will
109
+ confuse downstream mapping-based analyses~\cite{NA2016159}. It is not clear how
110
+ to resolve these multiple mappings.
111
+
112
+ The other class of methods encodes multiple genomes into a sequence graph,
113
+ usually by collapsing identical or similar sequences between genomes onto a
114
+ single representative sequence. This results in a \emph{pangenome graph}. A
115
+ pangenome graph is a powerful tool to identify the core genome, the part of a
116
+ genome or gene set that is shared across the majority of the strains or related species
117
+ in a clade~\cite{Vernikos:2015aa}. A common way to construct a basic pangenome
118
+ graph is to generate a compacted de Bruijn graph
119
+ (cDBG)~\cite{Marcus:2014xy,Baier_2015,Beller:2016ab,Chikhi:2015aa,Minkin_2016,Chikhi_2016,almodaresi_et_al:LIPIcs:2017:7657}
120
+ from a set of genomes. Basic cDBG does not keep sample information.
121
+ \cite{Iqbal:2012aa} proposed colored cDBG, with each color representing a sample
122
+ or a population. Colored cDBG can be constructed
123
+ efficiently~\cite{Muggli_2019,Holley695338}. However, a colored cDBG discards
124
+ the chromosomal coordinate and thus disallows the mapping of genomic features.
125
+ It often includes connections absent from the input genomes and thus encodes
126
+ more sequences than the input. A colored cDBG cannot serve as a
127
+ \emph{reference} pangenome graph, either. deBGA~\cite{Liu:2016ac} addresses
128
+ the issue by labeling each unitig with its possibly multiple locations in the
129
+ input genome(s). Pufferfish~\cite{Almodaresi:2018aa} further reduces its space
130
+ requirement. Nonetheless, given hundreds of human genomes, there will be many
131
+ more vertices in the graph and most vertices are associated with hundreds of
132
+ labels. Whether deBGA and pufferfish can scale to such datasets remains an open
133
+ question. GBWT~\cite{Sir_n_2019} provides another practical solution to storage
134
+ and indexing, but no existing tools can practically construct a cDBG for many
135
+ human genomes in the GBWT representation.
136
+
137
+ In addition to cDBG, we can derive a reference pangenome
138
+ graph from a single linear multi-sequence alignment (MSA)~\cite{Dilthey_2015,Dilthey_2019}.
139
+ It has been used for HLA typing but is not applicable to whole chromosomes when
140
+ they cannot be included in a single linear MSA. The third and possibly the most
141
+ popular approach to reference graph generation is to call variants from other
142
+ sources and then incorporate these variants, often in the VCF format~\cite{Danecek:2011qy}, into
143
+ the reference genome as alternative
144
+ paths~\cite{Eggertsson:2017aa,Rakocevic_2019,Sibbesen:2018aa,Biederstedt:2018aa,Eggertsson_2019}.
145
+ However, because VCF does not define coordinates on insertions, this approach
146
+ cannot properly encode variations on long insertions and is therefore limited
147
+ to simple variations. There are no satisfactory solutions to the construction
148
+ of reference pangenome graphs.
149
+
150
+ In this article, we introduce the reference Graphical Fragment Assembly (rGFA)
151
+ format to model reference pangenome graphs. We propose and demonstrate an
152
+ incremental procedure to construct graphs under this model. The resulting
153
+ graphs encode structural variations (SVs) of length 100bp or longer without haplotype
154
+ information. Our implementation, minigraph~\cite{Li_minigraph:2020aa}
155
+ (\href{https://github.com/lh3/minigraph}{https://github.com/lh3/minigraph}),
156
+ can construct a pangenome graph from twenty human assemblies in three hours.
157
+
158
+ \section*{Results}
159
+
160
+ We will first describe a data model for reference pangenome graphs, which
161
+ establishes the foundation of this article. We will then present a new
162
+ sequence-to-graph mapper, minigraph, and show how this mapper incrementally
163
+ constructs a pangenome graph. We will demonstrate the utility of pangenome
164
+ graphs with a human graph generated from twenty human haplotypes and a primate
165
+ graph generated from four species.
166
+
167
+ \subsection*{Modeling reference pangenome graphs}
168
+
169
+ \subsubsection*{Sequence graphs}
170
+
171
+ There are several equivalent ways to define a sequence graph. In this article,
172
+ a \emph{sequence graph} $G(V,E)$ is a bidirected graph. Each vertex $v\in V$ is
173
+ associated with a DNA sequence; each edge $e\in E$ has two directions, one for
174
+ each endpoint, which leads to four types of edges: forward-forward,
175
+ reverse-forward, forward-reverse and reverse-reverse. The directions on an edge
176
+ dictate how a sequence is spelled from a walk/path in the graph. Common
177
+ assembly graphs, such as the overlap graph, string graph and de Bruijn graph
178
+ can all be formulated as sequence graphs.
179
+
180
+ \begin{figure}[t]
181
+ \includegraphics[width=.47\textwidth]{Fig1}
182
+ \caption{\csentence{Example rGFA and GAF formats.} {\bf (a)} Example rGFA
183
+ format. rGFA-specific tags include SN, name of the stable sequence from which
184
+ the vertex is derived; SO, offset on the stable sequence; SR, rank: 0 if the
185
+ vertex or edge is on the linear reference; $>$0 for non-reference. {\bf (b)}
186
+ Corresponding sequence graph. Each thick arrow represents an oriented DNA
187
+ sequence. {\bf (c)} Example GAF format, using the segment coordinate, for
188
+ reads ``${\tt GTGGCT}$'' and ``${\tt CGTTTCC}$'' mapped to the graph. {\bf
189
+ (d)} Equivalent GAF format using the stable coordinate.}\label{fig:rgfa}
190
+ \end{figure}
191
+
192
+ The Graphical Fragment Assembly (GFA) format~\cite{Li:2016aa} describes
193
+ sequence graphs. The core of GFA is defined by the following grammar:
194
+
195
+ {\footnotesize
196
+ \begin{verbatim}
197
+
198
+ <GFA> <- (<segment> | <link>)+
199
+ <segment> <- `S' <segId> <segSeq>
200
+ <link> <- `L' <segId> [+-] <segId> [+-] <cigar>
201
+
202
+ \end{verbatim}}
203
+
204
+ {\flushleft
205
+ A line starting with letter ``${\tt S}$'' corresponds to a vertex and a line
206
+ starting with ``${\tt L}$'' corresponds
207
+ to a bidirected edge. In a de Bruijn graph, we often attach sequences to edges
208
+ instead of vertices~\cite{Pevzner:2001vn,Gnerre:2011ys}. To avoid confusion, in this
209
+ article, we also call a vertex a \emph{segment} and call an edge a
210
+ \emph{link}, following the GFA terminology. Fig.~\ref{fig:rgfa}a shows an
211
+ example GFA that encodes Fig.~\ref{fig:rgfa}b.
212
+ }
213
+
214
+ A sequence graph in the GFA format natively defines a \emph{segment coordinate}
215
+ system where each base in the graph is uniquely indexed by a
216
+ 2-tuple $({\rm segId},{\rm segOffset})$. For example, in
217
+ Fig.~\ref{fig:rgfa}a, the base at position $({\rm s2},2)$ is ``{\tt G}''.
218
+ A major problem with this coordinate is that it is decoupled from linear
219
+ annotations and is sensitive to graph transformations. For example, if we split
220
+ a segment into two connected segments, the set of sequences spelled from the graph
221
+ remains the same, but the segment coordinates will be changed. Due to the
222
+ instability of segment coordinates, a basic sequence graph is inadequate for a
223
+ reference graph.
224
+
225
+ \subsubsection*{Reference pangenome graphs}
226
+
227
+ We propose the reference GFA (rGFA) format to encode reference pangenome graphs.
228
+ rGFA is an extension to GFA with three additional tags that indicate the origin
229
+ of a segment from linear genomes (Fig.~\ref{fig:rgfa}a). This simple addition
230
+ gives us a unique stable coordinate system as an extension to the linear
231
+ reference coordinate (e.g. GRCh38). We can pinpoint a position such as
232
+ ``{\sf chr1:9}'' in the graph and map existing annotations onto the graph. We can
233
+ also report a path or walk in the stable coordinate. For example, path
234
+ ``{\sf s1$\to$s2$\to$s3}'' unambiguously corresponds to ``{\sf
235
+ chr1:0-5$\to$chr1:5-8$\to$chr1:8-12}'' or simply ``{\sf chr1:0-12}'' if we
236
+ merge adjacent coordinates; similarly, ``{\sf s1$\to$s2$\to$s5$\to$s6}''
237
+ corresponds to ``{\sf chr1:0-8$\to$foo:8-16}''. We will formally describe the
238
+ path format when introducing the GAF format in the next section.
239
+
240
+ In rGFA, each segment is associated with one origin. This apparently trivial
241
+ requirement in fact imposes a strong restriction on the types of graphs rGFA
242
+ can encode: it forbids the collapse of different regions from one sequence,
243
+ which would often happen in a cDBG. We consider this restriction an
244
+ advantage of rGFA because it requires the graph to have a ``linear'' flavor
245
+ intuitively and simplifies the data structure to store the graph.
246
+
247
+ For simplicity, rGFA disallows overlaps between edges and forbids multiple
248
+ edges (more than one edge between the same pair of vertices). These two
249
+ restrictions help to avoid ambiguity and reduce the complexity in
250
+ implementation. They are not strictly necessary in theory.
251
+
252
+ \subsubsection*{The Graphical mApping Format (GAF)}
253
+
254
+ \begin{table}[tb]
255
+ \caption{The Graphical mApping Format (GAF)}\label{tab:gaf}
256
+ \begin{tabular}{rcp{6cm}}
257
+ \hline
258
+ Col & Type & Description \\ \hline
259
+ 1 & string & Query sequence name \\
260
+ 2 & int & Query sequence length \\
261
+ 3 & int & Query start coordinate (0-based; closed) \\
262
+ 4 & int & Query end coordinate (0-based; open) \\
263
+ 5 & char & Strand relative to col. 6 \\
264
+ 6 & string & Graph path matching regular expression \texttt{/([><][\char94\char92s><]+(:\char92d+-\char92d+)?)+\char124([\char94\char92s><]+)/}\\
265
+ 7 & int & Path sequence length \\
266
+ 8 & int & Path start coordinate \\
267
+ 9 & int & Path end coordinate \\
268
+ 10 & int & Number of matching bases in the mapping \\
269
+ 11 & int & Number of bases, including gaps, in the mapping \\
270
+ 12 & int & Mapping quality (0--255 with 255 for missing) \\ \hline
271
+ \end{tabular}
272
+ \end{table}
273
+
274
+ As there are no text formats for sequence-to-graph alignment, we propose a new
275
+ Graphical mApping Format (GAF) by extending the Pairwise mApping Format
276
+ (PAF)~\cite{Li:2016aa}. GAF is TAB-delimited with each column defined in
277
+ Table~\ref{tab:gaf}. Column 6 encodes a path on the graph. It follows the
278
+ formal grammar below:
279
+
280
+ {\footnotesize
281
+ \begin{verbatim}
282
+
283
+ <path> <- <stableId> | <orientIntv>+
284
+ <orientIntv> <- (`>' | `<') (<segId> | <stableIntv>)
285
+ <stableIntv> <- <stableId> `:' <start> `-' <end>
286
+
287
+ \end{verbatim}}
288
+
289
+ {\flushleft
290
+ In this grammar, {\tt <segId>} is a segment identifier on an S-line in rGFA;
291
+ {\tt <stableId>} is a stable sequence name at the {\tt SN} tag on the
292
+ corresponding S-line. Column 6 can be either a path in the segment coordinate
293
+ (Fig.~\ref{fig:rgfa}c) or an equivalent path in the stable coordinate
294
+ (Fig.~\ref{fig:rgfa}d). We can merge adjacent stable coordinates if the two
295
+ segments originate from the same stable sequence and the end offset of the
296
+ first segment is equal to the start offset of the second segment. For example,
297
+ ``{\tt >chr1:0-5>chr1:5-8}'' can be simplified to ``{\tt >chr1:0-8}''.
298
+ Furthermore, if a path in column 6 is derived from one reference sequence, we
299
+ recommend replacing it with the entire reference path on the forward
300
+ orientation (e.g. see ``read1'' in Fig.~\ref{fig:rgfa}d). With this convention,
301
+ a GAF line is reduced to PAF for a sequence mapped to a reference sequence.
302
+ Similar to PAF, GAF also allows optional tags in the SAM-like format. Base
303
+ alignment is kept at the {\tt cg} tag.}
304
+
305
+ Minigraph produces GAF in both the segment and the stable coordinate.
306
+ GraphAligner~\cite{Rautiainen810812} produces GAF in the segment coordinate
307
+ only, which can be converted to the stable coordinate.
308
+
309
+ \begin{figure}[t]
310
+ \includegraphics[width=.47\textwidth]{Fig2}
311
+ \caption{\csentence{Minigraph algorithms.} {\bf (a)} Diagram of the minigraph
312
+ mapping algorithm. Minigraph seeds alignments with minimizers, finds good
313
+ enough linear chains, connects them in the graph and seeks the most weighted
314
+ path as a graph chain. {\bf (b)} Diagram of incremental graph construction. A
315
+ graph is iteratively constructed by mapping each assembly to an existing
316
+ graph and augmenting the graph with long poorly mapped sequences in the
317
+ assembly.}\label{fig:mg}
318
+ \end{figure}
319
+
320
+ \subsection*{Sequence-to-graph mapping}
321
+
322
+ Our incremental graph construction algorithm relies on genome-to-graph
323
+ alignment (Fig.~\ref{fig:mg}b). As existing sequence-to-graph
324
+ aligners~\cite{Rautiainen810812,Garrison:2018aa} do not work with
325
+ chromosome-long query sequences, we adapted minimap2~\cite{Li:2018ab} for our
326
+ purpose and implemented minigraph (Fig.~\ref{fig:mg}a). Briefly, minigraph uses
327
+ a minimap2-like algorithm to find local hits to segments in the graph, ignoring
328
+ the graph topology. It then chains these local hits if they are connected on
329
+ the graph, possibly through cycles. This gives the approximate mapping locations. Minigraph does not
330
+ perform base-level alignment. This is because the graph we construct encodes
331
+ SVs and rarely contains paths similar at the base level. The best mapping is
332
+ often clear without base alignment.
333
+
334
+ \begin{table}[b]
335
+ \caption{Performance of sequence-to-graph mapping}\label{tab:mgvga}
336
+ \begin{tabular}{lrr}
337
+ \hline
338
+ & minigraph & GraphAligner \\
339
+ \hline
340
+ Indexing time (wall-clock sec) & 100 & 589 \\
341
+ Mapping time (wall-clock sec) & 79 & 140 \\
342
+ Peak RAM (GB) & 19.5 & 27.2 \\
343
+ Percent unmapped reads & 0.5\% & 0\% \\
344
+ Percent wrong mappings & 1.7\% & 4.6\% \\
345
+ \hline
346
+ \end{tabular}
347
+ \end{table}
348
+
349
+ To evaluate the accuracy of minigraph mapping, we simulated PacBio reads from
350
+ GRCh38 with PBSIM~\cite{Ono:2013aa} and mapped them to the graph we constructed
351
+ in the next section. Table~\ref{tab:mgvga} compares the performance of
352
+ minigraph and GraphAligner~\cite{Rautiainen810812} v1.0.10 on 68,857 simulated
353
+ reads mapped over 8 CPU threads. {\color{black} The N50 read length is 15kb.
354
+ 9,862 reads are mapped across two or more segments by GraphAligner. Note that
355
+ both minigraph and GraphAligner ignore the stable coordinates during mapping.
356
+ All segments, originating either from GRCh38 or from individual genomes, are
357
+ treated equally. To this end, while we simulated reads from GRCh38, we are also
358
+ evaluating how well mappers work with complex SVs present in any input
359
+ samples.}
360
+
361
+ On this dataset, minigraph
362
+ is faster than GraphAligner and uses less memory, partly because minigraph does
363
+ not perform base alignment.
364
+ As is shown in Table~\ref{tab:mgvga}, minigraph is more accurate than
365
+ GraphAligner. This is counter-intuitive given that GraphAligner does base
366
+ alignment. Close inspection reveals that most mismapped reads by minigraph are
367
+ mapped to the correct genomic loci but wrong graph paths. On the contrary, most
368
+ mismapped reads by GraphAligner are mapped to wrong genomic loci. This suggests
369
+ minigraph is better at finding approximate mapping locations but GraphAligner
370
+ is better at disambiguating similar graph paths. Combining the strength of
371
+ both could lead to a better graph mapper. We do plan to implement base-level
372
+ alignment in minigraph in the future.
373
+
374
+ We have also tried vg v1.21.0~\cite{Garrison:2018aa}. It indexed the same graph in 14.7 wall-clock
375
+ hours and mapped the simulated reads in 1.8 hours over 8 threads, tens of times
376
+ slower than minigraph and GraphAligner. However, no reads are mapped in the
377
+ output. We have not been able to make vg work with our data.
378
+
379
+ \subsection*{Generating pangenome graphs}
380
+
381
+ Fig.~\ref{fig:mg}b shows how minigraph constructs a pangenome graph (see
382
+ Methods for details). This procedure is similar to multiple sequence alignment
383
+ via partial order graph~\cite{Lee_2002} except that minigraph works with cyclic
384
+ graphs and ignores small variants. Minigraph only considers SVs of
385
+ 100bp--100kb in length and ignores SVs in alignments shorter than 100kb.
386
+ For each input assembly, it filters out regions covered by two or more primary
387
+ alignments longer than 20kb in the assembly. This filter avoids paralogous
388
+ regions in a sample and guarantees that graphs generated by minigraph can be
389
+ modeled by rGFA.
390
+
391
+ As a sanity check, we compared minigraph to dipcall
392
+ (\href{https://github.com/lh3/dipcall}{https://github.com/lh3/dipcall}) on
393
+ calling SVs 100bp or longer from a synthetic diploid sample composed of CHM1
394
+ and CHM13~\cite{Li:2018aa}. Given two SV callsets $A$ and $B$, we say a call in
395
+ $A$ is \emph{missed} in callset $B$ if there are no calls in $B$ within 1000bp
396
+ from the call in $A$. With this criterion, 2.7\% of 14,792 SVs called by
397
+ dipcall are missed by minigraph; 6.0\% of 14,932 minigraph SVs are missed by
398
+ dipcall. We manually inspected tens of differences in
399
+ IGV~\cite{Robinson:2011aa} and identified two causes. First, an INDEL longer
400
+ than 100bp called by one caller may be split into two shorter INDELs by the
401
+ other caller. There is often more than one smaller SV around a missed SV
402
+ call. Second, dipcall skips regions involving high density of SNPs or involving
403
+ both long insertions and long deletions, but minigraph connects these events
404
+ and calls SVs in such regions. It tends to call more SVs. Overall, we believe
405
+ minigraph and dipcall found similar sets of SVs.
406
+
407
+ \begin{table}[tb]
408
+ \caption{Assemblies used for graph construction}\label{tab:asm}
409
+ \begin{tabular}{llll}
410
+ \hline
411
+ Name & Species & Population & Accession/Source \\ \hline
412
+ CHM1 & Human & N/A & GCA\_001297185.1 \\
413
+ CHM13 & Human & N/A & GCA\_000983455.1 \\
414
+ NA12878 & Human & European & \cite{Garg810341}, phased \\
415
+ NA24385 & Human & Jewish & \cite{Garg810341}, phased \\
416
+ PGP1 & Human & N/A & \cite{Garg810341}, phased \\
417
+ NA19240 & Human & African & GCA\_001524155.4 \\
418
+ HG00514 & Human & East Asian & GCA\_002180035.3 \\
419
+ HG01352 & Human & American & GCA\_002209525.2 \\
420
+ NA19434 & Human & African & GCA\_002872155.1 \\
421
+ HG02818 & Human & African & GCA\_003574075.1 \\
422
+ HG03486 & Human & African & GCA\_003086635.1 \\
423
+ HG03807 & Human & South Asian& GCA\_003601015.1 \\
424
+ HG00733 & Human & American & GCA\_002208065.1 \\
425
+ HG02059 & Human & East Asian & GCA\_003070785.1 \\
426
+ HG00268 & Human & European & GCA\_008065235.1 \\
427
+ HG04217 & Human & South Asian& GCA\_007821485.1 \\
428
+ AK1 & Human & East Asian & GCA\_001750385.1 \\
429
+ Clint & Chimpanzee & & GCA\_002880755.3 \\
430
+ Susie & Gorilla & & GCA\_900006655.3 \\
431
+ Kamilah & Gorilla & & GCA\_008122165.1 \\
432
+ Susie & Orangutan & & GCA\_002880775.3 \\
433
+ \hline
434
+ \end{tabular}
435
+ \end{table}
436
+
437
+ \begin{figure*}[htbp]
438
+ \includegraphics[width=.95\textwidth]{Fig3}
439
+ \caption{\csentence{Characteristics of the human and the great ape graphs.} {\bf
440
+ (a)} Human variations stratified by repeat class and by the number of
441
+ alleles of each variation. The repeat annotation was obtained from the
442
+ longest allele of each variation. VNTR: variable-number tandem repeat, a
443
+ tandem repeat with the unit motif length $\ge$7bp. STR: short tandem repeat,
444
+ a tandem repeat with the unit motif length $\le$6bp. LCR: low-complexity
445
+ regions. Mixed-inter.: a variation involving $\ge$2 types of interspersed
446
+ repeats. {\bf (b)} Great ape variations stratified by repeat class and by the
447
+ number of alleles. {\bf (c)} Human biallelic variations stratified by repeat
448
+ class and by insertion to/deletion from GRCh38. Both alleles are required to
449
+ be covered in all assemblies. {\bf (d)} Human-specific biallelic variations
450
+ stratified by repeat class and by insertion to/deletion from GRCh38. Red bars
451
+ correspond to insertions to the human lineage. {\bf (e)} Distribution of
452
+ different types of human variations along chromosomes. {\bf (f)} Boxplot of
453
+ the longest allele length in each repeat class. Outliers are omitted for the
454
+ clarity of the figure.}\label{fig:anno}
455
+ \end{figure*}
456
+
457
+ \subsection*{A human pangenome graph}
458
+
459
+ Starting with GRCh38, we constructed a human pangenome graph from 20 human
460
+ haplotypes or haplotype-collapsed assemblies (Table~\ref{tab:asm}). It took
461
+ minigraph 2.7 wall-clock hours over 24 CPU threads to generate this graph. The
462
+ peak memory is 98.1GB. The resulting graph consists of 148,618 segments and
463
+ 214,995 links. It contains 37,332 variations, where a \emph{variation}
464
+ denotes a minimal subgraph that has a single source and a single sink with both
465
+ segments coming from GRCh38. A path through the bubble between the source and
466
+ the sink represents an \emph{allele}.
467
+
468
+ Variations in the human graph are enriched with Alus and VNTRs
469
+ (Fig.~\ref{fig:anno}a). While interspersed repeats are about evenly distributed
470
+ along chromosomes except in the pseudoautosomal regions (Fig.~\ref{fig:anno}e),
471
+ VNTRs are enriched towards telomeres~\cite{Audano:2019aa}. It is worth noting
472
+ the density of minisatellites is also higher in subtelomeres. If we normalize
473
+ the density of VNTRs in the pangenome graph by the density of minisatellites in
474
+ GRCh38, the enrichment of VNTRs towards telomeres is still visible but becomes
475
+ less prominent. At the same time, repeat-less variations are also enriched
476
+ towards the ends of chromosomes (green areas in Fig.~\ref{fig:anno}e),
477
+ suggesting subtelomeres tend to harbor SVs anyway. We also
478
+ identified 85 processed pseudogenes among these variations.
479
+
480
+ \begin{figure}
481
+ \includegraphics[width=.46\textwidth]{igv-edit.png}
482
+ \caption{\csentence{IGV screenshot of a region enriched with long insertions.}
483
+ Numbers on wide purple bars indicate insertion lengths. CLR: PacBio noisy
484
+ continuous long reads. HiFi: PacBio high-fidelity reads.}\label{fig:igv}
485
+ \end{figure}
486
+
487
+ Another noticeable feature of VNTRs is that over half of VNTR variations are
488
+ multiallelic (Fig.~\ref{fig:anno}a). Fig.~\ref{fig:igv} shows a multi-allelic
489
+ region composed of VNTRs. We can see many insertions of different lengths. The
490
+ two different NA12878 assemblies also disagree with each other, which we often
491
+ see around other VNTR loci in NA12878 as well. We have not inspected raw reads
492
+ in this particular example, but we tend to believe the disagreement is caused
493
+ by local misassemblies rather than somatic mutations. In addition, due to the
494
+ multiallelic nature of such VNTRs, the two haplotypes in a human individual are
495
+ often different. Assemblies mixing the two haplotypes (aka collapsed
496
+ assemblies) may have more trouble in these regions. Multiallelic VNTRs are
497
+ hard to assemble correctly.
498
+
499
+ Multiallelic VNTRs are also hard to align and to call. In Fig.~\ref{fig:igv},
500
+ the insertion positions are often different, which could be caused by a few
501
+ mutations or sequencing errors. A naive alignment-based SV caller would call a
502
+ dozen low-frequency insertions in this region, which does not reflect these
503
+ correlated events. Without base-level alignment, minigraph may
504
+ have more trouble obtaining the optimal alignment in these complex VNTR
505
+ regions. Improved data quality, assembly algorithms and graph mapping
506
+ algorithms are required to investigate VNTR regions in detail.
507
+
508
+ \subsection*{A great ape pangenome graph}
509
+
510
+ We also constructed a great ape pangenome graph from GRCh38, one chimpanzee,
511
+ two gorillas and one orangutan (Table~\ref{tab:asm}). This graph contains
512
+ 206,452 variations, over four times more than the human graph. About half of
513
+ variations originate from orangutan, the species most distant from human.
514
+
515
+ In the great ape graph, the L1-to-Alu ratio is close to 1:1, much higher than
516
+ the ratio in the human graph (Fig.~\ref{fig:anno}b vs Fig.~\ref{fig:anno}a).
517
+ This is perhaps correlated with the elevated L1 activity in great
518
+ apes~\cite{Mathews:2003aa}. Of retrotransposon-related variations specific to
519
+ the human lineage, the overwhelming majority are insertions
520
+ (Fig.~\ref{fig:anno}d), which is expected as transpositions lead to insertions
521
+ only. Most human-specific Alu deletions are incomplete and involve ancient Alu
522
+ subfamilies. They are likely genomic deletions that happen to hit Alus. In
523
+ contrast, the majority of ``partial-repeats'' are deletions from the human
524
+ lineage. Two thirds of autosomal insertions in this category are segmental
525
+ duplications in GRCh38. In all, minigraph is an efficient tool to study closely
526
+ related species.
527
+
528
+ \subsection*{Blacklist regions from human pangenome graphs}
529
+
530
+ The human pangenome graph effectively encodes SVs $\ge$100bp
531
+ in 20 genomes. These large-scale variations could be a frequent source of
532
+ technical artifacts in variant calling with short reads. To test this
533
+ hypothesis, we compared short-read SNP calls with vs without regions around SVs
534
+ in the pangenome graph.
535
+
536
+ We constructed a human pangenome graph excluding CHM1 and CHM13, the two
537
+ samples used in the SynDip benchmark~\cite{Li:2018aa}, and generated regions
538
+ around variations (see Methods), which we call \emph{blacklist regions},
539
+ following the rationale in~\cite{Amemiya:2019aa}. Blacklist regions total
540
+ 29.2Mb in length, intersecting 0.7\% of confident regions in
541
+ SynDip~\cite{Li:2018aa}; 0.7\% of truth SNPs are contained in blacklist regions
542
+ -- true SNPs are not enriched in blacklist regions.
543
+
544
+ We mapped short reads used in~\cite{Li:2018aa} with minimap2 and called
545
+ variants with GATK v4.1.2~\cite{Depristo:2011vn}. This callset
546
+ contains 32,879 false positive SNPs, 21\% of which fall in blacklist regions --
547
+ false SNP calls are highly enriched in this $<$1\% region of human genome. This
548
+ confirms a noticeable fraction of false SNP calls using short reads are
549
+ the result of misalignment involving SVs.
550
+
551
+ \section*{Discussion}
552
+
553
+ Based on the GFA assembly format~\cite{Li:2016aa}, we proposed the rGFA format,
554
+ which defines a data model for reference pangenome graphs at the same time.
555
+ rGFA takes a linear reference genome as the backbone and maintains the
556
+ conceptual ``linearity'' of input genomes.
557
+
558
+ rGFA is not the only pangenome graph model. Vg~\cite{Garrison:2018aa}
559
+ encodes a stable sequence with a path through the sequence graph~\cite{10.12688/f1000research.19630.1}. A segment
560
+ in the graph may occur on multiple paths, or occur multiple times on one path
561
+ if there are cycles in the graph. This way, vg allows different regions in one
562
+ chromosome collapsed to one segment. We call such a graph a collapsed graph. rGFA
563
+ cannot encode a collapsed graph. The vg model is thus more general.
564
+
565
+ In our view, however, the reference pangenome graph should not be a collapsed
566
+ graph. In a collapsed graph, the definition of orthology is not clear because
567
+ multiple sequences from the same sample may go through the same segment.
568
+ Without the concept of orthology, we cannot define variations, either. In
569
+ addition, due to the one-to-many relationship between segments and the
570
+ reference genome, it is intricate to derive the stable coordinate of a path in
571
+ a collapsed graph. For example, suppose segment {\sf s1} corresponds to two
572
+ regions {\sf chr1:100-200} and {\sf chr1:500-600}. To convert a path {\sf
573
+ s2$\to$s1$\to$s3} to the stable coordinate, we have to inspect adjacent
574
+ segments to tell which {\sf s1} corresponds to; this becomes more challenging
575
+ when {\sf s2} and {\sf s3} represent multiple regions in the reference genome.
576
+ In contrast, rGFA inherently forbids a collapsed graph and avoids the potential
577
+ issues above. This makes rGFA simpler than vg's path model and easier to work
578
+ with.
579
+
580
+ To demonstrate practical applications of rGFA, we developed minigraph to
581
+ incrementally generate pangenome graphs. It can generate a graph from 20
582
+ genomes in three hours and can scale to hundreds of genomes in future. A
583
+ limitation of minigraph is that it does not perform base alignment and may be
584
+ confused by similar paths in the graph. {\color{black} Unfortunately, base-level
585
+ sequence-to-graph alignment is not a fully solved problem. Partial-order graph
586
+ alignment~\cite{Lee_2002} and PaSGAL~\cite{DBLP:conf/ipps/JainMZDA19} only work
587
+ with directed acyclic graphs (DAGs). Vg~\cite{Garrison:2018aa} uses a heuristic
588
+ to unroll cycles but it is exponential in time in the worst case and for DAGs,
589
+ its exact mode is tens of times slower than PaSGAL. Antipov et
590
+ al~\cite{Antipov:2016aa} proved that alignment against cyclic graphs can be
591
+ done in polynomial time. GraphAligner~\cite{Rautiainen810812} implements a
592
+ fast quadratic algorithm for computing edit distance~\cite{Rautiainen_2019}.
593
+ However, edit distance based alignment disallows long INDELs and is often
594
+ inadequate for accurate variant calling. Jain et al~\cite{Jain_2020} recently
595
+ proposed a quadratic algorithm for alignment with affine gap penalty but the
596
+ authors focused on the theoretical analysis only. To the best of our knowledge,
597
+ no tools can efficiently perform sequence-to-graph alignment under affine gap
598
+ cost. We plan to learn from the existing algorithms and implement fast base
599
+ alignment in minigraph in future. This may take significant effort.}
600
+
601
+ Another limitation of minigraph is
602
+ that it is unable to align sequences against a graph encoding all small variants.
603
+ Such a graph will be composed of millions of short segments. Not
604
+ indexing minimizers across segments, minigraph will fail to seed the initial
605
+ linear chains. This limitation can only be resolved by completely changing the
606
+ minigraph mapping algorithm. Nonetheless, small variants are easier to
607
+ analyze with the standard methods. Incorporating these variants unnecessarily
608
+ enlarges the graph, complicates implementations, increases the rate of false
609
+ mappings~\cite{Pritt_2018} and reduces the performance of common tasks. There
610
+ is also no known algorithm that can construct such a complex graph for hundreds
611
+ of human genomes.
612
+
613
+ Minigraph does not keep track of the sample information as of now. To address
614
+ this issue, we are considering implementing colored rGFA, similar to colored de
615
+ Bruijn graphs~\cite{Iqbal:2012aa}. In a colored rGFA, a color represents one
616
+ sample. Each segment or link is associated with one or multiple colors,
617
+ indicating the sources of the segment or the link. Colors can be stored in an
618
+ rGFA tag or in a separate segment/link-by-sample binary
619
+ matrix~\cite{Holley695338}. The matrix representation may be more compact given
620
+ a large number of samples.
621
+
622
+ We have shown minigraph can be a fast and powerful research tool to summarize
623
+ SVs at the population scale and to study the evolution of closely related
624
+ species. A more practical question is how a reference pangenome graph may
625
+ influence routine data analysis. Here is our limited view.
626
+
627
+ We think a critical role a reference graph plays is that it extends the
628
+ coordinate system of a linear reference genome. This allows us to annotate
629
+ variations in highly diverse regions such as the human HLA and KIR regions. The
630
+ existing pipelines largely ignore these variations because most of them cannot
631
+ be encoded in the primary assembly of GRCh38.
632
+
633
+ The extended graph coordinate system further helps to consistently represent
634
+ complex SVs. Given multiple samples, the current practice is to call SVs from
635
+ individual samples and then merge them. Two subtly different SVs, especially
636
+ long insertions, may be called at two distinct locations and treated as
637
+ separate events. With the minigraph procedure, the two SVs are likely to
638
+ be aligned together as long as they are similar to each other and are
639
+ sufficiently different from the reference allele. To some extent, minigraph is
640
+ performing multiple sequence alignment with partial order
641
+ alignment~\cite{Lee_2002}. This procedure is more robust to different
642
+ representations of the same SV than naive merging. When we refer to a SNP, we often use its
643
+ chromosomal coordinate such as ``chr1:12345''. We rarely do so for SVs because
644
+ their positions are sensitive to alignment and SV callers. The more consistent
645
+ SV representation implied by a pangenome graph will help to alleviate the issue
646
+ and subsequently facilitate the genotyping of
647
+ SVs~\cite{Hickey_2020,Eggertsson_2019,Chen_2019}.
648
+
649
+ While we believe a reference pangenome graph will make complex variations more
650
+ accessible by geneticists and biologists, we suspect a great majority of
651
+ biomedical researchers will still rely on a linear reference genome due to the
652
+ conceptual simplicity of linear genomes and the mature tool chains developed in
653
+ decades. Many analyses such as SNP calling in well behaved regions do not
654
+ benefit much from a pangenome representation, either. Nonetheless, a pangenome
655
+ reference still helps applications based on linear references. With a graph
656
+ reference, we may blacklist regions enriched with SVs that lead to small variant
657
+ calling errors. We may potentially generate ``decoy'' sequences that are
658
+ missing from the primary assembly to attract falsely mapped reads away. We may
659
+ perform read alignment against a graph, project the alignment to the linear
660
+ coordinate and finish the rest of analyses in the linear space. We anticipate a
661
+ pangenome reference to supplement the linear reference, not to replace it.
662
+
663
+ \section*{Conclusions}
664
+
665
+ Complex human sequence variations are like genomic dark matter: they are
666
+ pervasive in our genomes but are often opaque to the assay with the existing
667
+ tools. We envision a pangenome graph reference will become an effective
668
+ means to the study of these complex variations. We proposed a data model (rGFA),
669
+ designed formats (rGFA and GAF) and developed companion tools (minigraph and
670
+ gfatools) to demonstrate the feasibility of our vision. Our work is still
671
+ preliminary but it is likely to set a starting point to the development of the
672
+ next-generation graph-based tools, which may ultimately help us to understand
673
+ our genomes better.
674
+
675
+ \section*{Methods}
676
+
677
+ \subsection*{The minigraph mapping algorithm}
678
+
679
+ \subsubsection*{Seeding and linear chaining}
680
+ Similar to minimap2, minigraph uses minimizers on segments as seeds. It also
681
+ applies a similar chaining algorithm but with different scoring and with a new
682
+ heuristic to speed up chaining over long distances. For the completeness of
683
+ this article, we will describe part of the minimap2 chaining algorithm here.
684
+
685
+ \paragraph*{Minimap2-like chaining}
686
+ Formally, an \emph{anchor} is a 3-tuple $(x,y,w)$, representing a closed
687
+ interval $[x-w+1,x]$ on a segment in the reference graph matching an interval
688
+ $[y-w+1,y]$ on the query. Given a list of anchors sorted by $x$, let $f(i)$ be
689
+ the maximal chaining score up to the $i$-th anchor in the list. $f(i)$ can be
690
+ computed by:
691
+ \begin{equation}\label{eq:dp}
692
+ f(i)=\max\big\{\max_{i>j\ge1}\{f(j)+\alpha(j,i)-\beta(j,i)\},w_i\big\}
693
+ \end{equation}
694
+ where $\alpha(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is
695
+ the number of matching bases between anchor $i$ and $j$.
696
+ $\beta(j,i)$ is the gap penalty. Let $g_{ji}=|(y_i-y_j)-(x_i-x_j)|$
697
+ be the gap length and $d_{ji}=\min\{y_i-y_j,x_i-x_j\}$ be the smaller distance
698
+ between the two anchors. Minigraph uses the following gap cost:
699
+ $$
700
+ \beta(j,i)=\left\{\begin{array}{ll}
701
+ \infty & (g_{ji}>G) \\
702
+ c_1\cdot g_{ji} + c_2\cdot d_{ji} + \log_2{g_{ji}} & (0<g_{ji}\le G) \\
703
+ 0 & (g_{ji}=0)\\
704
+ \end{array}\right.
705
+ $$
706
+ where $G=100000$ in the graph construction mode, $c_1=e^{-dw}$ and
707
+ $c_2=0.05\cdot e^{-dw}$. By default, $d=0.01$ is the expected per-base sequence
708
+ divergence and $w=19$ is the minimizer length. In comparison, minimap2 applies
709
+ $G=5000$, $c_1=0.19$ and $c_2=0$. Minigraph allows much larger gaps between
710
+ minimizers and more heavily penalizes gaps.
711
+
712
+ Solving Eq.~\ref{eq:dp} leads to an $O(n^2)$ algorithm where $n$ is the number
713
+ of anchors. This algorithm is slow for large $n$. Minimap2 introduces
714
+ heuristics to speed up the computation by approximating this equation. It works
715
+ well for minimap2 that only allows small gaps and has base-level alignment as a
716
+ fix to chaining errors. However, as minigraph intends to chain much longer
717
+ gaps, the minimap2 algorithm occasionally misses the optimal alignment in long
718
+ segmental duplications and produces false variations. Minigraph introduces a
719
+ new heuristic to speed up chaining.
720
+
721
+ \begin{figure}[tb]
722
+ \centering
723
+ \includegraphics[width=.36\textwidth]{Fig5}
724
+ \caption{\csentence{Implementing 1-dimension Range-Min-Query (RMQ).} Given a
725
+ set of 2-tuples, a binary search tree is built for the first values in the
726
+ tuples. Each node $p$ in the tree is associated with a pointer. The pointer
727
+ points to the node that is in the subtree descended from $p$ and has the
728
+ minimal second value. In this example, ${\rm RMQ}(20,50)=14$.}\label{fig:rmq}
729
+ \end{figure}
730
+
731
+ \paragraph*{Dynamic 1-dimension Range-Min-Query}
732
+ Before we move onto the minigraph solution, we will first introduce
733
+ Range-Min-Query (RMQ). Given a set of 2-tuples $\{(y_i,s_i)\}$, ${\rm
734
+ RMQ}(a,b)$ returns the minimum $s_j$ among $\{s_j:a\le y_j\le b\}$.
735
+ We implemented 1-dimension RMQ with a modified AVL tree, a type of balanced
736
+ binary search tree (Fig.~\ref{fig:rmq}). When performing ${\rm RMQ}(a,b)$,
737
+ we first find the smallest and the largest nodes within interval $[a,b]$ using
738
+ the standard algorithm. In this example, the two nodes are (21,32) and (45,21),
739
+ respectively. We then traverse the path between the two nodes to find the
740
+ minimum. With a balanced tree structure, we do not need to descend into
741
+ subtrees. The time complexity is $O(\log m)$, where $m$ is the number of nodes
742
+ in the tree. We can insert nodes to or delete nodes from the tree while
743
+ maintaining the property of the tree. This achieves dynamic RMQ.
744
+
745
+ \paragraph*{Chaining with a linear gap cost function}
746
+ A linear gap cost takes the form of
747
+ $\beta'(j,i)=c_1[(y_i-y_j)+(x_i-x_j)]$. Given a list of anchors
748
+ $(x_i,y_i,w_i)$ sorted by position $x_i$, let
749
+ \begin{equation}\label{eq:dp2}
750
+ f'(i)=\max_{\substack{\text{$i>j\ge1$}\\ \text{$x_i-G\le x_j\le x_i-w_i$}\\ \text{$y_i-G\le y_j\le y_i-w_i$}}}\big\{f'(j)+w_j-\beta'(j,i)\big\}
751
+ \end{equation}
752
+ We can find the optimal $f'(i)$ in $O(n\log n)$ time with
753
+ RMQ~\cite{DBLP:conf/wabi/AbouelhodaO03,Otto:2011aa}. To see that, define
754
+ $$h'(j)=f'(j)+w_j+c_1(y_j+x_j)$$
755
+ The following condition
756
+ $$f'(j)+w_j-\beta'(j,i)>f'(k)+w_k-\beta'(k,i)$$
757
+ is equivalent to $h'(j)>h'(k)$, independent of $i$. If we maintain ${\rm
758
+ RMQ}_i$ as the binary tree that keeps $\{(y_j,-h'(j)):j<i,x_i-G\le x_j\le x_i-w_i\}$, we have
759
+ $$
760
+ f'(i)=-{\rm RMQ}_i(y_i-G,y_i-w_i)-c_1(x_i+y_i)
761
+ $$
762
+ This solves Eq.~\ref{eq:dp2} in $O(n\log n)$ time.
763
+
764
+ \paragraph*{Minigraph linear chaining}
765
+ While chaining with a linear gap cost function can be solved efficiently, we
766
+ prefer more realistic cost function used in minimap2. In practical
767
+ implementation, when we come to anchor $i$, we find the optimal predecessor $j_*$
768
+ under the desired gap cost $\beta(j,i)$ for anchors $\{j:j<i,x_i-G'\le
769
+ x_j<x_i,y_i-G'\le y_j<y_i\}$, where $G'<G$ is set to 10000 by default.
770
+ Meanwhile, we use the RMQ-based algorithm to identify the anchor $j'_{*}$ optimal
771
+ under the linear gap cost $\beta'(j,i)$. We choose $j'_*$ as the optimal
772
+ predecessor if
773
+ $$
774
+ f(j_*)+\alpha(j_*,i)-\beta(j_*,i)<f(j'_*)+\alpha(j'_*,i)-\beta(j'_*,i)
775
+ $$
776
+ This may occasionally happen around long segmental duplications when the
777
+ minimap2 heuristic misses the optimal solution. Effectively, minigraph does
778
+ thorough search in a small window and approximate search in a large window
779
+ using a faster but less sophisticated gap cost function.
780
+
781
+ \subsubsection*{Graph chaining}
782
+
783
+ Minigraph generates a set of linear chains $\{L_i\}$ with the procedure above
784
+ that completely ignores the graph topology. It then applies another round of
785
+ chaining that takes the topology into account.
786
+
787
+ We say linear chain $L_i$ \emph{precedes} $L_j$, written as $L_i\prec L_j$, if
788
+ (1) the ending coordinate of $L_i$ on the query sequence is smaller than the
789
+ ending coordinate of $L_j$, and (2) there is a walk from $L_i$ to $L_j$ in the
790
+ graph. If there are multiple walks from $L_i$ to $L_j$, minigraph enumerates
791
+ the shortest 16 walks and chooses the walk with its length being the closest to
792
+ the query distance between $L_i$ and $L_j$.
793
+
794
+ Given a list of linear chains sorted by their ending coordinates on the query
795
+ sequence, let $g(i)$ be the optimal graph chaining score up to linear chain
796
+ $L_i$. We can compute $g(i)$ with another dynamic programming:
797
+ $$
798
+ g(i)=\max\big\{\max_{L_j\prec L_i}\{g(j)+\omega(L_i)-\beta(j,i)\},\omega(L_i)\big\}
799
+ $$
800
+ where $\beta(j,i)$ is the weight between $L_i$ and $L_j$. As minigraph does not
801
+ perform base-level alignment, $\beta(j,i)$ is the same as the gap penalty
802
+ function used for linear chaining. $\omega(L_i)$ is the optimal score of $L_i$
803
+ computed during linear chaining.
804
+
805
+ The procedure above has two limitations. First, when computing the weight
806
+ between $L_i$ and $L_j$, minigraph largely ignores base sequences and only considers
807
+ the distance between them on both the query and the graph. When there are
808
+ multiple walks of similar lengths between $L_i$ and $L_j$, minigraph may miss the
809
+ graph chain that leads to the best base alignment. Although we added a
810
+ heuristic by considering 17-mer matches between the query and the graph paths,
811
+ we found this heuristic is not reliable in complex regions. Second, minigraph only
812
+ enumerates the shortest 16 walks. In complex subgraphs, the optimal walk from
813
+ $L_i$ to $L_j$ may not be among them. We plan to implement base
814
+ alignment to address the limitations. We may use the current minigraph algorithm
815
+ for easy cases and apply the more expensive base alignment when the current
816
+ algorithm potentially fails.
817
+
818
+ The graph chaining algorithm results in one or multiple graph chains. A
819
+ \emph{graph chain} is a list of anchors $(s_i,x_i,y_i,w_i)$, where
820
+ $[x_i-w_i+1,x_i]$ on segment $s_i$ in the graph matches $[y_i-w_i+1,y_i]$ on
821
+ the query sequence. A graph chain satisfies the following conditions: if $i<j$,
822
+ $y_i<y_j$; if $i<j$ and $s_i=s_j$, we have $x_i<x_j$; if $s_i\not=s_{i+1}$, the
823
+ two segments are adjacent on the graph. It is an extension to linear chains.
824
+
825
+ \subsection*{The minigraph graph generation algorithm}
826
+
827
+ Using the minimap2 algorithm~\cite{Li:2018ab}, minigraph identifies a set of
828
+ \emph{primary chains} that do not greatly overlap with each other on the query
829
+ sequence. A region on the query is considered to be \emph{orthogonal} to the
830
+ reference if the region is contained in a primary chain longer than 100kb and
831
+ it is not intersecting other primary chains longer than 20kb.
832
+
833
+ Minigraph scans primary chains in orthogonal regions and identifies subregions
834
+ where the query subsequences significantly differ from the corresponding
835
+ reference subsequences. To achieve that, minigraph computes a score $h_i$ for
836
+ each adjacent pair of anchors $(s_i,x_i,y_i,w_i)$ and
837
+ $(s_{i+1},x_{i+1},y_{i+1},w_{i+1})$. Let $d^x_i$ be the distance between the
838
+ two anchors on the graph and $d^y_i=y_{i+1}-y_i$ be the distance on the query
839
+ sequence. $h_i$ is computed as
840
+ \begin{equation}\label{eq:hi}
841
+ h_i=\left\{\begin{array}{ll}
842
+ -10 & \mbox{if $d^x_i=d^y_i\le w_{i+1}$} \\
843
+ \eta\cdot\max\{d^x_i,d^y_i\} & \mbox{otherwise}\\
844
+ \end{array}\right.
845
+ \end{equation}
846
+ where $\eta$ is the density of anchors averaged across all primary graph
847
+ chains. Define $H(i,j)=\sum_{k=i}^j h_k$. A highly divergent region between the
848
+ query and the graph will be associated with a large $H(i,j)$. Minigraph uses
849
+ the Ruzzo-Tompa algorithm~\cite{DBLP:conf/ismb/RuzzoT99} to identify all
850
+ maximal scoring intervals on list $(h_i)$, which correspond to divergent
851
+ regions. In each identified divergent region, minigraph performs base
852
+ alignment~\cite{Suzuki:2018aa,Li:2018ab} between the query and the graph
853
+ sequences and retains a region if it involves an INDEL $\ge$100bp in length or
854
+ a $\ge$100bp region with base-level identity below 80\%. In Eq.~\ref{eq:hi},
855
+ -10 is an insensitive parameter due to the downstream filtering. In the end,
856
+ minigraph augments the existing graph with identified variations
857
+ (Fig.~\ref{fig:mg}b).
858
+
859
+ \subsection*{Annotating variations}
860
+
861
+ We applied RepeatMasker~\cite{Tarailo-Graovac:2009aa} v1.332 to classify
862
+ interspersed repeats in the longest allele sequence of each variation.
863
+ RepeatMasker is unable to annotate VNTRs with long motifs. It also often
864
+ interprets VNTRs as impure STRs. Therefore, we did not use the RepeatMasker
865
+ VNTR or STR annotations directly. Instead, we combined RepeatMasker and
866
+ SDUST~\cite{Morgulis:2006aa} results to collect low-complexity regions (LCRs).
867
+ We identified pure tandem repeats composed of a motif occurring twice or more
868
+ (implemented in
869
+ \href{https://github.com/lh3/etrf}{https://github.com/lh3/etrf}). An LCR is
870
+ classified as VNTR if 70\% of the LCR is VNTR; similarly, an LCR is classified
871
+ as STR if 70\% is STR; the rest are classified as ``Other-LCR'' in
872
+ Fig.~\ref{fig:anno}. The annotation script is available in the minigraph GitHub
873
+ repository.
874
+
875
+ \subsection*{Creating blacklist regions}
876
+
877
+ For each variation in the graph, we extend its genomic interval on GRCh38 by
878
+ 50bp from each end. We name this set of intervals as $I_0$. We align sequences
879
+ inserted to GRCh38 against GRCh38 with ``minimap2 -cxasm20 -r2k'' and filter
880
+ out alignments with mapping quality below 5. Let $I(a,b)$ be the set of GRCh38
881
+ intervals that are contained in alignments with identity between $a$ and $b$.
882
+ The blacklist regions are computed by $I_0\cup I(0,0.99)\setminus I(0.998,1)$,
883
+ where ``$\cup$'' denotes the interval union operation and ``$\setminus$''
884
+ denotes interval subtraction.
885
+
886
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
887
+ %% %%
888
+ %% Backmatter begins here %%
889
+ %% %%
890
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
891
+
892
+ \begin{backmatter}
893
+
894
+ \section*{Competing interests}
895
+ The authors declare that they have no competing interests.
896
+
897
+ \section*{Ethical Approval}
898
+ Ethical approval was not needed for this study.
899
+
900
+ \section*{Author's contributions}
901
+ HL conceived the project, developed minigraph and drafted the manuscript.
902
+ XF did the pseudogene analysis. CC helped with RepeatMasker annotation.
903
+ All authors helped to revise the manuscript.
904
+
905
+ \section*{Acknowledgements}
906
+ We are grateful to Benedict Paten and Erik Garrison for discussions on
907
+ pangenome graphs. We thank minigraph users who have suggested features and
908
+ helped to fix various issues.
909
+
910
+ \section*{Funding}
911
+ This work is supported by National Institutes of Health (NIH) grants
912
+ U01HG010961 and R01HG010040.
913
+
914
+ \section*{Availability of data and materials}
915
+ Minigraph is openly available at
916
+ \href{https://github.com/lh3/minigraph}{https://github.com/lh3/minigraph}.
917
+ This repository also includes the script to convert from the segment coordinate
918
+ to the stable coordinate, to annotate variations and to generate blacklist
919
+ regions from the graph. The companion gfatools is available at
920
+ \href{https://github.com/lh3/gfatools}{https://github.com/lh3/gfatools}. The
921
+ human and the great ape graphs are hosted at
922
+ \href{ftp://ftp.dfci.harvard.edu/pub/hli/minigraph/}{ftp://ftp.dfci.harvard.edu/pub/hli/minigraph/}.
923
+ The NA12878, NA24385 and PGP1 phased assemblies were downloaded from
924
+ \href{ftp://ftp.dfci.harvard.edu/pub/hli/whdenovo/}{ftp://ftp.dfci.harvard.edu/pub/hli/whdenovo/}.
925
+ Assemblies generated by McDonnell Genome Institute include
926
+ GCA\_001524155.4 for NA19240, GCA\_002180035.3 for HG00514, GCA\_002209525.2
927
+ for HG01352, GCA\_002872155.1 for NA19434, GCA\_003574075.1 for HG02818,
928
+ GCA\_003086635.1 for HG03486, GCA\_003601015.1
929
+ for HG03807, GCA\_002208065.1 for HG00733, GCA\_003070785.1 for HG02059,
930
+ GCA\_008065235.1 for HG00268 and GCA\_007821485.1 for HG04217. Other assemblies
931
+ are available from GenBank under accession GCA\_001297185.1 for
932
+ CHM1~\cite{Huddleston:2017aa}, GCA\_000983455.1 for
933
+ CHM13~\cite{Huddleston:2017aa}, GCA\_001750385.1 for AK1~\cite{Seo:2016aa},
934
+ GCA\_002880755.3 for chimpanzee Clint~\cite{Kronenberg:2018aa},
935
+ GCA\_900006655.3 for gorilla Susie~\cite{Gordon:2016kq}, GCA\_008122165.1 for
936
+ gorilla Kamilah~\cite{Kronenberg:2018aa} and GCA\_002880775.3 for orangutan
937
+ Susie~\cite{Kronenberg:2018aa}.
938
+
939
+
940
+ \bibliographystyle{bmc-mathphys}
941
+ \bibliography{minigraph}
942
+
943
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
944
+ %% %%
945
+ %% Figures %%
946
+ %% %%
947
+ %% NB: this is for captions and %%
948
+ %% Titles. All graphics must be %%
949
+ %% submitted separately and NOT %%
950
+ %% included in the Tex document %%
951
+ %% %%
952
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
953
+
954
+ %\section*{Figures}
955
+
956
+ %\begin{figure}[h!]
957
+ % \caption{\csentence{Sample figure title.}
958
+ % Figure legend text.}
959
+ % \end{figure}
960
+
961
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
962
+ %% %%
963
+ %% Tables %%
964
+ %% %%
965
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
966
+
967
+ %\section*{Tables}
968
+
969
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
970
+ %% %%
971
+ %% Additional Files %%
972
+ %% %%
973
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
974
+
975
+ %\section*{Additional Files}
976
+ % \subsection*{Additional file 1 --- Sample additional file title}
977
+ % Additional file descriptions text (including details of how to
978
+ % view the file, if it is in a non-standard format or the file extension). This might
979
+ % refer to a multi-page table or a figure.
980
+
981
+ % \subsection*{Additional file 2 --- Sample additional file title}
982
+ % Additional file descriptions text.
983
+
984
+
985
+ \end{backmatter}
986
+ \end{document}