ruby-minigraph 0.0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +62 -0
  4. data/ext/Rakefile +56 -0
  5. data/ext/cmappy/cmappy.c +7 -0
  6. data/ext/cmappy/cmappy.h +8 -0
  7. data/ext/minigraph/LICENSE.txt +23 -0
  8. data/ext/minigraph/Makefile +66 -0
  9. data/ext/minigraph/NEWS.md +317 -0
  10. data/ext/minigraph/README.md +207 -0
  11. data/ext/minigraph/algo.c +194 -0
  12. data/ext/minigraph/algo.h +33 -0
  13. data/ext/minigraph/asm-call.c +147 -0
  14. data/ext/minigraph/bseq.c +133 -0
  15. data/ext/minigraph/bseq.h +76 -0
  16. data/ext/minigraph/cal_cov.c +139 -0
  17. data/ext/minigraph/doc/example1.png +0 -0
  18. data/ext/minigraph/doc/example2.png +0 -0
  19. data/ext/minigraph/doc/examples.graffle +0 -0
  20. data/ext/minigraph/format.c +241 -0
  21. data/ext/minigraph/galign.c +140 -0
  22. data/ext/minigraph/gchain1.c +532 -0
  23. data/ext/minigraph/gcmisc.c +223 -0
  24. data/ext/minigraph/gfa-aug.c +260 -0
  25. data/ext/minigraph/gfa-base.c +526 -0
  26. data/ext/minigraph/gfa-bbl.c +372 -0
  27. data/ext/minigraph/gfa-ed.c +617 -0
  28. data/ext/minigraph/gfa-io.c +395 -0
  29. data/ext/minigraph/gfa-priv.h +154 -0
  30. data/ext/minigraph/gfa.h +166 -0
  31. data/ext/minigraph/ggen.c +182 -0
  32. data/ext/minigraph/ggen.h +21 -0
  33. data/ext/minigraph/ggsimple.c +570 -0
  34. data/ext/minigraph/gmap.c +211 -0
  35. data/ext/minigraph/index.c +230 -0
  36. data/ext/minigraph/kalloc.c +224 -0
  37. data/ext/minigraph/kalloc.h +82 -0
  38. data/ext/minigraph/kavl.h +414 -0
  39. data/ext/minigraph/kdq.h +134 -0
  40. data/ext/minigraph/ketopt.h +116 -0
  41. data/ext/minigraph/khashl.h +348 -0
  42. data/ext/minigraph/krmq.h +474 -0
  43. data/ext/minigraph/kseq.h +256 -0
  44. data/ext/minigraph/ksort.h +164 -0
  45. data/ext/minigraph/kstring.h +165 -0
  46. data/ext/minigraph/kthread.c +159 -0
  47. data/ext/minigraph/kthread.h +15 -0
  48. data/ext/minigraph/kvec-km.h +105 -0
  49. data/ext/minigraph/kvec.h +110 -0
  50. data/ext/minigraph/lchain.c +441 -0
  51. data/ext/minigraph/main.c +301 -0
  52. data/ext/minigraph/map-algo.c +500 -0
  53. data/ext/minigraph/mgpriv.h +128 -0
  54. data/ext/minigraph/minigraph.1 +359 -0
  55. data/ext/minigraph/minigraph.h +176 -0
  56. data/ext/minigraph/miniwfa.c +834 -0
  57. data/ext/minigraph/miniwfa.h +95 -0
  58. data/ext/minigraph/misc/mgutils.js +1451 -0
  59. data/ext/minigraph/misc.c +12 -0
  60. data/ext/minigraph/options.c +134 -0
  61. data/ext/minigraph/shortk.c +251 -0
  62. data/ext/minigraph/sketch.c +109 -0
  63. data/ext/minigraph/sys.c +147 -0
  64. data/ext/minigraph/sys.h +20 -0
  65. data/ext/minigraph/test/MT-chimp.fa +277 -0
  66. data/ext/minigraph/test/MT-human.fa +239 -0
  67. data/ext/minigraph/test/MT-orangA.fa +276 -0
  68. data/ext/minigraph/test/MT.gfa +19 -0
  69. data/ext/minigraph/tex/Makefile +13 -0
  70. data/ext/minigraph/tex/minigraph.bib +676 -0
  71. data/ext/minigraph/tex/minigraph.tex +986 -0
  72. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.gp +42 -0
  73. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.anno.tbl +13 -0
  74. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.gp +269 -0
  75. data/ext/minigraph/tex/plots/CHM13-f1-90.bb.mini-inter-none.win.sh +7 -0
  76. data/ext/minigraph/tex/plots/CHM13v1.cen.bed +23 -0
  77. data/ext/minigraph/tex/plots/CHM13v1.size +23 -0
  78. data/ext/minigraph/tex/plots/anno2tbl.js +40 -0
  79. data/ext/minigraph/tex/plots/bedutils.js +367 -0
  80. data/ext/minigraph/tex/plots/chr-plot.js +130 -0
  81. data/ext/minigraph/tex/plots/gen-anno.mak +24 -0
  82. data/ext/minigraph.patch +21 -0
  83. data/lib/minigraph/ffi/constants.rb +230 -0
  84. data/lib/minigraph/ffi/functions.rb +70 -0
  85. data/lib/minigraph/ffi/mappy.rb +8 -0
  86. data/lib/minigraph/ffi.rb +27 -0
  87. data/lib/minigraph/version.rb +5 -0
  88. data/lib/minigraph.rb +72 -0
  89. metadata +159 -0
@@ -0,0 +1,986 @@
1
+ %% BioMed_Central_Tex_Template_v1.06
2
+
3
+ \documentclass[twocolumn]{bmcart}
4
+
5
+ %%% Load packages
6
+ \usepackage{amsthm,amsmath}
7
+ \RequirePackage{hyperref}
8
+ \usepackage[utf8]{inputenc} %unicode support
9
+
10
+ \usepackage{graphicx}
11
+ %\def\includegraphic{}
12
+ %\def\includegraphics{}
13
+
14
+ %%% Put your definitions there:
15
+ \startlocaldefs
16
+ \endlocaldefs
17
+
18
+
19
+ %%% Begin ...
20
+ \begin{document}
21
+
22
+ %%% Start of article front matter
23
+ \begin{frontmatter}
24
+
25
+ \begin{fmbox}
26
+ \dochead{Method}
27
+
28
+ \title{The design and construction of reference pangenome graphs with minigraph}
29
+
30
+ \author[
31
+ addressref={aff1,aff2}, % id's of addresses, e.g. {aff1,aff2}
32
+ corref={aff1}, % id of corresponding address, if any
33
+ email={hli@ds.dfci.harvard.edu} % email address
34
+ ]{\inits{HL}\fnm{Heng} \snm{Li}}
35
+ \author[
36
+ addressref={aff1,aff2},
37
+ ]{\inits{XF}\fnm{Xiaowen} \snm{Feng}}
38
+ \author[
39
+ addressref={aff2},
40
+ ]{\inits{CC}\fnm{Chong} \snm{Chu}}
41
+
42
+ \address[id=aff1]{% % unique id
43
+ \orgname{Department of Data Sciences, Dana-Farber Cancer Institute}, % university, etc
44
+ \city{Boston, MA 02215}, % city
45
+ \cny{USA} % country
46
+ }
47
+ \address[id=aff2]{%
48
+ \orgname{Department of Biomedical Informatics, Harvard Medical School},
49
+ \city{Boston, MA 02215},
50
+ \cny{USA}
51
+ }
52
+
53
+ \begin{abstractbox}
54
+
55
+ \begin{abstract} % abstract
56
+ The recent advances in sequencing technologies enable the assembly of
57
+ individual genomes to the quality of the reference genome. How to integrate
58
+ multiple genomes from the same species and make the integrated representation
59
+ accessible to biologists remains an open challenge. Here, we propose a
60
+ graph-based data model and associated formats to represent multiple genomes
61
+ while preserving the coordinate of the linear reference genome. We implement
62
+ our ideas in the minigraph toolkit and demonstrate that we can efficiently
63
+ construct a pangenome graph and compactly encode tens of thousands of
64
+ structural variants missing from the current reference genome.
65
+ \end{abstract}
66
+
67
+ \begin{keyword}
68
+ \kwd{bioinformatics}
69
+ \kwd{genomics}
70
+ \kwd{pangenome}
71
+ \end{keyword}
72
+
73
+ \end{abstractbox}
74
+
75
+ \end{fmbox}
76
+
77
+ \end{frontmatter}
78
+
79
+ %%
80
+ \section*{Background}
81
+
82
+ The human reference genome is a fundamental resource for human genetics and
83
+ biomedical research. The primary sequences of the reference genome
84
+ GRCh38~\cite{Schneider:2017aa} are a mosaic of haplotypes with each haplotype segment derived
85
+ from a single human individual. They cannot represent the genetic diversity in
86
+ human populations and as a result, each individual may carry thousands of large
87
+ germline variants absent from the reference genome~\cite{Huddleston:2017aa}.
88
+ Some of these variants are likely associated with phenotype~\cite{Eichler_2010}
89
+ but are often missed or misinterpreted when we map sequence data to GRCh38, in
90
+ particular with short reads~\cite{Li:2018aa}. This under-representation of
91
+ genetic diversity may become a limiting factor in our understanding of genetic
92
+ variations.
93
+
94
+ Meanwhile, the advances in long-read sequencing technologies make it possible
95
+ to assemble a human individual to a quality comparable to
96
+ GRCh38~\cite{Schneider:2017aa,Wenger_2019}. There are already a dozen
97
+ high-quality human assemblies available in GenBank~\cite{Audano:2019aa}.
98
+ Properly integrating these genomes into a reference \emph{pangenome}, which
99
+ refers to a collection of genomes~\cite{cpgc:2016aa}, would potentially address
100
+ the issues with a single linear reference.
101
+
102
+ A straightforward way to represent a pangenome is to store unaligned genomes
103
+ in a full-text index that compresses redundancies in sequences identical
104
+ between individuals~\cite{Makinen:2010aa,Liu_2016,Boucher_2019}. We may
105
+ retrieve individual genomes from the index, inspect the k-mer spectrum and test
106
+ the presence of k-mers using standard techniques. In principle, it is also
107
+ possible to apply canonical read alignment algorithms to map sequences to
108
+ the collection, but in practice, the redundant hits to multiple genomes will
109
+ confuse downstream mapping-based analyses~\cite{NA2016159}. It is not clear how
110
+ to resolve these multiple mappings.
111
+
112
+ The other class of methods encodes multiple genomes into a sequence graph,
113
+ usually by collapsing identical or similar sequences between genomes onto a
114
+ single representative sequence. This results in a \emph{pangenome graph}. A
115
+ pangenome graph is a powerful tool to identify core genome, the part of a
116
+ genome or gene set that is shared across the majority of the strains or related species
117
+ in a clade~\cite{Vernikos:2015aa}. A common way to construct a basic pangenome
118
+ graph is to generate a compacted de Bruijn graph
119
+ (cDBG)~\cite{Marcus:2014xy,Baier_2015,Beller:2016ab,Chikhi:2015aa,Minkin_2016,Chikhi_2016,almodaresi_et_al:LIPIcs:2017:7657}
120
+ from a set of genomes. Basic cDBG does not keep sample information.
121
+ \cite{Iqbal:2012aa} proposed colored cDBG with each color representing a sample
122
+ or a population. Colored cDBG can be constructed
123
+ efficiently~\cite{Muggli_2019,Holley695338}. However, a colored cDBG discards
124
+ the chromosomal coordinate and thus disallows the mapping of genomic features.
125
+ It often includes connections absent from the input genomes and thus encodes
126
+ more sequences than the input. A colored cDBG cannot serve as a
127
+ \emph{reference} pangenome graph, either. deBGA~\cite{Liu:2016ac} addresses
128
+ the issue by labeling each unitig with its possibly multiple locations in the
129
+ input genome(s). Pufferfish~\cite{Almodaresi:2018aa} further reduces its space
130
+ requirement. Nonetheless, given hundreds of human genomes, there will be many
131
+ more vertices in the graph and most vertices are associated with hundreds of
132
+ labels. Whether deBGA and pufferfish can scale to such datasets remains an open
133
+ question. GBWT~\cite{Sir_n_2019} provides another practical solution to storage
134
+ and indexing, but no existing tools can practically construct a cDBG for many
135
+ human genomes in the GBWT representation.
136
+
137
+ In addition to cDBG, we can derive a reference pangenome
138
+ graph from a single linear multi-sequence alignment (MSA)~\cite{Dilthey_2015,Dilthey_2019}.
139
+ It has been used for HLA typing but is not applicable to whole chromosomes when
140
+ they cannot be included in a single linear MSA. The third and possibly the most
141
+ popular approach to reference graph generation is to call variants from other
142
+ sources and then incorporate these variants, often in the VCF format~\cite{Danecek:2011qy}, into
143
+ the reference genome as alternative
144
+ paths~\cite{Eggertsson:2017aa,Rakocevic_2019,Sibbesen:2018aa,Biederstedt:2018aa,Eggertsson_2019}.
145
+ However, because VCF does not define coordinates on insertions, this approach
146
+ cannot properly encode variations on long insertions and is therefore limited
147
+ to simple variations. There are no satisfactory solutions to the construction
148
+ of reference pangenome graphs.
149
+
150
+ In this article, we introduce the reference Graphical Fragment Assembly (rGFA)
151
+ format to model reference pangenome graphs. We propose and demonstrate an
152
+ incremental procedure to construct graphs under this model. The resulting
153
+ graphs encode structural variations (SVs) of length 100bp or longer without haplotype
154
+ information. Our implementation, minigraph~\cite{Li_minigraph:2020aa}
155
+ (\href{https://github.com/lh3/minigraph}{https://github.com/lh3/minigraph}),
156
+ can construct a pangenome graph from twenty human assemblies in three hours.
157
+
158
+ \section*{Results}
159
+
160
+ We will first describe a data model for reference pangenome graphs, which
161
+ establishes the foundation of this article. We will then present a new
162
+ sequence-to-graph mapper, minigraph, and show how this mapper incrementally
163
+ constructs a pangenome graph. We will demonstrate the utility of pangenome
164
+ graphs with a human graph generated from twenty human haplotypes and a primate
165
+ graph generated from four species.
166
+
167
+ \subsection*{Modeling reference pangenome graphs}
168
+
169
+ \subsubsection*{Sequence graphs}
170
+
171
+ There are several equivalent ways to define a sequence graph. In this article,
172
+ a \emph{sequence graph} $G(V,E)$ is a bidirected graph. Each vertex $v\in V$ is
173
+ associated with a DNA sequence; each edge $e\in E$ has two directions, one for
174
+ each endpoint, which leads to four types of edges: forward-forward,
175
+ reverse-forward, forward-reverse and reverse-reverse. The directions on an edge
176
+ dictate how a sequence is spelled from a walk/path in the graph. Common
177
+ assembly graphs, such as the overlap graph, string graph and de Bruijn graph
178
+ can all be formulated as sequence graphs.
179
+
180
+ \begin{figure}[t]
181
+ \includegraphics[width=.47\textwidth]{Fig1}
182
+ \caption{\csentence{Example rGFA and GAF formats.} {\bf (a)} Example rGFA
183
+ format. rGFA-specific tags include SN, name of the stable sequence from which
184
+ the vertex is derived; SO, offset on the stable sequence; SR, rank: 0 if the
185
+ vertex or edge is on the linear reference; $>$0 for non-reference. {\bf (b)}
186
+ Corresponding sequence graph. Each thick arrow represents an oriented DNA
187
+ sequence. {\bf (c)} Example GAF format, using the segment coordinate, for
188
+ reads ``${\tt GTGGCT}$'' and ``${\tt CGTTTCC}$'' mapped to the graph. {\bf
189
+ (d)} Equivalent GAF format using the stable coordinate.}\label{fig:rgfa}
190
+ \end{figure}
191
+
192
+ The Graphical Fragment Assembly (GFA) format~\cite{Li:2016aa} describes
193
+ sequence graphs. The core of GFA is defined by the following grammar:
194
+
195
+ {\footnotesize
196
+ \begin{verbatim}
197
+
198
+ <GFA> <- (<segment> | <link>)+
199
+ <segment> <- `S' <segId> <segSeq>
200
+ <link> <- `L' <segId> [+-] <segId> [+-] <cigar>
201
+
202
+ \end{verbatim}}
203
+
204
+ {\flushleft
205
+ A line starting with letter ``${\tt S}$'' corresponds to a vertex and a line
206
+ starting with ``${\tt L}$'' corresponds
207
+ to a bidirected edge. In a de Bruijn graph, we often attach sequences to edges
208
+ instead of vertices~\cite{Pevzner:2001vn,Gnerre:2011ys}. To avoid confusion, in this
209
+ article, we also call a vertex a \emph{segment} and call an edge a
210
+ \emph{link}, following the GFA terminology. Fig.~\ref{fig:rgfa}a shows an
211
+ example GFA that encodes Fig.~\ref{fig:rgfa}b.
212
+ }
213
+
214
+ A sequence graph in the GFA format natively defines a \emph{segment coordinate}
215
+ system where each base in the graph is uniquely indexed by a
216
+ 2-tuple $({\rm segId},{\rm segOffset})$. For example, in
217
+ Fig.~\ref{fig:rgfa}a, the base at position $({\rm s2},2)$ is ``{\tt G}''.
218
+ A major problem with this coordinate is that it is decoupled from linear
219
+ annotations and is sensitive to graph transformations. For example, if we split
220
+ a segment into two connected segments, the set of sequences spelled from the graph
221
+ remains the same, but the segment coordinates will be changed. Due to the
222
+ instability of segment coordinate, a basic sequence graph is inadequate for a
223
+ reference graph.
224
+
225
+ \subsubsection*{Reference pangenome graphs}
226
+
227
+ We propose the reference GFA (rGFA) format to encode reference pangenome graphs.
228
+ rGFA is an extension to GFA with three additional tags that indicate the origin
229
+ of a segment from linear genomes (Fig.~\ref{fig:rgfa}a). This simple addition
230
+ gives us a unique stable coordinate system as an extension to the linear
231
+ reference coordinate (e.g. GRCh38). We can pinpoint a position such as
232
+ ``{\sf chr1:9}'' in the graph and map existing annotations onto the graph. We can
233
+ also report a path or walk in the stable coordinate. For example, path
234
+ ``{\sf s1$\to$s2$\to$s3}'' unambiguously corresponds to ``{\sf
235
+ chr1:0-5$\to$chr1:5-8$\to$chr1:8-12}'' or simply ``{\sf chr1:0-12}'' if we
236
+ merge adjacent coordinates; similarly, ``{\sf s1$\to$s2$\to$s5$\to$s6}''
237
+ corresponds to ``{\sf chr1:0-8$\to$foo:8-16}''. We will formally describe the
238
+ path format when introducing the GAF format in the next section.
239
+
240
+ In rGFA, each segment is associated with one origin. This apparently trivial
241
+ requirement in fact imposes a strong restriction on the types of graphs rGFA
242
+ can encode: it forbids the collapse of different regions from one sequence,
243
+ which would often happen in a cDBG. We consider this restriction an
244
+ advantage of rGFA because it requires the graph to have a ``linear'' flavor
245
+ intuitively and simplifies the data structure to store the graph.
246
+
247
+ For simplicity, rGFA disallows overlaps between edges and forbids multiple
248
+ edges (more than one edge between the same pair of vertices). These two
249
+ restrictions help to avoid ambiguity and reduce the complexity in
250
+ implementation. They are not strictly necessary in theory.
251
+
252
+ \subsubsection*{The Graphical mApping Format (GAF)}
253
+
254
+ \begin{table}[tb]
255
+ \caption{The Graphical mApping Format (GAF)}\label{tab:gaf}
256
+ \begin{tabular}{rcp{6cm}}
257
+ \hline
258
+ Col & Type & Description \\ \hline
259
+ 1 & string & Query sequence name \\
260
+ 2 & int & Query sequence length \\
261
+ 3 & int & Query start coordinate (0-based; closed) \\
262
+ 4 & int & Query end coordinate (0-based; open) \\
263
+ 5 & char & Strand relative to col. 6 \\
264
+ 6 & string & Graph path matching regular expression \texttt{/([><][\char94\char92s><]+(:\char92d+-\char92d+)?)+\char124([\char94\char92s><]+)/}\\
265
+ 7 & int & Path sequence length \\
266
+ 8 & int & Path start coordinate \\
267
+ 9 & int & Path end coordinate \\
268
+ 10 & int & Number of matching bases in the mapping \\
269
+ 11 & int & Number of bases, including gaps, in the mapping \\
270
+ 12 & int & Mapping quality (0--255 with 255 for missing) \\ \hline
271
+ \end{tabular}
272
+ \end{table}
273
+
274
+ As there are no text formats for sequence-to-graph alignment, we propose a new
275
+ Graphical mApping Format (GAF) by extending the Pairwise mApping Format
276
+ (PAF)~\cite{Li:2016aa}. GAF is TAB-delimited with each column defined in
277
+ Table~\ref{tab:gaf}. Column 6 encodes a path on the graph. It follows the
278
+ formal grammar below:
279
+
280
+ {\footnotesize
281
+ \begin{verbatim}
282
+
283
+ <path> <- <stableId> | <orientIntv>+
284
+ <orientIntv> <- (`>' | `<') (<segId> | <stableIntv>)
285
+ <stableIntv> <- <stableId> `:' <start> `-' <end>
286
+
287
+ \end{verbatim}}
288
+
289
+ {\flushleft
290
+ In this grammar, {\tt <segId>} is a segment identifier on an S-line in rGFA;
291
+ {\tt <stableId>} is a stable sequence name at the {\tt SN} tag on the
292
+ corresponding S-line. Column 6 can be either a path in the segment coordinate
293
+ (Fig.~\ref{fig:rgfa}c) or an equivalent path in the stable coordinate
294
+ (Fig.~\ref{fig:rgfa}d). We can merge adjacent stable coordinates if the two
295
+ segments originate from the same stable sequence and the end offset of the
296
+ first segment is equal to the start offset of the second segment. For example,
297
+ ``{\tt >chr1:0-5>chr1:5-8}'' can be simplified to ``{\tt >chr1:0-8}''.
298
+ Furthermore, if a path in column 6 is derived from one reference sequence, we
299
+ recommend replacing it with the entire reference path on the forward
300
+ orientation (e.g. see ``read1'' in Fig.~\ref{fig:rgfa}d). With this convention,
301
+ a GAF line is reduced to PAF for a sequence mapped to a reference sequence.
302
+ Similar to PAF, GAF also allows optional tags in the SAM-like format. Base
303
+ alignment is kept at the {\tt cg} tag.}
304
+
305
+ Minigraph produces GAF in both the segment and the stable coordinate.
306
+ GraphAligner~\cite{Rautiainen810812} produces GAF in the segment coordinate
307
+ only, which can be converted to the stable coordinate.
308
+
309
+ \begin{figure}[t]
310
+ \includegraphics[width=.47\textwidth]{Fig2}
311
+ \caption{\csentence{Minigraph algorithms.} {\bf (a)} Diagram of the minigraph
312
+ mapping algorithm. Minigraph seeds alignments with minimizers, finds good
313
+ enough linear chains, connects them in the graph and seeks the most weighted
314
+ path as a graph chain. {\bf (b)} Diagram of incremental graph construction. A
315
+ graph is iteratively constructed by mapping each assembly to an existing
316
+ graph and augmenting the graph with long poorly mapped sequences in the
317
+ assembly.}\label{fig:mg}
318
+ \end{figure}
319
+
320
+ \subsection*{Sequence-to-graph mapping}
321
+
322
+ Our incremental graph construction algorithm relies on genome-to-graph
323
+ alignment (Fig.~\ref{fig:mg}b). As existing sequence-to-graph
324
+ aligners~\cite{Rautiainen810812,Garrison:2018aa} do not work with
325
+ chromosome-long query sequences, we adapted minimap2~\cite{Li:2018ab} for our
326
+ purpose and implemented minigraph (Fig.~\ref{fig:mg}a). Briefly, minigraph uses
327
+ a minimap2-like algorithm to find local hits to segments in the graph, ignoring
328
+ the graph topology. It then chains these local hits if they are connected on
329
+ the graph, possibly through cycles. This gives the approximate mapping locations. Minigraph does not
330
+ perform base-level alignment. This is because the graph we construct encodes
331
+ SVs and rarely contains paths similar at the base level. The best mapping is
332
+ often clear without base alignment.
333
+
334
+ \begin{table}[b]
335
+ \caption{Performance of sequence-to-graph mapping}\label{tab:mgvga}
336
+ \begin{tabular}{lrr}
337
+ \hline
338
+ & minigraph & GraphAligner \\
339
+ \hline
340
+ Indexing time (wall-clock sec) & 100 & 589 \\
341
+ Mapping time (wall-clock sec) & 79 & 140 \\
342
+ Peak RAM (GB) & 19.5 & 27.2 \\
343
+ Percent unmapped reads & 0.5\% & 0\% \\
344
+ Percent wrong mappings & 1.7\% & 4.6\% \\
345
+ \hline
346
+ \end{tabular}
347
+ \end{table}
348
+
349
+ To evaluate the accuracy of minigraph mapping, we simulated PacBio reads from
350
+ GRCh38 with PBSIM~\cite{Ono:2013aa} and mapped them to the graph we constructed
351
+ in the next section. Table~\ref{tab:mgvga} compares the performance of
352
+ minigraph and GraphAligner~\cite{Rautiainen810812} v1.0.10 on 68,857 simulated
353
+ reads mapped over 8 CPU threads. {\color{black} The N50 read length is 15kb.
354
+ 9,862 reads are mapped across two or more segments by GraphAligner. Note that
355
+ both minigraph and GraphAligner ignore the stable coordinates during mapping.
356
+ All segments, originating either from GRCh38 or from individual genomes, are
357
+ treated equally. To this end, while we simulated reads from GRCh38, we are also
358
+ evaluating how well mappers work with complex SVs present in any input
359
+ samples.}
360
+
361
+ On this dataset, minigraph
362
+ is faster than GraphAligner and uses less memory, partly because minigraph does
363
+ not perform base alignment.
364
+ As is shown in Table~\ref{tab:mgvga}, minigraph is more accurate than
365
+ GraphAligner. This is counter-intuitive given that GraphAligner does base
366
+ alignment. Close inspection reveals that most mismapped reads by minigraph are
367
+ mapped to the correct genomic loci but wrong graph paths. On the contrary, most
368
+ mismapped reads by GraphAligner are mapped to wrong genomic loci. This suggests
369
+ minigraph is better at finding approximate mapping locations but GraphAligner
370
+ is better at disambiguating similar graph paths. Combining the strength of
371
+ both could lead to a better graph mapper. We do plan to implement base-level
372
+ alignment in minigraph in the future.
373
+
374
+ We have also tried vg v1.21.0~\cite{Garrison:2018aa}. It indexed the same graph in 14.7 wall-clock
375
+ hours and mapped the simulated reads in 1.8 hours over 8 threads, tens of times
376
+ slower than minigraph and GraphAligner. However, no reads are mapped in the
377
+ output. We have not been able to make vg work with our data.
378
+
379
+ \subsection*{Generating pangenome graphs}
380
+
381
+ Fig.~\ref{fig:mg}b shows how minigraph constructs a pangenome graph (see
382
+ Methods for details). This procedure is similar to multiple sequence alignment
383
+ via partial order graph~\cite{Lee_2002} except that minigraph works with cyclic
384
+ graphs and ignores small variants. Minigraph only considers SVs of
385
+ 100bp--100kb in length and ignores SVs in alignments shorter than 100kb.
386
+ For each input assembly, it filters out regions covered by two or more primary
387
+ alignments longer than 20kb in the assembly. This filter avoids paralogous
388
+ regions in a sample and guarantees that graphs generated by minigraph can be
389
+ modeled by rGFA.
390
+
391
+ As a sanity check, we compared minigraph to dipcall
392
+ (\href{https://github.com/lh3/dipcall}{https://github.com/lh3/dipcall}) on
393
+ calling SVs 100bp or longer from a synthetic diploid sample composed of CHM1
394
+ and CHM13~\cite{Li:2018aa}. Given two SV callsets $A$ and $B$, we say a call in
395
+ $A$ is \emph{missed} in callset $B$ if there are no calls in $B$ within 1000bp
396
+ from the call in $A$. With this criterion, 2.7\% of 14,792 SVs called by
397
+ dipcall are missed by minigraph; 6.0\% of 14,932 minigraph SVs are missed by
398
+ dipcall. We manually inspected tens of differences in
399
+ IGV~\cite{Robinson:2011aa} and identified two causes. First, an INDEL longer
400
+ than 100bp called by one caller may be split into two shorter INDELs by the
401
+ other caller. There is often more than one smaller SV around a missed SV
402
+ call. Second, dipcall skips regions involving high density of SNPs or involving
403
+ both long insertions and long deletions, but minigraph connects these events
404
+ and calls SVs in such regions. It tends to call more SVs. Overall, we believe
405
+ minigraph and dipcall found similar sets of SVs.
406
+
407
+ \begin{table}[tb]
408
+ \caption{Assemblies used for graph construction}\label{tab:asm}
409
+ \begin{tabular}{llll}
410
+ \hline
411
+ Name & Species & Population & Accession/Source \\ \hline
412
+ CHM1 & Human & N/A & GCA\_001297185.1 \\
413
+ CHM13 & Human & N/A & GCA\_000983455.1 \\
414
+ NA12878 & Human & European & \cite{Garg810341}, phased \\
415
+ NA24385 & Human & Jewish & \cite{Garg810341}, phased \\
416
+ PGP1 & Human & N/A & \cite{Garg810341}, phased \\
417
+ NA19240 & Human & African & GCA\_001524155.4 \\
418
+ HG00514 & Human & East Asian & GCA\_002180035.3 \\
419
+ HG01352 & Human & American & GCA\_002209525.2 \\
420
+ NA19434 & Human & African & GCA\_002872155.1 \\
421
+ HG02818 & Human & African & GCA\_003574075.1 \\
422
+ HG03486 & Human & African & GCA\_003086635.1 \\
423
+ HG03807 & Human & South Asian& GCA\_003601015.1 \\
424
+ HG00733 & Human & American & GCA\_002208065.1 \\
425
+ HG02059 & Human & East Asian & GCA\_003070785.1 \\
426
+ HG00268 & Human & European & GCA\_008065235.1 \\
427
+ HG04217 & Human & South Asian& GCA\_007821485.1 \\
428
+ AK1 & Human & East Asian & GCA\_001750385.1 \\
429
+ Clint & Chimpanzee & & GCA\_002880755.3 \\
430
+ Susie & Gorilla & & GCA\_900006655.3 \\
431
+ Kamilah & Gorilla & & GCA\_008122165.1 \\
432
+ Susie & Orangutan & & GCA\_002880775.3 \\
433
+ \hline
434
+ \end{tabular}
435
+ \end{table}
436
+
437
+ \begin{figure*}[htbp]
438
+ \includegraphics[width=.95\textwidth]{Fig3}
439
+ \caption{\csentence{Characteristics of the human and the great ape graphs.} {\bf
440
+ (a)} Human variations stratified by repeat class and by the number of
441
+ alleles of each variation. The repeat annotation was obtained from the
442
+ longest allele of each variation. VNTR: variable-number tandem repeat, a
443
+ tandem repeat with the unit motif length $\ge$7bp. STR: short tandem repeat,
444
+ a tandem repeat with the unit motif length $\le$6bp. LCR: low-complexity
445
+ regions. Mixed-inter.: a variation involving $\ge$2 types of interspersed
446
+ repeats. {\bf (b)} Great ape variations stratified by repeat class and by the
447
+ number of alleles. {\bf (c)} Human biallelic variations stratified by repeat
448
+ class and by insertion to/deletion from GRCh38. Both alleles are required to
449
+ be covered in all assemblies. {\bf (d)} Human-specific biallelic variations
450
+ stratified by repeat class and by insertion to/deletion from GRCh38. Red bars
451
+ correspond to insertions to the human lineage. {\bf (e)} Distribution of
452
+ different types of human variations along chromosomes. {\bf (f)} Boxplot of
453
+ the longest allele length in each repeat class. Outliers are omitted for the
454
+ clarity of the figure.}\label{fig:anno}
455
+ \end{figure*}
456
+
457
+ \subsection*{A human pangenome graph}
458
+
459
+ Starting with GRCh38, we constructed a human pangenome graph from 20 human
460
+ haplotypes or haplotype-collapsed assemblies (Table~\ref{tab:asm}). It took
461
+ minigraph 2.7 wall-clock hours over 24 CPU threads to generate this graph. The
462
+ peak memory is 98.1GB. The resulting graph consists of 148,618 segments and
463
+ 214,995 links. It contains 37,332 variations, where a \emph{variation}
464
+ denotes a minimal subgraph that has a single source and a single sink with both
465
+ segments coming from GRCh38. A path through the bubble between the source and
466
+ the sink represents an \emph{allele}.
467
+
468
+ Variations in the human graph are enriched with Alus and VNTRs
469
+ (Fig.~\ref{fig:anno}a). While interspersed repeats are about evenly distributed
470
+ along chromosomes except in the pseudoautosomal regions (Fig.~\ref{fig:anno}e),
471
+ VNTRs are enriched towards telomeres~\cite{Audano:2019aa}. It is worth noting
472
+ the density of minisatellites is also higher in subtelomeres. If we normalize
473
+ the density of VNTRs in the pangenome graph by the density of minisatellites in
474
+ GRCh38, the enrichment of VNTRs towards telomeres is still visible but becomes
475
+ less prominent. At the same time, repeat-less variations are also enriched
476
+ towards the ends of chromosomes (green areas in Fig.~\ref{fig:anno}e),
477
+ suggesting subtelomeres tend to harbor SVs anyway. We also
478
+ identified 85 processed pseudogenes among these variations.
479
+
480
+ \begin{figure}
481
+ \includegraphics[width=.46\textwidth]{igv-edit.png}
482
+ \caption{\csentence{IGV screenshot of a region enriched with long insertions.}
483
+ Numbers on wide purple bars indicate insertion lengths. CLR: PacBio noisy
484
+ continuous long reads. HiFi: PacBio high-fidelity reads.}\label{fig:igv}
485
+ \end{figure}
486
+
487
+ Another noticeable feature of VNTRs is that over half of VNTR variations are
488
+ multiallelic (Fig.~\ref{fig:anno}a). Fig.~\ref{fig:igv} shows a multi-allelic
489
+ region composed of VNTRs. We can see many insertions of different lengths. The
490
+ two different NA12878 assemblies also disagree with each other, which we often
491
+ see around other VNTR loci in NA12878 as well. We have not inspected raw reads
492
+ in this particular example, but we tend to believe the disagreement is caused
493
+ by local misassemblies rather than somatic mutations. In addition, due to the
494
+ multiallelic nature of such VNTRs, the two haplotypes in a human individual are
495
+ often different. Assemblies mixing the two haplotypes (aka collapsed
496
+ assemblies) may have more trouble in these regions. Multiallelic VNTRs are
497
+ hard to assemble correctly.
498
+
499
+ Multiallelic VNTRs are also hard to align and to call. In Fig.~\ref{fig:igv},
500
+ the insertion positions are often different, which could be caused by a few
501
+ mutations or sequencing errors. A naive alignment-based SV caller would call a
502
+ dozen low-frequency insertions in this region, which does not reflect these
503
+ correlated events. Without base-level alignment, minigraph may
504
+ have more trouble obtaining the optimal alignment in these complex VNTR
505
+ regions. Improved data quality, assembly algorithms and graph mapping
506
+ algorithms are required to investigate VNTR regions in detail.
507
+
508
+ \subsection*{A great ape pangenome graph}
509
+
510
+ We also constructed a great ape pangenome graph from GRCh38, one chimpanzee,
511
+ two gorillas and one orangutan (Table~\ref{tab:asm}). This graph contains
512
+ 206,452 variations, over four times more than the human graph. About half of
513
+ variations originate from orangutan, the species most distant from human.
514
+
515
+ In the great ape graph, the L1-to-Alu ratio is close to 1:1, much higher than
516
+ the ratio in the human graph (Fig.~\ref{fig:anno}b vs Fig.~\ref{fig:anno}a).
517
+ This is perhaps correlated with the elevated L1 activity in great
518
+ apes~\cite{Mathews:2003aa}. Of retrotransposon-related variations specific to
519
+ the human lineage, the overwhelming majority are insertions
520
+ (Fig.~\ref{fig:anno}d), which is expected as transpositions lead to insertions
521
+ only. Most human-specific Alu deletions are incomplete and involve ancient Alu
522
+ subfamilies. They are likely genomic deletions that happen to hit Alus. In
523
+ contrast, the majority of ``partial-repeats'' are deletions from the human
524
+ lineage. Two thirds of autosomal insertions in this category are segmental
525
+ duplications in GRCh38. In all, minigraph is an efficient tool to study closely
526
+ related species.
527
+
528
+ \subsection*{Blacklist regions from human pangenome graphs}
529
+
530
+ The human pangenome graph effectively encodes SVs $\ge$100bp
531
+ in 20 genomes. These large-scale variations could be a frequent source of
532
+ technical artifacts in variant calling with short reads. To test this
533
+ hypothesis, we compared short-read SNP calls with vs without regions around SVs
534
+ in the pangenome graph.
535
+
536
+ We constructed a human pangenome graph excluding CHM1 and CHM13, the two
537
+ samples used in the SynDip benchmark~\cite{Li:2018aa}, and generated regions
538
+ around variations (see Methods), which we call \emph{blacklist regions},
539
+ following the rationale in~\cite{Amemiya:2019aa}. Blacklist regions total
540
+ 29.2Mb in length, intersecting 0.7\% of confident regions in
541
+ SynDip~\cite{Li:2018aa}; 0.7\% of truth SNPs are contained in blacklist regions
542
+ -- true SNPs are not enriched in blacklist regions.
543
+
544
+ We mapped short reads used in~\cite{Li:2018aa} with minimap2 and called
545
+ variants with GATK v4.1.2~\cite{Depristo:2011vn}. This callset
546
+ contains 32,879 false positive SNPs, 21\% of which fall in blacklist regions --
547
+ false SNP calls are highly enriched in this $<$1\% region of human genome. This
548
+ confirms that a noticeable fraction of false SNP calls using short reads
549
+ result from misalignment involving SVs.
550
+
551
+ \section*{Discussion}
552
+
553
+ Based on the GFA assembly format~\cite{Li:2016aa}, we proposed the rGFA format,
554
+ which defines a data model for reference pangenome graphs at the same time.
555
+ rGFA takes a linear reference genome as the backbone and maintains the
556
+ conceptual ``linearity'' of input genomes.
557
+
558
+ rGFA is not the only pangenome graph model. Vg~\cite{Garrison:2018aa}
559
+ encodes a stable sequence with a path through the sequence graph~\cite{10.12688/f1000research.19630.1}. A segment
560
+ in the graph may occur on multiple paths, or occur multiple times on one path
561
+ if there are cycles in the graph. This way, vg allows different regions in one
562
+ chromosome to be collapsed into one segment. We call such a graph a collapsed graph. rGFA
563
+ cannot encode a collapsed graph. The vg model is thus more general.
564
+
565
+ In our view, however, the reference pangenome graph should not be a collapsed
566
+ graph. In a collapsed graph, the definition of orthology is not clear because
567
+ multiple sequences from the same sample may go through the same segment.
568
+ Without the concept of orthology, we cannot define variations, either. In
569
+ addition, due to the one-to-many relationship between segments and the
570
+ reference genome, it is intricate to derive the stable coordinate of a path in
571
+ a collapsed graph. For example, suppose segment {\sf s1} corresponds to two
572
+ regions {\sf chr1:100-200} and {\sf chr1:500-600}. To convert a path {\sf
573
+ s2$\to$s1$\to$s3} to the stable coordinate, we have to inspect adjacent
574
+ segments to tell which region {\sf s1} corresponds to; this becomes more challenging
575
+ when {\sf s2} and {\sf s3} represent multiple regions in the reference genome.
576
+ In contrast, rGFA inherently forbids a collapsed graph and avoids the potential
577
+ issues above. This makes rGFA simpler than vg's path model and easier to work
578
+ with.
579
+
580
+ To demonstrate practical applications of rGFA, we developed minigraph to
581
+ incrementally generate pangenome graphs. It can generate a graph from 20
582
+ genomes in three hours and can scale to hundreds of genomes in future. A
583
+ limitation of minigraph is that it does not perform base alignment and may be
584
+ confused by similar paths in the graph. {\color{black} Unfortunately, base-level
585
+ sequence-to-graph alignment is not a fully solved problem. Partial-order graph
586
+ alignment~\cite{Lee_2002} and PaSGAL~\cite{DBLP:conf/ipps/JainMZDA19} only work
587
+ with directed acyclic graphs (DAGs). Vg~\cite{Garrison:2018aa} uses a heuristic
588
+ to unroll cycles but it is exponential in time in the worst case and for DAGs,
589
+ its exact mode is tens of times slower than PaSGAL. Antipov et
590
+ al~\cite{Antipov:2016aa} proved that alignment against cyclic graphs can be
591
+ done in polynomial time. GraphAligner~\cite{Rautiainen810812} implements a
592
+ fast quadratic algorithm for computing edit distance~\cite{Rautiainen_2019}.
593
+ However, edit distance based alignment disallows long INDELs and is often
594
+ inadequate for accurate variant calling. Jain et al~\cite{Jain_2020} recently
595
+ proposed a quadratic algorithm for alignment with affine gap penalty but the
596
+ authors focused on the theoretical analysis only. To the best of our knowledge,
597
+ no tools can efficiently perform sequence-to-graph alignment under affine gap
598
+ cost. We plan to learn from the existing algorithms and implement fast base
599
+ alignment in minigraph in future. This may take significant effort.}
600
+
601
+ Another limitation of minigraph is
602
+ that it is unable to align sequences against a graph encoding all small variants.
603
+ Such a graph will be composed of millions of short segments. Not
604
+ indexing minimizers across segments, minigraph will fail to seed the initial
605
+ linear chains. This limitation can only be resolved by completely changing the
606
+ minigraph mapping algorithm. Nonetheless, small variants are easier to
607
+ analyze with the standard methods. Incorporating these variants unnecessarily
608
+ enlarges the graph, complicates implementations, increases the rate of false
609
+ mappings~\cite{Pritt_2018} and reduces the performance of common tasks. There
610
+ is also no known algorithm that can construct such a complex graph for hundreds
611
+ of human genomes.
612
+
613
+ Minigraph does not keep track of the sample information as of now. To address
614
+ this issue, we are considering implementing colored rGFA, similar to colored de
615
+ Bruijn graphs~\cite{Iqbal:2012aa}. In a colored rGFA, a color represents one
616
+ sample. Each segment or link is associated with one or multiple colors,
617
+ indicating the sources of the segment or the link. Colors can be stored in an
618
+ rGFA tag or in a separate segment/link-by-sample binary
619
+ matrix~\cite{Holley695338}. The matrix representation may be more compact given
620
+ a large number of samples.
621
+
622
+ We have shown minigraph can be a fast and powerful research tool to summarize
623
+ SVs at the population scale and to study the evolution of closely related
624
+ species. A more practical question is how a reference pangenome graph may
625
+ influence routine data analysis. Here is our limited view.
626
+
627
+ We think a critical role a reference graph plays is that it extends the
628
+ coordinate system of a linear reference genome. This allows us to annotate
629
+ variations in highly diverse regions such as the human HLA and KIR regions. The
630
+ existing pipelines largely ignore these variations because most of them cannot
631
+ be encoded in the primary assembly of GRCh38.
632
+
633
+ The extended graph coordinate system further helps to consistently represent
634
+ complex SVs. Given multiple samples, the current practice is to call SVs from
635
+ individual samples and then merge them. Two subtly different SVs, especially
636
+ long insertions, may be called at two distinct locations and treated as
637
+ separate events. With the minigraph procedure, the two SVs are likely to
638
+ be aligned together as long as they are similar to each other and are
639
+ sufficiently different from the reference allele. To some extent, minigraph is
640
+ performing multiple sequence alignment with partial order
641
+ alignment~\cite{Lee_2002}. This procedure is more robust to different
642
+ representations of the same SV than naive merging. When we refer to a SNP, we often use its
643
+ chromosomal coordinate such as ``chr1:12345''. We rarely do so for SVs because
644
+ their positions are sensitive to alignment and SV callers. The more consistent
645
+ SV representation implied by a pangenome graph will help to alleviate the issue
646
+ and subsequently facilitate the genotyping of
647
+ SVs~\cite{Hickey_2020,Eggertsson_2019,Chen_2019}.
648
+
649
+ While we believe a reference pangenome graph will make complex variations more
650
+ accessible by geneticists and biologists, we suspect a great majority of
651
+ biomedical researchers will still rely on a linear reference genome due to the
652
+ conceptual simplicity of linear genomes and the mature tool chains developed over
653
+ decades. Many analyses such as SNP calling in well behaved regions do not
654
+ benefit much from a pangenome representation, either. Nonetheless, a pangenome
655
+ reference still helps applications based on linear references. With a graph
656
+ reference, we may blacklist regions enriched with SVs that lead to small variant
657
+ calling errors. We may potentially generate ``decoy'' sequences that are
658
+ missing from the primary assembly to attract falsely mapped reads away. We may
659
+ perform read alignment against a graph, project the alignment to the linear
660
+ coordinate and finish the rest of analyses in the linear space. We anticipate a
661
+ pangenome reference to supplement the linear reference, not to replace it.
662
+
663
+ \section*{Conclusions}
664
+
665
+ Complex human sequence variations are like genomic dark matter: they are
666
+ pervasive in our genomes but are often opaque to the assay with the existing
667
+ tools. We envision a pangenome graph reference will become an effective
668
+ means to the study of these complex variations. We proposed a data model (rGFA),
669
+ designed formats (rGFA and GAF) and developed companion tools (minigraph and
670
+ gfatools) to demonstrate the feasibility of our vision. Our work is still
671
+ preliminary but it is likely to set a starting point to the development of the
672
+ next-generation graph-based tools, which may ultimately help us to understand
673
+ our genomes better.
674
+
675
+ \section*{Methods}
676
+
677
+ \subsection*{The minigraph mapping algorithm}
678
+
679
+ \subsubsection*{Seeding and linear chaining}
680
+ Similar to minimap2, minigraph uses minimizers on segments as seeds. It also
681
+ applies a similar chaining algorithm but with different scoring and with a new
682
+ heuristic to speed up chaining over long distances. For the completeness of
683
+ this article, we will describe part of the minimap2 chaining algorithm here.
684
+
685
+ \paragraph*{Minimap2-like chaining}
686
+ Formally, an \emph{anchor} is a 3-tuple $(x,y,w)$, representing a closed
687
+ interval $[x-w+1,x]$ on a segment in the reference graph matching an interval
688
+ $[y-w+1,y]$ on the query. Given a list of anchors sorted by $x$, let $f(i)$ be
689
+ the maximal chaining score up to the $i$-th anchor in the list. $f(i)$ can be
690
+ computed by:
691
+ \begin{equation}\label{eq:dp}
692
+ f(i)=\max\big\{\max_{i>j\ge1}\{f(j)+\alpha(j,i)-\beta(j,i)\},w_i\big\}
693
+ \end{equation}
694
+ where $\alpha(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is
695
+ the number of matching bases between anchor $i$ and $j$.
696
+ $\beta(j,i)$ is the gap penalty. Let $g_{ji}=|(y_i-y_j)-(x_i-x_j)|$
697
+ be the gap length and $d_{ji}=\min\{y_i-y_j,x_i-x_j\}$ be the smaller distance
698
+ between the two anchors. Minigraph uses the following gap cost:
699
+ $$
700
+ \beta(j,i)=\left\{\begin{array}{ll}
701
+ \infty & (g_{ji}>G) \\
702
+ c_1\cdot g_{ji} + c_2\cdot d_{ji} + \log_2{g_{ji}} & (0<g_{ji}\le G) \\
703
+ 0 & (g_{ji}=0)\\
704
+ \end{array}\right.
705
+ $$
706
+ where $G=100000$ in the graph construction mode, $c_1=e^{-dw}$ and
707
+ $c_2=0.05\cdot e^{-dw}$. By default, $d=0.01$ is the expected per-base sequence
708
+ divergence and $w=19$ is the minimizer length. In comparison, minimap2 applies
709
+ $G=5000$, $c_1=0.19$ and $c_2=0$. Minigraph allows much larger gaps between
710
+ minimizers and more heavily penalizes gaps.
711
+
712
+ Solving Eq.~\ref{eq:dp} leads to an $O(n^2)$ algorithm where $n$ is the number
713
+ of anchors. This algorithm is slow for large $n$. Minimap2 introduces
714
+ heuristics to speed up the computation by approximating this equation. It works
715
+ well for minimap2 that only allows small gaps and has base-level alignment as a
716
+ fix to chaining errors. However, as minigraph intends to chain much longer
717
+ gaps, the minimap2 algorithm occasionally misses the optimal alignment in long
718
+ segmental duplications and produces false variations. Minigraph introduces a
719
+ new heuristic to speed up chaining.
720
+
721
+ \begin{figure}[tb]
722
+ \centering
723
+ \includegraphics[width=.36\textwidth]{Fig5}
724
+ \caption{\csentence{Implementing 1-dimension Range-Min-Query (RMQ).} Given a
725
+ set of 2-tuples, a binary search tree is built for the first values in the
726
+ tuples. Each node $p$ in the tree is associated with a pointer. The pointer
727
+ points to the node that is in the subtree descended from $p$ and has the
728
+ minimal second value. In this example, ${\rm RMQ}(20,50)=14$.}\label{fig:rmq}
729
+ \end{figure}
730
+
731
+ \paragraph*{Dynamic 1-dimension Range-Min-Query}
732
+ Before we move onto the minigraph solution, we will first introduce
733
+ Range-Min-Query (RMQ). Given a set of 2-tuples $\{(y_i,s_i)\}$, ${\rm
734
+ RMQ}(a,b)$ returns the minimum $s_j$ among $\{s_j:a\le y_j\le b\}$.
735
+ We implemented 1-dimension RMQ with a modified AVL tree, a type of balanced
736
+ binary search tree (Fig.~\ref{fig:rmq}). When performing ${\rm RMQ}(a,b)$,
737
+ we first find the smallest and the largest nodes within interval $[a,b]$ using
738
+ the standard algorithm. In this example, the two nodes are (21,32) and (45,21),
739
+ respectively. We then traverse the path between the two nodes to find the
740
+ minimum. With a balanced tree structure, we do not need to descend into
741
+ subtrees. The time complexity is $O(\log m)$, where $m$ is the number of nodes
742
+ in the tree. We can insert nodes to or delete nodes from the tree while
743
+ maintaining the property of the tree. This achieves dynamic RMQ.
744
+
745
+ \paragraph*{Chaining with a linear gap cost function}
746
+ A linear gap cost takes the form of
747
+ $\beta'(j,i)=c_1[(y_i-y_j)+(x_i-x_j)]$. Given a list of anchors
748
+ $(x_i,y_i,w_i)$ sorted by position $x_i$, let
749
+ \begin{equation}\label{eq:dp2}
750
+ f'(i)=\max_{\substack{\text{$i>j\ge1$}\\ \text{$x_i-G\le x_j\le x_i-w_i$}\\ \text{$y_i-G\le y_j\le y_i-w_i$}}}\big\{f'(j)+w_j-\beta'(j,i)\big\}
751
+ \end{equation}
752
+ We can find the optimal $f'(i)$ in $O(n\log n)$ time with
753
+ RMQ~\cite{DBLP:conf/wabi/AbouelhodaO03,Otto:2011aa}. To see that, define
754
+ $$h'(j)=f'(j)+w_j+c_1(y_j+x_j)$$
755
+ The following condition
756
+ $$f'(j)+w_j-\beta'(j,i)>f'(k)+w_k-\beta'(k,i)$$
757
+ is equivalent to $h'(j)>h'(k)$, independent of $i$. If we maintain ${\rm
758
+ RMQ}_i$ as the binary tree that keeps $\{(y_j,-h'(j)):j<i,x_i-G\le x_j\le x_i-w_i\}$, we have
759
+ $$
760
+ f'(i)=-{\rm RMQ}_i(y_i-G,y_i-w_i)-c_1(x_i+y_i)
761
+ $$
762
+ This solves Eq.~\ref{eq:dp2} in $O(n\log n)$ time.
763
+
764
+ \paragraph*{Minigraph linear chaining}
765
+ While chaining with a linear gap cost function can be solved efficiently, we
766
+ prefer the more realistic cost function used in minimap2. In practical
767
+ implementation, when we come to anchor $i$, we find the optimal predecessor $j_*$
768
+ under the desired gap cost $\beta(j,i)$ for anchors $\{j:j<i,x_i-G'\le
769
+ x_j<x_i,y_i-G'\le y_j<y_i\}$, where $G'<G$ is set to 10000 by default.
770
+ Meanwhile, we use the RMQ-based algorithm to identify the anchor $j'_{*}$ optimal
771
+ under the linear gap cost $\beta'(j,i)$. We choose $j'_*$ as the optimal
772
+ predecessor if
773
+ $$
774
+ f(j_*)+\alpha(j_*,i)-\beta(j_*,i)<f(j'_*)+\alpha(j'_*,i)-\beta(j'_*,i)
775
+ $$
776
+ This may occasionally happen around long segmental duplications when the
777
+ minimap2 heuristic misses the optimal solution. Effectively, minigraph does
778
+ thorough search in a small window and approximate search in a large window
779
+ using a faster but less sophisticated gap cost function.
780
+
781
+ \subsubsection*{Graph chaining}
782
+
783
+ Minigraph generates a set of linear chains $\{L_i\}$ with the procedure above
784
+ that completely ignores the graph topology. It then applies another round of
785
+ chaining that takes the topology into account.
786
+
787
+ We say linear chain $L_i$ \emph{precedes} $L_j$, written as $L_i\prec L_j$, if
788
+ (1) the ending coordinate of $L_i$ on the query sequence is smaller than the
789
+ ending coordinate of $L_j$, and (2) there is a walk from $L_i$ to $L_j$ in the
790
+ graph. If there are multiple walks from $L_i$ to $L_j$, minigraph enumerates
791
+ the shortest 16 walks and chooses the walk with its length being the closest to
792
+ the query distance between $L_i$ and $L_j$.
793
+
794
+ Given a list of linear chains sorted by their ending coordinates on the query
795
+ sequence, let $g(i)$ be the optimal graph chaining score up to linear chain
796
+ $L_i$. We can compute $g(i)$ with another dynamic programming:
797
+ $$
798
+ g(i)=\max\big\{\max_{L_j\prec L_i}\{g(j)+\omega(L_i)-\beta(j,i)\},\omega(L_i)\big\}
799
+ $$
800
+ where $\beta(j,i)$ is the weight between $L_i$ and $L_j$. As minigraph does not
801
+ perform base-level alignment, $\beta(j,i)$ is the same as the gap penalty
802
+ function used for linear chaining. $\omega(L_i)$ is the optimal score of $L_i$
803
+ computed during linear chaining.
804
+
805
+ The procedure above has two limitations. First, when computing the weight
806
+ between $L_i$ and $L_j$, minigraph largely ignores base sequences and only considers
807
+ the distance between them on both the query and the graph. When there are
808
+ multiple walks of similar lengths between $L_i$ and $L_j$, minigraph may miss the
809
+ graph chain that leads to the best base alignment. Although we added a
810
+ heuristic by considering 17-mer matches between the query and the graph paths,
811
+ we found this heuristic is not reliable in complex regions. Second, minigraph only
812
+ enumerates the shortest 16 walks. In complex subgraphs, the optimal walk from
813
+ $L_i$ to $L_j$ may not be among them. We plan to implement base
814
+ alignment to address the limitations. We may use the current minigraph algorithm
815
+ for easy cases and apply the more expensive base alignment when the current
816
+ algorithm potentially fails.
817
+
818
+ The graph chaining algorithm results in one or multiple graph chains. A
819
+ \emph{graph chain} is a list of anchors $(s_i,x_i,y_i,w_i)$, where
820
+ $[x_i-w_i+1,x_i]$ on segment $s_i$ in the graph matches $[y_i-w_i+1,y_i]$ on
821
+ the query sequence. A graph chain satisfies the following conditions: if $i<j$,
822
+ $y_i<y_j$; if $i<j$ and $s_i=s_j$, we have $x_i<x_j$; if $s_i\not=s_{i+1}$, the
823
+ two segments are adjacent on the graph. It is an extension to linear chains.
824
+
825
+ \subsection*{The minigraph graph generation algorithm}
826
+
827
+ Using the minimap2 algorithm~\cite{Li:2018ab}, minigraph identifies a set of
828
+ \emph{primary chains} that do not greatly overlap with each other on the query
829
+ sequence. A region on the query is considered to be \emph{orthogonal} to the
830
+ reference if the region is contained in a primary chain longer than 100kb and
831
+ it is not intersecting other primary chains longer than 20kb.
832
+
833
+ Minigraph scans primary chains in orthogonal regions and identifies subregions
834
+ where the query subsequences significantly differ from the corresponding
835
+ reference subsequences. To achieve that, minigraph computes a score $h_i$ for
836
+ each adjacent pair of anchors $(s_i,x_i,y_i,w_i)$ and
837
+ $(s_{i+1},x_{i+1},y_{i+1},w_{i+1})$. Let $d^x_i$ be the distance between the
838
+ two anchors on the graph and $d^y_i=y_{i+1}-y_i$ be the distance on the query
839
+ sequence. $h_i$ is computed as
840
+ \begin{equation}\label{eq:hi}
841
+ h_i=\left\{\begin{array}{ll}
842
+ -10 & \mbox{if $d^x_i=d^y_i\le w_{i+1}$} \\
843
+ \eta\cdot\max\{d^x_i,d^y_i\} & \mbox{otherwise}\\
844
+ \end{array}\right.
845
+ \end{equation}
846
+ where $\eta$ is the density of anchors averaged across all primary graph
847
+ chains. Define $H(i,j)=\sum_{k=i}^j h_k$. A highly divergent region between the
848
+ query and the graph will be associated with a large $H(i,j)$. Minigraph uses
849
+ the Ruzzo-Tompa algorithm~\cite{DBLP:conf/ismb/RuzzoT99} to identify all
850
+ maximal scoring intervals on list $(h_i)$, which correspond to divergent
851
+ regions. In each identified divergent region, minigraph performs base
852
+ alignment~\cite{Suzuki:2018aa,Li:2018ab} between the query and the graph
853
+ sequences and retains a region if it involves an INDEL $\ge$100bp in length or
854
+ a $\ge$100bp region with base-level identity below 80\%. In Eq.~\ref{eq:hi},
855
+ -10 is an insensitive parameter due to the downstream filtering. In the end,
856
+ minigraph augments the existing graph with identified variations
857
+ (Fig.~\ref{fig:mg}b).
858
+
859
+ \subsection*{Annotating variations}
860
+
861
+ We applied RepeatMasker~\cite{Tarailo-Graovac:2009aa} v1.332 to classify
862
+ interspersed repeats in the longest allele sequence of each variation.
863
+ RepeatMasker is unable to annotate VNTRs with long motifs. It also often
864
+ interprets VNTRs as impure STRs. Therefore, we did not use the RepeatMasker
865
+ VNTR or STR annotations directly. Instead, we combined RepeatMasker and
866
+ SDUST~\cite{Morgulis:2006aa} results to collect low-complexity regions (LCRs).
867
+ We identified pure tandem repeats composed of a motif occurring twice or more
868
+ (implemented in
869
+ \href{https://github.com/lh3/etrf}{https://github.com/lh3/etrf}). An LCR is
870
+ classified as VNTR if 70\% of the LCR is VNTR; similarly, an LCR is classified
871
+ as STR if 70\% is STR; the rest are classified as ``Other-LCR'' in
872
+ Fig.~\ref{fig:anno}. The annotation script is available in the minigraph GitHub
873
+ repository.
874
+
875
+ \subsection*{Creating blacklist regions}
876
+
877
+ For each variation in the graph, we extend its genomic interval on GRCh38 by
878
+ 50bp from each end. We name this set of intervals as $I_0$. We align sequences
879
+ inserted to GRCh38 against GRCh38 with ``minimap2 -cxasm20 -r2k'' and filter
880
+ out alignments with mapping quality below 5. Let $I(a,b)$ be the set of GRCh38
881
+ intervals that are contained in alignments with identity between $a$ and $b$.
882
+ The blacklist regions are computed by $I_0\cup I(0,0.99)\setminus I(0.998,1)$,
883
+ where ``$\cup$'' denotes the interval union operation and ``$\setminus$''
884
+ denotes interval subtraction.
885
+
886
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
887
+ %% %%
888
+ %% Backmatter begins here %%
889
+ %% %%
890
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
891
+
892
+ \begin{backmatter}
893
+
894
+ \section*{Competing interests}
895
+ The authors declare that they have no competing interests.
896
+
897
+ \section*{Ethical Approval}
898
+ Ethical approval was not needed for this study.
899
+
900
+ \section*{Author's contributions}
901
+ HL conceived the project, developed minigraph and drafted the manuscript.
902
+ XF did the pseudogene analysis. CC helped with RepeatMasker annotation.
903
+ All authors helped to revise the manuscript.
904
+
905
+ \section*{Acknowledgements}
906
+ We are grateful to Benedict Paten and Erik Garrison for discussions on
907
+ pangenome graphs. We thank minigraph users who have suggested features and
908
+ helped to fix various issues.
909
+
910
+ \section*{Funding}
911
+ This work is supported by National Institutes of Health (NIH) grants
912
+ U01HG010961 and R01HG010040.
913
+
914
+ \section*{Availability of data and materials}
915
+ Minigraph is openly available at
916
+ \href{https://github.com/lh3/minigraph}{https://github.com/lh3/minigraph}.
917
+ This repository also includes the script to convert from the segment coordinate
918
+ to the stable coordinate, to annotate variations and to generate blacklist
919
+ regions from the graph. The companion gfatools is available at
920
+ \href{https://github.com/lh3/gfatools}{https://github.com/lh3/gfatools}. The
921
+ human and the great ape graphs are hosted at
922
+ \href{ftp://ftp.dfci.harvard.edu/pub/hli/minigraph/}{ftp://ftp.dfci.harvard.edu/pub/hli/minigraph/}.
923
+ The NA12878, NA24385 and PGP1 phased assemblies were downloaded from
924
+ \href{ftp://ftp.dfci.harvard.edu/pub/hli/whdenovo/}{ftp://ftp.dfci.harvard.edu/pub/hli/whdenovo/}.
925
+ Assemblies generated by McDonnell Genome Institute include
926
+ GCA\_001524155.4 for NA19240, GCA\_002180035.3 for HG00514, GCA\_002209525.2
927
+ for HG01352, GCA\_002872155.1 for NA19434, GCA\_003574075.1 for HG02818,
928
+ GCA\_003086635.1 for HG03486, GCA\_003601015.1
929
+ for HG03807, GCA\_002208065.1 for HG00733, GCA\_003070785.1 for HG02059,
930
+ GCA\_008065235.1 for HG00268 and GCA\_007821485.1 for HG04217. Other assemblies
931
+ are available from GenBank under accession GCA\_001297185.1 for
932
+ CHM1~\cite{Huddleston:2017aa}, GCA\_000983455.1 for
933
+ CHM13~\cite{Huddleston:2017aa}, GCA\_001750385.1 for AK1~\cite{Seo:2016aa},
934
+ GCA\_002880755.3 for chimpanzee Clint~\cite{Kronenberg:2018aa},
935
+ GCA\_900006655.3 for gorilla Susie~\cite{Gordon:2016kq}, GCA\_008122165.1 for
936
+ gorilla Kamilah~\cite{Kronenberg:2018aa} and GCA\_002880775.3 for orangutan
937
+ Susie~\cite{Kronenberg:2018aa}.
938
+
939
+
940
+ \bibliographystyle{bmc-mathphys}
941
+ \bibliography{minigraph}
942
+
943
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
944
+ %% %%
945
+ %% Figures %%
946
+ %% %%
947
+ %% NB: this is for captions and %%
948
+ %% Titles. All graphics must be %%
949
+ %% submitted separately and NOT %%
950
+ %% included in the Tex document %%
951
+ %% %%
952
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
953
+
954
+ %\section*{Figures}
955
+
956
+ %\begin{figure}[h!]
957
+ % \caption{\csentence{Sample figure title.}
958
+ % Figure legend text.}
959
+ % \end{figure}
960
+
961
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
962
+ %% %%
963
+ %% Tables %%
964
+ %% %%
965
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
966
+
967
+ %\section*{Tables}
968
+
969
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
970
+ %% %%
971
+ %% Additional Files %%
972
+ %% %%
973
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
974
+
975
+ %\section*{Additional Files}
976
+ % \subsection*{Additional file 1 --- Sample additional file title}
977
+ % Additional file descriptions text (including details of how to
978
+ % view the file, if it is in a non-standard format or the file extension). This might
979
+ % refer to a multi-page table or a figure.
980
+
981
+ % \subsection*{Additional file 2 --- Sample additional file title}
982
+ % Additional file descriptions text.
983
+
984
+
985
+ \end{backmatter}
986
+ \end{document}