minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,724 @@
1
+ \documentclass{bioinfo}
2
+ \copyrightyear{2018}
3
+ \pubyear{2018}
4
+
5
+ \usepackage{graphicx}
6
+ \usepackage{hyperref}
7
+ \usepackage{url}
8
+ \usepackage{amsmath}
9
+ \usepackage[ruled,vlined]{algorithm2e}
10
+ \newcommand\mycommfont[1]{\footnotesize\rmfamily{\it #1}}
11
+ \SetCommentSty{mycommfont}
12
+ \SetKwComment{Comment}{$\triangleright$\ }{}
13
+
14
+ \usepackage{natbib}
15
+ \bibliographystyle{apalike}
16
+
17
+ \DeclareMathOperator*{\argmax}{argmax}
18
+
19
+ \begin{document}
20
+ \firstpage{1}
21
+
22
+ \title[Aligning nucleotide sequences with minimap2]{Minimap2: pairwise alignment for nucleotide sequences}
23
+ \author[Li]{Heng Li}
24
+ \address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA}
25
+
26
+ \maketitle
27
+
28
+ \begin{abstract}
29
+
30
+ \section{Motivation:} Recent advances in sequencing technologies promise
31
+ ultra-long reads of $\sim$100 kilo bases (kb) on average, full-length mRNA or
32
+ cDNA reads in high throughput and genomic contigs over 100 mega bases (Mb) in
33
+ length. Existing alignment programs are unable or inefficient to process such data
34
+ at scale, which presses for the development of new alignment algorithms.
35
+
36
+ \section{Results:} Minimap2 is a general-purpose alignment program to map DNA or long
37
+ mRNA sequences against a large reference database. It works with accurate short
38
+ reads of $\ge$100bp in length, $\ge$1kb genomic reads at error rate $\sim$15\%,
39
+ full-length noisy Direct RNA or cDNA reads, and assembly contigs or closely
40
+ related full chromosomes of hundreds of megabases in length. Minimap2 does
41
+ split-read alignment, employs concave gap cost for long insertions and
42
+ deletions (INDELs) and introduces new heuristics to reduce spurious alignments.
43
+ It is 3--4 times as fast as mainstream short-read mappers at comparable
44
+ accuracy, and is $\ge$30 times faster than long-read genomic or cDNA
45
+ mappers at higher accuracy, surpassing most aligners specialized in one type of
46
+ alignment.
47
+
48
+ \section{Availability and implementation:}
49
+ \href{https://github.com/lh3/minimap2}{https://github.com/lh3/minimap2}
50
+
51
+ \section{Contact:} hengli@broadinstitute.org
52
+ \end{abstract}
53
+
54
+ \section{Introduction}
55
+
56
+ Single Molecule Real-Time (SMRT) sequencing technology and Oxford Nanopore
57
+ technologies (ONT) produce reads over 10kbp in length at an error rate
58
+ $\sim$15\%. Several aligners have been developed for such
59
+ data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa,Sedlazeck169557}.
60
+ Most of them were five times as slow as mainstream short-read
61
+ aligners~\citep{Langmead:2012fk,Li:2013aa} in terms of the number of bases
62
+ mapped per second. We speculated there could be substantial room for speedup on
63
+ the thought that 10kb long sequences should be easier to map than 100bp reads
64
+ because we can more effectively skip repetitive regions, which are often the
65
+ bottleneck of short-read alignment. We confirmed our speculation by achieving
66
+ approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}.
67
+ \citet{Suzuki:2018aa} extended our work with a fast and novel algorithm on
68
+ generating base-level alignment, which in turn inspired us to develop minimap2
69
+ with added functionality.
70
+
71
+ Both SMRT and ONT have been applied to the sequencing of spliced mRNAs (RNA-seq). While
72
+ traditional mRNA aligners work~\citep{Wu:2005vn,Iwata:2012aa}, they are not
73
+ optimized for long noisy sequence reads and are tens of times slower than
74
+ dedicated long-read aligners. When developing minimap2 initially for aligning
75
+ genomic DNA only, we realized minor modifications could enable the base
76
+ algorithm to map mRNAs as well. Minimap2 becomes a first RNA-seq aligner
77
+ specifically designed for long noisy reads. We have also extended the original
78
+ algorithm to map short reads at a speed faster than several mainstream
79
+ short-read mappers.
80
+
81
+ In this article, we will describe the minimap2 algorithm and its applications
82
+ to different types of input sequences. We will evaluate the performance and
83
+ accuracy of minimap2 on several simulated and real data sets and demonstrate
84
+ the versatility of minimap2.
85
+
86
+ \begin{methods}
87
+ \section{Methods}
88
+
89
+ Minimap2 follows a typical seed-chain-align procedure as is used by most
90
+ full-genome aligners. It collects minimizers~\citep{Roberts:2004fv} of the
91
+ reference sequences and indexes them in a hash table, with the key being the
92
+ hash of a minimizer and the value being a list of locations of the minimizer
93
+ copies. Then for each query
94
+ sequence, minimap2 takes query minimizers as \emph{seeds}, finds exact matches
95
+ (i.e. \emph{anchors}) to the reference, and identifies sets of colinear anchors as
96
+ \emph{chains}. If base-level alignment is requested, minimap2 applies dynamic
97
+ programming (DP) to extend from the ends of chains and to close
98
+ regions between adjacent anchors in chains.
99
+
100
+ Minimap2 uses indexing and seeding algorithms similar to
101
+ minimap~\citep{Li:2016aa}, and furthers the predecessor with more accurate
102
+ chaining, the ability to produce base-level alignment and the support of
103
+ spliced alignment.
104
+
105
+ \subsection{Chaining}
106
+
107
+ \subsubsection{Chaining}
108
+ An \emph{anchor} is a 3-tuple $(x,y,w)$, indicating interval $[x-w+1,x]$ on the
109
+ reference matching interval $[y-w+1,y]$ on the query. Given a list of anchors
110
+ sorted by ending reference position $x$, let $f(i)$ be the maximal chaining
111
+ score up to the $i$-th anchor in the list. $f(i)$ can be calculated with
112
+ dynamic programming:
113
+ \begin{equation}\label{eq:chain}
114
+ f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+\alpha(j,i)-\beta(j,i) \},w_i\big\}
115
+ \end{equation}
116
+ where $\alpha(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of
117
+ matching bases between the two anchors. $\beta(j,i)>0$ is the gap cost. It
118
+ equals $\infty$ if $y_j\ge y_i$ or $\max\{y_i-y_j,x_i-x_j\}>G$ (i.e. the
119
+ distance between two anchors is too large); otherwise
120
+ \begin{equation}\label{eq:chain-gap}
121
+ \beta(j,i)=\gamma_c\big((y_i-y_j)-(x_i-x_j)\big)
122
+ \end{equation}
123
+ In implementation, a gap of length $l$ costs
124
+ \[
125
+ \gamma_c(l)=\left\{\begin{array}{ll}
126
+ 0.01\cdot \bar{w}\cdot|l|+0.5\log_2|l| & (l\not=0) \\
127
+ 0 & (l=0)
128
+ \end{array}\right.
129
+ \]
130
+ where $\bar{w}$ is the average seed length. For $N$ anchors, directly computing all $f(\cdot)$ with
131
+ Eq.~(\ref{eq:chain}) takes $O(N^2)$ time. Although theoretically faster
132
+ chaining algorithms exist~\citep{Abouelhoda:2005aa}, they
133
+ are inapplicable to generic gap cost, complex to implement and usually
134
+ associated with a large constant. We introduced a simple heuristic to
135
+ accelerate chaining.
136
+
137
+ We note that if anchor $i$ is chained to $j$, chaining $i$ to a predecessor
138
+ of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}),
139
+ we start from anchor $i-1$ and stop the process if we cannot find a better
140
+ score after up to $h$ iterations. This approach reduces the average time to
141
+ $O(hN)$. In practice, we can almost always find the optimal chain with
142
+ $h=50$; even if the heuristic fails, the optimal chain is often close.
143
+
144
+ \subsubsection{Backtracking}
145
+ Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if
146
+ $f(i)=w_i$ or $\argmax_j\{f(j)+\alpha(j,i)-\beta(j,i)\}$ otherwise. For each
147
+ anchor $i$ in the descending order of $f(i)$, we apply $P(\cdot)$ repeatedly to
148
+ find its predecessor and mark each visited $i$ as `used', until $P(i)=0$ or we
149
+ reach an already `used' $i$. This way we find all chains with no anchors used
150
+ in more than one chain.
151
+
152
+ \subsubsection{Identifying primary chains}\label{sec:primary}
153
+ In the absence of copy number changes, each query segment should not be mapped
154
+ to two places in the reference. However, chains found at the previous step may
155
+ have significant or complete overlaps due to repeats in the reference~\citep{Li:2010fk}.
156
+ Minimap2 uses the following procedure to identify \emph{primary chains} that do
157
+ not greatly overlap on the query.
158
+
159
+ Let $Q$ be an empty set initially. For each
160
+ chain from the best to the worst according to their chaining scores: if on the
161
+ query, the chain overlaps with a chain in $Q$ by 50\% or higher percentage of
162
+ the shorter chain, mark the chain as secondary to the chain in $Q$; otherwise,
163
+ add the chain to $Q$. In the end, $Q$ contains all the primary chains. We did
164
+ not choose a more sophisticated data structure (e.g. range tree or k-d tree)
165
+ because this step is not the performance bottleneck.
166
+
167
+ For each primary chain, minimap2 estimates its mapping quality with an
168
+ empirical formula:
169
+ \[
170
+ {\rm mapQ}=40\cdot (1-f_2/f_1)\cdot\min\{1,m/10\}\cdot\log f_1
171
+ \]
172
+ where $\log$ denotes natural logarithm, $m$ is the number of anchors on the primary chain, $f_1$ is the chaining
173
+ score, and $f_2\le f_1$ is the score of the best chain that is secondary to the
174
+ primary chain. Intuitively, a chain is assigned to a higher mapping quality if
175
+ it is long and its best secondary chain is weak.
176
+
177
+ \subsubsection{Estimating per-base sequence divergence}
178
+ Suppose a query sequence harbors $n$ seeds of length $k$, $m$ of which are
179
+ present in a chain. We want to estimate the sequence divergence $\epsilon$
180
+ between the query and the reference sequences in the chain. This is useful
181
+ when base-level alignment is too expensive to perform.
182
+
183
+ If we model substitutions with a homogeneous Poisson process along the query
184
+ sequence, the probability of seeing $k$ consecutive bases without substitutions
185
+ is $e^{-k\epsilon}$. On the assumption that all $k$-mers are independent of
186
+ each other, the likelihood function of $\epsilon$ is
187
+ \[
188
+ \mathcal{L}(\epsilon|n,m,k)=e^{-m\cdot k\epsilon}(1-e^{-k\epsilon})^{n-m}
189
+ \]
190
+ The maximum likelihood estimate of $\epsilon$ is
191
+ \[
192
+ \hat{\epsilon}=\frac{1}{k}\log\frac{n}{m}
193
+ \]
194
+ In reality, sequencing errors are sometimes clustered and $k$-mers are not
195
+ independent of each other, especially when we take minimizers as seeds. These
196
+ violate the assumptions in the derivation above. As a result, $\hat{\epsilon}$
197
+ is only approximate and can be biased. It also ignores long deletions from the
198
+ reference sequence. In practice, fortunately, $\hat{\epsilon}$ is often close
199
+ to and strongly correlated with the sequence divergence estimated from
200
+ base-level alignments. On the several datasets used in
201
+ Section~\ref{sec:long-genomic}, the Spearman correlation coefficient is around
202
+ $0.9$.
203
+
204
+ \subsubsection{Indexing with homopolymer compressed $k$-mers}
205
+ SmartDenovo
206
+ (\href{https://github.com/ruanjue/smartdenovo}{https://github.com/ruanjue/smartdenovo};
207
+ J. Ruan, personal communication) indexes reads with homopolymer-compressed (HPC)
208
+ $k$-mers and finds the strategy improves overlap sensitivity for SMRT reads.
209
+ Minimap2 adopts the same heuristic.
210
+
211
+ The HPC string of a string $s$, denoted by ${\rm HPC}(s)$, is constructed by
212
+ contracting homopolymers in $s$ to a single base. An HPC $k$-mer of $s$ is a
213
+ $k$-long substring of ${\rm HPC}(s)$. For example, suppose $s={\tt GGATTTTCCA}$,
214
+ ${\rm HPC}(s)={\tt GATCA}$ and the first HPC 4-mer is ${\tt GATC}$.
215
+
216
+ To demonstrate the effectiveness of HPC $k$-mers, we performed read overlapping
217
+ for the example {\it E. coli} SMRT reads from PBcR~\citep{Berlin:2015xy}, using
218
+ different types of $k$-mers. With normal 15bp minimizers per 5bp window,
219
+ minimap2 finds 90.9\% of $\ge$2kb overlaps inferred from the read-to-reference
220
+ alignment. With HPC 19-mers per 5bp window, minimap2 finds 97.4\% of overlaps. It achieves this
221
+ higher sensitivity while indexing 1/3 fewer minimizers, which further helps
222
+ performance. HPC-based indexing reduces the sensitivity for current ONT reads, though.
223
+
224
+ \subsection{Aligning genomic DNA}\label{sec:genomic}
225
+
226
+ \subsubsection{Alignment with 2-piece affine gap cost}
227
+
228
+ Minimap2 performs DP-based global alignment between adjacent anchors in a
229
+ chain. It uses a 2-piece affine gap cost~\citep{Gotoh:1990aa}:
230
+ \begin{equation}\label{eq:2-piece}
231
+ \gamma_a(l)=\min\{q+|l|\cdot e,\tilde{q}+|l|\cdot\tilde{e}\}
232
+ \end{equation}
233
+ Without losing generality, we always assume $q+e<\tilde{q}+\tilde{e}$.
234
+ On the condition that $e>\tilde{e}$, it applies cost $q+|l|\cdot e$ to gaps
235
+ shorter than $\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies
236
+ $\tilde{q}+|l|\cdot\tilde{e}$ to longer gaps. This scheme helps to recover
237
+ longer insertions and deletions~(INDELs).
238
+
239
+ The equation to compute the optimal alignment under $\gamma_a(\cdot)$ is
240
+ \begin{equation}\label{eq:ae86}
241
+ \left\{\begin{array}{l}
242
+ H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\
243
+ E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\
244
+ F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
245
+ \tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\
246
+ \tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e}
247
+ \end{array}\right.
248
+ \end{equation}
249
+ where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query
250
+ base. Eq.~(\ref{eq:ae86}) is a natural extension to the equation under affine
251
+ gap cost~\citep{Gotoh:1982aa,Altschul:1986aa}.
252
+
253
+ \subsubsection{The Suzuki-Kasahara formulation}
254
+
255
+ When we allow gaps longer than several hundred base pairs, nucleotide-level
256
+ alignment is much slower than chaining. SSE acceleration is critical to the
257
+ performance of minimap2. Traditional SSE implementations~\citep{Farrar:2007hs}
258
+ based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short
259
+ sequences, but only 4-way parallelization when the peak alignment score reaches
260
+ 32767. Long sequence alignment may exceed this threshold. Inspired by
261
+ \citet{Wu:1996aa} and the following work, \citet{Suzuki:2018aa} proposed a
262
+ difference-based formulation that lifted this limitation.
263
+ In case of 2-piece gap cost, define
264
+ \[
265
+ \left\{\begin{array}{ll}
266
+ u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\
267
+ x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-H_{ij} \\
268
+ y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-H_{ij}
269
+ \end{array}\right.
270
+ \]
271
+ We can transform Eq.~(\ref{eq:ae86}) to
272
+ \begin{equation}\label{eq:suzuki}
273
+ \left\{\begin{array}{lll}
274
+ z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\
275
+ &&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\
276
+ u_{ij}&=&z_{ij}-v_{i-1,j}\\
277
+ v_{ij}&=&z_{ij}-u_{i,j-1}\\
278
+ x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\
279
+ y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\
280
+ \tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\
281
+ \tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}
282
+ \end{array}\right.
283
+ \end{equation}
284
+ where $z_{ij}$ is a temporary variable that does not need to be stored.
285
+
286
+ An important property of Eq.~(\ref{eq:suzuki}) is that all values are bounded
287
+ by scoring parameters. To see that,
288
+ \[
289
+ x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e
290
+ \]
291
+ With $E_{ij}\le H_{ij}$, we have
292
+ \[
293
+ -q-e\le x_{ij}\le\max\{-q,0\}-e=-e
294
+ \]
295
+ and similar inequalities for $y_{ij}$, $\tilde{x}_{ij}$ and $\tilde{y}_{ij}$.
296
+ In addition,
297
+ \[
298
+ u_{ij}=z_{ij}-v_{i-1,j}\ge\max\{x_{i-1,j},\tilde{x}_{i-1,j}\}\ge-q-e
299
+ \]
300
+ As the maximum value of $z_{ij}=H_{ij}-H_{i-1,j-1}$ is $M$, the maximal
301
+ matching score, we can derive
302
+ \[
303
+ u_{ij}\le M-v_{i-1,j}\le M+q+e
304
+ \]
305
+ In conclusion, in Eq.~(\ref{eq:suzuki}), $x$ and $y$ are bounded by $[-q-e,-e]$,
306
+ $\tilde{x}$ and $\tilde{y}$ by $[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and
307
+ $v$ by $[-q-e,M+q+e]$. When $-128\le-q-e<M+q+e\le127$, each of them can be stored as
308
+ an 8-bit integer. This enables 16-way SSE vectorization regardless of the peak
309
+ score of the alignment.
310
+
311
+ For a more efficient SSE implementation, we transform the row-column coordinate
312
+ to the diagonal-antidiagonal coordinate by letting $r\gets i+j$ and $t\gets i$.
313
+ Eq.~(\ref{eq:suzuki}) becomes:
314
+ \begin{equation*}
315
+ \left\{\begin{array}{lll}
316
+ z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}\\
317
+ &&+u_{r-1,t},\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\
318
+ u_{rt}&=&z_{rt}-v_{r-1,t-1}\\
319
+ v_{rt}&=&z_{rt}-u_{r-1,t}\\
320
+ x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\
321
+ y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\
322
+ \tilde{x}_{rt}&=&\max\{0,\tilde{x}_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}\\
323
+ \tilde{y}_{rt}&=&\max\{0,\tilde{y}_{r-1,t}+u_{r-1,t}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}
324
+ \end{array}\right.
325
+ \end{equation*}
326
+ In this formulation, cells with the same anti-diagonal index $r$ are independent of
327
+ each other. This allows us to fully vectorize the computation of all cells on
328
+ the same anti-diagonal in one inner loop. It also simplifies banded alignment (500bp band width by default),
329
+ which would be difficult with striped vectorization~\citep{Farrar:2007hs}.
330
+
331
+ On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the initial
332
+ values in the diagonal-antidiagonal formulation are
333
+ \[
334
+ \left\{\begin{array}{l}
335
+ x_{r-1,-1}=y_{r-1,r}=-q-e\\
336
+ \tilde{x}_{r-1,-1}=\tilde{y}_{r-1,r}=-\tilde{q}-\tilde{e}\\
337
+ u_{r-1,r}=v_{r-1,-1}=\eta(r)\\
338
+ \end{array}\right.
339
+ \]
340
+ where
341
+ \[
342
+ \eta(r)=\left\{\begin{array}{ll}
343
+ -q-e & (r=0) \\
344
+ -e & (r<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
345
+ r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\
346
+ -\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)
347
+ \end{array}\right.
348
+ \]
349
+ These can be derived from the initial values for Eq.~(\ref{eq:ae86}).
350
+
351
+ When performing global alignment, we do not need to compute $H_{rt}$ in each cell.
352
+ We use 16-way vectorization throughout the alignment process. When extending
353
+ alignments from ends of chains, we need to find the cell $(r,t)$ where $H_{rt}$
354
+ reaches the maximum. We resort to 4-way vectorization to compute
355
+ $H_{rt}=H_{r-1,t}+v_{rt}$. Because this computation is simple,
356
+ Eq.~(\ref{eq:suzuki}) is still the dominant performance bottleneck.
357
+
358
+ In practice, our 16-way vectorized implementation of global alignment is three
359
+ times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa}. Without
360
+ banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, but with
361
+ a 1000bp band, it is considerably faster. When performing global alignment
362
+ between anchors, we expect the alignment to stay close to the diagonal of the
363
+ DP matrix. Banding is applicable most of the time.
364
+
365
+ \subsubsection{The Z-drop heuristic}
366
+
367
+ With global alignment, minimap2 may be forced to align unrelated sequences between
368
+ two adjacent anchors. To avoid such an artifact, we compute the cumulative
369
+ alignment score along the alignment path and break the alignment where the
370
+ score drops too fast in the diagonal direction. More precisely, let $S(i,j)$ be
371
+ the alignment score along the alignment path ending at cell $(i,j)$ in the DP
372
+ matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and
373
+ $j'<j$, such that
374
+ \[
375
+ S(i',j')-S(i,j)>Z+e\cdot|(i-i')-(j-j')|
376
+ \]
377
+ where $e$ is the gap extension cost and $Z$ is an arbitrary threshold.
378
+ This strategy is first used in BWA-MEM. It is similar to X-drop employed in
379
+ BLAST~\citep{Altschul:1997vn}, but unlike X-drop, it would not break the
380
+ alignment in the presence of a single long gap.
381
+
382
+ When minimap2 breaks a global alignment between two anchors, it performs local
383
+ alignment between the two subsequences involved in the global alignment, but
384
+ this time with one subsequence reverse complemented. This additional
385
+ alignment step may identify short inversions that are missed during chaining.
386
+
387
+ \subsubsection{Filtering out misplaced anchors}
388
+ Due to sequencing errors and local homology, some anchors in a chain may be
389
+ wrong. If we blindly align regions between two misplaced anchors, we will
390
+ produce a suboptimal alignment. To reduce this artifact, we filter out
391
+ anchors that lead to a $>$10bp insertion and a $>$10bp deletion at the same
392
+ time, and filter out terminal anchors that lead to a long gap towards the ends
393
+ of a chain. These heuristics greatly alleviate the issues with misplaced
394
+ anchors, but they are unable to fix all such errors. Local misalignment is a
395
+ limitation of minimap2 which we hope to address in the future.
396
+
397
+ \subsection{Aligning spliced sequences}
398
+
399
+ The algorithm described above can be adapted to spliced alignment. In this
400
+ mode, the chaining gap cost distinguishes insertions to and deletions from the
401
+ reference: $\gamma_c(l)$ in Eq.~(\ref{eq:chain-gap}) takes the form of
402
+ \[
403
+ \gamma_c(l)=\left\{\begin{array}{ll}
404
+ 0.01\cdot\bar{w}\cdot l+0.5\log_2 l & (l>0) \\
405
+ \min\{0.01\cdot\bar{w}\cdot|l|,\log_2|l|\} & (l<0)
406
+ \end{array}\right.
407
+ \]
408
+ Similarly, the gap cost function used for DP-based alignment is changed to
409
+ \[
410
+ \gamma_a(l)=\left\{\begin{array}{ll}
411
+ q+l\cdot e & (l>0) \\
412
+ \min\{q+|l|\cdot e,\tilde{q}\} & (l<0)
413
+ \end{array}\right.
414
+ \]
415
+ In alignment, a deletion no shorter than $\lceil(\tilde{q}-q)/e\rceil$ is
416
+ regarded as an intron, which pays no cost to gap extensions.
417
+
418
+ To pinpoint precise splicing junctions, minimap2 introduces reference-dependent
419
+ cost to penalize non-canonical splicing:
420
+ \begin{equation}\label{eq:splice}
421
+ \left\{\begin{array}{l}
422
+ H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij}-a(i)\}\\
423
+ E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\
424
+ F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
425
+ \tilde{E}_{i+1,j}= \max\{H_{ij}-d(i)-\tilde{q},\tilde{E}_{ij}\}\\
426
+ \end{array}\right.
427
+ \end{equation}
428
+ Let $T$ be the reference sequence. $d(i)$ is computed as
429
+ \[d(i)=\left\{\begin{array}{ll}
430
+ 0 & \mbox{if $T[i+1,i+3]$ is ${\tt GTA}$ or ${\tt GTG}$} \\
431
+ p/2 & \mbox{if $T[i+1,i+3]$ is ${\tt GTC}$ or ${\tt GTT}$} \\
432
+ p & \mbox{otherwise}
433
+ \end{array}\right.\]
434
+ where $T[i,j]$ extracts a substring of $T$ between $i$ and $j$ inclusively.
435
+ $d(i)$ penalizes non-canonical donor sites with $p$ and less frequent Eukaryotic
436
+ splicing signal ${\tt GT[C/T]}$ with $p/2$~\citep{Irimia:2008aa}. Similarly,
437
+ \[a(i)=\left\{\begin{array}{ll}
438
+ 0 & \mbox{if $T[i-2,i]$ is ${\tt CAG}$ or ${\tt TAG}$} \\
439
+ p/2 & \mbox{if $T[i-2,i]$ is ${\tt AAG}$ or ${\tt GAG}$} \\
440
+ p & \mbox{otherwise}
441
+ \end{array}\right.\]
442
+ models the acceptor signal. Eq.~(\ref{eq:splice}) is close to an equation in
443
+ \citet{Zhang:2006aa} except that we allow insertions immediately followed by
444
+ deletions and vice versa; in addition, we use the Suzuki-Kasahara diagonal
445
+ formulation in actual implementation.
446
+
447
+ If RNA-seq reads are not sequenced from stranded libraries, the read strand
448
+ relative to the underlying transcript is unknown. By default, minimap2 aligns
449
+ each chain twice, first assuming ${\tt GT}$--${\tt AG}$ as the splicing signal
450
+ and then assuming ${\tt CT}$--${\tt AC}$, the reverse complement of ${\tt
451
+ GT}$--${\tt AG}$, as the splicing signal. The alignment with a higher score is
452
+ taken as the final alignment. This procedure also infers the relative strand of
453
+ reads that span canonical splicing sites.
454
+
455
+ In the spliced alignment mode, minimap2 further increases the density of
456
+ minimizers and disables banded alignment. Together with the two-round DP-based
457
+ alignment, spliced alignment is several times slower than genomic DNA
458
+ alignment.
459
+
460
+ \subsection{Aligning short paired-end reads}
461
+
462
+ During chaining, minimap2 takes a pair of reads as one fragment with a gap of
463
+ unknown length in the middle. It applies a normal gap cost between seeds on the
464
+ same read but a more permissive gap cost between seeds on different reads.
465
+ More precisely, the gap cost during chaining is ($l\not=0$):
466
+ \[
467
+ \gamma_c(l)=\left\{\begin{array}{ll}
468
+ 0.01\cdot\bar{w}\cdot |l|+0.5\log_2 |l| & \mbox{if two seeds on the same read} \\
469
+ \min\{0.01\cdot\bar{w}\cdot|l|,\log_2|l|\} & \mbox{otherwise}
470
+ \end{array}\right.
471
+ \]
472
+ After identifying primary chains (Section~\ref{sec:primary}), we split each
473
+ fragment chain into two read chains and perform alignment for each read as in
474
+ Section~\ref{sec:genomic}. Finally, we pair hits of each read end to find
475
+ consistent paired-end alignments.
476
+
477
+ \end{methods}
478
+
479
+ \section{Results}
480
+
481
+ Minimap2 is implemented in the C programming language and comes with APIs in
482
+ both C and Python. It is distributed under the MIT license, free to both
483
+ commercial and academic uses. Minimap2 uses the same base algorithm for all
484
+ applications, but it has to apply different sets of parameters depending on
485
+ input data types. Similar to BWA-MEM, minimap2 introduces `presets' that
486
+ modify multiple parameters with a simple invocation. Detailed settings
487
+ and command-line options can be found in the minimap2 manpage. In addition to
488
+ the applications evaluated in the following sections, minimap2 also retains
489
+ minimap's functionality to find overlaps between long reads and to search
490
+ against large multi-species databases such as \emph{nt} from NCBI.
491
+
492
+ \subsection{Aligning long genomic reads}\label{sec:long-genomic}
493
+
494
+ \begin{figure}[!tb]
495
+ \centering
496
+ \includegraphics[width=.5\textwidth]{roc-color.pdf}
497
+ \caption{Evaluation on aligning simulated reads. Simulated reads were mapped
498
+ to the primary assembly of human genome GRCh38. A read is considered correctly
499
+ mapped if its longest alignment overlaps with the true interval, and the
500
+ overlap length is $\ge$10\% of the true interval length. Read alignments are
501
+ sorted by mapping quality in the descending order. For each mapping quality
502
+ threshold, the fraction of alignments (out of the number of input reads) with
503
+ mapping quality above the threshold and their error rate are
504
+ plotted along the curve. (a) long-read alignment evaluation. 33,088 $\ge$1000bp
505
+ reads were simulated using pbsim~\citep{Ono:2013aa} with error profile sampled
506
+ from file `m131017\_060208\_42213\_*.1.*' downloaded at
507
+ \href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is
508
+ 11,628. Aligners were run under the default setting for SMRT reads.
509
+ Kart outputted all alignments at mapping quality 60, so is not shown in the
510
+ figure. It mapped nearly all reads with 4.1\% of alignments being wrong, less
511
+ accurate than others. (b) short-read alignment evaluation. 10 million pairs of
512
+ 150bp reads were simulated using mason2~\citep{Holtgrewe:2010aa} with option
513
+ `\mbox{--illumina-prob-mismatch-scale 2.5}'. Short-read aligners were run under
514
+ the default setting except for changing the maximum fragment length to
515
+ 800bp.}\label{fig:eval}
516
+ \end{figure}
517
+
518
+ As a sanity check, we evaluated minimap2 on simulated human reads along with
519
+ BLASR~(v1.MC.rc64; \citealp{Chaisson:2012aa}),
520
+ BWA-MEM~(v0.7.15; \citealp{Li:2013aa}),
521
+ GraphMap~(v0.5.2; \citealp{Sovic:2016aa}),
522
+ Kart~(v2.2.5; \citealp{Lin:2017aa}),
523
+ minialign~(v0.5.3; \href{https://github.com/ocxtal/minialign}{https://github.com/ocxtal/minialign}) and
524
+ NGMLR~(v0.2.5; \citealp{Sedlazeck169557}). We excluded rHAT~\citep{Liu:2016ab}
525
+ and LAMSA~\citep{Liu:2017aa} because they either
526
+ crashed or produced malformatted output. In this evaluation, minimap2 has
527
+ higher power to distinguish unique and repetitive hits, and achieves overall
528
+ higher mapping accuracy (Fig.~\ref{fig:eval}a). Minimap2 and
529
+ NGMLR provide better mapping quality estimates: they rarely give repetitive hits
530
+ high mapping quality. Apparently, other aligners may
531
+ occasionally miss close suboptimal hits and be overconfident in wrong mappings.
532
+ On run time, minimap2 took 200 CPU seconds, comparable to minialign and Kart, and is over
533
+ 30 times faster than the rest. Minimap2 consumed 6.8GB memory at the peak,
534
+ more than BWA-MEM (5.4GB), similar to NGMLR and less than others.
535
+
536
+ On real human SMRT reads, the relative performance and fraction of mapped reads reported by
537
+ these aligners are broadly similar to the metrics on simulated data. We are
538
+ unable to provide a good estimate of mapping error rate due to the lack of the
539
+ truth. On ONT $\sim$100kb human reads~\citep{Jain128835}, BWA-MEM failed.
540
+ Kart, minialign and minimap2 are over 70 times faster than others. We have also
541
+ examined tens of $\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can
542
+ confirm the observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks
543
+ them into shorter gaps. The issue is much alleviated with minimap2, thanks
544
+ to the 2-piece affine gap cost.
545
+
546
+ \subsection{Aligning long spliced reads}
547
+
548
+ We evaluated minimap2 on SIRV control data~(AC:SRR5286959;
549
+ \citealp{Byrne:2017aa}) where the truth is known. Minimap2 predicted 59\,918
550
+ introns from 11\,018 reads. 93.8\% of splice junctions are precise. We examined
551
+ wrongly predicted junctions and found the majority were caused by clustered
552
+ splicing signals (e.g. two adjacent ${\tt GT}$ sites). When INDEL sequencing
553
+ errors are frequent, it is difficult to find precise splicing sites in this
554
+ case. If we allow up to 10bp distance from true splicing sites, 98.4\% of
555
+ aligned introns are approximately correct. It is worth noting that for SIRV, we
556
+ asked minimap2 to model the ${\tt GT..AG}$ splicing signal only without extra
557
+ bases. This is because SIRV does not honor the evolutionarily prevalent signal
558
+ ${\tt GT[A/G]..[C/T]AG}$~\citep{Irimia:2008aa}.
559
+
560
+ \begin{table}[!tb]
561
+ \processtable{Evaluation of junction accuracy on 2D ONT reads}
562
+ {\footnotesize\label{tab:intron}
563
+ \begin{tabular}{p{3.1cm}rrrr}
564
+ \toprule
565
+ & GMAP & minimap2 & SpAln & STAR\\
566
+ \midrule
567
+ Run time (CPU min) & 631 & 15.9 & 2\,076 & 33.9 \\
568
+ Peak RAM (GByte) & 8.9 & 14.5 & 3.2 & 29.2\vspace{1em}\\
569
+ \# aligned reads & 103\,669 & 104\,199 & 103\,711 & 26\,479 \\
570
+ \# chimeric alignments & 1\,904 & 1\,488 & 0 & 0 \\
571
+ \# non-spliced alignments & 15\,854 & 14\,798 & 17\,033 & 10\,545\vspace{1em}\\
572
+ \# aligned introns & 692\,275 & 693\,553 & 692\,945 & 78\,603 \\
573
+ \# novel introns & 11\,239 & 3\,113 & 8\,550 & 1\,214 \\
574
+ \% exact introns & 83.8\% & 94.0\% & 87.9\% & 55.2\% \\
575
+ \% approx. introns & 91.8\% & 96.9\% & 92.5\% & 82.4\% \\
576
+ \botrule
577
+ \end{tabular}
578
+ }{Mouse cDNA reads (AC:SRR5286960; R9.4 chemistry) were mapped to the primary assembly of mouse
579
+ genome GRCm38 with the following tools and command options: minimap2 (`-ax
580
+ splice'); GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS
581
+ -S3'); STARlong (according to
582
+ \href{http://bit.ly/star-pb}{http://bit.ly/star-pb}). The alignments were
583
+ compared to the EnsEMBL gene annotation, release 89. A predicted intron
584
+ is \emph{novel} if it has no overlaps with any annotated introns. An intron
585
+ is \emph{exact} if it is identical to an annotated intron. An intron is
586
+ \emph{approximate} if both its 5'- and 3'-end are within 10bp around the ends
587
+ of an annotated intron. Chimeric alignments are defined in the SAM spec~\citep{Li:2009ys}.}
588
+ \end{table}
589
+
590
+ We next aligned real mouse reads~\citep{Byrne:2017aa} with GMAP~(v2017-06-20;
591
+ \citealp{Wu:2005vn}), minimap2, SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and
592
+ STAR~(v2.5.3a; \citealp{Dobin:2013kx}). In general, minimap2 is more
593
+ consistent with existing annotations (Table~\ref{tab:intron}): it finds
594
+ more junctions with a higher percentage being exactly or approximately correct.
595
+ Minimap2 is over 40 times faster than GMAP and SpAln. While STAR is close to
596
+ minimap2 in speed, it does not work well with noisy reads.
597
+
598
+ We have also evaluated spliced aligners on a human Nanopore Direct RNA-seq
599
+ dataset (\href{http://bit.ly/na12878ont}{http://bit.ly/na12878ont}). Minimap2
600
+ aligned 10 million reads in $<$1 wall-clock hour using 16 CPU cores. 94.2\% of
601
+ aligned splice junctions are consistent with gene annotations. In comparison,
602
+ GMAP under option `-k 14 -n 0 --min-intronlength 30 --cross-species' is 160
603
+ times slower; 68.7\% of GMAP junctions are found in known gene annotations. The
604
+ percentage increases to 84.1\% if an aligned junction within 10bp from an
605
+ annotated junction is considered to be correct. On a public Iso-Seq dataset
606
+ (human Alzheimer brain from
607
+ \href{http://bit.ly/isoseqpub}{http://bit.ly/isoseqpub}), minimap2 is also
608
+ faster at higher junction accuracy in comparison to other aligners in
609
+ Table~\ref{tab:intron}.
610
+
611
+ We noted that GMAP and SpAln have not been optimized for noisy reads. We are
612
+ showing the best setting we have experimented with, but their developers should be
613
+ able to improve their accuracy further.
614
+
615
+ %\begin{table}[!tb]
616
+ %\processtable{Evaluation of junction accuracy on SMRT Iso-Seq reads}
617
+ %{\footnotesize
618
+ %\begin{tabular}{lrrrr}
619
+ %\toprule
620
+ % & GMAP & minimap2 & SpAln & STAR \\ % one GMAP thread took 14 days to align a tiny fraction of reads
621
+ %\midrule
622
+ %Run time (CPU min) & - & 243 & 2,352 & 1,647 \\
623
+ %\# aligned reads & 1,113,502 & 1,123,025 & 1,094,092 & 682,452 \\
624
+ %\# chimeric alignments & 48,927 & 33,091 & 0 & 0 \\
625
+ %\# non-spliced alignments & 334,097 & 339,081 & 291,447 & 272,536 \vspace{1em}\\
626
+ %\# aligned introns & 8,922,221 & 9,071,755 & 9,208,564 & 3,029,121 \\
627
+ %\# novel introns & 48,927 & 42,773 & 82,230 & 17,791 \\
628
+ %\% exact introns & 90.6\% & 94.9\% & 91.7\% & 84.7\% \\
629
+ %\% approx. introns & 94.0\% & 96.9\% & 93.4\% & 93.8\% \\
630
+ %\botrule
631
+ %\end{tabular}
632
+ %}{}
633
+ %\end{table}
634
+
635
+ \subsection{Aligning short genomic reads}
636
+
637
+ We evaluated minimap2 along with Bowtie2~(v2.3.3; \citealt{Langmead:2012fk}), BWA-MEM and
638
+ SNAP (v1.0beta23; \citealt{Zaharia:2011aa}). Minimap2 is 3--4 times as fast as Bowtie2 and
639
+ BWA-MEM, but is 1.3 times slower than SNAP. Minimap2 is more accurate on this
640
+ simulated data set than Bowtie2 and SNAP but less accurate than BWA-MEM
641
+ (Fig.~\ref{fig:eval}b). Closer investigation reveals that BWA-MEM achieves
642
+ a higher accuracy partly because it tries to locally align a read in a small
643
+ region close to its mate. If we disable this feature, BWA-MEM becomes slightly
644
+ less accurate than minimap2. We might implement a similar heuristic
645
+ in minimap2 in the future.
646
+
647
+ To evaluate the accuracy of minimap2 on real data, we aligned human reads
648
+ (AC:ERR1341796) with BWA-MEM and minimap2, and called SNPs and small INDELs
649
+ with GATK HaplotypeCaller v3.5~\citep{Depristo:2011vn}. This run was sequenced
650
+ from experimentally mixed CHM1 and CHM13 cell lines. Both of them are homozygous
651
+ across the whole genome and have been \emph{de novo} assembled with SMRT reads
652
+ to high quality. This allowed us to construct an independent truth variant
653
+ dataset~\citep{Li223297} for
654
+ ERR1341796. In this evaluation, minimap2 has higher SNP false negative rate
655
+ (FNR; 2.6\% of minimap2 vs 2.3\% of BWA-MEM), but fewer false positive SNPs per
656
+ million bases (FPPM; 7.0 vs 8.8), similar INDEL FNR (11.2\% vs 11.3\%) and
657
+ similar INDEL FPPM (6.4 vs 6.5). Minimap2 is broadly comparable to BWA-MEM in the
658
+ context of small variant calling.
659
+
660
+ \subsection{Aligning long-read assemblies}
661
+
662
+ Minimap2 can align a SMRT assembly (AC:GCA\_001297185.1) against GRCh38 in 7
663
+ minutes using 8 CPU cores, over 20 times faster than nucmer from
664
+ MUMmer4~\citep{Marcais:2018aa}. With the paftools.js script from the minimap2
665
+ package, we called 2.67 million single-base substitutions out of 2.78Gbp
666
+ genomic regions. The transition-to-transversion ratio (ts/tv) is 2.01. In
667
+ comparison, using MUMmer4's dnadiff pipeline, we called 2.86 million
668
+ substitutions in 2.83Gbp at ts/tv=1.87. Given that ts/tv averaged across the
669
+ human genome is about 2 but ts/tv averaged over random errors is 0.5, the
670
+ minimap2 callset arguably has higher precision at lower sensitivity.
671
+
672
+ The sample being assembled is a female. Minimap2 still called 201 substitutions
673
+ on the Y chromosome. These substitutions all come from one contig aligned at
674
+ 96.8\% sequence identity. The contig could be a segmental duplication
675
+ absent from GRCh38. In contrast, dnadiff called 9070 substitutions on the Y
676
+ chromosome across 73 SMRT contigs. This again implies our minimap2-based
677
+ pipeline has higher precision.
678
+
679
+ \section{Discussions}
680
+
681
+ Minimap2 is a versatile mapper and pairwise aligner for nucleotide sequences.
682
+ It works with short reads, assembly contigs and long noisy genomic and RNA-seq
683
+ reads, and can be used as a read mapper, long-read overlapper or a full-genome
684
+ aligner. Minimap2 is also accurate and efficient, often outperforming other
685
+ domain-specific alignment tools in terms of both speed and accuracy.
686
+
687
+ The capability of minimap2 comes from a fast base-level alignment algorithm and
688
+ an accurate chaining algorithm. When aligning long query sequences, base-level
689
+ alignment is often the performance bottleneck. The Suzuki-Kasahara algorithm
690
+ greatly alleviates the bottleneck and enables DP-based splice alignment
691
+ involving $>$100kb introns, which was impractically slow ten years ago. The
692
+ minimap2 chaining algorithm is fast and highly accurate by itself. In fact,
693
+ chaining alone is more accurate than all the other long-read mappers in
694
+ Fig.~\ref{fig:eval}a (data not shown). This accuracy helps to reduce downstream
695
+ base-level alignment of candidate chains, which is still several times slower than
696
+ chaining even with the Suzuki-Kasahara improvement. In addition, taking a
697
+ general form, minimap2 chaining can be adapted to non-typical data types such as
698
+ spliced reads and multiple reads per fragment. This gives us the opportunity to
699
+ extend the same base algorithm to a variety of use cases.
700
+
701
+ Modern mainstream aligners often use a full-text index, such as suffix array or
702
+ FM-index, to index reference sequences. An advantage of this approach is that
703
+ we can use exact seeds of arbitrary lengths, which helps to increase seed
704
+ uniqueness and reduce unsuccessful extensions. Minimap2 indexes reference
705
+ k-mers with a hash table instead. Such fixed-length seeds are inferior to
706
+ variable-length seeds in theory, but can be computed much more efficiently in
707
+ practice. When a query sequence has multiple seed hits, we can afford to skip
708
+ highly repetitive seeds without affecting the final accuracy. This further
709
+ alleviates the concern with the seeding uniqueness. At the same time, at low
710
+ sequence identity, it is rare to see long seeds anyway. Hash table is the ideal
711
+ data structure for mapping long noisy sequences.
712
+
713
+ \section*{Acknowledgements}
714
+ We owe a debt of gratitude to H. Suzuki and M. Kasahara for releasing their
715
+ masterpiece and insightful notes before formal publication. We thank M.
716
+ Schatz, P. Rescheneder and F. Sedlazeck for pointing out the limitation of
717
+ BWA-MEM. We are also grateful to minimap2 users who have greatly helped to
718
+ suggest features and to fix various issues.
719
+
720
+ \paragraph{Funding\textcolon} NHGRI 1R01HG010040-01
721
+
722
+ \bibliography{minimap2}
723
+
724
+ \end{document}