minimap2 0.2.22.0 → 0.2.24.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,725 @@
1
+ .TH minimap2 1 "18 December 2021" "minimap2-2.24 (r1122)" "Bioinformatics tools"
2
+ .SH NAME
3
+ .PP
4
+ minimap2 - mapping and alignment between collections of DNA sequences
5
+ .SH SYNOPSIS
6
+ * Indexing the target sequences (optional):
7
+ .RS 4
8
+ minimap2
9
+ .RB [ -x
10
+ .IR preset ]
11
+ .B -d
12
+ .I target.mmi
13
+ .I target.fa
14
+ .br
15
+ minimap2
16
+ .RB [ -H ]
17
+ .RB [ -k
18
+ .IR kmer ]
19
+ .RB [ -w
20
+ .IR miniWinSize ]
21
+ .RB [ -I
22
+ .IR batchSize ]
23
+ .B -d
24
+ .I target.mmi
25
+ .I target.fa
26
+ .RE
27
+
28
+ * Long-read alignment with CIGAR:
29
+ .RS 4
30
+ minimap2
31
+ .B -a
32
+ .RB [ -x
33
+ .IR preset ]
34
+ .I target.mmi
35
+ .I query.fa
36
+ >
37
+ .I output.sam
38
+ .br
39
+ minimap2
40
+ .B -c
41
+ .RB [ -H ]
42
+ .RB [ -k
43
+ .IR kmer ]
44
+ .RB [ -w
45
+ .IR miniWinSize ]
46
+ .RB [ ... ]
47
+ .I target.fa
48
+ .I query.fa
49
+ >
50
+ .I output.paf
51
+ .RE
52
+
53
+ * Long-read overlap without CIGAR:
54
+ .RS 4
55
+ minimap2
56
+ .B -x
57
+ ava-ont
58
+ .RB [ -t
59
+ .IR nThreads ]
60
+ .I target.fa
61
+ .I query.fa
62
+ >
63
+ .I output.paf
64
+ .RE
65
+ .SH DESCRIPTION
66
+ .PP
67
+ Minimap2 is a fast sequence mapping and alignment program that can find
68
+ overlaps between long noisy reads, or map long reads or their assemblies to a
69
+ reference genome optionally with detailed alignment (i.e. CIGAR). At present,
70
+ it works efficiently with query sequences from a few kilobases to ~100
71
+ megabases in length at an error rate ~15%. Minimap2 outputs in the PAF or the
72
+ SAM format.
73
+ .SH OPTIONS
74
+ .SS Indexing options
75
+ .TP 10
76
+ .BI -k \ INT
77
+ Minimizer k-mer length [15]
78
+ .TP
79
+ .BI -w \ INT
80
+ Minimizer window size [10]. A minimizer is the smallest k-mer
81
+ in a window of w consecutive k-mers.
82
+ .TP
83
+ .B -H
84
+ Use homopolymer-compressed (HPC) minimizers. An HPC sequence is constructed by
85
+ contracting homopolymer runs to a single base. An HPC minimizer is a minimizer
86
+ on the HPC sequence.
87
+ .TP
88
+ .BI -I \ NUM
89
+ Load at most
90
+ .I NUM
91
+ target bases into RAM for indexing [4G]. If there are more than
92
+ .I NUM
93
+ bases in
94
+ .IR target.fa ,
95
+ minimap2 needs to read
96
+ .I query.fa
97
+ multiple times to map it against each batch of target sequences.
98
+ .I NUM
99
+ may end with k/K/m/M/g/G. NB: mapping quality is incorrect given a
100
+ multi-part index.
101
+ .TP
102
+ .B --idx-no-seq
103
+ Don't store target sequences in the index. It saves disk space and memory but
104
+ the index generated with this option will not work with
105
+ .B -a
106
+ or
107
+ .BR -c .
108
+ When base-level alignment is not requested, this option is automatically applied.
109
+ .TP
110
+ .BI -d \ FILE
111
+ Save the minimizer index of
112
+ .I target.fa
113
+ to
114
+ .I FILE
115
+ [no dump]. Minimap2 indexing is fast. It can index the human genome in a couple
116
+ of minutes. If even shorter startup time is desired, use this option to save
117
+ the index. Indexing options are fixed in the index file. When an index file is
118
+ provided as the target sequences, options
119
+ .BR -H ,
120
+ .BR -k ,
121
+ .BR -w ,
122
+ .B -I
123
+ will be effectively overridden by the options stored in the index file.
124
+ .TP
125
+ .BI --alt \ FILE
126
+ List of ALT contigs [null]
127
+ .TP
128
+ .BI --alt-drop \ FLOAT
129
+ Drop ALT hits by
130
+ .I FLOAT
131
+ fraction when ranking and computing mapping quality [0.15]
132
+ .SS Mapping options
133
+ .TP 10
134
+ .BI -f \ FLOAT | INT1 [, INT2 ]
135
+ If fraction, ignore top
136
+ .I FLOAT
137
+ fraction of most frequent minimizers [0.0002]. If integer,
138
+ ignore minimizers occurring more than
139
+ .I INT1
140
+ times.
141
+ .I INT2
142
+ is only effective in the
143
+ .B --sr
144
+ or
145
+ .B -xsr
146
+ mode, which sets the threshold for a second round of seeding.
147
+ .TP
148
+ .BI -U \ INT1 [, INT2 ]
149
+ Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurrence threshold is
150
+ .RI max{ INT1 ,\ min{ INT2 ,
151
+ .BR -f }}.
152
+ This option prevents excessively small or large
153
+ .B -f
154
+ estimated from the input reference. Available since r1034 and deprecating
155
+ .B --min-occ-floor
156
+ in earlier versions of minimap2.
157
+ .TP
158
+ .BI --q-occ-frac \ FLOAT
159
+ Discard a query minimizer if its occurrence is higher than
160
+ .I FLOAT
161
+ fraction of query minimizers and higher than the reference occurrence threshold
162
+ [0.01]. Set 0 to disable. Available since r1105.
163
+ .TP
164
+ .BI -e \ INT
165
+ Sample a high-frequency minimizer every
166
+ .I INT
167
+ basepairs [500].
168
+ .TP
169
+ .BI -g \ NUM
170
+ Stop chain elongation if there are no minimizers within
171
+ .IR NUM -bp
172
+ [10k].
173
+ .TP
174
+ .BI -r \ NUM1 [, NUM2 ]
175
+ Bandwidth for chaining and base alignment [500,20k].
176
+ .I NUM1
177
+ is used for initial chaining and alignment extension;
178
+ .I NUM2
179
+ for RMQ-based re-chaining and closing gaps in alignments.
180
+ .TP
181
+ .BI -n \ INT
182
+ Discard chains consisting of
183
+ .RI < INT
184
+ number of minimizers [3]
185
+ .TP
186
+ .BI -m \ INT
187
+ Discard chains with chaining score
188
+ .RI < INT
189
+ [40]. Chaining score equals the approximate number of matching bases minus a
190
+ concave gap penalty. It is computed with dynamic programming.
191
+ .TP
192
+ .B -D
193
+ If query sequence name/length are identical to the target name/length, ignore
194
+ diagonal anchors. This option also reduces DP-based extension along the
195
+ diagonal.
196
+ .TP
197
+ .B -P
198
+ Retain all chains and don't attempt to set primary chains. Options
199
+ .B -p
200
+ and
201
+ .B -N
202
+ have no effect when this option is in use.
203
+ .TP
204
+ .BR --dual = yes | no
205
+ If
206
+ .BR no ,
207
+ skip query-target pairs wherein the query name is lexicographically greater
208
+ than the target name [yes]
209
+ .TP
210
+ .B -X
211
+ Equivalent to
212
+ .RB ' -DP
213
+ .BR --dual = no
214
+ .BR --no-long-join '.
215
+ Primarily used for all-vs-all read overlapping.
216
+ .TP
217
+ .BI -p \ FLOAT
218
+ Minimal secondary-to-primary score ratio to output secondary mappings [0.8].
219
+ Between two chains overlapping over half of the shorter chain (controlled by
220
+ .BR -M ),
221
+ the chain with a lower score is secondary to the chain with a higher score.
222
+ If the ratio of the scores is below
223
+ .IR FLOAT ,
224
+ the secondary chain will not be outputted or extended with DP alignment later.
225
+ This option has no effect when
226
+ .B -X
227
+ is applied.
228
+ .TP
229
+ .BI -N \ INT
230
+ Output at most
231
+ .I INT
232
+ secondary alignments [5]. This option has no effect when
233
+ .B -X
234
+ is applied.
235
+ .TP
236
+ .BI -G \ NUM
237
+ Maximum gap on the reference (effective with
238
+ .BR -xsplice / --splice ).
239
+ This option also changes the chaining and alignment band width to
240
+ .IR NUM .
241
+ Increasing this option slows down spliced alignment. [200k]
242
+ .TP
243
+ .BI -F \ NUM
244
+ Maximum fragment length (aka insert size; effective with
245
+ .BR -xsr / --frag = yes )
246
+ [800]
247
+ .TP
248
+ .BI -M \ FLOAT
249
+ Mark as secondary a chain that overlaps with a better chain by
250
+ .I FLOAT
251
+ or more of the shorter chain [0.5]
252
+ .TP
253
+ .BR --rmq = no | yes
254
+ Use the minigraph chaining algorithm [no]. The minigraph algorithm is better
255
+ for aligning contigs through long INDELs.
256
+ .TP
257
+ .B --hard-mask-level
258
+ Honor option
259
+ .B -M
260
+ and disable a heuristic to save unmapped subsequences; also disable
261
+ .BR --mask-len .
262
+ .TP
263
+ .BI --mask-len \ NUM
264
+ Keep an alignment if dropping it leaves an unaligned region on query longer than
265
+ .I NUM
266
+ [inf]. Effective without
267
+ .BR --hard-mask-level .
268
+ .TP
269
+ .BI --max-chain-skip \ INT
270
+ A heuristic that stops chaining early [25]. Minimap2 uses dynamic programming
271
+ for chaining. The time complexity is quadratic in the number of seeds. This
272
+ option makes minimap2 exit the inner loop if it repeatedly sees seeds already
273
+ on chains. Set
274
+ .I INT
275
+ to a large number to switch off this heuristic.
276
+ .TP
277
+ .BI --max-chain-iter \ INT
278
+ Check up to
279
+ .I INT
280
+ partial chains during chaining [5000]. This is a heuristic to avoid quadratic
281
+ time complexity in the worst case.
282
+ .TP
283
+ .BI --chain-gap-scale \ FLOAT
284
+ Scale of gap cost during chaining [1.0]
285
+ .TP
286
+ .B --no-long-join
287
+ Disable the long gap patching heuristic. When this option is applied, the
288
+ maximum alignment gap is mostly controlled by
289
+ .BR -r .
290
+ .TP
291
+ .B --splice
292
+ Enable the splice alignment mode.
293
+ .TP
294
+ .B --sr
295
+ Enable short-read alignment heuristics. In the short-read mode, minimap2
296
+ applies a second round of chaining with a higher minimizer occurrence threshold
297
+ if no good chain is found. In addition, minimap2 attempts to patch gaps between
298
+ seeds with ungapped alignment.
299
+ .TP
300
+ .BI --split-prefix \ STR
301
+ Prefix to create temporary files. Typically used for a multi-part index.
302
+ .TP
303
+ .BR --frag = no | yes
304
+ Whether to enable the fragment mode [no]
305
+ .TP
306
+ .B --for-only
307
+ Only map to the forward strand of the reference sequences. For paired-end
308
+ reads in the forward-reverse orientation, the first read is mapped to forward
309
+ strand of the reference and the second read to the reverse strand.
310
+ .TP
311
+ .B --rev-only
312
+ Only map to the reverse complement strand of the reference sequences.
313
+ .TP
314
+ .BR --heap-sort = no | yes
315
+ If yes, sort anchors with heap merge, instead of radix sort. Heap merge is
316
+ faster for short reads, but slower for long reads. [no]
317
+ .TP
318
+ .B --no-pairing
319
+ Treat two reads in a pair as independent reads. The mate related fields in SAM
320
+ are still properly populated.
321
+ .TP
322
+ .B --no-hash-name
323
+ Produce the same alignment for identical sequences regardless of their sequence names.
324
+ .SS Alignment options
325
+ .TP 10
326
+ .BI -A \ INT
327
+ Matching score [2]
328
+ .TP
329
+ .BI -B \ INT
330
+ Mismatching penalty [4]
331
+ .TP
332
+ .BI -O \ INT1[,INT2]
333
+ Gap open penalty [4,24]. If
334
+ .I INT2
335
+ is not specified, it is set to
336
+ .IR INT1 .
337
+ .TP
338
+ .BI -E \ INT1[,INT2]
339
+ Gap extension penalty [2,1]. A gap of length
340
+ .I k
341
+ costs
342
+ .RI min{ O1 + k * E1 , O2 + k * E2 }.
343
+ In the splice mode, the second gap penalties are not used.
344
+ .TP
345
+ .BI -C \ INT
346
+ Cost for a non-canonical GT-AG splicing (effective with
347
+ .BR --splice )
348
+ [0]
349
+ .TP
350
+ .BI -z \ INT1[,INT2]
351
+ Truncate an alignment if the running alignment score drops too quickly along
352
+ the diagonal of the DP matrix (diagonal X-drop, or Z-drop) [400,200]. If the
353
+ drop of score is above
354
+ .IR INT2 ,
355
+ minimap2 will reverse complement the query in the related region and align
356
+ again to test small inversions. Minimap2 truncates alignment if there is an
357
+ inversion or the drop of score is greater than
358
+ .IR INT1 .
359
+ Decrease
360
+ .I INT2
361
+ to find small inversions at the cost of performance and false positives.
362
+ Increase
363
+ .I INT1
364
+ to improve the contiguity of alignment at the cost of poor alignment in the
365
+ middle.
366
+ .TP
367
+ .BI -s \ INT
368
+ Minimal peak DP alignment score to output [40]. The peak score is computed from
369
+ the final CIGAR. It is the score of the max scoring segment in the alignment
370
+ and may be different from the total alignment score.
371
+ .TP
372
+ .BI -u \ CHAR
373
+ How to find canonical splicing sites GT-AG -
374
+ .BR f :
375
+ transcript strand;
376
+ .BR b :
377
+ both strands;
378
+ .BR n :
379
+ no attempt to match GT-AG [n]
380
+ .TP
381
+ .BI --end-bonus \ INT
382
+ Score bonus when alignment extends to the end of the query sequence [0].
383
+ .TP
384
+ .BI --score-N \ INT
385
+ Score of a mismatch involving ambiguous bases [1].
386
+ .TP
387
+ .BR --splice-flank = yes | no
388
+ Assume the next base to a
389
+ .B GT
390
+ donor site tends to be A/G (91% in human and 92% in mouse) and the preceding
391
+ base to an
392
+ .B AG
393
+ acceptor tends to be C/T [no].
394
+ This trend is evolutionarily conservative, all the way to S. cerevisiae
395
+ (PMID:18688272). Specifying this option generally leads to higher junction
396
+ accuracy by several percents, so it is applied by default with
397
+ .BR --splice .
398
+ However, the SIRV control does not honor this trend
399
+ (only ~60%). This option reduces accuracy. If you are benchmarking minimap2
400
+ on SIRV data, please add
401
+ .B --splice-flank=no
402
+ to the command line.
403
+ .TP
404
+ .BI --junc-bed \ FILE
405
+ Gene annotations in the BED12 format (aka 12-column BED), or intron positions
406
+ in 5-column BED. With this option, minimap2 prefers splicing in annotations.
407
+ BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'
408
+ [].
409
+ .TP
410
+ .BI --junc-bonus \ INT
411
+ Score bonus for a splice donor or acceptor found in annotation (effective with
412
+ .BR --junc-bed )
413
+ [9].
414
+ .TP
415
+ .BI --end-seed-pen \ INT
416
+ Drop a terminal anchor if
417
+ .IR s <log( g )+ INT ,
418
+ where
419
+ .I s
420
+ is the local alignment score around the anchor and
421
+ .I g
422
+ the length of the terminal gap in the chain. This option is only effective
423
+ with
424
+ .BR --splice .
425
+ It helps to avoid tiny terminal exons. [6]
426
+ .TP
427
+ .B --no-end-flt
428
+ Don't filter seeds towards the ends of chains before performing base-level
429
+ alignment.
430
+ .TP
431
+ .BI --cap-sw-mem \ NUM
432
+ Skip alignment if the DP matrix size is above
433
+ .IR NUM .
434
+ Set 0 to disable [100m].
435
+ .TP
436
+ .BI --cap-kalloc \ NUM
437
+ Free thread-local kalloc memory reservoir if after the alignment the size of the reservoir is above
438
+ .IR NUM .
439
+ Set 0 to disable [0].
440
+ .SS Input/output options
441
+ .TP 10
442
+ .B -a
443
+ Generate CIGAR and output alignments in the SAM format. Minimap2 outputs in PAF
444
+ by default.
445
+ .TP
446
+ .BI -o \ FILE
447
+ Output alignments to
448
+ .I FILE
449
+ [stdout].
450
+ .TP
451
+ .B -Q
452
+ Ignore base quality in the input file.
453
+ .TP
454
+ .B -L
455
+ Write CIGAR with >65535 operators at the CG tag. Older tools are unable to
456
+ convert alignments with >65535 CIGAR ops to BAM. This option makes minimap2 SAM
457
+ compatible with older tools. Newer tools recognize this tag and reconstruct
458
+ the real CIGAR in memory.
459
+ .TP
460
+ .BI -R \ STR
461
+ SAM read group line in a format like
462
+ .B @RG\\\\tID:foo\\\\tSM:bar
463
+ [].
464
+ .TP
465
+ .B -y
466
+ Copy input FASTA/Q comments to output.
467
+ .TP
468
+ .B -c
469
+ Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
470
+ .TP
471
+ .BI --cs[= STR ]
472
+ Output the
473
+ .B cs
474
+ tag.
475
+ .I STR
476
+ can be either
477
+ .I short
478
+ or
479
+ .IR long .
480
+ If no
481
+ .I STR
482
+ is given,
483
+ .I short
484
+ is assumed. [none]
485
+ .TP
486
+ .B --MD
487
+ Output the MD tag (see the SAM spec).
488
+ .TP
489
+ .B --eqx
490
+ Output =/X CIGAR operators for sequence match/mismatch.
491
+ .TP
492
+ .B -Y
493
+ In SAM output, use soft clipping for supplementary alignments.
494
+ .TP
495
+ .BI --seed \ INT
496
+ Integer seed for randomizing equally best hits. Minimap2 hashes
497
+ .I INT
498
+ and read name when choosing between equally best hits. [11]
499
+ .TP
500
+ .BI -t \ INT
501
+ Number of threads [3]. Minimap2 uses at most three threads when indexing target
502
+ sequences, and uses up to
503
+ .IR INT +1
504
+ threads when mapping (the extra thread is for I/O, which is frequently idle and
505
+ takes little CPU time).
506
+ .TP
507
+ .B -2
508
+ Use two I/O threads during mapping. By default, minimap2 uses one I/O thread.
509
+ When I/O is slow (e.g. piping to gzip, or reading from a slow pipe), the I/O
510
+ thread may become the bottleneck. Apply this option to use one thread for input
511
+ and another thread for output, at the cost of increased peak RAM.
512
+ .TP
513
+ .BI -K \ NUM
514
+ Number of bases loaded into memory to process in a mini-batch [500M].
515
+ Similar to option
516
+ .BR -I ,
517
+ K/M/G/k/m/g suffix is accepted. A large
518
+ .I NUM
519
+ helps load balancing in the multi-threading mode, at the cost of increased
520
+ memory.
521
+ .TP
522
+ .BR --secondary = yes | no
523
+ Whether to output secondary alignments [yes]
524
+ .TP
525
+ .BI --max-qlen \ NUM
526
+ Filter out query sequences longer than
527
+ .IR NUM .
528
+ .TP
529
+ .B --paf-no-hit
530
+ In PAF, output unmapped queries; the strand and the reference name fields are
531
+ set to `*'. Warning: some paftools.js commands may not work with such output
532
+ for the moment.
533
+ .TP
534
+ .B --sam-hit-only
535
+ In SAM, don't output unmapped reads.
536
+ .TP
537
+ .B --version
538
+ Print version number to stdout
539
+ .SS Preset options
540
+ .TP 10
541
+ .BI -x \ STR
542
+ Preset []. This option applies multiple options at the same time. It should be
543
+ applied before other options because options applied later will overwrite the
544
+ values set by
545
+ .BR -x .
546
+ Available
547
+ .I STR
548
+ are:
549
+ .RS
550
+ .TP 10
551
+ .B map-ont
552
+ Align noisy long reads of ~10% error rate to a reference genome. This is the
553
+ default mode.
554
+ .TP
555
+ .B map-hifi
556
+ Align PacBio high-fidelity (HiFi) reads to a reference genome
557
+ .RB ( -k19
558
+ .B -w19 -U50,500 -g10k -A1 -B4 -O6,26 -E2,1
559
+ .BR -s200 ).
560
+ .TP
561
+ .B map-pb
562
+ Align older PacBio continuous long (CLR) reads to a reference genome
563
+ .RB ( -Hk19 ).
564
+ .TP
565
+ .B asm5
566
+ Long assembly to reference mapping
567
+ .RB ( -k19
568
+ .B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200
569
+ .BR -N50 ).
570
+ Typically, the alignment will not extend to regions with 5% or higher sequence
571
+ divergence. Only use this preset if the average divergence is far below 5%.
572
+ .TP
573
+ .B asm10
574
+ Long assembly to reference mapping
575
+ .RB ( -k19
576
+ .B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200
577
+ .BR -N50 ).
578
+ Up to 10% sequence divergence.
579
+ .TP
580
+ .B asm20
581
+ Long assembly to reference mapping
582
+ .RB ( -k19
583
+ .B -w10 -U50,500 --rmq -r1k,100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200
584
+ .BR -N50 ).
585
+ Up to 20% sequence divergence.
586
+ .TP
587
+ .B splice
588
+ Long-read spliced alignment
589
+ .RB ( -k15
590
+ .B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -b0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
591
+ .BR --splice-flank=yes ).
592
+ In the splice mode, 1) long deletions are taken as introns and represented as
593
+ the
594
+ .RB ` N '
595
+ CIGAR operator; 2) long insertions are disabled; 3) deletion and insertion gap
596
+ costs are different during chaining; 4) the computation of the
597
+ .RB ` ms '
598
+ tag ignores introns to demote hits to pseudogenes.
599
+ .TP
600
+ .B splice:hq
601
+ Long-read splice alignment for PacBio CCS reads
602
+ .RB ( -xsplice
603
+ .B -C5 -O6,24
604
+ .BR -B4 ).
605
+ .TP
606
+ .B sr
607
+ Short single-end reads without splicing
608
+ .RB ( -k21
609
+ .B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m25
610
+ .B -s40 -g100 -2K50m --heap-sort=yes
611
+ .BR --secondary=no ).
612
+ .TP
613
+ .B ava-pb
614
+ PacBio CLR all-vs-all overlap mapping
615
+ .RB ( -Hk19
616
+ .B -Xw5 -e0
617
+ .BR -m100 ).
618
+ .TP
619
+ .B ava-ont
620
+ Oxford Nanopore all-vs-all overlap mapping
621
+ .RB ( -k15
622
+ .B -Xw5 -e0 -m100
623
+ .BR -r2k ).
624
+ .RE
625
+ .SS Miscellaneous options
626
+ .TP 10
627
+ .B --no-kalloc
628
+ Use the libc default allocator instead of the kalloc thread-local allocator.
629
+ This debugging option is mostly used with Valgrind to detect invalid memory
630
+ accesses. Minimap2 runs slower with this option, especially in the
631
+ multi-threading mode.
632
+ .TP
633
+ .B --print-qname
634
+ Print query names to stderr, mostly to see which query is crashing minimap2.
635
+ .TP
636
+ .B --print-seeds
637
+ Print seed positions to stderr, for debugging only.
638
+ .SH OUTPUT FORMAT
639
+ .PP
640
+ Minimap2 outputs mapping positions in the Pairwise mApping Format (PAF) by
641
+ default. PAF is a TAB-delimited text format with each line consisting of at
642
+ least 12 fields as are described in the following table:
643
+ .TS
644
+ center box;
645
+ cb | cb | cb
646
+ r | c | l .
647
+ Col Type Description
648
+ _
649
+ 1 string Query sequence name
650
+ 2 int Query sequence length
651
+ 3 int Query start coordinate (0-based)
652
+ 4 int Query end coordinate (0-based)
653
+ 5 char `+' if query/target on the same strand; `-' if opposite
654
+ 6 string Target sequence name
655
+ 7 int Target sequence length
656
+ 8 int Target start coordinate on the original strand
657
+ 9 int Target end coordinate on the original strand
658
+ 10 int Number of matching bases in the mapping
659
+ 11 int Number of bases, including gaps, in the mapping
660
+ 12 int Mapping quality (0-255 with 255 for missing)
661
+ .TE
662
+
663
+ .PP
664
+ When alignment is available, column 11 gives the total number of sequence
665
+ matches, mismatches and gaps in the alignment; column 10 divided by column 11
666
+ gives the BLAST-like alignment identity. When alignment is unavailable,
667
+ these two columns are approximate. PAF may optionally have additional fields in
668
+ the SAM-like typed key-value format. Minimap2 may output the following tags:
669
+ .TS
670
+ center box;
671
+ cb | cb | cb
672
+ r | c | l .
673
+ Tag Type Description
674
+ _
675
+ tp A Type of aln: P/primary, S/secondary and I,i/inversion
676
+ cm i Number of minimizers on the chain
677
+ s1 i Chaining score
678
+ s2 i Chaining score of the best secondary chain
679
+ NM i Total number of mismatches and gaps in the alignment
680
+ MD Z To generate the ref sequence in the alignment
681
+ AS i DP alignment score
682
+ SA Z List of other supplementary alignments
683
+ ms i DP score of the max scoring segment in the alignment
684
+ nn i Number of ambiguous bases in the alignment
685
+ ts A Transcript strand (splice mode only)
686
+ cg Z CIGAR string (only in PAF)
687
+ cs Z Difference string
688
+ dv f Approximate per-base sequence divergence
689
+ de f Gap-compressed per-base sequence divergence
690
+ rl i Length of query regions harboring repetitive seeds
691
+ .TE
692
+
693
+ .PP
694
+ The
695
+ .B cs
696
+ tag encodes difference sequences in the short form or the entire query
697
+ .I AND
698
+ reference sequences in the long form. It consists of a series of operations:
699
+ .TS
700
+ center box;
701
+ cb | cb |cb
702
+ r | l | l .
703
+ Op Regex Description
704
+ _
705
+ = [ACGTN]+ Identical sequence (long form)
706
+ : [0-9]+ Identical sequence length
707
+ * [acgtn][acgtn] Substitution: ref to query
708
+ + [acgtn]+ Insertion to the reference
709
+ - [acgtn]+ Deletion from the reference
710
+ ~ [acgtn]{2}[0-9]+[acgtn]{2} Intron length and splice signal
711
+ .TE
712
+
713
+ .SH LIMITATIONS
714
+ .TP 2
715
+ *
716
+ Minimap2 may produce suboptimal alignments through long low-complexity regions
717
+ where seed positions may be suboptimal. This should not be a big concern
718
+ because even the optimal alignment may be wrong in such regions.
719
+ .TP
720
+ *
721
+ Minimap2 requires SSE2 or NEON instructions to compile. It is possible to add
722
+ non-SSE2/NEON support, but it would make minimap2 slower by several times.
723
+ .SH SEE ALSO
724
+ .PP
725
+ miniasm(1), minimap(1), bwa(1).