minimap2 0.2.23.0 → 0.2.23.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +41 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +807 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +344 -0
  41. data/ext/minimap2/main.c +455 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +409 -0
  44. data/ext/minimap2/minimap2.1 +722 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +131 -0
  50. data/ext/minimap2/options.c +233 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/{vendor → ext/vendor}/libminimap2.so +0 -0
  93. data/lib/minimap2/ffi/functions.rb +5 -0
  94. data/lib/minimap2/version.rb +1 -1
  95. data/lib/minimap2.rb +32 -0
  96. metadata +94 -4
@@ -0,0 +1,722 @@
1
+ .TH minimap2 1 "18 November 2021" "minimap2-2.23 (r1111)" "Bioinformatics tools"
2
+ .SH NAME
3
+ .PP
4
+ minimap2 - mapping and alignment between collections of DNA sequences
5
+ .SH SYNOPSIS
6
+ * Indexing the target sequences (optional):
7
+ .RS 4
8
+ minimap2
9
+ .RB [ -x
10
+ .IR preset ]
11
+ .B -d
12
+ .I target.mmi
13
+ .I target.fa
14
+ .br
15
+ minimap2
16
+ .RB [ -H ]
17
+ .RB [ -k
18
+ .IR kmer ]
19
+ .RB [ -w
20
+ .IR miniWinSize ]
21
+ .RB [ -I
22
+ .IR batchSize ]
23
+ .B -d
24
+ .I target.mmi
25
+ .I target.fa
26
+ .RE
27
+
28
+ * Long-read alignment with CIGAR:
29
+ .RS 4
30
+ minimap2
31
+ .B -a
32
+ .RB [ -x
33
+ .IR preset ]
34
+ .I target.mmi
35
+ .I query.fa
36
+ >
37
+ .I output.sam
38
+ .br
39
+ minimap2
40
+ .B -c
41
+ .RB [ -H ]
42
+ .RB [ -k
43
+ .IR kmer ]
44
+ .RB [ -w
45
+ .IR miniWinSize ]
46
+ .RB [ ... ]
47
+ .I target.fa
48
+ .I query.fa
49
+ >
50
+ .I output.paf
51
+ .RE
52
+
53
+ * Long-read overlap without CIGAR:
54
+ .RS 4
55
+ minimap2
56
+ .B -x
57
+ ava-ont
58
+ .RB [ -t
59
+ .IR nThreads ]
60
+ .I target.fa
61
+ .I query.fa
62
+ >
63
+ .I output.paf
64
+ .RE
65
+ .SH DESCRIPTION
66
+ .PP
67
+ Minimap2 is a fast sequence mapping and alignment program that can find
68
+ overlaps between long noisy reads, or map long reads or their assemblies to a
69
+ reference genome optionally with detailed alignment (i.e. CIGAR). At present,
70
+ it works efficiently with query sequences from a few kilobases to ~100
71
+ megabases in length at an error rate of ~15%. Minimap2 outputs in the PAF or the
72
+ SAM format.
73
+ .SH OPTIONS
74
+ .SS Indexing options
75
+ .TP 10
76
+ .BI -k \ INT
77
+ Minimizer k-mer length [15]
78
+ .TP
79
+ .BI -w \ INT
80
+ Minimizer window size [2/3 of k-mer length]. A minimizer is the smallest k-mer
81
+ in a window of w consecutive k-mers.
82
+ .TP
83
+ .B -H
84
+ Use homopolymer-compressed (HPC) minimizers. An HPC sequence is constructed by
85
+ contracting homopolymer runs to a single base. An HPC minimizer is a minimizer
86
+ on the HPC sequence.
87
+ .TP
88
+ .BI -I \ NUM
89
+ Load at most
90
+ .I NUM
91
+ target bases into RAM for indexing [4G]. If there are more than
92
+ .I NUM
93
+ bases in
94
+ .IR target.fa ,
95
+ minimap2 needs to read
96
+ .I query.fa
97
+ multiple times to map it against each batch of target sequences.
98
+ .I NUM
99
+ may end with k/K/m/M/g/G. NB: mapping quality is incorrect given a
100
+ multi-part index.
101
+ .TP
102
+ .B --idx-no-seq
103
+ Don't store target sequences in the index. It saves disk space and memory but
104
+ the index generated with this option will not work with
105
+ .B -a
106
+ or
107
+ .BR -c .
108
+ When base-level alignment is not requested, this option is automatically applied.
109
+ .TP
110
+ .BI -d \ FILE
111
+ Save the minimizer index of
112
+ .I target.fa
113
+ to
114
+ .I FILE
115
+ [no dump]. Minimap2 indexing is fast. It can index the human genome in a couple
116
+ of minutes. If even shorter startup time is desired, use this option to save
117
+ the index. Indexing options are fixed in the index file. When an index file is
118
+ provided as the target sequences, options
119
+ .BR -H ,
120
+ .BR -k ,
121
+ .BR -w ,
122
+ .B -I
123
+ will be effectively overridden by the options stored in the index file.
124
+ .TP
125
+ .BI --alt \ FILE
126
+ List of ALT contigs [null]
127
+ .TP
128
+ .BI --alt-drop \ FLOAT
129
+ Drop ALT hits by
130
+ .I FLOAT
131
+ fraction when ranking and computing mapping quality [0.15]
132
+ .SS Mapping options
133
+ .TP 10
134
+ .BI -f \ FLOAT | INT1 [, INT2 ]
135
+ If fraction, ignore top
136
+ .I FLOAT
137
+ fraction of most frequent minimizers [0.0002]. If integer,
138
+ ignore minimizers occurring more than
139
+ .I INT1
140
+ times.
141
+ .I INT2
142
+ is only effective in the
143
+ .B --sr
144
+ or
145
+ .B -xsr
146
+ mode, which sets the threshold for a second round of seeding.
147
+ .TP
148
+ .BI -U \ INT1 [, INT2 ]
149
+ Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurrence threshold is
150
+ .RI max{ INT1 ,\ min{ INT2 ,
151
+ .BR -f }}.
152
+ This option prevents excessively small or large
153
+ .B -f
154
+ estimated from the input reference. Available since r1034 and deprecating
155
+ .B --min-occ-floor
156
+ in earlier versions of minimap2.
157
+ .TP
158
+ .BI --q-occ-frac \ FLOAT
159
+ Discard a query minimizer if its occurrence is higher than
160
+ .I FLOAT
161
+ fraction of query minimizers and also higher than the reference occurrence threshold
162
+ [0.01]. Set 0 to disable. Available since r1105.
163
+ .TP
164
+ .BI -e \ INT
165
+ Sample a high-frequency minimizer every
166
+ .I INT
167
+ basepairs [500].
168
+ .TP
169
+ .BI -g \ NUM
170
+ Stop chain elongation if there are no minimizers within
171
+ .IR NUM -bp
172
+ [10k].
173
+ .TP
174
+ .BI -r \ NUM1 [, NUM2 ]
175
+ Bandwidth for chaining and base alignment [500,20k].
176
+ .I NUM1
177
+ is used for initial chaining and alignment extension;
178
+ .I NUM2
179
+ for RMQ-based re-chaining and closing gaps in alignments.
180
+ .TP
181
+ .BI -n \ INT
182
+ Discard chains consisting of
183
+ .RI < INT
184
+ number of minimizers [3]
185
+ .TP
186
+ .BI -m \ INT
187
+ Discard chains with chaining score
188
+ .RI < INT
189
+ [40]. Chaining score equals the approximate number of matching bases minus a
190
+ concave gap penalty. It is computed with dynamic programming.
191
+ .TP
192
+ .B -D
193
+ If query sequence name/length are identical to the target name/length, ignore
194
+ diagonal anchors. This option also reduces DP-based extension along the
195
+ diagonal.
196
+ .TP
197
+ .B -P
198
+ Retain all chains and don't attempt to set primary chains. Options
199
+ .B -p
200
+ and
201
+ .B -N
202
+ have no effect when this option is in use.
203
+ .TP
204
+ .BR --dual = yes | no
205
+ If
206
+ .BR no ,
207
+ skip query-target pairs wherein the query name is lexicographically greater
208
+ than the target name [yes]
209
+ .TP
210
+ .B -X
211
+ Equivalent to
212
+ .RB ' -DP
213
+ .BR --dual = no
214
+ .BR --no-long-join '.
215
+ Primarily used for all-vs-all read overlapping.
216
+ .TP
217
+ .BI -p \ FLOAT
218
+ Minimal secondary-to-primary score ratio to output secondary mappings [0.8].
219
+ Between two chains overlapping over half of the shorter chain (controlled by
220
+ .BR -M ),
221
+ the chain with a lower score is secondary to the chain with a higher score.
222
+ If the ratio of the scores is below
223
+ .IR FLOAT ,
224
+ the secondary chain will not be outputted or extended with DP alignment later.
225
+ This option has no effect when
226
+ .B -X
227
+ is applied.
228
+ .TP
229
+ .BI -N \ INT
230
+ Output at most
231
+ .I INT
232
+ secondary alignments [5]. This option has no effect when
233
+ .B -X
234
+ is applied.
235
+ .TP
236
+ .BI -G \ NUM
237
+ Maximum gap on the reference (effective with
238
+ .BR -xsplice / --splice ).
239
+ This option also changes the chaining and alignment band width to
240
+ .IR NUM .
241
+ Increasing this option slows down spliced alignment. [200k]
242
+ .TP
243
+ .BI -F \ NUM
244
+ Maximum fragment length (aka insert size; effective with
245
+ .BR -xsr / --frag = yes )
246
+ [800]
247
+ .TP
248
+ .BI -M \ FLOAT
249
+ Mark as secondary a chain that overlaps with a better chain by
250
+ .I FLOAT
251
+ or more of the shorter chain [0.5]
252
+ .TP
253
+ .BR --rmq = no | yes
254
+ Use the minigraph chaining algorithm [no]. The minigraph algorithm is better
255
+ for aligning contigs through long INDELs.
256
+ .TP
257
+ .B --hard-mask-level
258
+ Honor option
259
+ .B -M
260
+ and disable a heuristic to save unmapped subsequences. This option also disables
261
+ .BR --mask-len .
262
+ .TP
263
+ .BI --mask-len \ NUM
264
+ Keep an alignment if dropping it leaves an unaligned region on query longer than
265
+ .IR NUM
266
+ [inf]. Effective without
267
+ .BR --hard-mask-level .
268
+ .TP
269
+ .BI --max-chain-skip \ INT
270
+ A heuristic that stops chaining early [25]. Minimap2 uses dynamic programming
271
+ for chaining. The time complexity is quadratic in the number of seeds. This
272
+ option makes minimap2 exit the inner loop if it repeatedly sees seeds already
273
+ on chains. Set
274
+ .I INT
275
+ to a large number to switch off this heuristic.
276
+ .TP
277
+ .BI --max-chain-iter \ INT
278
+ Check up to
279
+ .I INT
280
+ partial chains during chaining [5000]. This is a heuristic to avoid quadratic
281
+ time complexity in the worst case.
282
+ .TP
283
+ .BI --chain-gap-scale \ FLOAT
284
+ Scale of gap cost during chaining [1.0]
285
+ .TP
286
+ .B --no-long-join
287
+ Disable the long gap patching heuristic. When this option is applied, the
288
+ maximum alignment gap is mostly controlled by
289
+ .BR -r .
290
+ .TP
291
+ .B --splice
292
+ Enable the splice alignment mode.
293
+ .TP
294
+ .B --sr
295
+ Enable short-read alignment heuristics. In the short-read mode, minimap2
296
+ applies a second round of chaining with a higher minimizer occurrence threshold
297
+ if no good chain is found. In addition, minimap2 attempts to patch gaps between
298
+ seeds with ungapped alignment.
299
+ .TP
300
+ .BI --split-prefix \ STR
301
+ Prefix to create temporary files. Typically used for a multi-part index.
302
+ .TP
303
+ .BR --frag = no | yes
304
+ Whether to enable the fragment mode [no]
305
+ .TP
306
+ .B --for-only
307
+ Only map to the forward strand of the reference sequences. For paired-end
308
+ reads in the forward-reverse orientation, the first read is mapped to forward
309
+ strand of the reference and the second read to the reverse strand.
310
+ .TP
311
+ .B --rev-only
312
+ Only map to the reverse complement strand of the reference sequences.
313
+ .TP
314
+ .BR --heap-sort = no | yes
315
+ If yes, sort anchors with heap merge, instead of radix sort. Heap merge is
316
+ faster for short reads, but slower for long reads. [no]
317
+ .TP
318
+ .B --no-pairing
319
+ Treat two reads in a pair as independent reads. The mate related fields in SAM
320
+ are still properly populated.
321
+ .SS Alignment options
322
+ .TP 10
323
+ .BI -A \ INT
324
+ Matching score [2]
325
+ .TP
326
+ .BI -B \ INT
327
+ Mismatching penalty [4]
328
+ .TP
329
+ .BI -O \ INT1 [, INT2 ]
330
+ Gap open penalty [4,24]. If
331
+ .I INT2
332
+ is not specified, it is set to
333
+ .IR INT1 .
334
+ .TP
335
+ .BI -E \ INT1 [, INT2 ]
336
+ Gap extension penalty [2,1]. A gap of length
337
+ .I k
338
+ costs
339
+ .RI min{ O1 + k * E1 , O2 + k * E2 }.
340
+ In the splice mode, the second gap penalties are not used.
341
+ .TP
342
+ .BI -C \ INT
343
+ Cost for a non-canonical GT-AG splicing (effective with
344
+ .BR --splice )
345
+ [0]
346
+ .TP
347
+ .BI -z \ INT1 [, INT2 ]
348
+ Truncate an alignment if the running alignment score drops too quickly along
349
+ the diagonal of the DP matrix (diagonal X-drop, or Z-drop) [400,200]. If the
350
+ drop of score is above
351
+ .IR INT2 ,
352
+ minimap2 will reverse complement the query in the related region and align
353
+ again to test small inversions. Minimap2 truncates alignment if there is an
354
+ inversion or the drop of score is greater than
355
+ .IR INT1 .
356
+ Decrease
357
+ .I INT2
358
+ to find small inversions at the cost of performance and false positives.
359
+ Increase
360
+ .I INT1
361
+ to improve the contiguity of alignment at the cost of poor alignment in the
362
+ middle.
363
+ .TP
364
+ .BI -s \ INT
365
+ Minimal peak DP alignment score to output [40]. The peak score is computed from
366
+ the final CIGAR. It is the score of the max scoring segment in the alignment
367
+ and may be different from the total alignment score.
368
+ .TP
369
+ .BI -u \ CHAR
370
+ How to find canonical splicing sites GT-AG -
371
+ .BR f :
372
+ transcript strand;
373
+ .BR b :
374
+ both strands;
375
+ .BR n :
376
+ no attempt to match GT-AG [n]
377
+ .TP
378
+ .BI --end-bonus \ INT
379
+ Score bonus when alignment extends to the end of the query sequence [0].
380
+ .TP
381
+ .BI --score-N \ INT
382
+ Score of a mismatch involving ambiguous bases [1].
383
+ .TP
384
+ .BR --splice-flank = yes | no
385
+ Assume the next base to a
386
+ .B GT
387
+ donor site tends to be A/G (91% in human and 92% in mouse) and the preceding
388
+ base to a
389
+ .B AG
390
+ acceptor tends to be C/T [no].
391
+ This trend is evolutionarily conserved, all the way to S. cerevisiae
392
+ (PMID:18688272). Specifying this option generally leads to higher junction
393
+ accuracy by several percent, so it is applied by default with
394
+ .BR --splice .
395
+ However, the SIRV control does not honor this trend
396
+ (only ~60%), so this option reduces accuracy on SIRV data. If you are benchmarking minimap2
397
+ on SIRV data, please add
398
+ .B --splice-flank=no
399
+ to the command line.
400
+ .TP
401
+ .BI --junc-bed \ FILE
402
+ Gene annotations in the BED12 format (aka 12-column BED), or intron positions
403
+ in 5-column BED. With this option, minimap2 prefers splicing in annotations.
404
+ BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'
405
+ [].
406
+ .TP
407
+ .BI --junc-bonus \ INT
408
+ Score bonus for a splice donor or acceptor found in annotation (effective with
409
+ .BR --junc-bed )
410
+ [9].
411
+ .TP
412
+ .BI --end-seed-pen \ INT
413
+ Drop a terminal anchor if
414
+ .IR s <log( g )+ INT ,
415
+ where
416
+ .I s
417
+ is the local alignment score around the anchor and
418
+ .I g
419
+ the length of the terminal gap in the chain. This option is only effective
420
+ with
421
+ .BR --splice .
422
+ It helps to avoid tiny terminal exons. [6]
423
+ .TP
424
+ .B --no-end-flt
425
+ Don't filter seeds towards the ends of chains before performing base-level
426
+ alignment.
427
+ .TP
428
+ .BI --cap-sw-mem \ NUM
429
+ Skip alignment if the DP matrix size is above
430
+ .IR NUM .
431
+ Set 0 to disable [100m].
432
+ .TP
433
+ .BI --cap-kalloc \ NUM
434
+ Free thread-local kalloc memory reservoir if after the alignment the size of the reservoir is above
435
+ .IR NUM .
436
+ Set 0 to disable [0].
437
+ .SS Input/output options
438
+ .TP 10
439
+ .B -a
440
+ Generate CIGAR and output alignments in the SAM format. Minimap2 outputs in PAF
441
+ by default.
442
+ .TP
443
+ .BI -o \ FILE
444
+ Output alignments to
445
+ .I FILE
446
+ [stdout].
447
+ .TP
448
+ .B -Q
449
+ Ignore base quality in the input file.
450
+ .TP
451
+ .B -L
452
+ Write CIGAR with >65535 operators at the CG tag. Older tools are unable to
453
+ convert alignments with >65535 CIGAR ops to BAM. This option makes minimap2 SAM
454
+ compatible with older tools. Newer tools recognize this tag and reconstruct
455
+ the real CIGAR in memory.
456
+ .TP
457
+ .BI -R \ STR
458
+ SAM read group line in a format like
459
+ .B @RG\\\\tID:foo\\\\tSM:bar
460
+ [].
461
+ .TP
462
+ .B -y
463
+ Copy input FASTA/Q comments to output.
464
+ .TP
465
+ .B -c
466
+ Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
467
+ .TP
468
+ .BI --cs[= STR ]
469
+ Output the
470
+ .B cs
471
+ tag.
472
+ .I STR
473
+ can be either
474
+ .I short
475
+ or
476
+ .IR long .
477
+ If no
478
+ .I STR
479
+ is given,
480
+ .I short
481
+ is assumed. [none]
482
+ .TP
483
+ .B --MD
484
+ Output the MD tag (see the SAM spec).
485
+ .TP
486
+ .B --eqx
487
+ Output =/X CIGAR operators for sequence match/mismatch.
488
+ .TP
489
+ .B -Y
490
+ In SAM output, use soft clipping for supplementary alignments.
491
+ .TP
492
+ .BI --seed \ INT
493
+ Integer seed for randomizing equally best hits. Minimap2 hashes
494
+ .I INT
495
+ and read name when choosing between equally best hits. [11]
496
+ .TP
497
+ .BI -t \ INT
498
+ Number of threads [3]. Minimap2 uses at most three threads when indexing target
499
+ sequences, and uses up to
500
+ .IR INT +1
501
+ threads when mapping (the extra thread is for I/O, which is frequently idle and
502
+ takes little CPU time).
503
+ .TP
504
+ .B -2
505
+ Use two I/O threads during mapping. By default, minimap2 uses one I/O thread.
506
+ When I/O is slow (e.g. piping to gzip, or reading from a slow pipe), the I/O
507
+ thread may become the bottleneck. Apply this option to use one thread for input
508
+ and another thread for output, at the cost of increased peak RAM.
509
+ .TP
510
+ .BI -K \ NUM
511
+ Number of bases loaded into memory to process in a mini-batch [500M].
512
+ Similar to option
513
+ .BR -I ,
514
+ K/M/G/k/m/g suffix is accepted. A large
515
+ .I NUM
516
+ helps load balancing in the multi-threading mode, at the cost of increased
517
+ memory.
518
+ .TP
519
+ .BR --secondary = yes | no
520
+ Whether to output secondary alignments [yes]
521
+ .TP
522
+ .BI --max-qlen \ NUM
523
+ Filter out query sequences longer than
524
+ .IR NUM .
525
+ .TP
526
+ .B --paf-no-hit
527
+ In PAF, output unmapped queries; the strand and the reference name fields are
528
+ set to `*'. Warning: some paftools.js commands may not work with such output
529
+ for the moment.
530
+ .TP
531
+ .B --sam-hit-only
532
+ In SAM, don't output unmapped reads.
533
+ .TP
534
+ .B --version
535
+ Print version number to stdout
536
+ .SS Preset options
537
+ .TP 10
538
+ .BI -x \ STR
539
+ Preset []. This option applies multiple options at the same time. It should be
540
+ applied before other options because options applied later will overwrite the
541
+ values set by
542
+ .BR -x .
543
+ Available
544
+ .I STR
545
+ are:
546
+ .RS
547
+ .TP 10
548
+ .B map-ont
549
+ Align noisy long reads of ~10% error rate to a reference genome. This is the
550
+ default mode.
551
+ .TP
552
+ .B map-hifi
553
+ Align PacBio high-fidelity (HiFi) reads to a reference genome
554
+ .RB ( -k19
555
+ .B -w19 -U50,500 -g10k -A1 -B4 -O6,26 -E2,1
556
+ .BR -s200 ).
557
+ .TP
558
+ .B map-pb
559
+ Align older PacBio continuous long (CLR) reads to a reference genome
560
+ .RB ( -Hk19 ).
561
+ .TP
562
+ .B asm5
563
+ Long assembly to reference mapping
564
+ .RB ( -k19
565
+ .B -w19 -U50,500 --rmq -r100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200
566
+ .BR -N50 ).
567
+ Typically, the alignment will not extend to regions with 5% or higher sequence
568
+ divergence. Only use this preset if the average divergence is far below 5%.
569
+ .TP
570
+ .B asm10
571
+ Long assembly to reference mapping
572
+ .RB ( -k19
573
+ .B -w19 -U50,500 --rmq -r100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200
574
+ .BR -N50 ).
575
+ Up to 10% sequence divergence.
576
+ .TP
577
+ .B asm20
578
+ Long assembly to reference mapping
579
+ .RB ( -k19
580
+ .B -w10 -U50,500 --rmq -r100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200
581
+ .BR -N50 ).
582
+ Up to 20% sequence divergence.
583
+ .TP
584
+ .B splice
585
+ Long-read spliced alignment
586
+ .RB ( -k15
587
+ .B -w5 --splice -g2k -G200k -A1 -B2 -O2,32 -E1,0 -b0 -C9 -z200 -ub --junc-bonus=9 --cap-sw-mem=0
588
+ .BR --splice-flank=yes ).
589
+ In the splice mode, 1) long deletions are taken as introns and represented as
590
+ the
591
+ .RB ` N '
592
+ CIGAR operator; 2) long insertions are disabled; 3) deletion and insertion gap
593
+ costs are different during chaining; 4) the computation of the
594
+ .RB ` ms '
595
+ tag ignores introns to demote hits to pseudogenes.
596
+ .TP
597
+ .B splice:hq
598
+ Long-read splice alignment for PacBio CCS reads
599
+ .RB ( -xsplice
600
+ .B -C5 -O6,24
601
+ .BR -B4 ).
602
+ .TP
603
+ .B sr
604
+ Short single-end reads without splicing
605
+ .RB ( -k21
606
+ .B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m20
607
+ .B -s40 -g100 -2K50m --heap-sort=yes
608
+ .BR --secondary=no ).
609
+ .TP
610
+ .B ava-pb
611
+ PacBio CLR all-vs-all overlap mapping
612
+ .RB ( -Hk19
613
+ .B -Xw5 -e0
614
+ .BR -m100 ).
615
+ .TP
616
+ .B ava-ont
617
+ Oxford Nanopore all-vs-all overlap mapping
618
+ .RB ( -k15
619
+ .B -Xw5 -e0 -m100
620
+ .BR -r2k ).
621
+ .RE
622
+ .SS Miscellaneous options
623
+ .TP 10
624
+ .B --no-kalloc
625
+ Use the libc default allocator instead of the kalloc thread-local allocator.
626
+ This debugging option is mostly used with Valgrind to detect invalid memory
627
+ accesses. Minimap2 runs slower with this option, especially in the
628
+ multi-threading mode.
629
+ .TP
630
+ .B --print-qname
631
+ Print query names to stderr, mostly to see which query is crashing minimap2.
632
+ .TP
633
+ .B --print-seeds
634
+ Print seed positions to stderr, for debugging only.
635
+ .SH OUTPUT FORMAT
636
+ .PP
637
+ Minimap2 outputs mapping positions in the Pairwise mApping Format (PAF) by
638
+ default. PAF is a TAB-delimited text format with each line consisting of at
639
+ least 12 fields as are described in the following table:
640
+ .TS
641
+ center box;
642
+ cb | cb | cb
643
+ r | c | l .
644
+ Col Type Description
645
+ _
646
+ 1 string Query sequence name
647
+ 2 int Query sequence length
648
+ 3 int Query start coordinate (0-based)
649
+ 4 int Query end coordinate (0-based)
650
+ 5 char `+' if query/target on the same strand; `-' if opposite
651
+ 6 string Target sequence name
652
+ 7 int Target sequence length
653
+ 8 int Target start coordinate on the original strand
654
+ 9 int Target end coordinate on the original strand
655
+ 10 int Number of matching bases in the mapping
656
+ 11 int Number bases, including gaps, in the mapping
657
+ 12 int Mapping quality (0-255 with 255 for missing)
658
+ .TE
659
+
660
+ .PP
661
+ When alignment is available, column 11 gives the total number of sequence
662
+ matches, mismatches and gaps in the alignment; column 10 divided by column 11
663
+ gives the BLAST-like alignment identity. When alignment is unavailable,
664
+ these two columns are approximate. PAF may optionally have additional fields in
665
+ the SAM-like typed key-value format. Minimap2 may output the following tags:
666
+ .TS
667
+ center box;
668
+ cb | cb | cb
669
+ r | c | l .
670
+ Tag Type Description
671
+ _
672
+ tp A Type of aln: P/primary, S/secondary and I,i/inversion
673
+ cm i Number of minimizers on the chain
674
+ s1 i Chaining score
675
+ s2 i Chaining score of the best secondary chain
676
+ NM i Total number of mismatches and gaps in the alignment
677
+ MD Z To generate the ref sequence in the alignment
678
+ AS i DP alignment score
679
+ SA Z List of other supplementary alignments
680
+ ms i DP score of the max scoring segment in the alignment
681
+ nn i Number of ambiguous bases in the alignment
682
+ ts A Transcript strand (splice mode only)
683
+ cg Z CIGAR string (only in PAF)
684
+ cs Z Difference string
685
+ dv f Approximate per-base sequence divergence
686
+ de f Gap-compressed per-base sequence divergence
687
+ rl i Length of query regions harboring repetitive seeds
688
+ .TE
689
+
690
+ .PP
691
+ The
692
+ .B cs
693
+ tag encodes difference sequences in the short form or the entire query
694
+ .I AND
695
+ reference sequences in the long form. It consists of a series of operations:
696
+ .TS
697
+ center box;
698
+ cb | cb |cb
699
+ r | l | l .
700
+ Op Regex Description
701
+ _
702
+ = [ACGTN]+ Identical sequence (long form)
703
+ : [0-9]+ Identical sequence length
704
+ * [acgtn][acgtn] Substitution: ref to query
705
+ + [acgtn]+ Insertion to the reference
706
+ - [acgtn]+ Deletion from the reference
707
+ ~ [acgtn]{2}[0-9]+[acgtn]{2} Intron length and splice signal
708
+ .TE
709
+
710
+ .SH LIMITATIONS
711
+ .TP 2
712
+ *
713
+ Minimap2 may produce suboptimal alignments through long low-complexity regions
714
+ where seed positions may be suboptimal. This should not be a big concern
715
+ because even the optimal alignment may be wrong in such regions.
716
+ .TP
717
+ *
718
+ Minimap2 requires SSE2 or NEON instructions to compile. It is possible to add
719
+ non-SSE2/NEON support, but it would make minimap2 slower by several times.
720
+ .SH SEE ALSO
721
+ .PP
722
+ miniasm(1), minimap(1), bwa(1).