minimap2 0.2.22.0 → 0.2.24.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
data/ext/minimap2/bseq.c
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
#include <zlib.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <assert.h>
|
5
|
+
#define __STDC_LIMIT_MACROS
|
6
|
+
#include "bseq.h"
|
7
|
+
#include "kvec.h"
|
8
|
+
#include "kseq.h"
|
9
|
+
KSEQ_INIT2(, gzFile, gzread)
|
10
|
+
|
11
|
+
// Byte-indexed nucleotide complement table. Every byte maps to itself except
// ASCII letters, which map to their IUPAC complement with case preserved
// (A<->T, C<->G, B<->V, D<->H, K<->M, R<->Y; 'U'/'u' map to 'A'/'a').
unsigned char seq_comp_table[256] = {
	/* 0x00 */   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
	/* 0x10 */  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
	/* 0x20 */  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
	/* 0x30 */  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
	/* @A-O */  64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O',
	/* P-Z  */ 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z',  91,  92,  93,  94,  95,
	/* `a-o */  96, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o',
	/* p-z  */ 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127,
	/* 0x80 */ 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
	/* 0x90 */ 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	/* 0xa0 */ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
	/* 0xb0 */ 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
	/* 0xc0 */ 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
	/* 0xd0 */ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
	/* 0xe0 */ 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	/* 0xf0 */ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
};
|
29
|
+
|
30
|
+
#define CHECK_PAIR_THRES 1000000
|
31
|
+
|
32
|
+
struct mm_bseq_file_s {
|
33
|
+
gzFile fp;
|
34
|
+
kseq_t *ks;
|
35
|
+
mm_bseq1_t s;
|
36
|
+
};
|
37
|
+
|
38
|
+
mm_bseq_file_t *mm_bseq_open(const char *fn)
|
39
|
+
{
|
40
|
+
mm_bseq_file_t *fp;
|
41
|
+
gzFile f;
|
42
|
+
f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r");
|
43
|
+
if (f == 0) return 0;
|
44
|
+
fp = (mm_bseq_file_t*)calloc(1, sizeof(mm_bseq_file_t));
|
45
|
+
fp->fp = f;
|
46
|
+
fp->ks = kseq_init(fp->fp);
|
47
|
+
return fp;
|
48
|
+
}
|
49
|
+
|
50
|
+
// Release a handle returned by mm_bseq_open(): destroys the kseq parser,
// closes the gz stream and frees the handle itself.
// A NULL handle (e.g. from a failed mm_bseq_open()) is accepted and ignored.
void mm_bseq_close(mm_bseq_file_t *fp)
{
	if (fp == 0) return;
	kseq_destroy(fp->ks);
	gzclose(fp->fp);
	free(fp);
}
|
56
|
+
|
57
|
+
static inline char *kstrdup(const kstring_t *s)
|
58
|
+
{
|
59
|
+
char *t;
|
60
|
+
t = (char*)malloc(s->l + 1);
|
61
|
+
memcpy(t, s->s, s->l + 1);
|
62
|
+
return t;
|
63
|
+
}
|
64
|
+
|
65
|
+
// Copy the record currently held by the kseq parser into *s.
//   with_qual / with_comment: when zero, or when the corresponding field is
//   empty, s->qual / s->comment is set to 0 instead of being duplicated.
// All strings stored in *s are freshly allocated; 'u'/'U' bases become 't'/'T'.
// s->rid is not touched here.
static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual, int with_comment)
{
	int k;
	const int n_bases = (int)ks->seq.l;

	if (ks->name.l == 0)
		fprintf(stderr, "[WARNING]\033[1;31m empty sequence name in the input.\033[0m\n");
	s->name = kstrdup(&ks->name);
	s->seq = kstrdup(&ks->seq);
	for (k = 0; k < n_bases; ++k) { // convert U to T
		if (s->seq[k] == 'u') s->seq[k] = 't';
		else if (s->seq[k] == 'U') s->seq[k] = 'T';
	}
	if (with_qual && ks->qual.l) s->qual = kstrdup(&ks->qual);
	else s->qual = 0;
	if (with_comment && ks->comment.l) s->comment = kstrdup(&ks->comment);
	else s->comment = 0;
	s->l_seq = ks->seq.l;
}
|
79
|
+
|
80
|
+
// Read a batch of records from fp until their total length reaches chunk_size
// or input ends.
//   with_qual, with_comment: forwarded to kseq2bseq().
//   frag_mode: when non-zero and the last record of the batch is shorter than
//     CHECK_PAIR_THRES, keep reading records whose name matches it
//     (mm_qname_same) so they stay in the same batch; the first non-matching
//     record is parked in fp->s and returned first by the next call.
//   n_: receives the number of records returned.
// Returns the record array (0 when nothing was read); the caller owns it.
mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_)
{
	int64_t n_bases = 0;
	int ret;
	kvec_t(mm_bseq1_t) batch = {0,0,0};
	kseq_t *ks = fp->ks;

	*n_ = 0;
	if (fp->s.seq) { // start with the record parked by the previous call
		kv_resize(mm_bseq1_t, 0, batch, 256);
		kv_push(mm_bseq1_t, 0, batch, fp->s);
		n_bases = fp->s.l_seq;
		memset(&fp->s, 0, sizeof(mm_bseq1_t));
	}
	for (;;) {
		mm_bseq1_t *rec;
		ret = kseq_read(ks);
		if (ret < 0) break;
		assert(ks->seq.l <= INT32_MAX);
		if (batch.m == 0) kv_resize(mm_bseq1_t, 0, batch, 256);
		kv_pushp(mm_bseq1_t, 0, batch, &rec);
		kseq2bseq(ks, rec, with_qual, with_comment);
		n_bases += rec->l_seq;
		if (n_bases < chunk_size) continue;
		// the batch is full; optionally pull in the remaining records with the same name
		if (frag_mode && batch.a[batch.n-1].l_seq < CHECK_PAIR_THRES) {
			for (;;) {
				ret = kseq_read(ks);
				if (ret < 0) break;
				kseq2bseq(ks, &fp->s, with_qual, with_comment);
				if (!mm_qname_same(fp->s.name, batch.a[batch.n-1].name))
					break; // different name: leave it parked in fp->s
				kv_push(mm_bseq1_t, 0, batch, fp->s);
				memset(&fp->s, 0, sizeof(mm_bseq1_t));
			}
		}
		break;
	}
	if (ret < -1) { // kseq_read() reported something other than plain end of input
		if (batch.n) fprintf(stderr, "[WARNING]\033[1;31m failed to parse the FASTA/FASTQ record next to '%s'. Continue anyway.\033[0m\n", batch.a[batch.n-1].name);
		else fprintf(stderr, "[WARNING]\033[1;31m failed to parse the first FASTA/FASTQ record. Continue anyway.\033[0m\n");
	}
	*n_ = batch.n;
	return batch.a;
}
|
120
|
+
|
121
|
+
// Same as mm_bseq_read3() but never keeps record comments.
mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int frag_mode, int *n_)
{
	const int no_comment = 0;
	return mm_bseq_read3(fp, chunk_size, with_qual, no_comment, frag_mode, n_);
}
|
125
|
+
|
126
|
+
// Same as mm_bseq_read2() with fragment mode switched off.
mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int *n_)
{
	const int no_frag_mode = 0;
	return mm_bseq_read2(fp, chunk_size, with_qual, no_frag_mode, n_);
}
|
130
|
+
|
131
|
+
// Read records from n_fp files in lock-step: each round takes one record from
// every file and appends them in file order. Stops when the accumulated
// sequence length reaches chunk_size or when any file fails to yield a record
// (records already read from the other files in that round are discarded,
// with a warning if at least one file still had data).
//   n_: receives the number of records returned.
// Returns the record array, or 0 if n_fp < 1 or nothing was read.
mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_)
{
	int i;
	int64_t n_bases = 0;
	kvec_t(mm_bseq1_t) batch = {0,0,0};

	*n_ = 0;
	if (n_fp < 1) return 0;
	for (;;) {
		int n_ok = 0;
		for (i = 0; i < n_fp; ++i) {
			if (kseq_read(fp[i]->ks) >= 0)
				n_ok += 1;
		}
		if (n_ok < n_fp) { // some file reaches the end
			if (n_ok > 0)
				fprintf(stderr, "[W::%s]\033[1;31m query files have different number of records; extra records skipped.\033[0m\n", __func__);
			break;
		}
		if (batch.m == 0) kv_resize(mm_bseq1_t, 0, batch, 256);
		for (i = 0; i < n_fp; ++i) {
			mm_bseq1_t *rec;
			kv_pushp(mm_bseq1_t, 0, batch, &rec);
			kseq2bseq(fp[i]->ks, rec, with_qual, with_comment);
			n_bases += rec->l_seq;
		}
		if (n_bases >= chunk_size) break;
	}
	*n_ = batch.n;
	return batch.a;
}
|
160
|
+
|
161
|
+
// Same as mm_bseq_read_frag2() but never keeps record comments.
mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int *n_)
{
	const int no_comment = 0;
	return mm_bseq_read_frag2(n_fp, fp, chunk_size, with_qual, no_comment, n_);
}
|
165
|
+
|
166
|
+
// Return 1 when the underlying stream reports end-of-file AND no read-ahead
// record is parked in fp->s; otherwise 0.
int mm_bseq_eof(mm_bseq_file_t *fp)
{
	if (!ks_eof(fp->ks->f)) return 0;
	return fp->s.seq == 0;
}
|
data/ext/minimap2/bseq.h
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
#ifndef MM_BSEQ_H
|
2
|
+
#define MM_BSEQ_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
#include <string.h>
|
6
|
+
|
7
|
+
#ifdef __cplusplus
|
8
|
+
extern "C" {
|
9
|
+
#endif
|
10
|
+
|
11
|
+
struct mm_bseq_file_s;
|
12
|
+
typedef struct mm_bseq_file_s mm_bseq_file_t;
|
13
|
+
|
14
|
+
// One sequence record as handed out by the mm_bseq_read*() functions.
typedef struct {
	int l_seq;      // sequence length
	int rid;        // not set by the readers in bseq.c
	char *name;     // record name
	char *seq;      // bases
	char *qual;     // base qualities, or 0 when absent / not requested
	char *comment;  // record comment, or 0 when absent / not requested
} mm_bseq1_t;
|
18
|
+
|
19
|
+
mm_bseq_file_t *mm_bseq_open(const char *fn);
|
20
|
+
void mm_bseq_close(mm_bseq_file_t *fp);
|
21
|
+
mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_);
|
22
|
+
mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int frag_mode, int *n_);
|
23
|
+
mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int *n_);
|
24
|
+
mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_);
|
25
|
+
mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int *n_);
|
26
|
+
int mm_bseq_eof(mm_bseq_file_t *fp);
|
27
|
+
|
28
|
+
extern unsigned char seq_nt4_table[256];
|
29
|
+
extern unsigned char seq_comp_table[256];
|
30
|
+
|
31
|
+
// Length of a read name with a trailing "/<digit>" suffix (e.g. "/1", "/2")
// excluded. The suffix is only stripped when the name is at least 3 characters long.
static inline int mm_qname_len(const char *s)
{
	int len = strlen(s);
	if (len < 3) return len;
	if (s[len-1] < '0' || s[len-1] > '9') return len;
	if (s[len-2] != '/') return len;
	return len - 2;
}
|
37
|
+
|
38
|
+
// Return 1 if two read names are equal once mm_qname_len() has excluded any
// trailing "/<digit>" suffix from each, 0 otherwise.
static inline int mm_qname_same(const char *s1, const char *s2)
{
	int len1 = mm_qname_len(s1);
	int len2 = mm_qname_len(s2);
	if (len1 != len2) return 0;
	return strncmp(s1, s2, len1) == 0;
}
|
45
|
+
|
46
|
+
// Reverse-complement s->seq in place (through seq_comp_table) and, when
// present, reverse s->qual in place. Both are treated as s->l_seq bytes long.
static inline void mm_revcomp_bseq(mm_bseq1_t *s)
{
	int i, l = s->l_seq;
	for (i = 0; i < l>>1; ++i) {
		// Go through uint8_t before indexing: a plain char may be signed, and a
		// byte >= 0x80 would otherwise become a negative table index.
		uint8_t t = (uint8_t)s->seq[l - i - 1];
		s->seq[l - i - 1] = seq_comp_table[(uint8_t)s->seq[i]];
		s->seq[i] = seq_comp_table[t];
	}
	if (l&1) s->seq[l>>1] = seq_comp_table[(uint8_t)s->seq[l>>1]]; // middle base of an odd-length sequence
	if (s->qual) {
		for (i = 0; i < l>>1; ++i) {
			char q = s->qual[l - i - 1];
			s->qual[l - i - 1] = s->qual[i];
			s->qual[i] = q;
		}
	}
}
|
59
|
+
|
60
|
+
#ifdef __cplusplus
|
61
|
+
}
|
62
|
+
#endif
|
63
|
+
|
64
|
+
#endif
|
@@ -0,0 +1,30 @@
|
|
1
|
+
## Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, we pledge to respect all
|
4
|
+
people who contribute through reporting issues, posting feature requests,
|
5
|
+
updating documentation, submitting pull requests or patches, and other
|
6
|
+
activities.
|
7
|
+
|
8
|
+
We are committed to making participation in this project a harassment-free
|
9
|
+
experience for everyone, regardless of level of experience, gender, gender
|
10
|
+
identity and expression, sexual orientation, disability, personal appearance,
|
11
|
+
body size, race, age, or religion.
|
12
|
+
|
13
|
+
Examples of unacceptable behavior by participants include the use of sexual
|
14
|
+
language or imagery, derogatory comments or personal attacks, trolling, public
|
15
|
+
or private harassment, insults, or other unprofessional conduct.
|
16
|
+
|
17
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
18
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
19
|
+
that are not aligned to this Code of Conduct. Project maintainers or
|
20
|
+
contributors who do not follow the Code of Conduct may be removed from the
|
21
|
+
project team.
|
22
|
+
|
23
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
24
|
+
reported by opening an issue or contacting the maintainer via email.
|
25
|
+
|
26
|
+
This Code of Conduct is adapted from the [Contributor Covenant][cc], [version
|
27
|
+
1.0.0][v1].
|
28
|
+
|
29
|
+
[cc]: http://contributor-covenant.org/
|
30
|
+
[v1]: http://contributor-covenant.org/version/1/0/0/
|
@@ -0,0 +1,243 @@
|
|
1
|
+
## Table of Contents
|
2
|
+
|
3
|
+
- [Introduction & Installation](#intro)
|
4
|
+
- [Mapping Genomic Reads](#map-reads)
|
5
|
+
* [Mapping long reads](#map-pb)
|
6
|
+
* [Mapping Illumina paired-end reads](#map-sr)
|
7
|
+
* [Evaluating mapping accuracy with simulated reads (for developers)](#mapeval)
|
8
|
+
- [Mapping Long RNA-seq Reads](#map-rna)
|
9
|
+
* [Mapping Nanopore 2D cDNA reads](#map-ont-cdna-2d)
|
10
|
+
* [Mapping Nanopore direct-RNA reads](#map-direct-rna)
|
11
|
+
* [Mapping PacBio Iso-seq reads](#map-iso-seq)
|
12
|
+
- [Full-Genome Alignment](#genome-aln)
|
13
|
+
* [Intra-species assembly alignment](#asm-to-ref)
|
14
|
+
* [Cross-species full-genome alignment](#x-species)
|
15
|
+
* [Eyeballing alignment](#view-aln)
|
16
|
+
* [Calling variants from assembly-to-reference alignment](#asm-var)
|
17
|
+
* [Constructing self-homology map](#hom-map)
|
18
|
+
* [Lift Over (for developers)](#liftover)
|
19
|
+
- [Read Overlap](#read-overlap)
|
20
|
+
* [Long-read overlap](#long-read-overlap)
|
21
|
+
* [Evaluating overlap sensitivity (for developers)](#ov-eval)
|
22
|
+
|
23
|
+
## <a name="intro"></a>Introduction & Installation
|
24
|
+
|
25
|
+
This cookbook walks you through a variety of applications of minimap2 and its
|
26
|
+
companion script `paftools.js`. All data here are freely available from the
|
27
|
+
minimap2 release page at version tag [v2.10][v2.10]. Some examples only work
|
28
|
+
with v2.10 or later.
|
29
|
+
|
30
|
+
To acquire the data used in this cookbook and to install minimap2 and paftools,
|
31
|
+
please follow the command lines below:
|
32
|
+
```sh
|
33
|
+
# install minimap2 executables
|
34
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.24/minimap2-2.24_x64-linux.tar.bz2 | tar jxf -
|
35
|
+
cp minimap2-2.24_x64-linux/{minimap2,k8,paftools.js} . # copy executables
|
36
|
+
export PATH="$PATH:"`pwd` # put the current directory on PATH
|
37
|
+
# download example datasets
|
38
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
|
39
|
+
```
|
40
|
+
|
41
|
+
## <a name="map-reads"></a>Mapping Genomic Reads
|
42
|
+
|
43
|
+
### <a name="map-pb"></a>Mapping long reads
|
44
|
+
```sh
|
45
|
+
minimap2 -ax map-pb -t4 ecoli_ref.fa ecoli_p6_25x_canu.fa > mapped.sam
|
46
|
+
```
|
47
|
+
Alternatively, you can create a minimap2 index first and then map:
|
48
|
+
```sh
|
49
|
+
minimap2 -x map-pb -d ecoli-pb.mmi ecoli_ref.fa # create an index
|
50
|
+
minimap2 -ax map-pb ecoli-pb.mmi ecoli_p6_25x_canu.fa > mapped.sam
|
51
|
+
```
|
52
|
+
This will save you a couple of minutes when you map against the human genome.
|
53
|
+
**HOWEVER**, key algorithm parameters such as the k-mer length and window
|
54
|
+
size can't be changed after indexing. Minimap2 will give you a warning if
|
55
|
+
parameters used in a pre-built index don't match parameters on the command
|
56
|
+
line. **Please always make sure you are using an intended pre-built index.**
|
57
|
+
|
58
|
+
### <a name="map-sr"></a>Mapping Illumina paired-end reads:
|
59
|
+
```sh
|
60
|
+
minimap2 -ax sr -t4 ecoli_ref.fa ecoli_mason_1.fq ecoli_mason_2.fq > mapped-sr.sam
|
61
|
+
```
|
62
|
+
|
63
|
+
### <a name="mapeval"></a>Evaluating mapping accuracy with simulated reads (for developers)
|
64
|
+
```sh
|
65
|
+
minimap2 -ax sr ecoli_ref.fa ecoli_mason_1.fq ecoli_mason_2.fq | paftools.js mapeval -
|
66
|
+
```
|
67
|
+
The output is:
|
68
|
+
```
|
69
|
+
Q 60 19712 0 0.000000000 19712
|
70
|
+
Q 0 282 219 0.010953286 19994
|
71
|
+
U 6
|
72
|
+
```
|
73
|
+
where a `U`-line gives the number of unmapped reads (for SAM input only); a
|
74
|
+
`Q`-line gives:
|
75
|
+
|
76
|
+
1. Mapping quality (mapQ) threshold
|
77
|
+
2. Number of mapped reads between this threshold and the previous mapQ threshold.
|
78
|
+
3. Number of wrong mappings in the same mapQ interval
|
79
|
+
4. Accumulative mapping error rate
|
80
|
+
5. Accumulative number of mappings
|
81
|
+
|
82
|
+
For `paftools.js mapeval` to work, you need to encode the true read positions
|
83
|
+
in read names in the right format. For [pbsim2][pbsim] and [mason2][mason2], we
|
84
|
+
provide scripts to generate the right format. Simulated reads in this cookbook
|
85
|
+
were created with the following command lines:
|
86
|
+
```sh
|
87
|
+
# in the pbsim2 source code directory:
|
88
|
+
src/pbsim --depth 1 --length-min 5000 --length-mean 20000 --accuracy-mean 0.95 --hmm_model data/R94.model ../ecoli_ref.fa
|
89
|
+
paftools.js pbsim2fq ../ecoli_ref.fa.fai sd_0001.maf > ../ecoli_pbsim.fa
|
90
|
+
|
91
|
+
# mason2 simulation
|
92
|
+
mason_simulator --illumina-prob-mismatch-scale 2.5 -ir ecoli_ref.fa -n 10000 -o tmp-l.fq -or tmp-r.fq -oa tmp.sam
|
93
|
+
paftools.js mason2fq tmp.sam | seqtk seq -1 > ecoli_mason_1.fq
|
94
|
+
paftools.js mason2fq tmp.sam | seqtk seq -2 > ecoli_mason_2.fq
|
95
|
+
```
|
96
|
+
|
97
|
+
|
98
|
+
|
99
|
+
## <a name="map-rna"></a>Mapping Long RNA-seq Reads
|
100
|
+
|
101
|
+
### <a name="map-ont-cdna-2d"></a>Mapping Nanopore 2D cDNA reads
|
102
|
+
```sh
|
103
|
+
minimap2 -ax splice SIRV_E2.fa SIRV_ont-cdna.fa > aln.sam
|
104
|
+
```
|
105
|
+
You can compare the alignment to the true annotations with:
|
106
|
+
```sh
|
107
|
+
paftools.js junceval SIRV_E2C.gtf aln.sam
|
108
|
+
```
|
109
|
+
It gives the percentage of introns found in the annotation. For SIRV data, it
|
110
|
+
is possible to achieve higher junction accuracy with
|
111
|
+
```sh
|
112
|
+
minimap2 -ax splice --splice-flank=no SIRV_E2.fa SIRV_ont-cdna.fa | paftools.js junceval SIRV_E2C.gtf
|
113
|
+
```
|
114
|
+
This is because minimap2 models one additional evolutionarily conserved base
|
115
|
+
around a canonical junction, but SIRV doesn't honor this signal. Option
|
116
|
+
`--splice-flank=no` asks minimap2 not to model this additional base.
|
117
|
+
|
118
|
+
In the output a tag `ts:A:+` indicates that the read strand is the same as the
|
119
|
+
transcript strand; `ts:A:-` indicates the read strand is opposite to the
|
120
|
+
transcript strand. This tag is inferred from the GT-AG signal and is thus only
|
121
|
+
available to spliced reads.
|
122
|
+
|
123
|
+
### <a name="map-direct-rna"></a>Mapping Nanopore direct-RNA reads
|
124
|
+
```sh
|
125
|
+
minimap2 -ax splice -k14 -uf SIRV_E2.fa SIRV_ont-drna.fa > aln.sam
|
126
|
+
```
|
127
|
+
Direct-RNA reads are noisier, so we use a shorter k-mer for improved
|
128
|
+
sensitivity. Here, option `-uf` forces minimap2 to map reads to the forward
|
129
|
+
transcript strand only because direct-RNA reads are stranded. Again, applying
|
130
|
+
`--splice-flank=no` helps junction accuracy for SIRV data.
|
131
|
+
|
132
|
+
### <a name="map-iso-seq"></a>Mapping PacBio Iso-seq reads
|
133
|
+
```sh
|
134
|
+
minimap2 -ax splice -uf -C5 SIRV_E2.fa SIRV_iso-seq.fq > aln.sam
|
135
|
+
```
|
136
|
+
Option `-C5` reduces the penalty on non-canonical splicing sites. It helps
|
137
|
+
to align such sites correctly for data with low error rate such as Iso-seq
|
138
|
+
reads and traditional cDNAs. On this example, minimap2 makes one junction
|
139
|
+
error. Applying `--splice-flank=no` fixes this alignment error.
|
140
|
+
|
141
|
+
Note that the command line above is optimized for the final Iso-seq reads.
|
142
|
+
PacBio's Iso-seq pipeline produces intermediate sequences at varying quality.
|
143
|
+
For example, some intermediate reads are not stranded. For these reads, option
|
144
|
+
`-uf` will lead to more errors. Please revise the minimap2 command line
|
145
|
+
accordingly.
|
146
|
+
|
147
|
+
|
148
|
+
|
149
|
+
## <a name="genome-aln"></a>Full-Genome Alignment
|
150
|
+
|
151
|
+
### <a name="asm-to-ref"></a>Intra-species assembly alignment
|
152
|
+
```sh
|
153
|
+
# option "--cs" is recommended as paftools.js may need it
|
154
|
+
minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa > ecoli_canu.paf
|
155
|
+
```
|
156
|
+
Here `ecoli_canu.fa` is the Canu assembly of `ecoli_p6_25x_canu.fa`. This
|
157
|
+
command line outputs alignments in the [PAF format][paf]. Use `-a` instead of
|
158
|
+
`-c` to get output in the SAM format.
|
159
|
+
|
160
|
+
### <a name="x-species"></a>Cross-species full-genome alignment
|
161
|
+
```sh
|
162
|
+
minimap2 -cx asm20 --cs ecoli_ref.fa ecoli_O104:H4.fa > ecoli_O104:H4.paf
|
163
|
+
sort -k6,6 -k8,8n ecoli_O104:H4.paf | paftools.js call -f ecoli_ref.fa -L10000 -l1000 - > out.vcf
|
164
|
+
```
|
165
|
+
Minimap2 has three presets for full-genome alignment: "asm5" for sequence
|
166
|
+
divergence below 1%, "asm10" for divergence around a couple of percent and
|
167
|
+
"asm20" for divergence not more than 10%. In theory, with the right setting,
|
168
|
+
minimap2 should work for sequence pairs with sequence divergence up to ~15%,
|
169
|
+
but this has not been carefully evaluated.
|
170
|
+
|
171
|
+
### <a name="view-aln"></a>Eyeballing alignment
|
172
|
+
```sh
|
173
|
+
# option "--cs" required; minimap2-r741 or higher required for the "asm20" preset
|
174
|
+
minimap2 -cx asm20 --cs ecoli_ref.fa ecoli_O104:H4.fa | paftools.js view - | less -S
|
175
|
+
```
|
176
|
+
This prints the alignment in a BLAST-like format.
|
177
|
+
|
178
|
+
### <a name="asm-var"></a>Calling variants from assembly-to-reference alignment
|
179
|
+
```sh
|
180
|
+
# don't forget the "--cs" option; otherwise it doesn't work
|
181
|
+
minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa \
|
182
|
+
| sort -k6,6 -k8,8n \
|
183
|
+
| paftools.js call -f ecoli_ref.fa - > out.vcf
|
184
|
+
```
|
185
|
+
Without option `-f`, `paftools.js call` outputs in a custom format. In this
|
186
|
+
format, lines starting with `R` give the regions covered by one contig only.
|
187
|
+
This information is not available in the VCF output.
|
188
|
+
|
189
|
+
### <a name="hom-map"></a>Constructing self-homology map
|
190
|
+
```sh
|
191
|
+
minimap2 -DP -k19 -w19 -m200 ecoli_ref.fa ecoli_ref.fa > out.paf
|
192
|
+
```
|
193
|
+
Option `-D` asks minimap2 to ignore anchors from perfect self match and `-P`
|
194
|
+
outputs all chains. For large genomes, we don't recommend performing base-level
|
195
|
+
alignment (with `-c`, `-a` or `--cs`) when `-P` is applied. This is because
|
196
|
+
base-alignment is slow and occasionally gives wrong alignments close to the
|
197
|
+
diagonal of a dotter plot. For E. coli, though, base-alignment is still fast.
|
198
|
+
|
199
|
+
### <a name="liftover"></a>Lift over (for developers)
|
200
|
+
```sh
|
201
|
+
minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa > ecoli_canu.paf
|
202
|
+
echo -e 'tig00000001\t200000\t300000' | paftools.js liftover ecoli_canu.paf -
|
203
|
+
```
|
204
|
+
This lifts over a region on query sequences to one or multiple regions on
|
205
|
+
reference sequences. Note that this paftools.js command may not be efficient
|
206
|
+
enough to lift millions of regions.
|
207
|
+
|
208
|
+
|
209
|
+
|
210
|
+
## <a name="read-overlap"></a>Read Overlap
|
211
|
+
|
212
|
+
### <a name="long-read-overlap"></a>Long read overlap
|
213
|
+
```sh
|
214
|
+
# For pacbio reads:
|
215
|
+
minimap2 -x ava-pb ecoli_p6_25x_canu.fa ecoli_p6_25x_canu.fa > overlap.paf
|
216
|
+
# For Nanopore reads (ava-ont also works with PacBio but not as well):
|
217
|
+
minimap2 -x ava-ont -r 10000 ecoli_p6_25x_canu.fa ecoli_p6_25x_canu.fa > overlap.paf
|
218
|
+
# If you have miniasm installed:
|
219
|
+
miniasm -f ecoli_p6_25x_canu.fa overlap.paf > asm.gfa
|
220
|
+
```
|
221
|
+
Here we explicitly applied `-r 10000`. We are considering setting this as the
|
222
|
+
default for the `ava-ont` mode as this seems to improve the contiguity for
|
223
|
+
nanopore read assembly (Loman, personal communication).
|
224
|
+
|
225
|
+
*Minimap2 doesn't work well with short-read overlap.*
|
226
|
+
|
227
|
+
### <a name="ov-eval"></a>Evaluating overlap sensitivity (for developers)
|
228
|
+
|
229
|
+
```sh
|
230
|
+
# read to reference mapping
|
231
|
+
minimap2 -cx map-pb ecoli_ref.fa ecoli_p6_25x_canu.fa > to-ref.paf
|
232
|
+
# evaluate overlap sensitivity
|
233
|
+
sort -k6,6 -k8,8n to-ref.paf | paftools.js ov-eval - overlap.paf
|
234
|
+
```
|
235
|
+
You can see that for PacBio reads, minimap2 achieves higher overlap sensitivity
|
236
|
+
with `-x ava-pb` (99% vs 93% with `-x ava-ont`).
|
237
|
+
|
238
|
+
|
239
|
+
|
240
|
+
[pbsim]: https://github.com/yukiteruono/pbsim2
|
241
|
+
[mason2]: https://github.com/seqan/seqan/tree/master/apps/mason2
|
242
|
+
[paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
|
243
|
+
[v2.10]: https://github.com/lh3/minimap2/releases/tag/v2.10
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#include <math.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <assert.h>
|
5
|
+
#include "mmpriv.h"
|
6
|
+
|
7
|
+
static inline int32_t get_for_qpos(int32_t qlen, const mm128_t *a)
|
8
|
+
{
|
9
|
+
int32_t x = (int32_t)a->y;
|
10
|
+
int32_t q_span = a->y>>32 & 0xff;
|
11
|
+
if (a->x>>63)
|
12
|
+
x = qlen - 1 - (x + 1 - q_span); // revert the position to the forward strand of query
|
13
|
+
return x;
|
14
|
+
}
|
15
|
+
|
16
|
+
// Binary-search mini_pos[0..n) for the entry whose low 32 bits equal the
// forward-strand query position of anchor a. Returns its index, or -1 if
// absent. The search assumes those low 32 bits are in ascending order.
static int get_mini_idx(int qlen, const mm128_t *a, int32_t n, const uint64_t *mini_pos)
{
	int32_t lo = 0, hi = n - 1;
	const int32_t target = get_for_qpos(qlen, a);
	while (lo <= hi) { // binary search
		int32_t mid = ((uint64_t)lo + hi) >> 1;
		int32_t cur = (int32_t)mini_pos[mid];
		if (cur == target) return mid;
		if (cur < target) lo = mid + 1;
		else hi = mid - 1;
	}
	return -1;
}
|
29
|
+
|
30
|
+
// Estimate a per-region divergence and store it in regs[i].div.
//   mi:       index; only mi->seq[rid].len is read
//   qlen:     query length
//   n_regs:   number of entries in regs
//   regs:     regions to annotate; div is set to -1 when it cannot be estimated
//   a:        anchors referenced by regs[i].as / regs[i].cnt
//   n:        number of entries in mini_pos; nothing is done when n == 0
//   mini_pos: per-minimizer words: low 32 bits = query position, bits 32..39 = span
// For each region, the first anchor is located among the query minimizers, then
// the minimizers are walked to count how many coincide with consecutive anchors
// (n_match) out of those spanned (n_tot, plus one for each end that has more
// than avg_k bases left on both query and reference). The estimate is
// 1 - (n_match / n_tot)^(1 / avg_k), with avg_k the mean minimizer span.
void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos)
{
	int i;
	uint64_t sum_k = 0;
	float avg_k;

	if (n == 0) return;
	for (i = 0; i < n; ++i)
		sum_k += mini_pos[i] >> 32 & 0xff;
	avg_k = (float)sum_k / n;

	for (i = 0; i < n_regs; ++i) {
		mm_reg1_t *r = &regs[i];
		int32_t st, en, j, k, n_match, n_tot, l_ref;
		r->div = -1.0f;
		if (r->cnt == 0) continue;
		// on the reverse strand the chain's last anchor comes first on the forward query
		st = en = get_mini_idx(qlen, r->rev? &a[r->as + r->cnt - 1] : &a[r->as], n, mini_pos);
		if (st < 0) {
			if (mm_verbose >= 2)
				fprintf(stderr, "[WARNING] logic inconsistency in mm_est_err(). Please contact the developer.\n");
			continue;
		}
		l_ref = mi->seq[r->rid].len;
		for (k = 1, j = st + 1, n_match = 1; j < n && k < r->cnt; ++j) {
			int32_t x;
			x = get_for_qpos(qlen, r->rev? &a[r->as + r->cnt - 1 - k] : &a[r->as + k]);
			if (x == (int32_t)mini_pos[j])
				++k, en = j, ++n_match;
		}
		n_tot = en - st + 1;
		if (r->qs > avg_k && r->rs > avg_k) ++n_tot;
		// NOTE(review): this end check uses r->qs rather than r->qe -- confirm it is intended
		if (qlen - r->qs > avg_k && l_ref - r->re > avg_k) ++n_tot;
		r->div = n_match >= n_tot? 0.0f : (float)(1.0 - pow((double)n_match / n_tot, 1.0 / avg_k));
	}
}
|
@@ -0,0 +1,63 @@
|
|
1
|
+
// To compile:
|
2
|
+
// gcc -g -O2 example.c libminimap2.a -lz
|
3
|
+
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include <assert.h>
|
6
|
+
#include <stdio.h>
|
7
|
+
#include <zlib.h>
|
8
|
+
#include "minimap.h"
|
9
|
+
#include "kseq.h"
|
10
|
+
KSEQ_INIT(gzFile, gzread)
|
11
|
+
|
12
|
+
int main(int argc, char *argv[])
|
13
|
+
{
|
14
|
+
mm_idxopt_t iopt;
|
15
|
+
mm_mapopt_t mopt;
|
16
|
+
int n_threads = 3;
|
17
|
+
|
18
|
+
mm_verbose = 2; // disable message output to stderr
|
19
|
+
mm_set_opt(0, &iopt, &mopt);
|
20
|
+
mopt.flag |= MM_F_CIGAR; // perform alignment
|
21
|
+
|
22
|
+
if (argc < 3) {
|
23
|
+
fprintf(stderr, "Usage: minimap2-lite <target.fa> <query.fa>\n");
|
24
|
+
return 1;
|
25
|
+
}
|
26
|
+
|
27
|
+
// open query file for reading; you may use your favorite FASTA/Q parser
|
28
|
+
gzFile f = gzopen(argv[2], "r");
|
29
|
+
assert(f);
|
30
|
+
kseq_t *ks = kseq_init(f);
|
31
|
+
|
32
|
+
// open index reader
|
33
|
+
mm_idx_reader_t *r = mm_idx_reader_open(argv[1], &iopt, 0);
|
34
|
+
mm_idx_t *mi;
|
35
|
+
while ((mi = mm_idx_reader_read(r, n_threads)) != 0) { // traverse each part of the index
|
36
|
+
mm_mapopt_update(&mopt, mi); // this sets the maximum minimizer occurrence; TODO: set a better default in mm_mapopt_init()!
|
37
|
+
mm_tbuf_t *tbuf = mm_tbuf_init(); // thread buffer; for multi-threading, allocate one tbuf for each thread
|
38
|
+
gzrewind(f);
|
39
|
+
kseq_rewind(ks);
|
40
|
+
while (kseq_read(ks) >= 0) { // each kseq_read() call reads one query sequence
|
41
|
+
mm_reg1_t *reg;
|
42
|
+
int j, i, n_reg;
|
43
|
+
reg = mm_map(mi, ks->seq.l, ks->seq.s, &n_reg, tbuf, &mopt, 0); // get all hits for the query
|
44
|
+
for (j = 0; j < n_reg; ++j) { // traverse hits and print them out
|
45
|
+
mm_reg1_t *r = ®[j];
|
46
|
+
assert(r->p); // with MM_F_CIGAR, this should not be NULL
|
47
|
+
printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]);
|
48
|
+
printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\tcg:Z:", mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re, r->mlen, r->blen, r->mapq);
|
49
|
+
for (i = 0; i < r->p->n_cigar; ++i) // IMPORTANT: this gives the CIGAR in the aligned regions. NO soft/hard clippings!
|
50
|
+
printf("%d%c", r->p->cigar[i]>>4, MM_CIGAR_STR[r->p->cigar[i]&0xf]);
|
51
|
+
putchar('\n');
|
52
|
+
free(r->p);
|
53
|
+
}
|
54
|
+
free(reg);
|
55
|
+
}
|
56
|
+
mm_tbuf_destroy(tbuf);
|
57
|
+
mm_idx_destroy(mi);
|
58
|
+
}
|
59
|
+
mm_idx_reader_close(r); // close the index reader
|
60
|
+
kseq_destroy(ks); // close the query file
|
61
|
+
gzclose(f);
|
62
|
+
return 0;
|
63
|
+
}
|