minimap2 0.2.22.0 → 0.2.24.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
data/ext/minimap2/bseq.c
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
#include <zlib.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <assert.h>
|
5
|
+
#define __STDC_LIMIT_MACROS
|
6
|
+
#include "bseq.h"
|
7
|
+
#include "kvec.h"
|
8
|
+
#include "kseq.h"
|
9
|
+
KSEQ_INIT2(, gzFile, gzread)
|
10
|
+
|
11
|
+
// Byte-to-byte complement lookup. Letters map to their complement code
// (A<->T, C<->G, plus the ambiguity pairs such as R<->Y, K<->M, B<->V, D<->H),
// in both upper and lower case; U/u map to A/a. Every other byte maps to itself.
unsigned char seq_comp_table[256] = {
	/*   0 */   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
	/*  16 */  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
	/*  32 */  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
	/*  48 */  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
	/* @A-O */ 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O',
	/* P-Z  */ 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z',  91,  92,  93,  94,  95,
	/* `a-o */ 96, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o',
	/* p-z  */ 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127,
	/* 128 */ 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
	/* 144 */ 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
	/* 160 */ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
	/* 176 */ 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
	/* 192 */ 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
	/* 208 */ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
	/* 224 */ 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
	/* 240 */ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
};
|
29
|
+
|
30
|
+
#define CHECK_PAIR_THRES 1000000
|
31
|
+
|
32
|
+
struct mm_bseq_file_s {
|
33
|
+
gzFile fp;
|
34
|
+
kseq_t *ks;
|
35
|
+
mm_bseq1_t s;
|
36
|
+
};
|
37
|
+
|
38
|
+
mm_bseq_file_t *mm_bseq_open(const char *fn)
|
39
|
+
{
|
40
|
+
mm_bseq_file_t *fp;
|
41
|
+
gzFile f;
|
42
|
+
f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r");
|
43
|
+
if (f == 0) return 0;
|
44
|
+
fp = (mm_bseq_file_t*)calloc(1, sizeof(mm_bseq_file_t));
|
45
|
+
fp->fp = f;
|
46
|
+
fp->ks = kseq_init(fp->fp);
|
47
|
+
return fp;
|
48
|
+
}
|
49
|
+
|
50
|
+
// Close a reader returned by mm_bseq_open() and free everything it owns.
// Passing 0 (e.g. the result of a failed mm_bseq_open()) is a no-op.
void mm_bseq_close(mm_bseq_file_t *fp)
{
	if (fp == 0) return;
	// A read-ahead record may still be parked in fp->s (mm_bseq_read3() keeps the
	// first record of the next chunk there); its strings were heap-allocated by
	// kseq2bseq(). The slot is all-zero when empty, and free(0) is harmless.
	free(fp->s.name);
	free(fp->s.seq);
	free(fp->s.qual);
	free(fp->s.comment);
	kseq_destroy(fp->ks);
	gzclose(fp->fp);
	free(fp);
}
|
56
|
+
|
57
|
+
static inline char *kstrdup(const kstring_t *s)
|
58
|
+
{
|
59
|
+
char *t;
|
60
|
+
t = (char*)malloc(s->l + 1);
|
61
|
+
memcpy(t, s->s, s->l + 1);
|
62
|
+
return t;
|
63
|
+
}
|
64
|
+
|
65
|
+
// Copy the record currently held by the parser `ks` into `s`.
// name and seq are always duplicated; qual and comment are duplicated only when
// requested AND non-empty, otherwise they are set to 0. Every U/u in the copied
// sequence is rewritten to T/t. Warns on stderr if the record has an empty name.
static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual, int with_comment)
{
	int pos;
	if (ks->name.l == 0)
		fprintf(stderr, "[WARNING]\033[1;31m empty sequence name in the input.\033[0m\n");
	s->name = kstrdup(&ks->name);
	s->seq = kstrdup(&ks->seq);
	for (pos = 0; pos < (int)ks->seq.l; ++pos) {
		char *base = &s->seq[pos];
		if (*base == 'U') *base = 'T';
		else if (*base == 'u') *base = 't';
	}
	if (with_qual && ks->qual.l) s->qual = kstrdup(&ks->qual);
	else s->qual = 0;
	if (with_comment && ks->comment.l) s->comment = kstrdup(&ks->comment);
	else s->comment = 0;
	s->l_seq = ks->seq.l;
}
|
79
|
+
|
80
|
+
// Read one chunk of records from fp.
//   chunk_size:   stop once the summed sequence length reaches this value
//   with_qual:    keep quality strings
//   with_comment: keep comments
//   frag_mode:    after the chunk is full, keep reading records whose name matches
//                 the last one (per mm_qname_same()) so they stay in the same chunk
//   n_:           receives the number of records returned
// Returns the record array (0 when nothing was read). The first non-matching record
// read in frag_mode is parked in fp->s and becomes the first record of the next call.
mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_)
{
	kvec_t(mm_bseq1_t) recs = {0,0,0};
	kseq_t *reader = fp->ks;
	int64_t n_bases = 0;
	int rc;

	*n_ = 0;
	if (fp->s.seq) { // a record parked by the previous call opens this chunk
		kv_resize(mm_bseq1_t, 0, recs, 256);
		kv_push(mm_bseq1_t, 0, recs, fp->s);
		n_bases = fp->s.l_seq;
		memset(&fp->s, 0, sizeof(mm_bseq1_t));
	}
	for (;;) {
		mm_bseq1_t *slot;
		rc = kseq_read(reader);
		if (rc < 0) break;
		assert(reader->seq.l <= INT32_MAX);
		if (recs.m == 0) kv_resize(mm_bseq1_t, 0, recs, 256);
		kv_pushp(mm_bseq1_t, 0, recs, &slot);
		kseq2bseq(reader, slot, with_qual, with_comment);
		n_bases += slot->l_seq;
		if (n_bases < chunk_size) continue;
		// chunk is full; in fragment mode pull in the remaining records with the same name
		if (frag_mode && recs.a[recs.n-1].l_seq < CHECK_PAIR_THRES) {
			for (;;) {
				rc = kseq_read(reader);
				if (rc < 0) break;
				kseq2bseq(reader, &fp->s, with_qual, with_comment);
				if (!mm_qname_same(fp->s.name, recs.a[recs.n-1].name)) break; // stays parked in fp->s
				kv_push(mm_bseq1_t, 0, recs, fp->s);
				memset(&fp->s, 0, sizeof(mm_bseq1_t));
			}
		}
		break;
	}
	if (rc < -1) { // any code below -1 is reported as a parse failure
		if (recs.n) fprintf(stderr, "[WARNING]\033[1;31m failed to parse the FASTA/FASTQ record next to '%s'. Continue anyway.\033[0m\n", recs.a[recs.n-1].name);
		else fprintf(stderr, "[WARNING]\033[1;31m failed to parse the first FASTA/FASTQ record. Continue anyway.\033[0m\n");
	}
	*n_ = recs.n;
	return recs.a;
}
|
120
|
+
|
121
|
+
// Same as mm_bseq_read3() with comments discarded.
mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int frag_mode, int *n_)
{
	const int with_comment = 0;
	return mm_bseq_read3(fp, chunk_size, with_qual, with_comment, frag_mode, n_);
}
|
125
|
+
|
126
|
+
// Same as mm_bseq_read2() with fragment mode switched off.
mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int *n_)
{
	const int frag_mode = 0;
	return mm_bseq_read2(fp, chunk_size, with_qual, frag_mode, n_);
}
|
130
|
+
|
131
|
+
// Read one chunk from n_fp inputs in lock-step: each round takes one record from
// every input and appends them in input order, so the records of a round are adjacent.
//   chunk_size: stop once the summed sequence length reaches this value
//   n_:         receives the number of records returned
// Returns the record array (0 when nothing was read or n_fp < 1). Reading stops as
// soon as any input fails to deliver a record; if only some did, a warning is
// printed and the records of that partial round are dropped.
mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_)
{
	kvec_t(mm_bseq1_t) recs = {0,0,0};
	int64_t n_bases = 0;
	int k;

	*n_ = 0;
	if (n_fp < 1) return 0;
	do {
		int n_ok = 0;
		for (k = 0; k < n_fp; ++k) {
			if (kseq_read(fp[k]->ks) >= 0)
				n_ok += 1;
		}
		if (n_ok < n_fp) { // some file reaches the end
			if (n_ok > 0)
				fprintf(stderr, "[W::%s]\033[1;31m query files have different number of records; extra records skipped.\033[0m\n", __func__);
			break;
		}
		if (recs.m == 0) kv_resize(mm_bseq1_t, 0, recs, 256);
		for (k = 0; k < n_fp; ++k) {
			mm_bseq1_t *slot;
			kv_pushp(mm_bseq1_t, 0, recs, &slot);
			kseq2bseq(fp[k]->ks, slot, with_qual, with_comment);
			n_bases += slot->l_seq;
		}
	} while (n_bases < chunk_size);
	*n_ = recs.n;
	return recs.a;
}
|
160
|
+
|
161
|
+
// Same as mm_bseq_read_frag2() with comments discarded.
mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int *n_)
{
	const int with_comment = 0;
	return mm_bseq_read_frag2(n_fp, fp, chunk_size, with_qual, with_comment, n_);
}
|
165
|
+
|
166
|
+
// Return 1 when the underlying stream is at end-of-file and no read-ahead record
// is parked in fp->s; return 0 otherwise.
int mm_bseq_eof(mm_bseq_file_t *fp)
{
	if (fp->s.seq != 0) return 0; // a parked record is still to be delivered
	return ks_eof(fp->ks->f)? 1 : 0;
}
|
data/ext/minimap2/bseq.h
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
#ifndef MM_BSEQ_H
|
2
|
+
#define MM_BSEQ_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
#include <string.h>
|
6
|
+
|
7
|
+
#ifdef __cplusplus
|
8
|
+
extern "C" {
|
9
|
+
#endif
|
10
|
+
|
11
|
+
struct mm_bseq_file_s;
|
12
|
+
typedef struct mm_bseq_file_s mm_bseq_file_t;
|
13
|
+
|
14
|
+
// One sequence record as produced by the mm_bseq_read*() functions.
typedef struct {
	int l_seq;     // length of seq
	int rid;       // NOTE(review): not written by the reader code in bseq.c; presumably a record id set by callers
	char *name;    // record name
	char *seq;     // sequence
	char *qual;    // quality string, or 0 when absent / not requested
	char *comment; // comment, or 0 when absent / not requested
} mm_bseq1_t;
|
18
|
+
|
19
|
+
mm_bseq_file_t *mm_bseq_open(const char *fn);
|
20
|
+
void mm_bseq_close(mm_bseq_file_t *fp);
|
21
|
+
mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_);
|
22
|
+
mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int frag_mode, int *n_);
|
23
|
+
mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int *n_);
|
24
|
+
mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_);
|
25
|
+
mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int *n_);
|
26
|
+
int mm_bseq_eof(mm_bseq_file_t *fp);
|
27
|
+
|
28
|
+
extern unsigned char seq_nt4_table[256];
|
29
|
+
extern unsigned char seq_comp_table[256];
|
30
|
+
|
31
|
+
// Length of a query name with a trailing "/<digit>" suffix (e.g. "/1", "/2") left out.
// The suffix is only dropped when the name is at least three characters long.
static inline int mm_qname_len(const char *s)
{
	int len = strlen(s);
	if (len < 3) return len;
	if (s[len-2] != '/') return len;
	if (s[len-1] < '0' || s[len-1] > '9') return len;
	return len - 2;
}
|
37
|
+
|
38
|
+
// Return 1 if the two query names are equal once a trailing "/<digit>" suffix is
// ignored on each (see mm_qname_len()), 0 otherwise.
static inline int mm_qname_same(const char *s1, const char *s2)
{
	int stem1 = mm_qname_len(s1);
	int stem2 = mm_qname_len(s2);
	if (stem1 != stem2) return 0;
	return strncmp(s1, s2, stem1) == 0;
}
|
45
|
+
|
46
|
+
// Reverse-complement s->seq in place (bytes are complemented through seq_comp_table)
// and, when s->qual is set, reverse the quality string in place so it stays aligned
// with the sequence. Both are processed over s->l_seq bytes.
static inline void mm_revcomp_bseq(mm_bseq1_t *s)
{
	int i, l = s->l_seq;
	for (i = 0; i < l>>1; ++i) {
		// Read through uint8_t: where plain char is signed, a byte >= 0x80 would
		// otherwise turn into a negative index into seq_comp_table.
		uint8_t t = (uint8_t)s->seq[l - i - 1];
		s->seq[l - i - 1] = seq_comp_table[(uint8_t)s->seq[i]];
		s->seq[i] = seq_comp_table[t];
	}
	if (l&1) s->seq[l>>1] = seq_comp_table[(uint8_t)s->seq[l>>1]]; // middle base of an odd-length sequence
	if (s->qual) {
		for (i = 0; i < l>>1; ++i) {
			char q = s->qual[l - i - 1];
			s->qual[l - i - 1] = s->qual[i];
			s->qual[i] = q;
		}
	}
}
|
59
|
+
|
60
|
+
#ifdef __cplusplus
|
61
|
+
}
|
62
|
+
#endif
|
63
|
+
|
64
|
+
#endif
|
@@ -0,0 +1,30 @@
|
|
1
|
+
## Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, we pledge to respect all
|
4
|
+
people who contribute through reporting issues, posting feature requests,
|
5
|
+
updating documentation, submitting pull requests or patches, and other
|
6
|
+
activities.
|
7
|
+
|
8
|
+
We are committed to making participation in this project a harassment-free
|
9
|
+
experience for everyone, regardless of level of experience, gender, gender
|
10
|
+
identity and expression, sexual orientation, disability, personal appearance,
|
11
|
+
body size, race, age, or religion.
|
12
|
+
|
13
|
+
Examples of unacceptable behavior by participants include the use of sexual
|
14
|
+
language or imagery, derogatory comments or personal attacks, trolling, public
|
15
|
+
or private harassment, insults, or other unprofessional conduct.
|
16
|
+
|
17
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
18
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
19
|
+
that are not aligned to this Code of Conduct. Project maintainers or
|
20
|
+
contributors who do not follow the Code of Conduct may be removed from the
|
21
|
+
project team.
|
22
|
+
|
23
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
24
|
+
reported by opening an issue or contacting the maintainer via email.
|
25
|
+
|
26
|
+
This Code of Conduct is adapted from the [Contributor Covenant][cc], [version
|
27
|
+
1.0.0][v1].
|
28
|
+
|
29
|
+
[cc]: http://contributor-covenant.org/
|
30
|
+
[v1]: http://contributor-covenant.org/version/1/0/0/
|
@@ -0,0 +1,243 @@
|
|
1
|
+
## Table of Contents
|
2
|
+
|
3
|
+
- [Introduction & Installation](#intro)
|
4
|
+
- [Mapping Genomic Reads](#map-reads)
|
5
|
+
* [Mapping long reads](#map-pb)
|
6
|
+
* [Mapping Illumina paired-end reads](#map-sr)
|
7
|
+
* [Evaluating mapping accuracy with simulated reads (for developers)](#mapeval)
|
8
|
+
- [Mapping Long RNA-seq Reads](#map-rna)
|
9
|
+
* [Mapping Nanopore 2D cDNA reads](#map-ont-cdna-2d)
|
10
|
+
* [Mapping Nanopore direct-RNA reads](#map-direct-rna)
|
11
|
+
* [Mapping PacBio Iso-seq reads](#map-iso-seq)
|
12
|
+
- [Full-Genome Alignment](#genome-aln)
|
13
|
+
* [Intra-species assembly alignment](#asm-to-ref)
|
14
|
+
* [Cross-species full-genome alignment](#x-species)
|
15
|
+
* [Eyeballing alignment](#view-aln)
|
16
|
+
* [Calling variants from assembly-to-reference alignment](#asm-var)
|
17
|
+
* [Constructing self-homology map](#hom-map)
|
18
|
+
* [Lift Over (for developers)](#liftover)
|
19
|
+
- [Read Overlap](#read-overlap)
|
20
|
+
* [Long-read overlap](#long-read-overlap)
|
21
|
+
* [Evaluating overlap sensitivity (for developers)](#ov-eval)
|
22
|
+
|
23
|
+
## <a name="intro"></a>Introduction & Installation
|
24
|
+
|
25
|
+
This cookbook walks you through a variety of applications of minimap2 and its
|
26
|
+
companion script `paftools.js`. All data here are freely available from the
|
27
|
+
minimap2 release page at version tag [v2.10][v2.10]. Some examples only work
|
28
|
+
with v2.10 or later.
|
29
|
+
|
30
|
+
To acquire the data used in this cookbook and to install minimap2 and paftools,
|
31
|
+
please follow the command lines below:
|
32
|
+
```sh
|
33
|
+
# install minimap2 executables
|
34
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.24/minimap2-2.24_x64-linux.tar.bz2 | tar jxf -
|
35
|
+
cp minimap2-2.24_x64-linux/{minimap2,k8,paftools.js} . # copy executables
|
36
|
+
export PATH="$PATH:"`pwd` # put the current directory on PATH
|
37
|
+
# download example datasets
|
38
|
+
curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf -
|
39
|
+
```
|
40
|
+
|
41
|
+
## <a name="map-reads"></a>Mapping Genomic Reads
|
42
|
+
|
43
|
+
### <a name="map-pb"></a>Mapping long reads
|
44
|
+
```sh
|
45
|
+
minimap2 -ax map-pb -t4 ecoli_ref.fa ecoli_p6_25x_canu.fa > mapped.sam
|
46
|
+
```
|
47
|
+
Alternatively, you can create a minimap2 index first and then map:
|
48
|
+
```sh
|
49
|
+
minimap2 -x map-pb -d ecoli-pb.mmi ecoli_ref.fa # create an index
|
50
|
+
minimap2 -ax map-pb ecoli-pb.mmi ecoli_p6_25x_canu.fa > mapped.sam
|
51
|
+
```
|
52
|
+
This will save you a couple of minutes when you map against the human genome.
|
53
|
+
**HOWEVER**, key algorithm parameters such as the k-mer length and window
|
54
|
+
size can't be changed after indexing. Minimap2 will give you a warning if
|
55
|
+
parameters used in a pre-built index don't match parameters on the command
|
56
|
+
line. **Please always make sure you are using an intended pre-built index.**
|
57
|
+
|
58
|
+
### <a name="map-sr"></a>Mapping Illumina paired-end reads:
|
59
|
+
```sh
|
60
|
+
minimap2 -ax sr -t4 ecoli_ref.fa ecoli_mason_1.fq ecoli_mason_2.fq > mapped-sr.sam
|
61
|
+
```
|
62
|
+
|
63
|
+
### <a name="mapeval"></a>Evaluating mapping accuracy with simulated reads (for developers)
|
64
|
+
```sh
|
65
|
+
minimap2 -ax sr ecoli_ref.fa ecoli_mason_1.fq ecoli_mason_2.fq | paftools.js mapeval -
|
66
|
+
```
|
67
|
+
The output is:
|
68
|
+
```
|
69
|
+
Q 60 19712 0 0.000000000 19712
|
70
|
+
Q 0 282 219 0.010953286 19994
|
71
|
+
U 6
|
72
|
+
```
|
73
|
+
where a `U`-line gives the number of unmapped reads (for SAM input only); a
|
74
|
+
`Q`-line gives:
|
75
|
+
|
76
|
+
1. Mapping quality (mapQ) threshold
|
77
|
+
2. Number of mapped reads between this threshold and the previous mapQ threshold.
|
78
|
+
3. Number of wrong mappings in the same mapQ interval
|
79
|
+
4. Accumulative mapping error rate
|
80
|
+
5. Accumulative number of mappings
|
81
|
+
|
82
|
+
For `paftools.js mapeval` to work, you need to encode the true read positions
|
83
|
+
in read names in the right format. For [pbsim2][pbsim] and [mason2][mason2], we
|
84
|
+
provide scripts to generate the right format. Simulated reads in this cookbook
|
85
|
+
were created with the following command lines:
|
86
|
+
```sh
|
87
|
+
# in the pbsim2 source code directory:
|
88
|
+
src/pbsim --depth 1 --length-min 5000 --length-mean 20000 --accuracy-mean 0.95 --hmm_model data/R94.model ../ecoli_ref.fa
|
89
|
+
paftools.js pbsim2fq ../ecoli_ref.fa.fai sd_0001.maf > ../ecoli_pbsim.fa
|
90
|
+
|
91
|
+
# mason2 simulation
|
92
|
+
mason_simulator --illumina-prob-mismatch-scale 2.5 -ir ecoli_ref.fa -n 10000 -o tmp-l.fq -or tmp-r.fq -oa tmp.sam
|
93
|
+
paftools.js mason2fq tmp.sam | seqtk seq -1 > ecoli_mason_1.fq
|
94
|
+
paftools.js mason2fq tmp.sam | seqtk seq -2 > ecoli_mason_2.fq
|
95
|
+
```
|
96
|
+
|
97
|
+
|
98
|
+
|
99
|
+
## <a name="map-rna"></a>Mapping Long RNA-seq Reads
|
100
|
+
|
101
|
+
### <a name="map-ont-cdna-2d"></a>Mapping Nanopore 2D cDNA reads
|
102
|
+
```sh
|
103
|
+
minimap2 -ax splice SIRV_E2.fa SIRV_ont-cdna.fa > aln.sam
|
104
|
+
```
|
105
|
+
You can compare the alignment to the true annotations with:
|
106
|
+
```sh
|
107
|
+
paftools.js junceval SIRV_E2C.gtf aln.sam
|
108
|
+
```
|
109
|
+
It gives the percentage of introns found in the annotation. For SIRV data, it
|
110
|
+
is possible to achieve higher junction accuracy with
|
111
|
+
```sh
|
112
|
+
minimap2 -ax splice --splice-flank=no SIRV_E2.fa SIRV_ont-cdna.fa | paftools.js junceval SIRV_E2C.gtf
|
113
|
+
```
|
114
|
+
This is because minimap2 models one additional evolutionarily conserved base
|
115
|
+
around a canonical junction, but SIRV doesn't honor this signal. Option
|
116
|
+
`--splice-flank=no` asks minimap2 not to model this additional base.
|
117
|
+
|
118
|
+
In the output a tag `ts:A:+` indicates that the read strand is the same as the
|
119
|
+
transcript strand; `ts:A:-` indicates the read strand is opposite to the
|
120
|
+
transcript strand. This tag is inferred from the GT-AG signal and is thus only
|
121
|
+
available to spliced reads.
|
122
|
+
|
123
|
+
### <a name="map-direct-rna"></a>Mapping Nanopore direct-RNA reads
|
124
|
+
```sh
|
125
|
+
minimap2 -ax splice -k14 -uf SIRV_E2.fa SIRV_ont-drna.fa > aln.sam
|
126
|
+
```
|
127
|
+
Direct-RNA reads are noisier, so we use a shorter k-mer for improved
|
128
|
+
sensitivity. Here, option `-uf` forces minimap2 to map reads to the forward
|
129
|
+
transcript strand only because direct-RNA reads are stranded. Again, applying
|
130
|
+
`--splice-flank=no` helps junction accuracy for SIRV data.
|
131
|
+
|
132
|
+
### <a name="map-iso-seq"></a>Mapping PacBio Iso-seq reads
|
133
|
+
```sh
|
134
|
+
minimap2 -ax splice -uf -C5 SIRV_E2.fa SIRV_iso-seq.fq > aln.sam
|
135
|
+
```
|
136
|
+
Option `-C5` reduces the penalty on non-canonical splicing sites. It helps
|
137
|
+
to align such sites correctly for data with low error rate such as Iso-seq
|
138
|
+
reads and traditional cDNAs. On this example, minimap2 makes one junction
|
139
|
+
error. Applying `--splice-flank=no` fixes this alignment error.
|
140
|
+
|
141
|
+
Note that the command line above is optimized for the final Iso-seq reads.
|
142
|
+
PacBio's Iso-seq pipeline produces intermediate sequences at varying quality.
|
143
|
+
For example, some intermediate reads are not stranded. For these reads, option
|
144
|
+
`-uf` will lead to more errors. Please revise the minimap2 command line
|
145
|
+
accordingly.
|
146
|
+
|
147
|
+
|
148
|
+
|
149
|
+
## <a name="genome-aln"></a>Full-Genome Alignment
|
150
|
+
|
151
|
+
### <a name="asm-to-ref"></a>Intra-species assembly alignment
|
152
|
+
```sh
|
153
|
+
# option "--cs" is recommended as paftools.js may need it
|
154
|
+
minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa > ecoli_canu.paf
|
155
|
+
```
|
156
|
+
Here `ecoli_canu.fa` is the Canu assembly of `ecoli_p6_25x_canu.fa`. This
|
157
|
+
command line outputs alignments in the [PAF format][paf]. Use `-a` instead of
|
158
|
+
`-c` to get output in the SAM format.
|
159
|
+
|
160
|
+
### <a name="x-species"></a>Cross-species full-genome alignment
|
161
|
+
```sh
|
162
|
+
minimap2 -cx asm20 --cs ecoli_ref.fa ecoli_O104:H4.fa > ecoli_O104:H4.paf
|
163
|
+
sort -k6,6 -k8,8n ecoli_O104:H4.paf | paftools.js call -f ecoli_ref.fa -L10000 -l1000 - > out.vcf
|
164
|
+
```
|
165
|
+
Minimap2 has three presets for full-genome alignment: "asm5" for sequence
|
166
|
+
divergence below 1%, "asm10" for divergence around a couple of percent and
|
167
|
+
"asm20" for divergence not more than 10%. In theory, with the right setting,
|
168
|
+
minimap2 should work for sequence pairs with sequence divergence up to ~15%,
|
169
|
+
but this has not been carefully evaluated.
|
170
|
+
|
171
|
+
### <a name="view-aln"></a>Eyeballing alignment
|
172
|
+
```sh
|
173
|
+
# option "--cs" required; minimap2-r741 or higher required for the "asm20" preset
|
174
|
+
minimap2 -cx asm20 --cs ecoli_ref.fa ecoli_O104:H4.fa | paftools.js view - | less -S
|
175
|
+
```
|
176
|
+
This prints the alignment in a BLAST-like format.
|
177
|
+
|
178
|
+
### <a name="asm-var"></a>Calling variants from assembly-to-reference alignment
|
179
|
+
```sh
|
180
|
+
# don't forget the "--cs" option; otherwise it doesn't work
|
181
|
+
minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa \
|
182
|
+
| sort -k6,6 -k8,8n \
|
183
|
+
| paftools.js call -f ecoli_ref.fa - > out.vcf
|
184
|
+
```
|
185
|
+
Without option `-f`, `paftools.js call` outputs in a custom format. In this
|
186
|
+
format, lines starting with `R` give the regions covered by one contig only.
|
187
|
+
This information is not available in the VCF output.
|
188
|
+
|
189
|
+
### <a name="hom-map"></a>Constructing self-homology map
|
190
|
+
```sh
|
191
|
+
minimap2 -DP -k19 -w19 -m200 ecoli_ref.fa ecoli_ref.fa > out.paf
|
192
|
+
```
|
193
|
+
Option `-D` asks minimap2 to ignore anchors from perfect self match and `-P`
|
194
|
+
outputs all chains. For large genomes, we don't recommend performing base-level
|
195
|
+
alignment (with `-c`, `-a` or `--cs`) when `-P` is applied. This is because
|
196
|
+
base-alignment is slow and occasionally gives wrong alignments close to the
|
197
|
+
diagonal of a dotter plot. For E. coli, though, base-alignment is still fast.
|
198
|
+
|
199
|
+
### <a name="liftover"></a>Lift over (for developers)
|
200
|
+
```sh
|
201
|
+
minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa > ecoli_canu.paf
|
202
|
+
echo -e 'tig00000001\t200000\t300000' | paftools.js liftover ecoli_canu.paf -
|
203
|
+
```
|
204
|
+
This lifts over a region on query sequences to one or multiple regions on
|
205
|
+
reference sequences. Note that this paftools.js command may not be efficient
|
206
|
+
enough to lift millions of regions.
|
207
|
+
|
208
|
+
|
209
|
+
|
210
|
+
## <a name="read-overlap"></a>Read Overlap
|
211
|
+
|
212
|
+
### <a name="long-read-overlap"></a>Long read overlap
|
213
|
+
```sh
|
214
|
+
# For pacbio reads:
|
215
|
+
minimap2 -x ava-pb ecoli_p6_25x_canu.fa ecoli_p6_25x_canu.fa > overlap.paf
|
216
|
+
# For Nanopore reads (ava-ont also works with PacBio but not as good):
|
217
|
+
minimap2 -x ava-ont -r 10000 ecoli_p6_25x_canu.fa ecoli_p6_25x_canu.fa > overlap.paf
|
218
|
+
# If you have miniasm installed:
|
219
|
+
miniasm -f ecoli_p6_25x_canu.fa overlap.paf > asm.gfa
|
220
|
+
```
|
221
|
+
Here we explicitly applied `-r 10000`. We are considering setting this as the
|
222
|
+
default for the `ava-ont` mode as this seems to improve the contiguity for
|
223
|
+
nanopore read assembly (Loman, personal communication).
|
224
|
+
|
225
|
+
*Minimap2 doesn't work well with short-read overlap.*
|
226
|
+
|
227
|
+
### <a name="ov-eval"></a>Evaluating overlap sensitivity (for developers)
|
228
|
+
|
229
|
+
```sh
|
230
|
+
# read to reference mapping
|
231
|
+
minimap2 -cx map-pb ecoli_ref.fa ecoli_p6_25x_canu.fa > to-ref.paf
|
232
|
+
# evaluate overlap sensitivity
|
233
|
+
sort -k6,6 -k8,8n to-ref.paf | paftools.js ov-eval - overlap.paf
|
234
|
+
```
|
235
|
+
You can see that for PacBio reads, minimap2 achieves higher overlap sensitivity
|
236
|
+
with `-x ava-pb` (99% vs 93% with `-x ava-ont`).
|
237
|
+
|
238
|
+
|
239
|
+
|
240
|
+
[pbsim]: https://github.com/yukiteruono/pbsim2
|
241
|
+
[mason2]: https://github.com/seqan/seqan/tree/master/apps/mason2
|
242
|
+
[paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
|
243
|
+
[v2.10]: https://github.com/lh3/minimap2/releases/tag/v2.10
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#include <math.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <assert.h>
|
5
|
+
#include "mmpriv.h"
|
6
|
+
|
7
|
+
static inline int32_t get_for_qpos(int32_t qlen, const mm128_t *a)
|
8
|
+
{
|
9
|
+
int32_t x = (int32_t)a->y;
|
10
|
+
int32_t q_span = a->y>>32 & 0xff;
|
11
|
+
if (a->x>>63)
|
12
|
+
x = qlen - 1 - (x + 1 - q_span); // revert the position to the forward strand of query
|
13
|
+
return x;
|
14
|
+
}
|
15
|
+
|
16
|
+
// Binary-search mini_pos[0..n) for the entry whose low 32 bits equal the
// forward-strand query position of anchor `a`. Returns its index, or -1 if absent.
// The search relies on the low 32 bits of mini_pos being in ascending order.
static int get_mini_idx(int qlen, const mm128_t *a, int32_t n, const uint64_t *mini_pos)
{
	int32_t lo = 0, hi = n - 1;
	const int32_t target = get_for_qpos(qlen, a);
	while (lo <= hi) {
		int32_t mid = ((uint64_t)lo + hi) >> 1;
		int32_t probe = (int32_t)mini_pos[mid];
		if (probe == target) return mid;
		if (probe < target) lo = mid + 1;
		else hi = mid - 1;
	}
	return -1;
}
|
29
|
+
|
30
|
+
// Estimate a divergence value for every region in regs[] and store it in r->div.
//   mi:       index; only the reference lengths mi->seq[].len are read
//   qlen:     query length
//   regs:     n_regs regions; r->div is set to -1 when it cannot be estimated
//   a:        anchor array the regions index into via r->as / r->cnt
//   mini_pos: n entries; low 32 bits = query position, bits 32..39 = span
// For each region the function counts how many of the mini_pos entries between the
// region's first and last anchor are matched by an anchor (n_match out of n_tot) and
// sets div = 1 - (n_match / n_tot)^(1 / avg_k), where avg_k is the mean span.
void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos)
{
	int i;
	uint64_t sum_k = 0;
	float avg_k;

	if (n == 0) return;
	for (i = 0; i < n; ++i)
		sum_k += mini_pos[i] >> 32 & 0xff;
	avg_k = (float)sum_k / n;

	for (i = 0; i < n_regs; ++i) {
		mm_reg1_t *r = &regs[i];
		int32_t st, en, j, k, n_match, n_tot, l_ref;
		r->div = -1.0f;
		if (r->cnt == 0) continue;
		// locate the region's first anchor in query order (the last stored one if reversed)
		st = en = get_mini_idx(qlen, r->rev? &a[r->as + r->cnt - 1] : &a[r->as], n, mini_pos);
		if (st < 0) {
			if (mm_verbose >= 2)
				fprintf(stderr, "[WARNING] logic inconsistency in mm_est_err(). Please contact the developer.\n");
			continue;
		}
		l_ref = mi->seq[r->rid].len;
		// walk mini_pos forward; each entry that coincides with the next anchor is a match
		for (k = 1, j = st + 1, n_match = 1; j < n && k < r->cnt; ++j) {
			int32_t x;
			x = get_for_qpos(qlen, r->rev? &a[r->as + r->cnt - 1 - k] : &a[r->as + k]);
			if (x == (int32_t)mini_pos[j])
				++k, en = j, ++n_match;
		}
		n_tot = en - st + 1;
		// count one extra unmatched entry per end when more than avg_k bases are left
		// beyond that end on both the query and the reference
		if (r->qs > avg_k && r->rs > avg_k) ++n_tot;
		if (qlen - r->qs > avg_k && l_ref - r->re > avg_k) ++n_tot;
		r->div = n_match >= n_tot? 0.0f : (float)(1.0 - pow((double)n_match / n_tot, 1.0 / avg_k));
	}
}
|
@@ -0,0 +1,63 @@
|
|
1
|
+
// To compile:
|
2
|
+
// gcc -g -O2 example.c libminimap2.a -lz
|
3
|
+
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include <assert.h>
|
6
|
+
#include <stdio.h>
|
7
|
+
#include <zlib.h>
|
8
|
+
#include "minimap.h"
|
9
|
+
#include "kseq.h"
|
10
|
+
KSEQ_INIT(gzFile, gzread)
|
11
|
+
|
12
|
+
int main(int argc, char *argv[])
|
13
|
+
{
|
14
|
+
mm_idxopt_t iopt;
|
15
|
+
mm_mapopt_t mopt;
|
16
|
+
int n_threads = 3;
|
17
|
+
|
18
|
+
mm_verbose = 2; // disable message output to stderr
|
19
|
+
mm_set_opt(0, &iopt, &mopt);
|
20
|
+
mopt.flag |= MM_F_CIGAR; // perform alignment
|
21
|
+
|
22
|
+
if (argc < 3) {
|
23
|
+
fprintf(stderr, "Usage: minimap2-lite <target.fa> <query.fa>\n");
|
24
|
+
return 1;
|
25
|
+
}
|
26
|
+
|
27
|
+
// open query file for reading; you may use your favorite FASTA/Q parser
|
28
|
+
gzFile f = gzopen(argv[2], "r");
|
29
|
+
assert(f);
|
30
|
+
kseq_t *ks = kseq_init(f);
|
31
|
+
|
32
|
+
// open index reader
|
33
|
+
mm_idx_reader_t *r = mm_idx_reader_open(argv[1], &iopt, 0);
|
34
|
+
mm_idx_t *mi;
|
35
|
+
while ((mi = mm_idx_reader_read(r, n_threads)) != 0) { // traverse each part of the index
|
36
|
+
mm_mapopt_update(&mopt, mi); // this sets the maximum minimizer occurrence; TODO: set a better default in mm_mapopt_init()!
|
37
|
+
mm_tbuf_t *tbuf = mm_tbuf_init(); // thread buffer; for multi-threading, allocate one tbuf for each thread
|
38
|
+
gzrewind(f);
|
39
|
+
kseq_rewind(ks);
|
40
|
+
while (kseq_read(ks) >= 0) { // each kseq_read() call reads one query sequence
|
41
|
+
mm_reg1_t *reg;
|
42
|
+
int j, i, n_reg;
|
43
|
+
reg = mm_map(mi, ks->seq.l, ks->seq.s, &n_reg, tbuf, &mopt, 0); // get all hits for the query
|
44
|
+
for (j = 0; j < n_reg; ++j) { // traverse hits and print them out
|
45
|
+
mm_reg1_t *r = ®[j];
|
46
|
+
assert(r->p); // with MM_F_CIGAR, this should not be NULL
|
47
|
+
printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]);
|
48
|
+
printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\tcg:Z:", mi->seq[r->rid].name, mi->seq[r->rid].len, r->rs, r->re, r->mlen, r->blen, r->mapq);
|
49
|
+
for (i = 0; i < r->p->n_cigar; ++i) // IMPORTANT: this gives the CIGAR in the aligned regions. NO soft/hard clippings!
|
50
|
+
printf("%d%c", r->p->cigar[i]>>4, MM_CIGAR_STR[r->p->cigar[i]&0xf]);
|
51
|
+
putchar('\n');
|
52
|
+
free(r->p);
|
53
|
+
}
|
54
|
+
free(reg);
|
55
|
+
}
|
56
|
+
mm_tbuf_destroy(tbuf);
|
57
|
+
mm_idx_destroy(mi);
|
58
|
+
}
|
59
|
+
mm_idx_reader_close(r); // close the index reader
|
60
|
+
kseq_destroy(ks); // close the query file
|
61
|
+
gzclose(f);
|
62
|
+
return 0;
|
63
|
+
}
|