minimap2 0.2.22.0 → 0.2.24.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
try:
|
2
|
+
from setuptools import setup, Extension
|
3
|
+
except ImportError:
|
4
|
+
from distutils.core import setup
|
5
|
+
from distutils.extension import Extension
|
6
|
+
|
7
|
+
import sys, platform
|
8
|
+
|
9
|
+
sys.path.append('python')

# Compiler flags and header search paths handed to the Extension below.
extra_compile_args = ['-DHAVE_KALLOC']
include_dirs = ["."]

# ARM64 machines build against the bundled sse2neon headers; every other
# machine is compiled with -msse4.1.
if platform.machine() not in ("aarch64", "arm64"):
    extra_compile_args += ['-msse4.1']  # WARNING: ancient x86_64 CPUs don't have SSE4
else:
    include_dirs += ["sse2neon/"]
    extra_compile_args += ['-ftree-vectorize', '-DKSW_SSE2_ONLY', '-D__SSE2__']
|
19
|
+
|
20
|
+
def readme():
    """Return the contents of python/README.rst as text.

    The path is relative to the current working directory. The file is
    decoded explicitly as UTF-8 so the result does not depend on the
    locale's default encoding (a plain open() can raise UnicodeDecodeError
    on Python 3 under a non-UTF-8 locale).
    """
    import io  # io.open accepts encoding= on both Python 2.7 and Python 3
    with io.open('python/README.rst', encoding='utf-8') as f:
        return f.read()
|
23
|
+
|
24
|
+
# Package metadata plus the single C/Cython extension module "mappy".
# The order of `sources` is kept exactly as in the upstream script.
setup(
    name='mappy',
    version='2.24',
    url='https://github.com/lh3/minimap2',
    description='Minimap2 python binding',
    long_description=readme(),
    author='Heng Li',
    author_email='lh3@me.com',
    license='MIT',
    keywords='sequence-alignment',
    scripts=['python/minimap2.py'],
    ext_modules=[
        Extension(
            'mappy',
            sources=[
                'python/mappy.pyx',
                'align.c', 'bseq.c', 'lchain.c', 'seed.c', 'format.c',
                'hit.c', 'index.c', 'pe.c', 'options.c',
                'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c',
                'ksw2_extz2_sse.c', 'ksw2_ll_sse.c',
                'kalloc.c', 'kthread.c', 'map.c', 'misc.c',
                'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c',
            ],
            depends=[
                'minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h',
                'kseq.h', 'ksort.h',
                'ksw2.h', 'kthread.h', 'kvec.h', 'mmpriv.h', 'sdust.h',
                'python/cmappy.h', 'python/cmappy.pxd',
            ],
            extra_compile_args=extra_compile_args,
            include_dirs=include_dirs,
            libraries=['z', 'm', 'pthread'],
        ),
    ],
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'License :: OSI Approved :: MIT License',
        'Operating System :: POSIX',
        'Programming Language :: C',
        'Programming Language :: Cython',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
    ],
    setup_requires=["cython"],
)
|
@@ -0,0 +1,143 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <assert.h>
|
4
|
+
#include <string.h>
|
5
|
+
#define __STDC_LIMIT_MACROS
|
6
|
+
#include "kvec.h"
|
7
|
+
#include "mmpriv.h"
|
8
|
+
|
9
|
+
// Byte -> 2-bit nucleotide code lookup. Values below 4 are unambiguous bases;
// 4 marks everything else. Indices 0..3 map to themselves, and the letters
// A/a, C/c, G/g, T/t and U/u map to 0, 1, 2, 3 and 3 respectively.
unsigned char seq_nt4_table[256] = {
	/* 0x00 */ 0, 1, 2, 3,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x10 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x20 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x30 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x40 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, // A C G
	/* 0x50 */ 4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, // T U
	/* 0x60 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, // a c g
	/* 0x70 */ 4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, // t u
	/* 0x80 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0x90 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xa0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xb0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xc0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xd0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xe0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
	/* 0xf0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};
|
27
|
+
|
28
|
+
// Mix $key into a hash value confined to the bits set in $mask.
// Every multiply/add step is reduced with $mask; the xor-shift steps only
// fold higher bits downwards, so the result never leaves the mask.
static inline uint64_t hash64(uint64_t key, uint64_t mask)
{
	uint64_t h = key;
	h = (~h + (h << 21)) & mask; // same as (h << 21) - h - 1
	h ^= h >> 24;
	h = (h * 265) & mask;        // h + (h << 3) + (h << 8)
	h ^= h >> 14;
	h = (h * 21) & mask;         // h + (h << 2) + (h << 4)
	h ^= h >> 28;
	h = (h + (h << 31)) & mask;
	return h;
}
|
39
|
+
|
40
|
+
// Fixed-capacity ring buffer of ints (a simplified version of kdq).
// $front is the slot of the oldest element and $count the number stored;
// slot indices wrap modulo 32. No overflow check is performed on push.
typedef struct {
	int front, count;
	int a[32];
} tiny_queue_t;

// Append $x behind the newest element.
static inline void tq_push(tiny_queue_t *q, int x)
{
	int slot = (q->front + q->count) & 0x1f;
	q->a[slot] = x;
	++q->count;
}

// Remove and return the oldest element, or -1 if the queue is empty.
static inline int tq_shift(tiny_queue_t *q)
{
	int head;
	if (q->count == 0) return -1;
	head = q->a[q->front];
	q->front = (q->front + 1) & 0x1f;
	q->count -= 1;
	return head;
}
|
59
|
+
|
60
|
+
/**
|
61
|
+
* Find symmetric (w,k)-minimizers on a DNA sequence
|
62
|
+
*
|
63
|
+
* @param km thread-local memory pool; using NULL falls back to malloc()
|
64
|
+
* @param str DNA sequence
|
65
|
+
* @param len length of $str
|
66
|
+
* @param w find a minimizer for every $w consecutive k-mers
|
67
|
+
* @param k k-mer size
|
68
|
+
* @param rid reference ID; will be copied to the output $p array
|
69
|
+
* @param is_hpc homopolymer-compressed or not
|
70
|
+
* @param p minimizers
|
71
|
+
* p->a[i].x = kMer<<8 | kmerSpan
|
72
|
+
* p->a[i].y = rid<<32 | lastPos<<1 | strand
|
73
|
+
* where lastPos is the position of the last base of the i-th minimizer,
|
74
|
+
* and strand indicates whether the minimizer comes from the top or the bottom strand.
|
75
|
+
* Callers may want to set "p->n = 0"; otherwise results are appended to p
|
76
|
+
*/
|
77
|
+
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p)
|
78
|
+
{
|
79
|
+
uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
|
80
|
+
int i, j, l, buf_pos, min_pos, kmer_span = 0;
|
81
|
+
mm128_t buf[256], min = { UINT64_MAX, UINT64_MAX };
|
82
|
+
tiny_queue_t tq;
|
83
|
+
|
84
|
+
assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
|
85
|
+
memset(buf, 0xff, w * 16);
|
86
|
+
memset(&tq, 0, sizeof(tiny_queue_t));
|
87
|
+
kv_resize(mm128_t, km, *p, p->n + len/w);
|
88
|
+
|
89
|
+
for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
|
90
|
+
int c = seq_nt4_table[(uint8_t)str[i]];
|
91
|
+
mm128_t info = { UINT64_MAX, UINT64_MAX };
|
92
|
+
if (c < 4) { // not an ambiguous base
|
93
|
+
int z;
|
94
|
+
if (is_hpc) {
|
95
|
+
int skip_len = 1;
|
96
|
+
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
|
97
|
+
for (skip_len = 2; i + skip_len < len; ++skip_len)
|
98
|
+
if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
|
99
|
+
break;
|
100
|
+
i += skip_len - 1; // put $i at the end of the current homopolymer run
|
101
|
+
}
|
102
|
+
tq_push(&tq, skip_len);
|
103
|
+
kmer_span += skip_len;
|
104
|
+
if (tq.count > k) kmer_span -= tq_shift(&tq);
|
105
|
+
} else kmer_span = l + 1 < k? l + 1 : k;
|
106
|
+
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
|
107
|
+
kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
|
108
|
+
if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
|
109
|
+
z = kmer[0] < kmer[1]? 0 : 1; // strand
|
110
|
+
++l;
|
111
|
+
if (l >= k && kmer_span < 256) {
|
112
|
+
info.x = hash64(kmer[z], mask) << 8 | kmer_span;
|
113
|
+
info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
|
114
|
+
}
|
115
|
+
} else l = 0, tq.count = tq.front = 0, kmer_span = 0;
|
116
|
+
buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
|
117
|
+
if (l == w + k - 1 && min.x != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet
|
118
|
+
for (j = buf_pos + 1; j < w; ++j)
|
119
|
+
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
|
120
|
+
for (j = 0; j < buf_pos; ++j)
|
121
|
+
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
|
122
|
+
}
|
123
|
+
if (info.x <= min.x) { // a new minimum; then write the old min
|
124
|
+
if (l >= w + k && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
|
125
|
+
min = info, min_pos = buf_pos;
|
126
|
+
} else if (buf_pos == min_pos) { // old min has moved outside the window
|
127
|
+
if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
|
128
|
+
for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
|
129
|
+
if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
|
130
|
+
for (j = 0; j <= buf_pos; ++j)
|
131
|
+
if (min.x >= buf[j].x) min = buf[j], min_pos = j;
|
132
|
+
if (l >= w + k - 1 && min.x != UINT64_MAX) { // write identical k-mers
|
133
|
+
for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
|
134
|
+
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
|
135
|
+
for (j = 0; j <= buf_pos; ++j)
|
136
|
+
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
|
137
|
+
}
|
138
|
+
}
|
139
|
+
if (++buf_pos == w) buf_pos = 0;
|
140
|
+
}
|
141
|
+
if (min.x != UINT64_MAX)
|
142
|
+
kv_push(mm128_t, km, *p, min);
|
143
|
+
}
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <stdio.h>
|
5
|
+
#include <errno.h>
|
6
|
+
#include "mmpriv.h"
|
7
|
+
|
8
|
+
FILE *mm_split_init(const char *prefix, const mm_idx_t *mi)
|
9
|
+
{
|
10
|
+
char *fn;
|
11
|
+
FILE *fp;
|
12
|
+
uint32_t i, k = mi->k;
|
13
|
+
fn = (char*)calloc(strlen(prefix) + 10, 1);
|
14
|
+
sprintf(fn, "%s.%.4d.tmp", prefix, mi->index);
|
15
|
+
if ((fp = fopen(fn, "wb")) == NULL) {
|
16
|
+
if (mm_verbose >= 1)
|
17
|
+
fprintf(stderr, "[ERROR]\033[1;31m failed to write to temporary file '%s'\033[0m: %s\n", fn, strerror(errno));
|
18
|
+
exit(1);
|
19
|
+
}
|
20
|
+
mm_err_fwrite(&k, 4, 1, fp);
|
21
|
+
mm_err_fwrite(&mi->n_seq, 4, 1, fp);
|
22
|
+
for (i = 0; i < mi->n_seq; ++i) {
|
23
|
+
uint32_t l;
|
24
|
+
l = strlen(mi->seq[i].name);
|
25
|
+
mm_err_fwrite(&l, 1, 4, fp);
|
26
|
+
mm_err_fwrite(mi->seq[i].name, 1, l, fp);
|
27
|
+
mm_err_fwrite(&mi->seq[i].len, 4, 1, fp);
|
28
|
+
}
|
29
|
+
free(fn);
|
30
|
+
return fp;
|
31
|
+
}
|
32
|
+
|
33
|
+
mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part)
|
34
|
+
{
|
35
|
+
mm_idx_t *mi = 0;
|
36
|
+
char *fn;
|
37
|
+
int i, j;
|
38
|
+
|
39
|
+
if (n_splits < 1) return 0;
|
40
|
+
fn = CALLOC(char, strlen(prefix) + 10);
|
41
|
+
for (i = 0; i < n_splits; ++i) {
|
42
|
+
sprintf(fn, "%s.%.4d.tmp", prefix, i);
|
43
|
+
if ((fp[i] = fopen(fn, "rb")) == 0) {
|
44
|
+
if (mm_verbose >= 1)
|
45
|
+
fprintf(stderr, "ERROR: failed to open temporary file '%s': %s\n", fn, strerror(errno));
|
46
|
+
for (j = 0; j < i; ++j)
|
47
|
+
fclose(fp[j]);
|
48
|
+
free(fn);
|
49
|
+
return 0;
|
50
|
+
}
|
51
|
+
}
|
52
|
+
free(fn);
|
53
|
+
|
54
|
+
mi = CALLOC(mm_idx_t, 1);
|
55
|
+
for (i = 0; i < n_splits; ++i) {
|
56
|
+
mm_err_fread(&mi->k, 4, 1, fp[i]); // TODO: check if k is all the same
|
57
|
+
mm_err_fread(&n_seq_part[i], 4, 1, fp[i]);
|
58
|
+
mi->n_seq += n_seq_part[i];
|
59
|
+
}
|
60
|
+
mi->seq = CALLOC(mm_idx_seq_t, mi->n_seq);
|
61
|
+
for (i = j = 0; i < n_splits; ++i) {
|
62
|
+
uint32_t k;
|
63
|
+
for (k = 0; k < n_seq_part[i]; ++k, ++j) {
|
64
|
+
uint32_t l;
|
65
|
+
mm_err_fread(&l, 1, 4, fp[i]);
|
66
|
+
mi->seq[j].name = (char*)calloc(l + 1, 1);
|
67
|
+
mm_err_fread(mi->seq[j].name, 1, l, fp[i]);
|
68
|
+
mm_err_fread(&mi->seq[j].len, 4, 1, fp[i]);
|
69
|
+
}
|
70
|
+
}
|
71
|
+
return mi;
|
72
|
+
}
|
73
|
+
|
74
|
+
/**
 * Delete the temporary files "<prefix>.0000.tmp" .. "<prefix>.<n_splits-1>.tmp".
 *
 * Removal is best effort: the result of remove() is ignored, and nothing is
 * removed if the file-name buffer cannot be allocated.
 *
 * @param prefix    path prefix of the temporary files
 * @param n_splits  number of parts
 */
void mm_split_rm_tmp(const char *prefix, int n_splits)
{
	int i;
	char *fn;
	// "%.4d" is a minimum width: part numbers >= 10000 need more than four
	// characters, so size the buffer for any int instead of "+ 10".
	size_t fn_cap = strlen(prefix) + 32;

	fn = (char*)calloc(fn_cap, 1);
	if (fn == NULL) return;
	for (i = 0; i < n_splits; ++i) {
		snprintf(fn, fn_cap, "%s.%.4d.tmp", prefix, i);
		remove(fn);
	}
	free(fn);
}
|