minimap2 0.2.22.0 → 0.2.24.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,55 @@
1
# Build script for the `mappy` extension module (Python binding of minimap2).
try:
    from setuptools import setup, Extension
except ImportError:
    # setuptools is not importable: fall back to distutils.
    from distutils.core import setup
    from distutils.extension import Extension

import sys, platform

sys.path.append('python')

include_dirs = ["."]
extra_compile_args = ['-DHAVE_KALLOC']

_on_arm64 = platform.machine() in ["aarch64", "arm64"]
if not _on_arm64:
    extra_compile_args.append('-msse4.1') # WARNING: ancient x86_64 CPUs don't have SSE4
else:
    # ARM64 builds add the bundled sse2neon directory to the include path.
    include_dirs.append("sse2neon/")
    extra_compile_args.extend(['-ftree-vectorize', '-DKSW_SSE2_ONLY', '-D__SSE2__'])


def readme():
    """Return the text of python/README.rst, used as the long description."""
    with open('python/README.rst') as f:
        return f.read()


# Cython wrapper followed by the C translation units compiled into the module.
_sources = [
    'python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'seed.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c',
    'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c',
    'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c',
]

# Files listed as dependencies of the extension.
_depends = [
    'minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',
    'ksw2.h', 'kthread.h', 'kvec.h', 'mmpriv.h', 'sdust.h',
    'python/cmappy.h', 'python/cmappy.pxd',
]

_classifiers = [
    'Development Status :: 5 - Production/Stable',
    'License :: OSI Approved :: MIT License',
    'Operating System :: POSIX',
    'Programming Language :: C',
    'Programming Language :: Cython',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Intended Audience :: Science/Research',
    'Topic :: Scientific/Engineering :: Bio-Informatics',
]

_mappy_ext = Extension(
    'mappy',
    sources = _sources,
    depends = _depends,
    extra_compile_args = extra_compile_args,
    include_dirs = include_dirs,
    libraries = ['z', 'm', 'pthread'],
)

setup(
    name = 'mappy',
    version = '2.24',
    url = 'https://github.com/lh3/minimap2',
    description = 'Minimap2 python binding',
    long_description = readme(),
    author = 'Heng Li',
    author_email = 'lh3@me.com',
    license = 'MIT',
    keywords = 'sequence-alignment',
    scripts = ['python/minimap2.py'],
    ext_modules = [_mappy_ext],
    classifiers = _classifiers,
    setup_requires=["cython"])
@@ -0,0 +1,143 @@
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+ #include <assert.h>
4
+ #include <string.h>
5
+ #define __STDC_LIMIT_MACROS
6
+ #include "kvec.h"
7
+ #include "mmpriv.h"
8
+
9
/*
 * Lookup table from a byte value to a nucleotide code.
 * 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' and 'U'/'u' -> 3;
 * byte values 0..3 map to themselves; every other byte maps to 4.
 */
unsigned char seq_nt4_table[256] = {
    /* 0x00 */ 0, 1, 2, 3,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x10 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x20 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x30 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x40 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,   /* A, C, G */
    /* 0x50 */ 4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,   /* T, U */
    /* 0x60 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,   /* a, c, g */
    /* 0x70 */ 4, 4, 4, 4,  3, 3, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,   /* t, u */
    /* 0x80 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0x90 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xa0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xb0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xc0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xd0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xe0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
    /* 0xf0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};
27
+
28
/*
 * Mix the bits of `key` through alternating multiply-like and xor-shift
 * rounds. The result of every additive round is truncated with `mask`, so
 * the returned value never has bits set outside `mask`.
 */
static inline uint64_t hash64(uint64_t key, uint64_t mask)
{
    uint64_t h = key;

    h = ((h << 21) - h - 1) & mask;     /* identical to ~h + (h << 21) in wrapping arithmetic */
    h ^= h >> 24;
    h = (h * 265) & mask;               /* h + (h << 3) + (h << 8) */
    h ^= h >> 14;
    h = (h * 21) & mask;                /* h + (h << 2) + (h << 4) */
    h ^= h >> 28;
    h = (h + (h << 31)) & mask;
    return h;
}
39
+
40
/* A simplified version of kdq: a fixed 32-slot circular queue of ints. */
typedef struct {
    int front;   /* index of the oldest element in a[] */
    int count;   /* number of elements currently stored */
    int a[32];
} tiny_queue_t;

/* Append x after the newest element; the slot index wraps modulo 32. */
static inline void tq_push(tiny_queue_t *q, int x)
{
    int slot = (q->front + q->count) & 0x1f;
    q->a[slot] = x;
    q->count += 1;
}

/* Remove and return the oldest element, or -1 when the queue is empty. */
static inline int tq_shift(tiny_queue_t *q)
{
    int head;
    if (q->count == 0)
        return -1;
    head = q->a[q->front];
    q->front = (q->front + 1) & 0x1f;
    q->count -= 1;
    return head;
}
59
+
60
/**
 * Find symmetric (w,k)-minimizers on a DNA sequence
 *
 * The arguments are checked with assert(): len > 0, 0 < w < 256 and 0 < k <= 28.
 *
 * @param km     thread-local memory pool; using NULL falls back to malloc()
 * @param str    DNA sequence
 * @param len    length of $str
 * @param w      find a minimizer for every $w consecutive k-mers
 * @param k      k-mer size
 * @param rid    reference ID; will be copied to the output $p array
 * @param is_hpc homopolymer-compressed or not
 * @param p      minimizers
 *               p->a[i].x = hash64(kMer)<<8 | kmerSpan
 *               p->a[i].y = rid<<32 | lastPos<<1 | strand
 *               where kMer is the smaller of the forward and reverse-complement k-mers,
 *               lastPos is the position of the last base of the i-th minimizer,
 *               and strand indicates whether the minimizer comes from the top or the bottom strand.
 *               Callers may want to set "p->n = 0"; otherwise results are appended to p
 */
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p)
{
    uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
    int i, j, l, buf_pos, min_pos, kmer_span = 0;
    mm128_t buf[256], min = { UINT64_MAX, UINT64_MAX }; // buf is used as a circular window of the last $w entries; UINT64_MAX marks "no k-mer"
    tiny_queue_t tq; // in HPC mode, holds the homopolymer run lengths of the bases in the current k-mer

    assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
    memset(buf, 0xff, w * 16); // set the first $w slots to all-ones bytes; 16 is presumably sizeof(mm128_t) - TODO confirm
    memset(&tq, 0, sizeof(tiny_queue_t));
    kv_resize(mm128_t, km, *p, p->n + len/w); // presumably reserves room for about len/w more entries - see kvec.h

    for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
        int c = seq_nt4_table[(uint8_t)str[i]];
        mm128_t info = { UINT64_MAX, UINT64_MAX }; // stays "empty" unless a full k-mer ends at this base
        if (c < 4) { // not an ambiguous base
            int z;
            if (is_hpc) {
                int skip_len = 1;
                if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
                    for (skip_len = 2; i + skip_len < len; ++skip_len)
                        if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
                            break;
                    i += skip_len - 1; // put $i at the end of the current homopolymer run
                }
                tq_push(&tq, skip_len);
                kmer_span += skip_len;
                if (tq.count > k) kmer_span -= tq_shift(&tq); // drop the run that has left the k-mer
            } else kmer_span = l + 1 < k? l + 1 : k;
            kmer[0] = (kmer[0] << 2 | c) & mask;           // forward k-mer
            kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
            if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know its strand; note that $l, buf and buf_pos are not updated either
            z = kmer[0] < kmer[1]? 0 : 1; // strand
            ++l;
            if (l >= k && kmer_span < 256) { // the span is stored in the low 8 bits of info.x, hence the < 256 limit
                info.x = hash64(kmer[z], mask) << 8 | kmer_span;
                info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
            }
        } else l = 0, tq.count = tq.front = 0, kmer_span = 0; // ambiguous base: restart the k-mer and empty the run-length queue
        buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
        if (l == w + k - 1 && min.x != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet
            for (j = buf_pos + 1; j < w; ++j)
                if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
            for (j = 0; j < buf_pos; ++j)
                if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
        }
        if (info.x <= min.x) { // a new minimum; then write the old min
            if (l >= w + k && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
            min = info, min_pos = buf_pos;
        } else if (buf_pos == min_pos) { // old min has moved outside the window
            if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
            for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
                if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
            for (j = 0; j <= buf_pos; ++j)
                if (min.x >= buf[j].x) min = buf[j], min_pos = j;
            if (l >= w + k - 1 && min.x != UINT64_MAX) { // write identical k-mers
                for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
                    if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
                for (j = 0; j <= buf_pos; ++j)
                    if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
            }
        }
        if (++buf_pos == w) buf_pos = 0; // advance the circular window
    }
    if (min.x != UINT64_MAX) // write the minimum of the last window, if there is one
        kv_push(mm128_t, km, *p, min);
}
@@ -0,0 +1,84 @@
1
+ #include <string.h>
2
+ #include <assert.h>
3
+ #include <stdlib.h>
4
+ #include <stdio.h>
5
+ #include <errno.h>
6
+ #include "mmpriv.h"
7
+
8
+ FILE *mm_split_init(const char *prefix, const mm_idx_t *mi)
9
+ {
10
+ char *fn;
11
+ FILE *fp;
12
+ uint32_t i, k = mi->k;
13
+ fn = (char*)calloc(strlen(prefix) + 10, 1);
14
+ sprintf(fn, "%s.%.4d.tmp", prefix, mi->index);
15
+ if ((fp = fopen(fn, "wb")) == NULL) {
16
+ if (mm_verbose >= 1)
17
+ fprintf(stderr, "[ERROR]\033[1;31m failed to write to temporary file '%s'\033[0m: %s\n", fn, strerror(errno));
18
+ exit(1);
19
+ }
20
+ mm_err_fwrite(&k, 4, 1, fp);
21
+ mm_err_fwrite(&mi->n_seq, 4, 1, fp);
22
+ for (i = 0; i < mi->n_seq; ++i) {
23
+ uint32_t l;
24
+ l = strlen(mi->seq[i].name);
25
+ mm_err_fwrite(&l, 1, 4, fp);
26
+ mm_err_fwrite(mi->seq[i].name, 1, l, fp);
27
+ mm_err_fwrite(&mi->seq[i].len, 4, 1, fp);
28
+ }
29
+ free(fn);
30
+ return fp;
31
+ }
32
+
33
+ mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part)
34
+ {
35
+ mm_idx_t *mi = 0;
36
+ char *fn;
37
+ int i, j;
38
+
39
+ if (n_splits < 1) return 0;
40
+ fn = CALLOC(char, strlen(prefix) + 10);
41
+ for (i = 0; i < n_splits; ++i) {
42
+ sprintf(fn, "%s.%.4d.tmp", prefix, i);
43
+ if ((fp[i] = fopen(fn, "rb")) == 0) {
44
+ if (mm_verbose >= 1)
45
+ fprintf(stderr, "ERROR: failed to open temporary file '%s': %s\n", fn, strerror(errno));
46
+ for (j = 0; j < i; ++j)
47
+ fclose(fp[j]);
48
+ free(fn);
49
+ return 0;
50
+ }
51
+ }
52
+ free(fn);
53
+
54
+ mi = CALLOC(mm_idx_t, 1);
55
+ for (i = 0; i < n_splits; ++i) {
56
+ mm_err_fread(&mi->k, 4, 1, fp[i]); // TODO: check if k is all the same
57
+ mm_err_fread(&n_seq_part[i], 4, 1, fp[i]);
58
+ mi->n_seq += n_seq_part[i];
59
+ }
60
+ mi->seq = CALLOC(mm_idx_seq_t, mi->n_seq);
61
+ for (i = j = 0; i < n_splits; ++i) {
62
+ uint32_t k;
63
+ for (k = 0; k < n_seq_part[i]; ++k, ++j) {
64
+ uint32_t l;
65
+ mm_err_fread(&l, 1, 4, fp[i]);
66
+ mi->seq[j].name = (char*)calloc(l + 1, 1);
67
+ mm_err_fread(mi->seq[j].name, 1, l, fp[i]);
68
+ mm_err_fread(&mi->seq[j].len, 4, 1, fp[i]);
69
+ }
70
+ }
71
+ return mi;
72
+ }
73
+
74
/**
 * Delete the temporary files "<prefix>.<i>.tmp" for i in [0, n_splits).
 *
 * The result of remove() is ignored, so files that do not exist are skipped
 * silently. Nothing is removed if the file-name buffer cannot be allocated.
 *
 * @param prefix    path prefix of the temporary files
 * @param n_splits  number of temporary files
 */
void mm_split_rm_tmp(const char *prefix, int n_splits)
{
    int i;
    char *fn;
    // "%.4d" is a minimum width only: split numbers >= 10000 print more than
    // four digits, so leave room for any int plus ".", ".tmp" and the NUL.
    size_t fn_cap = strlen(prefix) + 32;
    fn = (char*)calloc(fn_cap, 1);
    if (fn == NULL) return;
    for (i = 0; i < n_splits; ++i) {
        snprintf(fn, fn_cap, "%s.%.4d.tmp", prefix, i);
        remove(fn);
    }
    free(fn);
}