minimap2 0.2.22.0 → 0.2.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101):
  1. checksums.yaml +4 -4
  2. data/README.md +60 -76
  3. data/ext/Rakefile +55 -0
  4. data/ext/cmappy/cmappy.c +129 -0
  5. data/ext/cmappy/cmappy.h +44 -0
  6. data/ext/minimap2/FAQ.md +46 -0
  7. data/ext/minimap2/LICENSE.txt +24 -0
  8. data/ext/minimap2/MANIFEST.in +10 -0
  9. data/ext/minimap2/Makefile +132 -0
  10. data/ext/minimap2/Makefile.simde +97 -0
  11. data/ext/minimap2/NEWS.md +821 -0
  12. data/ext/minimap2/README.md +403 -0
  13. data/ext/minimap2/align.c +1020 -0
  14. data/ext/minimap2/bseq.c +169 -0
  15. data/ext/minimap2/bseq.h +64 -0
  16. data/ext/minimap2/code_of_conduct.md +30 -0
  17. data/ext/minimap2/cookbook.md +243 -0
  18. data/ext/minimap2/esterr.c +64 -0
  19. data/ext/minimap2/example.c +63 -0
  20. data/ext/minimap2/format.c +559 -0
  21. data/ext/minimap2/hit.c +466 -0
  22. data/ext/minimap2/index.c +775 -0
  23. data/ext/minimap2/kalloc.c +205 -0
  24. data/ext/minimap2/kalloc.h +76 -0
  25. data/ext/minimap2/kdq.h +132 -0
  26. data/ext/minimap2/ketopt.h +120 -0
  27. data/ext/minimap2/khash.h +615 -0
  28. data/ext/minimap2/krmq.h +474 -0
  29. data/ext/minimap2/kseq.h +256 -0
  30. data/ext/minimap2/ksort.h +153 -0
  31. data/ext/minimap2/ksw2.h +184 -0
  32. data/ext/minimap2/ksw2_dispatch.c +96 -0
  33. data/ext/minimap2/ksw2_extd2_sse.c +402 -0
  34. data/ext/minimap2/ksw2_exts2_sse.c +416 -0
  35. data/ext/minimap2/ksw2_extz2_sse.c +313 -0
  36. data/ext/minimap2/ksw2_ll_sse.c +152 -0
  37. data/ext/minimap2/kthread.c +159 -0
  38. data/ext/minimap2/kthread.h +15 -0
  39. data/ext/minimap2/kvec.h +105 -0
  40. data/ext/minimap2/lchain.c +369 -0
  41. data/ext/minimap2/main.c +459 -0
  42. data/ext/minimap2/map.c +714 -0
  43. data/ext/minimap2/minimap.h +410 -0
  44. data/ext/minimap2/minimap2.1 +725 -0
  45. data/ext/minimap2/misc/README.md +179 -0
  46. data/ext/minimap2/misc/mmphase.js +335 -0
  47. data/ext/minimap2/misc/paftools.js +3149 -0
  48. data/ext/minimap2/misc.c +162 -0
  49. data/ext/minimap2/mmpriv.h +132 -0
  50. data/ext/minimap2/options.c +234 -0
  51. data/ext/minimap2/pe.c +177 -0
  52. data/ext/minimap2/python/README.rst +196 -0
  53. data/ext/minimap2/python/cmappy.h +152 -0
  54. data/ext/minimap2/python/cmappy.pxd +153 -0
  55. data/ext/minimap2/python/mappy.pyx +273 -0
  56. data/ext/minimap2/python/minimap2.py +39 -0
  57. data/ext/minimap2/sdust.c +213 -0
  58. data/ext/minimap2/sdust.h +25 -0
  59. data/ext/minimap2/seed.c +131 -0
  60. data/ext/minimap2/setup.py +55 -0
  61. data/ext/minimap2/sketch.c +143 -0
  62. data/ext/minimap2/splitidx.c +84 -0
  63. data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
  64. data/ext/minimap2/test/MT-human.fa +278 -0
  65. data/ext/minimap2/test/MT-orang.fa +276 -0
  66. data/ext/minimap2/test/q-inv.fa +4 -0
  67. data/ext/minimap2/test/q2.fa +2 -0
  68. data/ext/minimap2/test/t-inv.fa +127 -0
  69. data/ext/minimap2/test/t2.fa +2 -0
  70. data/ext/minimap2/tex/Makefile +21 -0
  71. data/ext/minimap2/tex/bioinfo.cls +930 -0
  72. data/ext/minimap2/tex/blasr-mc.eval +17 -0
  73. data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
  74. data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
  75. data/ext/minimap2/tex/bwa.eval +55 -0
  76. data/ext/minimap2/tex/eval2roc.pl +33 -0
  77. data/ext/minimap2/tex/graphmap.eval +4 -0
  78. data/ext/minimap2/tex/hs38-simu.sh +10 -0
  79. data/ext/minimap2/tex/minialign.eval +49 -0
  80. data/ext/minimap2/tex/minimap2.bib +460 -0
  81. data/ext/minimap2/tex/minimap2.tex +724 -0
  82. data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
  83. data/ext/minimap2/tex/mm2-update.tex +240 -0
  84. data/ext/minimap2/tex/mm2.approx.eval +12 -0
  85. data/ext/minimap2/tex/mm2.eval +13 -0
  86. data/ext/minimap2/tex/natbib.bst +1288 -0
  87. data/ext/minimap2/tex/natbib.sty +803 -0
  88. data/ext/minimap2/tex/ngmlr.eval +38 -0
  89. data/ext/minimap2/tex/roc.gp +60 -0
  90. data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
  91. data/ext/minimap2.patch +19 -0
  92. data/lib/minimap2/aligner.rb +4 -4
  93. data/lib/minimap2/alignment.rb +11 -11
  94. data/lib/minimap2/ffi/constants.rb +20 -16
  95. data/lib/minimap2/ffi/functions.rb +5 -0
  96. data/lib/minimap2/ffi.rb +4 -5
  97. data/lib/minimap2/version.rb +2 -2
  98. data/lib/minimap2.rb +51 -15
  99. metadata +97 -79
  100. data/lib/minimap2/ffi_helper.rb +0 -53
  101. data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,55 @@
1
+ try:
2
+ from setuptools import setup, Extension
3
+ except ImportError:
4
+ from distutils.core import setup
5
+ from distutils.extension import Extension
6
+
7
+ import sys, platform
8
+
9
# Make the contents of the python/ sub-directory importable while building.
sys.path.append('python')

# Baseline build configuration shared by every platform.
include_dirs = ["."]
extra_compile_args = ['-DHAVE_KALLOC']

# Pick the SIMD flavour from the machine type reported by the interpreter.
_machine = platform.machine()
if _machine == "aarch64" or _machine == "arm64":
    # ARM64: add the sse2neon/ headers to the include path and define the SSE2-only macros.
    include_dirs.append("sse2neon/")
    extra_compile_args += ['-ftree-vectorize', '-DKSW_SSE2_ONLY', '-D__SSE2__']
else:
    # WARNING: ancient x86_64 CPUs don't have SSE4
    extra_compile_args.append('-msse4.1')
19
+
20
def readme():
    """Return the full text of python/README.rst (path is relative to the current directory)."""
    handle = open('python/README.rst')
    try:
        text = handle.read()
    finally:
        handle.close()
    return text
23
+
24
# Files compiled into the extension: the Cython wrapper first, then the C sources.
_mappy_sources = [
    'python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'seed.c', 'format.c',
    'hit.c', 'index.c', 'pe.c', 'options.c',
    'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c',
    'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c',
    'esterr.c', 'splitidx.c',
]

# Files whose modification should trigger a rebuild of the extension.
_mappy_depends = [
    'minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',
    'ksw2.h', 'kthread.h', 'kvec.h', 'mmpriv.h', 'sdust.h',
    'python/cmappy.h', 'python/cmappy.pxd',
]

# Trove classifiers published with the package metadata.
_mappy_classifiers = [
    'Development Status :: 5 - Production/Stable',
    'License :: OSI Approved :: MIT License',
    'Operating System :: POSIX',
    'Programming Language :: C',
    'Programming Language :: Cython',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Intended Audience :: Science/Research',
    'Topic :: Scientific/Engineering :: Bio-Informatics',
]

setup(
    name='mappy',
    version='2.24',
    url='https://github.com/lh3/minimap2',
    description='Minimap2 python binding',
    long_description=readme(),
    author='Heng Li',
    author_email='lh3@me.com',
    license='MIT',
    keywords='sequence-alignment',
    scripts=['python/minimap2.py'],
    ext_modules=[
        Extension(
            'mappy',
            sources=_mappy_sources,
            depends=_mappy_depends,
            extra_compile_args=extra_compile_args,
            include_dirs=include_dirs,
            libraries=['z', 'm', 'pthread'],
        ),
    ],
    classifiers=_mappy_classifiers,
    setup_requires=["cython"],
)
@@ -0,0 +1,143 @@
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+ #include <assert.h>
4
+ #include <string.h>
5
+ #define __STDC_LIMIT_MACROS
6
+ #include "kvec.h"
7
+ #include "mmpriv.h"
8
+
9
/*
 * Byte -> nucleotide code lookup table, indexed by the unsigned byte value:
 *   'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' and 'U'/'u' -> 3.
 * The raw byte values 0..3 map to themselves. Every other byte maps to 4,
 * which mm_sketch() treats as an ambiguous base (it only accepts codes < 4).
 * Each row below covers 16 consecutive byte values.
 */
unsigned char seq_nt4_table[256] = {
	0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
27
+
28
/**
 * Mix the bits of $key and return a value confined to $mask.
 *
 * The function alternates "multiply-like" steps (each reduced with $mask)
 * and xor-shift steps. All arithmetic is unsigned 64-bit, so it wraps
 * modulo 2^64; the multiplications below are therefore exactly equal to the
 * shift-and-add forms noted next to them.
 *
 * @param key   value to hash
 * @param mask  bit mask applied after every additive step; the result never
 *              has a bit set outside of $mask
 * @return      the hashed value
 */
static inline uint64_t hash64(uint64_t key, uint64_t mask)
{
	uint64_t h = key;

	h = ((h << 21) - h - 1) & mask;   // identical to (~h + (h << 21)) & mask
	h ^= h >> 24;
	h = (h * 265) & mask;             // h + (h << 3) + (h << 8)
	h ^= h >> 14;
	h = (h * 21) & mask;              // h + (h << 2) + (h << 4)
	h ^= h >> 28;
	h = (h * 0x80000001ULL) & mask;   // h + (h << 31)
	return h;
}
39
+
40
/*
 * A minimal fixed-capacity FIFO of ints (a simplified version of kdq).
 * Elements live in a 32-slot ring; indices wrap with "& 0x1f".
 */
typedef struct {
	int front, count; // index of the oldest element; number of stored elements
	int a[32];        // ring storage
} tiny_queue_t;

/* Append $x behind the newest element. No overflow check: the caller must keep count <= 32. */
static inline void tq_push(tiny_queue_t *q, int x)
{
	int slot = (q->front + q->count) & 0x1f;
	q->a[slot] = x;
	q->count++;
}

/* Remove and return the oldest element, or return -1 (leaving the queue untouched) if it is empty. */
static inline int tq_shift(tiny_queue_t *q)
{
	int head;
	if (q->count == 0)
		return -1;
	head = q->a[q->front];
	q->front = (q->front + 1) & 0x1f;
	q->count--;
	return head;
}
59
+
60
/**
 * Find symmetric (w,k)-minimizers on a DNA sequence
 *
 * The sequence is scanned once. For every base the forward and the
 * reverse-complement k-mer are updated; the smaller of the two is hashed
 * with hash64() and placed in a ring buffer holding the last $w k-mers.
 * The smallest entry of the buffer is the current minimizer and is appended
 * to $p when it is replaced or leaves the window.
 *
 * @param km     thread-local memory pool; using NULL falls back to malloc()
 * @param str    DNA sequence
 * @param len    length of $str; must be > 0
 * @param w      find a minimizer for every $w consecutive k-mers; 0 < w < 256
 * @param k      k-mer size; 0 < k <= 28
 * @param rid    reference ID; will be copied to the output $p array
 * @param is_hpc homopolymer-compressed or not
 * @param p      minimizers
 *               p->a[i].x = hash64(kMer)<<8 | kmerSpan
 *               p->a[i].y = rid<<32 | lastPos<<1 | strand
 *               where lastPos is the position of the last base of the i-th minimizer,
 *               and strand indicates whether the minimizer comes from the top or the bottom strand.
 *               Callers may want to set "p->n = 0"; otherwise results are appended to p
 *
 * The argument ranges are only enforced with assert(), i.e. not at all when
 * compiled with NDEBUG.
 */
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p)
{
	// shift1: bit offset of the most significant base of a k-mer; mask: keeps the low 2*k bits
	uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
	// l: number of usable bases seen since the last ambiguous base (symmetric k-mers are not counted)
	int i, j, l, buf_pos, min_pos, kmer_span = 0;
	// buf: ring buffer of the last $w k-mers; min: current minimizer (x == UINT64_MAX means "none")
	mm128_t buf[256], min = { UINT64_MAX, UINT64_MAX };
	tiny_queue_t tq; // with is_hpc: run lengths of the last homopolymer runs, used to maintain kmer_span

	assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
	memset(buf, 0xff, w * 16); // mark the first $w slots empty; NOTE(review): assumes sizeof(mm128_t) == 16 -- confirm
	memset(&tq, 0, sizeof(tiny_queue_t));
	kv_resize(mm128_t, km, *p, p->n + len/w); // presumably reserves room for about len/w more entries -- see kvec.h

	for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
		int c = seq_nt4_table[(uint8_t)str[i]];
		mm128_t info = { UINT64_MAX, UINT64_MAX }; // stays "empty" unless a full k-mer ends at this base
		if (c < 4) { // not an ambiguous base
			int z;
			if (is_hpc) {
				// collapse a run of identical bases into one base and remember the run length
				int skip_len = 1;
				if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
					for (skip_len = 2; i + skip_len < len; ++skip_len)
						if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
							break;
					i += skip_len - 1; // put $i at the end of the current homopolymer run
				}
				tq_push(&tq, skip_len);
				kmer_span += skip_len;
				if (tq.count > k) kmer_span -= tq_shift(&tq); // drop the run that left the k-mer
			} else kmer_span = l + 1 < k? l + 1 : k;
			kmer[0] = (kmer[0] << 2 | c) & mask;           // forward k-mer
			kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
			// NOTE: this continue also skips the ring-buffer update and the buf_pos increment below
			if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know their strand
			z = kmer[0] < kmer[1]? 0 : 1; // strand
			++l;
			if (l >= k && kmer_span < 256) { // the span is stored in the low 8 bits of x, hence < 256
				info.x = hash64(kmer[z], mask) << 8 | kmer_span;
				info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
			}
		} else l = 0, tq.count = tq.front = 0, kmer_span = 0; // ambiguous base: restart k-mer construction
		buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
		if (l == w + k - 1 && min.x != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet
			for (j = buf_pos + 1; j < w; ++j)
				if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
			for (j = 0; j < buf_pos; ++j)
				if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
		}
		if (info.x <= min.x) { // a new minimum; then write the old min
			if (l >= w + k && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
			min = info, min_pos = buf_pos;
		} else if (buf_pos == min_pos) { // old min has moved outside the window
			if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
			// rescan the whole ring buffer, oldest slot first, for the new minimum
			for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
				if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
			for (j = 0; j <= buf_pos; ++j)
				if (min.x >= buf[j].x) min = buf[j], min_pos = j;
			if (l >= w + k - 1 && min.x != UINT64_MAX) { // write identical k-mers
				for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
					if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
				for (j = 0; j <= buf_pos; ++j)
					if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
			}
		}
		if (++buf_pos == w) buf_pos = 0; // advance the ring buffer cursor
	}
	// flush the minimizer that is still pending at the end of the sequence
	if (min.x != UINT64_MAX)
		kv_push(mm128_t, km, *p, min);
}
@@ -0,0 +1,84 @@
1
+ #include <string.h>
2
+ #include <assert.h>
3
+ #include <stdlib.h>
4
+ #include <stdio.h>
5
+ #include <errno.h>
6
+ #include "mmpriv.h"
7
+
8
+ FILE *mm_split_init(const char *prefix, const mm_idx_t *mi)
9
+ {
10
+ char *fn;
11
+ FILE *fp;
12
+ uint32_t i, k = mi->k;
13
+ fn = (char*)calloc(strlen(prefix) + 10, 1);
14
+ sprintf(fn, "%s.%.4d.tmp", prefix, mi->index);
15
+ if ((fp = fopen(fn, "wb")) == NULL) {
16
+ if (mm_verbose >= 1)
17
+ fprintf(stderr, "[ERROR]\033[1;31m failed to write to temporary file '%s'\033[0m: %s\n", fn, strerror(errno));
18
+ exit(1);
19
+ }
20
+ mm_err_fwrite(&k, 4, 1, fp);
21
+ mm_err_fwrite(&mi->n_seq, 4, 1, fp);
22
+ for (i = 0; i < mi->n_seq; ++i) {
23
+ uint32_t l;
24
+ l = strlen(mi->seq[i].name);
25
+ mm_err_fwrite(&l, 1, 4, fp);
26
+ mm_err_fwrite(mi->seq[i].name, 1, l, fp);
27
+ mm_err_fwrite(&mi->seq[i].len, 4, 1, fp);
28
+ }
29
+ free(fn);
30
+ return fp;
31
+ }
32
+
33
+ mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part)
34
+ {
35
+ mm_idx_t *mi = 0;
36
+ char *fn;
37
+ int i, j;
38
+
39
+ if (n_splits < 1) return 0;
40
+ fn = CALLOC(char, strlen(prefix) + 10);
41
+ for (i = 0; i < n_splits; ++i) {
42
+ sprintf(fn, "%s.%.4d.tmp", prefix, i);
43
+ if ((fp[i] = fopen(fn, "rb")) == 0) {
44
+ if (mm_verbose >= 1)
45
+ fprintf(stderr, "ERROR: failed to open temporary file '%s': %s\n", fn, strerror(errno));
46
+ for (j = 0; j < i; ++j)
47
+ fclose(fp[j]);
48
+ free(fn);
49
+ return 0;
50
+ }
51
+ }
52
+ free(fn);
53
+
54
+ mi = CALLOC(mm_idx_t, 1);
55
+ for (i = 0; i < n_splits; ++i) {
56
+ mm_err_fread(&mi->k, 4, 1, fp[i]); // TODO: check if k is all the same
57
+ mm_err_fread(&n_seq_part[i], 4, 1, fp[i]);
58
+ mi->n_seq += n_seq_part[i];
59
+ }
60
+ mi->seq = CALLOC(mm_idx_seq_t, mi->n_seq);
61
+ for (i = j = 0; i < n_splits; ++i) {
62
+ uint32_t k;
63
+ for (k = 0; k < n_seq_part[i]; ++k, ++j) {
64
+ uint32_t l;
65
+ mm_err_fread(&l, 1, 4, fp[i]);
66
+ mi->seq[j].name = (char*)calloc(l + 1, 1);
67
+ mm_err_fread(mi->seq[j].name, 1, l, fp[i]);
68
+ mm_err_fread(&mi->seq[j].len, 4, 1, fp[i]);
69
+ }
70
+ }
71
+ return mi;
72
+ }
73
+
74
/**
 * Delete the temporary part files "<prefix>.<i>.tmp" for i in [0, n_splits).
 *
 * Removal is best effort: the result of remove() is ignored, so missing
 * files are skipped silently. Nothing is removed if the file name buffer
 * cannot be allocated.
 *
 * @param prefix    path prefix of the temporary files
 * @param n_splits  number of parts
 */
void mm_split_rm_tmp(const char *prefix, int n_splits)
{
	int i;
	char *fn;
	// ".", up to 11 characters for a signed 32-bit number, ".tmp" and NUL need
	// 17 bytes on top of the prefix; 32 leaves headroom. The previous "+ 10"
	// only fitted a 4-digit part number and overflowed for i >= 10000.
	size_t fn_cap = strlen(prefix) + 32;

	fn = (char*)calloc(fn_cap, 1);
	if (fn == NULL) return;
	for (i = 0; i < n_splits; ++i) {
		snprintf(fn, fn_cap, "%s.%.4d.tmp", prefix, i);
		remove(fn);
	}
	free(fn);
}