minimap2 0.2.22.0 → 0.2.24.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +60 -76
- data/ext/Rakefile +55 -0
- data/ext/cmappy/cmappy.c +129 -0
- data/ext/cmappy/cmappy.h +44 -0
- data/ext/minimap2/FAQ.md +46 -0
- data/ext/minimap2/LICENSE.txt +24 -0
- data/ext/minimap2/MANIFEST.in +10 -0
- data/ext/minimap2/Makefile +132 -0
- data/ext/minimap2/Makefile.simde +97 -0
- data/ext/minimap2/NEWS.md +821 -0
- data/ext/minimap2/README.md +403 -0
- data/ext/minimap2/align.c +1020 -0
- data/ext/minimap2/bseq.c +169 -0
- data/ext/minimap2/bseq.h +64 -0
- data/ext/minimap2/code_of_conduct.md +30 -0
- data/ext/minimap2/cookbook.md +243 -0
- data/ext/minimap2/esterr.c +64 -0
- data/ext/minimap2/example.c +63 -0
- data/ext/minimap2/format.c +559 -0
- data/ext/minimap2/hit.c +466 -0
- data/ext/minimap2/index.c +775 -0
- data/ext/minimap2/kalloc.c +205 -0
- data/ext/minimap2/kalloc.h +76 -0
- data/ext/minimap2/kdq.h +132 -0
- data/ext/minimap2/ketopt.h +120 -0
- data/ext/minimap2/khash.h +615 -0
- data/ext/minimap2/krmq.h +474 -0
- data/ext/minimap2/kseq.h +256 -0
- data/ext/minimap2/ksort.h +153 -0
- data/ext/minimap2/ksw2.h +184 -0
- data/ext/minimap2/ksw2_dispatch.c +96 -0
- data/ext/minimap2/ksw2_extd2_sse.c +402 -0
- data/ext/minimap2/ksw2_exts2_sse.c +416 -0
- data/ext/minimap2/ksw2_extz2_sse.c +313 -0
- data/ext/minimap2/ksw2_ll_sse.c +152 -0
- data/ext/minimap2/kthread.c +159 -0
- data/ext/minimap2/kthread.h +15 -0
- data/ext/minimap2/kvec.h +105 -0
- data/ext/minimap2/lchain.c +369 -0
- data/ext/minimap2/main.c +459 -0
- data/ext/minimap2/map.c +714 -0
- data/ext/minimap2/minimap.h +410 -0
- data/ext/minimap2/minimap2.1 +725 -0
- data/ext/minimap2/misc/README.md +179 -0
- data/ext/minimap2/misc/mmphase.js +335 -0
- data/ext/minimap2/misc/paftools.js +3149 -0
- data/ext/minimap2/misc.c +162 -0
- data/ext/minimap2/mmpriv.h +132 -0
- data/ext/minimap2/options.c +234 -0
- data/ext/minimap2/pe.c +177 -0
- data/ext/minimap2/python/README.rst +196 -0
- data/ext/minimap2/python/cmappy.h +152 -0
- data/ext/minimap2/python/cmappy.pxd +153 -0
- data/ext/minimap2/python/mappy.pyx +273 -0
- data/ext/minimap2/python/minimap2.py +39 -0
- data/ext/minimap2/sdust.c +213 -0
- data/ext/minimap2/sdust.h +25 -0
- data/ext/minimap2/seed.c +131 -0
- data/ext/minimap2/setup.py +55 -0
- data/ext/minimap2/sketch.c +143 -0
- data/ext/minimap2/splitidx.c +84 -0
- data/ext/minimap2/sse2neon/emmintrin.h +1689 -0
- data/ext/minimap2/test/MT-human.fa +278 -0
- data/ext/minimap2/test/MT-orang.fa +276 -0
- data/ext/minimap2/test/q-inv.fa +4 -0
- data/ext/minimap2/test/q2.fa +2 -0
- data/ext/minimap2/test/t-inv.fa +127 -0
- data/ext/minimap2/test/t2.fa +2 -0
- data/ext/minimap2/tex/Makefile +21 -0
- data/ext/minimap2/tex/bioinfo.cls +930 -0
- data/ext/minimap2/tex/blasr-mc.eval +17 -0
- data/ext/minimap2/tex/bowtie2-s3.sam.eval +28 -0
- data/ext/minimap2/tex/bwa-s3.sam.eval +52 -0
- data/ext/minimap2/tex/bwa.eval +55 -0
- data/ext/minimap2/tex/eval2roc.pl +33 -0
- data/ext/minimap2/tex/graphmap.eval +4 -0
- data/ext/minimap2/tex/hs38-simu.sh +10 -0
- data/ext/minimap2/tex/minialign.eval +49 -0
- data/ext/minimap2/tex/minimap2.bib +460 -0
- data/ext/minimap2/tex/minimap2.tex +724 -0
- data/ext/minimap2/tex/mm2-s3.sam.eval +62 -0
- data/ext/minimap2/tex/mm2-update.tex +240 -0
- data/ext/minimap2/tex/mm2.approx.eval +12 -0
- data/ext/minimap2/tex/mm2.eval +13 -0
- data/ext/minimap2/tex/natbib.bst +1288 -0
- data/ext/minimap2/tex/natbib.sty +803 -0
- data/ext/minimap2/tex/ngmlr.eval +38 -0
- data/ext/minimap2/tex/roc.gp +60 -0
- data/ext/minimap2/tex/snap-s3.sam.eval +62 -0
- data/ext/minimap2.patch +19 -0
- data/lib/minimap2/aligner.rb +4 -4
- data/lib/minimap2/alignment.rb +11 -11
- data/lib/minimap2/ffi/constants.rb +20 -16
- data/lib/minimap2/ffi/functions.rb +5 -0
- data/lib/minimap2/ffi.rb +4 -5
- data/lib/minimap2/version.rb +2 -2
- data/lib/minimap2.rb +51 -15
- metadata +97 -79
- data/lib/minimap2/ffi_helper.rb +0 -53
- data/vendor/libminimap2.so +0 -0
@@ -0,0 +1,775 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#if defined(WIN32) || defined(_WIN32)
|
4
|
+
#include <io.h> // for open(2)
|
5
|
+
#else
|
6
|
+
#include <unistd.h>
|
7
|
+
#endif
|
8
|
+
#include <fcntl.h>
|
9
|
+
#include <stdio.h>
|
10
|
+
#define __STDC_LIMIT_MACROS
|
11
|
+
#include "kthread.h"
|
12
|
+
#include "bseq.h"
|
13
|
+
#include "minimap.h"
|
14
|
+
#include "mmpriv.h"
|
15
|
+
#include "kvec.h"
|
16
|
+
#include "khash.h"
|
17
|
+
|
18
|
+
#define idx_hash(a) ((a)>>1)
|
19
|
+
#define idx_eq(a, b) ((a)>>1 == (b)>>1)
|
20
|
+
KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
|
21
|
+
typedef khash_t(idx) idxhash_t;
|
22
|
+
|
23
|
+
KHASH_MAP_INIT_STR(str, uint32_t)
|
24
|
+
|
25
|
+
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
|
26
|
+
|
27
|
+
typedef struct mm_idx_bucket_s {
|
28
|
+
mm128_v a; // (minimizer, position) array
|
29
|
+
int32_t n; // size of the _p_ array
|
30
|
+
uint64_t *p; // position array for minimizers appearing >1 times
|
31
|
+
void *h; // hash table indexing _p_ and minimizers appearing once
|
32
|
+
} mm_idx_bucket_t;
|
33
|
+
|
34
|
+
typedef struct {
|
35
|
+
int32_t st, en, max; // max is not used for now
|
36
|
+
int32_t score:30, strand:2;
|
37
|
+
} mm_idx_intv1_t;
|
38
|
+
|
39
|
+
typedef struct mm_idx_intv_s {
|
40
|
+
int32_t n, m;
|
41
|
+
mm_idx_intv1_t *a;
|
42
|
+
} mm_idx_intv_t;
|
43
|
+
|
44
|
+
mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
|
45
|
+
{
|
46
|
+
mm_idx_t *mi;
|
47
|
+
if (k*2 < b) b = k * 2;
|
48
|
+
if (w < 1) w = 1;
|
49
|
+
mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
|
50
|
+
mi->w = w, mi->k = k, mi->b = b, mi->flag = flag;
|
51
|
+
mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
|
52
|
+
if (!(mm_dbg_flag & 1)) mi->km = km_init();
|
53
|
+
return mi;
|
54
|
+
}
|
55
|
+
|
56
|
+
void mm_idx_destroy(mm_idx_t *mi)
|
57
|
+
{
|
58
|
+
uint32_t i;
|
59
|
+
if (mi == 0) return;
|
60
|
+
if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h);
|
61
|
+
if (mi->B) {
|
62
|
+
for (i = 0; i < 1U<<mi->b; ++i) {
|
63
|
+
free(mi->B[i].p);
|
64
|
+
free(mi->B[i].a.a);
|
65
|
+
kh_destroy(idx, (idxhash_t*)mi->B[i].h);
|
66
|
+
}
|
67
|
+
}
|
68
|
+
if (mi->I) {
|
69
|
+
for (i = 0; i < mi->n_seq; ++i)
|
70
|
+
free(mi->I[i].a);
|
71
|
+
free(mi->I);
|
72
|
+
}
|
73
|
+
if (!mi->km) {
|
74
|
+
for (i = 0; i < mi->n_seq; ++i)
|
75
|
+
free(mi->seq[i].name);
|
76
|
+
free(mi->seq);
|
77
|
+
} else km_destroy(mi->km);
|
78
|
+
free(mi->B); free(mi->S); free(mi);
|
79
|
+
}
|
80
|
+
|
81
|
+
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
|
82
|
+
{
|
83
|
+
int mask = (1<<mi->b) - 1;
|
84
|
+
khint_t k;
|
85
|
+
mm_idx_bucket_t *b = &mi->B[minier&mask];
|
86
|
+
idxhash_t *h = (idxhash_t*)b->h;
|
87
|
+
*n = 0;
|
88
|
+
if (h == 0) return 0;
|
89
|
+
k = kh_get(idx, h, minier>>mi->b<<1);
|
90
|
+
if (k == kh_end(h)) return 0;
|
91
|
+
if (kh_key(h, k)&1) { // special casing when there is only one k-mer
|
92
|
+
*n = 1;
|
93
|
+
return &kh_val(h, k);
|
94
|
+
} else {
|
95
|
+
*n = (uint32_t)kh_val(h, k);
|
96
|
+
return &b->p[kh_val(h, k)>>32];
|
97
|
+
}
|
98
|
+
}
|
99
|
+
|
100
|
+
void mm_idx_stat(const mm_idx_t *mi)
|
101
|
+
{
|
102
|
+
int n = 0, n1 = 0;
|
103
|
+
uint32_t i;
|
104
|
+
uint64_t sum = 0, len = 0;
|
105
|
+
fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; is_hpc: %d; #seq: %d\n", __func__, mi->k, mi->w, mi->flag&MM_I_HPC, mi->n_seq);
|
106
|
+
for (i = 0; i < mi->n_seq; ++i)
|
107
|
+
len += mi->seq[i].len;
|
108
|
+
for (i = 0; i < 1U<<mi->b; ++i)
|
109
|
+
if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
|
110
|
+
for (i = 0; i < 1U<<mi->b; ++i) {
|
111
|
+
idxhash_t *h = (idxhash_t*)mi->B[i].h;
|
112
|
+
khint_t k;
|
113
|
+
if (h == 0) continue;
|
114
|
+
for (k = 0; k < kh_end(h); ++k)
|
115
|
+
if (kh_exist(h, k)) {
|
116
|
+
sum += kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
|
117
|
+
if (kh_key(h, k)&1) ++n1;
|
118
|
+
}
|
119
|
+
}
|
120
|
+
fprintf(stderr, "[M::%s::%.3f*%.2f] distinct minimizers: %d (%.2f%% are singletons); average occurrences: %.3lf; average spacing: %.3lf; total length: %ld\n",
|
121
|
+
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), n, 100.0*n1/n, (double)sum / n, (double)len / sum, (long)len);
|
122
|
+
}
|
123
|
+
|
124
|
+
int mm_idx_index_name(mm_idx_t *mi)
|
125
|
+
{
|
126
|
+
khash_t(str) *h;
|
127
|
+
uint32_t i;
|
128
|
+
int has_dup = 0, absent;
|
129
|
+
if (mi->h) return 0;
|
130
|
+
h = kh_init(str);
|
131
|
+
for (i = 0; i < mi->n_seq; ++i) {
|
132
|
+
khint_t k;
|
133
|
+
k = kh_put(str, h, mi->seq[i].name, &absent);
|
134
|
+
if (absent) kh_val(h, k) = i;
|
135
|
+
else has_dup = 1;
|
136
|
+
}
|
137
|
+
mi->h = h;
|
138
|
+
if (has_dup && mm_verbose >= 2)
|
139
|
+
fprintf(stderr, "[WARNING] some database sequences have identical sequence names\n");
|
140
|
+
return has_dup;
|
141
|
+
}
|
142
|
+
|
143
|
+
int mm_idx_name2id(const mm_idx_t *mi, const char *name)
|
144
|
+
{
|
145
|
+
khash_t(str) *h = (khash_t(str)*)mi->h;
|
146
|
+
khint_t k;
|
147
|
+
if (h == 0) return -2;
|
148
|
+
k = kh_get(str, h, name);
|
149
|
+
return k == kh_end(h)? -1 : kh_val(h, k);
|
150
|
+
}
|
151
|
+
|
152
|
+
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
|
153
|
+
{
|
154
|
+
uint64_t i, st1, en1;
|
155
|
+
if (rid >= mi->n_seq || st >= mi->seq[rid].len) return -1;
|
156
|
+
if (en > mi->seq[rid].len) en = mi->seq[rid].len;
|
157
|
+
st1 = mi->seq[rid].offset + st;
|
158
|
+
en1 = mi->seq[rid].offset + en;
|
159
|
+
for (i = st1; i < en1; ++i)
|
160
|
+
seq[i - st1] = mm_seq4_get(mi->S, i);
|
161
|
+
return en - st;
|
162
|
+
}
|
163
|
+
|
164
|
+
int mm_idx_getseq_rev(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
|
165
|
+
{
|
166
|
+
uint64_t i, st1, en1;
|
167
|
+
const mm_idx_seq_t *s;
|
168
|
+
if (rid >= mi->n_seq || st >= mi->seq[rid].len) return -1;
|
169
|
+
s = &mi->seq[rid];
|
170
|
+
if (en > s->len) en = s->len;
|
171
|
+
st1 = s->offset + (s->len - en);
|
172
|
+
en1 = s->offset + (s->len - st);
|
173
|
+
for (i = st1; i < en1; ++i) {
|
174
|
+
uint8_t c = mm_seq4_get(mi->S, i);
|
175
|
+
seq[en1 - i - 1] = c < 4? 3 - c : c;
|
176
|
+
}
|
177
|
+
return en - st;
|
178
|
+
}
|
179
|
+
|
180
|
+
int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
|
181
|
+
{
|
182
|
+
if (is_rev) return mm_idx_getseq_rev(mi, rid, st, en, seq);
|
183
|
+
else return mm_idx_getseq(mi, rid, st, en, seq);
|
184
|
+
}
|
185
|
+
|
186
|
+
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
|
187
|
+
{
|
188
|
+
int i;
|
189
|
+
size_t n = 0;
|
190
|
+
uint32_t thres;
|
191
|
+
khint_t *a, k;
|
192
|
+
if (f <= 0.) return INT32_MAX;
|
193
|
+
for (i = 0; i < 1<<mi->b; ++i)
|
194
|
+
if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
|
195
|
+
a = (uint32_t*)malloc(n * 4);
|
196
|
+
for (i = n = 0; i < 1<<mi->b; ++i) {
|
197
|
+
idxhash_t *h = (idxhash_t*)mi->B[i].h;
|
198
|
+
if (h == 0) continue;
|
199
|
+
for (k = 0; k < kh_end(h); ++k) {
|
200
|
+
if (!kh_exist(h, k)) continue;
|
201
|
+
a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
|
202
|
+
}
|
203
|
+
}
|
204
|
+
thres = ks_ksmall_uint32_t(n, a, (uint32_t)((1. - f) * n)) + 1;
|
205
|
+
free(a);
|
206
|
+
return thres;
|
207
|
+
}
|
208
|
+
|
209
|
+
/*********************************
|
210
|
+
* Sort and generate hash tables *
|
211
|
+
*********************************/
|
212
|
+
|
213
|
+
static void worker_post(void *g, long i, int tid)
|
214
|
+
{
|
215
|
+
int n, n_keys;
|
216
|
+
size_t j, start_a, start_p;
|
217
|
+
idxhash_t *h;
|
218
|
+
mm_idx_t *mi = (mm_idx_t*)g;
|
219
|
+
mm_idx_bucket_t *b = &mi->B[i];
|
220
|
+
if (b->a.n == 0) return;
|
221
|
+
|
222
|
+
// sort by minimizer
|
223
|
+
radix_sort_128x(b->a.a, b->a.a + b->a.n);
|
224
|
+
|
225
|
+
// count and preallocate
|
226
|
+
for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j) {
|
227
|
+
if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
|
228
|
+
++n_keys;
|
229
|
+
if (n > 1) b->n += n;
|
230
|
+
n = 1;
|
231
|
+
} else ++n;
|
232
|
+
}
|
233
|
+
h = kh_init(idx);
|
234
|
+
kh_resize(idx, h, n_keys);
|
235
|
+
b->p = (uint64_t*)calloc(b->n, 8);
|
236
|
+
|
237
|
+
// create the hash table
|
238
|
+
for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j) {
|
239
|
+
if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
|
240
|
+
khint_t itr;
|
241
|
+
int absent;
|
242
|
+
mm128_t *p = &b->a.a[j-1];
|
243
|
+
itr = kh_put(idx, h, p->x>>8>>mi->b<<1, &absent);
|
244
|
+
assert(absent && j == start_a + n);
|
245
|
+
if (n == 1) {
|
246
|
+
kh_key(h, itr) |= 1;
|
247
|
+
kh_val(h, itr) = p->y;
|
248
|
+
} else {
|
249
|
+
int k;
|
250
|
+
for (k = 0; k < n; ++k)
|
251
|
+
b->p[start_p + k] = b->a.a[start_a + k].y;
|
252
|
+
radix_sort_64(&b->p[start_p], &b->p[start_p + n]); // sort by position; needed as in-place radix_sort_128x() is not stable
|
253
|
+
kh_val(h, itr) = (uint64_t)start_p<<32 | n;
|
254
|
+
start_p += n;
|
255
|
+
}
|
256
|
+
start_a = j, n = 1;
|
257
|
+
} else ++n;
|
258
|
+
}
|
259
|
+
b->h = h;
|
260
|
+
assert(b->n == (int32_t)start_p);
|
261
|
+
|
262
|
+
// deallocate and clear b->a
|
263
|
+
kfree(0, b->a.a);
|
264
|
+
b->a.n = b->a.m = 0, b->a.a = 0;
|
265
|
+
}
|
266
|
+
|
267
|
+
static void mm_idx_post(mm_idx_t *mi, int n_threads)
|
268
|
+
{
|
269
|
+
kt_for(n_threads, worker_post, mi, 1<<mi->b);
|
270
|
+
}
|
271
|
+
|
272
|
+
/******************
|
273
|
+
* Generate index *
|
274
|
+
******************/
|
275
|
+
|
276
|
+
#include <string.h>
|
277
|
+
#include <zlib.h>
|
278
|
+
#include "bseq.h"
|
279
|
+
|
280
|
+
typedef struct {
|
281
|
+
int mini_batch_size;
|
282
|
+
uint64_t batch_size, sum_len;
|
283
|
+
mm_bseq_file_t *fp;
|
284
|
+
mm_idx_t *mi;
|
285
|
+
} pipeline_t;
|
286
|
+
|
287
|
+
typedef struct {
|
288
|
+
int n_seq;
|
289
|
+
mm_bseq1_t *seq;
|
290
|
+
mm128_v a;
|
291
|
+
} step_t;
|
292
|
+
|
293
|
+
static void mm_idx_add(mm_idx_t *mi, int n, const mm128_t *a)
|
294
|
+
{
|
295
|
+
int i, mask = (1<<mi->b) - 1;
|
296
|
+
for (i = 0; i < n; ++i) {
|
297
|
+
mm128_v *p = &mi->B[a[i].x>>8&mask].a;
|
298
|
+
kv_push(mm128_t, 0, *p, a[i]);
|
299
|
+
}
|
300
|
+
}
|
301
|
+
|
302
|
+
static void *worker_pipeline(void *shared, int step, void *in)
|
303
|
+
{
|
304
|
+
int i;
|
305
|
+
pipeline_t *p = (pipeline_t*)shared;
|
306
|
+
if (step == 0) { // step 0: read sequences
|
307
|
+
step_t *s;
|
308
|
+
if (p->sum_len > p->batch_size) return 0;
|
309
|
+
s = (step_t*)calloc(1, sizeof(step_t));
|
310
|
+
s->seq = mm_bseq_read(p->fp, p->mini_batch_size, 0, &s->n_seq); // read a mini-batch
|
311
|
+
if (s->seq) {
|
312
|
+
uint32_t old_m, m;
|
313
|
+
assert((uint64_t)p->mi->n_seq + s->n_seq <= UINT32_MAX); // to prevent integer overflow
|
314
|
+
// make room for p->mi->seq
|
315
|
+
old_m = p->mi->n_seq, m = p->mi->n_seq + s->n_seq;
|
316
|
+
kroundup32(m); kroundup32(old_m);
|
317
|
+
if (old_m != m)
|
318
|
+
p->mi->seq = (mm_idx_seq_t*)krealloc(p->mi->km, p->mi->seq, m * sizeof(mm_idx_seq_t));
|
319
|
+
// make room for p->mi->S
|
320
|
+
if (!(p->mi->flag & MM_I_NO_SEQ)) {
|
321
|
+
uint64_t sum_len, old_max_len, max_len;
|
322
|
+
for (i = 0, sum_len = 0; i < s->n_seq; ++i) sum_len += s->seq[i].l_seq;
|
323
|
+
old_max_len = (p->sum_len + 7) / 8;
|
324
|
+
max_len = (p->sum_len + sum_len + 7) / 8;
|
325
|
+
kroundup64(old_max_len); kroundup64(max_len);
|
326
|
+
if (old_max_len != max_len) {
|
327
|
+
p->mi->S = (uint32_t*)realloc(p->mi->S, max_len * 4);
|
328
|
+
memset(&p->mi->S[old_max_len], 0, 4 * (max_len - old_max_len));
|
329
|
+
}
|
330
|
+
}
|
331
|
+
// populate p->mi->seq
|
332
|
+
for (i = 0; i < s->n_seq; ++i) {
|
333
|
+
mm_idx_seq_t *seq = &p->mi->seq[p->mi->n_seq];
|
334
|
+
uint32_t j;
|
335
|
+
if (!(p->mi->flag & MM_I_NO_NAME)) {
|
336
|
+
seq->name = (char*)kmalloc(p->mi->km, strlen(s->seq[i].name) + 1);
|
337
|
+
strcpy(seq->name, s->seq[i].name);
|
338
|
+
} else seq->name = 0;
|
339
|
+
seq->len = s->seq[i].l_seq;
|
340
|
+
seq->offset = p->sum_len;
|
341
|
+
seq->is_alt = 0;
|
342
|
+
// copy the sequence
|
343
|
+
if (!(p->mi->flag & MM_I_NO_SEQ)) {
|
344
|
+
for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here
|
345
|
+
uint64_t o = p->sum_len + j;
|
346
|
+
int c = seq_nt4_table[(uint8_t)s->seq[i].seq[j]];
|
347
|
+
mm_seq4_set(p->mi->S, o, c);
|
348
|
+
}
|
349
|
+
}
|
350
|
+
// update p->sum_len and p->mi->n_seq
|
351
|
+
p->sum_len += seq->len;
|
352
|
+
s->seq[i].rid = p->mi->n_seq++;
|
353
|
+
}
|
354
|
+
return s;
|
355
|
+
} else free(s);
|
356
|
+
} else if (step == 1) { // step 1: compute sketch
|
357
|
+
step_t *s = (step_t*)in;
|
358
|
+
for (i = 0; i < s->n_seq; ++i) {
|
359
|
+
mm_bseq1_t *t = &s->seq[i];
|
360
|
+
if (t->l_seq > 0)
|
361
|
+
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a);
|
362
|
+
else if (mm_verbose >= 2)
|
363
|
+
fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name);
|
364
|
+
free(t->seq); free(t->name);
|
365
|
+
}
|
366
|
+
free(s->seq); s->seq = 0;
|
367
|
+
return s;
|
368
|
+
} else if (step == 2) { // dispatch sketch to buckets
|
369
|
+
step_t *s = (step_t*)in;
|
370
|
+
mm_idx_add(p->mi, s->a.n, s->a.a);
|
371
|
+
kfree(0, s->a.a); free(s);
|
372
|
+
}
|
373
|
+
return 0;
|
374
|
+
}
|
375
|
+
|
376
|
+
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int flag, int mini_batch_size, int n_threads, uint64_t batch_size)
|
377
|
+
{
|
378
|
+
pipeline_t pl;
|
379
|
+
if (fp == 0 || mm_bseq_eof(fp)) return 0;
|
380
|
+
memset(&pl, 0, sizeof(pipeline_t));
|
381
|
+
pl.mini_batch_size = (uint64_t)mini_batch_size < batch_size? mini_batch_size : batch_size;
|
382
|
+
pl.batch_size = batch_size;
|
383
|
+
pl.fp = fp;
|
384
|
+
pl.mi = mm_idx_init(w, k, b, flag);
|
385
|
+
|
386
|
+
kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
|
387
|
+
if (mm_verbose >= 3)
|
388
|
+
fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
|
389
|
+
|
390
|
+
mm_idx_post(pl.mi, n_threads);
|
391
|
+
if (mm_verbose >= 3)
|
392
|
+
fprintf(stderr, "[M::%s::%.3f*%.2f] sorted minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
|
393
|
+
|
394
|
+
return pl.mi;
|
395
|
+
}
|
396
|
+
|
397
|
+
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads) // a simpler interface; deprecated
|
398
|
+
{
|
399
|
+
mm_bseq_file_t *fp;
|
400
|
+
mm_idx_t *mi;
|
401
|
+
fp = mm_bseq_open(fn);
|
402
|
+
if (fp == 0) return 0;
|
403
|
+
mi = mm_idx_gen(fp, w, k, 14, flag, 1<<18, n_threads, UINT64_MAX);
|
404
|
+
mm_bseq_close(fp);
|
405
|
+
return mi;
|
406
|
+
}
|
407
|
+
|
408
|
+
mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name)
|
409
|
+
{
|
410
|
+
uint64_t sum_len = 0;
|
411
|
+
mm128_v a = {0,0,0};
|
412
|
+
mm_idx_t *mi;
|
413
|
+
khash_t(str) *h;
|
414
|
+
int i, flag = 0;
|
415
|
+
|
416
|
+
if (n <= 0) return 0;
|
417
|
+
for (i = 0; i < n; ++i) // get the total length
|
418
|
+
sum_len += strlen(seq[i]);
|
419
|
+
if (is_hpc) flag |= MM_I_HPC;
|
420
|
+
if (name == 0) flag |= MM_I_NO_NAME;
|
421
|
+
if (bucket_bits < 0) bucket_bits = 14;
|
422
|
+
mi = mm_idx_init(w, k, bucket_bits, flag);
|
423
|
+
mi->n_seq = n;
|
424
|
+
mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, n, sizeof(mm_idx_seq_t)); // ->seq is allocated from km
|
425
|
+
mi->S = (uint32_t*)calloc((sum_len + 7) / 8, 4);
|
426
|
+
mi->h = h = kh_init(str);
|
427
|
+
for (i = 0, sum_len = 0; i < n; ++i) {
|
428
|
+
const char *s = seq[i];
|
429
|
+
mm_idx_seq_t *p = &mi->seq[i];
|
430
|
+
uint32_t j;
|
431
|
+
if (name && name[i]) {
|
432
|
+
int absent;
|
433
|
+
p->name = (char*)kmalloc(mi->km, strlen(name[i]) + 1);
|
434
|
+
strcpy(p->name, name[i]);
|
435
|
+
kh_put(str, h, p->name, &absent);
|
436
|
+
assert(absent);
|
437
|
+
}
|
438
|
+
p->offset = sum_len;
|
439
|
+
p->len = strlen(s);
|
440
|
+
p->is_alt = 0;
|
441
|
+
for (j = 0; j < p->len; ++j) {
|
442
|
+
int c = seq_nt4_table[(uint8_t)s[j]];
|
443
|
+
uint64_t o = sum_len + j;
|
444
|
+
mm_seq4_set(mi->S, o, c);
|
445
|
+
}
|
446
|
+
sum_len += p->len;
|
447
|
+
if (p->len > 0) {
|
448
|
+
a.n = 0;
|
449
|
+
mm_sketch(0, s, p->len, w, k, i, is_hpc, &a);
|
450
|
+
mm_idx_add(mi, a.n, a.a);
|
451
|
+
}
|
452
|
+
}
|
453
|
+
free(a.a);
|
454
|
+
mm_idx_post(mi, 1);
|
455
|
+
return mi;
|
456
|
+
}
|
457
|
+
|
458
|
+
/*************
|
459
|
+
* index I/O *
|
460
|
+
*************/
|
461
|
+
|
462
|
+
void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
|
463
|
+
{
|
464
|
+
uint64_t sum_len = 0;
|
465
|
+
uint32_t x[5], i;
|
466
|
+
|
467
|
+
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq, x[4] = mi->flag;
|
468
|
+
fwrite(MM_IDX_MAGIC, 1, 4, fp);
|
469
|
+
fwrite(x, 4, 5, fp);
|
470
|
+
for (i = 0; i < mi->n_seq; ++i) {
|
471
|
+
if (mi->seq[i].name) {
|
472
|
+
uint8_t l = strlen(mi->seq[i].name);
|
473
|
+
fwrite(&l, 1, 1, fp);
|
474
|
+
fwrite(mi->seq[i].name, 1, l, fp);
|
475
|
+
} else {
|
476
|
+
uint8_t l = 0;
|
477
|
+
fwrite(&l, 1, 1, fp);
|
478
|
+
}
|
479
|
+
fwrite(&mi->seq[i].len, 4, 1, fp);
|
480
|
+
sum_len += mi->seq[i].len;
|
481
|
+
}
|
482
|
+
for (i = 0; i < 1<<mi->b; ++i) {
|
483
|
+
mm_idx_bucket_t *b = &mi->B[i];
|
484
|
+
khint_t k;
|
485
|
+
idxhash_t *h = (idxhash_t*)b->h;
|
486
|
+
uint32_t size = h? h->size : 0;
|
487
|
+
fwrite(&b->n, 4, 1, fp);
|
488
|
+
fwrite(b->p, 8, b->n, fp);
|
489
|
+
fwrite(&size, 4, 1, fp);
|
490
|
+
if (size == 0) continue;
|
491
|
+
for (k = 0; k < kh_end(h); ++k) {
|
492
|
+
uint64_t x[2];
|
493
|
+
if (!kh_exist(h, k)) continue;
|
494
|
+
x[0] = kh_key(h, k), x[1] = kh_val(h, k);
|
495
|
+
fwrite(x, 8, 2, fp);
|
496
|
+
}
|
497
|
+
}
|
498
|
+
if (!(mi->flag & MM_I_NO_SEQ))
|
499
|
+
fwrite(mi->S, 4, (sum_len + 7) / 8, fp);
|
500
|
+
fflush(fp);
|
501
|
+
}
|
502
|
+
|
503
|
+
mm_idx_t *mm_idx_load(FILE *fp)
|
504
|
+
{
|
505
|
+
char magic[4];
|
506
|
+
uint32_t x[5], i;
|
507
|
+
uint64_t sum_len = 0;
|
508
|
+
mm_idx_t *mi;
|
509
|
+
|
510
|
+
if (fread(magic, 1, 4, fp) != 4) return 0;
|
511
|
+
if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
|
512
|
+
if (fread(x, 4, 5, fp) != 5) return 0;
|
513
|
+
mi = mm_idx_init(x[0], x[1], x[2], x[4]);
|
514
|
+
mi->n_seq = x[3];
|
515
|
+
mi->seq = (mm_idx_seq_t*)kcalloc(mi->km, mi->n_seq, sizeof(mm_idx_seq_t));
|
516
|
+
for (i = 0; i < mi->n_seq; ++i) {
|
517
|
+
uint8_t l;
|
518
|
+
mm_idx_seq_t *s = &mi->seq[i];
|
519
|
+
fread(&l, 1, 1, fp);
|
520
|
+
if (l) {
|
521
|
+
s->name = (char*)kmalloc(mi->km, l + 1);
|
522
|
+
fread(s->name, 1, l, fp);
|
523
|
+
s->name[l] = 0;
|
524
|
+
}
|
525
|
+
fread(&s->len, 4, 1, fp);
|
526
|
+
s->offset = sum_len;
|
527
|
+
s->is_alt = 0;
|
528
|
+
sum_len += s->len;
|
529
|
+
}
|
530
|
+
for (i = 0; i < 1<<mi->b; ++i) {
|
531
|
+
mm_idx_bucket_t *b = &mi->B[i];
|
532
|
+
uint32_t j, size;
|
533
|
+
khint_t k;
|
534
|
+
idxhash_t *h;
|
535
|
+
fread(&b->n, 4, 1, fp);
|
536
|
+
b->p = (uint64_t*)malloc(b->n * 8);
|
537
|
+
fread(b->p, 8, b->n, fp);
|
538
|
+
fread(&size, 4, 1, fp);
|
539
|
+
if (size == 0) continue;
|
540
|
+
b->h = h = kh_init(idx);
|
541
|
+
kh_resize(idx, h, size);
|
542
|
+
for (j = 0; j < size; ++j) {
|
543
|
+
uint64_t x[2];
|
544
|
+
int absent;
|
545
|
+
fread(x, 8, 2, fp);
|
546
|
+
k = kh_put(idx, h, x[0], &absent);
|
547
|
+
assert(absent);
|
548
|
+
kh_val(h, k) = x[1];
|
549
|
+
}
|
550
|
+
}
|
551
|
+
if (!(mi->flag & MM_I_NO_SEQ)) {
|
552
|
+
mi->S = (uint32_t*)malloc((sum_len + 7) / 8 * 4);
|
553
|
+
fread(mi->S, 4, (sum_len + 7) / 8, fp);
|
554
|
+
}
|
555
|
+
return mi;
|
556
|
+
}
|
557
|
+
|
558
|
+
int64_t mm_idx_is_idx(const char *fn)
|
559
|
+
{
|
560
|
+
int fd, is_idx = 0;
|
561
|
+
int64_t ret, off_end;
|
562
|
+
char magic[4];
|
563
|
+
|
564
|
+
if (strcmp(fn, "-") == 0) return 0; // read from pipe; not an index
|
565
|
+
fd = open(fn, O_RDONLY);
|
566
|
+
if (fd < 0) return -1; // error
|
567
|
+
#ifdef WIN32
|
568
|
+
if ((off_end = _lseeki64(fd, 0, SEEK_END)) >= 4) {
|
569
|
+
_lseeki64(fd, 0, SEEK_SET);
|
570
|
+
#else
|
571
|
+
if ((off_end = lseek(fd, 0, SEEK_END)) >= 4) {
|
572
|
+
lseek(fd, 0, SEEK_SET);
|
573
|
+
#endif // WIN32
|
574
|
+
ret = read(fd, magic, 4);
|
575
|
+
if (ret == 4 && strncmp(magic, MM_IDX_MAGIC, 4) == 0)
|
576
|
+
is_idx = 1;
|
577
|
+
}
|
578
|
+
close(fd);
|
579
|
+
return is_idx? off_end : 0;
|
580
|
+
}
|
581
|
+
|
582
|
+
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out)
|
583
|
+
{
|
584
|
+
int64_t is_idx;
|
585
|
+
mm_idx_reader_t *r;
|
586
|
+
is_idx = mm_idx_is_idx(fn);
|
587
|
+
if (is_idx < 0) return 0; // failed to open the index
|
588
|
+
r = (mm_idx_reader_t*)calloc(1, sizeof(mm_idx_reader_t));
|
589
|
+
r->is_idx = is_idx;
|
590
|
+
if (opt) r->opt = *opt;
|
591
|
+
else mm_idxopt_init(&r->opt);
|
592
|
+
if (r->is_idx) {
|
593
|
+
r->fp.idx = fopen(fn, "rb");
|
594
|
+
r->idx_size = is_idx;
|
595
|
+
} else r->fp.seq = mm_bseq_open(fn);
|
596
|
+
if (fn_out) r->fp_out = fopen(fn_out, "wb");
|
597
|
+
return r;
|
598
|
+
}
|
599
|
+
|
600
|
+
void mm_idx_reader_close(mm_idx_reader_t *r)
|
601
|
+
{
|
602
|
+
if (r->is_idx) fclose(r->fp.idx);
|
603
|
+
else mm_bseq_close(r->fp.seq);
|
604
|
+
if (r->fp_out) fclose(r->fp_out);
|
605
|
+
free(r);
|
606
|
+
}
|
607
|
+
|
608
|
+
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads)
|
609
|
+
{
|
610
|
+
mm_idx_t *mi;
|
611
|
+
if (r->is_idx) {
|
612
|
+
mi = mm_idx_load(r->fp.idx);
|
613
|
+
if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || (mi->flag&MM_I_HPC) != (r->opt.flag&MM_I_HPC)))
|
614
|
+
fprintf(stderr, "[WARNING]\033[1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\033[0m\n");
|
615
|
+
} else
|
616
|
+
mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.flag, r->opt.mini_batch_size, n_threads, r->opt.batch_size);
|
617
|
+
if (mi) {
|
618
|
+
if (r->fp_out) mm_idx_dump(r->fp_out, mi);
|
619
|
+
mi->index = r->n_parts++;
|
620
|
+
}
|
621
|
+
return mi;
|
622
|
+
}
|
623
|
+
|
624
|
+
int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases, mm_bseq_eof() might not work
|
625
|
+
{
|
626
|
+
return r->is_idx? (feof(r->fp.idx) || ftell(r->fp.idx) == r->idx_size) : mm_bseq_eof(r->fp.seq);
|
627
|
+
}
|
628
|
+
|
629
|
+
#include <ctype.h>
|
630
|
+
#include <zlib.h>
|
631
|
+
#include "ksort.h"
|
632
|
+
#include "kseq.h"
|
633
|
+
KSTREAM_DECLARE(gzFile, gzread)
|
634
|
+
|
635
|
+
int mm_idx_alt_read(mm_idx_t *mi, const char *fn)
|
636
|
+
{
|
637
|
+
int n_alt = 0;
|
638
|
+
gzFile fp;
|
639
|
+
kstream_t *ks;
|
640
|
+
kstring_t str = {0,0,0};
|
641
|
+
fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
642
|
+
if (fp == 0) return -1;
|
643
|
+
ks = ks_init(fp);
|
644
|
+
if (mi->h == 0) mm_idx_index_name(mi);
|
645
|
+
while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
|
646
|
+
char *p;
|
647
|
+
int id;
|
648
|
+
for (p = str.s; *p && !isspace(*p); ++p) { }
|
649
|
+
*p = 0;
|
650
|
+
id = mm_idx_name2id(mi, str.s);
|
651
|
+
if (id >= 0) mi->seq[id].is_alt = 1, ++n_alt;
|
652
|
+
}
|
653
|
+
mi->n_alt = n_alt;
|
654
|
+
if (mm_verbose >= 3)
|
655
|
+
fprintf(stderr, "[M::%s] found %d ALT contigs\n", __func__, n_alt);
|
656
|
+
return n_alt;
|
657
|
+
}
|
658
|
+
|
659
|
+
#define sort_key_bed(a) ((a).st)
|
660
|
+
KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4)
|
661
|
+
|
662
|
+
mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc)
|
663
|
+
{
|
664
|
+
gzFile fp;
|
665
|
+
kstream_t *ks;
|
666
|
+
kstring_t str = {0,0,0};
|
667
|
+
mm_idx_intv_t *I;
|
668
|
+
|
669
|
+
fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
670
|
+
if (fp == 0) return 0;
|
671
|
+
I = (mm_idx_intv_t*)calloc(mi->n_seq, sizeof(*I));
|
672
|
+
ks = ks_init(fp);
|
673
|
+
while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
|
674
|
+
mm_idx_intv_t *r;
|
675
|
+
mm_idx_intv1_t t = {-1,-1,-1,-1,0};
|
676
|
+
char *p, *q, *bl, *bs;
|
677
|
+
int32_t i, id = -1, n_blk = 0;
|
678
|
+
for (p = q = str.s, i = 0;; ++p) {
|
679
|
+
if (*p == 0 || *p == '\t') {
|
680
|
+
int32_t c = *p;
|
681
|
+
*p = 0;
|
682
|
+
if (i == 0) { // chr
|
683
|
+
id = mm_idx_name2id(mi, q);
|
684
|
+
if (id < 0) break; // unknown name; TODO: throw a warning
|
685
|
+
} else if (i == 1) { // start
|
686
|
+
t.st = atol(q); // TODO: watch out integer overflow!
|
687
|
+
if (t.st < 0) break;
|
688
|
+
} else if (i == 2) { // end
|
689
|
+
t.en = atol(q);
|
690
|
+
if (t.en < 0) break;
|
691
|
+
} else if (i == 4) { // BED score
|
692
|
+
t.score = atol(q);
|
693
|
+
} else if (i == 5) { // strand
|
694
|
+
t.strand = *q == '+'? 1 : *q == '-'? -1 : 0;
|
695
|
+
} else if (i == 9) {
|
696
|
+
if (!isdigit(*q)) break;
|
697
|
+
n_blk = atol(q);
|
698
|
+
} else if (i == 10) {
|
699
|
+
bl = q;
|
700
|
+
} else if (i == 11) {
|
701
|
+
bs = q;
|
702
|
+
break;
|
703
|
+
}
|
704
|
+
if (c == 0) break;
|
705
|
+
++i, q = p + 1;
|
706
|
+
}
|
707
|
+
}
|
708
|
+
if (id < 0 || t.st < 0 || t.st >= t.en) continue;
|
709
|
+
r = &I[id];
|
710
|
+
if (i >= 11 && read_junc) { // BED12
|
711
|
+
int32_t st, sz, en;
|
712
|
+
st = strtol(bs, &bs, 10); ++bs;
|
713
|
+
sz = strtol(bl, &bl, 10); ++bl;
|
714
|
+
en = t.st + st + sz;
|
715
|
+
for (i = 1; i < n_blk; ++i) {
|
716
|
+
mm_idx_intv1_t s = t;
|
717
|
+
if (r->n == r->m) {
|
718
|
+
r->m = r->m? r->m + (r->m>>1) : 16;
|
719
|
+
r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m);
|
720
|
+
}
|
721
|
+
st = strtol(bs, &bs, 10); ++bs;
|
722
|
+
sz = strtol(bl, &bl, 10); ++bl;
|
723
|
+
s.st = en, s.en = t.st + st;
|
724
|
+
en = t.st + st + sz;
|
725
|
+
if (s.en > s.st) r->a[r->n++] = s;
|
726
|
+
}
|
727
|
+
} else {
|
728
|
+
if (r->n == r->m) {
|
729
|
+
r->m = r->m? r->m + (r->m>>1) : 16;
|
730
|
+
r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m);
|
731
|
+
}
|
732
|
+
r->a[r->n++] = t;
|
733
|
+
}
|
734
|
+
}
|
735
|
+
free(str.s);
|
736
|
+
ks_destroy(ks);
|
737
|
+
gzclose(fp);
|
738
|
+
return I;
|
739
|
+
}
|
740
|
+
|
741
|
+
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
|
742
|
+
{
|
743
|
+
int32_t i;
|
744
|
+
if (mi->h == 0) mm_idx_index_name(mi);
|
745
|
+
mi->I = mm_idx_read_bed(mi, fn, read_junc);
|
746
|
+
if (mi->I == 0) return -1;
|
747
|
+
for (i = 0; i < mi->n_seq; ++i) // TODO: eliminate redundant intervals
|
748
|
+
radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n);
|
749
|
+
return 0;
|
750
|
+
}
|
751
|
+
|
752
|
+
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s)
|
753
|
+
{
|
754
|
+
int32_t i, left, right;
|
755
|
+
mm_idx_intv_t *r;
|
756
|
+
memset(s, 0, en - st);
|
757
|
+
if (mi->I == 0 || ctg < 0 || ctg >= mi->n_seq) return -1;
|
758
|
+
r = &mi->I[ctg];
|
759
|
+
left = 0, right = r->n;
|
760
|
+
while (right > left) {
|
761
|
+
int32_t mid = left + ((right - left) >> 1);
|
762
|
+
if (r->a[mid].st >= st) right = mid;
|
763
|
+
else left = mid + 1;
|
764
|
+
}
|
765
|
+
for (i = left; i < r->n; ++i) {
|
766
|
+
if (st <= r->a[i].st && en >= r->a[i].en && r->a[i].strand != 0) {
|
767
|
+
if (r->a[i].strand > 0) {
|
768
|
+
s[r->a[i].st - st] |= 1, s[r->a[i].en - 1 - st] |= 2;
|
769
|
+
} else {
|
770
|
+
s[r->a[i].st - st] |= 8, s[r->a[i].en - 1 - st] |= 4;
|
771
|
+
}
|
772
|
+
}
|
773
|
+
}
|
774
|
+
return left;
|
775
|
+
}
|