bio-bwa 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwtsw2_chain.c
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include "bwtsw2.h"
|
3
|
+
|
4
|
+
/* One seed hit (or one chain of seed hits) used by the chain filter. */
typedef struct {
	uint32_t tbeg;    /* target start, copied from bsw2hit_t.k */
	uint32_t tend;    /* target end, k + len */
	int qbeg;         /* query start, copied from bsw2hit_t.beg */
	int qend;         /* query end, copied from bsw2hit_t.end */
	uint32_t flag:1;  /* which of the two hit sets (b[0] or b[1]) the seed came from */
	uint32_t idx:31;  /* seed: index into b[flag]->hits; chain: global chain id */
	int chain;        /* seed: id of the chain it joined; chain: number of member seeds */
} hsaip_t;
|
10
|
+
|
11
|
+
#define _hsaip_lt(a, b) ((a).qbeg < (b).qbeg)
|
12
|
+
|
13
|
+
#include "ksort.h"
|
14
|
+
KSORT_INIT(hsaip, hsaip_t, _hsaip_lt)
|
15
|
+
|
16
|
+
/*
 * Group the n seeds in z[] into chains and write one record per chain to
 * chain[].
 *
 * z[] is sorted by query start first.  Each seed is then compared with the
 * existing chains, newest first, and joins the first chain whose target
 * start lies strictly before the seed's and whose query/target offsets
 * differ by at most opt->bw.  A seed that joins nothing starts a new chain.
 *
 * shift is added to every chain id stored in z[].chain and chain[].idx, so
 * that ids from two calls can share one flag array.
 *
 * Returns the number of chains written to chain[].
 */
static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain)
{
	int n_chains = 0, i;

	ks_introsort(hsaip, n, z);
	for (i = 0; i < n; ++i) {
		hsaip_t *seed = &z[i];
		int c = n_chains;

		while (--c >= 0) {
			hsaip_t *cand = &chain[c];
			int dq = seed->qbeg - cand->qbeg; /* non-negative: z[] is sorted by qbeg */
			int dt = seed->tbeg - cand->tbeg;
			if (dt <= 0 || dq - dt > opt->bw || dt - dq > opt->bw)
				continue;
			/* extend the chain to cover this seed */
			if (seed->qend > cand->qend) cand->qend = seed->qend;
			if (seed->tend > cand->tend) cand->tend = seed->tend;
			++cand->chain;
			seed->chain = shift + c;
			break;
		}
		if (c >= 0) continue; /* joined an existing chain */

		/* no compatible chain: the seed founds a new one */
		chain[n_chains] = *seed;
		chain[n_chains].chain = 1;
		seed->chain = shift + n_chains;
		chain[n_chains].idx = seed->chain;
		++n_chains;
	}
	return n_chains;
}
|
43
|
+
|
44
|
+
/*
 * Drop seed hits that belong to weak chains shadowed by a much stronger one.
 *
 * Seeds of b[0] and b[1] are chained separately.  Query coordinates of the
 * b[1] chains are mirrored with respect to len so that both sets share one
 * coordinate system.  A chain is flagged when an earlier (by query start),
 * still unflagged chain ends at or after it on the query and has more than
 * 2 * opt->t_seeds times as many seeds.  Hits of flagged chains get G = 0
 * and are then squeezed out of b[0] and b[1] in place.
 */
void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
{
	hsaip_t *seed[2], *chn[2];
	int n_seed[2], n_chn[2], strand, i, j, n_all, n_chains;
	char *drop;

	/* initialization: one hsaip_t per hit, both sets in a single buffer */
	n_seed[0] = b[0]->n; n_seed[1] = b[1]->n;
	n_all = n_seed[0] + n_seed[1];
	seed[0] = calloc(n_all, sizeof(hsaip_t));
	seed[1] = seed[0] + n_seed[0];
	chn[0] = calloc(n_all, sizeof(hsaip_t));
	for (strand = 0; strand < 2; ++strand) {
		for (i = 0; i < b[strand]->n; ++i) {
			bsw2hit_t *hit = b[strand]->hits + i;
			hsaip_t *s = seed[strand] + i;
			s->flag = strand; s->idx = i;
			s->tbeg = hit->k; s->tend = hit->k + hit->len;
			s->chain = -1;
			s->qbeg = hit->beg; s->qend = hit->end;
		}
	}

	/* chaining: the second set's ids continue after the first set's */
	n_chn[0] = chaining(opt, 0, n_seed[0], seed[0], chn[0]);
	chn[1] = chn[0] + n_chn[0];
	n_chn[1] = chaining(opt, n_chn[0], n_seed[1], seed[1], chn[1]);
	n_chains = n_chn[0] + n_chn[1];

	/* change query coordinate on the reverse strand */
	for (i = 0; i < n_chn[1]; ++i) {
		hsaip_t *c = chn[1] + i;
		int old_beg = c->qbeg;
		c->qbeg = len - c->qend; c->qend = len - old_beg;
	}

	/* filtering: drop[] is indexed by chain id, not by sorted position */
	drop = calloc(n_chains, 1);
	ks_introsort(hsaip, n_chains, chn[0]);
	for (i = 1; i < n_chains; ++i) {
		const hsaip_t *cur = chn[0] + i;
		for (j = 0; j < i; ++j) {
			const hsaip_t *prev = chn[0] + j;
			if (drop[prev->idx]) continue;
			if (prev->qend >= cur->qend && prev->chain > cur->chain * opt->t_seeds * 2) {
				drop[cur->idx] = 1;
				break;
			}
		}
	}
	for (i = 0; i < n_all; ++i) {
		const hsaip_t *s = seed[0] + i;
		if (drop[s->chain])
			b[s->flag]->hits[s->idx].G = 0;
	}
	free(drop);

	/* squeeze out filtered elements in b[2] */
	for (strand = 0; strand < 2; ++strand) {
		int kept = 0;
		for (j = 0; j < n_seed[strand]; ++j) {
			bsw2hit_t *hit = b[strand]->hits + j;
			if (!hit->G) continue;
			if (kept != j) b[strand]->hits[kept] = *hit;
			++kept;
		}
		b[strand]->n = kept;
	}

	free(seed[0]); free(chn[0]);
}
|
data/ext/bwtsw2_core.c
ADDED
@@ -0,0 +1,594 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <sys/resource.h>
|
5
|
+
#include <assert.h>
|
6
|
+
#include "bwt_lite.h"
|
7
|
+
#include "bwtsw2.h"
|
8
|
+
#include "bwt.h"
|
9
|
+
#include "kvec.h"
|
10
|
+
|
11
|
+
#include "khash.h"
|
12
|
+
KHASH_MAP_INIT_INT64(64, uint64_t)
|
13
|
+
|
14
|
+
/* Score used as "minus infinity" for unreachable I/D/G cells.  Parenthesized
 * so the negative literal expands safely inside any expression. */
#define MINUS_INF (-0x3fffffff)
|
15
|
+
#define MASK_LEVEL 0.90f
|
16
|
+
|
17
|
+
struct __mempool_t;
|
18
|
+
static void mp_destroy(struct __mempool_t*);
|
19
|
+
/* One DP cell: a query SA interval paired with the target node of its entry. */
typedef struct {
	uint32_t qk;      /* query SA interval, lower bound */
	uint32_t ql;      /* query SA interval, upper bound; 0 marks a deleted cell */
	int I;            /* DP scores: insertion state */
	int D;            /* deletion state */
	int G;            /* best overall score */
	uint32_t pj:2;    /* query base (0..3) on the edge from the parent cell */
	uint32_t qlen:30; /* query depth; +1 per descent in the query trie */
	int tlen;         /* target depth; +1 per descent in the target DAG */
	int ppos;         /* index of the parent cell in the same array, or -1 */
	int upos;         /* index of the matching cell in the child entry, or -1 */
	int cpos[4];      /* child index per base; negative values are sentinels
	                   * (-1 not created yet, -2 empty interval, others: removed) */
} bsw2cell_t;
|
27
|
+
|
28
|
+
#include "ksort.h"
|
29
|
+
KSORT_INIT_GENERIC(int)
|
30
|
+
#define __hitG_lt(a, b) ((a).G > (b).G)
|
31
|
+
KSORT_INIT(hitG, bsw2hit_t, __hitG_lt)
|
32
|
+
|
33
|
+
static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} };
|
34
|
+
|
35
|
+
typedef struct {
|
36
|
+
int n, max;
|
37
|
+
uint32_t tk, tl;
|
38
|
+
bsw2cell_t *array;
|
39
|
+
} bsw2entry_t, *bsw2entry_p;
|
40
|
+
|
41
|
+
/* --- BEGIN: Stack operations --- */
|
42
|
+
typedef struct {
|
43
|
+
int n_pending;
|
44
|
+
kvec_t(bsw2entry_p) stack0, pending;
|
45
|
+
struct __mempool_t *pool;
|
46
|
+
} bsw2stack_t;
|
47
|
+
|
48
|
+
/* True when nothing is ready on stack0 and nothing is parked in pending.
 * The argument is parenthesized so any pointer expression can be passed. */
#define stack_isempty(s) (kv_size((s)->stack0) == 0 && (s)->n_pending == 0)
|
49
|
+
/* Release the entry pool, both vectors and the stack object itself. */
static void stack_destroy(bsw2stack_t *s)
{
	mp_destroy(s->pool);
	kv_destroy(s->stack0);
	kv_destroy(s->pending);
	free(s);
}
|
50
|
+
/* Append entry e to the ready stack. */
inline static void stack_push0(bsw2stack_t *s, bsw2entry_p e)
{
	kv_push(bsw2entry_p, s->stack0, e);
}
|
51
|
+
/* Remove and return the top of the ready stack.  Asserts that the ready
 * stack is not empty while entries are still pending. */
inline static bsw2entry_p stack_pop(bsw2stack_t *s)
{
	assert(kv_size(s->stack0) != 0 || s->n_pending == 0);
	return kv_pop(s->stack0);
}
|
56
|
+
/* --- END: Stack operations --- */
|
57
|
+
|
58
|
+
/* --- BEGIN: memory pool --- */
|
59
|
+
typedef struct __mempool_t {
|
60
|
+
int cnt; // if cnt!=0, then there must be memory leak
|
61
|
+
kvec_t(bsw2entry_p) pool;
|
62
|
+
} mempool_t;
|
63
|
+
/* Hand out an entry: a recycled one if the pool has any, otherwise a fresh
 * zero-filled one. */
inline static bsw2entry_p mp_alloc(mempool_t *mp)
{
	bsw2entry_p e;

	++mp->cnt;
	if (kv_size(mp->pool) != 0)
		e = kv_pop(mp->pool);
	else
		e = (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t));
	return e;
}
|
69
|
+
/* Return entry e to the pool.  Its cell count is reset; the cell array and
 * its capacity are kept for reuse. */
inline static void mp_free(mempool_t *mp, bsw2entry_p e)
{
	e->n = 0;
	--mp->cnt;
	kv_push(bsw2entry_p, mp->pool, e);
}
|
74
|
+
static void mp_destroy(struct __mempool_t *mp)
|
75
|
+
{
|
76
|
+
int i;
|
77
|
+
for (i = 0; i != kv_size(mp->pool); ++i) {
|
78
|
+
free(kv_A(mp->pool, i)->array);
|
79
|
+
free(kv_A(mp->pool, i));
|
80
|
+
}
|
81
|
+
kv_destroy(mp->pool);
|
82
|
+
free(mp);
|
83
|
+
}
|
84
|
+
/* --- END: memory pool --- */
|
85
|
+
|
86
|
+
/* --- BEGIN: utilities --- */
|
87
|
+
/*
 * Walk every SA interval reachable from the root interval [0, seq_len] of
 * the target index and count how many times each one is reached.
 *
 * Returns a new hash keyed by (k << 32 | l) whose value is that count.  The
 * caller owns the hash and releases it with kh_destroy().
 */
static khash_t(64) *bsw2_connectivity(const bwtl_t *b)
{
	khash_t(64) *h = kh_init(64);
	kvec_t(uint64_t) todo;
	uint64_t node;

	kv_init(todo);
	kh_resize(64, h, b->seq_len * 4);
	node = b->seq_len; /* root: k = 0 in the high half, l = seq_len in the low half */
	kv_push(uint64_t, todo, node);
	while (kv_size(todo) != 0) {
		uint32_t lo, hi, occ_lo[4], occ_hi[4];
		int c;

		node = kv_pop(todo);
		lo = node >> 32; hi = (uint32_t)node;
		bwtl_2occ4(b, lo - 1, hi, occ_lo, occ_hi);
		for (c = 0; c < 4; ++c) {
			uint32_t ck = b->L2[c] + occ_lo[c] + 1;
			uint32_t cl = b->L2[c] + occ_hi[c];
			uint64_t child;
			khiter_t it;
			int absent;

			if (ck > cl) continue; /* empty interval: no such child */
			child = (uint64_t)ck << 32 | cl;
			it = kh_put(64, h, child, &absent);
			if (!absent) { /* reached before: just count it */
				++kh_value(h, it);
				continue;
			}
			kh_value(h, it) = 1;
			kv_push(uint64_t, todo, child);
		}
	}
	kv_destroy(todo);
	return h;
}
|
121
|
+
// pick up top T matches at a node
|
122
|
+
static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux)
|
123
|
+
{
|
124
|
+
int i, *a, n, x;
|
125
|
+
if (u->n <= T) return;
|
126
|
+
if (aux->max < u->n) {
|
127
|
+
aux->max = u->n;
|
128
|
+
aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t));
|
129
|
+
}
|
130
|
+
a = (int*)aux->array;
|
131
|
+
for (i = n = 0; i != u->n; ++i)
|
132
|
+
if (u->array[i].ql && u->array[i].G > 0)
|
133
|
+
a[n++] = -u->array[i].G;
|
134
|
+
if (n <= T) return;
|
135
|
+
x = -ks_ksmall(int, n, a, T);
|
136
|
+
n = 0;
|
137
|
+
for (i = 0; i < u->n; ++i) {
|
138
|
+
bsw2cell_t *p = u->array + i;
|
139
|
+
if (p->G == x) ++n;
|
140
|
+
if (p->G < x || (p->G == x && n >= T)) {
|
141
|
+
p->qk = p->ql = 0; p->G = 0;
|
142
|
+
if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -1;
|
143
|
+
}
|
144
|
+
}
|
145
|
+
}
|
146
|
+
// remove duplicated cells
|
147
|
+
static inline void remove_duplicate(bsw2entry_t *u, khash_t(64) *hash)
|
148
|
+
{
|
149
|
+
int i, ret, j;
|
150
|
+
khiter_t k;
|
151
|
+
uint64_t key;
|
152
|
+
kh_clear(64, hash);
|
153
|
+
for (i = 0; i != u->n; ++i) {
|
154
|
+
bsw2cell_t *p = u->array + i;
|
155
|
+
if (p->ql == 0) continue;
|
156
|
+
key = (uint64_t)p->qk << 32 | p->ql;
|
157
|
+
k = kh_put(64, hash, key, &ret);
|
158
|
+
j = -1;
|
159
|
+
if (ret == 0) {
|
160
|
+
if ((uint32_t)kh_value(hash, k) >= p->G) j = i;
|
161
|
+
else {
|
162
|
+
j = kh_value(hash, k)>>32;
|
163
|
+
kh_value(hash, k) = (uint64_t)i<<32 | p->G;
|
164
|
+
}
|
165
|
+
} else kh_value(hash, k) = (uint64_t)i<<32 | p->G;
|
166
|
+
if (j >= 0) {
|
167
|
+
p = u->array + j;
|
168
|
+
p->qk = p->ql = 0; p->G = 0;
|
169
|
+
if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3;
|
170
|
+
}
|
171
|
+
}
|
172
|
+
}
|
173
|
+
// merge two entries
|
174
|
+
static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2entry_t *v, bwtsw2_t *b)
|
175
|
+
{
|
176
|
+
int i;
|
177
|
+
if (u->n + v->n >= u->max) {
|
178
|
+
u->max = u->n + v->n;
|
179
|
+
u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t));
|
180
|
+
}
|
181
|
+
for (i = 0; i != v->n; ++i) {
|
182
|
+
bsw2cell_t *p = v->array + i;
|
183
|
+
if (p->ppos >= 0) p->ppos += u->n;
|
184
|
+
if (p->cpos[0] >= 0) p->cpos[0] += u->n;
|
185
|
+
if (p->cpos[1] >= 0) p->cpos[1] += u->n;
|
186
|
+
if (p->cpos[2] >= 0) p->cpos[2] += u->n;
|
187
|
+
if (p->cpos[3] >= 0) p->cpos[3] += u->n;
|
188
|
+
}
|
189
|
+
memcpy(u->array + u->n, v->array, v->n * sizeof(bsw2cell_t));
|
190
|
+
u->n += v->n;
|
191
|
+
}
|
192
|
+
|
193
|
+
/*
 * Return a pointer to the first unused cell slot of e, growing the array
 * when it is full (256 cells at first, then doubling).  e->n is NOT
 * incremented; the caller does that once it decides to keep the cell.
 */
static inline bsw2cell_t *push_array_p(bsw2entry_t *e)
{
	if (e->n == e->max) {
		if (e->max) e->max <<= 1;
		else e->max = 256;
		e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max);
	}
	return e->array + e->n;
}
|
201
|
+
|
202
|
+
/* CPU seconds (user + system) consumed between two getrusage() snapshots. */
static inline double time_elapse(const struct rusage *curr, const struct rusage *last)
{
	long sec, usec;

	sec = (curr->ru_utime.tv_sec - last->ru_utime.tv_sec)
		+ (curr->ru_stime.tv_sec - last->ru_stime.tv_sec);
	usec = (curr->ru_utime.tv_usec - last->ru_utime.tv_usec)
		+ (curr->ru_stime.tv_usec - last->ru_stime.tv_usec);
	return (double)sec + usec * 1e-6;
}
|
208
|
+
/* --- END: utilities --- */
|
209
|
+
|
210
|
+
/* --- BEGIN: processing partial hits --- */
|
211
|
+
/*
 * Record the cells of u that score at least thres in the per-position hit
 * table.
 *
 * For each target position (one per suffix in u's SA interval) the table
 * holds two slots, hits[2*pos] and hits[2*pos+1], for the best and
 * second-best hit starting there.  A better hit pushes the old best down to
 * the second slot.
 */
static void save_hits(const bwtl_t *bwt, int thres, bsw2hit_t *hits, bsw2entry_t *u)
{
	int i;
	uint32_t k;

	for (i = 0; i < u->n; ++i) {
		const bsw2cell_t *p = u->array + i;
		if (p->G < thres) continue;
		for (k = u->tk; k <= u->tl; ++k) {
			int beg = bwt->sa[k];
			int end = beg + p->tlen;
			bsw2hit_t *best = hits + beg * 2;
			bsw2hit_t *second = hits + beg * 2 + 1;
			bsw2hit_t *q;

			if (p->G > best->G) {
				*second = *best; /* demote the previous best */
				q = best;
			} else if (p->G > second->G) {
				q = second;
			} else {
				continue; /* not good enough for either slot */
			}
			q->k = p->qk; q->l = p->ql; q->len = p->qlen; q->G = p->G;
			q->beg = beg; q->end = end; q->G2 = q->k == q->l? 0 : q->G;
			q->flag = q->n_seeds = 0;
		}
	}
}
|
234
|
+
/* "narrow hits" are node-to-node hits that have a high score and
|
235
|
+
* are not so repetitive (|SA interval|<=IS). */
|
236
|
+
static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, int t, int IS)
|
237
|
+
{
|
238
|
+
int i;
|
239
|
+
for (i = 0; i < u->n; ++i) {
|
240
|
+
bsw2hit_t *q;
|
241
|
+
bsw2cell_t *p = u->array + i;
|
242
|
+
if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit
|
243
|
+
if (b1->max == b1->n) {
|
244
|
+
b1->max = b1->max? b1->max<<1 : 4;
|
245
|
+
b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t));
|
246
|
+
}
|
247
|
+
q = &b1->hits[b1->n++];
|
248
|
+
q->k = p->qk; q->l = p->ql;
|
249
|
+
q->len = p->qlen;
|
250
|
+
q->G = p->G; q->G2 = 0;
|
251
|
+
q->beg = bwtl->sa[u->tk]; q->end = q->beg + p->tlen;
|
252
|
+
q->flag = 0;
|
253
|
+
// delete p
|
254
|
+
p->qk = p->ql = 0; p->G = 0;
|
255
|
+
if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3;
|
256
|
+
}
|
257
|
+
}
|
258
|
+
}
|
259
|
+
/* after this, "narrow SA hits" will be expanded and the coordinates
|
260
|
+
* will be obtained and stored in b->hits[*].k. */
|
261
|
+
int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS)
|
262
|
+
{
|
263
|
+
int i, j, n;
|
264
|
+
if (b->n == 0) return 0;
|
265
|
+
if (bwt) { // convert to chromosomal coordinates if suitable
|
266
|
+
int old_n = b->n;
|
267
|
+
bsw2hit_t *old_hits = b->hits;
|
268
|
+
for (i = n = 0; i < b->n; ++i) {
|
269
|
+
bsw2hit_t *p = old_hits + i;
|
270
|
+
if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1;
|
271
|
+
else if (p->G > 0) ++n;
|
272
|
+
}
|
273
|
+
b->n = b->max = n;
|
274
|
+
b->hits = calloc(b->max, sizeof(bsw2hit_t));
|
275
|
+
for (i = j = 0; i < old_n; ++i) {
|
276
|
+
bsw2hit_t *p = old_hits + i;
|
277
|
+
if (p->l - p->k + 1 <= IS) {
|
278
|
+
bwtint_t k;
|
279
|
+
for (k = p->k; k <= p->l; ++k) {
|
280
|
+
b->hits[j] = *p;
|
281
|
+
b->hits[j].k = bwt_sa(bwt, k);
|
282
|
+
b->hits[j].l = 0;
|
283
|
+
++j;
|
284
|
+
}
|
285
|
+
} else if (p->G > 0) {
|
286
|
+
b->hits[j] = *p;
|
287
|
+
b->hits[j].k = bwt_sa(bwt, p->k);
|
288
|
+
b->hits[j].l = 0;
|
289
|
+
b->hits[j].flag |= 1;
|
290
|
+
++j;
|
291
|
+
}
|
292
|
+
}
|
293
|
+
free(old_hits);
|
294
|
+
}
|
295
|
+
ks_introsort(hitG, b->n, b->hits);
|
296
|
+
for (i = 1; i < b->n; ++i) {
|
297
|
+
bsw2hit_t *p = b->hits + i;
|
298
|
+
if (p->G == 0) break;
|
299
|
+
for (j = 0; j < i; ++j) {
|
300
|
+
bsw2hit_t *q = b->hits + j;
|
301
|
+
int compatible = 1;
|
302
|
+
if (q->G == 0) continue;
|
303
|
+
if (p->l == 0 && q->l == 0) {
|
304
|
+
int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg);
|
305
|
+
if (qol < 0) qol = 0;
|
306
|
+
if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) {
|
307
|
+
int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len)
|
308
|
+
- (int64_t)(p->k > q->k? p->k : q->k);
|
309
|
+
if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL)
|
310
|
+
compatible = 0;
|
311
|
+
}
|
312
|
+
}
|
313
|
+
if (!compatible) {
|
314
|
+
p->G = 0;
|
315
|
+
break;
|
316
|
+
}
|
317
|
+
}
|
318
|
+
}
|
319
|
+
n = i;
|
320
|
+
for (i = j = 0; i < n; ++i) {
|
321
|
+
if (b->hits[i].G == 0) continue;
|
322
|
+
if (i != j) b->hits[j++] = b->hits[i];
|
323
|
+
else ++j;
|
324
|
+
}
|
325
|
+
b->n = j;
|
326
|
+
return b->n;
|
327
|
+
}
|
328
|
+
|
329
|
+
/*
 * Keep only hits that do not substantially overlap a better hit.
 *
 * Hits are sorted by decreasing G, and one of the hits tied for the top
 * score is moved to the front at random (drand48).  A later hit p is
 * compatible with a surviving better hit q when their beg/end overlap is
 * below mask_level of the shorter one, or when both have l == 0, overlap on
 * the k/len axis, and neither beg/end range is fully covered by the
 * overlap.  A hit that is incompatible with any better survivor is dropped,
 * and that survivor's G2 is raised to the dropped hit's G if larger.
 *
 * Returns the number of surviving hits, which is also stored in b->n.
 */
int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level)
{
	int i, j, n, top_score;

	if (b->n == 0) return 0;
	ks_introsort(hitG, b->n, b->hits);

	/* choose a random one among the hits tied for the best score */
	top_score = b->hits[0].G;
	for (i = 1; i < b->n; ++i)
		if (b->hits[i].G != top_score) break;
	j = (int)(i * drand48());
	if (j) {
		bsw2hit_t swap = b->hits[0];
		b->hits[0] = b->hits[j];
		b->hits[j] = swap;
	}

	for (i = 1; i < b->n; ++i) {
		bsw2hit_t *p = b->hits + i;
		int all_compatible = 1;

		if (p->G == 0) break;
		for (j = 0; j < i; ++j) {
			bsw2hit_t *q = b->hits + j;
			int64_t tol = 0;
			int qol;
			float fol;

			if (q->G == 0) continue;
			qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg);
			if (qol < 0) qol = 0;
			if (p->l == 0 && q->l == 0) {
				tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len)
					- (p->k > q->k? p->k : q->k);
				if (tol < 0) tol = 0;
			}
			fol = (float)qol / (p->end - p->beg < q->end - q->beg? p->end - p->beg : q->end - q->beg);
			if (fol < mask_level || (tol > 0 && qol < p->end - p->beg && qol < q->end - q->beg))
				continue; /* compatible with q */
			/* incompatible: q remembers the best score it shadowed */
			if (q->G2 < p->G) q->G2 = p->G;
			all_compatible = 0;
		}
		if (!all_compatible) p->G = 0;
	}

	/* compact the survivors among the first i hits */
	n = i;
	for (i = j = 0; i < n; ++i) {
		if (b->hits[i].G == 0) continue;
		if (i != j) b->hits[j] = b->hits[i];
		++j;
	}
	b->n = j;
	return j;
}
|
379
|
+
/* --- END: processing partial hits --- */
|
380
|
+
|
381
|
+
/* --- BEGIN: global mem pool --- */
|
382
|
+
bsw2global_t *bsw2_global_init()
|
383
|
+
{
|
384
|
+
bsw2global_t *pool;
|
385
|
+
bsw2stack_t *stack;
|
386
|
+
pool = calloc(1, sizeof(bsw2global_t));
|
387
|
+
stack = calloc(1, sizeof(bsw2stack_t));
|
388
|
+
stack->pool = (mempool_t*)calloc(1, sizeof(mempool_t));
|
389
|
+
pool->stack = (void*)stack;
|
390
|
+
return pool;
|
391
|
+
}
|
392
|
+
|
393
|
+
/*
 * Release everything owned by pool: the traversal stack (with its entry
 * pool), the aln_mem buffer and the object itself.  Like free(), a NULL
 * argument is accepted and ignored.
 */
void bsw2_global_destroy(bsw2global_t *pool)
{
	if (pool == NULL) return;
	stack_destroy((bsw2stack_t*)pool->stack);
	free(pool->aln_mem);
	free(pool);
}
|
399
|
+
/* --- END: global mem pool --- */
|
400
|
+
|
401
|
+
/*
 * Affine-gap DP update of one cell.
 *
 * c[0] is the cell being filled; c[1], c[2] and c[3] are its insertion,
 * deletion and match predecessors, any of which may be NULL.  Gap opening
 * uses o->q (o->qr when opening from G), gap extension uses o->r.  Writes
 * I, D and G of c[0] and returns the new G.
 */
static inline int fill_cell(const bsw2opt_t *o, int match_score, bsw2cell_t *c[4])
{
	bsw2cell_t *cur = c[0];
	const bsw2cell_t *from_ins = c[1], *from_del = c[2], *from_match = c[3];
	int best = MINUS_INF;

	if (from_match) best = from_match->G + match_score;

	if (from_ins) {
		if (from_ins->I > from_ins->G - o->q) cur->I = from_ins->I - o->r; /* extend */
		else cur->I = from_ins->G - o->qr;                                 /* open */
		if (cur->I > best) best = cur->I;
	} else {
		cur->I = MINUS_INF;
	}

	if (from_del) {
		if (from_del->D > from_del->G - o->q) cur->D = from_del->D - o->r;
		else cur->D = from_del->G - o->qr;
		if (cur->D > best) best = cur->D;
	} else {
		cur->D = MINUS_INF;
	}

	cur->G = best;
	return best;
}
|
414
|
+
|
415
|
+
/*
 * Seed the traversal: push one entry covering the whole target interval
 * [0, target->seq_len], holding one zero-score cell that covers the whole
 * query interval [0, query->seq_len].
 */
static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s)
{
	bsw2entry_t *root = mp_alloc(s->pool);
	bsw2cell_t *cell;

	root->tk = 0;
	root->tl = target->seq_len;

	cell = push_array_p(root);
	*cell = g_default_cell;
	cell->G = 0;
	cell->qk = 0;
	cell->ql = query->seq_len;
	root->n++;

	stack_push0(s, root);
}
|
428
|
+
/* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) */
|
429
|
+
bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool)
|
430
|
+
{
|
431
|
+
bsw2stack_t *stack = (bsw2stack_t*)pool->stack;
|
432
|
+
bwtsw2_t *b, *b1, **b_ret;
|
433
|
+
int i, j, score_mat[16], *heap, heap_size, n_tot = 0;
|
434
|
+
struct rusage curr, last;
|
435
|
+
khash_t(64) *rhash, *chash;
|
436
|
+
|
437
|
+
// initialize connectivity hash (chash)
|
438
|
+
chash = bsw2_connectivity(target);
|
439
|
+
// calculate score matrix
|
440
|
+
for (i = 0; i != 4; ++i)
|
441
|
+
for (j = 0; j != 4; ++j)
|
442
|
+
score_mat[i<<2|j] = (i == j)? opt->a : -opt->b;
|
443
|
+
// initialize other variables
|
444
|
+
rhash = kh_init(64);
|
445
|
+
init_bwtsw2(target, query, stack);
|
446
|
+
heap_size = opt->z;
|
447
|
+
heap = calloc(heap_size, sizeof(int));
|
448
|
+
// initialize the return struct
|
449
|
+
b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
|
450
|
+
b->n = b->max = target->seq_len * 2;
|
451
|
+
b->hits = calloc(b->max, sizeof(bsw2hit_t));
|
452
|
+
b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
|
453
|
+
b_ret = calloc(2, sizeof(void*));
|
454
|
+
b_ret[0] = b; b_ret[1] = b1;
|
455
|
+
// initialize timer
|
456
|
+
getrusage(0, &last);
|
457
|
+
// the main loop: traversal of the DAG
|
458
|
+
while (!stack_isempty(stack)) {
|
459
|
+
int old_n, tj;
|
460
|
+
bsw2entry_t *v;
|
461
|
+
uint32_t k, l, tcntk[4], tcntl[4];
|
462
|
+
|
463
|
+
v = stack_pop(stack); old_n = v->n;
|
464
|
+
n_tot += v->n;
|
465
|
+
|
466
|
+
for (i = 0; i < v->n; ++i) { // test max depth and band width
|
467
|
+
bsw2cell_t *p = v->array + i;
|
468
|
+
if (p->ql == 0) continue;
|
469
|
+
if (p->tlen - (int)p->qlen > opt->bw || (int)p->qlen - p->tlen > opt->bw) {
|
470
|
+
p->qk = p->ql = 0;
|
471
|
+
if (p->ppos >= 0) v->array[p->ppos].cpos[p->pj] = -5;
|
472
|
+
}
|
473
|
+
}
|
474
|
+
|
475
|
+
// get Occ for the DAG
|
476
|
+
bwtl_2occ4(target, v->tk - 1, v->tl, tcntk, tcntl);
|
477
|
+
for (tj = 0; tj != 4; ++tj) { // descend to the children
|
478
|
+
uint32_t qcntk[4], qcntl[4];
|
479
|
+
int qj, *curr_score_mat = score_mat + tj * 4;
|
480
|
+
khiter_t iter;
|
481
|
+
bsw2entry_t *u;
|
482
|
+
|
483
|
+
k = target->L2[tj] + tcntk[tj] + 1;
|
484
|
+
l = target->L2[tj] + tcntl[tj];
|
485
|
+
if (k > l) continue;
|
486
|
+
// update counter
|
487
|
+
iter = kh_get(64, chash, (uint64_t)k<<32 | l);
|
488
|
+
--kh_value(chash, iter);
|
489
|
+
// initialization
|
490
|
+
u = mp_alloc(stack->pool);
|
491
|
+
u->tk = k; u->tl = l;
|
492
|
+
memset(heap, 0, sizeof(int) * opt->z);
|
493
|
+
// loop through all the nodes in v
|
494
|
+
for (i = 0; i < v->n; ++i) {
|
495
|
+
bsw2cell_t *p = v->array + i, *x, *c[4]; // c[0]=>current, c[1]=>I, c[2]=>D, c[3]=>G
|
496
|
+
int is_added = 0;
|
497
|
+
if (p->ql == 0) continue; // deleted node
|
498
|
+
c[0] = x = push_array_p(u);
|
499
|
+
x->G = MINUS_INF;
|
500
|
+
p->upos = x->upos = -1;
|
501
|
+
if (p->ppos >= 0) { // parent has been visited
|
502
|
+
c[1] = (v->array[p->ppos].upos >= 0)? u->array + v->array[p->ppos].upos : 0;
|
503
|
+
c[3] = v->array + p->ppos; c[2] = p;
|
504
|
+
if (fill_cell(opt, curr_score_mat[p->pj], c) > 0) { // then update topology at p and x
|
505
|
+
x->ppos = v->array[p->ppos].upos; // the parent pos in u
|
506
|
+
p->upos = u->n++; // the current pos in u
|
507
|
+
if (x->ppos >= 0) u->array[x->ppos].cpos[p->pj] = p->upos; // the child pos of its parent in u
|
508
|
+
is_added = 1;
|
509
|
+
}
|
510
|
+
} else {
|
511
|
+
x->D = p->D > p->G - opt->q? p->D - opt->r : p->G - opt->qr;
|
512
|
+
if (x->D > 0) {
|
513
|
+
x->G = x->D;
|
514
|
+
x->I = MINUS_INF; x->ppos = -1;
|
515
|
+
p->upos = u->n++;
|
516
|
+
is_added = 1;
|
517
|
+
}
|
518
|
+
}
|
519
|
+
if (is_added) { // x has been added to u->array. fill the remaining variables
|
520
|
+
x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
|
521
|
+
x->pj = p->pj; x->qk = p->qk; x->ql = p->ql; x->qlen = p->qlen; x->tlen = p->tlen + 1;
|
522
|
+
if (x->G > -heap[0]) {
|
523
|
+
heap[0] = -x->G;
|
524
|
+
ks_heapadjust(int, 0, heap_size, heap);
|
525
|
+
}
|
526
|
+
}
|
527
|
+
if ((x->G > opt->qr && x->G >= -heap[0]) || i < old_n) { // good node in u, or in v
|
528
|
+
if (p->cpos[0] == -1 || p->cpos[1] == -1 || p->cpos[2] == -1 || p->cpos[3] == -1) {
|
529
|
+
bwt_2occ4(query, p->qk - 1, p->ql, qcntk, qcntl);
|
530
|
+
for (qj = 0; qj != 4; ++qj) { // descend to the prefix trie
|
531
|
+
if (p->cpos[qj] != -1) continue; // this node will be visited later
|
532
|
+
k = query->L2[qj] + qcntk[qj] + 1;
|
533
|
+
l = query->L2[qj] + qcntl[qj];
|
534
|
+
if (k > l) { p->cpos[qj] = -2; continue; }
|
535
|
+
x = push_array_p(v);
|
536
|
+
p = v->array + i; // p may not point to the correct position after realloc
|
537
|
+
x->G = x->I = x->D = MINUS_INF;
|
538
|
+
x->qk = k; x->ql = l; x->pj = qj; x->qlen = p->qlen + 1; x->ppos = i; x->tlen = p->tlen;
|
539
|
+
x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
|
540
|
+
p->cpos[qj] = v->n++;
|
541
|
+
} // ~for(qj)
|
542
|
+
} // ~if(p->cpos[])
|
543
|
+
} // ~if
|
544
|
+
} // ~for(i)
|
545
|
+
if (u->n) save_hits(target, opt->t, b->hits, u);
|
546
|
+
{ // push u to the stack (or to the pending array)
|
547
|
+
uint32_t cnt, pos;
|
548
|
+
cnt = (uint32_t)kh_value(chash, iter);
|
549
|
+
pos = kh_value(chash, iter)>>32;
|
550
|
+
if (pos) { // something in the pending array, then merge
|
551
|
+
bsw2entry_t *w = kv_A(stack->pending, pos-1);
|
552
|
+
if (u->n) {
|
553
|
+
if (w->n < u->n) { // swap
|
554
|
+
w = u; u = kv_A(stack->pending, pos-1); kv_A(stack->pending, pos-1) = w;
|
555
|
+
}
|
556
|
+
merge_entry(opt, w, u, b);
|
557
|
+
}
|
558
|
+
if (cnt == 0) { // move from pending to stack0
|
559
|
+
remove_duplicate(w, rhash);
|
560
|
+
save_narrow_hits(target, w, b1, opt->t, opt->is);
|
561
|
+
cut_tail(w, opt->z, u);
|
562
|
+
stack_push0(stack, w);
|
563
|
+
kv_A(stack->pending, pos-1) = 0;
|
564
|
+
--stack->n_pending;
|
565
|
+
}
|
566
|
+
mp_free(stack->pool, u);
|
567
|
+
} else if (cnt) { // the first time
|
568
|
+
if (u->n) { // push to the pending queue
|
569
|
+
++stack->n_pending;
|
570
|
+
kv_push(bsw2entry_p, stack->pending, u);
|
571
|
+
kh_value(chash, iter) = (uint64_t)kv_size(stack->pending)<<32 | cnt;
|
572
|
+
} else mp_free(stack->pool, u);
|
573
|
+
} else { // cnt == 0, then push to the stack
|
574
|
+
bsw2entry_t *w = mp_alloc(stack->pool);
|
575
|
+
save_narrow_hits(target, u, b1, opt->t, opt->is);
|
576
|
+
cut_tail(u, opt->z, w);
|
577
|
+
mp_free(stack->pool, w);
|
578
|
+
stack_push0(stack, u);
|
579
|
+
}
|
580
|
+
}
|
581
|
+
} // ~for(tj)
|
582
|
+
mp_free(stack->pool, v);
|
583
|
+
} // while(top)
|
584
|
+
getrusage(0, &curr);
|
585
|
+
bsw2_resolve_duphits(query, b, opt->is);
|
586
|
+
bsw2_resolve_duphits(query, b1, opt->is);
|
587
|
+
//fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot);
|
588
|
+
// free
|
589
|
+
free(heap);
|
590
|
+
kh_destroy(64, rhash);
|
591
|
+
kh_destroy(64, chash);
|
592
|
+
stack->pending.n = stack->stack0.n = 0;
|
593
|
+
return b_ret;
|
594
|
+
}
|