bio-bwa 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
@@ -0,0 +1,107 @@
1
+ #include <stdio.h>
2
+ #include "bwtsw2.h"
3
+
4
/*
 * A hit (or a chain of hits) expressed as a query range and a target range.
 * Used both for the individual hits and for the chains built out of them.
 */
typedef struct {
	uint32_t tbeg;    /* target start */
	uint32_t tend;    /* target end */
	int qbeg;         /* query start; this is the sort key */
	int qend;         /* query end */
	uint32_t flag:1;  /* which of the two hit sets the hit came from */
	uint32_t idx:31;  /* hit index within its set; for a chain, the chain id */
	int chain; // also reuse as a counter
} hsaip_t;

/* Ordering used by the sort instantiated for hsaip_t: ascending query start. */
#define _hsaip_lt(a, b) ((a).qbeg < (b).qbeg)
12
+
13
+ #include "ksort.h"
14
+ KSORT_INIT(hsaip, hsaip_t, _hsaip_lt)
15
+
16
+ static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain)
17
+ {
18
+ int j, k, m = 0;
19
+ ks_introsort(hsaip, n, z);
20
+ for (j = 0; j < n; ++j) {
21
+ hsaip_t *p = z + j;
22
+ for (k = m - 1; k >= 0; --k) {
23
+ hsaip_t *q = chain + k;
24
+ int x = p->qbeg - q->qbeg; // always positive
25
+ int y = p->tbeg - q->tbeg;
26
+ if (y > 0 && x - y <= opt->bw && y - x <= opt->bw) {
27
+ if (p->qend > q->qend) q->qend = p->qend;
28
+ if (p->tend > q->tend) q->tend = p->tend;
29
+ ++q->chain;
30
+ p->chain = shift + k;
31
+ break;
32
+ }
33
+ }
34
+ if (k < 0) {
35
+ chain[m] = *p;
36
+ chain[m].chain = 1;
37
+ chain[m].idx = p->chain = shift + m;
38
+ ++m;
39
+ }
40
+ }
41
+ return m;
42
+ }
43
+
44
+ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
45
+ {
46
+ hsaip_t *z[2], *chain[2];
47
+ int i, j, k, n[2], m[2];
48
+ char *flag;
49
+ // initialization
50
+ n[0] = b[0]->n; n[1] = b[1]->n;
51
+ z[0] = calloc(n[0] + n[1], sizeof(hsaip_t));
52
+ z[1] = z[0] + n[0];
53
+ chain[0] = calloc(n[0] + n[1], sizeof(hsaip_t));
54
+ for (k = j = 0; k < 2; ++k) {
55
+ for (i = 0; i < b[k]->n; ++i) {
56
+ bsw2hit_t *p = b[k]->hits + i;
57
+ hsaip_t *q = z[k] + i;
58
+ q->flag = k; q->idx = i;
59
+ q->tbeg = p->k; q->tend = p->k + p->len;
60
+ q->chain = -1;
61
+ q->qbeg = p->beg; q->qend = p->end;
62
+ }
63
+ }
64
+ // chaining
65
+ m[0] = chaining(opt, 0, n[0], z[0], chain[0]);
66
+ chain[1] = chain[0] + m[0];
67
+ m[1] = chaining(opt, m[0], n[1], z[1], chain[1]);
68
+ // change query coordinate on the reverse strand
69
+ for (k = 0; k < m[1]; ++k) {
70
+ hsaip_t *p = chain[1] + k;
71
+ int tmp = p->qbeg;
72
+ p->qbeg = len - p->qend; p->qend = len - tmp;
73
+ }
74
+ // filtering
75
+ flag = calloc(m[0] + m[1], 1);
76
+ ks_introsort(hsaip, m[0] + m[1], chain[0]);
77
+ for (k = 1; k < m[0] + m[1]; ++k) {
78
+ hsaip_t *p = chain[0] + k;
79
+ for (j = 0; j < k; ++j) {
80
+ hsaip_t *q = chain[0] + j;
81
+ if (flag[q->idx]) continue;
82
+ if (q->qend >= p->qend && q->chain > p->chain * opt->t_seeds * 2) {
83
+ flag[p->idx] = 1;
84
+ break;
85
+ }
86
+ }
87
+ }
88
+ for (k = 0; k < n[0] + n[1]; ++k) {
89
+ hsaip_t *p = z[0] + k;
90
+ if (flag[p->chain])
91
+ b[p->flag]->hits[p->idx].G = 0;
92
+ }
93
+ free(flag);
94
+ // squeeze out filtered elements in b[2]
95
+ for (k = 0; k < 2; ++k) {
96
+ for (j = i = 0; j < n[k]; ++j) {
97
+ bsw2hit_t *p = b[k]->hits + j;
98
+ if (p->G) {
99
+ if (i != j) b[k]->hits[i++] = *p;
100
+ else ++i;
101
+ }
102
+ }
103
+ b[k]->n = i;
104
+ }
105
+ // free
106
+ free(z[0]); free(chain[0]);
107
+ }
data/ext/bwtsw2_core.c ADDED
@@ -0,0 +1,594 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include <stdio.h>
4
+ #include <sys/resource.h>
5
+ #include <assert.h>
6
+ #include "bwt_lite.h"
7
+ #include "bwtsw2.h"
8
+ #include "bwt.h"
9
+ #include "kvec.h"
10
+
11
+ #include "khash.h"
12
+ KHASH_MAP_INIT_INT64(64, uint64_t)
13
+
14
/* Score used as "minus infinity" for cells that have no valid value yet. */
#define MINUS_INF (-0x3fffffff)
/* Overlap fraction above which two hits are treated as duplicates. */
#define MASK_LEVEL 0.90f

struct __mempool_t;
static void mp_destroy(struct __mempool_t*);

/*
 * One dynamic-programming cell.  Cells of an entry live in one array and
 * refer to each other by array index.
 */
typedef struct {
	uint32_t qk;      /* query SA interval, lower bound */
	uint32_t ql;      /* query SA interval, upper bound; 0 marks a deleted cell */
	int I;            /* scores: I, D and the overall score G */
	int D;
	int G;
	uint32_t pj:2;    /* query base (0..3) on the edge from the parent cell */
	uint32_t qlen:30; /* query length covered so far */
	int tlen;         /* target length covered so far */
	int ppos;         /* index of the parent cell in the same array, or -1 */
	int upos;         /* index of the matching cell in the child entry, or -1 */
	int cpos[4];      /* index of the child cell per query base; negative = none */
} bsw2cell_t;
27
+
28
+ #include "ksort.h"
29
+ KSORT_INIT_GENERIC(int)
30
+ #define __hitG_lt(a, b) ((a).G > (b).G)
31
+ KSORT_INIT(hitG, bsw2hit_t, __hitG_lt)
32
+
33
+ static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} };
34
+
35
+ typedef struct {
36
+ int n, max;
37
+ uint32_t tk, tl;
38
+ bsw2cell_t *array;
39
+ } bsw2entry_t, *bsw2entry_p;
40
+
41
+ /* --- BEGIN: Stack operations --- */
42
+ typedef struct {
43
+ int n_pending;
44
+ kvec_t(bsw2entry_p) stack0, pending;
45
+ struct __mempool_t *pool;
46
+ } bsw2stack_t;
47
+
48
+ #define stack_isempty(s) (kv_size(s->stack0) == 0 && s->n_pending == 0)
49
+ static void stack_destroy(bsw2stack_t *s) { mp_destroy(s->pool); kv_destroy(s->stack0); kv_destroy(s->pending); free(s); }
50
+ inline static void stack_push0(bsw2stack_t *s, bsw2entry_p e) { kv_push(bsw2entry_p, s->stack0, e); }
51
+ inline static bsw2entry_p stack_pop(bsw2stack_t *s)
52
+ {
53
+ assert(!(kv_size(s->stack0) == 0 && s->n_pending != 0));
54
+ return kv_pop(s->stack0);
55
+ }
56
+ /* --- END: Stack operations --- */
57
+
58
+ /* --- BEGIN: memory pool --- */
59
+ typedef struct __mempool_t {
60
+ int cnt; // if cnt!=0, then there must be memory leak
61
+ kvec_t(bsw2entry_p) pool;
62
+ } mempool_t;
63
+ inline static bsw2entry_p mp_alloc(mempool_t *mp)
64
+ {
65
+ ++mp->cnt;
66
+ if (kv_size(mp->pool) == 0) return (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t));
67
+ else return kv_pop(mp->pool);
68
+ }
69
+ inline static void mp_free(mempool_t *mp, bsw2entry_p e)
70
+ {
71
+ --mp->cnt; e->n = 0;
72
+ kv_push(bsw2entry_p, mp->pool, e);
73
+ }
74
+ static void mp_destroy(struct __mempool_t *mp)
75
+ {
76
+ int i;
77
+ for (i = 0; i != kv_size(mp->pool); ++i) {
78
+ free(kv_A(mp->pool, i)->array);
79
+ free(kv_A(mp->pool, i));
80
+ }
81
+ kv_destroy(mp->pool);
82
+ free(mp);
83
+ }
84
+ /* --- END: memory pool --- */
85
+
86
+ /* --- BEGIN: utilities --- */
87
+ static khash_t(64) *bsw2_connectivity(const bwtl_t *b)
88
+ {
89
+ khash_t(64) *h;
90
+ uint32_t k, l, cntk[4], cntl[4];
91
+ uint64_t x;
92
+ khiter_t iter;
93
+ int j, ret;
94
+ kvec_t(uint64_t) stack;
95
+
96
+ kv_init(stack);
97
+ h = kh_init(64);
98
+ kh_resize(64, h, b->seq_len * 4);
99
+ x = b->seq_len;
100
+ kv_push(uint64_t, stack, x);
101
+ while (kv_size(stack)) {
102
+ x = kv_pop(stack);
103
+ k = x>>32; l = (uint32_t)x;
104
+ bwtl_2occ4(b, k-1, l, cntk, cntl);
105
+ for (j = 0; j != 4; ++j) {
106
+ k = b->L2[j] + cntk[j] + 1;
107
+ l = b->L2[j] + cntl[j];
108
+ if (k > l) continue;
109
+ x = (uint64_t)k << 32 | l;
110
+ iter = kh_put(64, h, x, &ret);
111
+ if (ret) { // if not present
112
+ kh_value(h, iter) = 1;
113
+ kv_push(uint64_t, stack, x);
114
+ } else ++kh_value(h, iter);
115
+ }
116
+ }
117
+ kv_destroy(stack);
118
+ //fprintf(stderr, "[bsw2_connectivity] %u nodes in the DAG\n", kh_size(h));
119
+ return h;
120
+ }
121
+ // pick up top T matches at a node
122
+ static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux)
123
+ {
124
+ int i, *a, n, x;
125
+ if (u->n <= T) return;
126
+ if (aux->max < u->n) {
127
+ aux->max = u->n;
128
+ aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t));
129
+ }
130
+ a = (int*)aux->array;
131
+ for (i = n = 0; i != u->n; ++i)
132
+ if (u->array[i].ql && u->array[i].G > 0)
133
+ a[n++] = -u->array[i].G;
134
+ if (n <= T) return;
135
+ x = -ks_ksmall(int, n, a, T);
136
+ n = 0;
137
+ for (i = 0; i < u->n; ++i) {
138
+ bsw2cell_t *p = u->array + i;
139
+ if (p->G == x) ++n;
140
+ if (p->G < x || (p->G == x && n >= T)) {
141
+ p->qk = p->ql = 0; p->G = 0;
142
+ if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -1;
143
+ }
144
+ }
145
+ }
146
+ // remove duplicated cells
147
+ static inline void remove_duplicate(bsw2entry_t *u, khash_t(64) *hash)
148
+ {
149
+ int i, ret, j;
150
+ khiter_t k;
151
+ uint64_t key;
152
+ kh_clear(64, hash);
153
+ for (i = 0; i != u->n; ++i) {
154
+ bsw2cell_t *p = u->array + i;
155
+ if (p->ql == 0) continue;
156
+ key = (uint64_t)p->qk << 32 | p->ql;
157
+ k = kh_put(64, hash, key, &ret);
158
+ j = -1;
159
+ if (ret == 0) {
160
+ if ((uint32_t)kh_value(hash, k) >= p->G) j = i;
161
+ else {
162
+ j = kh_value(hash, k)>>32;
163
+ kh_value(hash, k) = (uint64_t)i<<32 | p->G;
164
+ }
165
+ } else kh_value(hash, k) = (uint64_t)i<<32 | p->G;
166
+ if (j >= 0) {
167
+ p = u->array + j;
168
+ p->qk = p->ql = 0; p->G = 0;
169
+ if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3;
170
+ }
171
+ }
172
+ }
173
+ // merge two entries
174
+ static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2entry_t *v, bwtsw2_t *b)
175
+ {
176
+ int i;
177
+ if (u->n + v->n >= u->max) {
178
+ u->max = u->n + v->n;
179
+ u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t));
180
+ }
181
+ for (i = 0; i != v->n; ++i) {
182
+ bsw2cell_t *p = v->array + i;
183
+ if (p->ppos >= 0) p->ppos += u->n;
184
+ if (p->cpos[0] >= 0) p->cpos[0] += u->n;
185
+ if (p->cpos[1] >= 0) p->cpos[1] += u->n;
186
+ if (p->cpos[2] >= 0) p->cpos[2] += u->n;
187
+ if (p->cpos[3] >= 0) p->cpos[3] += u->n;
188
+ }
189
+ memcpy(u->array + u->n, v->array, v->n * sizeof(bsw2cell_t));
190
+ u->n += v->n;
191
+ }
192
+
193
+ static inline bsw2cell_t *push_array_p(bsw2entry_t *e)
194
+ {
195
+ if (e->n == e->max) {
196
+ e->max = e->max? e->max<<1 : 256;
197
+ e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max);
198
+ }
199
+ return e->array + e->n;
200
+ }
201
+
202
/* CPU time (user + system) in seconds between two getrusage() snapshots. */
static inline double time_elapse(const struct rusage *curr, const struct rusage *last)
{
	long user_sec = curr->ru_utime.tv_sec - last->ru_utime.tv_sec;
	long sys_sec = curr->ru_stime.tv_sec - last->ru_stime.tv_sec;
	long user_usec = curr->ru_utime.tv_usec - last->ru_utime.tv_usec;
	long sys_usec = curr->ru_stime.tv_usec - last->ru_stime.tv_usec;
	long sec = user_sec + sys_sec;
	long usec = user_usec + sys_usec;
	return (double)sec + usec * 1e-6;
}
208
+ /* --- END: utilities --- */
209
+
210
+ /* --- BEGIN: processing partial hits --- */
211
+ static void save_hits(const bwtl_t *bwt, int thres, bsw2hit_t *hits, bsw2entry_t *u)
212
+ {
213
+ int i;
214
+ uint32_t k;
215
+ for (i = 0; i < u->n; ++i) {
216
+ bsw2cell_t *p = u->array + i;
217
+ if (p->G < thres) continue;
218
+ for (k = u->tk; k <= u->tl; ++k) {
219
+ int beg, end;
220
+ bsw2hit_t *q = 0;
221
+ beg = bwt->sa[k]; end = beg + p->tlen;
222
+ if (p->G > hits[beg*2].G) {
223
+ hits[beg*2+1] = hits[beg*2];
224
+ q = hits + beg * 2;
225
+ } else if (p->G > hits[beg*2+1].G) q = hits + beg * 2 + 1;
226
+ if (q) {
227
+ q->k = p->qk; q->l = p->ql; q->len = p->qlen; q->G = p->G;
228
+ q->beg = beg; q->end = end; q->G2 = q->k == q->l? 0 : q->G;
229
+ q->flag = q->n_seeds = 0;
230
+ }
231
+ }
232
+ }
233
+ }
234
+ /* "narrow hits" are node-to-node hits that have a high score and
235
+ * are not so repetitive (|SA interval|<=IS). */
236
+ static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, int t, int IS)
237
+ {
238
+ int i;
239
+ for (i = 0; i < u->n; ++i) {
240
+ bsw2hit_t *q;
241
+ bsw2cell_t *p = u->array + i;
242
+ if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit
243
+ if (b1->max == b1->n) {
244
+ b1->max = b1->max? b1->max<<1 : 4;
245
+ b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t));
246
+ }
247
+ q = &b1->hits[b1->n++];
248
+ q->k = p->qk; q->l = p->ql;
249
+ q->len = p->qlen;
250
+ q->G = p->G; q->G2 = 0;
251
+ q->beg = bwtl->sa[u->tk]; q->end = q->beg + p->tlen;
252
+ q->flag = 0;
253
+ // delete p
254
+ p->qk = p->ql = 0; p->G = 0;
255
+ if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3;
256
+ }
257
+ }
258
+ }
259
+ /* after this, "narrow SA hits" will be expanded and the coordinates
260
+ * will be obtained and stored in b->hits[*].k. */
261
+ int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS)
262
+ {
263
+ int i, j, n;
264
+ if (b->n == 0) return 0;
265
+ if (bwt) { // convert to chromosomal coordinates if suitable
266
+ int old_n = b->n;
267
+ bsw2hit_t *old_hits = b->hits;
268
+ for (i = n = 0; i < b->n; ++i) {
269
+ bsw2hit_t *p = old_hits + i;
270
+ if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1;
271
+ else if (p->G > 0) ++n;
272
+ }
273
+ b->n = b->max = n;
274
+ b->hits = calloc(b->max, sizeof(bsw2hit_t));
275
+ for (i = j = 0; i < old_n; ++i) {
276
+ bsw2hit_t *p = old_hits + i;
277
+ if (p->l - p->k + 1 <= IS) {
278
+ bwtint_t k;
279
+ for (k = p->k; k <= p->l; ++k) {
280
+ b->hits[j] = *p;
281
+ b->hits[j].k = bwt_sa(bwt, k);
282
+ b->hits[j].l = 0;
283
+ ++j;
284
+ }
285
+ } else if (p->G > 0) {
286
+ b->hits[j] = *p;
287
+ b->hits[j].k = bwt_sa(bwt, p->k);
288
+ b->hits[j].l = 0;
289
+ b->hits[j].flag |= 1;
290
+ ++j;
291
+ }
292
+ }
293
+ free(old_hits);
294
+ }
295
+ ks_introsort(hitG, b->n, b->hits);
296
+ for (i = 1; i < b->n; ++i) {
297
+ bsw2hit_t *p = b->hits + i;
298
+ if (p->G == 0) break;
299
+ for (j = 0; j < i; ++j) {
300
+ bsw2hit_t *q = b->hits + j;
301
+ int compatible = 1;
302
+ if (q->G == 0) continue;
303
+ if (p->l == 0 && q->l == 0) {
304
+ int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg);
305
+ if (qol < 0) qol = 0;
306
+ if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) {
307
+ int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len)
308
+ - (int64_t)(p->k > q->k? p->k : q->k);
309
+ if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL)
310
+ compatible = 0;
311
+ }
312
+ }
313
+ if (!compatible) {
314
+ p->G = 0;
315
+ break;
316
+ }
317
+ }
318
+ }
319
+ n = i;
320
+ for (i = j = 0; i < n; ++i) {
321
+ if (b->hits[i].G == 0) continue;
322
+ if (i != j) b->hits[j++] = b->hits[i];
323
+ else ++j;
324
+ }
325
+ b->n = j;
326
+ return b->n;
327
+ }
328
+
329
+ int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level)
330
+ {
331
+ int i, j, n;
332
+ if (b->n == 0) return 0;
333
+ ks_introsort(hitG, b->n, b->hits);
334
+ { // choose a random one
335
+ int G0 = b->hits[0].G;
336
+ for (i = 1; i < b->n; ++i)
337
+ if (b->hits[i].G != G0) break;
338
+ j = (int)(i * drand48());
339
+ if (j) {
340
+ bsw2hit_t tmp;
341
+ tmp = b->hits[0]; b->hits[0] = b->hits[j]; b->hits[j] = tmp;
342
+ }
343
+ }
344
+ for (i = 1; i < b->n; ++i) {
345
+ bsw2hit_t *p = b->hits + i;
346
+ int all_compatible = 1;
347
+ if (p->G == 0) break;
348
+ for (j = 0; j < i; ++j) {
349
+ bsw2hit_t *q = b->hits + j;
350
+ int64_t tol = 0;
351
+ int qol, compatible = 0;
352
+ float fol;
353
+ if (q->G == 0) continue;
354
+ qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg);
355
+ if (qol < 0) qol = 0;
356
+ if (p->l == 0 && q->l == 0) {
357
+ tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len)
358
+ - (p->k > q->k? p->k : q->k);
359
+ if (tol < 0) tol = 0;
360
+ }
361
+ fol = (float)qol / (p->end - p->beg < q->end - q->beg? p->end - p->beg : q->end - q->beg);
362
+ if (fol < mask_level || (tol > 0 && qol < p->end - p->beg && qol < q->end - q->beg)) compatible = 1;
363
+ if (!compatible) {
364
+ if (q->G2 < p->G) q->G2 = p->G;
365
+ all_compatible = 0;
366
+ }
367
+ }
368
+ if (!all_compatible) p->G = 0;
369
+ }
370
+ n = i;
371
+ for (i = j = 0; i < n; ++i) {
372
+ if (b->hits[i].G == 0) continue;
373
+ if (i != j) b->hits[j++] = b->hits[i];
374
+ else ++j;
375
+ }
376
+ b->n = j;
377
+ return j;
378
+ }
379
+ /* --- END: processing partial hits --- */
380
+
381
+ /* --- BEGIN: global mem pool --- */
382
+ bsw2global_t *bsw2_global_init()
383
+ {
384
+ bsw2global_t *pool;
385
+ bsw2stack_t *stack;
386
+ pool = calloc(1, sizeof(bsw2global_t));
387
+ stack = calloc(1, sizeof(bsw2stack_t));
388
+ stack->pool = (mempool_t*)calloc(1, sizeof(mempool_t));
389
+ pool->stack = (void*)stack;
390
+ return pool;
391
+ }
392
+
393
+ void bsw2_global_destroy(bsw2global_t *pool)
394
+ {
395
+ stack_destroy((bsw2stack_t*)pool->stack);
396
+ free(pool->aln_mem);
397
+ free(pool);
398
+ }
399
+ /* --- END: global mem pool --- */
400
+
401
+ static inline int fill_cell(const bsw2opt_t *o, int match_score, bsw2cell_t *c[4])
402
+ {
403
+ int G = c[3]? c[3]->G + match_score : MINUS_INF;
404
+ if (c[1]) {
405
+ c[0]->I = c[1]->I > c[1]->G - o->q? c[1]->I - o->r : c[1]->G - o->qr;
406
+ if (c[0]->I > G) G = c[0]->I;
407
+ } else c[0]->I = MINUS_INF;
408
+ if (c[2]) {
409
+ c[0]->D = c[2]->D > c[2]->G - o->q? c[2]->D - o->r : c[2]->G - o->qr;
410
+ if (c[0]->D > G) G = c[0]->D;
411
+ } else c[0]->D = MINUS_INF;
412
+ return(c[0]->G = G);
413
+ }
414
+
415
+ static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s)
416
+ {
417
+ bsw2entry_t *u;
418
+ bsw2cell_t *x;
419
+
420
+ u = mp_alloc(s->pool);
421
+ u->tk = 0; u->tl = target->seq_len;
422
+ x = push_array_p(u);
423
+ *x = g_default_cell;
424
+ x->G = 0; x->qk = 0; x->ql = query->seq_len;
425
+ u->n++;
426
+ stack_push0(s, u);
427
+ }
428
+ /* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) */
429
+ bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool)
430
+ {
431
+ bsw2stack_t *stack = (bsw2stack_t*)pool->stack;
432
+ bwtsw2_t *b, *b1, **b_ret;
433
+ int i, j, score_mat[16], *heap, heap_size, n_tot = 0;
434
+ struct rusage curr, last;
435
+ khash_t(64) *rhash, *chash;
436
+
437
+ // initialize connectivity hash (chash)
438
+ chash = bsw2_connectivity(target);
439
+ // calculate score matrix
440
+ for (i = 0; i != 4; ++i)
441
+ for (j = 0; j != 4; ++j)
442
+ score_mat[i<<2|j] = (i == j)? opt->a : -opt->b;
443
+ // initialize other variables
444
+ rhash = kh_init(64);
445
+ init_bwtsw2(target, query, stack);
446
+ heap_size = opt->z;
447
+ heap = calloc(heap_size, sizeof(int));
448
+ // initialize the return struct
449
+ b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
450
+ b->n = b->max = target->seq_len * 2;
451
+ b->hits = calloc(b->max, sizeof(bsw2hit_t));
452
+ b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
453
+ b_ret = calloc(2, sizeof(void*));
454
+ b_ret[0] = b; b_ret[1] = b1;
455
+ // initialize timer
456
+ getrusage(0, &last);
457
+ // the main loop: traversal of the DAG
458
+ while (!stack_isempty(stack)) {
459
+ int old_n, tj;
460
+ bsw2entry_t *v;
461
+ uint32_t k, l, tcntk[4], tcntl[4];
462
+
463
+ v = stack_pop(stack); old_n = v->n;
464
+ n_tot += v->n;
465
+
466
+ for (i = 0; i < v->n; ++i) { // test max depth and band width
467
+ bsw2cell_t *p = v->array + i;
468
+ if (p->ql == 0) continue;
469
+ if (p->tlen - (int)p->qlen > opt->bw || (int)p->qlen - p->tlen > opt->bw) {
470
+ p->qk = p->ql = 0;
471
+ if (p->ppos >= 0) v->array[p->ppos].cpos[p->pj] = -5;
472
+ }
473
+ }
474
+
475
+ // get Occ for the DAG
476
+ bwtl_2occ4(target, v->tk - 1, v->tl, tcntk, tcntl);
477
+ for (tj = 0; tj != 4; ++tj) { // descend to the children
478
+ uint32_t qcntk[4], qcntl[4];
479
+ int qj, *curr_score_mat = score_mat + tj * 4;
480
+ khiter_t iter;
481
+ bsw2entry_t *u;
482
+
483
+ k = target->L2[tj] + tcntk[tj] + 1;
484
+ l = target->L2[tj] + tcntl[tj];
485
+ if (k > l) continue;
486
+ // update counter
487
+ iter = kh_get(64, chash, (uint64_t)k<<32 | l);
488
+ --kh_value(chash, iter);
489
+ // initialization
490
+ u = mp_alloc(stack->pool);
491
+ u->tk = k; u->tl = l;
492
+ memset(heap, 0, sizeof(int) * opt->z);
493
+ // loop through all the nodes in v
494
+ for (i = 0; i < v->n; ++i) {
495
+ bsw2cell_t *p = v->array + i, *x, *c[4]; // c[0]=>current, c[1]=>I, c[2]=>D, c[3]=>G
496
+ int is_added = 0;
497
+ if (p->ql == 0) continue; // deleted node
498
+ c[0] = x = push_array_p(u);
499
+ x->G = MINUS_INF;
500
+ p->upos = x->upos = -1;
501
+ if (p->ppos >= 0) { // parent has been visited
502
+ c[1] = (v->array[p->ppos].upos >= 0)? u->array + v->array[p->ppos].upos : 0;
503
+ c[3] = v->array + p->ppos; c[2] = p;
504
+ if (fill_cell(opt, curr_score_mat[p->pj], c) > 0) { // then update topology at p and x
505
+ x->ppos = v->array[p->ppos].upos; // the parent pos in u
506
+ p->upos = u->n++; // the current pos in u
507
+ if (x->ppos >= 0) u->array[x->ppos].cpos[p->pj] = p->upos; // the child pos of its parent in u
508
+ is_added = 1;
509
+ }
510
+ } else {
511
+ x->D = p->D > p->G - opt->q? p->D - opt->r : p->G - opt->qr;
512
+ if (x->D > 0) {
513
+ x->G = x->D;
514
+ x->I = MINUS_INF; x->ppos = -1;
515
+ p->upos = u->n++;
516
+ is_added = 1;
517
+ }
518
+ }
519
+ if (is_added) { // x has been added to u->array. fill the remaining variables
520
+ x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
521
+ x->pj = p->pj; x->qk = p->qk; x->ql = p->ql; x->qlen = p->qlen; x->tlen = p->tlen + 1;
522
+ if (x->G > -heap[0]) {
523
+ heap[0] = -x->G;
524
+ ks_heapadjust(int, 0, heap_size, heap);
525
+ }
526
+ }
527
+ if ((x->G > opt->qr && x->G >= -heap[0]) || i < old_n) { // good node in u, or in v
528
+ if (p->cpos[0] == -1 || p->cpos[1] == -1 || p->cpos[2] == -1 || p->cpos[3] == -1) {
529
+ bwt_2occ4(query, p->qk - 1, p->ql, qcntk, qcntl);
530
+ for (qj = 0; qj != 4; ++qj) { // descend to the prefix trie
531
+ if (p->cpos[qj] != -1) continue; // this node will be visited later
532
+ k = query->L2[qj] + qcntk[qj] + 1;
533
+ l = query->L2[qj] + qcntl[qj];
534
+ if (k > l) { p->cpos[qj] = -2; continue; }
535
+ x = push_array_p(v);
536
+ p = v->array + i; // p may not point to the correct position after realloc
537
+ x->G = x->I = x->D = MINUS_INF;
538
+ x->qk = k; x->ql = l; x->pj = qj; x->qlen = p->qlen + 1; x->ppos = i; x->tlen = p->tlen;
539
+ x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
540
+ p->cpos[qj] = v->n++;
541
+ } // ~for(qj)
542
+ } // ~if(p->cpos[])
543
+ } // ~if
544
+ } // ~for(i)
545
+ if (u->n) save_hits(target, opt->t, b->hits, u);
546
+ { // push u to the stack (or to the pending array)
547
+ uint32_t cnt, pos;
548
+ cnt = (uint32_t)kh_value(chash, iter);
549
+ pos = kh_value(chash, iter)>>32;
550
+ if (pos) { // something in the pending array, then merge
551
+ bsw2entry_t *w = kv_A(stack->pending, pos-1);
552
+ if (u->n) {
553
+ if (w->n < u->n) { // swap
554
+ w = u; u = kv_A(stack->pending, pos-1); kv_A(stack->pending, pos-1) = w;
555
+ }
556
+ merge_entry(opt, w, u, b);
557
+ }
558
+ if (cnt == 0) { // move from pending to stack0
559
+ remove_duplicate(w, rhash);
560
+ save_narrow_hits(target, w, b1, opt->t, opt->is);
561
+ cut_tail(w, opt->z, u);
562
+ stack_push0(stack, w);
563
+ kv_A(stack->pending, pos-1) = 0;
564
+ --stack->n_pending;
565
+ }
566
+ mp_free(stack->pool, u);
567
+ } else if (cnt) { // the first time
568
+ if (u->n) { // push to the pending queue
569
+ ++stack->n_pending;
570
+ kv_push(bsw2entry_p, stack->pending, u);
571
+ kh_value(chash, iter) = (uint64_t)kv_size(stack->pending)<<32 | cnt;
572
+ } else mp_free(stack->pool, u);
573
+ } else { // cnt == 0, then push to the stack
574
+ bsw2entry_t *w = mp_alloc(stack->pool);
575
+ save_narrow_hits(target, u, b1, opt->t, opt->is);
576
+ cut_tail(u, opt->z, w);
577
+ mp_free(stack->pool, w);
578
+ stack_push0(stack, u);
579
+ }
580
+ }
581
+ } // ~for(tj)
582
+ mp_free(stack->pool, v);
583
+ } // while(top)
584
+ getrusage(0, &curr);
585
+ bsw2_resolve_duphits(query, b, opt->is);
586
+ bsw2_resolve_duphits(query, b1, opt->is);
587
+ //fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot);
588
+ // free
589
+ free(heap);
590
+ kh_destroy(64, rhash);
591
+ kh_destroy(64, chash);
592
+ stack->pending.n = stack->stack0.n = 0;
593
+ return b_ret;
594
+ }