bio-bwa 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwtsw2.h
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
#ifndef LH3_BWTSW2_H
|
2
|
+
#define LH3_BWTSW2_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
#include "bntseq.h"
|
6
|
+
#include "bwt_lite.h"
|
7
|
+
#include "bwt.h"
|
8
|
+
|
9
|
+
/* Tunable parameters of the BWA-SW aligner (defaults are set by bsw2_init_opt()). */
typedef struct {
	int a;           /* score of a match (diagonal of the scoring matrix) */
	int b;           /* mismatch penalty (stored negated off the diagonal) */
	int q;           /* gap open penalty */
	int r;           /* gap extension penalty (also used as the gap-end penalty) */
	int t;           /* minimum score threshold */
	int qr;          /* initialised to q + r */
	int bw;          /* band width of the banded alignment */
	int z;           /* default 1; consumed outside bwtsw2_aux.c */
	int is;          /* default 3; consumed outside bwtsw2_aux.c */
	int t_seeds;     /* hits with fewer seeds than this trigger the reverse-index pass */
	int hard_clip;   /* non-zero: print clipping as 'H' and trim the printed read */
	float yita;      /* default 5.5; consumed outside bwtsw2_aux.c */
	float mask_level;/* handed to bsw2_resolve_query_overlaps() */
	float coef;      /* t is raised to at least log(read length) * coef */
	int n_threads;   /* number of worker threads */
	int chunk_size;  /* number of bases read before a batch is processed */
} bsw2opt_t;
|
15
|
+
|
16
|
+
/* One hit of a read against the reference. */
typedef struct {
	/* When l == 0 the hit has a single position and k is used as a coordinate
	 * in the packed reference; otherwise l - k + 1 is reported as the XI tag
	 * (presumably the size of a suffix-array interval -- confirm in bwtsw2_core.c). */
	uint32_t k, l;
	uint32_t flag:18;    /* 0x10: reverse strand; bit 0 forces mapQ to 0; bits 16-17: which index produced the hit */
	uint32_t n_seeds:14; /* number of seeds supporting the hit (printed as XE) */
	int len;             /* length of the hit on the reference */
	int G;               /* alignment score (printed as AS) */
	int G2;              /* sub-optimal score (printed as XS) */
	int beg, end;        /* hit interval on the query */
} bsw2hit_t;
|
21
|
+
|
22
|
+
typedef struct {
|
23
|
+
int n, max;
|
24
|
+
bsw2hit_t *hits;
|
25
|
+
int *n_cigar;
|
26
|
+
uint32_t **cigar;
|
27
|
+
} bwtsw2_t;
|
28
|
+
|
29
|
+
/* Per-thread scratch space shared by successive alignments. */
typedef struct {
	void *stack;      /* opaque here; managed outside bwtsw2_aux.c */
	int max_l;        /* longest read length aln_mem has been sized for */
	uint8_t *aln_mem; /* working memory handed to aln_extend_core() */
} bsw2global_t;
|
34
|
+
|
35
|
+
#ifdef __cplusplus
|
36
|
+
extern "C" {
|
37
|
+
#endif
|
38
|
+
|
39
|
+
bsw2opt_t *bsw2_init_opt();
|
40
|
+
bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool);
|
41
|
+
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target[2], const char *fn);
|
42
|
+
void bsw2_destroy(bwtsw2_t *b);
|
43
|
+
|
44
|
+
bsw2global_t *bsw2_global_init();
|
45
|
+
void bsw2_global_destroy(bsw2global_t *_pool);
|
46
|
+
|
47
|
+
#ifdef __cplusplus
|
48
|
+
}
|
49
|
+
#endif
|
50
|
+
|
51
|
+
#endif
|
data/ext/bwtsw2_aux.c
ADDED
@@ -0,0 +1,650 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <math.h>
|
4
|
+
#ifdef HAVE_CONFIG_H
|
5
|
+
#include "config.h"
|
6
|
+
#endif
|
7
|
+
#ifdef HAVE_PTHREAD
|
8
|
+
#include <pthread.h>
|
9
|
+
#endif
|
10
|
+
#include "bntseq.h"
|
11
|
+
#include "bwt_lite.h"
|
12
|
+
#include "utils.h"
|
13
|
+
#include "bwtsw2.h"
|
14
|
+
#include "stdaln.h"
|
15
|
+
#include "kstring.h"
|
16
|
+
|
17
|
+
#include "kseq.h"
|
18
|
+
KSEQ_INIT(gzFile, gzread)
|
19
|
+
|
20
|
+
#include "ksort.h"
|
21
|
+
#define __left_lt(a, b) ((a).end > (b).end)
|
22
|
+
KSORT_INIT(hit, bsw2hit_t, __left_lt)
|
23
|
+
|
24
|
+
extern unsigned char nst_nt4_table[256];
|
25
|
+
|
26
|
+
/* Complement of a nucleotide code, indexed by its ASCII value. IUPAC letters
 * map to their complement with the case preserved; every other byte maps to
 * 'N', except 0x60 and the lower-case non-IUPAC letters which map to 'n'. */
unsigned char nt_comp_table[256] = {
	/* 0x00 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0x10 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0x20 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0x30 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0x40 */ 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
	/* 0x50 */ 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
	/* 0x60 */ 'n','t','v','g', 'h','n','n','c', 'd','n','n','m', 'n','k','n','n',
	/* 0x70 */ 'n','n','y','s', 'a','n','b','w', 'x','r','n','N', 'N','N','N','N',
	/* 0x80 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0x90 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xa0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xb0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xc0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xd0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xe0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
	/* 0xf0 */ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
};
|
44
|
+
|
45
|
+
extern int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS);
|
46
|
+
extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level);
|
47
|
+
|
48
|
+
bsw2opt_t *bsw2_init_opt()
|
49
|
+
{
|
50
|
+
bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t));
|
51
|
+
o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30;
|
52
|
+
o->bw = 50;
|
53
|
+
o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0;
|
54
|
+
o->mask_level = 0.50f; o->yita = 5.5f; o->coef = 5.5f;
|
55
|
+
o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000;
|
56
|
+
return o;
|
57
|
+
}
|
58
|
+
|
59
|
+
/* Release a hit set together with every CIGAR it owns. A NULL set is ignored. */
void bsw2_destroy(bwtsw2_t *b)
{
	if (b != 0) {
		if (b->cigar != 0) {
			int idx = 0;
			/* one CIGAR array per hit in use */
			while (idx < b->n) {
				free(b->cigar[idx]);
				++idx;
			}
		}
		free(b->cigar);
		free(b->n_cigar);
		free(b->hits);
		free(b);
	}
}
|
68
|
+
|
69
|
+
/* Fill the alignment parameters `par` from the option set pointed to by `opt`.
 * par.matrix must already point at 25 ints: it becomes a 5x5 scoring matrix
 * with (opt)->a on the first four diagonal cells and -(opt)->b elsewhere.
 * Both arguments are parenthesized at every use, and the loop counter has a
 * macro-private name so that it cannot capture a caller's variable. */
#define __gen_ap(par, opt) do {											\
		int gen_ap_idx_;												\
		for (gen_ap_idx_ = 0; gen_ap_idx_ < 25; ++gen_ap_idx_)			\
			(par).matrix[gen_ap_idx_] = -(opt)->b;						\
		for (gen_ap_idx_ = 0; gen_ap_idx_ < 4; ++gen_ap_idx_)			\
			(par).matrix[gen_ap_idx_*5+gen_ap_idx_] = (opt)->a;			\
		(par).gap_open = (opt)->q; (par).gap_ext = (opt)->r;			\
		(par).gap_end = (opt)->r;										\
		(par).row = 5; (par).band_width = (opt)->bw;					\
	} while (0)
|
77
|
+
|
78
|
+
/* Fetch the 2-bit code stored at index (l - i - 1) of the packed array `pac`,
 * i.e. position i counted from the end of a sequence of length l. Arguments
 * are parenthesized so that expressions such as `k + 1` index correctly. */
#define __rpac(pac, l, i) ((pac)[((l)-(i)-1)>>2] >> ((~((l)-(i)-1)&3)*2) & 0x3)
|
79
|
+
|
80
|
+
/* Extend every single-position hit of b towards the start of the query.
 * Hits are first sorted by descending query end. A hit contained in an
 * earlier hit is not extended; it only bumps that hit's seed count.
 * _mem is the working memory passed through to aln_extend_core(). */
void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, uint32_t l_pac, int is_rev, uint8_t *_mem)
{
	int score_mat[25], h;
	uint8_t *ref = 0, *rev_query;
	AlnParam ap;

	ap.matrix = score_mat;
	__gen_ap(ap, opt);
	rev_query = calloc(lq, 1);
	// sort according to the descending order of query end
	ks_introsort(hit, b->n, b->hits);
	ref = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
	// the extension runs right-to-left, so work on the reversed query
	for (h = 0; h < lq; ++h) rev_query[lq - h - 1] = _query[h];
	for (h = 0; h < b->n; ++h) {
		bsw2hit_t *cur = b->hits + h;
		int max_ref = ((cur->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
		int n_cover = 0, n_ref = 0, g, s;
		bwtint_t pos;
		path_t tip;

		cur->n_seeds = 1;
		if (cur->l != 0 || cur->k == 0) continue;
		// count the earlier hits that contain this one on both query and reference
		for (g = 0; g < h; ++g) {
			bsw2hit_t *prev = b->hits + g;
			if (prev->beg > cur->beg || prev->k > cur->k) continue;
			if (prev->k + prev->len < cur->k + cur->len) continue;
			if (prev->n_seeds < (1<<14) - 2) ++prev->n_seeds;
			++n_cover;
		}
		if (n_cover != 0) continue;
		if (max_ref > cur->k) max_ref = cur->k;
		// collect the reference bases left of the hit, nearest first
		// FIXME: k=0 not considered! (the loops stop before reference position 0)
		if (is_rev) {
			for (pos = cur->k - 1; pos > 0 && n_ref < max_ref; --pos)
				ref[n_ref++] = __rpac(pac, l_pac, pos);
		} else {
			for (pos = cur->k - 1; pos > 0 && n_ref < max_ref; --pos)
				ref[n_ref++] = pac[pos>>2] >> (~pos&3)*2 & 0x3;
		}
		s = aln_extend_core(ref, n_ref, rev_query + lq - cur->beg, cur->beg, &ap, &tip, 0, cur->G, _mem);
		if (s > cur->G) { // extensible
			cur->G = s;
			cur->len += tip.i;
			cur->beg -= tip.j;
			cur->k -= tip.i;
		}
	}
	free(rev_query);
	free(ref);
}
|
130
|
+
|
131
|
+
/* Re-extend every single-position hit of b from its query start towards the
 * end of the query. When the new score is at least the stored one, the hit's
 * score, reference length and query end are replaced.
 * _mem is the working memory passed through to aln_extend_core(). */
void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, uint32_t l_pac, int is_rev, uint8_t *_mem)
{
	int score_mat[25], h;
	uint8_t *ref;
	AlnParam ap;

	ap.matrix = score_mat;
	__gen_ap(ap, opt);
	ref = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
	for (h = 0; h < b->n; ++h) {
		bsw2hit_t *cur = b->hits + h;
		int max_ref, n_ref = 0, s;
		uint32_t pos;
		path_t tip;

		if (cur->l != 0) continue;
		max_ref = ((lq - cur->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
		// collect reference bases from the hit start, bounded by the reference end
		for (pos = cur->k; pos < cur->k + max_ref && pos < l_pac; ++pos) {
			if (is_rev) {
				ref[n_ref++] = __rpac(pac, l_pac, pos);
			} else {
				ref[n_ref++] = pac[pos>>2] >> (~pos&3)*2 & 0x3;
			}
		}
		s = aln_extend_core(ref, n_ref, query + cur->beg, lq - cur->beg, &ap, &tip, 0, 1, _mem);
		if (s >= cur->G) {
			cur->G = s;
			cur->len = tip.i;
			cur->end = tip.j + cur->beg;
		}
	}
	free(ref);
}
|
165
|
+
|
166
|
+
/* generate CIGAR array(s) in b->cigar[] */
|
167
|
+
static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], uint8_t *pac, bwtsw2_t *b)
|
168
|
+
{
|
169
|
+
uint8_t *target;
|
170
|
+
int i, matrix[25];
|
171
|
+
AlnParam par;
|
172
|
+
path_t *path;
|
173
|
+
|
174
|
+
par.matrix = matrix;
|
175
|
+
__gen_ap(par, opt);
|
176
|
+
i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length
|
177
|
+
target = calloc(i, 1);
|
178
|
+
path = calloc(i + lq, sizeof(path_t));
|
179
|
+
// memory clean up for b
|
180
|
+
if (b->n < b->max) {
|
181
|
+
b->max = b->n;
|
182
|
+
b->hits = realloc(b->hits, b->n * sizeof(bsw2hit_t));
|
183
|
+
}
|
184
|
+
if (b->cigar) free(b->cigar);
|
185
|
+
if (b->n_cigar) free(b->n_cigar);
|
186
|
+
b->cigar = (uint32_t**)calloc(b->max, sizeof(void*));
|
187
|
+
b->n_cigar = (int*)calloc(b->max, sizeof(int));
|
188
|
+
// generate CIGAR
|
189
|
+
for (i = 0; i < b->n; ++i) {
|
190
|
+
bsw2hit_t *p = b->hits + i;
|
191
|
+
uint8_t *query;
|
192
|
+
uint32_t k;
|
193
|
+
int score, path_len, beg, end;
|
194
|
+
if (p->l) continue;
|
195
|
+
beg = (p->flag & 0x10)? lq - p->end : p->beg;
|
196
|
+
end = (p->flag & 0x10)? lq - p->beg : p->end;
|
197
|
+
query = seq[(p->flag & 0x10)? 1 : 0] + beg;
|
198
|
+
for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here
|
199
|
+
target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3;
|
200
|
+
score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
|
201
|
+
b->cigar[i] = aln_path2cigar32(path, path_len, &b->n_cigar[i]);
|
202
|
+
if (beg != 0 || end < lq) { // write soft clipping
|
203
|
+
b->cigar[i] = realloc(b->cigar[i], 4 * (b->n_cigar[i] + 2));
|
204
|
+
if (beg != 0) {
|
205
|
+
memmove(b->cigar[i] + 1, b->cigar[i], b->n_cigar[i] * 4);
|
206
|
+
b->cigar[i][0] = beg<<4 | 4;
|
207
|
+
++b->n_cigar[i];
|
208
|
+
}
|
209
|
+
if (end < lq) {
|
210
|
+
b->cigar[i][b->n_cigar[i]] = (lq - end)<<4 | 4;
|
211
|
+
++b->n_cigar[i];
|
212
|
+
}
|
213
|
+
}
|
214
|
+
}
|
215
|
+
free(target); free(path);
|
216
|
+
}
|
217
|
+
|
218
|
+
/* this is for the debugging purpose only */
|
219
|
+
void bsw2_debug_hits(const bwtsw2_t *b)
|
220
|
+
{
|
221
|
+
int i;
|
222
|
+
printf("# raw hits: %d\n", b->n);
|
223
|
+
for (i = 0; i < b->n; ++i) {
|
224
|
+
bsw2hit_t *p = b->hits + i;
|
225
|
+
if (p->l == 0)
|
226
|
+
printf("%d, %d, %d, %u, %u\n", p->G, p->beg, p->end, p->k, p->l);
|
227
|
+
}
|
228
|
+
}
|
229
|
+
|
230
|
+
/* Append the hits of b[1] to b[0], then destroy b[1] and set it to NULL.
 * When is_reverse is non-zero the appended hits have their query interval
 * mirrored within a query of length l and are flagged 0x10. */
static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse)
{
	bwtsw2_t *dst = b[0], *src = b[1];
	int total = dst->n + src->n;
	int s;

	if (total > dst->max) { // grow to exactly the size needed
		dst->max = total;
		dst->hits = realloc(dst->hits, dst->max * sizeof(bsw2hit_t));
	}
	for (s = 0; s < src->n; ++s) {
		bsw2hit_t *slot = &dst->hits[dst->n + s];
		*slot = src->hits[s];
		if (!is_reverse) continue;
		{
			int old_beg = slot->beg;
			slot->beg = l - slot->end;
			slot->end = l - old_beg;
			slot->flag |= 0x10;
		}
	}
	dst->n = total;
	bsw2_destroy(src);
	b[1] = 0;
}
|
251
|
+
/* seq[0] is the forward sequence and seq[1] is the reverse complement. */
|
252
|
+
static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target,
|
253
|
+
int l, uint8_t *seq[2], int is_rev, bsw2global_t *pool)
|
254
|
+
{
|
255
|
+
extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]);
|
256
|
+
bwtsw2_t *b[2], **bb[2];
|
257
|
+
int k;
|
258
|
+
for (k = 0; k < 2; ++k) {
|
259
|
+
bwtl_t *query = bwtl_seq2bwtl(l, seq[k]);
|
260
|
+
bb[k] = bsw2_core(opt, query, target, pool);
|
261
|
+
bwtl_destroy(query);
|
262
|
+
}
|
263
|
+
b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits"
|
264
|
+
bsw2_chain_filter(opt, l, b);
|
265
|
+
for (k = 0; k < 2; ++k) {
|
266
|
+
bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem);
|
267
|
+
merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here
|
268
|
+
bsw2_resolve_duphits(0, bb[k][0], 0);
|
269
|
+
bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem);
|
270
|
+
b[k] = bb[k][0];
|
271
|
+
free(bb[k]);
|
272
|
+
}
|
273
|
+
merge_hits(b, l, 1); // again, b[1] is merged to b[0]
|
274
|
+
bsw2_resolve_query_overlaps(b[0], opt->mask_level);
|
275
|
+
return b[0];
|
276
|
+
}
|
277
|
+
|
278
|
+
/* set ->flag to records the origin of the hit (to forward bwt or reverse bwt) */
|
279
|
+
static void flag_fr(bwtsw2_t *b[2])
|
280
|
+
{
|
281
|
+
int i, j;
|
282
|
+
for (i = 0; i < b[0]->n; ++i) {
|
283
|
+
bsw2hit_t *p = b[0]->hits + i;
|
284
|
+
p->flag |= 0x10000;
|
285
|
+
}
|
286
|
+
for (i = 0; i < b[1]->n; ++i) {
|
287
|
+
bsw2hit_t *p = b[1]->hits + i;
|
288
|
+
p->flag |= 0x20000;
|
289
|
+
}
|
290
|
+
for (i = 0; i < b[0]->n; ++i) {
|
291
|
+
bsw2hit_t *p = b[0]->hits + i;
|
292
|
+
for (j = 0; j < b[1]->n; ++j) {
|
293
|
+
bsw2hit_t *q = b[1]->hits + j;
|
294
|
+
if (q->beg == p->beg && q->end == p->end && q->k == p->k && q->len == p->len && q->G == p->G) {
|
295
|
+
q->flag |= 0x30000; p->flag |= 0x30000;
|
296
|
+
break;
|
297
|
+
}
|
298
|
+
}
|
299
|
+
}
|
300
|
+
}
|
301
|
+
|
302
|
+
/* One read waiting to be aligned, and later its SAM output. */
typedef struct {
	int l;      /* length of the read */
	int tid;    /* id of the thread that claimed the read, -1 while unclaimed */
	char *name; /* read name */
	char *seq;  /* bases */
	char *qual; /* qualities, NULL when the input has none */
	char *sam;  /* generated SAM text, NULL until the read has been processed */
} bsw2seq1_t;

/* A growable batch of reads. */
typedef struct {
	int n;           /* number of reads in use */
	int max;         /* number of reads allocated in seq[] */
	bsw2seq1_t *seq; /* the reads */
} bsw2seq_t;
|
311
|
+
|
312
|
+
#ifdef HAVE_PTHREAD
|
313
|
+
static pthread_mutex_t g_dbwtsw_lock = PTHREAD_MUTEX_INITIALIZER;
|
314
|
+
#endif
|
315
|
+
|
316
|
+
/* Trim an alignment that runs past the end of its reference sequence.
 *
 * qname    read name (currently unused)
 * bns      reference annotations
 * p        the hit; ->k and ->len are updated when the alignment is cut
 * n_cigar  number of entries in cigar
 * cigar    entries encoded as length<<4 | op with op 0=M, 1=I, 2=D, 4=S, 5=H;
 *          rewritten in place, which may need up to n_cigar + 1 entries
 *
 * The alignment is split at the sequence boundary, the part beyond it is
 * turned into soft clipping, and the side with more matched query bases is
 * kept. Returns the new number of CIGAR entries, or n_cigar unchanged when
 * nothing had to be fixed or the scratch buffer could not be allocated. */
static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar)
{
	// FIXME: this routine does not work if the query bridge three reference sequences
	int32_t coor, refl, lq;
	int x, y, i, seqid;
	(void)qname;
	bns_coor_pac2real(bns, p->k, p->len, &seqid);
	coor = p->k - bns->anns[seqid].offset;
	refl = bns->anns[seqid].len;
	x = coor; y = 0; // x walks the reference, y walks the query
	// test if the alignment goes beyond the boundary
	for (i = 0; i < n_cigar; ++i) {
		int op = cigar[i]&0xf, ln = cigar[i]>>4;
		if (op == 1 || op == 4 || op == 5) y += ln;
		else if (op == 2) x += ln;
		else x += ln, y += ln;
	}
	lq = y; // length of the query sequence
	if (x > refl) { // then fix it
		int j, nc, mq[2], nlen[2];
		uint32_t *cn, kk = 0;
		nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0;
		// a split adds at most three entries (M -> M,S,S,M)
		cn = calloc(n_cigar + 3, 4);
		if (cn == 0) return n_cigar; // out of memory: leave the alignment as it is
		x = coor; y = 0;
		for (i = j = 0; i < n_cigar; ++i) {
			int op = cigar[i]&0xf, ln = cigar[i]>>4;
			if (op == 4 || op == 5 || op == 1) { // ins or clipping
				y += ln;
				cn[j++] = cigar[i];
			} else if (op == 2) { // del
				if (x + ln >= refl && nc == 0) {
					// the deletion spans the boundary: drop it and clip both sides
					cn[j++] = (uint32_t)(lq - y)<<4 | 4;
					nc = j;
					cn[j++] = (uint32_t)y<<4 | 4;
					kk = p->k + (x + ln - refl);
					nlen[0] = x - coor;
					nlen[1] = p->len - nlen[0] - ln;
				} else cn[j++] = cigar[i];
				x += ln;
			} else if (op == 0) { // match
				if (x + ln >= refl && nc == 0) {
					// FIXME: not consider a special case where a split right between M and I
					cn[j++] = (uint32_t)(refl - x)<<4 | 0; // write M
					cn[j++] = (uint32_t)(lq - y - (refl - x))<<4 | 4; // write S
					nc = j; // cn[0..nc) is the first alignment, cn[nc..j) the second
					mq[0] += refl - x;
					cn[j++] = (uint32_t)(y + (refl - x))<<4 | 4;
					if (x + ln - refl) cn[j++] = (uint32_t)(x + ln - refl)<<4 | 0;
					mq[1] += x + ln - refl;
					kk = bns->anns[seqid].offset + refl;
					nlen[0] = refl - coor;
					nlen[1] = p->len - nlen[0];
				} else {
					cn[j++] = cigar[i];
					mq[nc?1:0] += ln;
				}
				x += ln; y += ln;
			}
		}
		if (mq[0] > mq[1]) { // then take the first alignment
			n_cigar = nc;
			memcpy(cigar, cn, 4 * nc);
			p->len = nlen[0];
		} else {
			p->k = kk; p->len = nlen[1];
			n_cigar = j - nc;
			memcpy(cigar, cn + nc, 4 * (j - nc));
		}
		free(cn);
	}
	return n_cigar;
}
|
387
|
+
|
388
|
+
/* generate SAM lines for a sequence in ks with alignment stored in
|
389
|
+
* b. ks->name and ks->seq will be freed and set to NULL in the end. */
|
390
|
+
static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b)
|
391
|
+
{
|
392
|
+
int i, k;
|
393
|
+
kstring_t str;
|
394
|
+
memset(&str, 0, sizeof(kstring_t));
|
395
|
+
if (b == 0 || b->n == 0) { // no hits
|
396
|
+
ksprintf(&str, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t", ks->name);
|
397
|
+
for (i = 0; i < ks->l; ++i) kputc(ks->seq[i], &str);
|
398
|
+
if (ks->qual) {
|
399
|
+
kputc('\t', &str);
|
400
|
+
for (i = 0; i < ks->l; ++i) kputc(ks->qual[i], &str);
|
401
|
+
} else kputs("\t*", &str);
|
402
|
+
kputc('\n', &str);
|
403
|
+
}
|
404
|
+
for (i = 0; b && i < b->n; ++i) {
|
405
|
+
bsw2hit_t *p = b->hits + i;
|
406
|
+
int32_t seqid = -1, coor = -1;
|
407
|
+
int j, qual, nn = 0;
|
408
|
+
int beg, end;
|
409
|
+
if (p->l == 0) {
|
410
|
+
b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]);
|
411
|
+
nn = bns_coor_pac2real(bns, p->k, p->len, &seqid);
|
412
|
+
coor = p->k - bns->anns[seqid].offset;
|
413
|
+
}
|
414
|
+
ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10);
|
415
|
+
ksprintf(&str, "\t%s\t%d", seqid>=0? bns->anns[seqid].name : "*", coor + 1);
|
416
|
+
if (p->l == 0) {
|
417
|
+
{ // estimate mapping quality
|
418
|
+
float c = 1.0;
|
419
|
+
int subo = p->G2 > opt->t? p->G2 : opt->t;
|
420
|
+
if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5;
|
421
|
+
if (p->n_seeds < 2) c *= .2;
|
422
|
+
qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499);
|
423
|
+
if (qual > 250) qual = 250;
|
424
|
+
if (p->flag&1) qual = 0;
|
425
|
+
}
|
426
|
+
ksprintf(&str, "\t%d\t", qual);
|
427
|
+
for (k = 0; k < b->n_cigar[i]; ++k)
|
428
|
+
ksprintf(&str, "%d%c", b->cigar[i][k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[b->cigar[i][k]&0xf]);
|
429
|
+
} else ksprintf(&str, "\t0\t*");
|
430
|
+
ksprintf(&str, "\t*\t0\t0\t");
|
431
|
+
beg = 0; end = ks->l;
|
432
|
+
if (opt->hard_clip) {
|
433
|
+
if ((b->cigar[i][0]&0xf) == 4) beg += b->cigar[i][0]>>4;
|
434
|
+
if ((b->cigar[i][b->n_cigar[i]-1]&0xf) == 4) end -= b->cigar[i][b->n_cigar[i]-1]>>4;
|
435
|
+
}
|
436
|
+
for (j = beg; j < end; ++j) {
|
437
|
+
if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str);
|
438
|
+
else kputc(ks->seq[j], &str);
|
439
|
+
}
|
440
|
+
if (ks->qual) {
|
441
|
+
kputc('\t', &str);
|
442
|
+
for (j = beg; j < end; ++j) {
|
443
|
+
if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str);
|
444
|
+
else kputc(ks->qual[j], &str);
|
445
|
+
}
|
446
|
+
} else ksprintf(&str, "\t*");
|
447
|
+
ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tXN:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, nn);
|
448
|
+
if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1);
|
449
|
+
kputc('\n', &str);
|
450
|
+
}
|
451
|
+
ks->sam = str.s;
|
452
|
+
free(ks->seq); ks->seq = 0;
|
453
|
+
free(ks->qual); ks->qual = 0;
|
454
|
+
free(ks->name); ks->name = 0;
|
455
|
+
}
|
456
|
+
|
457
|
+
/* Core routine to align reads in _seq. It is separated from
|
458
|
+
* process_seqs() to realize multi-threading */
|
459
|
+
static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target[2])
|
460
|
+
{
|
461
|
+
int x;
|
462
|
+
bsw2opt_t opt = *_opt;
|
463
|
+
bsw2global_t *pool = bsw2_global_init();
|
464
|
+
for (x = 0; x < _seq->n; ++x) {
|
465
|
+
bsw2seq1_t *p = _seq->seq + x;
|
466
|
+
uint8_t *seq[2], *rseq[2];
|
467
|
+
int i, l, k;
|
468
|
+
bwtsw2_t *b[2];
|
469
|
+
l = p->l;
|
470
|
+
|
471
|
+
#ifdef HAVE_PTHREAD
|
472
|
+
if (_opt->n_threads > 1) {
|
473
|
+
pthread_mutex_lock(&g_dbwtsw_lock);
|
474
|
+
if (p->tid < 0) p->tid = tid;
|
475
|
+
else if (p->tid != tid) {
|
476
|
+
pthread_mutex_unlock(&g_dbwtsw_lock);
|
477
|
+
continue;
|
478
|
+
} // in pinciple else should not happen
|
479
|
+
pthread_mutex_unlock(&g_dbwtsw_lock);
|
480
|
+
}
|
481
|
+
#endif
|
482
|
+
|
483
|
+
// set opt->t
|
484
|
+
opt.t = _opt->t;
|
485
|
+
if (opt.t < log(l) * opt.coef) opt.t = (int)(log(l) * opt.coef + .499);
|
486
|
+
if (pool->max_l < l) { // then enlarge working space for aln_extend_core()
|
487
|
+
int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l;
|
488
|
+
pool->max_l = l;
|
489
|
+
pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24);
|
490
|
+
}
|
491
|
+
// set opt->bw
|
492
|
+
opt.bw = _opt->bw;
|
493
|
+
k = (l * opt.a - 2 * opt.q) / (2 * opt.r + opt.a);
|
494
|
+
i = (l * opt.a - opt.a - opt.t) / opt.r;
|
495
|
+
if (k > i) k = i;
|
496
|
+
if (k < 1) k = 1; // I do not know if k==0 causes troubles
|
497
|
+
opt.bw = _opt->bw < k? _opt->bw : k;
|
498
|
+
// set seq[2] and rseq[2]
|
499
|
+
seq[0] = calloc(l * 4, 1);
|
500
|
+
seq[1] = seq[0] + l;
|
501
|
+
rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l;
|
502
|
+
// convert sequences to 2-bit representation
|
503
|
+
for (i = k = 0; i < l; ++i) {
|
504
|
+
int c = nst_nt4_table[(int)p->seq[i]];
|
505
|
+
if (c >= 4) { c = (int)(drand48() * 4); ++k; } // FIXME: ambiguous bases are not properly handled
|
506
|
+
seq[0][i] = c;
|
507
|
+
seq[1][l-1-i] = 3 - c;
|
508
|
+
rseq[0][l-1-i] = c;
|
509
|
+
rseq[1][i] = 3 - c;
|
510
|
+
}
|
511
|
+
if (l - k < opt.t) { // too few unambiguous bases
|
512
|
+
print_hits(bns, &opt, p, 0);
|
513
|
+
free(seq[0]); continue;
|
514
|
+
}
|
515
|
+
// alignment
|
516
|
+
b[0] = bsw2_aln1_core(&opt, bns, pac, target[0], l, seq, 0, pool);
|
517
|
+
for (k = 0; k < b[0]->n; ++k)
|
518
|
+
if (b[0]->hits[k].n_seeds < opt.t_seeds) break;
|
519
|
+
if (k < b[0]->n) {
|
520
|
+
b[1] = bsw2_aln1_core(&opt, bns, pac, target[1], l, rseq, 1, pool);
|
521
|
+
for (i = 0; i < b[1]->n; ++i) {
|
522
|
+
bsw2hit_t *p = b[1]->hits + i;
|
523
|
+
int x = p->beg;
|
524
|
+
p->beg = l - p->end;
|
525
|
+
p->end = l - x;
|
526
|
+
if (p->l == 0) p->k = bns->l_pac - (p->k + p->len);
|
527
|
+
}
|
528
|
+
flag_fr(b);
|
529
|
+
merge_hits(b, l, 0);
|
530
|
+
bsw2_resolve_duphits(0, b[0], 0);
|
531
|
+
bsw2_resolve_query_overlaps(b[0], opt.mask_level);
|
532
|
+
} else b[1] = 0;
|
533
|
+
// generate CIGAR and print SAM
|
534
|
+
gen_cigar(&opt, l, seq, pac, b[0]);
|
535
|
+
print_hits(bns, &opt, p, b[0]);
|
536
|
+
// free
|
537
|
+
free(seq[0]);
|
538
|
+
bsw2_destroy(b[0]);
|
539
|
+
}
|
540
|
+
bsw2_global_destroy(pool);
|
541
|
+
}
|
542
|
+
|
543
|
+
#ifdef HAVE_PTHREAD
/* Arguments of bsw2_aln_core() bundled for one thread. */
typedef struct {
	int tid;
	bsw2seq_t *_seq;
	const bsw2opt_t *_opt;
	const bntseq_t *bns;
	uint8_t *pac;
	bwt_t *target[2];
} thread_aux_t;

/* pthread_create() entry point: unpack the bundle and run bsw2_aln_core(). */
static void *worker(void *data)
{
	thread_aux_t *job = data;
	bsw2_aln_core(job->tid, job->_seq, job->_opt, job->bns, job->pac, job->target);
	return 0;
}
#endif
|
561
|
+
|
562
|
+
/* process sequences stored in _seq, generate SAM lines for these
|
563
|
+
* sequences and reset _seq afterwards. */
|
564
|
+
static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target[2])
|
565
|
+
{
|
566
|
+
int i;
|
567
|
+
|
568
|
+
#ifdef HAVE_PTHREAD
|
569
|
+
if (opt->n_threads <= 1) {
|
570
|
+
bsw2_aln_core(0, _seq, opt, bns, pac, target);
|
571
|
+
} else {
|
572
|
+
pthread_t *tid;
|
573
|
+
pthread_attr_t attr;
|
574
|
+
thread_aux_t *data;
|
575
|
+
int j;
|
576
|
+
pthread_attr_init(&attr);
|
577
|
+
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
|
578
|
+
data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
|
579
|
+
tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
|
580
|
+
for (j = 0; j < opt->n_threads; ++j) {
|
581
|
+
thread_aux_t *p = data + j;
|
582
|
+
p->tid = j; p->_seq = _seq; p->_opt = opt; p->bns = bns;
|
583
|
+
p->pac = pac; p->target[0] = target[0]; p->target[1] = target[1];
|
584
|
+
pthread_create(&tid[j], &attr, worker, p);
|
585
|
+
}
|
586
|
+
for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
|
587
|
+
free(data); free(tid);
|
588
|
+
}
|
589
|
+
#else
|
590
|
+
bsw2_aln_core(0, _seq, opt, bns, pac, target);
|
591
|
+
#endif
|
592
|
+
|
593
|
+
// print and reset
|
594
|
+
for (i = 0; i < _seq->n; ++i) {
|
595
|
+
bsw2seq1_t *p = _seq->seq + i;
|
596
|
+
if (p->sam) printf("%s", p->sam);
|
597
|
+
free(p->name); free(p->seq); free(p->qual); free(p->sam);
|
598
|
+
p->tid = -1; p->l = 0;
|
599
|
+
p->name = p->seq = p->qual = p->sam = 0;
|
600
|
+
}
|
601
|
+
fflush(stdout);
|
602
|
+
_seq->n = 0;
|
603
|
+
}
|
604
|
+
|
605
|
+
/* Align every read of file fn and print the result as SAM on stdout.
 *
 * opt     options; chunk_size is the number of bases batched per round
 * bns     reference annotations; the packed reference is read from bns->fp_pac
 * target  the two indexes handed to the aligner
 * fn      name of the read file, opened with xzopen()
 *
 * One @SQ line is printed per reference sequence, then reads are collected
 * and aligned batch by batch. Progress and errors go to stderr; the function
 * returns early when the reference or the batch cannot be allocated. */
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target[2], const char *fn)
{
	gzFile fp;
	kseq_t *ks;
	int l, size = 0;
	size_t pac_size = bns->l_pac/4+1;
	uint8_t *pac;
	bsw2seq_t *_seq;

	pac = calloc(pac_size, 1);
	if (pac == 0) {
		fprintf(stderr, "[bsw2_aln] insufficient memory!\n");
		return;
	}
	_seq = calloc(1, sizeof(bsw2seq_t));
	if (_seq == 0) {
		fprintf(stderr, "[bsw2_aln] insufficient memory!\n");
		free(pac);
		return;
	}
	for (l = 0; l < bns->n_seqs; ++l)
		printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len);
	// the buffer is zero-filled, so a short read leaves defined (but wrong) bases
	if (fread(pac, 1, pac_size, bns->fp_pac) != pac_size)
		fprintf(stderr, "[bsw2_aln] warning: the packed reference is shorter than expected\n");
	fp = xzopen(fn, "r");
	ks = kseq_init(fp);
	while ((l = kseq_read(ks)) >= 0) {
		bsw2seq1_t *p;
		if (_seq->n == _seq->max) { // grow the batch: 1024 entries first, then doubling
			int new_max = _seq->max? _seq->max<<1 : 1024;
			bsw2seq1_t *grown = realloc(_seq->seq, new_max * sizeof(bsw2seq1_t));
			if (grown == 0) { // keep what was collected and stop reading
				fprintf(stderr, "[bsw2_aln] insufficient memory!\n");
				break;
			}
			_seq->seq = grown;
			_seq->max = new_max;
		}
		p = &_seq->seq[_seq->n++];
		p->tid = -1;
		p->l = l;
		p->name = strdup(ks->name.s);
		p->seq = strdup(ks->seq.s);
		p->qual = ks->qual.l? strdup(ks->qual.s) : 0;
		p->sam = 0;
		size += l;
		if (size > opt->chunk_size) {
			fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size);
			process_seqs(_seq, opt, bns, pac, target);
			size = 0;
		}
	}
	// align whatever is left in the last, partial batch
	fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size);
	process_seqs(_seq, opt, bns, pac, target);
	free(_seq->seq); free(_seq);
	kseq_destroy(ks);
	gzclose(fp);
	free(pac);
}
|