bio-bwa 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwase.c
ADDED
@@ -0,0 +1,686 @@
|
|
1
|
+
#include <unistd.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include <math.h>
|
6
|
+
#include <time.h>
|
7
|
+
#include "stdaln.h"
|
8
|
+
#include "bwase.h"
|
9
|
+
#include "bwtaln.h"
|
10
|
+
#include "bntseq.h"
|
11
|
+
#include "utils.h"
|
12
|
+
#include "kstring.h"
|
13
|
+
|
14
|
+
/* Table of (int)(4.343 * log(i) + 0.5) for i in 1..255. It is filled in by
 * bwase_initialize() and read by bwa_approx_mapQ(); entry 0 is never
 * written and keeps its static zero value. */
int g_log_n[256];

/* Read-group state installed by bwa_set_rg(): the escaped "@RG" header
 * line and a separately allocated copy of its ID field. */
char *bwa_rg_line;
char *bwa_rg_id;

/* Not defined in this file; called while emitting the SAM header. */
void bwa_print_sam_PG();
|
18
|
+
|
19
|
+
/*
 * Fold the n_aln suffix-array intervals in aln[] into the summary fields
 * of read s.
 *
 *  - n_aln == 0: s is marked BWA_TYPE_NO_MATCH with c1 = c2 = 0.
 *  - set_main != 0: one SA position is drawn at random from the leading
 *    intervals whose score does not exceed aln[0].score (each interval
 *    replaces the current pick with probability width/(width+seen)).
 *    c1 receives the number of positions in those intervals, c2 the
 *    number of positions in the remaining ones, and the type becomes
 *    REPEAT when c1 > 1, UNIQUE otherwise.
 *  - n_multi != 0: the previous s->multi is released; if the intervals
 *    hold more than n_multi + 1 positions nothing is recorded, otherwise
 *    every position other than s->sa is stored (capped at n_multi).
 *
 * The sequence of drand48() calls is part of the observable behaviour.
 */
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
{
	int idx, seen;

	if (n_aln == 0) { // nothing was aligned
		s->type = BWA_TYPE_NO_MATCH;
		s->c1 = s->c2 = 0;
		return;
	}

	if (set_main) {
		const int top = aln[0].score;

		// intervals tied with the first score: pick the primary hit
		idx = 0;
		seen = 0;
		while (idx < n_aln) {
			const bwt_aln1_t *cur = &aln[idx];
			if (cur->score > top) break;
			if (drand48() * (cur->l - cur->k + 1 + seen) > (double)seen) {
				s->n_mm = cur->n_mm;
				s->n_gapo = cur->n_gapo;
				s->n_gape = cur->n_gape;
				s->strand = cur->a;
				s->score = cur->score;
				s->sa = cur->k + (bwtint_t)((cur->l - cur->k + 1) * drand48());
			}
			seen += cur->l - cur->k + 1;
			++idx;
		}
		s->c1 = seen;

		// everything after the tie group only contributes to c2
		while (idx < n_aln) {
			seen += aln[idx].l - aln[idx].k + 1;
			++idx;
		}
		s->c2 = seen - s->c1;
		s->type = (s->c1 > 1)? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
	}

	if (n_multi) {
		int a, quota, n_hits = 0, n_out = 0;

		for (a = 0; a < n_aln; ++a)
			n_hits += aln[a].l - aln[a].k + 1;

		free(s->multi);
		if (n_hits > n_multi + 1) { // too many hits: report none of them
			s->multi = 0;
			s->n_multi = 0;
			return;
		}

		/* More general than strictly needed: because of the test above
		 * all hits fit, yet the code below can still subsample "quota"
		 * of them. One extra slot accounts for the hit equal to ->sa. */
		quota = n_hits > n_multi + 1? n_multi + 1 : n_hits;
		s->multi = calloc(quota, sizeof(bwt_multi1_t));
		for (a = 0; a < n_aln; ++a) {
			const bwt_aln1_t *iv = &aln[a];
			if (iv->l - iv->k + 1 <= quota) { // whole interval fits
				bwtint_t sa_pos;
				for (sa_pos = iv->k; sa_pos <= iv->l; ++sa_pos) {
					bwt_multi1_t *dst = &s->multi[n_out++];
					dst->pos = sa_pos;
					dst->gap = iv->n_gapo + iv->n_gape;
					dst->mm = iv->n_mm;
					dst->strand = iv->a;
				}
				quota -= iv->l - iv->k + 1;
			} else {
				// Random sampling (http://code.activestate.com/recipes/272884/).
				// Not reachable in practice given the early return above.
				int need, pool;
				for (need = quota, pool = iv->l - iv->k + 1; need > 0; --need) {
					double keep = 1.0, r = drand48();
					bwt_multi1_t *dst;
					while (r < keep) keep -= keep * need / (pool--);
					dst = &s->multi[n_out++];
					dst->pos = iv->l - pool;
					dst->gap = iv->n_gapo + iv->n_gape;
					dst->mm = iv->n_mm;
					dst->strand = iv->a;
				}
				quota = 0;
				break;
			}
		}

		// drop the entry that duplicates the primary hit, then cap the list
		s->n_multi = n_out;
		for (a = n_out = 0; a < s->n_multi; ++a) {
			if (s->multi[a].pos != s->sa)
				s->multi[n_out++] = s->multi[a];
		}
		s->n_multi = n_out < n_multi? n_out : n_multi;
	}
}
|
95
|
+
|
96
|
+
/* Shorthand for bwa_aln2seq_core() that fills in the primary-hit summary
 * of s (set_main = 1) and collects no alternative hits (n_multi = 0). */
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
{
	const int set_main = 1, n_multi = 0;
	bwa_aln2seq_core(n_aln, aln, s, set_main, n_multi);
}
|
100
|
+
|
101
|
+
int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
|
102
|
+
{
|
103
|
+
int n;
|
104
|
+
if (p->c1 == 0) return 23;
|
105
|
+
if (p->c1 > 1) return 0;
|
106
|
+
if (p->n_mm == mm) return 25;
|
107
|
+
if (p->c2 == 0) return 37;
|
108
|
+
n = (p->c2 >= 255)? 255 : p->c2;
|
109
|
+
return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
|
110
|
+
}
|
111
|
+
|
112
|
+
/**
|
113
|
+
* Derive the actual position in the read from the given suffix array
|
114
|
+
* coordinates. Note that the position will be approximate based on
|
115
|
+
* whether indels appear in the read and whether calculations are
|
116
|
+
* performed from the start or end of the read.
|
117
|
+
*/
|
118
|
+
void bwa_cal_pac_pos_core(const bwt_t *forward_bwt, const bwt_t *reverse_bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
|
119
|
+
{
|
120
|
+
int max_diff;
|
121
|
+
if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return;
|
122
|
+
max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
|
123
|
+
if (seq->strand) { // reverse strand only
|
124
|
+
seq->pos = bwt_sa(forward_bwt, seq->sa);
|
125
|
+
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
126
|
+
} else { // forward strand only
|
127
|
+
/* NB: For gapped alignment, p->pos may not be correct, which
|
128
|
+
* will be fixed in refine_gapped_core(). This line also
|
129
|
+
* determines the way "x" is calculated in
|
130
|
+
* refine_gapped_core() when (ext < 0 && is_end == 0). */
|
131
|
+
seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len);
|
132
|
+
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
133
|
+
}
|
134
|
+
}
|
135
|
+
|
136
|
+
/*
 * Convert suffix-array coordinates to packed-reference positions for the
 * primary hit and every alternative hit of seqs[0..n_seqs).
 *
 * The forward index ("<prefix>.bwt" / "<prefix>.sa") is loaded first and
 * used for reverse-strand hits, then destroyed; the reverse index
 * ("<prefix>.rbwt" / "<prefix>.rsa") is loaded afterwards and used for
 * forward-strand hits. max_mm and fnr are passed on to
 * bwa_cal_pac_pos_core().
 *
 * The file-name buffer is sized from the prefix, so a prefix of any length
 * is safe (a fixed 1024-byte stack buffer used to overflow). The process
 * exits if that small buffer cannot be allocated.
 */
void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
{
	int i, j;
	const size_t cap = strlen(prefix) + sizeof(".rbwt"); // longest suffix, NUL included
	char *str = calloc(cap, 1);
	bwt_t *bwt;

	if (str == 0) {
		fprintf(stderr, "[%s] out of memory\n", __func__);
		exit(EXIT_FAILURE);
	}

	// load forward BWT and SA
	snprintf(str, cap, "%s.bwt", prefix); bwt = bwt_restore_bwt(str);
	snprintf(str, cap, "%s.sa", prefix); bwt_restore_sa(str, bwt);
	for (i = 0; i != n_seqs; ++i) {
		if (seqs[i].strand) bwa_cal_pac_pos_core(bwt, 0, &seqs[i], max_mm, fnr);
		for (j = 0; j < seqs[i].n_multi; ++j) {
			bwt_multi1_t *p = seqs[i].multi + j;
			if (p->strand) p->pos = bwt_sa(bwt, p->pos);
		}
	}
	bwt_destroy(bwt);

	// load reverse BWT and SA
	snprintf(str, cap, "%s.rbwt", prefix); bwt = bwt_restore_bwt(str);
	snprintf(str, cap, "%s.rsa", prefix); bwt_restore_sa(str, bwt);
	for (i = 0; i != n_seqs; ++i) {
		if (!seqs[i].strand) bwa_cal_pac_pos_core(0, bwt, &seqs[i], max_mm, fnr);
		for (j = 0; j < seqs[i].n_multi; ++j) {
			bwt_multi1_t *p = seqs[i].multi + j;
			if (!p->strand) p->pos = bwt->seq_len - (bwt_sa(bwt, p->pos) + seqs[i].len);
		}
	}
	bwt_destroy(bwt);

	free(str);
}
|
164
|
+
|
165
|
+
/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on
|
166
|
+
* forward strand. This happens when p->pos is calculated by
|
167
|
+
* bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct
|
168
|
+
* coordinate. This happens only for color-converted alignment. */
|
169
|
+
static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos,
|
170
|
+
int ext, int *n_cigar, int is_end_correct)
|
171
|
+
{
|
172
|
+
bwa_cigar_t *cigar = 0;
|
173
|
+
ubyte_t *ref_seq;
|
174
|
+
int l = 0, path_len, ref_len;
|
175
|
+
AlnParam ap = aln_param_bwa;
|
176
|
+
path_t *path;
|
177
|
+
int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos;
|
178
|
+
|
179
|
+
ref_len = len + abs(ext);
|
180
|
+
if (ext > 0) {
|
181
|
+
ref_seq = (ubyte_t*)calloc(ref_len, 1);
|
182
|
+
for (k = __pos; k < __pos + ref_len && k < l_pac; ++k)
|
183
|
+
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
|
184
|
+
} else {
|
185
|
+
int64_t x = __pos + (is_end_correct? len : ref_len);
|
186
|
+
ref_seq = (ubyte_t*)calloc(ref_len, 1);
|
187
|
+
for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k)
|
188
|
+
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
|
189
|
+
}
|
190
|
+
path = (path_t*)calloc(l+len, sizeof(path_t));
|
191
|
+
|
192
|
+
aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
|
193
|
+
cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
|
194
|
+
|
195
|
+
if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand
|
196
|
+
for (l = k = 0; k < *n_cigar; ++k) {
|
197
|
+
if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]);
|
198
|
+
else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]);
|
199
|
+
}
|
200
|
+
__pos += l;
|
201
|
+
}
|
202
|
+
|
203
|
+
if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end
|
204
|
+
__pos += __cigar_len(cigar[0]);
|
205
|
+
for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1];
|
206
|
+
--(*n_cigar);
|
207
|
+
}
|
208
|
+
if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end
|
209
|
+
|
210
|
+
// change "I" at either end of the read to S. just in case. This should rarely happen...
|
211
|
+
if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1])));
|
212
|
+
if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0])));
|
213
|
+
|
214
|
+
*_pos = (bwtint_t)__pos;
|
215
|
+
free(ref_seq); free(path);
|
216
|
+
return cigar;
|
217
|
+
}
|
218
|
+
|
219
|
+
/*
 * Build the MD string and the edit distance for one alignment.
 *
 *   n_cigar, cigar  CIGAR of the alignment; cigar == NULL means an
 *                   ungapped alignment of `len` bases
 *   pos             start of the alignment on the packed reference
 *   seq             read bases (values 0..3; anything larger counts as a
 *                   mismatch)
 *   l_pac, pacseq   length and 2-bit packed bases of the reference
 *   str             scratch kstring, reset here and reused across calls
 *   _nm             receives mismatches + inserted + deleted bases
 *
 * Returns a strdup()ed copy of the MD string; the caller owns it.
 *
 * Every reference access is bounded by l_pac. The ungapped branch used to
 * lack that bound and could read past the end of pacseq for an alignment
 * overhanging the reference end.
 */
char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
				  bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
{
	bwtint_t x, y; // x: reference coordinate, y: offset in the read
	int z, u, c, nm = 0; // u: length of the current run of matches
	str->l = 0; // reset
	x = pos; y = 0;
	if (cigar) {
		int k, l;
		for (k = u = 0; k < n_cigar; ++k) {
			l = __cigar_len(cigar[k]);
			if (__cigar_op(cigar[k]) == FROM_M) {
				for (z = 0; z < l && x+z < l_pac; ++z) {
					c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
					if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
						// mismatch: flush the match run, then the reference base
						ksprintf(str, "%d", u);
						kputc("ACGTN"[c], str);
						++nm;
						u = 0;
					} else ++u;
				}
				x += l; y += l;
			} else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
				// insertions and soft clips consume read bases only
				y += l;
				if (__cigar_op(cigar[k]) == FROM_I) nm += l;
			} else if (__cigar_op(cigar[k]) == FROM_D) {
				// deletion: "^" followed by the deleted reference bases
				ksprintf(str, "%d", u);
				kputc('^', str);
				for (z = 0; z < l && x+z < l_pac; ++z)
					kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str);
				u = 0;
				x += l; nm += l;
			}
		}
	} else { // no gaps
		for (z = u = 0; z < (bwtint_t)len && x+z < l_pac; ++z) {
			c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
			if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
				ksprintf(str, "%d", u);
				kputc("ACGTN"[c], str);
				++nm;
				u = 0;
			} else ++u;
		}
	}
	ksprintf(str, "%d", u); // trailing match run (possibly 0)
	*_nm = nm;
	return strdup(str->s);
}
|
269
|
+
|
270
|
+
/*
 * Account for bases trimmed off a read (len < full_len) by soft-clipping
 * them in the CIGAR, then restore s->len to s->full_len.
 *
 * The clip goes at the end of the CIGAR for forward-strand reads and at
 * the front for reverse-strand reads. An existing S operation at that end
 * is lengthened; otherwise a new one is added. A read without a CIGAR gets
 * a two-operation CIGAR consisting of a len-long match and the clip.
 */
void bwa_correct_trimmed(bwa_seq_t *s)
{
	if (s->len == s->full_len) return; // nothing was trimmed

	if (s->strand == 0) { // forward: clip is appended
		if (s->cigar && __cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) {
			// the last is S
			s->cigar[s->n_cigar-1] += s->full_len - s->len;
		} else if (s->cigar == 0) {
			s->n_cigar = 2;
			s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
			s->cigar[0] = __cigar_create(0, s->len);
			s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len));
		} else {
			++s->n_cigar;
			s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
			s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len));
		}
	} else { // reverse: clip is prepended
		if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) {
			// the first is S
			s->cigar[0] += s->full_len - s->len;
		} else if (s->cigar == 0) {
			s->n_cigar = 2;
			s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
			s->cigar[1] = __cigar_create(0, s->len);
			s->cigar[0] = __cigar_create(3, (s->full_len - s->len));
		} else {
			++s->n_cigar;
			s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
			// shift the existing operations right to free slot 0
			memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t));
			s->cigar[0] = __cigar_create(3, (s->full_len - s->len));
		}
	}

	s->len = s->full_len;
}
|
305
|
+
|
306
|
+
/*
 * Post-process a batch of reads after coordinate conversion:
 *   1. reverse s->seq in place and compute CIGARs for gapped primary and
 *      alternative hits against the packed reference;
 *   2. when ntbns is given (color space), convert each read with
 *      bwa_cs2nt_core() and recompute those CIGARs against the nucleotide
 *      reference read from ntbns->fp_pac;
 *   3. generate the MD string and edit distance for every matched read;
 *   4. outside color space, soft-clip trimmed bases via
 *      bwa_correct_trimmed().
 *
 * If _pacseq is NULL the packed reference is read from bns->fp_pac and
 * released before returning; otherwise the caller's buffer is used as is.
 */
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
{
	ubyte_t *pacseq, *ntpac = 0;
	int i, j;
	kstring_t *str;

	if (ntbns) { // in color space
		ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
		rewind(ntbns->fp_pac);
		fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
	}

	if (_pacseq) {
		pacseq = _pacseq;
	} else {
		pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
		rewind(bns->fp_pac);
		fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
	}

	// pass 1: CIGARs in the coordinate space the reads were aligned in
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *s = &seqs[i];

		seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!

		for (j = 0; j < s->n_multi; ++j) {
			bwt_multi1_t *q = &s->multi[j];
			int n_cigar;
			if (q->gap == 0) continue;
			q->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos,
										  (q->strand? 1 : -1) * q->gap, &n_cigar, 1);
			q->n_cigar = n_cigar;
		}

		if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0)
			continue;
		s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
									  (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
	}

	// pass 2 (color space only): convert to nucleotides and redo the CIGARs
	if (ntbns) {
		for (i = 0; i < n_seqs; ++i) {
			bwa_seq_t *s = &seqs[i];

			bwa_cs2nt_core(s, bns->l_pac, ntpac);

			for (j = 0; j < s->n_multi; ++j) {
				bwt_multi1_t *q = &s->multi[j];
				int n_cigar;
				if (q->gap == 0) continue;
				free(q->cigar);
				q->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
											  (q->strand? 1 : -1) * q->gap, &n_cigar, 0);
				q->n_cigar = n_cigar;
			}

			if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
				free(s->cigar);
				s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
											  (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
			}
		}
	}

	// generate MD tag; one scratch string is shared by all reads
	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *s = &seqs[i];
		int nm;
		if (s->type == BWA_TYPE_NO_MATCH) continue;
		s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
							bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
		s->nm = nm;
	}
	free(str->s);
	free(str);

	// correct for trimmed reads
	if (!ntbns) { // trimming is only enabled for Illumina reads
		for (i = 0; i < n_seqs; ++i)
			bwa_correct_trimmed(&seqs[i]);
	}

	if (!_pacseq) free(pacseq);
	free(ntpac);
}
|
380
|
+
|
381
|
+
int64_t pos_end(const bwa_seq_t *p)
|
382
|
+
{
|
383
|
+
if (p->cigar) {
|
384
|
+
int j;
|
385
|
+
int64_t x = p->pos;
|
386
|
+
for (j = 0; j != p->n_cigar; ++j) {
|
387
|
+
int op = __cigar_op(p->cigar[j]);
|
388
|
+
if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
|
389
|
+
}
|
390
|
+
return x;
|
391
|
+
} else return p->pos + p->len;
|
392
|
+
}
|
393
|
+
|
394
|
+
int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end()
|
395
|
+
{
|
396
|
+
if (p->cigar) {
|
397
|
+
int j;
|
398
|
+
int64_t x = p->pos;
|
399
|
+
for (j = 0; j != p->n_cigar; ++j) {
|
400
|
+
int op = __cigar_op(p->cigar[j]);
|
401
|
+
if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
|
402
|
+
}
|
403
|
+
return x;
|
404
|
+
} else return p->pos + len;
|
405
|
+
}
|
406
|
+
|
407
|
+
static int64_t pos_5(const bwa_seq_t *p)
|
408
|
+
{
|
409
|
+
if (p->type != BWA_TYPE_NO_MATCH)
|
410
|
+
return p->strand? pos_end(p) : p->pos;
|
411
|
+
return -1;
|
412
|
+
}
|
413
|
+
|
414
|
+
/*
 * Write one SAM record for read p to stdout.
 *
 *   bns       reference annotations used to translate packed coordinates
 *   p         the read; when it is unmapped but its mate is mapped, p->pos
 *             and p->strand are overwritten with the mate's values
 *   mate      the paired read, or NULL for single-end output
 *   mode      BWA_MODE_COMPREAD selects the "NM" tag name, otherwise "CM"
 *   max_top2  X1 is printed only when p->c1 <= max_top2
 *
 * Side effect: when p->strand is set and p->qual is present, the quality
 * string is reversed in place before being printed.
 */
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
{
	int j;
	int seqid, nn, am = 0, flag;
	char XT;

	/* ---- neither the read nor its mate is placed: minimal record ---- */
	if (p->type == BWA_TYPE_NO_MATCH && !(mate && mate->type != BWA_TYPE_NO_MATCH)) {
		ubyte_t *s = p->strand? p->rseq : p->seq;
		flag = p->extra_flag | SAM_FSU;
		if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
		printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
		for (j = 0; j != p->len; ++j)
			putchar("ACGTN"[(int)s[j]]);
		putchar('\t');
		if (p->qual) {
			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
			printf("%s", p->qual);
		} else printf("*");
		if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
		if (p->bc[0]) printf("\tBC:Z:%s", p->bc);
		if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
		putchar('\n');
		return;
	}

	/* ---- the read or its mate has a position ---- */
	flag = p->extra_flag;
	if (p->type == BWA_TYPE_NO_MATCH) {
		// unmapped read with a mapped mate: borrow the mate's coordinate
		p->pos = mate->pos;
		p->strand = mate->strand;
		flag |= SAM_FSU;
		j = 1;
	} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment

	// get seqid
	nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
	if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
		flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences

	// update flag and print it
	if (p->strand) flag |= SAM_FSR;
	if (mate) {
		if (mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
		else if (mate->strand) flag |= SAM_FMR;
	}
	printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
	printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);

	// print CIGAR
	if (p->cigar) {
		for (j = 0; j != p->n_cigar; ++j)
			printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]);
	} else if (p->type == BWA_TYPE_NO_MATCH) {
		printf("*");
	} else {
		printf("%dM", p->len);
	}

	// print mate coordinate
	if (mate == 0) {
		printf("\t*\t0\t0\t");
	} else if (mate->type == BWA_TYPE_NO_MATCH) {
		printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
	} else {
		int m_seqid;
		long long isize = 0;
		am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
		// redundant calculation here, but should not matter too much
		(void)bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid);
		printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
		if (seqid == m_seqid && p->type != BWA_TYPE_NO_MATCH)
			isize = pos_5(mate) - pos_5(p);
		printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
	}

	// print sequence and quality
	if (p->strand == 0) {
		for (j = 0; j != p->full_len; ++j)
			putchar("ACGTN"[(int)p->seq[j]]);
	} else {
		for (j = 0; j != p->full_len; ++j)
			putchar("TGCAN"[p->seq[p->full_len - 1 - j]]);
	}
	putchar('\t');
	if (p->qual) {
		if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
		printf("%s", p->qual);
	} else printf("*");

	if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
	if (p->bc[0]) printf("\tBC:Z:%s", p->bc);
	if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);

	if (p->type != BWA_TYPE_NO_MATCH) {
		int i;

		// calculate XT tag
		XT = (nn > 10)? 'N' : "NURM"[p->type];

		// print tags
		printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
		if (nn) printf("\tXN:i:%d", nn);
		if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
		if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
			printf("\tX0:i:%d", p->c1);
			if (p->c1 <= max_top2) printf("\tX1:i:%d", p->c2);
		}
		printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
		if (p->md) printf("\tMD:Z:%s", p->md);

		// print multiple hits
		if (p->n_multi) {
			printf("\tXA:Z:");
			for (i = 0; i < p->n_multi; ++i) {
				bwt_multi1_t *q = &p->multi[i];
				int k;
				j = pos_end_multi(q, p->len) - q->pos;
				nn = bns_coor_pac2real(bns, q->pos, j, &seqid);
				printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
					   (int)(q->pos - bns->anns[seqid].offset + 1));
				if (q->cigar) {
					for (k = 0; k < q->n_cigar; ++k)
						printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]);
				} else printf("%dM", p->len);
				printf(",%d;", q->gap + q->mm);
			}
		}
	}
	putchar('\n');
}
|
528
|
+
|
529
|
+
/*
 * Restore the annotation set stored under "<prefix>.nt" and return
 * whatever bns_restore() yields for that name. The temporary file-name
 * buffer is released before returning.
 */
bntseq_t *bwa_open_nt(const char *prefix)
{
	const size_t n = strlen(prefix);
	char *fn = (char*)calloc(n + 10, 1);
	bntseq_t *ntbns;

	strcpy(fn, prefix);
	strcpy(fn + n, ".nt");
	ntbns = bns_restore(fn);
	free(fn);
	return ntbns;
}
|
539
|
+
|
540
|
+
void bwa_print_sam_SQ(const bntseq_t *bns)
|
541
|
+
{
|
542
|
+
int i;
|
543
|
+
for (i = 0; i < bns->n_seqs; ++i)
|
544
|
+
printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
|
545
|
+
if (bwa_rg_line) printf("%s\n", bwa_rg_line);
|
546
|
+
}
|
547
|
+
|
548
|
+
void bwase_initialize()
|
549
|
+
{
|
550
|
+
int i;
|
551
|
+
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
|
552
|
+
}
|
553
|
+
|
554
|
+
/*
 * Decode backslash escapes in s, in place, and return s.
 *
 * "\t", "\n", "\r" and "\\" become TAB, LF, CR and a single backslash.
 * Any other escaped character is dropped together with its backslash
 * (unchanged legacy behaviour). A lone backslash at the very end of the
 * string is dropped as well; previously the scan stepped over the
 * terminating NUL in that case and kept reading and writing past the end
 * of the buffer.
 *
 * The result is never longer than the input, so no extra space is needed.
 */
char *bwa_escape(char *s)
{
	char *p, *q; // p: read cursor, q: write cursor (q <= p at all times)
	for (p = q = s; *p; ++p) {
		if (*p != '\\') {
			*q++ = *p;
			continue;
		}
		++p;
		if (*p == '\0') break; // trailing backslash: stop on the terminator
		switch (*p) {
		case 't': *q++ = '\t'; break;
		case 'n': *q++ = '\n'; break;
		case 'r': *q++ = '\r'; break;
		case '\\': *q++ = '\\'; break;
		default: break; // unknown escape: emit nothing
		}
	}
	*q = '\0';
	return s;
}
|
569
|
+
|
570
|
+
int bwa_set_rg(const char *s)
|
571
|
+
{
|
572
|
+
char *p, *q, *r;
|
573
|
+
if (strstr(s, "@RG") != s) return -1;
|
574
|
+
if (bwa_rg_line) free(bwa_rg_line);
|
575
|
+
if (bwa_rg_id) free(bwa_rg_id);
|
576
|
+
bwa_rg_line = strdup(s);
|
577
|
+
bwa_rg_id = 0;
|
578
|
+
bwa_escape(bwa_rg_line);
|
579
|
+
p = strstr(bwa_rg_line, "\tID:");
|
580
|
+
if (p == 0) return -1;
|
581
|
+
p += 4;
|
582
|
+
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
|
583
|
+
bwa_rg_id = calloc(q - p + 1, 1);
|
584
|
+
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
585
|
+
*r++ = *q;
|
586
|
+
return 0;
|
587
|
+
}
|
588
|
+
|
589
|
+
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
|
590
|
+
{
|
591
|
+
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
592
|
+
int i, n_seqs, tot_seqs = 0, m_aln;
|
593
|
+
bwt_aln1_t *aln = 0;
|
594
|
+
bwa_seq_t *seqs;
|
595
|
+
bwa_seqio_t *ks;
|
596
|
+
clock_t t;
|
597
|
+
bntseq_t *bns, *ntbns = 0;
|
598
|
+
FILE *fp_sa;
|
599
|
+
gap_opt_t opt;
|
600
|
+
|
601
|
+
// initialization
|
602
|
+
bwase_initialize();
|
603
|
+
bns = bns_restore(prefix);
|
604
|
+
srand48(bns->seed);
|
605
|
+
fp_sa = xopen(fn_sa, "r");
|
606
|
+
|
607
|
+
m_aln = 0;
|
608
|
+
fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
|
609
|
+
if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
|
610
|
+
ntbns = bwa_open_nt(prefix);
|
611
|
+
bwa_print_sam_SQ(bns);
|
612
|
+
bwa_print_sam_PG();
|
613
|
+
// set ks
|
614
|
+
ks = bwa_open_reads(opt.mode, fn_fa);
|
615
|
+
// core loop
|
616
|
+
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) {
|
617
|
+
tot_seqs += n_seqs;
|
618
|
+
t = clock();
|
619
|
+
|
620
|
+
// read alignment
|
621
|
+
for (i = 0; i < n_seqs; ++i) {
|
622
|
+
bwa_seq_t *p = seqs + i;
|
623
|
+
int n_aln;
|
624
|
+
fread(&n_aln, 4, 1, fp_sa);
|
625
|
+
if (n_aln > m_aln) {
|
626
|
+
m_aln = n_aln;
|
627
|
+
aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
|
628
|
+
}
|
629
|
+
fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
|
630
|
+
bwa_aln2seq_core(n_aln, aln, p, 1, n_occ);
|
631
|
+
}
|
632
|
+
|
633
|
+
fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
|
634
|
+
bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
|
635
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
636
|
+
|
637
|
+
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
|
638
|
+
bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
|
639
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
640
|
+
|
641
|
+
fprintf(stderr, "[bwa_aln_core] print alignments... ");
|
642
|
+
for (i = 0; i < n_seqs; ++i)
|
643
|
+
bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
|
644
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
645
|
+
|
646
|
+
bwa_free_read_seq(n_seqs, seqs);
|
647
|
+
fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
|
648
|
+
}
|
649
|
+
|
650
|
+
// destroy
|
651
|
+
bwa_seq_close(ks);
|
652
|
+
if (ntbns) bns_destroy(ntbns);
|
653
|
+
bns_destroy(bns);
|
654
|
+
fclose(fp_sa);
|
655
|
+
free(aln);
|
656
|
+
}
|
657
|
+
|
658
|
+
int bwa_sai2sam_se(int argc, char *argv[])
|
659
|
+
{
|
660
|
+
int c, n_occ = 3;
|
661
|
+
optind = 1;
|
662
|
+
while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
|
663
|
+
switch (c) {
|
664
|
+
case 'h': break;
|
665
|
+
case 'r':
|
666
|
+
if (bwa_set_rg(optarg) < 0) {
|
667
|
+
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
|
668
|
+
return 1;
|
669
|
+
}
|
670
|
+
break;
|
671
|
+
case 'n': n_occ = atoi(optarg); break;
|
672
|
+
case 'f': xreopen(optarg, "w", stdout); break;
|
673
|
+
default: return 1;
|
674
|
+
}
|
675
|
+
}
|
676
|
+
|
677
|
+
if (optind + 3 > argc) {
|
678
|
+
fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
|
679
|
+
return 1;
|
680
|
+
}
|
681
|
+
bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2], n_occ);
|
682
|
+
free(bwa_rg_line); free(bwa_rg_id);
|
683
|
+
fflush(stdout);
|
684
|
+
xreopen("/dev/tty","w",stdout);
|
685
|
+
return 0;
|
686
|
+
}
|