bio-bwa 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwase.c
ADDED
@@ -0,0 +1,686 @@
|
|
1
|
+
#include <unistd.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include <math.h>
|
6
|
+
#include <time.h>
|
7
|
+
#include "stdaln.h"
|
8
|
+
#include "bwase.h"
|
9
|
+
#include "bwtaln.h"
|
10
|
+
#include "bntseq.h"
|
11
|
+
#include "utils.h"
|
12
|
+
#include "kstring.h"
|
13
|
+
|
14
|
+
int g_log_n[256];
|
15
|
+
char *bwa_rg_line, *bwa_rg_id;
|
16
|
+
|
17
|
+
void bwa_print_sam_PG();
|
18
|
+
|
19
|
+
/* Pick the primary hit of `s` among the leading SA intervals whose score does
 * not exceed aln[0].score, and fill s->c1 (occurrences in those intervals),
 * s->c2 (occurrences in the remaining intervals) and s->type.
 * NOTE(review): the scan stops at the first interval scoring above
 * aln[0].score, so `aln` is presumably sorted by ascending score. */
static void bwase_pick_main_hit(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
{
	const int top_score = aln[0].score;
	int idx = 0, n_hits = 0;

	while (idx < n_aln) {
		const bwt_aln1_t *cur = aln + idx;
		if (cur->score > top_score) break;
		/* Replace the current choice with probability proportional to the
		 * size of this interval relative to everything seen so far. */
		if (drand48() * (cur->l - cur->k + 1 + n_hits) > (double)n_hits) {
			s->n_mm = cur->n_mm;
			s->n_gapo = cur->n_gapo;
			s->n_gape = cur->n_gape;
			s->strand = cur->a;
			s->score = cur->score;
			/* random position inside the interval [k, l] */
			s->sa = cur->k + (bwtint_t)((cur->l - cur->k + 1) * drand48());
		}
		n_hits += cur->l - cur->k + 1;
		++idx;
	}
	s->c1 = n_hits;
	while (idx < n_aln) {
		n_hits += aln[idx].l - aln[idx].k + 1;
		++idx;
	}
	s->c2 = n_hits - s->c1;
	s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
}

/* Rebuild s->multi / s->n_multi with the alternative hits of `s`.
 * Any previous s->multi array is released. If the intervals hold more than
 * n_multi + 1 occurrences in total, no alternative hit is stored at all.
 * The entry whose position equals s->sa is removed from the list. */
static void bwase_collect_multi_hits(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int n_multi)
{
	int a, budget, total = 0, filled = 0, kept = 0;

	for (a = 0; a < n_aln; ++a)
		total += aln[a].l - aln[a].k + 1;

	if (s->multi) free(s->multi);
	if (total > n_multi + 1) { // if there are too many hits, generate none of them
		s->multi = 0;
		s->n_multi = 0;
		return;
	}

	/* The code below is more general than needed: given the check above all
	 * hits could simply be emitted, but it is able to sample `budget` random
	 * hits instead. One extra slot is reserved for the hit stored in ->sa. */
	budget = total > n_multi + 1? n_multi + 1 : total;
	s->multi = calloc(budget, sizeof(bwt_multi1_t));

	for (a = 0; a < n_aln; ++a) {
		const bwt_aln1_t *iv = aln + a;
		if (iv->l - iv->k + 1 <= budget) {
			/* the whole interval fits: copy every occurrence */
			bwtint_t sa_pos;
			for (sa_pos = iv->k; sa_pos <= iv->l; ++sa_pos) {
				s->multi[filled].pos = sa_pos;
				s->multi[filled].gap = iv->n_gapo + iv->n_gape;
				s->multi[filled].mm = iv->n_mm;
				s->multi[filled].strand = iv->a;
				++filled;
			}
			budget -= iv->l - iv->k + 1;
		} else {
			/* Random sampling (http://code.activestate.com/recipes/272884/).
			 * According to the original author this branch is never taken. */
			int picks, pool;
			for (picks = budget, pool = iv->l - iv->k + 1; picks > 0; --picks) {
				double keep_prob = 1.0, draw = drand48();
				while (draw < keep_prob) keep_prob -= keep_prob * picks / (pool--);
				s->multi[filled].pos = iv->l - pool;
				s->multi[filled].gap = iv->n_gapo + iv->n_gape;
				s->multi[filled].mm = iv->n_mm;
				s->multi[filled].strand = iv->a;
				++filled;
			}
			budget = 0;
			break;
		}
	}

	/* drop the entry that duplicates the primary hit, compacting in place */
	s->n_multi = filled;
	for (a = 0; a < s->n_multi; ++a) {
		if (s->multi[a].pos != s->sa) {
			s->multi[kept] = s->multi[a];
			++kept;
		}
	}
	s->n_multi = kept < n_multi? kept : n_multi;
}

/* Translate the SA intervals of one read into the fields of `s`.
 *
 *   n_aln, aln : the SA intervals found for the read
 *   set_main   : non-zero to choose the primary hit and fill c1/c2/type
 *   n_multi    : non-zero to (re)build the alternative-hit list, keeping at
 *                most n_multi entries
 *
 * With n_aln == 0 the read is marked BWA_TYPE_NO_MATCH, both counters are
 * cleared and nothing else is touched. Random choices use drand48(). */
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
{
	if (n_aln == 0) {
		s->type = BWA_TYPE_NO_MATCH;
		s->c1 = s->c2 = 0;
		return;
	}

	if (set_main)
		bwase_pick_main_hit(n_aln, aln, s);

	if (n_multi)
		bwase_collect_multi_hits(n_aln, aln, s, n_multi);
}
|
95
|
+
|
96
|
+
/* Convenience wrapper around bwa_aln2seq_core(): choose the primary hit of
 * `s` (set_main = 1) without building the alternative-hit list (n_multi = 0). */
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
{
	const int set_main = 1;
	const int n_multi = 0;

	bwa_aln2seq_core(n_aln, aln, s, set_main, n_multi);
}
|
100
|
+
|
101
|
+
int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
|
102
|
+
{
|
103
|
+
int n;
|
104
|
+
if (p->c1 == 0) return 23;
|
105
|
+
if (p->c1 > 1) return 0;
|
106
|
+
if (p->n_mm == mm) return 25;
|
107
|
+
if (p->c2 == 0) return 37;
|
108
|
+
n = (p->c2 >= 255)? 255 : p->c2;
|
109
|
+
return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
|
110
|
+
}
|
111
|
+
|
112
|
+
/**
|
113
|
+
* Derive the actual position in the read from the given suffix array
|
114
|
+
* coordinates. Note that the position will be approximate based on
|
115
|
+
* whether indels appear in the read and whether calculations are
|
116
|
+
* performed from the start or end of the read.
|
117
|
+
*/
|
118
|
+
void bwa_cal_pac_pos_core(const bwt_t *forward_bwt, const bwt_t *reverse_bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
|
119
|
+
{
|
120
|
+
int max_diff;
|
121
|
+
if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return;
|
122
|
+
max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
|
123
|
+
if (seq->strand) { // reverse strand only
|
124
|
+
seq->pos = bwt_sa(forward_bwt, seq->sa);
|
125
|
+
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
126
|
+
} else { // forward strand only
|
127
|
+
/* NB: For gapped alignment, p->pos may not be correct, which
|
128
|
+
* will be fixed in refine_gapped_core(). This line also
|
129
|
+
* determines the way "x" is calculated in
|
130
|
+
* refine_gapped_core() when (ext < 0 && is_end == 0). */
|
131
|
+
seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len);
|
132
|
+
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
133
|
+
}
|
134
|
+
}
|
135
|
+
|
136
|
+
/* Convert the SA coordinates of all reads (primary and alternative hits) into
 * coordinates on the packed reference.
 *
 *   prefix  : index prefix; "<prefix>.bwt"/".sa" and "<prefix>.rbwt"/".rsa"
 *             are loaded one pair at a time
 *   n_seqs  : number of entries in `seqs`
 *   seqs    : reads to update in place
 *   max_mm, fnr : forwarded to bwa_cal_pac_pos_core()
 *
 * Hits with the strand flag set are resolved against the first index pair,
 * the others against the second pair. The file-name buffer is sized from
 * strlen(prefix), so arbitrarily long prefixes are safe (the previous fixed
 * 1024-byte stack buffer overflowed on long paths). The process exits with
 * status 1 if that buffer cannot be allocated. */
void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
{
	int i, j;
	bwt_t *bwt;
	const size_t prefix_len = strlen(prefix);
	/* room for the prefix, the longest suffix (".rbwt") and the final NUL */
	char *str = (char*)malloc(prefix_len + sizeof(".rbwt"));

	if (str == NULL) {
		fprintf(stderr, "[%s] out of memory\n", __func__);
		exit(1);
	}
	memcpy(str, prefix, prefix_len);

	// load forward BWT and SA
	strcpy(str + prefix_len, ".bwt"); bwt = bwt_restore_bwt(str);
	strcpy(str + prefix_len, ".sa"); bwt_restore_sa(str, bwt);
	for (i = 0; i != n_seqs; ++i) {
		if (seqs[i].strand) bwa_cal_pac_pos_core(bwt, 0, &seqs[i], max_mm, fnr);
		for (j = 0; j < seqs[i].n_multi; ++j) {
			bwt_multi1_t *p = seqs[i].multi + j;
			if (p->strand) p->pos = bwt_sa(bwt, p->pos);
		}
	}
	bwt_destroy(bwt);

	// load reverse BWT and SA
	strcpy(str + prefix_len, ".rbwt"); bwt = bwt_restore_bwt(str);
	strcpy(str + prefix_len, ".rsa"); bwt_restore_sa(str, bwt);
	for (i = 0; i != n_seqs; ++i) {
		if (!seqs[i].strand) bwa_cal_pac_pos_core(0, bwt, &seqs[i], max_mm, fnr);
		for (j = 0; j < seqs[i].n_multi; ++j) {
			bwt_multi1_t *p = seqs[i].multi + j;
			if (!p->strand) p->pos = bwt->seq_len - (bwt_sa(bwt, p->pos) + seqs[i].len);
		}
	}
	bwt_destroy(bwt);

	free(str);
}
|
164
|
+
|
165
|
+
/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on
|
166
|
+
* forward strand. This happens when p->pos is calculated by
|
167
|
+
* bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct
|
168
|
+
* coordinate. This happens only for color-converted alignment. */
|
169
|
+
static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos,
|
170
|
+
int ext, int *n_cigar, int is_end_correct)
|
171
|
+
{
|
172
|
+
bwa_cigar_t *cigar = 0;
|
173
|
+
ubyte_t *ref_seq;
|
174
|
+
int l = 0, path_len, ref_len;
|
175
|
+
AlnParam ap = aln_param_bwa;
|
176
|
+
path_t *path;
|
177
|
+
int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos;
|
178
|
+
|
179
|
+
ref_len = len + abs(ext);
|
180
|
+
if (ext > 0) {
|
181
|
+
ref_seq = (ubyte_t*)calloc(ref_len, 1);
|
182
|
+
for (k = __pos; k < __pos + ref_len && k < l_pac; ++k)
|
183
|
+
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
|
184
|
+
} else {
|
185
|
+
int64_t x = __pos + (is_end_correct? len : ref_len);
|
186
|
+
ref_seq = (ubyte_t*)calloc(ref_len, 1);
|
187
|
+
for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k)
|
188
|
+
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
|
189
|
+
}
|
190
|
+
path = (path_t*)calloc(l+len, sizeof(path_t));
|
191
|
+
|
192
|
+
aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
|
193
|
+
cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
|
194
|
+
|
195
|
+
if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand
|
196
|
+
for (l = k = 0; k < *n_cigar; ++k) {
|
197
|
+
if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]);
|
198
|
+
else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]);
|
199
|
+
}
|
200
|
+
__pos += l;
|
201
|
+
}
|
202
|
+
|
203
|
+
if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end
|
204
|
+
__pos += __cigar_len(cigar[0]);
|
205
|
+
for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1];
|
206
|
+
--(*n_cigar);
|
207
|
+
}
|
208
|
+
if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end
|
209
|
+
|
210
|
+
// change "I" at either end of the read to S. just in case. This should rarely happen...
|
211
|
+
if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1])));
|
212
|
+
if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0])));
|
213
|
+
|
214
|
+
*_pos = (bwtint_t)__pos;
|
215
|
+
free(ref_seq); free(path);
|
216
|
+
return cigar;
|
217
|
+
}
|
218
|
+
|
219
|
+
/* Decode the 2-bit base stored at position i of a packed reference. */
static int bwase_packed_base(const ubyte_t *pacseq, bwtint_t i)
{
	return pacseq[i>>2] >> ((~i&3)<<1) & 3;
}

/* Build the MD string of one alignment and count its edits.
 *
 *   n_cigar, cigar : CIGAR of the alignment; cigar == 0 means an ungapped
 *                    alignment of `len` bases
 *   pos            : start of the alignment on the packed reference
 *   seq            : read bases compared against the reference
 *   l_pac, pacseq  : length and 2-bit packed bases of the reference
 *   str            : scratch buffer; its content is overwritten
 *   _nm            : receives mismatches + inserted + deleted bases
 *
 * Returns a strdup()'ed copy of the MD string; the caller owns it. */
char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
				  bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
{
	bwtint_t x = pos, y = 0; /* x: cursor on the reference, y: cursor on the read */
	int z, c, run = 0, edits = 0; /* run: length of the current stretch of matches */

	str->l = 0; // reset
	if (cigar) {
		int k;
		for (k = 0; k < n_cigar; ++k) {
			const int op = __cigar_op(cigar[k]);
			const int l = __cigar_len(cigar[k]);
			if (op == FROM_M) {
				for (z = 0; z < l && x+z < l_pac; ++z) {
					c = bwase_packed_base(pacseq, x+z);
					if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
						/* mismatch: flush the match run, then the reference base */
						ksprintf(str, "%d", run);
						kputc("ACGTN"[c], str);
						++edits;
						run = 0;
					} else ++run;
				}
				x += l;
				y += l;
			} else if (op == FROM_I || op == FROM_S) {
				/* consumes read bases only; soft clips do not count as edits */
				y += l;
				if (op == FROM_I) edits += l;
			} else if (op == FROM_D) {
				/* deletion: "^" followed by the deleted reference bases */
				ksprintf(str, "%d", run);
				kputc('^', str);
				for (z = 0; z < l && x+z < l_pac; ++z)
					kputc("ACGT"[bwase_packed_base(pacseq, x+z)], str);
				run = 0;
				x += l;
				edits += l;
			}
		}
	} else { // no gaps
		for (z = 0; z < (bwtint_t)len; ++z) {
			c = bwase_packed_base(pacseq, x+z);
			if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
				ksprintf(str, "%d", run);
				kputc("ACGTN"[c], str);
				++edits;
				run = 0;
			} else ++run;
		}
	}
	ksprintf(str, "%d", run); /* trailing match run (possibly 0) */
	*_nm = edits;
	return strdup(str->s);
}
|
269
|
+
|
270
|
+
/* Account for bases trimmed from a read: when s->len differs from
 * s->full_len, the missing (full_len - len) bases are added to the CIGAR as
 * a soft clip (op 3) - at the end for forward-strand reads, at the beginning
 * for reverse-strand reads - and s->len is set to s->full_len.
 * A read without CIGAR gets a two-element one: a match (op 0) of s->len
 * bases plus the soft clip. */
void bwa_correct_trimmed(bwa_seq_t *s)
{
	if (s->len == s->full_len) return; /* nothing was trimmed */

	if (s->strand != 0) { // reverse: the clip goes first
		if (s->cigar == 0) {
			s->n_cigar = 2;
			s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
			s->cigar[1] = __cigar_create(0, s->len);
			s->cigar[0] = __cigar_create(3, (s->full_len - s->len));
		} else if (__cigar_op(s->cigar[0]) == FROM_S) { // the first is S
			s->cigar[0] += s->full_len - s->len;
		} else {
			/* grow by one and shift everything right to free slot 0 */
			++s->n_cigar;
			s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
			memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t));
			s->cigar[0] = __cigar_create(3, (s->full_len - s->len));
		}
	} else { // forward: the clip goes last
		if (s->cigar == 0) {
			s->n_cigar = 2;
			s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
			s->cigar[0] = __cigar_create(0, s->len);
			s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len));
		} else if (__cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S
			s->cigar[s->n_cigar-1] += s->full_len - s->len;
		} else {
			++s->n_cigar;
			s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
			s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len));
		}
	}

	s->len = s->full_len;
}
|
305
|
+
|
306
|
+
/* Refine gapped alignments of a batch of reads and compute their MD/NM.
 *
 *   bns     : reference annotation; its packed sequence is read from
 *             bns->fp_pac when _pacseq is 0
 *   n_seqs, seqs : the reads, updated in place
 *   _pacseq : optional preloaded packed reference (not freed here)
 *   ntbns   : non-zero in color space; its packed sequence is loaded and used
 *             for the second refinement pass and for the MD string
 *
 * Side effects worth knowing: s->seq of every read is reversed in place, and
 * outside color space trimmed reads are corrected with bwa_correct_trimmed(). */
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
{
	ubyte_t *pacseq, *ntpac = 0;
	kstring_t *md_buf;
	int r, h;

	if (ntbns) { // in color space
		ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
		rewind(ntbns->fp_pac);
		fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
	}

	if (_pacseq) {
		pacseq = _pacseq;
	} else {
		pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
		rewind(bns->fp_pac);
		fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
	}

	/* first pass: refine against the (color-space or nucleotide) reference */
	for (r = 0; r != n_seqs; ++r) {
		bwa_seq_t *s = seqs + r;
		seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!
		for (h = 0; h < s->n_multi; ++h) {
			bwt_multi1_t *q = s->multi + h;
			int n_cigar;
			if (q->gap != 0) {
				q->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos,
											  (q->strand? 1 : -1) * q->gap, &n_cigar, 1);
				q->n_cigar = n_cigar;
			}
		}
		if (s->type != BWA_TYPE_NO_MATCH && s->type != BWA_TYPE_MATESW && s->n_gapo != 0) {
			s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
										  (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
		}
	}

	/* second pass, color space only: convert to nucleotides and refine again */
	if (ntbns) {
		for (r = 0; r < n_seqs; ++r) {
			bwa_seq_t *s = seqs + r;
			bwa_cs2nt_core(s, bns->l_pac, ntpac);
			for (h = 0; h < s->n_multi; ++h) {
				bwt_multi1_t *q = s->multi + h;
				int n_cigar;
				if (q->gap != 0) {
					free(q->cigar);
					q->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
												  (q->strand? 1 : -1) * q->gap, &n_cigar, 0);
					q->n_cigar = n_cigar;
				}
			}
			if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
				free(s->cigar);
				s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
											  (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
			}
		}
	}

	// generate MD tag
	md_buf = (kstring_t*)calloc(1, sizeof(kstring_t));
	for (r = 0; r != n_seqs; ++r) {
		bwa_seq_t *s = seqs + r;
		int nm;
		if (s->type == BWA_TYPE_NO_MATCH) continue;
		s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
							bns->l_pac, ntbns? ntpac : pacseq, md_buf, &nm);
		s->nm = nm;
	}
	free(md_buf->s);
	free(md_buf);

	// correct for trimmed reads; trimming is only enabled for Illumina reads
	if (!ntbns) {
		for (r = 0; r < n_seqs; ++r)
			bwa_correct_trimmed(seqs + r);
	}

	if (!_pacseq) free(pacseq);
	free(ntpac);
}
|
380
|
+
|
381
|
+
int64_t pos_end(const bwa_seq_t *p)
|
382
|
+
{
|
383
|
+
if (p->cigar) {
|
384
|
+
int j;
|
385
|
+
int64_t x = p->pos;
|
386
|
+
for (j = 0; j != p->n_cigar; ++j) {
|
387
|
+
int op = __cigar_op(p->cigar[j]);
|
388
|
+
if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
|
389
|
+
}
|
390
|
+
return x;
|
391
|
+
} else return p->pos + p->len;
|
392
|
+
}
|
393
|
+
|
394
|
+
int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end()
|
395
|
+
{
|
396
|
+
if (p->cigar) {
|
397
|
+
int j;
|
398
|
+
int64_t x = p->pos;
|
399
|
+
for (j = 0; j != p->n_cigar; ++j) {
|
400
|
+
int op = __cigar_op(p->cigar[j]);
|
401
|
+
if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
|
402
|
+
}
|
403
|
+
return x;
|
404
|
+
} else return p->pos + len;
|
405
|
+
}
|
406
|
+
|
407
|
+
static int64_t pos_5(const bwa_seq_t *p)
|
408
|
+
{
|
409
|
+
if (p->type != BWA_TYPE_NO_MATCH)
|
410
|
+
return p->strand? pos_end(p) : p->pos;
|
411
|
+
return -1;
|
412
|
+
}
|
413
|
+
|
414
|
+
/* Print the quality column followed by the RG/BC/XC tags; this tail is the
 * same for mapped and unmapped records. When the strand flag is set,
 * p->qual is reversed IN PLACE before being printed. */
static void bwase_print_qual_and_common_tags(bwa_seq_t *p)
{
	if (p->qual) {
		if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
		printf("%s", p->qual);
	} else printf("*");

	if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
	if (p->bc[0]) printf("\tBC:Z:%s", p->bc);
	if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
}

/* Write one SAM record for read `p` to stdout.
 *
 *   bns      : reference annotation used to turn packed coordinates into
 *              sequence name + offset
 *   p        : the read; p->pos and p->strand are overwritten with the
 *              mate's values when only the mate is mapped, and p->qual may
 *              be reversed in place
 *   mate     : the paired read, or 0 for single-end output
 *   mode     : selects the "NM" (BWA_MODE_COMPREAD set) or "CM" tag name
 *   max_top2 : X1 is printed only when p->c1 <= max_top2
 *
 * If neither the read nor its mate is mapped, an unmapped record is printed;
 * otherwise a full record with coordinates, CIGAR, mate fields and tags. */
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
{
	const int mate_mapped = mate && mate->type != BWA_TYPE_NO_MATCH;
	int j, seqid, nn, am = 0, flag;
	char XT;

	/* ---- neither end is mapped: minimal record ---- */
	if (p->type == BWA_TYPE_NO_MATCH && !mate_mapped) {
		ubyte_t *s = p->strand? p->rseq : p->seq;
		flag = p->extra_flag | SAM_FSU;
		if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
		printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
		for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
		putchar('\t');
		bwase_print_qual_and_common_tags(p);
		putchar('\n');
		return;
	}

	/* ---- at least one end is mapped ---- */
	flag = p->extra_flag;
	if (p->type == BWA_TYPE_NO_MATCH) {
		/* unmapped read with a mapped mate: place it at the mate's position */
		p->pos = mate->pos;
		p->strand = mate->strand;
		flag |= SAM_FSU;
		j = 1;
	} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment

	// get seqid
	nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
	if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
		flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences

	// update flag and print it
	if (p->strand) flag |= SAM_FSR;
	if (mate) {
		if (mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
		else if (mate->strand) flag |= SAM_FMR;
	}
	printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
	printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);

	// print CIGAR
	if (p->cigar) {
		for (j = 0; j != p->n_cigar; ++j)
			printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]);
	} else if (p->type == BWA_TYPE_NO_MATCH) {
		printf("*");
	} else {
		printf("%dM", p->len);
	}

	// print mate coordinate
	if (mate_mapped) {
		int m_seqid;
		long long isize;
		am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
		// redundant calculation here, but should not matter too much
		(void)bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid);
		printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
		isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
		if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
		printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
	} else if (mate) {
		printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
	} else {
		printf("\t*\t0\t0\t");
	}

	// print sequence and quality
	if (p->strand == 0) {
		for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]);
	} else {
		/* reverse complement */
		for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]);
	}
	putchar('\t');
	bwase_print_qual_and_common_tags(p);

	if (p->type != BWA_TYPE_NO_MATCH) {
		int i;
		// calculate XT tag
		XT = "NURM"[p->type];
		if (nn > 10) XT = 'N';
		// print tags
		printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
		if (nn) printf("\tXN:i:%d", nn);
		if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
		if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
			printf("\tX0:i:%d", p->c1);
			if (p->c1 <= max_top2) printf("\tX1:i:%d", p->c2);
		}
		printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
		if (p->md) printf("\tMD:Z:%s", p->md);
		// print multiple hits
		if (p->n_multi) {
			printf("\tXA:Z:");
			for (i = 0; i < p->n_multi; ++i) {
				bwt_multi1_t *q = p->multi + i;
				int k;
				j = pos_end_multi(q, p->len) - q->pos;
				nn = bns_coor_pac2real(bns, q->pos, j, &seqid);
				printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
					   (int)(q->pos - bns->anns[seqid].offset + 1));
				if (q->cigar) {
					for (k = 0; k < q->n_cigar; ++k)
						printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]);
				} else printf("%dM", p->len);
				printf(",%d;", q->gap + q->mm);
			}
		}
	}
	putchar('\n');
}
|
528
|
+
|
529
|
+
/* Restore the annotation stored under "<prefix>.nt" with bns_restore() and
 * return it. The temporary name buffer is released before returning. */
bntseq_t *bwa_open_nt(const char *prefix)
{
	char *nt_prefix = (char*)calloc(strlen(prefix) + 10, 1);
	bntseq_t *ntbns;

	strcpy(nt_prefix, prefix);
	strcat(nt_prefix, ".nt");
	ntbns = bns_restore(nt_prefix);
	free(nt_prefix);
	return ntbns;
}
|
539
|
+
|
540
|
+
void bwa_print_sam_SQ(const bntseq_t *bns)
|
541
|
+
{
|
542
|
+
int i;
|
543
|
+
for (i = 0; i < bns->n_seqs; ++i)
|
544
|
+
printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
|
545
|
+
if (bwa_rg_line) printf("%s\n", bwa_rg_line);
|
546
|
+
}
|
547
|
+
|
548
|
+
void bwase_initialize()
|
549
|
+
{
|
550
|
+
int i;
|
551
|
+
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
|
552
|
+
}
|
553
|
+
|
554
|
+
/* Decode backslash escapes in `s` in place and return `s`.
 *
 * Recognized sequences are \t, \n, \r and \\, each replaced by the single
 * character it denotes. A backslash followed by any other character is
 * dropped together with that character. A lone backslash at the very end of
 * the string is dropped as well; previously it made the scan step over the
 * terminating NUL and keep reading and copying out of bounds.
 *
 * The result is never longer than the input, so no reallocation is needed. */
char *bwa_escape(char *s)
{
	char *p, *q; /* p: read cursor, q: write cursor (q <= p at all times) */
	for (p = q = s; *p; ++p) {
		if (*p == '\\') {
			++p;
			if (*p == '\0') break; /* trailing backslash: stop at the terminator */
			if (*p == 't') *q++ = '\t';
			else if (*p == 'n') *q++ = '\n';
			else if (*p == 'r') *q++ = '\r';
			else if (*p == '\\') *q++ = '\\';
		} else *q++ = *p;
	}
	*q = '\0';
	return s;
}
|
569
|
+
|
570
|
+
int bwa_set_rg(const char *s)
|
571
|
+
{
|
572
|
+
char *p, *q, *r;
|
573
|
+
if (strstr(s, "@RG") != s) return -1;
|
574
|
+
if (bwa_rg_line) free(bwa_rg_line);
|
575
|
+
if (bwa_rg_id) free(bwa_rg_id);
|
576
|
+
bwa_rg_line = strdup(s);
|
577
|
+
bwa_rg_id = 0;
|
578
|
+
bwa_escape(bwa_rg_line);
|
579
|
+
p = strstr(bwa_rg_line, "\tID:");
|
580
|
+
if (p == 0) return -1;
|
581
|
+
p += 4;
|
582
|
+
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
|
583
|
+
bwa_rg_id = calloc(q - p + 1, 1);
|
584
|
+
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
585
|
+
*r++ = *q;
|
586
|
+
return 0;
|
587
|
+
}
|
588
|
+
|
589
|
+
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
|
590
|
+
{
|
591
|
+
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
592
|
+
int i, n_seqs, tot_seqs = 0, m_aln;
|
593
|
+
bwt_aln1_t *aln = 0;
|
594
|
+
bwa_seq_t *seqs;
|
595
|
+
bwa_seqio_t *ks;
|
596
|
+
clock_t t;
|
597
|
+
bntseq_t *bns, *ntbns = 0;
|
598
|
+
FILE *fp_sa;
|
599
|
+
gap_opt_t opt;
|
600
|
+
|
601
|
+
// initialization
|
602
|
+
bwase_initialize();
|
603
|
+
bns = bns_restore(prefix);
|
604
|
+
srand48(bns->seed);
|
605
|
+
fp_sa = xopen(fn_sa, "r");
|
606
|
+
|
607
|
+
m_aln = 0;
|
608
|
+
fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
|
609
|
+
if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
|
610
|
+
ntbns = bwa_open_nt(prefix);
|
611
|
+
bwa_print_sam_SQ(bns);
|
612
|
+
bwa_print_sam_PG();
|
613
|
+
// set ks
|
614
|
+
ks = bwa_open_reads(opt.mode, fn_fa);
|
615
|
+
// core loop
|
616
|
+
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) {
|
617
|
+
tot_seqs += n_seqs;
|
618
|
+
t = clock();
|
619
|
+
|
620
|
+
// read alignment
|
621
|
+
for (i = 0; i < n_seqs; ++i) {
|
622
|
+
bwa_seq_t *p = seqs + i;
|
623
|
+
int n_aln;
|
624
|
+
fread(&n_aln, 4, 1, fp_sa);
|
625
|
+
if (n_aln > m_aln) {
|
626
|
+
m_aln = n_aln;
|
627
|
+
aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
|
628
|
+
}
|
629
|
+
fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
|
630
|
+
bwa_aln2seq_core(n_aln, aln, p, 1, n_occ);
|
631
|
+
}
|
632
|
+
|
633
|
+
fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
|
634
|
+
bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
|
635
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
636
|
+
|
637
|
+
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
|
638
|
+
bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
|
639
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
640
|
+
|
641
|
+
fprintf(stderr, "[bwa_aln_core] print alignments... ");
|
642
|
+
for (i = 0; i < n_seqs; ++i)
|
643
|
+
bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
|
644
|
+
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
645
|
+
|
646
|
+
bwa_free_read_seq(n_seqs, seqs);
|
647
|
+
fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
|
648
|
+
}
|
649
|
+
|
650
|
+
// destroy
|
651
|
+
bwa_seq_close(ks);
|
652
|
+
if (ntbns) bns_destroy(ntbns);
|
653
|
+
bns_destroy(bns);
|
654
|
+
fclose(fp_sa);
|
655
|
+
free(aln);
|
656
|
+
}
|
657
|
+
|
658
|
+
int bwa_sai2sam_se(int argc, char *argv[])
|
659
|
+
{
|
660
|
+
int c, n_occ = 3;
|
661
|
+
optind = 1;
|
662
|
+
while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
|
663
|
+
switch (c) {
|
664
|
+
case 'h': break;
|
665
|
+
case 'r':
|
666
|
+
if (bwa_set_rg(optarg) < 0) {
|
667
|
+
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
|
668
|
+
return 1;
|
669
|
+
}
|
670
|
+
break;
|
671
|
+
case 'n': n_occ = atoi(optarg); break;
|
672
|
+
case 'f': xreopen(optarg, "w", stdout); break;
|
673
|
+
default: return 1;
|
674
|
+
}
|
675
|
+
}
|
676
|
+
|
677
|
+
if (optind + 3 > argc) {
|
678
|
+
fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
|
679
|
+
return 1;
|
680
|
+
}
|
681
|
+
bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2], n_occ);
|
682
|
+
free(bwa_rg_line); free(bwa_rg_id);
|
683
|
+
fflush(stdout);
|
684
|
+
xreopen("/dev/tty","w",stdout);
|
685
|
+
return 0;
|
686
|
+
}
|