bio-bwa 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bwase.c ADDED
@@ -0,0 +1,686 @@
1
+ #include <unistd.h>
2
+ #include <string.h>
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <math.h>
6
+ #include <time.h>
7
+ #include "stdaln.h"
8
+ #include "bwase.h"
9
+ #include "bwtaln.h"
10
+ #include "bntseq.h"
11
+ #include "utils.h"
12
+ #include "kstring.h"
13
+
14
/* Lookup table used by bwa_approx_mapQ(); entries 1..255 are filled by
 * bwase_initialize(), entry 0 keeps its zero initial value. */
int g_log_n[256];

/* Read-group state set by bwa_set_rg(): the full (unescaped) @RG header
 * line and the value of its ID field. Both are heap-allocated or null. */
char *bwa_rg_line;
char *bwa_rg_id;

/* Defined outside this file; called once before alignments are printed. */
void bwa_print_sam_PG();
18
+
19
/**
 * Summarise the suffix-array interval hits of one read into the read record.
 *
 * @param n_aln     number of entries in aln; 0 marks the read as unmatched
 * @param aln       hit intervals; each entry covers SA rows [k, l]
 * @param s         read record to update
 * @param set_main  when non-zero, choose the primary hit and fill
 *                  n_mm/n_gapo/n_gape/strand/score/sa, c1, c2 and type
 * @param n_multi   when non-zero, also collect up to n_multi alternative
 *                  hits into s->multi (SA coordinates)
 *
 * The result depends on the drand48() stream, so the order of the calls
 * below must not change.
 * NOTE(review): the early break on p->score > best presumably relies on aln
 * being sorted by ascending score - confirm against the producer.
 */
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
{
	int i, cnt, best;
	if (n_aln == 0) {
		s->type = BWA_TYPE_NO_MATCH;
		s->c1 = s->c2 = 0;
		return;
	}

	if (set_main) {
		best = aln[0].score;
		// Walk the leading entries whose score does not exceed the first one.
		// Each interval replaces the current pick with probability
		// size / (size + cnt), i.e. proportional to its number of occurrences.
		for (i = cnt = 0; i < n_aln; ++i) {
			const bwt_aln1_t *p = aln + i;
			if (p->score > best) break;
			if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
				s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->strand = p->a;
				s->score = p->score;
				// pick one SA row at random inside the interval
				s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
			}
			cnt += p->l - p->k + 1;
		}
		s->c1 = cnt; // occurrences among the leading (best-score) entries
		for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1;
		s->c2 = cnt - s->c1; // occurrences among all remaining entries
		s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
	}

	if (n_multi) {
		int k, rest, n_occ, z = 0;
		// total number of occurrences over all intervals
		for (k = n_occ = 0; k < n_aln; ++k) {
			const bwt_aln1_t *q = aln + k;
			n_occ += q->l - q->k + 1;
		}
		if (s->multi) free(s->multi); // drop any list left from a previous call
		if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them
			s->multi = 0; s->n_multi = 0;
			return;
		}
		/* The following code is more flexible than what is required
		 * here. In principle, due to the requirement above, we can
		 * simply output all hits, but the following samples "rest"
		 * number of random hits. */
		rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa
		s->multi = calloc(rest, sizeof(bwt_multi1_t));
		for (k = 0; k < n_aln; ++k) {
			const bwt_aln1_t *q = aln + k;
			if (q->l - q->k + 1 <= rest) {
				// the whole interval fits: record every SA row in it
				bwtint_t l;
				for (l = q->k; l <= q->l; ++l) {
					s->multi[z].pos = l;
					s->multi[z].gap = q->n_gapo + q->n_gape;
					s->multi[z].mm = q->n_mm;
					s->multi[z++].strand = q->a;
				}
				rest -= q->l - q->k + 1;
			} else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here.
				// NOTE: these locals shadow the outer i and k; the inner k is only initialised.
				int j, i, k;
				for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) {
					double p = 1.0, x = drand48();
					while (x < p) p -= p * j / (i--);
					s->multi[z].pos = q->l - i;
					s->multi[z].gap = q->n_gapo + q->n_gape;
					s->multi[z].mm = q->n_mm;
					s->multi[z++].strand = q->a;
				}
				rest = 0;
				break;
			}
		}
		s->n_multi = z;
		// compact the list in place, dropping the entry that equals the primary hit
		for (k = z = 0; k < s->n_multi; ++k)
			if (s->multi[k].pos != s->sa)
				s->multi[z++] = s->multi[k];
		s->n_multi = z < n_multi? z : n_multi; // never report more than n_multi
	}
}
95
+
96
+ void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
97
+ {
98
+ bwa_aln2seq_core(n_aln, aln, s, 1, 0);
99
+ }
100
+
101
+ int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
102
+ {
103
+ int n;
104
+ if (p->c1 == 0) return 23;
105
+ if (p->c1 > 1) return 0;
106
+ if (p->n_mm == mm) return 25;
107
+ if (p->c2 == 0) return 37;
108
+ n = (p->c2 >= 255)? 255 : p->c2;
109
+ return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
110
+ }
111
+
112
+ /**
113
+ * Derive the actual position in the read from the given suffix array
114
+ * coordinates. Note that the position will be approximate based on
115
+ * whether indels appear in the read and whether calculations are
116
+ * performed from the start or end of the read.
117
+ */
118
+ void bwa_cal_pac_pos_core(const bwt_t *forward_bwt, const bwt_t *reverse_bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
119
+ {
120
+ int max_diff;
121
+ if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return;
122
+ max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
123
+ if (seq->strand) { // reverse strand only
124
+ seq->pos = bwt_sa(forward_bwt, seq->sa);
125
+ seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
126
+ } else { // forward strand only
127
+ /* NB: For gapped alignment, p->pos may not be correct, which
128
+ * will be fixed in refine_gapped_core(). This line also
129
+ * determines the way "x" is calculated in
130
+ * refine_gapped_core() when (ext < 0 && is_end == 0). */
131
+ seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len);
132
+ seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
133
+ }
134
+ }
135
+
136
+ void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
137
+ {
138
+ int i, j;
139
+ char str[1024];
140
+ bwt_t *bwt;
141
+ // load forward SA
142
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
143
+ strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
144
+ for (i = 0; i != n_seqs; ++i) {
145
+ if (seqs[i].strand) bwa_cal_pac_pos_core(bwt, 0, &seqs[i], max_mm, fnr);
146
+ for (j = 0; j < seqs[i].n_multi; ++j) {
147
+ bwt_multi1_t *p = seqs[i].multi + j;
148
+ if (p->strand) p->pos = bwt_sa(bwt, p->pos);
149
+ }
150
+ }
151
+ bwt_destroy(bwt);
152
+ // load reverse BWT and SA
153
+ strcpy(str, prefix); strcat(str, ".rbwt"); bwt = bwt_restore_bwt(str);
154
+ strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt);
155
+ for (i = 0; i != n_seqs; ++i) {
156
+ if (!seqs[i].strand) bwa_cal_pac_pos_core(0, bwt, &seqs[i], max_mm, fnr);
157
+ for (j = 0; j < seqs[i].n_multi; ++j) {
158
+ bwt_multi1_t *p = seqs[i].multi + j;
159
+ if (!p->strand) p->pos = bwt->seq_len - (bwt_sa(bwt, p->pos) + seqs[i].len);
160
+ }
161
+ }
162
+ bwt_destroy(bwt);
163
+ }
164
+
165
/**
 * Re-align a read against a window of the 2-bit packed reference with a
 * global alignment and return the resulting CIGAR.
 *
 * is_end_correct == 1 if (*pos+len) gives the correct coordinate on
 * forward strand. This happens when p->pos is calculated by
 * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct
 * coordinate. This happens only for color-converted alignment.
 *
 * @param l_pac    length of the packed reference in bases; reads stop there
 * @param pacseq   packed reference, four bases per byte
 * @param len      read length
 * @param seq      read bases
 * @param _pos     in: approximate position; out: refined position
 * @param ext      signed gap count; the window is len + |ext| bases long,
 *                 starting at *_pos when ext > 0 and ending at the
 *                 computed right edge otherwise
 * @param n_cigar  out: number of operations in the returned CIGAR
 * @return heap-allocated CIGAR from bwa_aln_path2cigar(); callers in this
 *         file release it with free().
 *
 * NOTE(review): if stripping a leading deletion leaves *n_cigar == 0, the
 * checks that follow index cigar[*n_cigar-1] at -1 - confirm this cannot
 * happen for real alignments.
 */
static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos,
									int ext, int *n_cigar, int is_end_correct)
{
	bwa_cigar_t *cigar = 0;
	ubyte_t *ref_seq;
	int l = 0, path_len, ref_len;
	AlnParam ap = aln_param_bwa;
	path_t *path;
	// positions beyond l_pac are reinterpreted as signed 32-bit values
	int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos;

	ref_len = len + abs(ext);
	if (ext > 0) {
		// window starts at __pos; unpack one 2-bit base per iteration
		ref_seq = (ubyte_t*)calloc(ref_len, 1);
		for (k = __pos; k < __pos + ref_len && k < l_pac; ++k)
			ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
	} else {
		// window ends at x; its start is clamped to 0
		int64_t x = __pos + (is_end_correct? len : ref_len);
		ref_seq = (ubyte_t*)calloc(ref_len, 1);
		for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k)
			ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
	}
	// l is now the number of reference bases actually extracted
	path = (path_t*)calloc(l+len, sizeof(path_t));

	aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
	cigar = bwa_aln_path2cigar(path, path_len, n_cigar);

	if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand
		// net shift: insertions move the start right, deletions move it left
		for (l = k = 0; k < *n_cigar; ++k) {
			if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]);
			else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]);
		}
		__pos += l;
	}

	if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end
		__pos += __cigar_len(cigar[0]);
		for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1];
		--(*n_cigar);
	}
	if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end

	// change "I" at either end of the read to S. just in case. This should rarely happen...
	if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1])));
	if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0])));

	*_pos = (bwtint_t)__pos;
	free(ref_seq); free(path);
	return cigar;
}
218
+
219
+ char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
220
+ bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
221
+ {
222
+ bwtint_t x, y;
223
+ int z, u, c, nm = 0;
224
+ str->l = 0; // reset
225
+ x = pos; y = 0;
226
+ if (cigar) {
227
+ int k, l;
228
+ for (k = u = 0; k < n_cigar; ++k) {
229
+ l = __cigar_len(cigar[k]);
230
+ if (__cigar_op(cigar[k]) == FROM_M) {
231
+ for (z = 0; z < l && x+z < l_pac; ++z) {
232
+ c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
233
+ if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
234
+ ksprintf(str, "%d", u);
235
+ kputc("ACGTN"[c], str);
236
+ ++nm;
237
+ u = 0;
238
+ } else ++u;
239
+ }
240
+ x += l; y += l;
241
+ /* } else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) { */
242
+ } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
243
+ y += l;
244
+ if (__cigar_op(cigar[k]) == FROM_I) nm += l;
245
+ } else if (__cigar_op(cigar[k]) == FROM_D) {
246
+ ksprintf(str, "%d", u);
247
+ kputc('^', str);
248
+ for (z = 0; z < l && x+z < l_pac; ++z)
249
+ kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str);
250
+ u = 0;
251
+ x += l; nm += l;
252
+ }
253
+ }
254
+ } else { // no gaps
255
+ for (z = u = 0; z < (bwtint_t)len; ++z) {
256
+ c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
257
+ if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
258
+ ksprintf(str, "%d", u);
259
+ kputc("ACGTN"[c], str);
260
+ ++nm;
261
+ u = 0;
262
+ } else ++u;
263
+ }
264
+ }
265
+ ksprintf(str, "%d", u);
266
+ *_nm = nm;
267
+ return strdup(str->s);
268
+ }
269
+
270
+ void bwa_correct_trimmed(bwa_seq_t *s)
271
+ {
272
+ if (s->len == s->full_len) return;
273
+ if (s->strand == 0) { // forward
274
+ if (s->cigar && __cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S
275
+ s->cigar[s->n_cigar-1] += s->full_len - s->len;
276
+ } else {
277
+ if (s->cigar == 0) {
278
+ s->n_cigar = 2;
279
+ s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
280
+ s->cigar[0] = __cigar_create(0, s->len);
281
+ } else {
282
+ ++s->n_cigar;
283
+ s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
284
+ }
285
+ s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len));
286
+ }
287
+ } else { // reverse
288
+ if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) { // the first is S
289
+ s->cigar[0] += s->full_len - s->len;
290
+ } else {
291
+ if (s->cigar == 0) {
292
+ s->n_cigar = 2;
293
+ s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
294
+ s->cigar[1] = __cigar_create(0, s->len);
295
+ } else {
296
+ ++s->n_cigar;
297
+ s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
298
+ memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t));
299
+ }
300
+ s->cigar[0] = __cigar_create(3, (s->full_len - s->len));
301
+ }
302
+ }
303
+ s->len = s->full_len;
304
+ }
305
+
306
/**
 * Refine gapped alignments for a batch of reads, then compute MD strings
 * and edit distances and restore trimmed lengths.
 *
 * @param bns      reference annotation; l_pac and fp_pac are used
 * @param n_seqs   number of reads in seqs
 * @param seqs     reads to update in place
 * @param _pacseq  packed reference already in memory, or null to load it
 *                 from bns->fp_pac for the duration of the call
 * @param ntbns    nucleotide reference for colour-space input, or null
 *
 * Side effects visible below: every s->seq is reversed in place; cigar,
 * n_cigar and pos may be rewritten for the read and its alternative hits;
 * md and nm are set for matched reads.
 * NOTE(review): the fread() results are not checked, so a short read
 * leaves the zero-filled tail of the calloc'ed buffer in place.
 */
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
{
	ubyte_t *pacseq, *ntpac = 0;
	int i, j;
	kstring_t *str;

	if (ntbns) { // in color space
		// load the packed nucleotide reference (four bases per byte)
		ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
		rewind(ntbns->fp_pac);
		fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
	}

	if (!_pacseq) {
		// no caller-supplied reference: load it; freed again at the end
		pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
		rewind(bns->fp_pac);
		fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
	} else pacseq = _pacseq;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *s = seqs + i;
		seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!
		// alternative hits: only those with gaps need a CIGAR
		for (j = 0; j < s->n_multi; ++j) {
			bwt_multi1_t *q = s->multi + j;
			int n_cigar;
			if (q->gap == 0) continue;
			q->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos,
										  (q->strand? 1 : -1) * q->gap, &n_cigar, 1);
			q->n_cigar = n_cigar;
		}
		// primary hit: skipped for unmatched, mate-rescued and gap-free reads
		if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
		s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
									  (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
	}

	if (ntbns) { // in color space
		// after colour-to-nucleotide conversion the CIGARs are rebuilt
		// against the nucleotide reference, with is_end_correct == 0
		for (i = 0; i < n_seqs; ++i) {
			bwa_seq_t *s = seqs + i;
			bwa_cs2nt_core(s, bns->l_pac, ntpac);
			for (j = 0; j < s->n_multi; ++j) {
				bwt_multi1_t *q = s->multi + j;
				int n_cigar;
				if (q->gap == 0) continue;
				free(q->cigar);
				q->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
											  (q->strand? 1 : -1) * q->gap, &n_cigar, 0);
				q->n_cigar = n_cigar;
			}
			if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
				free(s->cigar);
				s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
											  (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
			}
		}
	}

	// generate MD tag
	str = (kstring_t*)calloc(1, sizeof(kstring_t)); // scratch buffer shared by all reads
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *s = seqs + i;
		if (s->type != BWA_TYPE_NO_MATCH) {
			int nm;
			s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
								bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
			s->nm = nm;
		}
	}
	free(str->s); free(str);

	// correct for trimmed reads
	if (!ntbns) // trimming is only enabled for Illumina reads
		for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);

	if (!_pacseq) free(pacseq); // only free what was loaded here
	free(ntpac);
}
380
+
381
+ int64_t pos_end(const bwa_seq_t *p)
382
+ {
383
+ if (p->cigar) {
384
+ int j;
385
+ int64_t x = p->pos;
386
+ for (j = 0; j != p->n_cigar; ++j) {
387
+ int op = __cigar_op(p->cigar[j]);
388
+ if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
389
+ }
390
+ return x;
391
+ } else return p->pos + p->len;
392
+ }
393
+
394
+ int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end()
395
+ {
396
+ if (p->cigar) {
397
+ int j;
398
+ int64_t x = p->pos;
399
+ for (j = 0; j != p->n_cigar; ++j) {
400
+ int op = __cigar_op(p->cigar[j]);
401
+ if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
402
+ }
403
+ return x;
404
+ } else return p->pos + len;
405
+ }
406
+
407
+ static int64_t pos_5(const bwa_seq_t *p)
408
+ {
409
+ if (p->type != BWA_TYPE_NO_MATCH)
410
+ return p->strand? pos_end(p) : p->pos;
411
+ return -1;
412
+ }
413
+
414
/**
 * Print one read as a SAM record on stdout.
 *
 * @param bns       reference annotation used to translate packed
 *                  coordinates into a sequence name and offset
 * @param p         read to print; modified (see below)
 * @param mate      mate of p, or null for single-end output
 * @param mode      BWA_MODE_COMPREAD selects the "NM" tag name, otherwise "CM"
 * @param max_top2  X1 is printed only when p->c1 <= max_top2
 *
 * A record with coordinates is written when p or its mate is matched;
 * otherwise an unmapped record is written.
 *
 * Side effects on p: when p is unmatched but the mate is matched, p->pos
 * and p->strand are overwritten with the mate's values; when p->strand is
 * set, p->qual is reversed in place before printing.
 */
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
{
	int j;
	if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
		int seqid, nn, am = 0, flag = p->extra_flag;
		char XT;

		if (p->type == BWA_TYPE_NO_MATCH) {
			// unmatched read with a matched mate: place it at the mate's position
			p->pos = mate->pos;
			p->strand = mate->strand;
			flag |= SAM_FSU;
			j = 1;
		} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment

		// get seqid
		nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
		if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
			flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences

		// update flag and print it
		if (p->strand) flag |= SAM_FSR;
		if (mate) {
			if (mate->type != BWA_TYPE_NO_MATCH) {
				if (mate->strand) flag |= SAM_FMR;
			} else flag |= SAM_FMU;
		}
		// read name, flag, reference name, 1-based position, mapping quality
		printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
		printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);

		// print CIGAR
		if (p->cigar) {
			for (j = 0; j != p->n_cigar; ++j)
				printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]);
		} else if (p->type == BWA_TYPE_NO_MATCH) printf("*");
		else printf("%dM", p->len);

		// print mate coordinate
		if (mate && mate->type != BWA_TYPE_NO_MATCH) {
			int m_seqid, m_is_N;
			long long isize;
			am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
			// redundant calculation here, but should not matter too much
			m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid); // only m_seqid is used afterwards
			printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
			// insert size is reported only when both ends are on the same sequence
			isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
			if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
			printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
		} else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
		else printf("\t*\t0\t0\t");

		// print sequence and quality
		if (p->strand == 0)
			for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]);
		// strand set: walk the read backwards and print complemented bases
		else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]);
		putchar('\t');
		if (p->qual) {
			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
			printf("%s", p->qual);
		} else printf("*");

		if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
		if (p->bc[0]) printf("\tBC:Z:%s", p->bc);
		if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
		if (p->type != BWA_TYPE_NO_MATCH) {
			int i;
			// calculate XT tag
			XT = "NURM"[p->type]; // one letter per read type, indexed by the type value
			if (nn > 10) XT = 'N';
			// print tags
			printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
			if (nn) printf("\tXN:i:%d", nn);
			if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
			if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
				printf("\tX0:i:%d", p->c1);
				if (p->c1 <= max_top2) printf("\tX1:i:%d", p->c2);
			}
			printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
			if (p->md) printf("\tMD:Z:%s", p->md);
			// print multiple hits
			if (p->n_multi) {
				printf("\tXA:Z:");
				// one "name,strand+pos,CIGAR,edits;" entry per alternative hit
				for (i = 0; i < p->n_multi; ++i) {
					bwt_multi1_t *q = p->multi + i;
					int k;
					j = pos_end_multi(q, p->len) - q->pos;
					nn = bns_coor_pac2real(bns, q->pos, j, &seqid);
					printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
						   (int)(q->pos - bns->anns[seqid].offset + 1));
					if (q->cigar) {
						for (k = 0; k < q->n_cigar; ++k)
							printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]);
					} else printf("%dM", p->len);
					printf(",%d;", q->gap + q->mm);
				}
			}
		}
		putchar('\n');
	} else { // this read has no match
		ubyte_t *s = p->strand? p->rseq : p->seq;
		int flag = p->extra_flag | SAM_FSU;
		if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
		printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
		for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
		putchar('\t');
		if (p->qual) {
			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
			printf("%s", p->qual);
		} else printf("*");
		if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
		if (p->bc[0]) printf("\tBC:Z:%s", p->bc);
		if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
		putchar('\n');
	}
}
528
+
529
+ bntseq_t *bwa_open_nt(const char *prefix)
530
+ {
531
+ bntseq_t *ntbns;
532
+ char *str;
533
+ str = (char*)calloc(strlen(prefix) + 10, 1);
534
+ strcat(strcpy(str, prefix), ".nt");
535
+ ntbns = bns_restore(str);
536
+ free(str);
537
+ return ntbns;
538
+ }
539
+
540
+ void bwa_print_sam_SQ(const bntseq_t *bns)
541
+ {
542
+ int i;
543
+ for (i = 0; i < bns->n_seqs; ++i)
544
+ printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
545
+ if (bwa_rg_line) printf("%s\n", bwa_rg_line);
546
+ }
547
+
548
+ void bwase_initialize()
549
+ {
550
+ int i;
551
+ for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
552
+ }
553
+
554
/**
 * Replace backslash escapes in a string, in place.
 *
 * "\t", "\n", "\r" and "\\" become the character they name. A backslash
 * followed by any other character is dropped together with that character.
 * A lone backslash at the very end of the string is dropped as well; the
 * scan stops there instead of stepping over the terminator and running on
 * through whatever memory follows.
 *
 * @param s  NUL-terminated, writable string; the result is never longer
 * @return s
 */
char *bwa_escape(char *s)
{
	char *p, *q;
	for (p = q = s; *p; ++p) {
		if (*p != '\\') {
			*q++ = *p;
			continue;
		}
		++p; // look at the character after the backslash
		if (*p == '\0') break; // trailing backslash: nothing follows it
		if (*p == 't') *q++ = '\t';
		else if (*p == 'n') *q++ = '\n';
		else if (*p == 'r') *q++ = '\r';
		else if (*p == '\\') *q++ = '\\';
	}
	*q = '\0';
	return s;
}
569
+
570
+ int bwa_set_rg(const char *s)
571
+ {
572
+ char *p, *q, *r;
573
+ if (strstr(s, "@RG") != s) return -1;
574
+ if (bwa_rg_line) free(bwa_rg_line);
575
+ if (bwa_rg_id) free(bwa_rg_id);
576
+ bwa_rg_line = strdup(s);
577
+ bwa_rg_id = 0;
578
+ bwa_escape(bwa_rg_line);
579
+ p = strstr(bwa_rg_line, "\tID:");
580
+ if (p == 0) return -1;
581
+ p += 4;
582
+ for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
583
+ bwa_rg_id = calloc(q - p + 1, 1);
584
+ for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
585
+ *r++ = *q;
586
+ return 0;
587
+ }
588
+
589
+ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
590
+ {
591
+ extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
592
+ int i, n_seqs, tot_seqs = 0, m_aln;
593
+ bwt_aln1_t *aln = 0;
594
+ bwa_seq_t *seqs;
595
+ bwa_seqio_t *ks;
596
+ clock_t t;
597
+ bntseq_t *bns, *ntbns = 0;
598
+ FILE *fp_sa;
599
+ gap_opt_t opt;
600
+
601
+ // initialization
602
+ bwase_initialize();
603
+ bns = bns_restore(prefix);
604
+ srand48(bns->seed);
605
+ fp_sa = xopen(fn_sa, "r");
606
+
607
+ m_aln = 0;
608
+ fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
609
+ if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
610
+ ntbns = bwa_open_nt(prefix);
611
+ bwa_print_sam_SQ(bns);
612
+ bwa_print_sam_PG();
613
+ // set ks
614
+ ks = bwa_open_reads(opt.mode, fn_fa);
615
+ // core loop
616
+ while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) {
617
+ tot_seqs += n_seqs;
618
+ t = clock();
619
+
620
+ // read alignment
621
+ for (i = 0; i < n_seqs; ++i) {
622
+ bwa_seq_t *p = seqs + i;
623
+ int n_aln;
624
+ fread(&n_aln, 4, 1, fp_sa);
625
+ if (n_aln > m_aln) {
626
+ m_aln = n_aln;
627
+ aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
628
+ }
629
+ fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
630
+ bwa_aln2seq_core(n_aln, aln, p, 1, n_occ);
631
+ }
632
+
633
+ fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
634
+ bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
635
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
636
+
637
+ fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
638
+ bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
639
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
640
+
641
+ fprintf(stderr, "[bwa_aln_core] print alignments... ");
642
+ for (i = 0; i < n_seqs; ++i)
643
+ bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
644
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
645
+
646
+ bwa_free_read_seq(n_seqs, seqs);
647
+ fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
648
+ }
649
+
650
+ // destroy
651
+ bwa_seq_close(ks);
652
+ if (ntbns) bns_destroy(ntbns);
653
+ bns_destroy(bns);
654
+ fclose(fp_sa);
655
+ free(aln);
656
+ }
657
+
658
+ int bwa_sai2sam_se(int argc, char *argv[])
659
+ {
660
+ int c, n_occ = 3;
661
+ optind = 1;
662
+ while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
663
+ switch (c) {
664
+ case 'h': break;
665
+ case 'r':
666
+ if (bwa_set_rg(optarg) < 0) {
667
+ fprintf(stderr, "[%s] malformated @RG line\n", __func__);
668
+ return 1;
669
+ }
670
+ break;
671
+ case 'n': n_occ = atoi(optarg); break;
672
+ case 'f': xreopen(optarg, "w", stdout); break;
673
+ default: return 1;
674
+ }
675
+ }
676
+
677
+ if (optind + 3 > argc) {
678
+ fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
679
+ return 1;
680
+ }
681
+ bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2], n_occ);
682
+ free(bwa_rg_line); free(bwa_rg_id);
683
+ fflush(stdout);
684
+ xreopen("/dev/tty","w",stdout);
685
+ return 0;
686
+ }