bio-bwa 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bwase.c ADDED
@@ -0,0 +1,686 @@
1
+ #include <unistd.h>
2
+ #include <string.h>
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <math.h>
6
+ #include <time.h>
7
+ #include "stdaln.h"
8
+ #include "bwase.h"
9
+ #include "bwtaln.h"
10
+ #include "bntseq.h"
11
+ #include "utils.h"
12
+ #include "kstring.h"
13
+
14
/* Lookup table indexed by a hit count in 1..255; entry i holds
 * (int)(4.343 * log(i) + 0.5). Filled by bwase_initialize() and read by
 * bwa_approx_mapQ(). Entry 0 is never written and stays zero. */
int g_log_n[256];

/* Read-group state installed by bwa_set_rg(): the unescaped "@RG" header
 * line and a separately allocated copy of its ID field. */
char *bwa_rg_line;
char *bwa_rg_id;

/* Called from bwa_sai2sam_se_core(); not defined in the visible part of this file. */
void bwa_print_sam_PG();
18
+
19
+ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
20
+ {
21
+ int i, cnt, best;
22
+ if (n_aln == 0) {
23
+ s->type = BWA_TYPE_NO_MATCH;
24
+ s->c1 = s->c2 = 0;
25
+ return;
26
+ }
27
+
28
+ if (set_main) {
29
+ best = aln[0].score;
30
+ for (i = cnt = 0; i < n_aln; ++i) {
31
+ const bwt_aln1_t *p = aln + i;
32
+ if (p->score > best) break;
33
+ if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
34
+ s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->strand = p->a;
35
+ s->score = p->score;
36
+ s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
37
+ }
38
+ cnt += p->l - p->k + 1;
39
+ }
40
+ s->c1 = cnt;
41
+ for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1;
42
+ s->c2 = cnt - s->c1;
43
+ s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
44
+ }
45
+
46
+ if (n_multi) {
47
+ int k, rest, n_occ, z = 0;
48
+ for (k = n_occ = 0; k < n_aln; ++k) {
49
+ const bwt_aln1_t *q = aln + k;
50
+ n_occ += q->l - q->k + 1;
51
+ }
52
+ if (s->multi) free(s->multi);
53
+ if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them
54
+ s->multi = 0; s->n_multi = 0;
55
+ return;
56
+ }
57
+ /* The following code is more flexible than what is required
58
+ * here. In principle, due to the requirement above, we can
59
+ * simply output all hits, but the following samples "rest"
60
+ * number of random hits. */
61
+ rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa
62
+ s->multi = calloc(rest, sizeof(bwt_multi1_t));
63
+ for (k = 0; k < n_aln; ++k) {
64
+ const bwt_aln1_t *q = aln + k;
65
+ if (q->l - q->k + 1 <= rest) {
66
+ bwtint_t l;
67
+ for (l = q->k; l <= q->l; ++l) {
68
+ s->multi[z].pos = l;
69
+ s->multi[z].gap = q->n_gapo + q->n_gape;
70
+ s->multi[z].mm = q->n_mm;
71
+ s->multi[z++].strand = q->a;
72
+ }
73
+ rest -= q->l - q->k + 1;
74
+ } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here.
75
+ int j, i, k;
76
+ for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) {
77
+ double p = 1.0, x = drand48();
78
+ while (x < p) p -= p * j / (i--);
79
+ s->multi[z].pos = q->l - i;
80
+ s->multi[z].gap = q->n_gapo + q->n_gape;
81
+ s->multi[z].mm = q->n_mm;
82
+ s->multi[z++].strand = q->a;
83
+ }
84
+ rest = 0;
85
+ break;
86
+ }
87
+ }
88
+ s->n_multi = z;
89
+ for (k = z = 0; k < s->n_multi; ++k)
90
+ if (s->multi[k].pos != s->sa)
91
+ s->multi[z++] = s->multi[k];
92
+ s->n_multi = z < n_multi? z : n_multi;
93
+ }
94
+ }
95
+
96
+ void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
97
+ {
98
+ bwa_aln2seq_core(n_aln, aln, s, 1, 0);
99
+ }
100
+
101
+ int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
102
+ {
103
+ int n;
104
+ if (p->c1 == 0) return 23;
105
+ if (p->c1 > 1) return 0;
106
+ if (p->n_mm == mm) return 25;
107
+ if (p->c2 == 0) return 37;
108
+ n = (p->c2 >= 255)? 255 : p->c2;
109
+ return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
110
+ }
111
+
112
+ /**
113
+ * Derive the actual position in the read from the given suffix array
114
+ * coordinates. Note that the position will be approximate based on
115
+ * whether indels appear in the read and whether calculations are
116
+ * performed from the start or end of the read.
117
+ */
118
+ void bwa_cal_pac_pos_core(const bwt_t *forward_bwt, const bwt_t *reverse_bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
119
+ {
120
+ int max_diff;
121
+ if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return;
122
+ max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
123
+ if (seq->strand) { // reverse strand only
124
+ seq->pos = bwt_sa(forward_bwt, seq->sa);
125
+ seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
126
+ } else { // forward strand only
127
+ /* NB: For gapped alignment, p->pos may not be correct, which
128
+ * will be fixed in refine_gapped_core(). This line also
129
+ * determines the way "x" is calculated in
130
+ * refine_gapped_core() when (ext < 0 && is_end == 0). */
131
+ seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len);
132
+ seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
133
+ }
134
+ }
135
+
136
+ void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
137
+ {
138
+ int i, j;
139
+ char str[1024];
140
+ bwt_t *bwt;
141
+ // load forward SA
142
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
143
+ strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
144
+ for (i = 0; i != n_seqs; ++i) {
145
+ if (seqs[i].strand) bwa_cal_pac_pos_core(bwt, 0, &seqs[i], max_mm, fnr);
146
+ for (j = 0; j < seqs[i].n_multi; ++j) {
147
+ bwt_multi1_t *p = seqs[i].multi + j;
148
+ if (p->strand) p->pos = bwt_sa(bwt, p->pos);
149
+ }
150
+ }
151
+ bwt_destroy(bwt);
152
+ // load reverse BWT and SA
153
+ strcpy(str, prefix); strcat(str, ".rbwt"); bwt = bwt_restore_bwt(str);
154
+ strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt);
155
+ for (i = 0; i != n_seqs; ++i) {
156
+ if (!seqs[i].strand) bwa_cal_pac_pos_core(0, bwt, &seqs[i], max_mm, fnr);
157
+ for (j = 0; j < seqs[i].n_multi; ++j) {
158
+ bwt_multi1_t *p = seqs[i].multi + j;
159
+ if (!p->strand) p->pos = bwt->seq_len - (bwt_sa(bwt, p->pos) + seqs[i].len);
160
+ }
161
+ }
162
+ bwt_destroy(bwt);
163
+ }
164
+
165
+ /* is_end_correct == 1 if (*pos+len) gives the correct coordinate on
166
+ * forward strand. This happens when p->pos is calculated by
167
+ * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct
168
+ * coordinate. This happens only for color-converted alignment. */
169
+ static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos,
170
+ int ext, int *n_cigar, int is_end_correct)
171
+ {
172
+ bwa_cigar_t *cigar = 0;
173
+ ubyte_t *ref_seq;
174
+ int l = 0, path_len, ref_len;
175
+ AlnParam ap = aln_param_bwa;
176
+ path_t *path;
177
+ int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos;
178
+
179
+ ref_len = len + abs(ext);
180
+ if (ext > 0) {
181
+ ref_seq = (ubyte_t*)calloc(ref_len, 1);
182
+ for (k = __pos; k < __pos + ref_len && k < l_pac; ++k)
183
+ ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
184
+ } else {
185
+ int64_t x = __pos + (is_end_correct? len : ref_len);
186
+ ref_seq = (ubyte_t*)calloc(ref_len, 1);
187
+ for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k)
188
+ ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
189
+ }
190
+ path = (path_t*)calloc(l+len, sizeof(path_t));
191
+
192
+ aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
193
+ cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
194
+
195
+ if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand
196
+ for (l = k = 0; k < *n_cigar; ++k) {
197
+ if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]);
198
+ else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]);
199
+ }
200
+ __pos += l;
201
+ }
202
+
203
+ if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end
204
+ __pos += __cigar_len(cigar[0]);
205
+ for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1];
206
+ --(*n_cigar);
207
+ }
208
+ if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end
209
+
210
+ // change "I" at either end of the read to S. just in case. This should rarely happen...
211
+ if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1])));
212
+ if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0])));
213
+
214
+ *_pos = (bwtint_t)__pos;
215
+ free(ref_seq); free(path);
216
+ return cigar;
217
+ }
218
+
219
+ char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
220
+ bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
221
+ {
222
+ bwtint_t x, y;
223
+ int z, u, c, nm = 0;
224
+ str->l = 0; // reset
225
+ x = pos; y = 0;
226
+ if (cigar) {
227
+ int k, l;
228
+ for (k = u = 0; k < n_cigar; ++k) {
229
+ l = __cigar_len(cigar[k]);
230
+ if (__cigar_op(cigar[k]) == FROM_M) {
231
+ for (z = 0; z < l && x+z < l_pac; ++z) {
232
+ c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
233
+ if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
234
+ ksprintf(str, "%d", u);
235
+ kputc("ACGTN"[c], str);
236
+ ++nm;
237
+ u = 0;
238
+ } else ++u;
239
+ }
240
+ x += l; y += l;
241
+ /* } else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) { */
242
+ } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
243
+ y += l;
244
+ if (__cigar_op(cigar[k]) == FROM_I) nm += l;
245
+ } else if (__cigar_op(cigar[k]) == FROM_D) {
246
+ ksprintf(str, "%d", u);
247
+ kputc('^', str);
248
+ for (z = 0; z < l && x+z < l_pac; ++z)
249
+ kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str);
250
+ u = 0;
251
+ x += l; nm += l;
252
+ }
253
+ }
254
+ } else { // no gaps
255
+ for (z = u = 0; z < (bwtint_t)len; ++z) {
256
+ c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
257
+ if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
258
+ ksprintf(str, "%d", u);
259
+ kputc("ACGTN"[c], str);
260
+ ++nm;
261
+ u = 0;
262
+ } else ++u;
263
+ }
264
+ }
265
+ ksprintf(str, "%d", u);
266
+ *_nm = nm;
267
+ return strdup(str->s);
268
+ }
269
+
270
+ void bwa_correct_trimmed(bwa_seq_t *s)
271
+ {
272
+ if (s->len == s->full_len) return;
273
+ if (s->strand == 0) { // forward
274
+ if (s->cigar && __cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S
275
+ s->cigar[s->n_cigar-1] += s->full_len - s->len;
276
+ } else {
277
+ if (s->cigar == 0) {
278
+ s->n_cigar = 2;
279
+ s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
280
+ s->cigar[0] = __cigar_create(0, s->len);
281
+ } else {
282
+ ++s->n_cigar;
283
+ s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
284
+ }
285
+ s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len));
286
+ }
287
+ } else { // reverse
288
+ if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) { // the first is S
289
+ s->cigar[0] += s->full_len - s->len;
290
+ } else {
291
+ if (s->cigar == 0) {
292
+ s->n_cigar = 2;
293
+ s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
294
+ s->cigar[1] = __cigar_create(0, s->len);
295
+ } else {
296
+ ++s->n_cigar;
297
+ s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
298
+ memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t));
299
+ }
300
+ s->cigar[0] = __cigar_create(3, (s->full_len - s->len));
301
+ }
302
+ }
303
+ s->len = s->full_len;
304
+ }
305
+
306
+ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
307
+ {
308
+ ubyte_t *pacseq, *ntpac = 0;
309
+ int i, j;
310
+ kstring_t *str;
311
+
312
+ if (ntbns) { // in color space
313
+ ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
314
+ rewind(ntbns->fp_pac);
315
+ fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
316
+ }
317
+
318
+ if (!_pacseq) {
319
+ pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
320
+ rewind(bns->fp_pac);
321
+ fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
322
+ } else pacseq = _pacseq;
323
+ for (i = 0; i != n_seqs; ++i) {
324
+ bwa_seq_t *s = seqs + i;
325
+ seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!
326
+ for (j = 0; j < s->n_multi; ++j) {
327
+ bwt_multi1_t *q = s->multi + j;
328
+ int n_cigar;
329
+ if (q->gap == 0) continue;
330
+ q->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos,
331
+ (q->strand? 1 : -1) * q->gap, &n_cigar, 1);
332
+ q->n_cigar = n_cigar;
333
+ }
334
+ if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
335
+ s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
336
+ (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
337
+ }
338
+
339
+ if (ntbns) { // in color space
340
+ for (i = 0; i < n_seqs; ++i) {
341
+ bwa_seq_t *s = seqs + i;
342
+ bwa_cs2nt_core(s, bns->l_pac, ntpac);
343
+ for (j = 0; j < s->n_multi; ++j) {
344
+ bwt_multi1_t *q = s->multi + j;
345
+ int n_cigar;
346
+ if (q->gap == 0) continue;
347
+ free(q->cigar);
348
+ q->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
349
+ (q->strand? 1 : -1) * q->gap, &n_cigar, 0);
350
+ q->n_cigar = n_cigar;
351
+ }
352
+ if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
353
+ free(s->cigar);
354
+ s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
355
+ (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
356
+ }
357
+ }
358
+ }
359
+
360
+ // generate MD tag
361
+ str = (kstring_t*)calloc(1, sizeof(kstring_t));
362
+ for (i = 0; i != n_seqs; ++i) {
363
+ bwa_seq_t *s = seqs + i;
364
+ if (s->type != BWA_TYPE_NO_MATCH) {
365
+ int nm;
366
+ s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
367
+ bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
368
+ s->nm = nm;
369
+ }
370
+ }
371
+ free(str->s); free(str);
372
+
373
+ // correct for trimmed reads
374
+ if (!ntbns) // trimming is only enabled for Illumina reads
375
+ for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
376
+
377
+ if (!_pacseq) free(pacseq);
378
+ free(ntpac);
379
+ }
380
+
381
+ int64_t pos_end(const bwa_seq_t *p)
382
+ {
383
+ if (p->cigar) {
384
+ int j;
385
+ int64_t x = p->pos;
386
+ for (j = 0; j != p->n_cigar; ++j) {
387
+ int op = __cigar_op(p->cigar[j]);
388
+ if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
389
+ }
390
+ return x;
391
+ } else return p->pos + p->len;
392
+ }
393
+
394
+ int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end()
395
+ {
396
+ if (p->cigar) {
397
+ int j;
398
+ int64_t x = p->pos;
399
+ for (j = 0; j != p->n_cigar; ++j) {
400
+ int op = __cigar_op(p->cigar[j]);
401
+ if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
402
+ }
403
+ return x;
404
+ } else return p->pos + len;
405
+ }
406
+
407
+ static int64_t pos_5(const bwa_seq_t *p)
408
+ {
409
+ if (p->type != BWA_TYPE_NO_MATCH)
410
+ return p->strand? pos_end(p) : p->pos;
411
+ return -1;
412
+ }
413
+
414
+ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
415
+ {
416
+ int j;
417
+ if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
418
+ int seqid, nn, am = 0, flag = p->extra_flag;
419
+ char XT;
420
+
421
+ if (p->type == BWA_TYPE_NO_MATCH) {
422
+ p->pos = mate->pos;
423
+ p->strand = mate->strand;
424
+ flag |= SAM_FSU;
425
+ j = 1;
426
+ } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment
427
+
428
+ // get seqid
429
+ nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
430
+ if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
431
+ flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences
432
+
433
+ // update flag and print it
434
+ if (p->strand) flag |= SAM_FSR;
435
+ if (mate) {
436
+ if (mate->type != BWA_TYPE_NO_MATCH) {
437
+ if (mate->strand) flag |= SAM_FMR;
438
+ } else flag |= SAM_FMU;
439
+ }
440
+ printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
441
+ printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);
442
+
443
+ // print CIGAR
444
+ if (p->cigar) {
445
+ for (j = 0; j != p->n_cigar; ++j)
446
+ printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]);
447
+ } else if (p->type == BWA_TYPE_NO_MATCH) printf("*");
448
+ else printf("%dM", p->len);
449
+
450
+ // print mate coordinate
451
+ if (mate && mate->type != BWA_TYPE_NO_MATCH) {
452
+ int m_seqid, m_is_N;
453
+ long long isize;
454
+ am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
455
+ // redundant calculation here, but should not matter too much
456
+ m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid);
457
+ printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
458
+ isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
459
+ if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
460
+ printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
461
+ } else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
462
+ else printf("\t*\t0\t0\t");
463
+
464
+ // print sequence and quality
465
+ if (p->strand == 0)
466
+ for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]);
467
+ else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]);
468
+ putchar('\t');
469
+ if (p->qual) {
470
+ if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
471
+ printf("%s", p->qual);
472
+ } else printf("*");
473
+
474
+ if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
475
+ if (p->bc[0]) printf("\tBC:Z:%s", p->bc);
476
+ if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
477
+ if (p->type != BWA_TYPE_NO_MATCH) {
478
+ int i;
479
+ // calculate XT tag
480
+ XT = "NURM"[p->type];
481
+ if (nn > 10) XT = 'N';
482
+ // print tags
483
+ printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
484
+ if (nn) printf("\tXN:i:%d", nn);
485
+ if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
486
+ if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
487
+ printf("\tX0:i:%d", p->c1);
488
+ if (p->c1 <= max_top2) printf("\tX1:i:%d", p->c2);
489
+ }
490
+ printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
491
+ if (p->md) printf("\tMD:Z:%s", p->md);
492
+ // print multiple hits
493
+ if (p->n_multi) {
494
+ printf("\tXA:Z:");
495
+ for (i = 0; i < p->n_multi; ++i) {
496
+ bwt_multi1_t *q = p->multi + i;
497
+ int k;
498
+ j = pos_end_multi(q, p->len) - q->pos;
499
+ nn = bns_coor_pac2real(bns, q->pos, j, &seqid);
500
+ printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
501
+ (int)(q->pos - bns->anns[seqid].offset + 1));
502
+ if (q->cigar) {
503
+ for (k = 0; k < q->n_cigar; ++k)
504
+ printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]);
505
+ } else printf("%dM", p->len);
506
+ printf(",%d;", q->gap + q->mm);
507
+ }
508
+ }
509
+ }
510
+ putchar('\n');
511
+ } else { // this read has no match
512
+ ubyte_t *s = p->strand? p->rseq : p->seq;
513
+ int flag = p->extra_flag | SAM_FSU;
514
+ if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
515
+ printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
516
+ for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
517
+ putchar('\t');
518
+ if (p->qual) {
519
+ if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
520
+ printf("%s", p->qual);
521
+ } else printf("*");
522
+ if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
523
+ if (p->bc[0]) printf("\tBC:Z:%s", p->bc);
524
+ if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
525
+ putchar('\n');
526
+ }
527
+ }
528
+
529
+ bntseq_t *bwa_open_nt(const char *prefix)
530
+ {
531
+ bntseq_t *ntbns;
532
+ char *str;
533
+ str = (char*)calloc(strlen(prefix) + 10, 1);
534
+ strcat(strcpy(str, prefix), ".nt");
535
+ ntbns = bns_restore(str);
536
+ free(str);
537
+ return ntbns;
538
+ }
539
+
540
+ void bwa_print_sam_SQ(const bntseq_t *bns)
541
+ {
542
+ int i;
543
+ for (i = 0; i < bns->n_seqs; ++i)
544
+ printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
545
+ if (bwa_rg_line) printf("%s\n", bwa_rg_line);
546
+ }
547
+
548
+ void bwase_initialize()
549
+ {
550
+ int i;
551
+ for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
552
+ }
553
+
554
/**
 * Expand backslash escapes in a string, in place.
 *
 * Recognised sequences are \t, \n, \r and \\. A backslash followed by any
 * other character is dropped together with that character, and a lone
 * backslash at the very end of the string is dropped as well.
 *
 * @param s  NUL-terminated, writable string; the result is never longer than
 *           the input
 * @return   s
 */
char *bwa_escape(char *s)
{
	char *p, *q; // p reads, q writes; q never overtakes p
	for (p = q = s; *p; ++p) {
		if (*p != '\\') {
			*q++ = *p;
			continue;
		}
		++p;
		/* Dangling backslash: stop here, otherwise the loop's ++p would step
		 * past the terminator and keep reading beyond the string. */
		if (*p == '\0') break;
		switch (*p) {
		case 't': *q++ = '\t'; break;
		case 'n': *q++ = '\n'; break;
		case 'r': *q++ = '\r'; break;
		case '\\': *q++ = '\\'; break;
		default: break; // unknown escapes are dropped
		}
	}
	*q = '\0';
	return s;
}
569
+
570
+ int bwa_set_rg(const char *s)
571
+ {
572
+ char *p, *q, *r;
573
+ if (strstr(s, "@RG") != s) return -1;
574
+ if (bwa_rg_line) free(bwa_rg_line);
575
+ if (bwa_rg_id) free(bwa_rg_id);
576
+ bwa_rg_line = strdup(s);
577
+ bwa_rg_id = 0;
578
+ bwa_escape(bwa_rg_line);
579
+ p = strstr(bwa_rg_line, "\tID:");
580
+ if (p == 0) return -1;
581
+ p += 4;
582
+ for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
583
+ bwa_rg_id = calloc(q - p + 1, 1);
584
+ for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
585
+ *r++ = *q;
586
+ return 0;
587
+ }
588
+
589
+ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
590
+ {
591
+ extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
592
+ int i, n_seqs, tot_seqs = 0, m_aln;
593
+ bwt_aln1_t *aln = 0;
594
+ bwa_seq_t *seqs;
595
+ bwa_seqio_t *ks;
596
+ clock_t t;
597
+ bntseq_t *bns, *ntbns = 0;
598
+ FILE *fp_sa;
599
+ gap_opt_t opt;
600
+
601
+ // initialization
602
+ bwase_initialize();
603
+ bns = bns_restore(prefix);
604
+ srand48(bns->seed);
605
+ fp_sa = xopen(fn_sa, "r");
606
+
607
+ m_aln = 0;
608
+ fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
609
+ if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
610
+ ntbns = bwa_open_nt(prefix);
611
+ bwa_print_sam_SQ(bns);
612
+ bwa_print_sam_PG();
613
+ // set ks
614
+ ks = bwa_open_reads(opt.mode, fn_fa);
615
+ // core loop
616
+ while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) {
617
+ tot_seqs += n_seqs;
618
+ t = clock();
619
+
620
+ // read alignment
621
+ for (i = 0; i < n_seqs; ++i) {
622
+ bwa_seq_t *p = seqs + i;
623
+ int n_aln;
624
+ fread(&n_aln, 4, 1, fp_sa);
625
+ if (n_aln > m_aln) {
626
+ m_aln = n_aln;
627
+ aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
628
+ }
629
+ fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
630
+ bwa_aln2seq_core(n_aln, aln, p, 1, n_occ);
631
+ }
632
+
633
+ fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
634
+ bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
635
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
636
+
637
+ fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
638
+ bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
639
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
640
+
641
+ fprintf(stderr, "[bwa_aln_core] print alignments... ");
642
+ for (i = 0; i < n_seqs; ++i)
643
+ bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
644
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
645
+
646
+ bwa_free_read_seq(n_seqs, seqs);
647
+ fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
648
+ }
649
+
650
+ // destroy
651
+ bwa_seq_close(ks);
652
+ if (ntbns) bns_destroy(ntbns);
653
+ bns_destroy(bns);
654
+ fclose(fp_sa);
655
+ free(aln);
656
+ }
657
+
658
+ int bwa_sai2sam_se(int argc, char *argv[])
659
+ {
660
+ int c, n_occ = 3;
661
+ optind = 1;
662
+ while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
663
+ switch (c) {
664
+ case 'h': break;
665
+ case 'r':
666
+ if (bwa_set_rg(optarg) < 0) {
667
+ fprintf(stderr, "[%s] malformated @RG line\n", __func__);
668
+ return 1;
669
+ }
670
+ break;
671
+ case 'n': n_occ = atoi(optarg); break;
672
+ case 'f': xreopen(optarg, "w", stdout); break;
673
+ default: return 1;
674
+ }
675
+ }
676
+
677
+ if (optind + 3 > argc) {
678
+ fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
679
+ return 1;
680
+ }
681
+ bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2], n_occ);
682
+ free(bwa_rg_line); free(bwa_rg_id);
683
+ fflush(stdout);
684
+ xreopen("/dev/tty","w",stdout);
685
+ return 0;
686
+ }