bio-bwa 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bwtsw2.h ADDED
@@ -0,0 +1,51 @@
1
+ #ifndef LH3_BWTSW2_H
2
+ #define LH3_BWTSW2_H
3
+
4
+ #include <stdint.h>
5
+ #include "bntseq.h"
6
+ #include "bwt_lite.h"
7
+ #include "bwt.h"
8
+
9
+ typedef struct {
10
+ int a, b, q, r, t, qr, bw;
11
+ int z, is, t_seeds, hard_clip;
12
+ float yita, mask_level, coef;
13
+ int n_threads, chunk_size;
14
+ } bsw2opt_t;
15
+
16
+ typedef struct {
17
+ uint32_t k, l, flag:18, n_seeds:14;
18
+ int len, G, G2;
19
+ int beg, end;
20
+ } bsw2hit_t;
21
+
22
+ typedef struct {
23
+ int n, max;
24
+ bsw2hit_t *hits;
25
+ int *n_cigar;
26
+ uint32_t **cigar;
27
+ } bwtsw2_t;
28
+
29
+ typedef struct {
30
+ void *stack;
31
+ int max_l;
32
+ uint8_t *aln_mem;
33
+ } bsw2global_t;
34
+
35
+ #ifdef __cplusplus
36
+ extern "C" {
37
+ #endif
38
+
39
+ bsw2opt_t *bsw2_init_opt();
40
+ bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool);
41
+ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target[2], const char *fn);
42
+ void bsw2_destroy(bwtsw2_t *b);
43
+
44
+ bsw2global_t *bsw2_global_init();
45
+ void bsw2_global_destroy(bsw2global_t *_pool);
46
+
47
+ #ifdef __cplusplus
48
+ }
49
+ #endif
50
+
51
+ #endif
data/ext/bwtsw2_aux.c ADDED
@@ -0,0 +1,650 @@
1
+ #include <stdlib.h>
2
+ #include <stdio.h>
3
+ #include <math.h>
4
+ #ifdef HAVE_CONFIG_H
5
+ #include "config.h"
6
+ #endif
7
+ #ifdef HAVE_PTHREAD
8
+ #include <pthread.h>
9
+ #endif
10
+ #include "bntseq.h"
11
+ #include "bwt_lite.h"
12
+ #include "utils.h"
13
+ #include "bwtsw2.h"
14
+ #include "stdaln.h"
15
+ #include "kstring.h"
16
+
17
+ #include "kseq.h"
18
+ KSEQ_INIT(gzFile, gzread)
19
+
20
+ #include "ksort.h"
21
+ #define __left_lt(a, b) ((a).end > (b).end)
22
+ KSORT_INIT(hit, bsw2hit_t, __left_lt)
23
+
24
+ extern unsigned char nst_nt4_table[256];
25
+
/*
 * Complement of every byte value interpreted as a nucleotide code.
 * Upper-case and lower-case IUPAC letters map to their complement in the
 * same case; every other byte maps to 'N' (or 'n' inside the lower-case
 * letter range 0x60-0x7a).
 */
#define BSW2_COMP_N16 \
    'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
unsigned char nt_comp_table[256] = {
    /* 0x00 - 0x3f: control characters, punctuation, digits */
    BSW2_COMP_N16,
    BSW2_COMP_N16,
    BSW2_COMP_N16,
    BSW2_COMP_N16,
    /* 0x40 - 0x4f: @ A B C  D E F G  H I J K  L M N O */
    'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
    /* 0x50 - 0x5f: P Q R S  T U V W  X Y Z [  \ ] ^ _ */
    'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
    /* 0x60 - 0x6f: ` a b c  d e f g  h i j k  l m n o */
    'n','t','v','g', 'h','n','n','c', 'd','n','n','m', 'n','k','n','n',
    /* 0x70 - 0x7f: p q r s  t u v w  x y z {  | } ~ DEL */
    'n','n','y','s', 'a','n','b','w', 'x','r','n','N', 'N','N','N','N',
    /* 0x80 - 0xff: non-ASCII bytes */
    BSW2_COMP_N16,
    BSW2_COMP_N16,
    BSW2_COMP_N16,
    BSW2_COMP_N16,
    BSW2_COMP_N16,
    BSW2_COMP_N16,
    BSW2_COMP_N16,
    BSW2_COMP_N16
};
#undef BSW2_COMP_N16
44
+
45
+ extern int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS);
46
+ extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level);
47
+
48
+ bsw2opt_t *bsw2_init_opt()
49
+ {
50
+ bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t));
51
+ o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30;
52
+ o->bw = 50;
53
+ o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0;
54
+ o->mask_level = 0.50f; o->yita = 5.5f; o->coef = 5.5f;
55
+ o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000;
56
+ return o;
57
+ }
58
+
59
+ void bsw2_destroy(bwtsw2_t *b)
60
+ {
61
+ int i;
62
+ if (b == 0) return;
63
+ if (b->cigar)
64
+ for (i = 0; i < b->n; ++i) free(b->cigar[i]);
65
+ free(b->cigar); free(b->n_cigar); free(b->hits);
66
+ free(b);
67
+ }
68
+
/*
 * Fill the alignment parameters `par` from the options pointed to by `opt`.
 * par.matrix must already point at an array of at least 25 ints; it is
 * filled as a 5x5 matrix with -b everywhere and +a on the first four
 * diagonal cells. Every use of `opt` is parenthesized so that any pointer
 * expression (e.g. &options) can be passed.
 */
#define __gen_ap(par, opt) do { \
        int gen_ap_i_; \
        for (gen_ap_i_ = 0; gen_ap_i_ < 25; ++gen_ap_i_) (par).matrix[gen_ap_i_] = -(opt)->b; \
        for (gen_ap_i_ = 0; gen_ap_i_ < 4; ++gen_ap_i_) (par).matrix[gen_ap_i_*5+gen_ap_i_] = (opt)->a; \
        (par).gap_open = (opt)->q; (par).gap_ext = (opt)->r; \
        (par).gap_end = (opt)->r; \
        (par).row = 5; (par).band_width = (opt)->bw; \
    } while (0)
77
+
/* 2-bit base at position (l - i - 1) of the packed sequence `pac`, i.e. the
 * i-th base counted from the end of a sequence of length l. Arguments are
 * parenthesized so that expressions can be passed safely. */
#define __rpac(pac, l, i) ((pac)[((l)-(i)-1)>>2] >> (~((l)-(i)-1)&3)*2 & 0x3)
79
+
80
+ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, uint32_t l_pac, int is_rev, uint8_t *_mem)
81
+ {
82
+ int i, matrix[25];
83
+ bwtint_t k;
84
+ uint8_t *target = 0, *query;
85
+ AlnParam par;
86
+
87
+ par.matrix = matrix;
88
+ __gen_ap(par, opt);
89
+ query = calloc(lq, 1);
90
+ // sort according to the descending order of query end
91
+ ks_introsort(hit, b->n, b->hits);
92
+ target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
93
+ // reverse _query
94
+ for (i = 0; i < lq; ++i) query[lq - i - 1] = _query[i];
95
+ // core loop
96
+ for (i = 0; i < b->n; ++i) {
97
+ bsw2hit_t *p = b->hits + i;
98
+ int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
99
+ int score, j;
100
+ path_t path;
101
+ p->n_seeds = 1;
102
+ if (p->l || p->k == 0) continue;
103
+ for (j = score = 0; j < i; ++j) {
104
+ bsw2hit_t *q = b->hits + j;
105
+ if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) {
106
+ if (q->n_seeds < (1<<14) - 2) ++q->n_seeds;
107
+ ++score;
108
+ }
109
+ }
110
+ if (score) continue;
111
+ if (lt > p->k) lt = p->k;
112
+ if (is_rev) {
113
+ for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
114
+ target[j++] = __rpac(pac, l_pac, k);
115
+ } else {
116
+ for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
117
+ target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
118
+ }
119
+ lt = j;
120
+ score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem);
121
+ if (score > p->G) { // extensible
122
+ p->G = score;
123
+ p->len += path.i;
124
+ p->beg -= path.j;
125
+ p->k -= path.i;
126
+ }
127
+ }
128
+ free(query); free(target);
129
+ }
130
+
131
+ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, uint32_t l_pac, int is_rev, uint8_t *_mem)
132
+ {
133
+ int i, matrix[25];
134
+ uint32_t k;
135
+ uint8_t *target;
136
+ AlnParam par;
137
+
138
+ par.matrix = matrix;
139
+ __gen_ap(par, opt);
140
+ target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
141
+ for (i = 0; i < b->n; ++i) {
142
+ bsw2hit_t *p = b->hits + i;
143
+ int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
144
+ int j, score;
145
+ path_t path;
146
+ if (p->l) continue;
147
+ if (is_rev) {
148
+ for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
149
+ target[j++] = __rpac(pac, l_pac, k);
150
+ } else {
151
+ for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
152
+ target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
153
+ }
154
+ lt = j;
155
+ score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem);
156
+ // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G);
157
+ if (score >= p->G) {
158
+ p->G = score;
159
+ p->len = path.i;
160
+ p->end = path.j + p->beg;
161
+ }
162
+ }
163
+ free(target);
164
+ }
165
+
166
+ /* generate CIGAR array(s) in b->cigar[] */
167
+ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], uint8_t *pac, bwtsw2_t *b)
168
+ {
169
+ uint8_t *target;
170
+ int i, matrix[25];
171
+ AlnParam par;
172
+ path_t *path;
173
+
174
+ par.matrix = matrix;
175
+ __gen_ap(par, opt);
176
+ i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length
177
+ target = calloc(i, 1);
178
+ path = calloc(i + lq, sizeof(path_t));
179
+ // memory clean up for b
180
+ if (b->n < b->max) {
181
+ b->max = b->n;
182
+ b->hits = realloc(b->hits, b->n * sizeof(bsw2hit_t));
183
+ }
184
+ if (b->cigar) free(b->cigar);
185
+ if (b->n_cigar) free(b->n_cigar);
186
+ b->cigar = (uint32_t**)calloc(b->max, sizeof(void*));
187
+ b->n_cigar = (int*)calloc(b->max, sizeof(int));
188
+ // generate CIGAR
189
+ for (i = 0; i < b->n; ++i) {
190
+ bsw2hit_t *p = b->hits + i;
191
+ uint8_t *query;
192
+ uint32_t k;
193
+ int score, path_len, beg, end;
194
+ if (p->l) continue;
195
+ beg = (p->flag & 0x10)? lq - p->end : p->beg;
196
+ end = (p->flag & 0x10)? lq - p->beg : p->end;
197
+ query = seq[(p->flag & 0x10)? 1 : 0] + beg;
198
+ for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here
199
+ target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3;
200
+ score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
201
+ b->cigar[i] = aln_path2cigar32(path, path_len, &b->n_cigar[i]);
202
+ if (beg != 0 || end < lq) { // write soft clipping
203
+ b->cigar[i] = realloc(b->cigar[i], 4 * (b->n_cigar[i] + 2));
204
+ if (beg != 0) {
205
+ memmove(b->cigar[i] + 1, b->cigar[i], b->n_cigar[i] * 4);
206
+ b->cigar[i][0] = beg<<4 | 4;
207
+ ++b->n_cigar[i];
208
+ }
209
+ if (end < lq) {
210
+ b->cigar[i][b->n_cigar[i]] = (lq - end)<<4 | 4;
211
+ ++b->n_cigar[i];
212
+ }
213
+ }
214
+ }
215
+ free(target); free(path);
216
+ }
217
+
218
+ /* this is for the debugging purpose only */
219
+ void bsw2_debug_hits(const bwtsw2_t *b)
220
+ {
221
+ int i;
222
+ printf("# raw hits: %d\n", b->n);
223
+ for (i = 0; i < b->n; ++i) {
224
+ bsw2hit_t *p = b->hits + i;
225
+ if (p->l == 0)
226
+ printf("%d, %d, %d, %u, %u\n", p->G, p->beg, p->end, p->k, p->l);
227
+ }
228
+ }
229
+
230
+ static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse)
231
+ {
232
+ int i;
233
+ if (b[0]->n + b[1]->n > b[0]->max) {
234
+ b[0]->max = b[0]->n + b[1]->n;
235
+ b[0]->hits = realloc(b[0]->hits, b[0]->max * sizeof(bsw2hit_t));
236
+ }
237
+ for (i = 0; i < b[1]->n; ++i) {
238
+ bsw2hit_t *p = b[0]->hits + b[0]->n + i;
239
+ *p = b[1]->hits[i];
240
+ if (is_reverse) {
241
+ int x = p->beg;
242
+ p->beg = l - p->end;
243
+ p->end = l - x;
244
+ p->flag |= 0x10;
245
+ }
246
+ }
247
+ b[0]->n += b[1]->n;
248
+ bsw2_destroy(b[1]);
249
+ b[1] = 0;
250
+ }
251
+ /* seq[0] is the forward sequence and seq[1] is the reverse complement. */
252
+ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target,
253
+ int l, uint8_t *seq[2], int is_rev, bsw2global_t *pool)
254
+ {
255
+ extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]);
256
+ bwtsw2_t *b[2], **bb[2];
257
+ int k;
258
+ for (k = 0; k < 2; ++k) {
259
+ bwtl_t *query = bwtl_seq2bwtl(l, seq[k]);
260
+ bb[k] = bsw2_core(opt, query, target, pool);
261
+ bwtl_destroy(query);
262
+ }
263
+ b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits"
264
+ bsw2_chain_filter(opt, l, b);
265
+ for (k = 0; k < 2; ++k) {
266
+ bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem);
267
+ merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here
268
+ bsw2_resolve_duphits(0, bb[k][0], 0);
269
+ bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem);
270
+ b[k] = bb[k][0];
271
+ free(bb[k]);
272
+ }
273
+ merge_hits(b, l, 1); // again, b[1] is merged to b[0]
274
+ bsw2_resolve_query_overlaps(b[0], opt->mask_level);
275
+ return b[0];
276
+ }
277
+
278
+ /* set ->flag to records the origin of the hit (to forward bwt or reverse bwt) */
279
+ static void flag_fr(bwtsw2_t *b[2])
280
+ {
281
+ int i, j;
282
+ for (i = 0; i < b[0]->n; ++i) {
283
+ bsw2hit_t *p = b[0]->hits + i;
284
+ p->flag |= 0x10000;
285
+ }
286
+ for (i = 0; i < b[1]->n; ++i) {
287
+ bsw2hit_t *p = b[1]->hits + i;
288
+ p->flag |= 0x20000;
289
+ }
290
+ for (i = 0; i < b[0]->n; ++i) {
291
+ bsw2hit_t *p = b[0]->hits + i;
292
+ for (j = 0; j < b[1]->n; ++j) {
293
+ bsw2hit_t *q = b[1]->hits + j;
294
+ if (q->beg == p->beg && q->end == p->end && q->k == p->k && q->len == p->len && q->G == p->G) {
295
+ q->flag |= 0x30000; p->flag |= 0x30000;
296
+ break;
297
+ }
298
+ }
299
+ }
300
+ }
301
+
/* One read waiting to be aligned, plus its result. */
typedef struct {
    int l;      /* read length */
    int tid;    /* id of the thread that claimed the read; -1 when unclaimed */
    char *name; /* read name */
    char *seq;  /* bases */
    char *qual; /* base qualities, or null when the input had none */
    char *sam;  /* generated SAM text, or null before alignment */
} bsw2seq1_t;

/* Growable batch of reads: n entries in use out of max allocated. */
typedef struct {
    int n;
    int max;
    bsw2seq1_t *seq;
} bsw2seq_t;

#ifdef HAVE_PTHREAD
/* serializes the claiming of reads by worker threads */
static pthread_mutex_t g_dbwtsw_lock = PTHREAD_MUTEX_INITIALIZER;
#endif
315
+
316
+ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar)
317
+ {
318
+ // FIXME: this routine does not work if the query bridge three reference sequences
319
+ int32_t coor, refl, lq;
320
+ int x, y, i, seqid;
321
+ bns_coor_pac2real(bns, p->k, p->len, &seqid);
322
+ coor = p->k - bns->anns[seqid].offset;
323
+ refl = bns->anns[seqid].len;
324
+ x = coor; y = 0;
325
+ // test if the alignment goes beyond the boundary
326
+ for (i = 0; i < n_cigar; ++i) {
327
+ int op = cigar[i]&0xf, ln = cigar[i]>>4;
328
+ if (op == 1 || op == 4 || op == 5) y += ln;
329
+ else if (op == 2) x += ln;
330
+ else x += ln, y += ln;
331
+ }
332
+ lq = y; // length of the query sequence
333
+ if (x > refl) { // then fix it
334
+ int j, nc, mq[2], nlen[2];
335
+ uint32_t *cn, kk = 0;
336
+ nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0;
337
+ cn = calloc(n_cigar + 3, 4);
338
+ x = coor; y = 0;
339
+ for (i = j = 0; i < n_cigar; ++i) {
340
+ int op = cigar[i]&0xf, ln = cigar[i]>>4;
341
+ if (op == 4 || op == 5 || op == 1) { // ins or clipping
342
+ y += ln;
343
+ cn[j++] = cigar[i];
344
+ } else if (op == 2) { // del
345
+ if (x + ln >= refl && nc == 0) {
346
+ cn[j++] = (uint32_t)(lq - y)<<4 | 4;
347
+ nc = j;
348
+ cn[j++] = (uint32_t)y<<4 | 4;
349
+ kk = p->k + (x + ln - refl);
350
+ nlen[0] = x - coor;
351
+ nlen[1] = p->len - nlen[0] - ln;
352
+ } else cn[j++] = cigar[i];
353
+ x += ln;
354
+ } else if (op == 0) { // match
355
+ if (x + ln >= refl && nc == 0) {
356
+ // FIXME: not consider a special case where a split right between M and I
357
+ cn[j++] = (uint32_t)(refl - x)<<4 | 0; // write M
358
+ cn[j++] = (uint32_t)(lq - y - (refl - x))<<4 | 4; // write S
359
+ nc = j;
360
+ mq[0] += refl - x;
361
+ cn[j++] = (uint32_t)(y + (refl - x))<<4 | 4;
362
+ if (x + ln - refl) cn[j++] = (uint32_t)(x + ln - refl)<<4 | 0;
363
+ mq[1] += x + ln - refl;
364
+ kk = bns->anns[seqid].offset + refl;
365
+ nlen[0] = refl - coor;
366
+ nlen[1] = p->len - nlen[0];
367
+ } else {
368
+ cn[j++] = cigar[i];
369
+ mq[nc?1:0] += ln;
370
+ }
371
+ x += ln; y += ln;
372
+ }
373
+ }
374
+ if (mq[0] > mq[1]) { // then take the first alignment
375
+ n_cigar = nc;
376
+ memcpy(cigar, cn, 4 * nc);
377
+ p->len = nlen[0];
378
+ } else {
379
+ p->k = kk; p->len = nlen[1];
380
+ n_cigar = j - nc;
381
+ memcpy(cigar, cn + nc, 4 * (j - nc));
382
+ }
383
+ free(cn);
384
+ }
385
+ return n_cigar;
386
+ }
387
+
388
+ /* generate SAM lines for a sequence in ks with alignment stored in
389
+ * b. ks->name and ks->seq will be freed and set to NULL in the end. */
390
+ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b)
391
+ {
392
+ int i, k;
393
+ kstring_t str;
394
+ memset(&str, 0, sizeof(kstring_t));
395
+ if (b == 0 || b->n == 0) { // no hits
396
+ ksprintf(&str, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t", ks->name);
397
+ for (i = 0; i < ks->l; ++i) kputc(ks->seq[i], &str);
398
+ if (ks->qual) {
399
+ kputc('\t', &str);
400
+ for (i = 0; i < ks->l; ++i) kputc(ks->qual[i], &str);
401
+ } else kputs("\t*", &str);
402
+ kputc('\n', &str);
403
+ }
404
+ for (i = 0; b && i < b->n; ++i) {
405
+ bsw2hit_t *p = b->hits + i;
406
+ int32_t seqid = -1, coor = -1;
407
+ int j, qual, nn = 0;
408
+ int beg, end;
409
+ if (p->l == 0) {
410
+ b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]);
411
+ nn = bns_coor_pac2real(bns, p->k, p->len, &seqid);
412
+ coor = p->k - bns->anns[seqid].offset;
413
+ }
414
+ ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10);
415
+ ksprintf(&str, "\t%s\t%d", seqid>=0? bns->anns[seqid].name : "*", coor + 1);
416
+ if (p->l == 0) {
417
+ { // estimate mapping quality
418
+ float c = 1.0;
419
+ int subo = p->G2 > opt->t? p->G2 : opt->t;
420
+ if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5;
421
+ if (p->n_seeds < 2) c *= .2;
422
+ qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499);
423
+ if (qual > 250) qual = 250;
424
+ if (p->flag&1) qual = 0;
425
+ }
426
+ ksprintf(&str, "\t%d\t", qual);
427
+ for (k = 0; k < b->n_cigar[i]; ++k)
428
+ ksprintf(&str, "%d%c", b->cigar[i][k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[b->cigar[i][k]&0xf]);
429
+ } else ksprintf(&str, "\t0\t*");
430
+ ksprintf(&str, "\t*\t0\t0\t");
431
+ beg = 0; end = ks->l;
432
+ if (opt->hard_clip) {
433
+ if ((b->cigar[i][0]&0xf) == 4) beg += b->cigar[i][0]>>4;
434
+ if ((b->cigar[i][b->n_cigar[i]-1]&0xf) == 4) end -= b->cigar[i][b->n_cigar[i]-1]>>4;
435
+ }
436
+ for (j = beg; j < end; ++j) {
437
+ if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str);
438
+ else kputc(ks->seq[j], &str);
439
+ }
440
+ if (ks->qual) {
441
+ kputc('\t', &str);
442
+ for (j = beg; j < end; ++j) {
443
+ if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str);
444
+ else kputc(ks->qual[j], &str);
445
+ }
446
+ } else ksprintf(&str, "\t*");
447
+ ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tXN:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, nn);
448
+ if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1);
449
+ kputc('\n', &str);
450
+ }
451
+ ks->sam = str.s;
452
+ free(ks->seq); ks->seq = 0;
453
+ free(ks->qual); ks->qual = 0;
454
+ free(ks->name); ks->name = 0;
455
+ }
456
+
457
+ /* Core routine to align reads in _seq. It is separated from
458
+ * process_seqs() to realize multi-threading */
459
+ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target[2])
460
+ {
461
+ int x;
462
+ bsw2opt_t opt = *_opt;
463
+ bsw2global_t *pool = bsw2_global_init();
464
+ for (x = 0; x < _seq->n; ++x) {
465
+ bsw2seq1_t *p = _seq->seq + x;
466
+ uint8_t *seq[2], *rseq[2];
467
+ int i, l, k;
468
+ bwtsw2_t *b[2];
469
+ l = p->l;
470
+
471
+ #ifdef HAVE_PTHREAD
472
+ if (_opt->n_threads > 1) {
473
+ pthread_mutex_lock(&g_dbwtsw_lock);
474
+ if (p->tid < 0) p->tid = tid;
475
+ else if (p->tid != tid) {
476
+ pthread_mutex_unlock(&g_dbwtsw_lock);
477
+ continue;
478
+ } // in pinciple else should not happen
479
+ pthread_mutex_unlock(&g_dbwtsw_lock);
480
+ }
481
+ #endif
482
+
483
+ // set opt->t
484
+ opt.t = _opt->t;
485
+ if (opt.t < log(l) * opt.coef) opt.t = (int)(log(l) * opt.coef + .499);
486
+ if (pool->max_l < l) { // then enlarge working space for aln_extend_core()
487
+ int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l;
488
+ pool->max_l = l;
489
+ pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24);
490
+ }
491
+ // set opt->bw
492
+ opt.bw = _opt->bw;
493
+ k = (l * opt.a - 2 * opt.q) / (2 * opt.r + opt.a);
494
+ i = (l * opt.a - opt.a - opt.t) / opt.r;
495
+ if (k > i) k = i;
496
+ if (k < 1) k = 1; // I do not know if k==0 causes troubles
497
+ opt.bw = _opt->bw < k? _opt->bw : k;
498
+ // set seq[2] and rseq[2]
499
+ seq[0] = calloc(l * 4, 1);
500
+ seq[1] = seq[0] + l;
501
+ rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l;
502
+ // convert sequences to 2-bit representation
503
+ for (i = k = 0; i < l; ++i) {
504
+ int c = nst_nt4_table[(int)p->seq[i]];
505
+ if (c >= 4) { c = (int)(drand48() * 4); ++k; } // FIXME: ambiguous bases are not properly handled
506
+ seq[0][i] = c;
507
+ seq[1][l-1-i] = 3 - c;
508
+ rseq[0][l-1-i] = c;
509
+ rseq[1][i] = 3 - c;
510
+ }
511
+ if (l - k < opt.t) { // too few unambiguous bases
512
+ print_hits(bns, &opt, p, 0);
513
+ free(seq[0]); continue;
514
+ }
515
+ // alignment
516
+ b[0] = bsw2_aln1_core(&opt, bns, pac, target[0], l, seq, 0, pool);
517
+ for (k = 0; k < b[0]->n; ++k)
518
+ if (b[0]->hits[k].n_seeds < opt.t_seeds) break;
519
+ if (k < b[0]->n) {
520
+ b[1] = bsw2_aln1_core(&opt, bns, pac, target[1], l, rseq, 1, pool);
521
+ for (i = 0; i < b[1]->n; ++i) {
522
+ bsw2hit_t *p = b[1]->hits + i;
523
+ int x = p->beg;
524
+ p->beg = l - p->end;
525
+ p->end = l - x;
526
+ if (p->l == 0) p->k = bns->l_pac - (p->k + p->len);
527
+ }
528
+ flag_fr(b);
529
+ merge_hits(b, l, 0);
530
+ bsw2_resolve_duphits(0, b[0], 0);
531
+ bsw2_resolve_query_overlaps(b[0], opt.mask_level);
532
+ } else b[1] = 0;
533
+ // generate CIGAR and print SAM
534
+ gen_cigar(&opt, l, seq, pac, b[0]);
535
+ print_hits(bns, &opt, p, b[0]);
536
+ // free
537
+ free(seq[0]);
538
+ bsw2_destroy(b[0]);
539
+ }
540
+ bsw2_global_destroy(pool);
541
+ }
542
+
#ifdef HAVE_PTHREAD
/* Arguments of bsw2_aln_core() bundled for one worker thread. */
typedef struct {
    int tid;
    bsw2seq_t *_seq;
    const bsw2opt_t *_opt;
    const bntseq_t *bns;
    uint8_t *pac;
    bwt_t *target[2];
} thread_aux_t;

/* Thread entry point: unpack the thread_aux_t passed to pthread_create()
 * and run bsw2_aln_core() with it. Always returns a null pointer. */
static void *worker(void *data)
{
    thread_aux_t *aux = (thread_aux_t*)data;
    bsw2_aln_core(aux->tid, aux->_seq, aux->_opt, aux->bns, aux->pac, aux->target);
    return 0;
}
#endif
561
+
562
+ /* process sequences stored in _seq, generate SAM lines for these
563
+ * sequences and reset _seq afterwards. */
564
+ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target[2])
565
+ {
566
+ int i;
567
+
568
+ #ifdef HAVE_PTHREAD
569
+ if (opt->n_threads <= 1) {
570
+ bsw2_aln_core(0, _seq, opt, bns, pac, target);
571
+ } else {
572
+ pthread_t *tid;
573
+ pthread_attr_t attr;
574
+ thread_aux_t *data;
575
+ int j;
576
+ pthread_attr_init(&attr);
577
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
578
+ data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
579
+ tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
580
+ for (j = 0; j < opt->n_threads; ++j) {
581
+ thread_aux_t *p = data + j;
582
+ p->tid = j; p->_seq = _seq; p->_opt = opt; p->bns = bns;
583
+ p->pac = pac; p->target[0] = target[0]; p->target[1] = target[1];
584
+ pthread_create(&tid[j], &attr, worker, p);
585
+ }
586
+ for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
587
+ free(data); free(tid);
588
+ }
589
+ #else
590
+ bsw2_aln_core(0, _seq, opt, bns, pac, target);
591
+ #endif
592
+
593
+ // print and reset
594
+ for (i = 0; i < _seq->n; ++i) {
595
+ bsw2seq1_t *p = _seq->seq + i;
596
+ if (p->sam) printf("%s", p->sam);
597
+ free(p->name); free(p->seq); free(p->qual); free(p->sam);
598
+ p->tid = -1; p->l = 0;
599
+ p->name = p->seq = p->qual = p->sam = 0;
600
+ }
601
+ fflush(stdout);
602
+ _seq->n = 0;
603
+ }
604
+
605
+ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target[2], const char *fn)
606
+ {
607
+ gzFile fp;
608
+ kseq_t *ks;
609
+ int l, size = 0;
610
+ uint8_t *pac;
611
+ bsw2seq_t *_seq;
612
+
613
+ pac = calloc(bns->l_pac/4+1, 1);
614
+ if (pac == 0) {
615
+ fprintf(stderr, "[bsw2_aln] insufficient memory!\n");
616
+ return;
617
+ }
618
+ for (l = 0; l < bns->n_seqs; ++l)
619
+ printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len);
620
+ fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
621
+ fp = xzopen(fn, "r");
622
+ ks = kseq_init(fp);
623
+ _seq = calloc(1, sizeof(bsw2seq_t));
624
+ while ((l = kseq_read(ks)) >= 0) {
625
+ bsw2seq1_t *p;
626
+ if (_seq->n == _seq->max) {
627
+ _seq->max = _seq->max? _seq->max<<1 : 1024;
628
+ _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
629
+ }
630
+ p = &_seq->seq[_seq->n++];
631
+ p->tid = -1;
632
+ p->l = l;
633
+ p->name = strdup(ks->name.s);
634
+ p->seq = strdup(ks->seq.s);
635
+ p->qual = ks->qual.l? strdup(ks->qual.s) : 0;
636
+ p->sam = 0;
637
+ size += l;
638
+ if (size > opt->chunk_size) {
639
+ fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size);
640
+ process_seqs(_seq, opt, bns, pac, target);
641
+ size = 0;
642
+ }
643
+ }
644
+ fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size);
645
+ process_seqs(_seq, opt, bns, pac, target);
646
+ free(_seq->seq); free(_seq);
647
+ kseq_destroy(ks);
648
+ gzclose(fp);
649
+ free(pac);
650
+ }