bio-bwa 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
@@ -0,0 +1,105 @@
1
+ /*
2
+
3
+ BWTConstruct.h BWT-Index Construction
4
+
5
+ This module constructs BWT and auxiliary data structures.
6
+
7
+ Copyright (C) 2004, Wong Chi Kwong.
8
+
9
+ This program is free software; you can redistribute it and/or
10
+ modify it under the terms of the GNU General Public License
11
+ as published by the Free Software Foundation; either version 2
12
+ of the License, or (at your option) any later version.
13
+
14
+ This program is distributed in the hope that it will be useful,
15
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ GNU General Public License for more details.
18
+
19
+ You should have received a copy of the GNU General Public License
20
+ along with this program; if not, write to the Free Software
21
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22
+
23
+ */
24
+
25
#ifndef BWT_GEN_H
#define BWT_GEN_H

/* Packing parameters: 2-bit characters, 16 of them per 32-bit word. */
#define ALPHABET_SIZE				4
#define BIT_PER_CHAR				2
#define CHAR_PER_WORD				16
#define CHAR_PER_BYTE				4

#define BITS_IN_WORD 32
#define BITS_IN_BYTE 8
#define BYTES_IN_WORD 4

#define ALL_ONE_MASK 0xFFFFFFFF
#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD	65536

/* Layout of the explicitly stored occurrence values. */
#define BITS_PER_OCC_VALUE			16
#define OCC_VALUE_PER_WORD			2
#define OCC_INTERVAL				256
#define OCC_INTERVAL_MAJOR			65536

#define TRUE    1
#define FALSE   0

#define BWTINC_INSERT_SORT_NUM_ITEM 7

/*
 * Helper macros.  Every argument is parenthesized so that expressions such
 * as `x & 7` can be passed safely.  Arguments may still be evaluated more
 * than once, so do not pass expressions with side effects.
 */
/* Average of two values without overflowing the intermediate sum. */
#define average(value1, value2)					( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 )
#define min(value1, value2)						( ((value1) < (value2)) ? (value1) : (value2) )
#define max(value1, value2)						( ((value1) > (value2)) ? (value1) : (value2) )
/* Median of three values. */
#define med3(a, b, c)							( (a) < (b) ? ((b) < (c) ? (b) : (a) < (c) ? (c) : (a)) : ((b) > (c) ? (b) : (a) > (c) ? (c) : (a)) )
/*
 * Exchange a and b using t as scratch storage.  The expansion is a single
 * braced block, so `if (cond) swap(a, b, t);` swaps only when cond holds.
 * It may be followed by a semicolon or not.
 */
#define swap(a, b, t)							{ (t) = (a); (a) = (b); (b) = (t); }
/* Clear the `offset` highest / lowest bits of value. */
#define truncateLeft(value, offset)				( (value) << (offset) >> (offset) )
#define truncateRight(value, offset)			( (value) >> (offset) << (offset) )
/* True when no bit selected by the mask 0xfefefeff is set in sum. */
#define DNA_OCC_SUM_EXCEPTION(sum)			(((sum) & 0xfefefeff) == 0)

/* A [start, end] pair of suffix-array indices. */
typedef struct SaIndexRange {
	unsigned int startSaIndex;
	unsigned int endSaIndex;
} SaIndexRange;

/* BWT index together with its auxiliary tables. */
typedef struct BWT {
	unsigned int textLength;			// length of the text
	unsigned int saInterval;			// interval between two SA values stored explicitly
	unsigned int inverseSaInterval;		// interval between two inverse SA stored explicitly
	unsigned int inverseSa0;			// SA-1[0]
	unsigned int *cumulativeFreq;		// cumulative frequency
	unsigned int *bwtCode;				// BWT code
	unsigned int *occValue;				// Occurrence values stored explicitly
	unsigned int *occValueMajor;		// Occurrence values stored explicitly (presumably one per OCC_INTERVAL_MAJOR -- TODO confirm)
	unsigned int *saValue;				// SA values stored explicitly
	unsigned int *inverseSa;			// Inverse SA stored explicitly
	SaIndexRange *saIndexRange;			// SA index range
	int saIndexRangeNumOfChar;			// Number of characters indexed in SA index range
	unsigned int *saValueOnBoundary;	// Pre-calculated frequently referred data
	unsigned int *decodeTable;			// For decoding BWT by table lookup
	unsigned int decodeTableGenerated;	// == TRUE if decode table is generated on load and will be freed
	unsigned int bwtSizeInWord;			// Temporary variable to hold the memory allocated
	unsigned int occSizeInWord;			// Temporary variable to hold the memory allocated
	unsigned int occMajorSizeInWord;	// Temporary variable to hold the memory allocated
	unsigned int saValueSize;			// Temporary variable to hold the memory allocated
	unsigned int inverseSaSize;			// Temporary variable to hold the memory allocated
	unsigned int saIndexRangeSize;		// Temporary variable to hold the memory allocated
} BWT;

/* State carried across iterations of the incremental BWT construction. */
typedef struct BWTInc {
	BWT *bwt;
	unsigned int numberOfIterationDone;
	unsigned int *cumulativeCountInCurrentBuild;
	unsigned int availableWord;
	unsigned int targetTextLength;
	float targetNBit;
	unsigned int buildSize;
	unsigned int initialMaxBuildSize;
	unsigned int incMaxBuildSize;
	unsigned int firstCharInLastIteration;
	unsigned int *workingMemory;
	unsigned int *packedText;
	unsigned char *textBuffer;
	unsigned int *packedShift;
} BWTInc;

#endif
data/ext/bwt_lite.c ADDED
@@ -0,0 +1,94 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include <stdio.h>
4
+ #include "bwt_lite.h"
5
+
6
+ int is_sa(const uint8_t *T, uint32_t *SA, int n);
7
+ int is_bwt(uint8_t *T, int n);
8
+
9
+ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq)
10
+ {
11
+ bwtl_t *b;
12
+ int i;
13
+ b = (bwtl_t*)calloc(1, sizeof(bwtl_t));
14
+ b->seq_len = len;
15
+
16
+ { // calculate b->bwt
17
+ uint8_t *s;
18
+ b->sa = (uint32_t*)calloc(len + 1, 4);
19
+ is_sa(seq, b->sa, len);
20
+ s = (uint8_t*)calloc(len + 1, 1);
21
+ for (i = 0; i <= len; ++i) {
22
+ if (b->sa[i] == 0) b->primary = i;
23
+ else s[i] = seq[b->sa[i] - 1];
24
+ }
25
+ for (i = b->primary; i < len; ++i) s[i] = s[i + 1];
26
+ b->bwt_size = (len + 15) / 16;
27
+ b->bwt = (uint32_t*)calloc(b->bwt_size, 4);
28
+ for (i = 0; i < len; ++i)
29
+ b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1);
30
+ free(s);
31
+ }
32
+ { // calculate b->occ
33
+ uint32_t c[4];
34
+ b->n_occ = (len + 15) / 16 * 4;
35
+ b->occ = (uint32_t*)calloc(b->n_occ, 4);
36
+ memset(c, 0, 16);
37
+ for (i = 0; i < len; ++i) {
38
+ if (i % 16 == 0)
39
+ memcpy(b->occ + (i/16) * 4, c, 16);
40
+ ++c[bwtl_B0(b, i)];
41
+ }
42
+ memcpy(b->L2+1, c, 16);
43
+ for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1];
44
+ }
45
+ { // generate cnt_table
46
+ for (i = 0; i != 256; ++i) {
47
+ u_int32_t j, x = 0;
48
+ for (j = 0; j != 4; ++j)
49
+ x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
50
+ b->cnt_table[i] = x;
51
+ }
52
+ }
53
+ return b;
54
+ }
55
+ inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
56
+ {
57
+ uint32_t n, b;
58
+ if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
59
+ if (k == (uint32_t)(-1)) return 0;
60
+ if (k >= bwt->primary) --k; // because $ is not in bwt
61
+ n = bwt->occ[k/16<<2|c];
62
+ b = bwt->bwt[k/16] & ~((1U<<((15-(k&15))<<1)) - 1);
63
+ n += (bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
64
+ + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]) >> (c<<3) & 0xff;
65
+ if (c == 0) n -= 15 - (k&15); // corrected for the masked bits
66
+ return n;
67
+ }
68
+ inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
69
+ {
70
+ uint32_t x, b;
71
+ if (k == (uint32_t)(-1)) {
72
+ memset(cnt, 0, 16);
73
+ return;
74
+ }
75
+ if (k >= bwt->primary) --k; // because $ is not in bwt
76
+ memcpy(cnt, bwt->occ + (k>>4<<2), 16);
77
+ b = bwt->bwt[k>>4] & ~((1U<<((~k&15)<<1)) - 1);
78
+ x = bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
79
+ + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24];
80
+ x -= 15 - (k&15);
81
+ cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
82
+ }
83
+ inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4])
84
+ {
85
+ bwtl_occ4(bwt, k, cntk);
86
+ bwtl_occ4(bwt, l, cntl);
87
+ }
88
+ void bwtl_destroy(bwtl_t *bwt)
89
+ {
90
+ if (bwt) {
91
+ free(bwt->occ); free(bwt->bwt); free(bwt->sa);
92
+ free(bwt);
93
+ }
94
+ }
data/ext/bwt_lite.h ADDED
@@ -0,0 +1,29 @@
1
#ifndef BWT_LITE_H_
#define BWT_LITE_H_

#include <stdint.h>

/* Light-weight BWT index over a 2-bit alphabet, built by bwtl_seq2bwtl(). */
typedef struct {
	uint32_t seq_len, bwt_size, n_occ; // number of symbols; words in bwt; words in occ
	uint32_t primary;                  // suffix-array row whose value is 0
	uint32_t *bwt, *occ, *sa, L2[5];   // packed BWT; per-16-symbol count checkpoints; suffix array; cumulative symbol counts
	uint32_t cnt_table[256];           // per byte value: count of each symbol, one count per byte
} bwtl_t;

/* Symbol (0..3) stored at position k of the packed BWT of index b. */
#define bwtl_B0(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)

#ifdef __cplusplus
extern "C" {
#endif

	/* Build an index for seq[0..len-1]; release it with bwtl_destroy(). */
	bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq);
	/*
	 * Occurrence queries.  These are ordinary external functions: the
	 * prototypes carry no `inline` specifier because C99 requires a function
	 * declared inline with external linkage to be defined in the same
	 * translation unit, which is not the case for includers of this header.
	 */
	uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
	void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
	void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
	/* Free an index; a null pointer is ignored. */
	void bwtl_destroy(bwtl_t *bwt);

#ifdef __cplusplus
}
#endif

#endif
data/ext/bwtaln.c ADDED
@@ -0,0 +1,345 @@
1
+ #include <stdio.h>
2
+ #include <unistd.h>
3
+ #include <math.h>
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <time.h>
7
+ #include <stdint.h>
8
+ #ifdef HAVE_CONFIG_H
9
+ #include "config.h"
10
+ #endif
11
+ #include "bwtaln.h"
12
+ #include "bwtgap.h"
13
+ #include "utils.h"
14
+
15
+ #ifdef HAVE_PTHREAD
16
+ #define THREAD_BLOCK_SIZE 1024
17
+ #include <pthread.h>
18
+ static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER;
19
+ #endif
20
+
21
+ gap_opt_t *gap_init_opt()
22
+ {
23
+ gap_opt_t *o;
24
+ o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t));
25
+ /* IMPORTANT: s_mm*10 should be about the average base error
26
+ rate. Voilating this requirement will break pairing! */
27
+ o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4;
28
+ o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6;
29
+ o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000;
30
+ o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
31
+ o->seed_len = 32; o->max_seed_diff = 2;
32
+ o->fnr = 0.04;
33
+ o->n_threads = 1;
34
+ o->max_top2 = 30;
35
+ o->trim_qual = 0;
36
+ return o;
37
+ }
38
+
39
+ int bwa_cal_maxdiff(int l, double err, double thres)
40
+ {
41
+ double elambda = exp(-l * err);
42
+ double sum, y = 1.0;
43
+ int k, x = 1;
44
+ for (k = 1, sum = elambda; k < 1000; ++k) {
45
+ y *= l * err;
46
+ x *= k;
47
+ sum += elambda * y / x;
48
+ if (1.0 - sum < thres) return k;
49
+ }
50
+ return 2;
51
+ }
52
+
53
+ // width must be filled as zero
54
+ static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width)
55
+ {
56
+ bwtint_t k, l, ok, ol;
57
+ int i, bid;
58
+ bid = 0;
59
+ k = 0; l = rbwt->seq_len;
60
+ for (i = 0; i < len; ++i) {
61
+ ubyte_t c = str[i];
62
+ if (c < 4) {
63
+ bwt_2occ(rbwt, k - 1, l, c, &ok, &ol);
64
+ k = rbwt->L2[c] + ok + 1;
65
+ l = rbwt->L2[c] + ol;
66
+ }
67
+ if (k > l || c > 3) { // then restart
68
+ k = 0;
69
+ l = rbwt->seq_len;
70
+ ++bid;
71
+ }
72
+ width[i].w = l - k + 1;
73
+ width[i].bid = bid;
74
+ }
75
+ width[len].w = 0;
76
+ width[len].bid = ++bid;
77
+ return bid;
78
+ }
79
+
80
+ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
81
+ {
82
+ int i, max_l = 0, max_len;
83
+ gap_stack_t *stack;
84
+ bwt_width_t *w[2], *seed_w[2];
85
+ const ubyte_t *seq[2];
86
+ gap_opt_t local_opt = *opt;
87
+
88
+ // initiate priority stack
89
+ for (i = max_len = 0; i != n_seqs; ++i)
90
+ if (seqs[i].len > max_len) max_len = seqs[i].len;
91
+ if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr);
92
+ if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff;
93
+ stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);
94
+
95
+ seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
96
+ seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
97
+ w[0] = w[1] = 0;
98
+ for (i = 0; i != n_seqs; ++i) {
99
+ bwa_seq_t *p = seqs + i;
100
+ #ifdef HAVE_PTHREAD
101
+ if (opt->n_threads > 1) {
102
+ pthread_mutex_lock(&g_seq_lock);
103
+ if (p->tid < 0) { // unassigned
104
+ int j;
105
+ for (j = i; j < n_seqs && j < i + THREAD_BLOCK_SIZE; ++j)
106
+ seqs[j].tid = tid;
107
+ } else if (p->tid != tid) {
108
+ pthread_mutex_unlock(&g_seq_lock);
109
+ continue;
110
+ }
111
+ pthread_mutex_unlock(&g_seq_lock);
112
+ }
113
+ #endif
114
+ p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0;
115
+ seq[0] = p->seq; seq[1] = p->rseq;
116
+ if (max_l < p->len) {
117
+ max_l = p->len;
118
+ w[0] = (bwt_width_t*)realloc(w[0], (max_l + 1) * sizeof(bwt_width_t));
119
+ w[1] = (bwt_width_t*)realloc(w[1], (max_l + 1) * sizeof(bwt_width_t));
120
+ memset(w[0], 0, (max_l + 1) * sizeof(bwt_width_t));
121
+ memset(w[1], 0, (max_l + 1) * sizeof(bwt_width_t));
122
+ }
123
+ bwt_cal_width(bwt[0], p->len, seq[0], w[0]);
124
+ bwt_cal_width(bwt[1], p->len, seq[1], w[1]);
125
+ if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr);
126
+ local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff;
127
+ if (p->len > opt->seed_len) {
128
+ bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]);
129
+ bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]);
130
+ }
131
+ // core function
132
+ p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
133
+ // store the alignment
134
+ free(p->name); free(p->seq); free(p->rseq); free(p->qual);
135
+ p->name = 0; p->seq = p->rseq = p->qual = 0;
136
+ }
137
+ free(seed_w[0]); free(seed_w[1]);
138
+ free(w[0]); free(w[1]);
139
+ gap_destroy_stack(stack);
140
+ }
141
+
142
#ifdef HAVE_PTHREAD
/* Arguments handed to one worker thread. */
typedef struct {
	int tid;
	bwt_t *bwt[2];
	int n_seqs;
	bwa_seq_t *seqs;
	const gap_opt_t *opt;
} thread_aux_t;

/* Thread entry point: unpack the arguments and process the batch. */
static void *worker(void *data)
{
	thread_aux_t *aux = (thread_aux_t*)data;

	bwa_cal_sa_reg_gap(aux->tid, aux->bwt, aux->n_seqs, aux->seqs, aux->opt);
	return 0;
}
#endif
158
+
159
+ bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa)
160
+ {
161
+ bwa_seqio_t *ks;
162
+ if (mode & BWA_MODE_BAM) { // open BAM
163
+ int which = 0;
164
+ if (mode & BWA_MODE_BAM_SE) which |= 4;
165
+ if (mode & BWA_MODE_BAM_READ1) which |= 1;
166
+ if (mode & BWA_MODE_BAM_READ2) which |= 2;
167
+ if (which == 0) which = 7; // then read all reads
168
+ ks = bwa_bam_open(fn_fa, which);
169
+ } else ks = bwa_seq_open(fn_fa);
170
+ return ks;
171
+ }
172
+
173
+ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
174
+ {
175
+ int i, n_seqs, tot_seqs = 0;
176
+ bwa_seq_t *seqs;
177
+ bwa_seqio_t *ks;
178
+ clock_t t;
179
+ bwt_t *bwt[2];
180
+
181
+ // initialization
182
+ ks = bwa_open_reads(opt->mode, fn_fa);
183
+
184
+ { // load BWT
185
+ char *str = (char*)calloc(strlen(prefix) + 10, 1);
186
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
187
+ strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
188
+ free(str);
189
+ }
190
+
191
+ // core loop
192
+ fwrite(opt, sizeof(gap_opt_t), 1, stdout);
193
+ while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) {
194
+ tot_seqs += n_seqs;
195
+ t = clock();
196
+
197
+ fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... ");
198
+
199
+ #ifdef HAVE_PTHREAD
200
+ if (opt->n_threads <= 1) { // no multi-threading at all
201
+ bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
202
+ } else {
203
+ pthread_t *tid;
204
+ pthread_attr_t attr;
205
+ thread_aux_t *data;
206
+ int j;
207
+ pthread_attr_init(&attr);
208
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
209
+ data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
210
+ tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
211
+ for (j = 0; j < opt->n_threads; ++j) {
212
+ data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1];
213
+ data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt;
214
+ pthread_create(&tid[j], &attr, worker, data + j);
215
+ }
216
+ for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
217
+ free(data); free(tid);
218
+ }
219
+ #else
220
+ bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
221
+ #endif
222
+
223
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
224
+
225
+ t = clock();
226
+ fprintf(stderr, "[bwa_aln_core] write to the disk... ");
227
+ for (i = 0; i < n_seqs; ++i) {
228
+ bwa_seq_t *p = seqs + i;
229
+ fwrite(&p->n_aln, 4, 1, stdout);
230
+ if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout);
231
+ }
232
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
233
+
234
+ bwa_free_read_seq(n_seqs, seqs);
235
+ fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
236
+ }
237
+
238
+ // destroy
239
+ bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
240
+ bwa_seq_close(ks);
241
+ }
242
+
243
+ int bwa_aln(int argc, char *argv[])
244
+ {
245
+ int c, opte = -1;
246
+ gap_opt_t *opt;
247
+ optind = 1;
248
+ opt = gap_init_opt();
249
+ while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) {
250
+ switch (c) {
251
+ case 'n':
252
+ if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
253
+ else opt->max_diff = atoi(optarg), opt->fnr = -1.0;
254
+ break;
255
+ case 'o': opt->max_gapo = atoi(optarg); break;
256
+ case 'e': opte = atoi(optarg); break;
257
+ case 'M': opt->s_mm = atoi(optarg); break;
258
+ case 'O': opt->s_gapo = atoi(optarg); break;
259
+ case 'E': opt->s_gape = atoi(optarg); break;
260
+ case 'd': opt->max_del_occ = atoi(optarg); break;
261
+ case 'i': opt->indel_end_skip = atoi(optarg); break;
262
+ case 'l': opt->seed_len = atoi(optarg); break;
263
+ case 'k': opt->max_seed_diff = atoi(optarg); break;
264
+ case 'm': opt->max_entries = atoi(optarg); break;
265
+ case 't': opt->n_threads = atoi(optarg); break;
266
+ case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
267
+ case 'R': opt->max_top2 = atoi(optarg); break;
268
+ case 'q': opt->trim_qual = atoi(optarg); break;
269
+ case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
270
+ case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
271
+ case 'f': xreopen(optarg, "wb", stdout); break;
272
+ case 'b': opt->mode |= BWA_MODE_BAM; break;
273
+ case '0': opt->mode |= BWA_MODE_BAM_SE; break;
274
+ case '1': opt->mode |= BWA_MODE_BAM_READ1; break;
275
+ case '2': opt->mode |= BWA_MODE_BAM_READ2; break;
276
+ case 'I': opt->mode |= BWA_MODE_IL13; break;
277
+ case 'B': opt->mode |= atoi(optarg) << 24; break;
278
+ default: return 1;
279
+ }
280
+ }
281
+ if (opte > 0) {
282
+ opt->max_gape = opte;
283
+ opt->mode &= ~BWA_MODE_GAPE;
284
+ }
285
+
286
+ if (optind + 2 > argc) {
287
+ fprintf(stderr, "\n");
288
+ fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
289
+ fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
290
+ BWA_AVG_ERR, opt->fnr);
291
+ fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
292
+ fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n");
293
+ fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip);
294
+ fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ);
295
+ fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len);
296
+ fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff);
297
+ fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries);
298
+ fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
299
+ fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm);
300
+ fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo);
301
+ fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape);
302
+ fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
303
+ fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
304
+ fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
305
+ fprintf(stderr, " -B INT length of barcode\n");
306
+ fprintf(stderr, " -c input sequences are in the color space\n");
307
+ fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
308
+ fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
309
+ fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");
310
+ fprintf(stderr, " -b the input read file is in the BAM format\n");
311
+ fprintf(stderr, " -0 use single-end reads only (effective with -b)\n");
312
+ fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n");
313
+ fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n");
314
+ fprintf(stderr, "\n");
315
+ return 1;
316
+ }
317
+ if (opt->fnr > 0.0) {
318
+ int i, k;
319
+ for (i = 17, k = 0; i <= 250; ++i) {
320
+ int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
321
+ if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l);
322
+ k = l;
323
+ }
324
+ }
325
+ bwa_aln_core(argv[optind], argv[optind+1], opt);
326
+ free(opt);
327
+ fflush(stdout);
328
+ xreopen("/dev/tty","w",stdout);
329
+ return 0;
330
+ }
331
+
332
+ /* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
333
+ __cigar_op and __cigar_len while keeping stdaln stand alone */
334
+ bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
335
+ {
336
+ uint32_t *cigar32;
337
+ bwa_cigar_t *cigar;
338
+ int i;
339
+ cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar);
340
+ cigar = (bwa_cigar_t*)cigar32;
341
+ for (i = 0; i < *n_cigar; ++i)
342
+ cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) );
343
+ return cigar;
344
+ }
345
+