bio-bwa 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
@@ -0,0 +1,105 @@
1
+ /*
2
+
3
+ BWTConstruct.h BWT-Index Construction
4
+
5
+ This module constructs BWT and auxiliary data structures.
6
+
7
+ Copyright (C) 2004, Wong Chi Kwong.
8
+
9
+ This program is free software; you can redistribute it and/or
10
+ modify it under the terms of the GNU General Public License
11
+ as published by the Free Software Foundation; either version 2
12
+ of the License, or (at your option) any later version.
13
+
14
+ This program is distributed in the hope that it will be useful,
15
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ GNU General Public License for more details.
18
+
19
+ You should have received a copy of the GNU General Public License
20
+ along with this program; if not, write to the Free Software
21
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22
+
23
+ */
24
+
25
+ #ifndef BWT_GEN_H
26
+ #define BWT_GEN_H
27
+
28
+ #define ALPHABET_SIZE 4 /* number of distinct symbols; 2^BIT_PER_CHAR */
29
+ #define BIT_PER_CHAR 2 /* bits used to encode one symbol */
30
+ #define CHAR_PER_WORD 16 /* BITS_IN_WORD / BIT_PER_CHAR */
31
+ #define CHAR_PER_BYTE 4 /* BITS_IN_BYTE / BIT_PER_CHAR */
32
+
33
+ #define BITS_IN_WORD 32 /* a "word" in this header is 32 bits wide */
34
+ #define BITS_IN_BYTE 8
35
+ #define BYTES_IN_WORD 4 /* BITS_IN_WORD / BITS_IN_BYTE */
36
+
37
+ #define ALL_ONE_MASK 0xFFFFFFFF /* 32-bit word with every bit set */
38
+ #define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536 /* 2^16; NOTE(review): presumably one entry per 16-bit pattern - confirm in bwt_gen.c */
39
+
40
+ #define BITS_PER_OCC_VALUE 16 /* width of one stored occurrence value */
41
+ #define OCC_VALUE_PER_WORD 2 /* BITS_IN_WORD / BITS_PER_OCC_VALUE */
42
+ #define OCC_INTERVAL 256 /* NOTE(review): presumably the spacing of minor occurrence checkpoints - confirm */
43
+ #define OCC_INTERVAL_MAJOR 65536 /* NOTE(review): presumably the spacing of major occurrence checkpoints - confirm */
44
+
45
+ #define TRUE 1 /* boolean constants used in place of <stdbool.h> */
46
+ #define FALSE 0
47
+
48
+ #define BWTINC_INSERT_SORT_NUM_ITEM 7 /* NOTE(review): looks like a cut-off below which insertion sort is used - confirm */
49
+
50
/*
 * Small arithmetic helpers. All of them are function-like macros, so every
 * argument may be evaluated more than once: do not pass expressions with
 * side effects (i++, function calls that mutate state, ...).
 */

/* Midpoint of two values without computing value1 + value2 (which could wrap):
 * the bits common to both, plus half of the bits in which they differ. */
#define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 )
#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) )
#define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) )
/* Median of three values. Arguments are parenthesised so that expression
 * arguments such as med3(x|y, b, c) group as the caller intends. */
#define med3(a, b, c) ( (a)<(b) ? ((b)<(c) ? (b) : (a)<(c) ? (c) : (a)) : ((b)>(c) ? (b) : (a)>(c) ? (c) : (a)) )
/* Exchange a and b using t as scratch. The definition is kept exactly as it
 * was for source compatibility: it expands to several statements (with a
 * leading empty statement), so it must only be used inside a braced block,
 * never as the sole body of an unbraced if/else/for/while. */
#define swap(a, b, t); t = a; a = b; b = t;
/* Clear the `offset` most significant bits of value. */
#define truncateLeft(value, offset) ( (value) << (offset) >> (offset) )
/* Clear the `offset` least significant bits of value. */
#define truncateRight(value, offset) ( (value) >> (offset) << (offset) )
/* True when sum has no bit set other than bits 8, 16 and 24 (the bits
 * cleared in the mask 0xfefefeff). The argument is parenthesised so that
 * expressions such as (x | y) are masked as a whole. */
#define DNA_OCC_SUM_EXCEPTION(sum) (((sum) & 0xfefefeff) == 0)
58
+
59
/* A pair of suffix-array indices delimiting a range. */
+ typedef struct SaIndexRange {
60
+ unsigned int startSaIndex; /* first SA index of the range */
61
+ unsigned int endSaIndex; /* last SA index of the range; NOTE(review): inclusive vs. exclusive is not shown in this header - confirm */
62
+ } SaIndexRange;
63
+
64
/*
 * BWT index and its auxiliary tables.
 * Scalar fields describe the text and sampling intervals; pointer fields
 * reference the tables themselves; the *Size / *SizeInWord fields record how
 * much memory was allocated for the corresponding table.
 */
+ typedef struct BWT {
65
+ unsigned int textLength; // length of the text
66
+ unsigned int saInterval; // interval between two SA values stored explicitly
67
+ unsigned int inverseSaInterval; // interval between two inverse SA stored explicitly
68
+ unsigned int inverseSa0; // SA-1[0], i.e. the inverse suffix array evaluated at 0
69
+ unsigned int *cumulativeFreq; // cumulative frequency
70
+ unsigned int *bwtCode; // BWT code
71
+ unsigned int *occValue; // Occurrence values stored explicitly
72
+ unsigned int *occValueMajor; // Occurrence values stored explicitly (NOTE(review): presumably the coarser OCC_INTERVAL_MAJOR checkpoints - confirm)
73
+ unsigned int *saValue; // SA values stored explicitly
74
+ unsigned int *inverseSa; // Inverse SA stored explicitly
75
+ SaIndexRange *saIndexRange; // SA index range
76
+ int saIndexRangeNumOfChar; // Number of characters indexed in SA index range
77
+ unsigned int *saValueOnBoundary; // Pre-calculated frequently referred data
78
+ unsigned int *decodeTable; // For decoding BWT by table lookup
79
+ unsigned int decodeTableGenerated; // == TRUE if decode table is generated on load and will be freed
80
+ unsigned int bwtSizeInWord; // Temporary variable to hold the memory allocated (for bwtCode, by name)
81
+ unsigned int occSizeInWord; // Temporary variable to hold the memory allocated (for occValue, by name)
82
+ unsigned int occMajorSizeInWord; // Temporary variable to hold the memory allocated (for occValueMajor, by name)
83
+ unsigned int saValueSize; // Temporary variable to hold the memory allocated (for saValue, by name)
84
+ unsigned int inverseSaSize; // Temporary variable to hold the memory allocated (for inverseSa, by name)
85
+ unsigned int saIndexRangeSize; // Temporary variable to hold the memory allocated (for saIndexRange, by name)
86
+ } BWT;
87
+
88
/*
 * State of an incremental BWT construction.
 * NOTE(review): the per-field notes below are inferred from the field names
 * only; the code that uses them is not part of this header - confirm against
 * bwt_gen.c before relying on them.
 */
+ typedef struct BWTInc {
89
+ BWT *bwt; /* the BWT structure this construction state refers to */
90
+ unsigned int numberOfIterationDone; /* presumably the count of build iterations completed so far */
91
+ unsigned int *cumulativeCountInCurrentBuild; /* presumably per-symbol cumulative counts for the current build step */
92
+ unsigned int availableWord; /* presumably the number of words available in workingMemory */
93
+ unsigned int targetTextLength; /* presumably the final text length being built towards */
94
+ float targetNBit; /* presumably a bits-per-character memory target */
95
+ unsigned int buildSize; /* presumably the number of characters handled in the current iteration */
96
+ unsigned int initialMaxBuildSize; /* presumably the build-size limit for the first iteration */
97
+ unsigned int incMaxBuildSize; /* presumably the build-size limit for later iterations */
98
+ unsigned int firstCharInLastIteration; /* presumably the first character of the previous iteration's text */
99
+ unsigned int *workingMemory; /* word-typed buffer; ownership is not shown here */
100
+ unsigned int *packedText; /* word-typed buffer, presumably the 2-bit packed text */
101
+ unsigned char *textBuffer; /* byte-typed buffer, presumably the unpacked text */
102
+ unsigned int *packedShift; /* presumably shift amounts used when packing characters into words */
103
+ } BWTInc;
104
+
105
+ #endif
data/ext/bwt_lite.c ADDED
@@ -0,0 +1,94 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include <stdio.h>
4
+ #include "bwt_lite.h"
5
+
6
+ int is_sa(const uint8_t *T, uint32_t *SA, int n);
7
+ int is_bwt(uint8_t *T, int n);
8
+
9
+ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq)
10
+ {
11
+ bwtl_t *b;
12
+ int i;
13
+ b = (bwtl_t*)calloc(1, sizeof(bwtl_t));
14
+ b->seq_len = len;
15
+
16
+ { // calculate b->bwt
17
+ uint8_t *s;
18
+ b->sa = (uint32_t*)calloc(len + 1, 4);
19
+ is_sa(seq, b->sa, len);
20
+ s = (uint8_t*)calloc(len + 1, 1);
21
+ for (i = 0; i <= len; ++i) {
22
+ if (b->sa[i] == 0) b->primary = i;
23
+ else s[i] = seq[b->sa[i] - 1];
24
+ }
25
+ for (i = b->primary; i < len; ++i) s[i] = s[i + 1];
26
+ b->bwt_size = (len + 15) / 16;
27
+ b->bwt = (uint32_t*)calloc(b->bwt_size, 4);
28
+ for (i = 0; i < len; ++i)
29
+ b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1);
30
+ free(s);
31
+ }
32
+ { // calculate b->occ
33
+ uint32_t c[4];
34
+ b->n_occ = (len + 15) / 16 * 4;
35
+ b->occ = (uint32_t*)calloc(b->n_occ, 4);
36
+ memset(c, 0, 16);
37
+ for (i = 0; i < len; ++i) {
38
+ if (i % 16 == 0)
39
+ memcpy(b->occ + (i/16) * 4, c, 16);
40
+ ++c[bwtl_B0(b, i)];
41
+ }
42
+ memcpy(b->L2+1, c, 16);
43
+ for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1];
44
+ }
45
+ { // generate cnt_table
46
+ for (i = 0; i != 256; ++i) {
47
+ u_int32_t j, x = 0;
48
+ for (j = 0; j != 4; ++j)
49
+ x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
50
+ b->cnt_table[i] = x;
51
+ }
52
+ }
53
+ return b;
54
+ }
55
+ inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
56
+ {
57
+ uint32_t n, b;
58
+ if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
59
+ if (k == (uint32_t)(-1)) return 0;
60
+ if (k >= bwt->primary) --k; // because $ is not in bwt
61
+ n = bwt->occ[k/16<<2|c];
62
+ b = bwt->bwt[k/16] & ~((1U<<((15-(k&15))<<1)) - 1);
63
+ n += (bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
64
+ + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]) >> (c<<3) & 0xff;
65
+ if (c == 0) n -= 15 - (k&15); // corrected for the masked bits
66
+ return n;
67
+ }
68
+ inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
69
+ {
70
+ uint32_t x, b;
71
+ if (k == (uint32_t)(-1)) {
72
+ memset(cnt, 0, 16);
73
+ return;
74
+ }
75
+ if (k >= bwt->primary) --k; // because $ is not in bwt
76
+ memcpy(cnt, bwt->occ + (k>>4<<2), 16);
77
+ b = bwt->bwt[k>>4] & ~((1U<<((~k&15)<<1)) - 1);
78
+ x = bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
79
+ + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24];
80
+ x -= 15 - (k&15);
81
+ cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
82
+ }
83
+ inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4])
84
+ {
85
+ bwtl_occ4(bwt, k, cntk);
86
+ bwtl_occ4(bwt, l, cntl);
87
+ }
88
+ void bwtl_destroy(bwtl_t *bwt)
89
+ {
90
+ if (bwt) {
91
+ free(bwt->occ); free(bwt->bwt); free(bwt->sa);
92
+ free(bwt);
93
+ }
94
+ }
data/ext/bwt_lite.h ADDED
@@ -0,0 +1,29 @@
1
#ifndef BWT_LITE_H_
#define BWT_LITE_H_

#include <stdint.h>

/* Light-weight BWT index over a 2-bit alphabet, built by bwtl_seq2bwtl(). */
typedef struct {
	uint32_t seq_len, bwt_size, n_occ; /* #symbols; #words in bwt; #entries in occ */
	uint32_t primary;                  /* suffix-array row whose value is 0 */
	uint32_t *bwt, *occ, *sa, L2[5];   /* packed BWT (16 symbols/word); occurrence
	                                      checkpoints (4 counters per 16 symbols);
	                                      suffix array (seq_len+1 entries);
	                                      cumulative symbol counts, L2[0] == 0 */
	uint32_t cnt_table[256];           /* byte -> per-symbol counts, one per 8-bit lane */
} bwtl_t;

/* The 2-bit symbol at position k of the packed BWT; symbol 0 of each word
 * sits in the two most significant bits. k is evaluated more than once. */
#define bwtl_B0(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)

#ifdef __cplusplus
extern "C" {
#endif

	/* Build an index for seq[0..len-1]; release it with bwtl_destroy(). */
	bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq);
	/* The three query functions are ordinary external functions. They were
	 * previously declared `inline` here without a definition in this header,
	 * which C99 does not allow for functions with external linkage. */
	/* Count of symbol c in BWT positions 0..k. */
	uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
	/* Counts of all four symbols in BWT positions 0..k. */
	void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
	/* bwtl_occ4() for two rows k and l at once. */
	void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
	/* Free an index; NULL is accepted. */
	void bwtl_destroy(bwtl_t *bwt);

#ifdef __cplusplus
}
#endif

#endif
data/ext/bwtaln.c ADDED
@@ -0,0 +1,345 @@
1
+ #include <stdio.h>
2
+ #include <unistd.h>
3
+ #include <math.h>
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <time.h>
7
+ #include <stdint.h>
8
+ #ifdef HAVE_CONFIG_H
9
+ #include "config.h"
10
+ #endif
11
+ #include "bwtaln.h"
12
+ #include "bwtgap.h"
13
+ #include "utils.h"
14
+
15
+ #ifdef HAVE_PTHREAD
16
+ #define THREAD_BLOCK_SIZE 1024
17
+ #include <pthread.h>
18
+ static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER;
19
+ #endif
20
+
21
+ gap_opt_t *gap_init_opt()
22
+ {
23
+ gap_opt_t *o;
24
+ o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t));
25
+ /* IMPORTANT: s_mm*10 should be about the average base error
26
+ rate. Voilating this requirement will break pairing! */
27
+ o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4;
28
+ o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6;
29
+ o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000;
30
+ o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
31
+ o->seed_len = 32; o->max_seed_diff = 2;
32
+ o->fnr = 0.04;
33
+ o->n_threads = 1;
34
+ o->max_top2 = 30;
35
+ o->trim_qual = 0;
36
+ return o;
37
+ }
38
+
39
+ int bwa_cal_maxdiff(int l, double err, double thres)
40
+ {
41
+ double elambda = exp(-l * err);
42
+ double sum, y = 1.0;
43
+ int k, x = 1;
44
+ for (k = 1, sum = elambda; k < 1000; ++k) {
45
+ y *= l * err;
46
+ x *= k;
47
+ sum += elambda * y / x;
48
+ if (1.0 - sum < thres) return k;
49
+ }
50
+ return 2;
51
+ }
52
+
53
+ // width must be filled as zero
54
+ static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width)
55
+ {
56
+ bwtint_t k, l, ok, ol;
57
+ int i, bid;
58
+ bid = 0;
59
+ k = 0; l = rbwt->seq_len;
60
+ for (i = 0; i < len; ++i) {
61
+ ubyte_t c = str[i];
62
+ if (c < 4) {
63
+ bwt_2occ(rbwt, k - 1, l, c, &ok, &ol);
64
+ k = rbwt->L2[c] + ok + 1;
65
+ l = rbwt->L2[c] + ol;
66
+ }
67
+ if (k > l || c > 3) { // then restart
68
+ k = 0;
69
+ l = rbwt->seq_len;
70
+ ++bid;
71
+ }
72
+ width[i].w = l - k + 1;
73
+ width[i].bid = bid;
74
+ }
75
+ width[len].w = 0;
76
+ width[len].bid = ++bid;
77
+ return bid;
78
+ }
79
+
80
/*
 * Run the gapped BWT search (bwt_match_gap) on the reads in seqs[0..n_seqs-1]
 * that belong to worker `tid`, storing the result in each read's aln / n_aln.
 *
 *   tid     id of the calling worker; only consulted when opt->n_threads > 1
 *   bwt     two indexes; bwt[0] is used with p->seq and bwt[1] with p->rseq
 *   n_seqs  number of reads in seqs
 *   seqs    the reads; name, seq, rseq and qual of every processed read are
 *           freed and set to 0 before returning
 *   opt     search options; a private copy (local_opt) is adjusted per read
 *
 * NOTE(review): the results of calloc/realloc are used without a NULL check.
 */
+ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
81
+ {
82
+ int i, max_l = 0, max_len;
83
+ gap_stack_t *stack;
84
+ bwt_width_t *w[2], *seed_w[2];
85
+ const ubyte_t *seq[2];
86
+ gap_opt_t local_opt = *opt;
87
+
88
+ // initiate priority stack, sized from the longest read in this batch
89
+ for (i = max_len = 0; i != n_seqs; ++i)
90
+ if (seqs[i].len > max_len) max_len = seqs[i].len;
91
+ if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr);
92
+ if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff;
93
+ stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);
94
+
95
+ seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); // seed_len entries plus the terminator
96
+ seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
97
+ w[0] = w[1] = 0; // grown on demand below
98
+ for (i = 0; i != n_seqs; ++i) {
99
+ bwa_seq_t *p = seqs + i;
100
+ #ifdef HAVE_PTHREAD
101
+ if (opt->n_threads > 1) {
102
+ pthread_mutex_lock(&g_seq_lock); // tid fields are shared between workers
103
+ if (p->tid < 0) { // unassigned: claim this read and the following ones, up to THREAD_BLOCK_SIZE in total
104
+ int j;
105
+ for (j = i; j < n_seqs && j < i + THREAD_BLOCK_SIZE; ++j)
106
+ seqs[j].tid = tid;
107
+ } else if (p->tid != tid) { // owned by another worker: skip it
108
+ pthread_mutex_unlock(&g_seq_lock);
109
+ continue;
110
+ }
111
+ pthread_mutex_unlock(&g_seq_lock);
112
+ }
113
+ #endif
114
+ p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0;
115
+ seq[0] = p->seq; seq[1] = p->rseq;
116
+ if (max_l < p->len) { // width buffers only ever grow; zeroed only when they grow
117
+ max_l = p->len;
118
+ w[0] = (bwt_width_t*)realloc(w[0], (max_l + 1) * sizeof(bwt_width_t));
119
+ w[1] = (bwt_width_t*)realloc(w[1], (max_l + 1) * sizeof(bwt_width_t));
120
+ memset(w[0], 0, (max_l + 1) * sizeof(bwt_width_t));
121
+ memset(w[1], 0, (max_l + 1) * sizeof(bwt_width_t));
122
+ }
123
+ bwt_cal_width(bwt[0], p->len, seq[0], w[0]);
124
+ bwt_cal_width(bwt[1], p->len, seq[1], w[1]);
125
+ if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); // per-read limit from this read's length
126
+ local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; // 0x7fffffff when the read is not longer than the seed
127
+ if (p->len > opt->seed_len) { // widths for the last seed_len symbols of each sequence
128
+ bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]);
129
+ bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]);
130
+ }
131
+ // core function; seed widths are passed as 0 when the read is not longer than the seed
132
+ p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
133
+ // the alignment is kept in p->aln / p->n_aln; the read's text buffers are released here
134
+ free(p->name); free(p->seq); free(p->rseq); free(p->qual);
135
+ p->name = 0; p->seq = p->rseq = p->qual = 0;
136
+ }
137
+ free(seed_w[0]); free(seed_w[1]);
138
+ free(w[0]); free(w[1]);
139
+ gap_destroy_stack(stack);
140
+ }
141
+
142
#ifdef HAVE_PTHREAD
/* Argument bundle handed to each alignment thread; it mirrors the
 * parameter list of bwa_cal_sa_reg_gap(). */
typedef struct {
	int tid;
	bwt_t *bwt[2];
	int n_seqs;
	bwa_seq_t *seqs;
	const gap_opt_t *opt;
} thread_aux_t;

/* pthread entry point: unpack the bundle and run the search. Always returns 0. */
static void *worker(void *data)
{
	thread_aux_t *aux = (thread_aux_t*)data;

	bwa_cal_sa_reg_gap(aux->tid, aux->bwt, aux->n_seqs, aux->seqs, aux->opt);
	return 0;
}
#endif
158
+
159
+ bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa)
160
+ {
161
+ bwa_seqio_t *ks;
162
+ if (mode & BWA_MODE_BAM) { // open BAM
163
+ int which = 0;
164
+ if (mode & BWA_MODE_BAM_SE) which |= 4;
165
+ if (mode & BWA_MODE_BAM_READ1) which |= 1;
166
+ if (mode & BWA_MODE_BAM_READ2) which |= 2;
167
+ if (which == 0) which = 7; // then read all reads
168
+ ks = bwa_bam_open(fn_fa, which);
169
+ } else ks = bwa_seq_open(fn_fa);
170
+ return ks;
171
+ }
172
+
173
+ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
174
+ {
175
+ int i, n_seqs, tot_seqs = 0;
176
+ bwa_seq_t *seqs;
177
+ bwa_seqio_t *ks;
178
+ clock_t t;
179
+ bwt_t *bwt[2];
180
+
181
+ // initialization
182
+ ks = bwa_open_reads(opt->mode, fn_fa);
183
+
184
+ { // load BWT
185
+ char *str = (char*)calloc(strlen(prefix) + 10, 1);
186
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
187
+ strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
188
+ free(str);
189
+ }
190
+
191
+ // core loop
192
+ fwrite(opt, sizeof(gap_opt_t), 1, stdout);
193
+ while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) {
194
+ tot_seqs += n_seqs;
195
+ t = clock();
196
+
197
+ fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... ");
198
+
199
+ #ifdef HAVE_PTHREAD
200
+ if (opt->n_threads <= 1) { // no multi-threading at all
201
+ bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
202
+ } else {
203
+ pthread_t *tid;
204
+ pthread_attr_t attr;
205
+ thread_aux_t *data;
206
+ int j;
207
+ pthread_attr_init(&attr);
208
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
209
+ data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
210
+ tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
211
+ for (j = 0; j < opt->n_threads; ++j) {
212
+ data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1];
213
+ data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt;
214
+ pthread_create(&tid[j], &attr, worker, data + j);
215
+ }
216
+ for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
217
+ free(data); free(tid);
218
+ }
219
+ #else
220
+ bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
221
+ #endif
222
+
223
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
224
+
225
+ t = clock();
226
+ fprintf(stderr, "[bwa_aln_core] write to the disk... ");
227
+ for (i = 0; i < n_seqs; ++i) {
228
+ bwa_seq_t *p = seqs + i;
229
+ fwrite(&p->n_aln, 4, 1, stdout);
230
+ if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout);
231
+ }
232
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
233
+
234
+ bwa_free_read_seq(n_seqs, seqs);
235
+ fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
236
+ }
237
+
238
+ // destroy
239
+ bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
240
+ bwa_seq_close(ks);
241
+ }
242
+
243
+ int bwa_aln(int argc, char *argv[])
244
+ {
245
+ int c, opte = -1;
246
+ gap_opt_t *opt;
247
+ optind = 1;
248
+ opt = gap_init_opt();
249
+ while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) {
250
+ switch (c) {
251
+ case 'n':
252
+ if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
253
+ else opt->max_diff = atoi(optarg), opt->fnr = -1.0;
254
+ break;
255
+ case 'o': opt->max_gapo = atoi(optarg); break;
256
+ case 'e': opte = atoi(optarg); break;
257
+ case 'M': opt->s_mm = atoi(optarg); break;
258
+ case 'O': opt->s_gapo = atoi(optarg); break;
259
+ case 'E': opt->s_gape = atoi(optarg); break;
260
+ case 'd': opt->max_del_occ = atoi(optarg); break;
261
+ case 'i': opt->indel_end_skip = atoi(optarg); break;
262
+ case 'l': opt->seed_len = atoi(optarg); break;
263
+ case 'k': opt->max_seed_diff = atoi(optarg); break;
264
+ case 'm': opt->max_entries = atoi(optarg); break;
265
+ case 't': opt->n_threads = atoi(optarg); break;
266
+ case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
267
+ case 'R': opt->max_top2 = atoi(optarg); break;
268
+ case 'q': opt->trim_qual = atoi(optarg); break;
269
+ case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
270
+ case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
271
+ case 'f': xreopen(optarg, "wb", stdout); break;
272
+ case 'b': opt->mode |= BWA_MODE_BAM; break;
273
+ case '0': opt->mode |= BWA_MODE_BAM_SE; break;
274
+ case '1': opt->mode |= BWA_MODE_BAM_READ1; break;
275
+ case '2': opt->mode |= BWA_MODE_BAM_READ2; break;
276
+ case 'I': opt->mode |= BWA_MODE_IL13; break;
277
+ case 'B': opt->mode |= atoi(optarg) << 24; break;
278
+ default: return 1;
279
+ }
280
+ }
281
+ if (opte > 0) {
282
+ opt->max_gape = opte;
283
+ opt->mode &= ~BWA_MODE_GAPE;
284
+ }
285
+
286
+ if (optind + 2 > argc) {
287
+ fprintf(stderr, "\n");
288
+ fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
289
+ fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
290
+ BWA_AVG_ERR, opt->fnr);
291
+ fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
292
+ fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n");
293
+ fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip);
294
+ fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ);
295
+ fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len);
296
+ fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff);
297
+ fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries);
298
+ fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
299
+ fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm);
300
+ fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo);
301
+ fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape);
302
+ fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
303
+ fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
304
+ fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
305
+ fprintf(stderr, " -B INT length of barcode\n");
306
+ fprintf(stderr, " -c input sequences are in the color space\n");
307
+ fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
308
+ fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
309
+ fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");
310
+ fprintf(stderr, " -b the input read file is in the BAM format\n");
311
+ fprintf(stderr, " -0 use single-end reads only (effective with -b)\n");
312
+ fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n");
313
+ fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n");
314
+ fprintf(stderr, "\n");
315
+ return 1;
316
+ }
317
+ if (opt->fnr > 0.0) {
318
+ int i, k;
319
+ for (i = 17, k = 0; i <= 250; ++i) {
320
+ int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
321
+ if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l);
322
+ k = l;
323
+ }
324
+ }
325
+ bwa_aln_core(argv[optind], argv[optind+1], opt);
326
+ free(opt);
327
+ fflush(stdout);
328
+ xreopen("/dev/tty","w",stdout);
329
+ return 0;
330
+ }
331
+
332
+ /* rgoya: Temporary clone of aln_path2cigar to accommodate bwa_cigar_t,
333
+ __cigar_op and __cigar_len while keeping stdaln stand alone.
   Builds the CIGAR with aln_path2cigar32() and converts it, in place, to
   bwa_cigar_t elements: for every element the low 4 bits of the 32-bit value
   and the remaining upper bits are passed to __cigar_create(). The number of
   elements is returned through n_cigar; the returned pointer is the buffer
   obtained from aln_path2cigar32().
   NOTE(review): the in-place rewrite assumes sizeof(bwa_cigar_t) <=
   sizeof(uint32_t), and a NULL result from aln_path2cigar32() is not
   handled - confirm both. */
334
+ bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
335
+ {
336
+ uint32_t *cigar32;
337
+ bwa_cigar_t *cigar;
338
+ int i;
339
+ cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar); /* const is cast away to match the callee's parameter type */
340
+ cigar = (bwa_cigar_t*)cigar32; /* both pointers refer to the same buffer */
341
+ for (i = 0; i < *n_cigar; ++i) /* ascending order matters: element i is read before slot i is overwritten */
342
+ cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) );
343
+ return cigar;
344
+ }
345
+