bio-bwa 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bwase.h ADDED
@@ -0,0 +1,27 @@
1
+ #ifndef BWASE_H
2
+ #define BWASE_H
3
+
4
+ #include "bntseq.h"
5
+ #include "bwt.h"
6
+ #include "bwtaln.h"
7
+
8
+ #ifdef __cplusplus
9
+ extern "C" {
10
+ #endif
11
+
12
+ // Initialize mapping tables in the bwa single-end mapper.
13
+ void bwase_initialize();
14
+ // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
15
+ void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
16
+ // Refine the approximate position of the sequence to an actual placement for the sequence.
17
+ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
18
+ // Backfill certain alignment properties mainly centering around number of matches.
19
+ void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
20
+ // Calculate the end position of a read given a certain sequence.
21
+ int64_t pos_end(const bwa_seq_t *p);
22
+
23
+ #ifdef __cplusplus
24
+ }
25
+ #endif
26
+
27
+ #endif // BWASE_H
data/ext/bwaseqio.c ADDED
@@ -0,0 +1,222 @@
1
+ #include <zlib.h>
2
+ #include <ctype.h>
3
+ #include "bwtaln.h"
4
+ #include "utils.h"
5
+ #include "bamlite.h"
6
+
7
+ #include "kseq.h"
8
+ KSEQ_INIT(gzFile, gzread)
9
+
10
+ extern unsigned char nst_nt4_table[256];
11
+ static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
12
+
13
+ struct __bwa_seqio_t {
14
+ // for BAM input
15
+ int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE
16
+ bamFile fp;
17
+ // for fastq input
18
+ kseq_t *ks;
19
+ };
20
+
21
+ bwa_seqio_t *bwa_bam_open(const char *fn, int which)
22
+ {
23
+ bwa_seqio_t *bs;
24
+ bam_header_t *h;
25
+ bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
26
+ bs->is_bam = 1;
27
+ bs->which = which;
28
+ bs->fp = bam_open(fn, "r");
29
+ h = bam_header_read(bs->fp);
30
+ bam_header_destroy(h);
31
+ return bs;
32
+ }
33
+
34
+ bwa_seqio_t *bwa_seq_open(const char *fn)
35
+ {
36
+ gzFile fp;
37
+ bwa_seqio_t *bs;
38
+ bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
39
+ fp = xzopen(fn, "r");
40
+ bs->ks = kseq_init(fp);
41
+ return bs;
42
+ }
43
+
44
+ void bwa_seq_close(bwa_seqio_t *bs)
45
+ {
46
+ if (bs == 0) return;
47
+ if (bs->is_bam) bam_close(bs->fp);
48
+ else {
49
+ gzclose(bs->ks->f->f);
50
+ kseq_destroy(bs->ks);
51
+ }
52
+ free(bs);
53
+ }
54
+
55
+ void seq_reverse(int len, ubyte_t *seq, int is_comp)
56
+ {
57
+ int i;
58
+ if (is_comp) {
59
+ for (i = 0; i < len>>1; ++i) {
60
+ char tmp = seq[len-1-i];
61
+ if (tmp < 4) tmp = 3 - tmp;
62
+ seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
63
+ seq[i] = tmp;
64
+ }
65
+ if (len&1) seq[i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
66
+ } else {
67
+ for (i = 0; i < len>>1; ++i) {
68
+ char tmp = seq[len-1-i];
69
+ seq[len-1-i] = seq[i]; seq[i] = tmp;
70
+ }
71
+ }
72
+ }
73
+
74
+ int bwa_trim_read(int trim_qual, bwa_seq_t *p)
75
+ {
76
+ int s = 0, l, max = 0, max_l = p->len - 1;
77
+ if (trim_qual < 1 || p->qual == 0) return 0;
78
+ for (l = p->len - 1; l >= BWA_MIN_RDLEN - 1; --l) {
79
+ s += trim_qual - (p->qual[l] - 33);
80
+ if (s < 0) break;
81
+ if (s > max) {
82
+ max = s; max_l = l;
83
+ }
84
+ }
85
+ p->clip_len = p->len = max_l + 1;
86
+ return p->full_len - p->len;
87
+ }
88
+
89
+ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
90
+ {
91
+ bwa_seq_t *seqs, *p;
92
+ int n_seqs, l, i;
93
+ long n_trimmed = 0, n_tot = 0;
94
+ bam1_t *b;
95
+
96
+ b = bam_init1();
97
+ n_seqs = 0;
98
+ seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
99
+ while (bam_read1(bs->fp, b) >= 0) {
100
+ uint8_t *s, *q;
101
+ int go = 0;
102
+ if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
103
+ if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
104
+ if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1;
105
+ if (go == 0) continue;
106
+ l = b->core.l_qseq;
107
+ p = &seqs[n_seqs++];
108
+ p->tid = -1; // no assigned to a thread
109
+ p->qual = 0;
110
+ p->full_len = p->clip_len = p->len = l;
111
+ n_tot += p->full_len;
112
+ s = bam1_seq(b); q = bam1_qual(b);
113
+ p->seq = (ubyte_t*)calloc(p->len + 1, 1);
114
+ p->qual = (ubyte_t*)calloc(p->len + 1, 1);
115
+ for (i = 0; i != p->full_len; ++i) {
116
+ p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
117
+ p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
118
+ }
119
+ if (bam1_strand(b)) { // then reverse
120
+ seq_reverse(p->len, p->seq, 1);
121
+ seq_reverse(p->len, p->qual, 0);
122
+ }
123
+ if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
124
+ p->rseq = (ubyte_t*)calloc(p->full_len, 1);
125
+ memcpy(p->rseq, p->seq, p->len);
126
+ seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
127
+ seq_reverse(p->len, p->rseq, is_comp);
128
+ p->name = strdup((const char*)bam1_qname(b));
129
+ if (n_seqs == n_needed) break;
130
+ }
131
+ *n = n_seqs;
132
+ if (n_seqs && trim_qual >= 1)
133
+ fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
134
+ if (n_seqs == 0) {
135
+ free(seqs);
136
+ bam_destroy1(b);
137
+ return 0;
138
+ }
139
+ bam_destroy1(b);
140
+ return seqs;
141
+ }
142
+
143
+ #define BARCODE_LOW_QUAL 13
144
+
145
+ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
146
+ {
147
+ bwa_seq_t *seqs, *p;
148
+ kseq_t *seq = bs->ks;
149
+ int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
150
+ long n_trimmed = 0, n_tot = 0;
151
+
152
+ if (l_bc > 15) {
153
+ fprintf(stderr, "[%s] the maximum barcode length is 15.\n", __func__);
154
+ return 0;
155
+ }
156
+ if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
157
+ n_seqs = 0;
158
+ seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
159
+ while ((l = kseq_read(seq)) >= 0) {
160
+ if (is_64 && seq->qual.l)
161
+ for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
162
+ if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
163
+ p = &seqs[n_seqs++];
164
+ if (l_bc) { // then trim barcode
165
+ for (i = 0; i < l_bc; ++i)
166
+ p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]);
167
+ p->bc[i] = 0;
168
+ for (; i < seq->seq.l; ++i)
169
+ seq->seq.s[i - l_bc] = seq->seq.s[i];
170
+ seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
171
+ if (seq->qual.l) {
172
+ for (i = l_bc; i < seq->qual.l; ++i)
173
+ seq->qual.s[i - l_bc] = seq->qual.s[i];
174
+ seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
175
+ }
176
+ l = seq->seq.l;
177
+ } else p->bc[0] = 0;
178
+ p->tid = -1; // no assigned to a thread
179
+ p->qual = 0;
180
+ p->full_len = p->clip_len = p->len = l;
181
+ n_tot += p->full_len;
182
+ p->seq = (ubyte_t*)calloc(p->len, 1);
183
+ for (i = 0; i != p->full_len; ++i)
184
+ p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]];
185
+ if (seq->qual.l) { // copy quality
186
+ p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
187
+ if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
188
+ }
189
+ p->rseq = (ubyte_t*)calloc(p->full_len, 1);
190
+ memcpy(p->rseq, p->seq, p->len);
191
+ seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
192
+ seq_reverse(p->len, p->rseq, is_comp);
193
+ p->name = strdup((const char*)seq->name.s);
194
+ { // trim /[12]$
195
+ int t = strlen(p->name);
196
+ if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
197
+ }
198
+ if (n_seqs == n_needed) break;
199
+ }
200
+ *n = n_seqs;
201
+ if (n_seqs && trim_qual >= 1)
202
+ fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
203
+ if (n_seqs == 0) {
204
+ free(seqs);
205
+ return 0;
206
+ }
207
+ return seqs;
208
+ }
209
+
210
+ void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
211
+ {
212
+ int i, j;
213
+ for (i = 0; i != n_seqs; ++i) {
214
+ bwa_seq_t *p = seqs + i;
215
+ for (j = 0; j < p->n_multi; ++j)
216
+ if (p->multi[j].cigar) free(p->multi[j].cigar);
217
+ free(p->name);
218
+ free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
219
+ free(p->cigar);
220
+ }
221
+ free(seqs);
222
+ }
data/ext/bwt.c ADDED
@@ -0,0 +1,250 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008 Genome Research Ltd (GRL).
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
+
28
+ #include <stdlib.h>
29
+ #include <stdio.h>
30
+ #include <string.h>
31
+ #include <assert.h>
32
+ #include <stdint.h>
33
+ #include "utils.h"
34
+ #include "bwt.h"
35
+
36
+ void bwt_gen_cnt_table(bwt_t *bwt)
37
+ {
38
+ int i, j;
39
+ for (i = 0; i != 256; ++i) {
40
+ uint32_t x = 0;
41
+ for (j = 0; j != 4; ++j)
42
+ x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
43
+ bwt->cnt_table[i] = x;
44
+ }
45
+ }
46
+
47
+ // bwt->bwt and bwt->occ must be precalculated
48
+ void bwt_cal_sa(bwt_t *bwt, int intv)
49
+ {
50
+ bwtint_t isa, sa, i; // S(isa) = sa
51
+
52
+ xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
53
+
54
+ if (bwt->sa) free(bwt->sa);
55
+ bwt->sa_intv = intv;
56
+ bwt->n_sa = (bwt->seq_len + intv) / intv;
57
+ bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
58
+ // calculate SA value
59
+ isa = 0; sa = bwt->seq_len;
60
+ for (i = 0; i < bwt->seq_len; ++i) {
61
+ if (isa % intv == 0) bwt->sa[isa/intv] = sa;
62
+ --sa;
63
+ isa = bwt_invPsi(bwt, isa);
64
+ }
65
+ if (isa % intv == 0) bwt->sa[isa/intv] = sa;
66
+ bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
67
+ }
68
+
69
+ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
70
+ {
71
+ bwtint_t sa = 0;
72
+ while (k % bwt->sa_intv != 0) {
73
+ ++sa;
74
+ k = bwt_invPsi(bwt, k);
75
+ }
76
+ /* without setting bwt->sa[0] = -1, the following line should be
77
+ changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
78
+ return sa + bwt->sa[k/bwt->sa_intv];
79
+ }
80
+
81
/*
 * Count how many of the thirty-two 2-bit cells packed in `y` equal the
 * low two bits of `c`.
 */
static inline int __occ_aux(uint64_t y, int c)
{
	// per cell: pick the bit or its inverse so that a matching cell yields 1 in both planes
	const uint64_t high_plane = (c & 2) ? y : ~y;
	const uint64_t low_plane = (c & 1) ? y : ~y;
	// one flag in the low bit of every matching cell
	uint64_t hits = (high_plane >> 1) & low_plane & 0x5555555555555555ull;
	// fold the flags: pairs of cells, then bytes, then the sum of all bytes in the top byte
	hits = (hits & 0x3333333333333333ull) + ((hits >> 2) & 0x3333333333333333ull);
	hits = (hits + (hits >> 4)) & 0x0f0f0f0f0f0f0f0full;
	return (int)((hits * 0x0101010101010101ull) >> 56);
}
89
+
90
+ inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
91
+ {
92
+ bwtint_t n, l, j;
93
+ uint32_t *p;
94
+
95
+ if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
96
+ if (k == (bwtint_t)(-1)) return 0;
97
+ if (k >= bwt->primary) --k; // because $ is not in bwt
98
+
99
+ // retrieve Occ at k/OCC_INTERVAL
100
+ n = (p = bwt_occ_intv(bwt, k))[c];
101
+ p += 4; // jump to the start of the first BWT cell
102
+
103
+ // calculate Occ up to the last k/32
104
+ j = k >> 5 << 5;
105
+ for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2)
106
+ n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
107
+
108
+ // calculate Occ
109
+ n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
110
+ if (c == 0) n -= ~k&31; // corrected for the masked bits
111
+
112
+ return n;
113
+ }
114
+
115
+ // an analogy to bwt_occ() but more efficient, requiring k <= l
116
+ inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
117
+ {
118
+ bwtint_t _k, _l;
119
+ if (k == l) {
120
+ *ok = *ol = bwt_occ(bwt, k, c);
121
+ return;
122
+ }
123
+ _k = (k >= bwt->primary)? k-1 : k;
124
+ _l = (l >= bwt->primary)? l-1 : l;
125
+ if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
126
+ *ok = bwt_occ(bwt, k, c);
127
+ *ol = bwt_occ(bwt, l, c);
128
+ } else {
129
+ bwtint_t m, n, i, j;
130
+ uint32_t *p;
131
+ if (k >= bwt->primary) --k;
132
+ if (l >= bwt->primary) --l;
133
+ n = (p = bwt_occ_intv(bwt, k))[c];
134
+ p += 4;
135
+ // calculate *ok
136
+ j = k >> 5 << 5;
137
+ for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2)
138
+ n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
139
+ m = n;
140
+ n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
141
+ if (c == 0) n -= ~k&31; // corrected for the masked bits
142
+ *ok = n;
143
+ // calculate *ol
144
+ j = l >> 5 << 5;
145
+ for (; i < j; i += 32, p += 2)
146
+ m += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
147
+ m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c);
148
+ if (c == 0) m -= ~l&31; // corrected for the masked bits
149
+ *ol = m;
150
+ }
151
+ }
152
+
153
/*
 * Sum of the cnt_table entries of the four bytes of `b`.
 * Both arguments are evaluated more than once, so pass expressions
 * without side effects.
 */
#define __occ_aux4(bwt, b) \
	((bwt)->cnt_table[(b)>>24] + (bwt)->cnt_table[(b)>>16&0xff] \
	+ (bwt)->cnt_table[(b)>>8&0xff] + (bwt)->cnt_table[(b)&0xff])
156
+
157
+ inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
158
+ {
159
+ bwtint_t l, j, x;
160
+ uint32_t *p;
161
+ if (k == (bwtint_t)(-1)) {
162
+ memset(cnt, 0, 4 * sizeof(bwtint_t));
163
+ return;
164
+ }
165
+ if (k >= bwt->primary) --k; // because $ is not in bwt
166
+ p = bwt_occ_intv(bwt, k);
167
+ memcpy(cnt, p, 16);
168
+ p += 4;
169
+ j = k >> 4 << 4;
170
+ for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p)
171
+ x += __occ_aux4(bwt, *p);
172
+ x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
173
+ cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
174
+ }
175
+
176
+ // an analogy to bwt_occ4() but more efficient, requiring k <= l
177
+ inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
178
+ {
179
+ bwtint_t _k, _l;
180
+ if (k == l) {
181
+ bwt_occ4(bwt, k, cntk);
182
+ memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
183
+ return;
184
+ }
185
+ _k = (k >= bwt->primary)? k-1 : k;
186
+ _l = (l >= bwt->primary)? l-1 : l;
187
+ if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
188
+ bwt_occ4(bwt, k, cntk);
189
+ bwt_occ4(bwt, l, cntl);
190
+ } else {
191
+ bwtint_t i, j, x, y;
192
+ uint32_t *p;
193
+ int cl[4];
194
+ if (k >= bwt->primary) --k; // because $ is not in bwt
195
+ if (l >= bwt->primary) --l;
196
+ cl[0] = cl[1] = cl[2] = cl[3] = 0;
197
+ p = bwt_occ_intv(bwt, k);
198
+ memcpy(cntk, p, 4 * sizeof(bwtint_t));
199
+ p += 4;
200
+ // prepare cntk[]
201
+ j = k >> 4 << 4;
202
+ for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p)
203
+ x += __occ_aux4(bwt, *p);
204
+ y = x;
205
+ x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
206
+ // calculate cntl[] and finalize cntk[]
207
+ j = l >> 4 << 4;
208
+ for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p);
209
+ y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15);
210
+ memcpy(cntl, cntk, 16);
211
+ cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
212
+ cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
213
+ }
214
+ }
215
+
216
+ int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
217
+ {
218
+ bwtint_t k, l, ok, ol;
219
+ int i;
220
+ k = 0; l = bwt->seq_len;
221
+ for (i = len - 1; i >= 0; --i) {
222
+ ubyte_t c = str[i];
223
+ if (c > 3) return 0; // no match
224
+ bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
225
+ k = bwt->L2[c] + ok + 1;
226
+ l = bwt->L2[c] + ol;
227
+ if (k > l) break; // no match
228
+ }
229
+ if (k > l) return 0; // no match
230
+ if (sa_begin) *sa_begin = k;
231
+ if (sa_end) *sa_end = l;
232
+ return l - k + 1;
233
+ }
234
+
235
+ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0)
236
+ {
237
+ int i;
238
+ bwtint_t k, l, ok, ol;
239
+ k = *k0; l = *l0;
240
+ for (i = len - 1; i >= 0; --i) {
241
+ ubyte_t c = str[i];
242
+ if (c > 3) return 0; // there is an N here. no match
243
+ bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
244
+ k = bwt->L2[c] + ok + 1;
245
+ l = bwt->L2[c] + ol;
246
+ if (k > l) return 0; // no match
247
+ }
248
+ *k0 = k; *l0 = l;
249
+ return l - k + 1;
250
+ }