bio-bwa 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. data/.document +5 -0
  2. data/Gemfile +15 -0
  3. data/Gemfile.lock +28 -0
  4. data/LICENSE.txt +35 -0
  5. data/README.rdoc +33 -0
  6. data/Rakefile +56 -0
  7. data/VERSION +1 -0
  8. data/bio-bwa.gemspec +152 -0
  9. data/doc/Bio.html +93 -0
  10. data/doc/Bio/BWA.html +2884 -0
  11. data/doc/Bio/BWA/Library.html +229 -0
  12. data/doc/_index.html +119 -0
  13. data/doc/class_list.html +36 -0
  14. data/doc/css/common.css +1 -0
  15. data/doc/css/full_list.css +53 -0
  16. data/doc/css/style.css +310 -0
  17. data/doc/file.LICENSE.html +88 -0
  18. data/doc/file.README.html +119 -0
  19. data/doc/file_list.html +41 -0
  20. data/doc/frames.html +13 -0
  21. data/doc/index.html +119 -0
  22. data/doc/js/app.js +203 -0
  23. data/doc/js/full_list.js +149 -0
  24. data/doc/js/jquery.js +154 -0
  25. data/doc/method_list.html +171 -0
  26. data/doc/top-level-namespace.html +88 -0
  27. data/ext/COPYING +674 -0
  28. data/ext/ChangeLog +3864 -0
  29. data/ext/NEWS +555 -0
  30. data/ext/README +29 -0
  31. data/ext/bamlite.c +155 -0
  32. data/ext/bamlite.h +94 -0
  33. data/ext/bntseq.c +303 -0
  34. data/ext/bntseq.h +80 -0
  35. data/ext/bwa.1 +562 -0
  36. data/ext/bwape.c +807 -0
  37. data/ext/bwase.c +686 -0
  38. data/ext/bwase.h +27 -0
  39. data/ext/bwaseqio.c +222 -0
  40. data/ext/bwt.c +250 -0
  41. data/ext/bwt.h +105 -0
  42. data/ext/bwt_gen/Makefile +23 -0
  43. data/ext/bwt_gen/QSufSort.c +496 -0
  44. data/ext/bwt_gen/QSufSort.h +40 -0
  45. data/ext/bwt_gen/bwt_gen.c +1547 -0
  46. data/ext/bwt_gen/bwt_gen.h +105 -0
  47. data/ext/bwt_lite.c +94 -0
  48. data/ext/bwt_lite.h +29 -0
  49. data/ext/bwtaln.c +345 -0
  50. data/ext/bwtaln.h +150 -0
  51. data/ext/bwtgap.c +264 -0
  52. data/ext/bwtgap.h +38 -0
  53. data/ext/bwtindex.c +186 -0
  54. data/ext/bwtio.c +77 -0
  55. data/ext/bwtmisc.c +269 -0
  56. data/ext/bwtsw2.h +51 -0
  57. data/ext/bwtsw2_aux.c +650 -0
  58. data/ext/bwtsw2_chain.c +107 -0
  59. data/ext/bwtsw2_core.c +594 -0
  60. data/ext/bwtsw2_main.c +100 -0
  61. data/ext/cs2nt.c +191 -0
  62. data/ext/is.c +218 -0
  63. data/ext/khash.h +506 -0
  64. data/ext/kseq.h +208 -0
  65. data/ext/ksort.h +269 -0
  66. data/ext/kstring.c +35 -0
  67. data/ext/kstring.h +46 -0
  68. data/ext/kvec.h +90 -0
  69. data/ext/main.c +63 -0
  70. data/ext/main.h +29 -0
  71. data/ext/mkrf_conf.rb +49 -0
  72. data/ext/qualfa2fq.pl +27 -0
  73. data/ext/simple_dp.c +162 -0
  74. data/ext/simpletest.c +23 -0
  75. data/ext/solid2fastq.pl +111 -0
  76. data/ext/stdaln.c +1072 -0
  77. data/ext/stdaln.h +162 -0
  78. data/ext/utils.c +82 -0
  79. data/ext/utils.h +54 -0
  80. data/lib/bio-bwa.rb +7 -0
  81. data/lib/bio/bwa.rb +312 -0
  82. data/lib/bio/bwa/library.rb +42 -0
  83. data/test/data/testdata.fa +602 -0
  84. data/test/data/testdata.long.fa +175 -0
  85. data/test/data/testdata.short.fa +2 -0
  86. data/test/helper.rb +18 -0
  87. data/test/test_bio-bwa_basic.rb +62 -0
  88. data/test/test_bio-bwa_make_index.rb +42 -0
  89. data/test/test_bio-bwa_run_aln.rb +49 -0
  90. data/test/test_bio-bwa_sam_conversion.rb +49 -0
  91. metadata +218 -0
data/ext/bwase.h ADDED
@@ -0,0 +1,27 @@
1
+ #ifndef BWASE_H
2
+ #define BWASE_H
3
+
4
+ #include "bntseq.h"
5
+ #include "bwt.h"
6
+ #include "bwtaln.h"
7
+
8
+ #ifdef __cplusplus
9
+ extern "C" {
10
+ #endif
11
+
12
+ // Initialize mapping tables in the bwa single-end mapper.
13
+ void bwase_initialize();
14
+ // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
15
+ void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
16
+ // Refine the approximate position of the sequence to an actual placement for the sequence.
17
+ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
18
+ // Backfill certain alignment properties mainly centering around number of matches.
19
+ void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
20
+ // Calculate the end position of a read given a certain sequence.
21
+ int64_t pos_end(const bwa_seq_t *p);
22
+
23
+ #ifdef __cplusplus
24
+ }
25
+ #endif
26
+
27
+ #endif // BWASE_H
data/ext/bwaseqio.c ADDED
@@ -0,0 +1,222 @@
1
+ #include <zlib.h>
2
+ #include <ctype.h>
3
+ #include "bwtaln.h"
4
+ #include "utils.h"
5
+ #include "bamlite.h"
6
+
7
+ #include "kseq.h"
8
+ KSEQ_INIT(gzFile, gzread)
9
+
10
+ extern unsigned char nst_nt4_table[256]; // defined in another translation unit; bwa_read_seq() uses it to turn each sequence character into the code stored in bwa_seq_t::seq
11
+ static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; // indexed by the 4-bit value from bam1_seqi(): 1, 2, 4, 8 map to 0..3, every other value maps to 4
12
+
13
+ struct __bwa_seqio_t {
14
+ // BAM input: is_bam is set to 1 by bwa_bam_open() and stays 0 (from calloc) in bwa_seq_open()
15
+ int is_bam, which; // which is a bit mask used by bwa_read_bam(): bit 0 = read1, bit 1 = read2, bit 2 = records with neither flag (SE)
16
+ bamFile fp; // BAM handle from bam_open(); released with bam_close() in bwa_seq_close()
17
+ // sequence-file input read through kseq (the path taken when is_bam == 0)
18
+ kseq_t *ks; // reader created by kseq_init(); destroyed in bwa_seq_close()
19
+ };
20
+
21
+ bwa_seqio_t *bwa_bam_open(const char *fn, int which)
22
+ {
23
+ bwa_seqio_t *bs;
24
+ bam_header_t *h;
25
+ bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
26
+ bs->is_bam = 1;
27
+ bs->which = which;
28
+ bs->fp = bam_open(fn, "r");
29
+ h = bam_header_read(bs->fp);
30
+ bam_header_destroy(h);
31
+ return bs;
32
+ }
33
+
34
+ bwa_seqio_t *bwa_seq_open(const char *fn)
35
+ {
36
+ gzFile fp;
37
+ bwa_seqio_t *bs;
38
+ bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
39
+ fp = xzopen(fn, "r");
40
+ bs->ks = kseq_init(fp);
41
+ return bs;
42
+ }
43
+
44
+ void bwa_seq_close(bwa_seqio_t *bs)
45
+ {
46
+ if (bs == 0) return;
47
+ if (bs->is_bam) bam_close(bs->fp);
48
+ else {
49
+ gzclose(bs->ks->f->f);
50
+ kseq_destroy(bs->ks);
51
+ }
52
+ free(bs);
53
+ }
54
+
55
+ void seq_reverse(int len, ubyte_t *seq, int is_comp)
56
+ {
57
+ int i;
58
+ if (is_comp) {
59
+ for (i = 0; i < len>>1; ++i) {
60
+ char tmp = seq[len-1-i];
61
+ if (tmp < 4) tmp = 3 - tmp;
62
+ seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
63
+ seq[i] = tmp;
64
+ }
65
+ if (len&1) seq[i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
66
+ } else {
67
+ for (i = 0; i < len>>1; ++i) {
68
+ char tmp = seq[len-1-i];
69
+ seq[len-1-i] = seq[i]; seq[i] = tmp;
70
+ }
71
+ }
72
+ }
73
+
74
+ int bwa_trim_read(int trim_qual, bwa_seq_t *p)
75
+ {
76
+ int s = 0, l, max = 0, max_l = p->len - 1;
77
+ if (trim_qual < 1 || p->qual == 0) return 0;
78
+ for (l = p->len - 1; l >= BWA_MIN_RDLEN - 1; --l) {
79
+ s += trim_qual - (p->qual[l] - 33);
80
+ if (s < 0) break;
81
+ if (s > max) {
82
+ max = s; max_l = l;
83
+ }
84
+ }
85
+ p->clip_len = p->len = max_l + 1;
86
+ return p->full_len - p->len;
87
+ }
88
+
89
+ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
90
+ {
91
+ bwa_seq_t *seqs, *p;
92
+ int n_seqs, l, i;
93
+ long n_trimmed = 0, n_tot = 0;
94
+ bam1_t *b;
95
+
96
+ b = bam_init1();
97
+ n_seqs = 0;
98
+ seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
99
+ while (bam_read1(bs->fp, b) >= 0) {
100
+ uint8_t *s, *q;
101
+ int go = 0;
102
+ if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
103
+ if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
104
+ if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1;
105
+ if (go == 0) continue;
106
+ l = b->core.l_qseq;
107
+ p = &seqs[n_seqs++];
108
+ p->tid = -1; // no assigned to a thread
109
+ p->qual = 0;
110
+ p->full_len = p->clip_len = p->len = l;
111
+ n_tot += p->full_len;
112
+ s = bam1_seq(b); q = bam1_qual(b);
113
+ p->seq = (ubyte_t*)calloc(p->len + 1, 1);
114
+ p->qual = (ubyte_t*)calloc(p->len + 1, 1);
115
+ for (i = 0; i != p->full_len; ++i) {
116
+ p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
117
+ p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
118
+ }
119
+ if (bam1_strand(b)) { // then reverse
120
+ seq_reverse(p->len, p->seq, 1);
121
+ seq_reverse(p->len, p->qual, 0);
122
+ }
123
+ if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
124
+ p->rseq = (ubyte_t*)calloc(p->full_len, 1);
125
+ memcpy(p->rseq, p->seq, p->len);
126
+ seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
127
+ seq_reverse(p->len, p->rseq, is_comp);
128
+ p->name = strdup((const char*)bam1_qname(b));
129
+ if (n_seqs == n_needed) break;
130
+ }
131
+ *n = n_seqs;
132
+ if (n_seqs && trim_qual >= 1)
133
+ fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
134
+ if (n_seqs == 0) {
135
+ free(seqs);
136
+ bam_destroy1(b);
137
+ return 0;
138
+ }
139
+ bam_destroy1(b);
140
+ return seqs;
141
+ }
142
+
143
+ #define BARCODE_LOW_QUAL 13
144
+
145
+ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
146
+ {
147
+ bwa_seq_t *seqs, *p;
148
+ kseq_t *seq = bs->ks;
149
+ int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
150
+ long n_trimmed = 0, n_tot = 0;
151
+
152
+ if (l_bc > 15) {
153
+ fprintf(stderr, "[%s] the maximum barcode length is 15.\n", __func__);
154
+ return 0;
155
+ }
156
+ if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
157
+ n_seqs = 0;
158
+ seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
159
+ while ((l = kseq_read(seq)) >= 0) {
160
+ if (is_64 && seq->qual.l)
161
+ for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
162
+ if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
163
+ p = &seqs[n_seqs++];
164
+ if (l_bc) { // then trim barcode
165
+ for (i = 0; i < l_bc; ++i)
166
+ p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]);
167
+ p->bc[i] = 0;
168
+ for (; i < seq->seq.l; ++i)
169
+ seq->seq.s[i - l_bc] = seq->seq.s[i];
170
+ seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
171
+ if (seq->qual.l) {
172
+ for (i = l_bc; i < seq->qual.l; ++i)
173
+ seq->qual.s[i - l_bc] = seq->qual.s[i];
174
+ seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
175
+ }
176
+ l = seq->seq.l;
177
+ } else p->bc[0] = 0;
178
+ p->tid = -1; // no assigned to a thread
179
+ p->qual = 0;
180
+ p->full_len = p->clip_len = p->len = l;
181
+ n_tot += p->full_len;
182
+ p->seq = (ubyte_t*)calloc(p->len, 1);
183
+ for (i = 0; i != p->full_len; ++i)
184
+ p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]];
185
+ if (seq->qual.l) { // copy quality
186
+ p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
187
+ if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
188
+ }
189
+ p->rseq = (ubyte_t*)calloc(p->full_len, 1);
190
+ memcpy(p->rseq, p->seq, p->len);
191
+ seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
192
+ seq_reverse(p->len, p->rseq, is_comp);
193
+ p->name = strdup((const char*)seq->name.s);
194
+ { // trim /[12]$
195
+ int t = strlen(p->name);
196
+ if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
197
+ }
198
+ if (n_seqs == n_needed) break;
199
+ }
200
+ *n = n_seqs;
201
+ if (n_seqs && trim_qual >= 1)
202
+ fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
203
+ if (n_seqs == 0) {
204
+ free(seqs);
205
+ return 0;
206
+ }
207
+ return seqs;
208
+ }
209
+
210
+ void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
211
+ {
212
+ int i, j;
213
+ for (i = 0; i != n_seqs; ++i) {
214
+ bwa_seq_t *p = seqs + i;
215
+ for (j = 0; j < p->n_multi; ++j)
216
+ if (p->multi[j].cigar) free(p->multi[j].cigar);
217
+ free(p->name);
218
+ free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
219
+ free(p->cigar);
220
+ }
221
+ free(seqs);
222
+ }
data/ext/bwt.c ADDED
@@ -0,0 +1,250 @@
1
+ /* The MIT License
2
+
3
+ Copyright (c) 2008 Genome Research Ltd (GRL).
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
24
+ */
25
+
26
+ /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
+
28
+ #include <stdlib.h>
29
+ #include <stdio.h>
30
+ #include <string.h>
31
+ #include <assert.h>
32
+ #include <stdint.h>
33
+ #include "utils.h"
34
+ #include "bwt.h"
35
+
36
+ void bwt_gen_cnt_table(bwt_t *bwt)
37
+ {
38
+ int i, j;
39
+ for (i = 0; i != 256; ++i) {
40
+ uint32_t x = 0;
41
+ for (j = 0; j != 4; ++j)
42
+ x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
43
+ bwt->cnt_table[i] = x;
44
+ }
45
+ }
46
+
47
+ // bwt->bwt and bwt->occ must be precalculated
48
+ void bwt_cal_sa(bwt_t *bwt, int intv)
49
+ {
50
+ bwtint_t isa, sa, i; // S(isa) = sa
51
+
52
+ xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
53
+
54
+ if (bwt->sa) free(bwt->sa);
55
+ bwt->sa_intv = intv;
56
+ bwt->n_sa = (bwt->seq_len + intv) / intv;
57
+ bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
58
+ // calculate SA value
59
+ isa = 0; sa = bwt->seq_len;
60
+ for (i = 0; i < bwt->seq_len; ++i) {
61
+ if (isa % intv == 0) bwt->sa[isa/intv] = sa;
62
+ --sa;
63
+ isa = bwt_invPsi(bwt, isa);
64
+ }
65
+ if (isa % intv == 0) bwt->sa[isa/intv] = sa;
66
+ bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
67
+ }
68
+
69
+ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
70
+ {
71
+ bwtint_t sa = 0;
72
+ while (k % bwt->sa_intv != 0) {
73
+ ++sa;
74
+ k = bwt_invPsi(bwt, k);
75
+ }
76
+ /* without setting bwt->sa[0] = -1, the following line should be
77
+ changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
78
+ return sa + bwt->sa[k/bwt->sa_intv];
79
+ }
80
+
/*
 * Count how many of the 32 two-bit symbols packed in y equal c (0..3).
 */
static inline int __occ_aux(uint64_t y, int c)
{
	// select, per symbol, the polarity under which a match has both bits set
	const uint64_t hi_bits = (c & 2) ? y : ~y;
	const uint64_t lo_bits = (c & 1) ? y : ~y;
	// one marker bit (the low bit of the pair) per matching symbol
	uint64_t hits = (hi_bits >> 1) & lo_bits & 0x5555555555555555ull;

	// population count of the marker bits
	hits = (hits & 0x3333333333333333ull) + ((hits >> 2) & 0x3333333333333333ull);
	hits = (hits + (hits >> 4)) & 0x0f0f0f0f0f0f0f0full;
	return (int)((hits * 0x0101010101010101ull) >> 56);
}
89
+
90
+ inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
91
+ {
92
+ bwtint_t n, l, j;
93
+ uint32_t *p;
94
+
95
+ if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
96
+ if (k == (bwtint_t)(-1)) return 0;
97
+ if (k >= bwt->primary) --k; // because $ is not in bwt
98
+
99
+ // retrieve Occ at k/OCC_INTERVAL
100
+ n = (p = bwt_occ_intv(bwt, k))[c];
101
+ p += 4; // jump to the start of the first BWT cell
102
+
103
+ // calculate Occ up to the last k/32
104
+ j = k >> 5 << 5;
105
+ for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2)
106
+ n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
107
+
108
+ // calculate Occ
109
+ n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
110
+ if (c == 0) n -= ~k&31; // corrected for the masked bits
111
+
112
+ return n;
113
+ }
114
+
115
+ // an analogy to bwt_occ() but more efficient, requiring k <= l
116
+ inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
117
+ {
118
+ bwtint_t _k, _l;
119
+ if (k == l) {
120
+ *ok = *ol = bwt_occ(bwt, k, c);
121
+ return;
122
+ }
123
+ _k = (k >= bwt->primary)? k-1 : k;
124
+ _l = (l >= bwt->primary)? l-1 : l;
125
+ if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
126
+ *ok = bwt_occ(bwt, k, c);
127
+ *ol = bwt_occ(bwt, l, c);
128
+ } else {
129
+ bwtint_t m, n, i, j;
130
+ uint32_t *p;
131
+ if (k >= bwt->primary) --k;
132
+ if (l >= bwt->primary) --l;
133
+ n = (p = bwt_occ_intv(bwt, k))[c];
134
+ p += 4;
135
+ // calculate *ok
136
+ j = k >> 5 << 5;
137
+ for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2)
138
+ n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
139
+ m = n;
140
+ n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
141
+ if (c == 0) n -= ~k&31; // corrected for the masked bits
142
+ *ok = n;
143
+ // calculate *ol
144
+ j = l >> 5 << 5;
145
+ for (; i < j; i += 32, p += 2)
146
+ m += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
147
+ m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c);
148
+ if (c == 0) m -= ~l&31; // corrected for the masked bits
149
+ *ol = m;
150
+ }
151
+ }
152
+
/*
 * Sum the cnt_table entries of the four bytes of the 32-bit word b.
 * Note that b is evaluated four times.
 */
#define __occ_aux4(bwt, b)					\
	((bwt)->cnt_table[(b) & 0xff]				\
	 + (bwt)->cnt_table[((b) >> 8) & 0xff]			\
	 + (bwt)->cnt_table[((b) >> 16) & 0xff]			\
	 + (bwt)->cnt_table[(b) >> 24])
156
+
157
+ inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
158
+ {
159
+ bwtint_t l, j, x;
160
+ uint32_t *p;
161
+ if (k == (bwtint_t)(-1)) {
162
+ memset(cnt, 0, 4 * sizeof(bwtint_t));
163
+ return;
164
+ }
165
+ if (k >= bwt->primary) --k; // because $ is not in bwt
166
+ p = bwt_occ_intv(bwt, k);
167
+ memcpy(cnt, p, 16);
168
+ p += 4;
169
+ j = k >> 4 << 4;
170
+ for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p)
171
+ x += __occ_aux4(bwt, *p);
172
+ x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
173
+ cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
174
+ }
175
+
176
+ // an analogy to bwt_occ4() but more efficient, requiring k <= l
177
+ inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
178
+ {
179
+ bwtint_t _k, _l;
180
+ if (k == l) {
181
+ bwt_occ4(bwt, k, cntk);
182
+ memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
183
+ return;
184
+ }
185
+ _k = (k >= bwt->primary)? k-1 : k;
186
+ _l = (l >= bwt->primary)? l-1 : l;
187
+ if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
188
+ bwt_occ4(bwt, k, cntk);
189
+ bwt_occ4(bwt, l, cntl);
190
+ } else {
191
+ bwtint_t i, j, x, y;
192
+ uint32_t *p;
193
+ int cl[4];
194
+ if (k >= bwt->primary) --k; // because $ is not in bwt
195
+ if (l >= bwt->primary) --l;
196
+ cl[0] = cl[1] = cl[2] = cl[3] = 0;
197
+ p = bwt_occ_intv(bwt, k);
198
+ memcpy(cntk, p, 4 * sizeof(bwtint_t));
199
+ p += 4;
200
+ // prepare cntk[]
201
+ j = k >> 4 << 4;
202
+ for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p)
203
+ x += __occ_aux4(bwt, *p);
204
+ y = x;
205
+ x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
206
+ // calculate cntl[] and finalize cntk[]
207
+ j = l >> 4 << 4;
208
+ for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p);
209
+ y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15);
210
+ memcpy(cntl, cntk, 16);
211
+ cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
212
+ cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
213
+ }
214
+ }
215
+
216
+ int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
217
+ {
218
+ bwtint_t k, l, ok, ol;
219
+ int i;
220
+ k = 0; l = bwt->seq_len;
221
+ for (i = len - 1; i >= 0; --i) {
222
+ ubyte_t c = str[i];
223
+ if (c > 3) return 0; // no match
224
+ bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
225
+ k = bwt->L2[c] + ok + 1;
226
+ l = bwt->L2[c] + ol;
227
+ if (k > l) break; // no match
228
+ }
229
+ if (k > l) return 0; // no match
230
+ if (sa_begin) *sa_begin = k;
231
+ if (sa_end) *sa_end = l;
232
+ return l - k + 1;
233
+ }
234
+
235
+ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0)
236
+ {
237
+ int i;
238
+ bwtint_t k, l, ok, ol;
239
+ k = *k0; l = *l0;
240
+ for (i = len - 1; i >= 0; --i) {
241
+ ubyte_t c = str[i];
242
+ if (c > 3) return 0; // there is an N here. no match
243
+ bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
244
+ k = bwt->L2[c] + ok + 1;
245
+ l = bwt->L2[c] + ol;
246
+ if (k > l) return 0; // no match
247
+ }
248
+ *k0 = k; *l0 = l;
249
+ return l - k + 1;
250
+ }