bio-bwa 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwase.h
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#ifndef BWASE_H
#define BWASE_H

#include "bntseq.h"
#include "bwt.h"
#include "bwtaln.h"

#ifdef __cplusplus
extern "C" {
#endif

// Initialize mapping tables in the bwa single-end mapper.
// The parameter list is spelled (void) so that C compilers see a real
// prototype; an empty list leaves the arguments unchecked in C.
void bwase_initialize(void);

// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
void bwa_cal_pac_pos_core(const bwt_t *forward_bwt, const bwt_t *reverse_bwt, bwa_seq_t *seq, const int max_mm, const float fnr);

// Refine the approximate position of the sequence to an actual placement for the sequence.
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);

// Backfill certain alignment properties mainly centering around number of matches.
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);

// Calculate the end position of a read given a certain sequence.
int64_t pos_end(const bwa_seq_t *p);

#ifdef __cplusplus
}
#endif

#endif // BWASE_H
|
data/ext/bwaseqio.c
ADDED
@@ -0,0 +1,222 @@
|
|
1
|
+
#include <zlib.h>
|
2
|
+
#include <ctype.h>
|
3
|
+
#include "bwtaln.h"
|
4
|
+
#include "utils.h"
|
5
|
+
#include "bamlite.h"
|
6
|
+
|
7
|
+
#include "kseq.h"
|
8
|
+
KSEQ_INIT(gzFile, gzread)
|
9
|
+
|
10
|
+
extern unsigned char nst_nt4_table[256];
|
11
|
+
// Lookup table with 16 entries, indexed by the value bam1_seqi() extracts in
// bwa_read_bam(): indices 1, 2, 4 and 8 map to 0, 1, 2 and 3; every other
// index maps to 4. The table is never written, hence const.
static const char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
|
12
|
+
|
13
|
+
struct __bwa_seqio_t {
|
14
|
+
// for BAM input
|
15
|
+
int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE
|
16
|
+
bamFile fp;
|
17
|
+
// for fastq input
|
18
|
+
kseq_t *ks;
|
19
|
+
};
|
20
|
+
|
21
|
+
bwa_seqio_t *bwa_bam_open(const char *fn, int which)
|
22
|
+
{
|
23
|
+
bwa_seqio_t *bs;
|
24
|
+
bam_header_t *h;
|
25
|
+
bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
|
26
|
+
bs->is_bam = 1;
|
27
|
+
bs->which = which;
|
28
|
+
bs->fp = bam_open(fn, "r");
|
29
|
+
h = bam_header_read(bs->fp);
|
30
|
+
bam_header_destroy(h);
|
31
|
+
return bs;
|
32
|
+
}
|
33
|
+
|
34
|
+
bwa_seqio_t *bwa_seq_open(const char *fn)
|
35
|
+
{
|
36
|
+
gzFile fp;
|
37
|
+
bwa_seqio_t *bs;
|
38
|
+
bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
|
39
|
+
fp = xzopen(fn, "r");
|
40
|
+
bs->ks = kseq_init(fp);
|
41
|
+
return bs;
|
42
|
+
}
|
43
|
+
|
44
|
+
/*
 * Close the underlying input and free the reader.
 * A null reader is accepted and ignored.
 */
void bwa_seq_close(bwa_seqio_t *bs)
{
    if (bs == 0) return;
    if (!bs->is_bam) {
        // kseq path: close the gz stream first, then drop the parser
        gzclose(bs->ks->f->f);
        kseq_destroy(bs->ks);
    } else {
        bam_close(bs->fp);
    }
    free(bs);
}
|
54
|
+
|
55
|
+
/*
 * Reverse seq[0..len-1] in place.
 *
 * When is_comp is non-zero every element smaller than 4 is additionally
 * replaced by 3 minus its value; elements >= 4 are moved unchanged.
 * When is_comp is zero the elements are only reversed.
 *
 * The swap temporary has the element type. It used to be a plain `char`, so
 * where `char` is signed a byte >= 128 compared as "< 4" and was complemented,
 * while the element at the opposite end was tested as ubyte_t and left alone.
 */
void seq_reverse(int len, ubyte_t *seq, int is_comp)
{
    int i;
    if (is_comp) {
        for (i = 0; i < len>>1; ++i) {
            ubyte_t tmp = seq[len-1-i];
            if (tmp < 4) tmp = 3 - tmp;
            seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
            seq[i] = tmp;
        }
        // odd length: the middle element has no partner but is still complemented
        if (len&1) seq[i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
    } else {
        for (i = 0; i < len>>1; ++i) {
            ubyte_t tmp = seq[len-1-i];
            seq[len-1-i] = seq[i]; seq[i] = tmp;
        }
    }
}
|
73
|
+
|
74
|
+
/*
 * Clip low-quality bases from the end of read `p`.
 *
 * Walking from the last base towards the start, the running sum of
 * (trim_qual - (qual - 33)) is tracked; the read is cut at the position where
 * that sum is largest. The scan stops once the sum turns negative and never
 * shortens the read below BWA_MIN_RDLEN.
 *
 * Nothing happens (and 0 is returned) when trim_qual < 1 or the read has no
 * qualities. Otherwise p->len and p->clip_len are updated and the number of
 * clipped bases, p->full_len - p->len, is returned.
 *
 * The base at the arg-max position is itself clipped: it increased the running
 * sum, so its quality is below trim_qual. Earlier code set the new length to
 * max_l + 1 and therefore always kept that one low-quality base.
 */
int bwa_trim_read(int trim_qual, bwa_seq_t *p)
{
    int s = 0, l, max = 0, max_l = p->len; // max_l is the new read length
    if (trim_qual < 1 || p->qual == 0) return 0;
    for (l = p->len - 1; l >= BWA_MIN_RDLEN; --l) {
        s += trim_qual - (p->qual[l] - 33);
        if (s < 0) break;
        if (s > max) {
            max = s; max_l = l;
        }
    }
    p->clip_len = p->len = max_l;
    return p->full_len - p->len;
}
|
88
|
+
|
89
|
+
/*
 * Read up to n_needed reads from the BAM reader `bs`.
 *
 * Records are kept according to bs->which (1st bit: read1, 2nd bit: read2,
 * 3rd: neither flag set). For each kept record the bases are translated
 * through bam_nt16_nt4_table, qualities are shifted by +33 and capped at 126,
 * and records on the reverse strand are turned back around. Optional quality
 * trimming is applied when trim_qual >= 1.
 *
 * *n receives the number of reads stored. Returns the array of reads, or 0
 * when no read was stored.
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
    long trimmed_bases = 0, total_bases = 0;
    int n_read = 0;
    bam1_t *rec = bam_init1();
    bwa_seq_t *seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));

    while (bam_read1(bs->fp, rec) >= 0) {
        const int is_read1 = (rec->core.flag & BAM_FREAD1) != 0;
        const int is_read2 = (rec->core.flag & BAM_FREAD2) != 0;
        const int wanted = ((bs->which & 1) && is_read1)
                        || ((bs->which & 2) && is_read2)
                        || ((bs->which & 4) && !is_read1 && !is_read2);
        bwa_seq_t *p;
        uint8_t *bases, *quals;
        int i;

        if (!wanted) continue;

        p = &seqs[n_read++];
        p->tid = -1; // no assigned to a thread
        p->full_len = p->clip_len = p->len = rec->core.l_qseq;
        total_bases += p->full_len;
        bases = bam1_seq(rec);
        quals = bam1_qual(rec);
        p->seq = (ubyte_t*)calloc(p->len + 1, 1);
        p->qual = (ubyte_t*)calloc(p->len + 1, 1);
        for (i = 0; i != p->full_len; ++i) {
            const int shifted = quals[i] + 33;
            p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(bases, i)];
            p->qual[i] = shifted < 126 ? shifted : 126;
        }
        if (bam1_strand(rec)) { // then reverse
            seq_reverse(p->len, p->seq, 1);
            seq_reverse(p->len, p->qual, 0);
        }
        if (trim_qual >= 1)
            trimmed_bases += bwa_trim_read(trim_qual, p);
        p->rseq = (ubyte_t*)calloc(p->full_len, 1);
        memcpy(p->rseq, p->seq, p->len);
        seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
        seq_reverse(p->len, p->rseq, is_comp);
        p->name = strdup((const char*)bam1_qname(rec));
        if (n_read == n_needed) break;
    }

    *n = n_read;
    if (n_read && trim_qual >= 1)
        fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * trimmed_bases/total_bases);
    bam_destroy1(rec);
    if (n_read == 0) {
        free(seqs);
        return 0;
    }
    return seqs;
}
|
142
|
+
|
143
|
+
#define BARCODE_LOW_QUAL 13
|
144
|
+
|
145
|
+
/*
 * Read up to n_needed reads from `bs`.
 *
 * `mode` carries several settings: BWA_MODE_COMPREAD selects whether the
 * reverse copy (rseq) is also complemented, BWA_MODE_IL13 makes every quality
 * byte get lowered by 31 before use, and the bits above bit 23 hold the
 * barcode length (at most 15). BAM readers are delegated to bwa_read_bam(),
 * where the barcode length has no effect.
 *
 * For non-BAM input, records not longer than the barcode are skipped. The
 * barcode is stored in p->bc (lower case where the quality minus 33 is below
 * BARCODE_LOW_QUAL) and removed from both sequence and quality. A trailing
 * "/1" or "/2" is cut from the read name.
 *
 * *n receives the number of reads stored. Returns the array of reads, or 0
 * when the barcode length is invalid or no read was stored. Free the result
 * with bwa_free_read_seq().
 *
 * Sequence bytes are converted to unsigned char before they reach
 * tolower()/toupper() or index nst_nt4_table[256]: with a signed `char`, a
 * byte >= 0x80 in the input would otherwise become a negative argument/index.
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
    bwa_seq_t *seqs, *p;
    kseq_t *seq = bs->ks;
    int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
    long n_trimmed = 0, n_tot = 0;

    if (l_bc > 15) {
        fprintf(stderr, "[%s] the maximum barcode length is 15.\n", __func__);
        return 0;
    }
    if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
    n_seqs = 0;
    seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
    while ((l = kseq_read(seq)) >= 0) {
        if (is_64 && seq->qual.l)
            for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
        if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
        p = &seqs[n_seqs++];
        if (l_bc) { // then trim barcode
            for (i = 0; i < l_bc; ++i) {
                const unsigned char base = (unsigned char)seq->seq.s[i];
                p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(base) : toupper(base);
            }
            p->bc[i] = 0;
            // shift the remaining bases (and qualities) to the front
            for (; i < seq->seq.l; ++i)
                seq->seq.s[i - l_bc] = seq->seq.s[i];
            seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
            if (seq->qual.l) {
                for (i = l_bc; i < seq->qual.l; ++i)
                    seq->qual.s[i - l_bc] = seq->qual.s[i];
                seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
            }
            l = seq->seq.l;
        } else p->bc[0] = 0;
        p->tid = -1; // no assigned to a thread
        p->qual = 0;
        p->full_len = p->clip_len = p->len = l;
        n_tot += p->full_len;
        p->seq = (ubyte_t*)calloc(p->len, 1);
        for (i = 0; i != p->full_len; ++i)
            p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i]];
        if (seq->qual.l) { // copy quality
            p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
            if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
        }
        p->rseq = (ubyte_t*)calloc(p->full_len, 1);
        memcpy(p->rseq, p->seq, p->len);
        seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
        seq_reverse(p->len, p->rseq, is_comp);
        p->name = strdup((const char*)seq->name.s);
        { // trim /[12]$
            int t = strlen(p->name);
            if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
        }
        if (n_seqs == n_needed) break;
    }
    *n = n_seqs;
    if (n_seqs && trim_qual >= 1)
        fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
    if (n_seqs == 0) {
        free(seqs);
        return 0;
    }
    return seqs;
}
|
209
|
+
|
210
|
+
/*
 * Free the first n_seqs reads of `seqs` together with every buffer they own
 * (name, seq, rseq, qual, aln, md, multi and its cigars, cigar), then the
 * array itself.
 */
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
    int idx = 0;

    while (idx != n_seqs) {
        bwa_seq_t *rd = &seqs[idx++];
        int m;

        // free() accepts null pointers, so absent cigars need no guard
        for (m = 0; m < rd->n_multi; ++m)
            free(rd->multi[m].cigar);
        free(rd->name);
        free(rd->seq);
        free(rd->rseq);
        free(rd->qual);
        free(rd->aln);
        free(rd->md);
        free(rd->multi);
        free(rd->cigar);
    }
    free(seqs);
}
|
data/ext/bwt.c
ADDED
@@ -0,0 +1,250 @@
|
|
1
|
+
/* The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008 Genome Research Ltd (GRL).
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
27
|
+
|
28
|
+
#include <stdlib.h>
|
29
|
+
#include <stdio.h>
|
30
|
+
#include <string.h>
|
31
|
+
#include <assert.h>
|
32
|
+
#include <stdint.h>
|
33
|
+
#include "utils.h"
|
34
|
+
#include "bwt.h"
|
35
|
+
|
36
|
+
/*
 * Fill bwt->cnt_table[0..255].
 * Entry b packs four counters, one per code 0..3: bits 8*code .. 8*code+7
 * hold how many of the four 2-bit fields of byte b equal `code`.
 */
void bwt_gen_cnt_table(bwt_t *bwt)
{
    int byte, code, shift;

    for (byte = 0; byte < 256; ++byte) {
        uint32_t packed = 0;
        for (code = 0; code < 4; ++code) {
            int hits = 0;
            for (shift = 0; shift < 8; shift += 2)
                hits += ((byte >> shift & 3) == code);
            packed |= (uint32_t)hits << (code * 8);
        }
        bwt->cnt_table[byte] = packed;
    }
}
|
46
|
+
|
47
|
+
// bwt->bwt and bwt->occ must be precalculated
|
48
|
+
/*
 * (Re)build the sampled suffix array of `bwt`, storing one value for every
 * intv-th index.
 *
 * Any previous bwt->sa is released. bwt->sa_intv and bwt->n_sa are updated,
 * and bwt->sa[0] ends up as (bwtint_t)-1 (see bwt_sa()).
 *
 * intv must be positive: it is used as a divisor, and a non-positive value
 * would either divide by zero or be converted to a huge unsigned interval.
 */
void bwt_cal_sa(bwt_t *bwt, int intv)
{
    bwtint_t isa, sa, i; // S(isa) = sa

    xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
    xassert(intv > 0, "the suffix array interval must be positive.");

    free(bwt->sa); // free() ignores a null pointer
    bwt->sa_intv = intv;
    bwt->n_sa = (bwt->seq_len + intv) / intv;
    bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
    // calculate SA value
    isa = 0; sa = bwt->seq_len;
    for (i = 0; i < bwt->seq_len; ++i) {
        if (isa % intv == 0) bwt->sa[isa/intv] = sa;
        --sa;
        isa = bwt_invPsi(bwt, isa);
    }
    if (isa % intv == 0) bwt->sa[isa/intv] = sa;
    bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
}
|
68
|
+
|
69
|
+
/*
 * Return the suffix array value for index k.
 * bwt_invPsi() is applied until k lands on a sampled index (a multiple of
 * bwt->sa_intv); the number of steps taken is added to the stored sample.
 */
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
{
    bwtint_t steps;

    for (steps = 0; k % bwt->sa_intv != 0; ++steps)
        k = bwt_invPsi(bwt, k);
    /* without setting bwt->sa[0] = -1, the following line should be
       changed to (steps + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
    return steps + bwt->sa[k/bwt->sa_intv];
}
|
80
|
+
|
81
|
+
/*
 * Count how many of the 32 two-bit cells of y are equal to c (only the two
 * low bits of c are looked at).
 */
static inline int __occ_aux(uint64_t y, int c)
{
    // reduce nucleotide counting to bits counting: after this step the low
    // bit of a cell is set exactly when the cell matches c
    const uint64_t high_bits = (c & 2) ? y : ~y;
    const uint64_t low_bits = (c & 1) ? y : ~y;
    const uint64_t match = (high_bits >> 1) & low_bits & 0x5555555555555555ull;
    // count the number of 1s: per nibble, then per byte, then sum the bytes
    const uint64_t per_nibble = (match & 0x3333333333333333ull)
                              + ((match >> 2) & 0x3333333333333333ull);
    const uint64_t per_byte = (per_nibble + (per_nibble >> 4)) & 0x0f0f0f0f0f0f0f0full;

    return (per_byte * 0x0101010101010101ull) >> 56;
}
|
89
|
+
|
90
|
+
/*
 * Return the Occ value of symbol c at position k.
 *
 * Special positions: k == bwt->seq_len yields L2[c+1] - L2[c], and
 * k == (bwtint_t)-1 yields 0. Otherwise the count stored for the enclosing
 * OCC_INTERVAL block is taken as the starting point and the remaining cells
 * up to k are counted with __occ_aux().
 */
inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
{
    bwtint_t count, pos, last_word, pad;
    uint32_t *cell;

    if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
    if (k == (bwtint_t)(-1)) return 0;
    if (k >= bwt->primary) --k; // because $ is not in bwt

    // retrieve Occ at k/OCC_INTERVAL
    cell = bwt_occ_intv(bwt, k);
    count = cell[c];
    cell += 4; // jump to the start of the first BWT cell

    // calculate Occ up to the last k/32
    last_word = k >> 5 << 5;
    pos = k/OCC_INTERVAL*OCC_INTERVAL;
    while (pos < last_word) {
        count += __occ_aux((uint64_t)cell[0]<<32 | cell[1], c);
        pos += 32;
        cell += 2;
    }

    // calculate Occ: the cells after k are masked to zero in the last word
    pad = ~k&31;
    count += __occ_aux(((uint64_t)cell[0]<<32 | cell[1]) & ~((1ull<<(pad<<1)) - 1), c);
    if (c == 0) count -= pad; // corrected for the masked bits

    return count;
}
|
114
|
+
|
115
|
+
// an analogy to bwt_occ() but more efficient, requiring k <= l
|
116
|
+
/*
 * Compute the Occ value of symbol c at positions k and l in one go, storing
 * them in *ok and *ol.
 *
 * When both positions fall into the same OCC_INTERVAL block (and neither is
 * (bwtint_t)-1) the block is scanned only once; otherwise the work is
 * delegated to two bwt_occ() calls.
 */
inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
{
    bwtint_t adj_k, adj_l;
    bwtint_t occ_k, occ_l, pos, last_word, pad;
    uint32_t *cell;

    if (k == l) {
        *ok = *ol = bwt_occ(bwt, k, c);
        return;
    }
    // positions with the $ correction applied
    adj_k = (k >= bwt->primary)? k-1 : k;
    adj_l = (l >= bwt->primary)? l-1 : l;
    if (adj_l/OCC_INTERVAL != adj_k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
        *ok = bwt_occ(bwt, k, c);
        *ol = bwt_occ(bwt, l, c);
        return;
    }

    cell = bwt_occ_intv(bwt, adj_k);
    occ_k = cell[c];
    cell += 4;
    // calculate *ok
    last_word = adj_k >> 5 << 5;
    for (pos = adj_k/OCC_INTERVAL*OCC_INTERVAL; pos < last_word; pos += 32, cell += 2)
        occ_k += __occ_aux((uint64_t)cell[0]<<32 | cell[1], c);
    occ_l = occ_k; // the scan for l continues from the full words counted so far
    pad = ~adj_k&31;
    occ_k += __occ_aux(((uint64_t)cell[0]<<32 | cell[1]) & ~((1ull<<(pad<<1)) - 1), c);
    if (c == 0) occ_k -= pad; // corrected for the masked bits
    *ok = occ_k;
    // calculate *ol
    last_word = adj_l >> 5 << 5;
    for (; pos < last_word; pos += 32, cell += 2)
        occ_l += __occ_aux((uint64_t)cell[0]<<32 | cell[1], c);
    pad = ~adj_l&31;
    occ_l += __occ_aux(((uint64_t)cell[0]<<32 | cell[1]) & ~((1ull<<(pad<<1)) - 1), c);
    if (c == 0) occ_l -= pad; // corrected for the masked bits
    *ol = occ_l;
}
|
152
|
+
|
153
|
+
#define __occ_aux4(bwt, b) \
|
154
|
+
((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \
|
155
|
+
+ (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24])
|
156
|
+
|
157
|
+
inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
|
158
|
+
{
|
159
|
+
bwtint_t l, j, x;
|
160
|
+
uint32_t *p;
|
161
|
+
if (k == (bwtint_t)(-1)) {
|
162
|
+
memset(cnt, 0, 4 * sizeof(bwtint_t));
|
163
|
+
return;
|
164
|
+
}
|
165
|
+
if (k >= bwt->primary) --k; // because $ is not in bwt
|
166
|
+
p = bwt_occ_intv(bwt, k);
|
167
|
+
memcpy(cnt, p, 16);
|
168
|
+
p += 4;
|
169
|
+
j = k >> 4 << 4;
|
170
|
+
for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p)
|
171
|
+
x += __occ_aux4(bwt, *p);
|
172
|
+
x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
|
173
|
+
cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
|
174
|
+
}
|
175
|
+
|
176
|
+
// an analogy to bwt_occ4() but more efficient, requiring k <= l
|
177
|
+
/*
 * Compute the Occ values of all four symbols at positions k and l in one go,
 * storing them in cntk[0..3] and cntl[0..3].
 *
 * When both positions fall into the same OCC_INTERVAL block (and neither is
 * (bwtint_t)-1) the block is scanned only once; otherwise the work is
 * delegated to bwt_occ4().
 */
inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
{
    bwtint_t _k, _l;
    if (k == l) {
        bwt_occ4(bwt, k, cntk);
        memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
        return;
    }
    _k = (k >= bwt->primary)? k-1 : k;
    _l = (l >= bwt->primary)? l-1 : l;
    if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
        bwt_occ4(bwt, k, cntk);
        bwt_occ4(bwt, l, cntl);
    } else {
        bwtint_t i, j, x, y;
        uint32_t *p;
        if (k >= bwt->primary) --k; // because $ is not in bwt
        if (l >= bwt->primary) --l;
        p = bwt_occ_intv(bwt, k);
        memcpy(cntk, p, 4 * sizeof(bwtint_t));
        p += 4;
        // prepare cntk[]
        j = k >> 4 << 4;
        for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p)
            x += __occ_aux4(bwt, *p);
        y = x; // the scan for l continues from the full words counted so far
        x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
        // calculate cntl[] and finalize cntk[]
        j = l >> 4 << 4;
        for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p);
        y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15);
        // cntk still holds the block's stored counts here; both arrays have
        // four bwtint_t elements, so size the copy from the element type
        memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
        cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
        cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
    }
}
|
215
|
+
|
216
|
+
/*
 * Search for an exact match of str[0..len-1], consuming the symbols from the
 * last one to the first and starting from the interval [0, bwt->seq_len].
 *
 * Returns 0 when a symbol is greater than 3 or the interval becomes empty.
 * Otherwise returns the interval size and, for each non-null output pointer,
 * stores the interval bounds in *sa_begin and *sa_end.
 */
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
{
    bwtint_t lo = 0, hi = bwt->seq_len;
    int pos;

    for (pos = len - 1; pos >= 0; --pos) {
        const ubyte_t sym = str[pos];
        bwtint_t occ_lo, occ_hi;

        if (sym > 3) return 0; // no match
        bwt_2occ(bwt, lo - 1, hi, sym, &occ_lo, &occ_hi);
        lo = bwt->L2[sym] + occ_lo + 1;
        hi = bwt->L2[sym] + occ_hi;
        if (lo > hi) return 0; // no match
    }
    if (sa_begin) *sa_begin = lo;
    if (sa_end) *sa_end = hi;
    return hi - lo + 1;
}
|
234
|
+
|
235
|
+
/*
 * Like bwt_match_exact(), but the search starts from the caller-supplied
 * interval [*k0, *l0] instead of the full range.
 *
 * Returns 0 and leaves *k0 / *l0 untouched when a symbol is greater than 3
 * or the interval becomes empty. Otherwise writes the resulting interval
 * back to *k0 / *l0 and returns its size.
 */
int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0)
{
    bwtint_t lo = *k0, hi = *l0;
    int pos = len - 1;

    while (pos >= 0) {
        const ubyte_t sym = str[pos];
        bwtint_t occ_lo, occ_hi;

        if (sym > 3) return 0; // there is an N here. no match
        bwt_2occ(bwt, lo - 1, hi, sym, &occ_lo, &occ_hi);
        lo = bwt->L2[sym] + occ_lo + 1;
        hi = bwt->L2[sym] + occ_hi;
        if (lo > hi) return 0; // no match
        --pos;
    }
    *k0 = lo;
    *l0 = hi;
    return hi - lo + 1;
}
|