bio-bwa 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwase.h
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#ifndef BWASE_H
#define BWASE_H

#include <stdint.h> // int64_t, used by pos_end() below

#include "bntseq.h"
#include "bwt.h"
#include "bwtaln.h"

#ifdef __cplusplus
extern "C" {
#endif

	// Initialize mapping tables in the bwa single-end mapper.
	// Declared with (void) so that the declaration is a real prototype in C.
	void bwase_initialize(void);
	// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
	void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
	// Refine the approximate position of the sequence to an actual placement for the sequence.
	void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
	// Backfill certain alignment properties mainly centering around number of matches.
	void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
	// Calculate the end position of a read given a certain sequence.
	int64_t pos_end(const bwa_seq_t *p);

#ifdef __cplusplus
}
#endif

#endif // BWASE_H
|
data/ext/bwaseqio.c
ADDED
@@ -0,0 +1,222 @@
|
|
1
|
+
#include <zlib.h>
|
2
|
+
#include <ctype.h>
|
3
|
+
#include "bwtaln.h"
|
4
|
+
#include "utils.h"
|
5
|
+
#include "bamlite.h"
|
6
|
+
|
7
|
+
#include "kseq.h"
|
8
|
+
KSEQ_INIT(gzFile, gzread)
|
9
|
+
|
10
|
+
/* Lookup table defined in another translation unit; bwa_read_seq() indexes it
 * with each input sequence character to obtain the code stored in bwa_seq_t::seq. */
extern unsigned char nst_nt4_table[256];

/* Maps the 4-bit value returned by bam1_seqi() to the code stored in
 * bwa_seq_t::seq: inputs 1, 2, 4 and 8 map to 0, 1, 2 and 3; every other
 * input maps to 4. The table is never written, hence const. */
static const unsigned char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
|
12
|
+
|
13
|
+
struct __bwa_seqio_t {
|
14
|
+
// for BAM input
|
15
|
+
int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE
|
16
|
+
bamFile fp;
|
17
|
+
// for fastq input
|
18
|
+
kseq_t *ks;
|
19
|
+
};
|
20
|
+
|
21
|
+
bwa_seqio_t *bwa_bam_open(const char *fn, int which)
|
22
|
+
{
|
23
|
+
bwa_seqio_t *bs;
|
24
|
+
bam_header_t *h;
|
25
|
+
bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
|
26
|
+
bs->is_bam = 1;
|
27
|
+
bs->which = which;
|
28
|
+
bs->fp = bam_open(fn, "r");
|
29
|
+
h = bam_header_read(bs->fp);
|
30
|
+
bam_header_destroy(h);
|
31
|
+
return bs;
|
32
|
+
}
|
33
|
+
|
34
|
+
bwa_seqio_t *bwa_seq_open(const char *fn)
|
35
|
+
{
|
36
|
+
gzFile fp;
|
37
|
+
bwa_seqio_t *bs;
|
38
|
+
bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
|
39
|
+
fp = xzopen(fn, "r");
|
40
|
+
bs->ks = kseq_init(fp);
|
41
|
+
return bs;
|
42
|
+
}
|
43
|
+
|
44
|
+
/*
 * Release a handle obtained from bwa_bam_open() or bwa_seq_open().
 * A null handle is accepted and ignored.
 */
void bwa_seq_close(bwa_seqio_t *bs)
{
	if (bs == 0) return;
	if (!bs->is_bam) {
		// kseq path: close the underlying gz stream first, then the parser
		gzclose(bs->ks->f->f);
		kseq_destroy(bs->ks);
	} else {
		bam_close(bs->fp);
	}
	free(bs);
}
|
54
|
+
|
55
|
+
/*
 * Reverse seq[0..len-1] in place.
 *
 * When is_comp is non-zero every element smaller than 4 is additionally
 * replaced by (3 - element); elements >= 4 are moved but left unchanged.
 * When is_comp is zero the elements are only reversed.
 *
 * The swap temporary has the element type (ubyte_t) rather than plain char:
 * with a signed char, a byte >= 128 compared as negative and was rewritten on
 * one side of the swap but not on the other. A non-positive len is a no-op
 * (a negative odd len used to touch seq[0]).
 */
void seq_reverse(int len, ubyte_t *seq, int is_comp)
{
	int i;
	if (len <= 0) return;
	if (is_comp) {
		for (i = 0; i < len>>1; ++i) {
			ubyte_t tmp = seq[len-1-i];
			if (tmp < 4) tmp = 3 - tmp;
			seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
			seq[i] = tmp;
		}
		// odd length: the middle element stays in place but is still complemented
		if (len&1) seq[i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
	} else {
		for (i = 0; i < len>>1; ++i) {
			ubyte_t tmp = seq[len-1-i];
			seq[len-1-i] = seq[i]; seq[i] = tmp;
		}
	}
}
|
73
|
+
|
74
|
+
/*
 * Quality-trim the 3' end of read p.
 *
 * Scanning from the last base towards the front, s accumulates
 * (trim_qual - (qual[l] - 33)); the read is cut at the position l where s is
 * largest, i.e. bases l..len-1 are dropped and the new length is l. The scan
 * stops as soon as s becomes negative and never shortens the read below
 * BWA_MIN_RDLEN bases.
 *
 * The previous code set the new length to l + 1, which kept the first base of
 * the low-quality tail (a read whose only bad base was the last one was never
 * trimmed at all). That is one base more than the documented rule
 * "trim down to argmax_x sum_{i=x+1..l}(INT - q_i)".
 *
 * Updates p->len and p->clip_len. Returns p->full_len - p->len, or 0 without
 * touching p when trim_qual < 1 or the read has no qualities.
 */
int bwa_trim_read(int trim_qual, bwa_seq_t *p)
{
	int s = 0, l, max = 0, max_l = p->len; // max_l: length to keep; default keeps everything
	if (trim_qual < 1 || p->qual == 0) return 0;
	for (l = p->len - 1; l >= BWA_MIN_RDLEN; --l) {
		s += trim_qual - (p->qual[l] - 33);
		if (s < 0) break;
		if (s > max) {
			max = s; max_l = l;
		}
	}
	p->clip_len = p->len = max_l;
	return p->full_len - p->len;
}
|
88
|
+
|
89
|
+
/*
 * Read up to n_needed reads from the BAM stream held by bs.
 *
 * Records are kept only if they match the bs->which bitmask (1: flagged
 * BAM_FREAD1, 2: flagged BAM_FREAD2, 4: neither flag set). For each kept
 * record the bases are converted through bam_nt16_nt4_table, the qualities
 * are stored as (q + 33) capped at 126, and reverse-strand records are
 * flipped back. If trim_qual >= 1 the read is trimmed with bwa_trim_read().
 * p->seq is stored reversed and p->rseq reversed (and complemented when
 * is_comp is non-zero).
 *
 * *n receives the number of reads stored. Returns a calloc'ed array to be
 * released with bwa_free_read_seq(), or 0 when no read was stored.
 *
 * A non-positive n_needed now returns immediately: the loop stores the first
 * record before comparing against n_needed, so it used to write past a
 * zero-length array. The trimming statistic is skipped when no base was read,
 * which avoids a division by zero.
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs, l, i;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b;

	if (n_needed <= 0) {
		*n = 0;
		return 0;
	}
	b = bam_init1();
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while (bam_read1(bs->fp, b) >= 0) {
		uint8_t *s, *q;
		int go = 0;
		if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
		if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
		if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1;
		if (go == 0) continue;
		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // not assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		s = bam1_seq(b); q = bam1_qual(b);
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
			p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
		}
		if (bam1_strand(b)) { // then reverse
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)bam1_qname(b));
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1 && n_tot > 0)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	bam_destroy1(b);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}
|
142
|
+
|
143
|
+
#define BARCODE_LOW_QUAL 13
|
144
|
+
|
145
|
+
/*
 * Read up to n_needed reads from bs.
 *
 * mode is a bit field: BWA_MODE_COMPREAD selects complementing of p->rseq,
 * BWA_MODE_IL13 makes the reader subtract 31 from every quality character,
 * and the bits above bit 23 give the barcode length l_bc (at most 15).
 * BAM sources are delegated to bwa_read_bam(), where l_bc has no effect.
 *
 * For kseq sources, reads no longer than the barcode are skipped. The first
 * l_bc bases go to p->bc (lower-cased when their quality is below
 * BARCODE_LOW_QUAL) and are removed from the sequence and quality strings.
 * Bases are converted through nst_nt4_table; if qualities are present and
 * trim_qual >= 1 the read is trimmed with bwa_trim_read(). A trailing "/1" or
 * "/2" is stripped from the read name.
 *
 * *n receives the number of reads stored (it is left untouched when the
 * barcode length is rejected). Returns a calloc'ed array to be released with
 * bwa_free_read_seq(), or 0 when nothing was read.
 *
 * Fixes over the previous version:
 *  - sequence characters are converted to unsigned char before indexing
 *    nst_nt4_table and before tolower()/toupper(); with a signed char, a byte
 *    >= 0x80 produced a negative table index;
 *  - a non-positive n_needed returns immediately instead of writing past a
 *    zero-length array;
 *  - the trimming statistic is skipped when no base was read.
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;

	if (l_bc > 15) {
		fprintf(stderr, "[%s] the maximum barcode length is 15.\n", __func__);
		return 0;
	}
	if (n_needed <= 0) {
		*n = 0;
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			for (i = 0; i < l_bc; ++i) {
				unsigned char base = (unsigned char)seq->seq.s[i];
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(base) : toupper(base);
			}
			p->bc[i] = 0;
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // not assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->len, 1);
		for (i = 0; i != p->full_len; ++i)
			p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i]];
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1 && n_tot > 0)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}
|
209
|
+
|
210
|
+
/*
 * Release an array of n_seqs reads together with every buffer each read
 * owns (name, seq, rseq, qual, aln, md, multi and its cigars, cigar).
 */
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
	int idx = 0;
	while (idx != n_seqs) {
		bwa_seq_t *rec = &seqs[idx];
		int m;
		// free(NULL) is a no-op, so absent cigars need no special case
		for (m = 0; m < rec->n_multi; ++m)
			free(rec->multi[m].cigar);
		free(rec->name);
		free(rec->seq);
		free(rec->rseq);
		free(rec->qual);
		free(rec->aln);
		free(rec->md);
		free(rec->multi);
		free(rec->cigar);
		++idx;
	}
	free(seqs);
}
|
data/ext/bwt.c
ADDED
@@ -0,0 +1,250 @@
|
|
1
|
+
/* The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2008 Genome Research Ltd (GRL).
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
20
|
+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
21
|
+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
22
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
23
|
+
SOFTWARE.
|
24
|
+
*/
|
25
|
+
|
26
|
+
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
27
|
+
|
28
|
+
#include <stdlib.h>
|
29
|
+
#include <stdio.h>
|
30
|
+
#include <string.h>
|
31
|
+
#include <assert.h>
|
32
|
+
#include <stdint.h>
|
33
|
+
#include "utils.h"
|
34
|
+
#include "bwt.h"
|
35
|
+
|
36
|
+
/*
 * Fill bwt->cnt_table. Entry b describes the byte b read as four 2-bit
 * symbols: bits 8*s .. 8*s+7 of the entry hold how many of those four
 * symbols equal s, for s = 0..3.
 */
void bwt_gen_cnt_table(bwt_t *bwt)
{
	int byte, sym;
	for (byte = 0; byte < 256; ++byte) {
		uint32_t packed = 0;
		for (sym = 0; sym < 4; ++sym) {
			int hits = ((byte&3) == sym) + ((byte>>2&3) == sym)
				+ ((byte>>4&3) == sym) + (byte>>6 == sym);
			packed |= hits << (sym<<3);
		}
		bwt->cnt_table[byte] = packed;
	}
}
|
46
|
+
|
47
|
+
// bwt->bwt and bwt->occ must be precalculated
|
48
|
+
/*
 * (Re)build the sampled suffix array bwt->sa, keeping one entry for every
 * intv-th row. Any previous bwt->sa is released first.
 *
 * Starting at row 0 with value seq_len, the loop follows bwt_invPsi() and
 * decrements the value at every step, storing it whenever the current row is
 * a multiple of intv. Finally sa[0] is overwritten with (bwtint_t)-1, which
 * bwt_sa() relies on.
 *
 * intv must be positive (it is used as a divisor), and the allocation is
 * verified; both are checked with xassert() like the existing precondition.
 * NOTE(review): xassert() is assumed to terminate the program on failure --
 * confirm in utils.h.
 */
void bwt_cal_sa(bwt_t *bwt, int intv)
{
	bwtint_t isa, sa, i; // S(isa) = sa

	xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
	xassert(intv > 0, "the suffix array interval must be positive.");

	free(bwt->sa); // free(NULL) is a no-op
	bwt->sa_intv = intv;
	bwt->n_sa = (bwt->seq_len + intv) / intv;
	bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
	xassert(bwt->sa, "fail to allocate the suffix array.");
	// calculate SA value
	isa = 0; sa = bwt->seq_len;
	for (i = 0; i < bwt->seq_len; ++i) {
		if (isa % intv == 0) bwt->sa[isa/intv] = sa;
		--sa;
		isa = bwt_invPsi(bwt, isa);
	}
	if (isa % intv == 0) bwt->sa[isa/intv] = sa;
	bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
}
|
68
|
+
|
69
|
+
/*
 * Return the suffix-array value of row k: walk with bwt_invPsi() until a
 * sampled row (a multiple of bwt->sa_intv) is reached, then add the number
 * of steps taken to the stored sample.
 */
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
{
	bwtint_t steps;
	for (steps = 0; k % bwt->sa_intv != 0; ++steps)
		k = bwt_invPsi(bwt, k);
	/* without setting bwt->sa[0] = -1, the following line should be
	   changed to (steps + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
	return steps + bwt->sa[k/bwt->sa_intv];
}
|
80
|
+
|
81
|
+
/*
 * Count how many of the 32 two-bit cells packed in y are equal to c
 * (only the two lowest bits of c are examined).
 */
static inline int __occ_aux(uint64_t y, int c)
{
	// reduce nucleotide counting to bits counting: after this step the low
	// bit of each cell is set exactly when both bits of the cell match c
	const uint64_t hi_match = (c&2)? y : ~y;
	const uint64_t lo_match = (c&1)? y : ~y;
	uint64_t bits = (hi_match >> 1) & lo_match & 0x5555555555555555ull;
	// count the number of 1s: per nibble, then per byte, then sum the bytes
	bits = (bits & 0x3333333333333333ull) + ((bits >> 2) & 0x3333333333333333ull);
	bits = (bits + (bits >> 4)) & 0xf0f0f0f0f0f0f0full;
	return (bits * 0x101010101010101ull) >> 56;
}
|
89
|
+
|
90
|
+
/*
 * Return the occurrence count of symbol c for row k.
 *
 * Special rows: k == seq_len yields L2[c+1] - L2[c]; k == (bwtint_t)-1
 * yields 0. Otherwise the count stored at the start of the containing
 * OCC_INTERVAL block is extended by scanning the packed cells up to k.
 */
inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
{
	bwtint_t count, pos, last_word;
	uint32_t *cell;

	if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
	if (k == (bwtint_t)(-1)) return 0;
	if (k >= bwt->primary) --k; // because $ is not in bwt

	// retrieve Occ at k/OCC_INTERVAL
	cell = bwt_occ_intv(bwt, k);
	count = cell[c];
	cell += 4; // jump to the start of the first BWT cell

	// calculate Occ up to the last k/32
	last_word = k >> 5 << 5;
	pos = k/OCC_INTERVAL*OCC_INTERVAL;
	while (pos < last_word) {
		count += __occ_aux((uint64_t)cell[0]<<32 | cell[1], c);
		pos += 32;
		cell += 2;
	}

	// calculate Occ within the final word, masking out the cells after k
	count += __occ_aux(((uint64_t)cell[0]<<32 | cell[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
	if (c == 0) count -= ~k&31; // corrected for the masked bits

	return count;
}
|
114
|
+
|
115
|
+
// an analogy to bwt_occ() but more efficient, requiring k <= l
|
116
|
+
/*
 * Compute bwt_occ(bwt, k, c) into *ok and bwt_occ(bwt, l, c) into *ol.
 * Requires k <= l. When both rows fall into the same OCC_INTERVAL block the
 * block is scanned once for both results; otherwise bwt_occ() is called twice.
 */
inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
{
	bwtint_t _k, _l;
	bwtint_t cnt_k, cnt_l, pos, last_word;
	uint32_t *cell;

	if (k == l) {
		*ok = *ol = bwt_occ(bwt, k, c);
		return;
	}
	// rows at or beyond primary are shifted down by one ($ is not in bwt)
	_k = (k >= bwt->primary)? k-1 : k;
	_l = (l >= bwt->primary)? l-1 : l;
	if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
		*ok = bwt_occ(bwt, k, c);
		*ol = bwt_occ(bwt, l, c);
		return;
	}

	// same block: continue with the shifted rows
	k = _k;
	l = _l;
	cell = bwt_occ_intv(bwt, k);
	cnt_k = cell[c];
	cell += 4;
	// calculate *ok
	last_word = k >> 5 << 5;
	for (pos = k/OCC_INTERVAL*OCC_INTERVAL; pos < last_word; pos += 32, cell += 2)
		cnt_k += __occ_aux((uint64_t)cell[0]<<32 | cell[1], c);
	cnt_l = cnt_k; // the scan for l resumes from the full words counted so far
	cnt_k += __occ_aux(((uint64_t)cell[0]<<32 | cell[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
	if (c == 0) cnt_k -= ~k&31; // corrected for the masked bits
	*ok = cnt_k;
	// calculate *ol
	last_word = l >> 5 << 5;
	for (; pos < last_word; pos += 32, cell += 2)
		cnt_l += __occ_aux((uint64_t)cell[0]<<32 | cell[1], c);
	cnt_l += __occ_aux(((uint64_t)cell[0]<<32 | cell[1]) & ~((1ull<<((~l&31)<<1)) - 1), c);
	if (c == 0) cnt_l -= ~l&31; // corrected for the masked bits
	*ol = cnt_l;
}
|
152
|
+
|
153
|
+
/* Sum of the cnt_table entries selected by each of the four bytes of the
 * 32-bit word b. Note that b is evaluated four times. */
#define __occ_aux4(bwt, b) \
	((bwt)->cnt_table[(b)&0xff] \
	 + (bwt)->cnt_table[(b)>>8&0xff] \
	 + (bwt)->cnt_table[(b)>>16&0xff] \
	 + (bwt)->cnt_table[(b)>>24])
|
156
|
+
|
157
|
+
/*
 * Fill cnt[0..3] with the occurrence counts of the four symbols for row k.
 * For k == (bwtint_t)-1 all four counts are zero. Otherwise the counts stored
 * at the start of the containing OCC_INTERVAL block are extended by a
 * table-driven scan (cnt_table) of the packed cells up to k.
 */
inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
{
	bwtint_t pos, last_word, acc;
	uint32_t *cell;

	if (k == (bwtint_t)(-1)) {
		cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
		return;
	}
	if (k >= bwt->primary) --k; // because $ is not in bwt
	cell = bwt_occ_intv(bwt, k);
	memcpy(cnt, cell, 16); // the four counts stored at the head of the block
	cell += 4;
	last_word = k >> 4 << 4;
	acc = 0;
	pos = k / OCC_INTERVAL * OCC_INTERVAL;
	while (pos < last_word) {
		acc += __occ_aux4(bwt, *cell);
		pos += 16;
		++cell;
	}
	// final word: mask out the cells after k, then remove the masked cells
	// from the lowest byte of the packed sum
	acc += __occ_aux4(bwt, *cell & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
	// unpack the four per-symbol byte counters
	cnt[0] += acc&0xff;
	cnt[1] += acc>>8&0xff;
	cnt[2] += acc>>16&0xff;
	cnt[3] += acc>>24;
}
|
175
|
+
|
176
|
+
// an analogy to bwt_occ4() but more efficient, requiring k <= l
|
177
|
+
/*
 * Compute bwt_occ4(bwt, k, cntk) and bwt_occ4(bwt, l, cntl) together.
 * Requires k <= l. When both rows fall into the same OCC_INTERVAL block the
 * block is scanned once for both results; otherwise bwt_occ4() is called twice.
 *
 * Changes over the previous version: the unused local array cl[] is gone,
 * and the copy from cntk to cntl is sized as 4 * sizeof(bwtint_t), matching
 * the k == l branch, instead of a hard-coded 16 bytes.
 */
inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
{
	bwtint_t _k, _l;
	if (k == l) {
		bwt_occ4(bwt, k, cntk);
		memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
		return;
	}
	_k = (k >= bwt->primary)? k-1 : k;
	_l = (l >= bwt->primary)? l-1 : l;
	if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
		bwt_occ4(bwt, k, cntk);
		bwt_occ4(bwt, l, cntl);
	} else {
		bwtint_t i, j, x, y;
		uint32_t *p;
		if (k >= bwt->primary) --k; // because $ is not in bwt
		if (l >= bwt->primary) --l;
		p = bwt_occ_intv(bwt, k);
		memcpy(cntk, p, 4 * sizeof(bwtint_t));
		p += 4;
		// prepare cntk[]
		j = k >> 4 << 4;
		for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p)
			x += __occ_aux4(bwt, *p);
		y = x; // the scan for l resumes from the full words counted so far
		x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
		// calculate cntl[] and finalize cntk[]
		j = l >> 4 << 4;
		for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p);
		y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15);
		// both results start from the counts stored at the head of the block
		memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
		cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
		cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
	}
}
|
215
|
+
|
216
|
+
/*
 * Search for the exact string str[0..len-1], processing it from its last
 * symbol to its first and narrowing the row interval [k, l], which starts as
 * [0, seq_len].
 *
 * Returns the interval size l - k + 1 and, when the pointers are non-null,
 * stores the bounds in *sa_begin and *sa_end. Returns 0, leaving the outputs
 * untouched, when there is no match or str contains a symbol greater than 3.
 */
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
{
	bwtint_t lo = 0, hi = bwt->seq_len;
	bwtint_t occ_lo, occ_hi;
	int pos = len;

	while (--pos >= 0) {
		ubyte_t c = str[pos];
		if (c > 3) return 0; // no match
		bwt_2occ(bwt, lo - 1, hi, c, &occ_lo, &occ_hi);
		lo = bwt->L2[c] + occ_lo + 1;
		hi = bwt->L2[c] + occ_hi;
		if (lo > hi) return 0; // no match
	}
	if (sa_begin) *sa_begin = lo;
	if (sa_end) *sa_end = hi;
	return hi - lo + 1;
}
|
234
|
+
|
235
|
+
/*
 * Like bwt_match_exact(), but the search starts from the interval
 * [*k0, *l0] supplied by the caller rather than from the full range.
 *
 * On a match the narrowed interval is written back to *k0 and *l0 and its
 * size l - k + 1 is returned. Returns 0, leaving *k0 and *l0 untouched, when
 * there is no match or str contains a symbol greater than 3.
 */
int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0)
{
	bwtint_t lo = *k0, hi = *l0;
	bwtint_t occ_lo, occ_hi;
	int pos = len;

	while (--pos >= 0) {
		ubyte_t c = str[pos];
		if (c > 3) return 0; // there is an N here. no match
		bwt_2occ(bwt, lo - 1, hi, c, &occ_lo, &occ_hi);
		lo = bwt->L2[c] + occ_lo + 1;
		hi = bwt->L2[c] + occ_hi;
		if (lo > hi) return 0; // no match
	}
	*k0 = lo;
	*l0 = hi;
	return hi - lo + 1;
}
|