bio-bwa 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +35 -0
- data/README.rdoc +33 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/bio-bwa.gemspec +152 -0
- data/doc/Bio.html +93 -0
- data/doc/Bio/BWA.html +2884 -0
- data/doc/Bio/BWA/Library.html +229 -0
- data/doc/_index.html +119 -0
- data/doc/class_list.html +36 -0
- data/doc/css/common.css +1 -0
- data/doc/css/full_list.css +53 -0
- data/doc/css/style.css +310 -0
- data/doc/file.LICENSE.html +88 -0
- data/doc/file.README.html +119 -0
- data/doc/file_list.html +41 -0
- data/doc/frames.html +13 -0
- data/doc/index.html +119 -0
- data/doc/js/app.js +203 -0
- data/doc/js/full_list.js +149 -0
- data/doc/js/jquery.js +154 -0
- data/doc/method_list.html +171 -0
- data/doc/top-level-namespace.html +88 -0
- data/ext/COPYING +674 -0
- data/ext/ChangeLog +3864 -0
- data/ext/NEWS +555 -0
- data/ext/README +29 -0
- data/ext/bamlite.c +155 -0
- data/ext/bamlite.h +94 -0
- data/ext/bntseq.c +303 -0
- data/ext/bntseq.h +80 -0
- data/ext/bwa.1 +562 -0
- data/ext/bwape.c +807 -0
- data/ext/bwase.c +686 -0
- data/ext/bwase.h +27 -0
- data/ext/bwaseqio.c +222 -0
- data/ext/bwt.c +250 -0
- data/ext/bwt.h +105 -0
- data/ext/bwt_gen/Makefile +23 -0
- data/ext/bwt_gen/QSufSort.c +496 -0
- data/ext/bwt_gen/QSufSort.h +40 -0
- data/ext/bwt_gen/bwt_gen.c +1547 -0
- data/ext/bwt_gen/bwt_gen.h +105 -0
- data/ext/bwt_lite.c +94 -0
- data/ext/bwt_lite.h +29 -0
- data/ext/bwtaln.c +345 -0
- data/ext/bwtaln.h +150 -0
- data/ext/bwtgap.c +264 -0
- data/ext/bwtgap.h +38 -0
- data/ext/bwtindex.c +186 -0
- data/ext/bwtio.c +77 -0
- data/ext/bwtmisc.c +269 -0
- data/ext/bwtsw2.h +51 -0
- data/ext/bwtsw2_aux.c +650 -0
- data/ext/bwtsw2_chain.c +107 -0
- data/ext/bwtsw2_core.c +594 -0
- data/ext/bwtsw2_main.c +100 -0
- data/ext/cs2nt.c +191 -0
- data/ext/is.c +218 -0
- data/ext/khash.h +506 -0
- data/ext/kseq.h +208 -0
- data/ext/ksort.h +269 -0
- data/ext/kstring.c +35 -0
- data/ext/kstring.h +46 -0
- data/ext/kvec.h +90 -0
- data/ext/main.c +63 -0
- data/ext/main.h +29 -0
- data/ext/mkrf_conf.rb +49 -0
- data/ext/qualfa2fq.pl +27 -0
- data/ext/simple_dp.c +162 -0
- data/ext/simpletest.c +23 -0
- data/ext/solid2fastq.pl +111 -0
- data/ext/stdaln.c +1072 -0
- data/ext/stdaln.h +162 -0
- data/ext/utils.c +82 -0
- data/ext/utils.h +54 -0
- data/lib/bio-bwa.rb +7 -0
- data/lib/bio/bwa.rb +312 -0
- data/lib/bio/bwa/library.rb +42 -0
- data/test/data/testdata.fa +602 -0
- data/test/data/testdata.long.fa +175 -0
- data/test/data/testdata.short.fa +2 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-bwa_basic.rb +62 -0
- data/test/test_bio-bwa_make_index.rb +42 -0
- data/test/test_bio-bwa_run_aln.rb +49 -0
- data/test/test_bio-bwa_sam_conversion.rb +49 -0
- metadata +218 -0
data/ext/bwtsw2_main.c
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
#include <unistd.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include <stdio.h>
|
5
|
+
#include <math.h>
|
6
|
+
#include "bwt.h"
|
7
|
+
#include "bwtsw2.h"
|
8
|
+
#include "utils.h"
|
9
|
+
|
10
|
+
int bwa_bwtsw2(int argc, char *argv[])
|
11
|
+
{
|
12
|
+
bsw2opt_t *opt;
|
13
|
+
bwt_t *target[2];
|
14
|
+
char buf[1024];
|
15
|
+
bntseq_t *bns;
|
16
|
+
int c;
|
17
|
+
|
18
|
+
opt = bsw2_init_opt();
|
19
|
+
srand48(11);
|
20
|
+
optind = 1;
|
21
|
+
while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:y:s:c:N:Hf:")) >= 0) {
|
22
|
+
switch (c) {
|
23
|
+
case 'q': opt->q = atoi(optarg); break;
|
24
|
+
case 'r': opt->r = atoi(optarg); break;
|
25
|
+
case 'a': opt->a = atoi(optarg); break;
|
26
|
+
case 'b': opt->b = atoi(optarg); break;
|
27
|
+
case 'w': opt->bw = atoi(optarg); break;
|
28
|
+
case 'T': opt->t = atoi(optarg); break;
|
29
|
+
case 't': opt->n_threads = atoi(optarg); break;
|
30
|
+
case 'z': opt->z = atoi(optarg); break;
|
31
|
+
case 'y': opt->yita = atof(optarg); break;
|
32
|
+
case 's': opt->is = atoi(optarg); break;
|
33
|
+
case 'm': opt->mask_level = atof(optarg); break;
|
34
|
+
case 'c': opt->coef = atof(optarg); break;
|
35
|
+
case 'N': opt->t_seeds = atoi(optarg); break;
|
36
|
+
case 'H': opt->hard_clip = 1; break;
|
37
|
+
case 'f': xreopen(optarg, "w", stdout); break;
|
38
|
+
}
|
39
|
+
}
|
40
|
+
opt->qr = opt->q + opt->r;
|
41
|
+
|
42
|
+
if (optind + 2 > argc) {
|
43
|
+
fprintf(stderr, "\n");
|
44
|
+
fprintf(stderr, "Usage: bwa bwasw [options] <target.prefix> <query.fa>\n\n");
|
45
|
+
fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a);
|
46
|
+
fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b);
|
47
|
+
fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q);
|
48
|
+
fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r);
|
49
|
+
// fprintf(stderr, " -y FLOAT error recurrence coef. (4..16) [%.1f]\n", opt->yita);
|
50
|
+
fprintf(stderr, "\n");
|
51
|
+
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
52
|
+
fprintf(stderr, " -s INT size of a chunk of reads [%d]\n", opt->chunk_size);
|
53
|
+
fprintf(stderr, "\n");
|
54
|
+
fprintf(stderr, " -w INT band width [%d]\n", opt->bw);
|
55
|
+
fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level);
|
56
|
+
fprintf(stderr, "\n");
|
57
|
+
fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t);
|
58
|
+
fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is);
|
59
|
+
fprintf(stderr, " -z INT Z-best [%d]\n", opt->z);
|
60
|
+
fprintf(stderr, " -N INT # seeds to trigger reverse alignment [%d]\n", opt->t_seeds);
|
61
|
+
fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef);
|
62
|
+
fprintf(stderr, " -H in SAM output, use hard clipping rather than soft\n");
|
63
|
+
fprintf(stderr, " -f FILE file to output results to instead of stdout\n\n");
|
64
|
+
fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n");
|
65
|
+
fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n");
|
66
|
+
fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. One may also\n");
|
67
|
+
fprintf(stderr, " increase '-z' for better sensitivity.\n");
|
68
|
+
fprintf(stderr, "\n");
|
69
|
+
|
70
|
+
if (0) {
|
71
|
+
double c, theta, eps, delta;
|
72
|
+
c = opt->a / log(opt->yita);
|
73
|
+
theta = exp(-opt->b / c) / opt->yita;
|
74
|
+
eps = exp(-opt->q / c);
|
75
|
+
delta = exp(-opt->r / c);
|
76
|
+
fprintf(stderr, "mismatch: %lf, gap_open: %lf, gap_ext: %lf\n\n",
|
77
|
+
theta, eps, delta);
|
78
|
+
}
|
79
|
+
return 1;
|
80
|
+
}
|
81
|
+
|
82
|
+
// adjust opt for opt->a
|
83
|
+
opt->t *= opt->a;
|
84
|
+
opt->coef *= opt->a;
|
85
|
+
|
86
|
+
strcpy(buf, argv[optind]); target[0] = bwt_restore_bwt(strcat(buf, ".bwt"));
|
87
|
+
strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".sa"), target[0]);
|
88
|
+
strcpy(buf, argv[optind]); target[1] = bwt_restore_bwt(strcat(buf, ".rbwt"));
|
89
|
+
strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".rsa"), target[1]);
|
90
|
+
bns = bns_restore(argv[optind]);
|
91
|
+
|
92
|
+
bsw2_aln(opt, bns, target, argv[optind+1]);
|
93
|
+
|
94
|
+
bns_destroy(bns);
|
95
|
+
bwt_destroy(target[0]); bwt_destroy(target[1]);
|
96
|
+
free(opt);
|
97
|
+
fflush(stdout);
|
98
|
+
xreopen("/dev/tty","w",stdout);
|
99
|
+
return 0;
|
100
|
+
}
|
data/ext/cs2nt.c
ADDED
@@ -0,0 +1,191 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdint.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include "bwtaln.h"
|
5
|
+
#include "stdaln.h"
|
6
|
+
|
7
|
+
/*
|
8
|
+
Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we
|
9
|
+
decode as ATTGAC(RBGOG), there is one color change and one nt change;
|
10
|
+
if we decode as ATTAAC(RBRBG), there are two color changes.
|
11
|
+
|
12
|
+
In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM
|
13
|
+
as the penalty; otherwise, we will use color quality as the
|
14
|
+
penalty. This means we always prefer two consistent color changes over
|
15
|
+
a nt change, but if a color has high quality, we may prefer one nt
|
16
|
+
change.
|
17
|
+
|
18
|
+
In the above example, the penalties of the two types of decoding are
|
19
|
+
q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first;
|
20
|
+
otherwise the second. Note that no matter what we choose, the fourth
|
21
|
+
base will get a low nt quality.
|
22
|
+
*/
|
23
|
+
|
24
|
+
#define COLOR_MM 19 /* minimum penalty charged for a color mismatch */
#define NUCL_MM 25  /* penalty charged for a nucleotide mismatch against the reference */

/* Color of a base pair, indexed by (1<<base_a | 1<<base_b); 4 marks unused indexes. */
static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 };

/*
 * Decode a color-space read into nucleotides by dynamic programming.
 *
 * Base encoding: {A,C,G,T,N} -> {0,1,2,3,4}.
 *
 *   nt_ref[0..size]     reference nucleotides, 0/1/2/3/4
 *   cs_read[0..size-1]  colors with quality: color<<6 | qual; qual==63 means N
 *   nt_read[0..size]    decoded nucleotides 0/1/2/3 (output)
 *   btarray[0..4*size+3] backtrack working space (entries 4..4*size+3 are written)
 *
 * A color that disagrees with the pair (previous base, current base) costs
 * max(qual, COLOR_MM); a base that disagrees with a non-N reference base
 * costs NUCL_MM. The cheapest path is returned in nt_read.
 */
void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray)
{
	/* score[0..3] and score[4..7] alternate as "previous" and "current" rows */
	int score[8];
	int prev_row = 0, cur_row = 1;
	int pos, base, from, best_base, best_score;

	/* initial row: free choice when the leading reference base is N */
	if (nt_ref[0] < 4) {
		for (base = 0; base < 4; ++base) score[base] = NUCL_MM;
		score[nt_ref[0]] = 0;
	} else {
		for (base = 0; base < 4; ++base) score[base] = 0;
	}

	/* fill the table one read position at a time */
	for (pos = 1; pos <= size; ++pos) {
		const int color = cs_read[pos-1] >> 6;
		const int qual = cs_read[pos-1] & 0x3f;
		const int color_cost = (qual < COLOR_MM)? COLOR_MM : qual;
		const int ref = nt_ref[pos];

		for (base = 0; base < 4; ++base) {
			int best = 0x7fffffff, best_from = 0;
			for (from = 0; from < 4; ++from) {
				int cost = score[prev_row<<2 | from];
				if (qual != 63 && color != nst_ntnt2cs_table[1<<base | 1<<from])
					cost += color_cost; /* color mismatch */
				if (ref < 4 && ref != base)
					cost += NUCL_MM; /* nt mismatch */
				if (cost < best) {
					best = cost;
					best_from = from;
				}
			}
			score[cur_row<<2 | base] = best;
			btarray[pos<<2 | base] = best_from;
		}
		prev_row = cur_row;
		cur_row = 1 - cur_row;
	}

	/* pick the cheapest final base (lowest index wins ties), then walk back */
	best_score = 0x7fffffff;
	best_base = 0;
	for (base = 0; base < 4; ++base) {
		if (score[prev_row<<2 | base] < best_score) {
			best_score = score[prev_row<<2 | base];
			best_base = base;
		}
	}
	nt_read[size] = best_base;
	for (pos = size; pos > 0; --pos)
		nt_read[pos-1] = btarray[pos<<2 | nt_read[pos]];
}
|
78
|
+
/*
|
79
|
+
nt_read[0..size]: nucleotide read sequence: 0/1/2/3
|
80
|
+
cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
|
81
|
+
tarray[0..size*2-1]: temporary array
|
82
|
+
*/
|
83
|
+
uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray)
|
84
|
+
{
|
85
|
+
int k, c1, c2;
|
86
|
+
uint8_t *t2array = tarray + size;
|
87
|
+
// get the color sequence of nt_read
|
88
|
+
c1 = nt_read[0];
|
89
|
+
for (k = 1; k <= size; ++k) {
|
90
|
+
c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case
|
91
|
+
tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<<c1 | 1<<c2];
|
92
|
+
c1 = c2;
|
93
|
+
}
|
94
|
+
for (k = 1; k != size; ++k) {
|
95
|
+
int q = 0;
|
96
|
+
if (tarray[k-1] == cs_read[k-1]>>6 && tarray[k] == cs_read[k]>>6) {
|
97
|
+
q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10;
|
98
|
+
} else if (tarray[k-1] == cs_read[k-1]>>6) {
|
99
|
+
q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f);
|
100
|
+
} else if (tarray[k] == cs_read[k]>>6) {
|
101
|
+
q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f);
|
102
|
+
} // else, q = 0
|
103
|
+
if (q < 0) q = 0;
|
104
|
+
if (q > 60) q = 60;
|
105
|
+
t2array[k] = nt_read[k]<<6 | q;
|
106
|
+
if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0;
|
107
|
+
}
|
108
|
+
return t2array + 1; // of size-2
|
109
|
+
}
|
110
|
+
|
111
|
+
// this function will be called when p->seq has been reversed by refine_gapped()
|
112
|
+
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac)
|
113
|
+
{
|
114
|
+
uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read;
|
115
|
+
int i, len;
|
116
|
+
uint8_t *seq;
|
117
|
+
|
118
|
+
// set temporary arrays
|
119
|
+
if (p->type == BWA_TYPE_NO_MATCH) return;
|
120
|
+
len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space
|
121
|
+
ta = (uint8_t*)malloc(len * 7);
|
122
|
+
nt_ref = ta;
|
123
|
+
cs_read = nt_ref + len;
|
124
|
+
nt_read = cs_read + len;
|
125
|
+
btarray = nt_read + len;
|
126
|
+
tarray = nt_read + len;
|
127
|
+
|
128
|
+
#define __gen_csbase(_cs, _i, _seq) do { \
|
129
|
+
int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \
|
130
|
+
if (q > 60) q = 60; \
|
131
|
+
if (_seq[_i] > 3) q = 63; \
|
132
|
+
(_cs) = _seq[_i]<<6 | q; \
|
133
|
+
} while (0)
|
134
|
+
|
135
|
+
// generate len, nt_ref[] and cs_read
|
136
|
+
seq = p->strand? p->rseq : p->seq;
|
137
|
+
nt_ref[0] = p->pos? bns_pac(pac, p->pos-1) : 4;
|
138
|
+
if (p->cigar == 0) { // no gap or clipping
|
139
|
+
len = p->len;
|
140
|
+
for (i = 0; i < p->len; ++i) {
|
141
|
+
__gen_csbase(cs_read[i], i, seq);
|
142
|
+
nt_ref[i+1] = bns_pac(pac, p->pos + i);
|
143
|
+
}
|
144
|
+
} else {
|
145
|
+
int k, z;
|
146
|
+
bwtint_t x, y;
|
147
|
+
x = p->pos; y = 0;
|
148
|
+
for (k = z = 0; k < p->n_cigar; ++k) {
|
149
|
+
int l = __cigar_len(p->cigar[k]);
|
150
|
+
if (__cigar_op(p->cigar[k]) == FROM_M) {
|
151
|
+
for (i = 0; i < l; ++i, ++x, ++y) {
|
152
|
+
__gen_csbase(cs_read[z], y, seq);
|
153
|
+
nt_ref[z+1] = bns_pac(pac, x);
|
154
|
+
++z;
|
155
|
+
}
|
156
|
+
} else if (__cigar_op(p->cigar[k]) == FROM_I) {
|
157
|
+
for (i = 0; i < l; ++i, ++y) {
|
158
|
+
__gen_csbase(cs_read[z], y, seq);
|
159
|
+
nt_ref[z+1] = 4;
|
160
|
+
++z;
|
161
|
+
}
|
162
|
+
} else if (__cigar_op(p->cigar[k]) == FROM_S) y += l;
|
163
|
+
else x += l;
|
164
|
+
}
|
165
|
+
len = z;
|
166
|
+
}
|
167
|
+
|
168
|
+
cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray);
|
169
|
+
new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray);
|
170
|
+
|
171
|
+
// update p
|
172
|
+
p->len = p->full_len = len - 1;
|
173
|
+
for (i = 0; i < p->len; ++i) {
|
174
|
+
if ((new_nt_read[i]&0x3f) == 63) {
|
175
|
+
p->qual[i] = 33; seq[i] = 4;
|
176
|
+
} else {
|
177
|
+
p->qual[i] = (new_nt_read[i]&0x3f) + 33;
|
178
|
+
seq[i] = new_nt_read[i]>>6;
|
179
|
+
}
|
180
|
+
}
|
181
|
+
p->qual[p->len] = seq[p->len] = 0;
|
182
|
+
if (p->strand) {
|
183
|
+
memcpy(p->seq, seq, p->len);
|
184
|
+
seq_reverse(p->len, p->seq, 1);
|
185
|
+
seq_reverse(p->len, p->qual, 0);
|
186
|
+
} else {
|
187
|
+
memcpy(p->rseq, seq, p->len);
|
188
|
+
seq_reverse(p->len, p->rseq, 1);
|
189
|
+
}
|
190
|
+
free(ta);
|
191
|
+
}
|
data/ext/is.c
ADDED
@@ -0,0 +1,218 @@
|
|
1
|
+
/*
|
2
|
+
* sais.c for sais-lite
|
3
|
+
* Copyright (c) 2008 Yuta Mori All Rights Reserved.
|
4
|
+
*
|
5
|
+
* Permission is hereby granted, free of charge, to any person
|
6
|
+
* obtaining a copy of this software and associated documentation
|
7
|
+
* files (the "Software"), to deal in the Software without
|
8
|
+
* restriction, including without limitation the rights to use,
|
9
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
* copies of the Software, and to permit persons to whom the
|
11
|
+
* Software is furnished to do so, subject to the following
|
12
|
+
* conditions:
|
13
|
+
*
|
14
|
+
* The above copyright notice and this permission notice shall be
|
15
|
+
* included in all copies or substantial portions of the Software.
|
16
|
+
*
|
17
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
19
|
+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
20
|
+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
21
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
22
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
23
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
24
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
25
|
+
*/
|
26
|
+
|
27
|
+
#include <stdlib.h>
|
28
|
+
|
29
|
+
typedef unsigned char ubyte_t;
|
30
|
+
/* Symbol i of the text T: read as int when cs == sizeof(int), as a byte otherwise. */
#define chr(i) (cs == sizeof(int) ? ((const int *)T)[i]:((const unsigned char *)T)[i])

/*
 * Count how often each symbol occurs in T[0..n-1].
 * C[0..k-1] is zeroed first and then receives the counts; cs is the symbol
 * width in bytes as understood by chr().
 */
static void getCounts(const unsigned char *T, int *C, int n, int k, int cs)
{
	int idx;

	for (idx = k; idx-- > 0; )
		C[idx] = 0;
	for (idx = 0; idx < n; ++idx)
		C[chr(idx)] += 1;
}
|
39
|
+
/*
 * Turn symbol counts C[0..k-1] into bucket boundaries B[0..k-1].
 * With end != 0, B[i] is the end (exclusive) of bucket i, i.e. the running
 * total including C[i]; with end == 0 it is the start of bucket i, i.e. the
 * running total before C[i]. C and B may be the same array.
 */
static void getBuckets(const int *C, int *B, int k, int end)
{
	int sym, total = 0;

	for (sym = 0; sym < k; ++sym) {
		const int start = total; /* read C[sym] before B[sym] is written */
		total += C[sym];
		B[sym] = end ? total : start;
	}
}
|
54
|
+
|
55
|
+
/*
 * Compute SA by two induction passes over the suffixes already placed in it.
 *
 * Pass 1 ("SAl") scans SA left to right and fills buckets from their starts;
 * pass 2 ("SAs") scans right to left and fills buckets from their ends.
 * When C and B are the same array the counts are recomputed before each pass,
 * because getBuckets() overwrites them. Entries are temporarily stored
 * bit-complemented (~j) as markers while the passes run.
 */
static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs)
{
	int pos;      /* next slot of the bucket currently being filled */
	int i, suf;
	int sym, cur; /* symbol just read, and symbol of the current bucket */

	/* compute SAl */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 0); /* find starts of buckets */
	suf = n - 1;
	cur = chr(suf);
	pos = B[cur];
	if ((0 < suf) && (chr(suf - 1) < cur)) SA[pos] = ~suf;
	else SA[pos] = suf;
	++pos;
	for (i = 0; i < n; ++i) {
		suf = SA[i];
		SA[i] = ~suf;
		if (0 < suf) {
			--suf;
			sym = chr(suf);
			if (sym != cur) {
				/* save this bucket's cursor and switch to the new bucket */
				B[cur] = pos;
				cur = sym;
				pos = B[cur];
			}
			if ((0 < suf) && (chr(suf - 1) < cur)) SA[pos] = ~suf;
			else SA[pos] = suf;
			++pos;
		}
	}

	/* compute SAs */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1); /* find ends of buckets */
	cur = 0;
	pos = B[cur];
	for (i = n - 1; 0 <= i; --i) {
		suf = SA[i];
		if (0 < suf) {
			--suf;
			sym = chr(suf);
			if (sym != cur) {
				B[cur] = pos;
				cur = sym;
				pos = B[cur];
			}
			--pos;
			if ((suf == 0) || (chr(suf - 1) > cur)) SA[pos] = ~suf;
			else SA[pos] = suf;
		} else {
			SA[i] = ~suf;
		}
	}
}
|
91
|
+
|
92
|
+
/*
 * Find the suffix array SA of T[0..n-1], where every symbol lies in
 * {0..k-1}. Uses a working space (excluding T and SA) of at most 2n+O(1)
 * for a constant alphabet.
 *
 *   fs  number of spare ints available directly after SA[n-1]; the bucket
 *       arrays live there when k <= fs and are malloc'ed otherwise
 *   cs  symbol width in bytes (sizeof(int) selects int symbols, see chr())
 *
 * Returns 0 on success, -2 if a bucket allocation or the recursive call fails.
 */
static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs)
{
	int *counts, *buckets, *reduced;
	int i, j, m, name;
	int flag;       /* scan state carried through the right-to-left text scans */
	int sym, right; /* symbol at the scan position and the symbol to its right */
	int cur, cur_len, last, last_len, differs;

	/* stage 1: reduce the problem by at least 1/2 sort all the
	 * S-substrings */
	if (k <= fs) {
		counts = SA + n;
		buckets = (k <= (fs - k)) ? counts + k : counts;
	} else {
		counts = buckets = (int *) malloc(k * sizeof(int));
		if (counts == NULL) return -2;
	}
	getCounts(T, counts, n, k, cs);
	getBuckets(counts, buckets, k, 1); /* find ends of buckets */
	for (i = 0; i < n; ++i) SA[i] = 0;
	flag = 0;
	right = chr(n - 1);
	for (i = n - 2; 0 <= i; --i) {
		sym = chr(i);
		if (sym < (right + flag)) {
			flag = 1;
		} else if (flag != 0) {
			buckets[right] -= 1;
			SA[buckets[right]] = i + 1;
			flag = 0;
		}
		right = sym;
	}
	induceSA(T, SA, counts, buckets, n, k, cs);
	if (fs < k) free(counts);

	/* compact all the sorted substrings into the first m items of SA
	 * 2*m must be not larger than n (proveable) */
	m = 0;
	for (i = 0; i < n; ++i) {
		cur = SA[i];
		if (0 < cur) {
			sym = chr(cur);
			if (chr(cur - 1) > sym) {
				j = cur + 1;
				while ((j < n) && (sym == (right = chr(j)))) ++j;
				if ((j < n) && (sym < right)) SA[m++] = cur;
			}
		}
	}
	for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */

	/* store the length of all substrings */
	j = n;
	flag = 0;
	right = chr(n - 1);
	for (i = n - 2; 0 <= i; --i) {
		sym = chr(i);
		if (sym < (right + flag)) {
			flag = 1;
		} else if (flag != 0) {
			SA[m + ((i + 1) >> 1)] = j - i - 1;
			j = i + 1;
			flag = 0;
		}
		right = sym;
	}

	/* find the lexicographic names of all substrings */
	name = 0;
	last = n;
	last_len = 0;
	for (i = 0; i < m; ++i) {
		cur = SA[i];
		cur_len = SA[m + (cur >> 1)];
		differs = 1;
		if (cur_len == last_len) {
			j = 0;
			while ((j < cur_len) && (chr(cur + j) == chr(last + j))) j++;
			if (j == cur_len) differs = 0;
		}
		if (differs != 0) {
			++name;
			last = cur;
			last_len = cur_len;
		}
		SA[m + (cur >> 1)] = name;
	}

	/* stage 2: solve the reduced problem recurse if names are not yet
	 * unique */
	if (name < m) {
		reduced = SA + n + fs - m;
		j = m - 1;
		for (i = n - 1; m <= i; --i) {
			if (SA[i] != 0) {
				reduced[j] = SA[i] - 1;
				--j;
			}
		}
		if (sais_main((unsigned char *) reduced, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2;
		j = m - 1;
		flag = 0;
		right = chr(n - 1);
		for (i = n - 2; 0 <= i; --i) {
			sym = chr(i);
			if (sym < (right + flag)) {
				flag = 1;
			} else if (flag != 0) {
				reduced[j] = i + 1; /* get p1 */
				--j;
				flag = 0;
			}
			right = sym;
		}
		for (i = 0; i < m; ++i) SA[i] = reduced[SA[i]]; /* get index */
	}

	/* stage 3: induce the result for the original problem */
	if (k <= fs) {
		counts = SA + n;
		buckets = (k <= (fs - k)) ? counts + k : counts;
	} else {
		counts = buckets = (int *) malloc(k * sizeof(int));
		if (counts == NULL) return -2;
	}
	/* put all left-most S characters into their buckets */
	getCounts(T, counts, n, k, cs);
	getBuckets(counts, buckets, k, 1); /* find ends of buckets */
	for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
	for (i = m - 1; 0 <= i; --i) {
		j = SA[i];
		SA[i] = 0;
		sym = chr(j);
		buckets[sym] -= 1;
		SA[buckets[sym]] = j;
	}
	induceSA(T, SA, counts, buckets, n, k, cs);
	if (fs < k) free(counts);
	return 0;
}
|
179
|
+
|
180
|
+
/**
|
181
|
+
* Constructs the suffix array of a given string.
|
182
|
+
* @param T[0..n-1] The input string.
|
183
|
+
* @param SA[0..n] The output array of suffixes.
|
184
|
+
* @param n The length of the given string.
|
185
|
+
* @return 0 if no error occurred
|
186
|
+
*/
|
187
|
+
int is_sa(const ubyte_t *T, int *SA, int n)
|
188
|
+
{
|
189
|
+
if ((T == NULL) || (SA == NULL) || (n < 0)) return -1;
|
190
|
+
SA[0] = n;
|
191
|
+
if (n <= 1) {
|
192
|
+
if (n == 1) SA[1] = 0;
|
193
|
+
return 0;
|
194
|
+
}
|
195
|
+
return sais_main(T, SA+1, 0, n, 256, 1);
|
196
|
+
}
|
197
|
+
|
198
|
+
/**
|
199
|
+
* Constructs the burrows-wheeler transformed string of a given string.
|
200
|
+
* @param T[0..n-1] The input string.
|
201
|
+
* @param n The length of the given string.
|
202
|
+
* @return The primary index if no error occurred, -1 or -2 otherwise.
|
203
|
+
*/
|
204
|
+
int is_bwt(ubyte_t *T, int n)
|
205
|
+
{
|
206
|
+
int *SA, i, primary = 0;
|
207
|
+
SA = (int*)calloc(n+1, sizeof(int));
|
208
|
+
is_sa(T, SA, n);
|
209
|
+
|
210
|
+
for (i = 0; i <= n; ++i) {
|
211
|
+
if (SA[i] == 0) primary = i;
|
212
|
+
else SA[i] = T[SA[i] - 1];
|
213
|
+
}
|
214
|
+
for (i = 0; i < primary; ++i) T[i] = SA[i];
|
215
|
+
for (; i < n; ++i) T[i] = SA[i + 1];
|
216
|
+
free(SA);
|
217
|
+
return primary;
|
218
|
+
}
|