divsufsort 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +77 -0
- data/ext/Makefile +149 -0
- data/ext/divsufsort.c +398 -0
- data/ext/divsufsort.h +191 -0
- data/ext/divsufsort.o +0 -0
- data/ext/divsufsort.so +0 -0
- data/ext/divsufsort_private.h +207 -0
- data/ext/divsufsort_ruby.c +227 -0
- data/ext/divsufsort_ruby.o +0 -0
- data/ext/extconf.rb +18 -0
- data/ext/lfs.h +56 -0
- data/ext/mkmf.log +266 -0
- data/ext/sssort.c +815 -0
- data/ext/sssort.o +0 -0
- data/ext/trsort.c +586 -0
- data/ext/trsort.o +0 -0
- data/ext/utils.c +381 -0
- data/ext/utils.o +0 -0
- data/libdivsufsort/COPYING +27 -0
- data/libdivsufsort/divsufsort.c +398 -0
- data/libdivsufsort/divsufsort.h +191 -0
- data/libdivsufsort/divsufsort_private.h +207 -0
- data/libdivsufsort/lfs.h +56 -0
- data/libdivsufsort/sssort.c +815 -0
- data/libdivsufsort/trsort.c +586 -0
- data/libdivsufsort/utils.c +381 -0
- metadata +80 -0
data/ext/trsort.o
ADDED
Binary file
|
data/ext/utils.c
ADDED
@@ -0,0 +1,381 @@
|
|
1
|
+
/*
|
2
|
+
* utils.c for libdivsufsort
|
3
|
+
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
4
|
+
*
|
5
|
+
* Permission is hereby granted, free of charge, to any person
|
6
|
+
* obtaining a copy of this software and associated documentation
|
7
|
+
* files (the "Software"), to deal in the Software without
|
8
|
+
* restriction, including without limitation the rights to use,
|
9
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
* copies of the Software, and to permit persons to whom the
|
11
|
+
* Software is furnished to do so, subject to the following
|
12
|
+
* conditions:
|
13
|
+
*
|
14
|
+
* The above copyright notice and this permission notice shall be
|
15
|
+
* included in all copies or substantial portions of the Software.
|
16
|
+
*
|
17
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
19
|
+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
20
|
+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
21
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
22
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
23
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
24
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
25
|
+
*/
|
26
|
+
|
27
|
+
#include "divsufsort_private.h"
|
28
|
+
|
29
|
+
|
30
|
+
/*- Private Function -*/
|
31
|
+
|
32
|
+
/* Binary search for inverse bwt. */
|
33
|
+
static
|
34
|
+
saidx_t
|
35
|
+
binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) {
|
36
|
+
saidx_t half, i;
|
37
|
+
for(i = 0, half = size >> 1;
|
38
|
+
0 < size;
|
39
|
+
size = half, half >>= 1) {
|
40
|
+
if(A[i + half] < value) {
|
41
|
+
i += half + 1;
|
42
|
+
half -= (size & 1) ^ 1;
|
43
|
+
}
|
44
|
+
}
|
45
|
+
return i;
|
46
|
+
}
|
47
|
+
|
48
|
+
|
49
|
+
/*- Functions -*/
|
50
|
+
|
51
|
+
/* Burrows-Wheeler transform. */
|
52
|
+
saint_t
|
53
|
+
bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA,
|
54
|
+
saidx_t n, saidx_t *idx) {
|
55
|
+
saidx_t *A, i, j, p, t;
|
56
|
+
saint_t c;
|
57
|
+
|
58
|
+
/* Check arguments. */
|
59
|
+
if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; }
|
60
|
+
if(n <= 1) {
|
61
|
+
if(n == 1) { U[0] = T[0]; }
|
62
|
+
*idx = n;
|
63
|
+
return 0;
|
64
|
+
}
|
65
|
+
|
66
|
+
if((A = SA) == NULL) {
|
67
|
+
i = divbwt(T, U, NULL, n);
|
68
|
+
if(0 <= i) { *idx = i; i = 0; }
|
69
|
+
return (saint_t)i;
|
70
|
+
}
|
71
|
+
|
72
|
+
/* BW transform. */
|
73
|
+
if(T == U) {
|
74
|
+
t = n;
|
75
|
+
for(i = 0, j = 0; i < n; ++i) {
|
76
|
+
p = t - 1;
|
77
|
+
t = A[i];
|
78
|
+
if(0 <= p) {
|
79
|
+
c = T[j];
|
80
|
+
U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
|
81
|
+
A[j] = c;
|
82
|
+
j++;
|
83
|
+
} else {
|
84
|
+
*idx = i;
|
85
|
+
}
|
86
|
+
}
|
87
|
+
p = t - 1;
|
88
|
+
if(0 <= p) {
|
89
|
+
c = T[j];
|
90
|
+
U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
|
91
|
+
A[j] = c;
|
92
|
+
} else {
|
93
|
+
*idx = i;
|
94
|
+
}
|
95
|
+
} else {
|
96
|
+
U[0] = T[n - 1];
|
97
|
+
for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; }
|
98
|
+
*idx = i + 1;
|
99
|
+
for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; }
|
100
|
+
}
|
101
|
+
|
102
|
+
if(SA == NULL) {
|
103
|
+
/* Deallocate memory. */
|
104
|
+
free(A);
|
105
|
+
}
|
106
|
+
|
107
|
+
return 0;
|
108
|
+
}
|
109
|
+
|
110
|
+
/* Inverse Burrows-Wheeler transform. */
|
111
|
+
saint_t
|
112
|
+
inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A,
|
113
|
+
saidx_t n, saidx_t idx) {
|
114
|
+
saidx_t C[ALPHABET_SIZE];
|
115
|
+
sauchar_t D[ALPHABET_SIZE];
|
116
|
+
saidx_t *B;
|
117
|
+
saidx_t i, p;
|
118
|
+
saint_t c, d;
|
119
|
+
|
120
|
+
/* Check arguments. */
|
121
|
+
if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) ||
|
122
|
+
(n < idx) || ((0 < n) && (idx == 0))) {
|
123
|
+
return -1;
|
124
|
+
}
|
125
|
+
if(n <= 1) { return 0; }
|
126
|
+
|
127
|
+
if((B = A) == NULL) {
|
128
|
+
/* Allocate n*sizeof(saidx_t) bytes of memory. */
|
129
|
+
if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; }
|
130
|
+
}
|
131
|
+
|
132
|
+
/* Inverse BW transform. */
|
133
|
+
for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; }
|
134
|
+
for(i = 0; i < n; ++i) { ++C[T[i]]; }
|
135
|
+
for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) {
|
136
|
+
p = C[c];
|
137
|
+
if(0 < p) {
|
138
|
+
C[c] = i;
|
139
|
+
D[d++] = (sauchar_t)c;
|
140
|
+
i += p;
|
141
|
+
}
|
142
|
+
}
|
143
|
+
for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; }
|
144
|
+
for( ; i < n; ++i) { B[C[T[i]]++] = i + 1; }
|
145
|
+
for(c = 0; c < d; ++c) { C[c] = C[D[c]]; }
|
146
|
+
for(i = 0, p = idx; i < n; ++i) {
|
147
|
+
U[i] = D[binarysearch_lower(C, d, p)];
|
148
|
+
p = B[p - 1];
|
149
|
+
}
|
150
|
+
|
151
|
+
if(A == NULL) {
|
152
|
+
/* Deallocate memory. */
|
153
|
+
free(B);
|
154
|
+
}
|
155
|
+
|
156
|
+
return 0;
|
157
|
+
}
|
158
|
+
|
159
|
+
/* Checks the suffix array SA of the string T. */
|
160
|
+
saint_t
|
161
|
+
sufcheck(const sauchar_t *T, const saidx_t *SA,
|
162
|
+
saidx_t n, saint_t verbose) {
|
163
|
+
saidx_t C[ALPHABET_SIZE];
|
164
|
+
saidx_t i, p, q, t;
|
165
|
+
saint_t c;
|
166
|
+
|
167
|
+
if(verbose) { fprintf(stderr, "sufcheck: "); }
|
168
|
+
|
169
|
+
/* Check arguments. */
|
170
|
+
if((T == NULL) || (SA == NULL) || (n < 0)) {
|
171
|
+
if(verbose) { fprintf(stderr, "Invalid arguments.\n"); }
|
172
|
+
return -1;
|
173
|
+
}
|
174
|
+
if(n == 0) {
|
175
|
+
if(verbose) { fprintf(stderr, "Done.\n"); }
|
176
|
+
return 0;
|
177
|
+
}
|
178
|
+
|
179
|
+
/* check range: [0..n-1] */
|
180
|
+
for(i = 0; i < n; ++i) {
|
181
|
+
if((SA[i] < 0) || (n <= SA[i])) {
|
182
|
+
if(verbose) {
|
183
|
+
fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n"
|
184
|
+
" SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
|
185
|
+
n - 1, i, SA[i]);
|
186
|
+
}
|
187
|
+
return -2;
|
188
|
+
}
|
189
|
+
}
|
190
|
+
|
191
|
+
/* check first characters. */
|
192
|
+
for(i = 1; i < n; ++i) {
|
193
|
+
if(T[SA[i - 1]] > T[SA[i]]) {
|
194
|
+
if(verbose) {
|
195
|
+
fprintf(stderr, "Suffixes in wrong order.\n"
|
196
|
+
" T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d"
|
197
|
+
" > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n",
|
198
|
+
i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]);
|
199
|
+
}
|
200
|
+
return -3;
|
201
|
+
}
|
202
|
+
}
|
203
|
+
|
204
|
+
/* check suffixes. */
|
205
|
+
for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; }
|
206
|
+
for(i = 0; i < n; ++i) { ++C[T[i]]; }
|
207
|
+
for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) {
|
208
|
+
t = C[i];
|
209
|
+
C[i] = p;
|
210
|
+
p += t;
|
211
|
+
}
|
212
|
+
|
213
|
+
q = C[T[n - 1]];
|
214
|
+
C[T[n - 1]] += 1;
|
215
|
+
for(i = 0; i < n; ++i) {
|
216
|
+
p = SA[i];
|
217
|
+
if(0 < p) {
|
218
|
+
c = T[--p];
|
219
|
+
t = C[c];
|
220
|
+
} else {
|
221
|
+
c = T[p = n - 1];
|
222
|
+
t = q;
|
223
|
+
}
|
224
|
+
if((t < 0) || (p != SA[t])) {
|
225
|
+
if(verbose) {
|
226
|
+
fprintf(stderr, "Suffix in wrong position.\n"
|
227
|
+
" SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n"
|
228
|
+
" SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
|
229
|
+
t, (0 <= t) ? SA[t] : -1, i, SA[i]);
|
230
|
+
}
|
231
|
+
return -4;
|
232
|
+
}
|
233
|
+
if(t != q) {
|
234
|
+
++C[c];
|
235
|
+
if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; }
|
236
|
+
}
|
237
|
+
}
|
238
|
+
|
239
|
+
if(1 <= verbose) { fprintf(stderr, "Done.\n"); }
|
240
|
+
return 0;
|
241
|
+
}
|
242
|
+
|
243
|
+
|
244
|
+
static
|
245
|
+
int
|
246
|
+
_compare(const sauchar_t *T, saidx_t Tsize,
|
247
|
+
const sauchar_t *P, saidx_t Psize,
|
248
|
+
saidx_t suf, saidx_t *match) {
|
249
|
+
saidx_t i, j;
|
250
|
+
saint_t r;
|
251
|
+
for(i = suf + *match, j = *match, r = 0;
|
252
|
+
(i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { }
|
253
|
+
*match = j;
|
254
|
+
return (r == 0) ? -(j != Psize) : r;
|
255
|
+
}
|
256
|
+
|
257
|
+
/* Search for the pattern P in the string T. */
|
258
|
+
saidx_t
|
259
|
+
sa_search(const sauchar_t *T, saidx_t Tsize,
|
260
|
+
const sauchar_t *P, saidx_t Psize,
|
261
|
+
const saidx_t *SA, saidx_t SAsize,
|
262
|
+
saidx_t *idx) {
|
263
|
+
saidx_t size, lsize, rsize, half;
|
264
|
+
saidx_t match, lmatch, rmatch;
|
265
|
+
saidx_t llmatch, lrmatch, rlmatch, rrmatch;
|
266
|
+
saidx_t i, j, k;
|
267
|
+
saint_t r;
|
268
|
+
|
269
|
+
if(idx != NULL) { *idx = -1; }
|
270
|
+
if((T == NULL) || (P == NULL) || (SA == NULL) ||
|
271
|
+
(Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; }
|
272
|
+
if((Tsize == 0) || (SAsize == 0)) { return 0; }
|
273
|
+
if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; }
|
274
|
+
|
275
|
+
for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1;
|
276
|
+
0 < size;
|
277
|
+
size = half, half >>= 1) {
|
278
|
+
match = MIN(lmatch, rmatch);
|
279
|
+
r = _compare(T, Tsize, P, Psize, SA[i + half], &match);
|
280
|
+
if(r < 0) {
|
281
|
+
i += half + 1;
|
282
|
+
half -= (size & 1) ^ 1;
|
283
|
+
lmatch = match;
|
284
|
+
} else if(r > 0) {
|
285
|
+
rmatch = match;
|
286
|
+
} else {
|
287
|
+
lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
|
288
|
+
|
289
|
+
/* left part */
|
290
|
+
for(llmatch = lmatch, lrmatch = match, half = lsize >> 1;
|
291
|
+
0 < lsize;
|
292
|
+
lsize = half, half >>= 1) {
|
293
|
+
lmatch = MIN(llmatch, lrmatch);
|
294
|
+
r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch);
|
295
|
+
if(r < 0) {
|
296
|
+
j += half + 1;
|
297
|
+
half -= (lsize & 1) ^ 1;
|
298
|
+
llmatch = lmatch;
|
299
|
+
} else {
|
300
|
+
lrmatch = lmatch;
|
301
|
+
}
|
302
|
+
}
|
303
|
+
|
304
|
+
/* right part */
|
305
|
+
for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1;
|
306
|
+
0 < rsize;
|
307
|
+
rsize = half, half >>= 1) {
|
308
|
+
rmatch = MIN(rlmatch, rrmatch);
|
309
|
+
r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch);
|
310
|
+
if(r <= 0) {
|
311
|
+
k += half + 1;
|
312
|
+
half -= (rsize & 1) ^ 1;
|
313
|
+
rlmatch = rmatch;
|
314
|
+
} else {
|
315
|
+
rrmatch = rmatch;
|
316
|
+
}
|
317
|
+
}
|
318
|
+
|
319
|
+
break;
|
320
|
+
}
|
321
|
+
}
|
322
|
+
|
323
|
+
if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
|
324
|
+
return k - j;
|
325
|
+
}
|
326
|
+
|
327
|
+
/* Search for the character c in the string T. */
|
328
|
+
saidx_t
|
329
|
+
sa_simplesearch(const sauchar_t *T, saidx_t Tsize,
|
330
|
+
const saidx_t *SA, saidx_t SAsize,
|
331
|
+
saint_t c, saidx_t *idx) {
|
332
|
+
saidx_t size, lsize, rsize, half;
|
333
|
+
saidx_t i, j, k, p;
|
334
|
+
saint_t r;
|
335
|
+
|
336
|
+
if(idx != NULL) { *idx = -1; }
|
337
|
+
if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; }
|
338
|
+
if((Tsize == 0) || (SAsize == 0)) { return 0; }
|
339
|
+
|
340
|
+
for(i = j = k = 0, size = SAsize, half = size >> 1;
|
341
|
+
0 < size;
|
342
|
+
size = half, half >>= 1) {
|
343
|
+
p = SA[i + half];
|
344
|
+
r = (p < Tsize) ? T[p] - c : -1;
|
345
|
+
if(r < 0) {
|
346
|
+
i += half + 1;
|
347
|
+
half -= (size & 1) ^ 1;
|
348
|
+
} else if(r == 0) {
|
349
|
+
lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
|
350
|
+
|
351
|
+
/* left part */
|
352
|
+
for(half = lsize >> 1;
|
353
|
+
0 < lsize;
|
354
|
+
lsize = half, half >>= 1) {
|
355
|
+
p = SA[j + half];
|
356
|
+
r = (p < Tsize) ? T[p] - c : -1;
|
357
|
+
if(r < 0) {
|
358
|
+
j += half + 1;
|
359
|
+
half -= (lsize & 1) ^ 1;
|
360
|
+
}
|
361
|
+
}
|
362
|
+
|
363
|
+
/* right part */
|
364
|
+
for(half = rsize >> 1;
|
365
|
+
0 < rsize;
|
366
|
+
rsize = half, half >>= 1) {
|
367
|
+
p = SA[k + half];
|
368
|
+
r = (p < Tsize) ? T[p] - c : -1;
|
369
|
+
if(r <= 0) {
|
370
|
+
k += half + 1;
|
371
|
+
half -= (rsize & 1) ^ 1;
|
372
|
+
}
|
373
|
+
}
|
374
|
+
|
375
|
+
break;
|
376
|
+
}
|
377
|
+
}
|
378
|
+
|
379
|
+
if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
|
380
|
+
return k - j;
|
381
|
+
}
|
data/ext/utils.o
ADDED
Binary file
|
@@ -0,0 +1,27 @@
|
|
1
|
+
The libdivsufsort copyright is as follows:
|
2
|
+
|
3
|
+
Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person
|
6
|
+
obtaining a copy of this software and associated documentation
|
7
|
+
files (the "Software"), to deal in the Software without
|
8
|
+
restriction, including without limitation the rights to use,
|
9
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the
|
11
|
+
Software is furnished to do so, subject to the following
|
12
|
+
conditions:
|
13
|
+
|
14
|
+
The above copyright notice and this permission notice shall be
|
15
|
+
included in all copies or substantial portions of the Software.
|
16
|
+
|
17
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
19
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
20
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
21
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
22
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
23
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
24
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
25
|
+
|
26
|
+
See also the libdivsufsort web site:
|
27
|
+
http://libdivsufsort.googlecode.com/ for more information.
|
@@ -0,0 +1,398 @@
|
|
1
|
+
/*
|
2
|
+
* divsufsort.c for libdivsufsort
|
3
|
+
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
4
|
+
*
|
5
|
+
* Permission is hereby granted, free of charge, to any person
|
6
|
+
* obtaining a copy of this software and associated documentation
|
7
|
+
* files (the "Software"), to deal in the Software without
|
8
|
+
* restriction, including without limitation the rights to use,
|
9
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
* copies of the Software, and to permit persons to whom the
|
11
|
+
* Software is furnished to do so, subject to the following
|
12
|
+
* conditions:
|
13
|
+
*
|
14
|
+
* The above copyright notice and this permission notice shall be
|
15
|
+
* included in all copies or substantial portions of the Software.
|
16
|
+
*
|
17
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
19
|
+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
20
|
+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
21
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
22
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
23
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
24
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
25
|
+
*/
|
26
|
+
|
27
|
+
#include "divsufsort_private.h"
|
28
|
+
#ifdef _OPENMP
|
29
|
+
# include <omp.h>
|
30
|
+
#endif
|
31
|
+
|
32
|
+
|
33
|
+
/*- Private Functions -*/
|
34
|
+
|
35
|
+
/* Sorts suffixes of type B*. */
|
36
|
+
static
|
37
|
+
saidx_t
|
38
|
+
sort_typeBstar(const sauchar_t *T, saidx_t *SA,
|
39
|
+
saidx_t *bucket_A, saidx_t *bucket_B,
|
40
|
+
saidx_t n) {
|
41
|
+
saidx_t *PAb, *ISAb, *buf;
|
42
|
+
#ifdef _OPENMP
|
43
|
+
saidx_t *curbuf;
|
44
|
+
saidx_t l;
|
45
|
+
#endif
|
46
|
+
saidx_t i, j, k, t, m, bufsize;
|
47
|
+
saint_t c0, c1;
|
48
|
+
#ifdef _OPENMP
|
49
|
+
saint_t d0, d1;
|
50
|
+
int tmp;
|
51
|
+
#endif
|
52
|
+
|
53
|
+
/* Initialize bucket arrays. */
|
54
|
+
for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
|
55
|
+
for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
|
56
|
+
|
57
|
+
/* Count the number of occurrences of the first one or two characters of each
|
58
|
+
type A, B and B* suffix. Moreover, store the beginning position of all
|
59
|
+
type B* suffixes into the array SA. */
|
60
|
+
for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
|
61
|
+
/* type A suffix. */
|
62
|
+
do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
|
63
|
+
if(0 <= i) {
|
64
|
+
/* type B* suffix. */
|
65
|
+
++BUCKET_BSTAR(c0, c1);
|
66
|
+
SA[--m] = i;
|
67
|
+
/* type B suffix. */
|
68
|
+
for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
|
69
|
+
++BUCKET_B(c0, c1);
|
70
|
+
}
|
71
|
+
}
|
72
|
+
}
|
73
|
+
m = n - m;
|
74
|
+
/*
|
75
|
+
note:
|
76
|
+
A type B* suffix is lexicographically smaller than a type B suffix that
|
77
|
+
begins with the same first two characters.
|
78
|
+
*/
|
79
|
+
|
80
|
+
/* Calculate the index of start/end point of each bucket. */
|
81
|
+
for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
|
82
|
+
t = i + BUCKET_A(c0);
|
83
|
+
BUCKET_A(c0) = i + j; /* start point */
|
84
|
+
i = t + BUCKET_B(c0, c0);
|
85
|
+
for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
|
86
|
+
j += BUCKET_BSTAR(c0, c1);
|
87
|
+
BUCKET_BSTAR(c0, c1) = j; /* end point */
|
88
|
+
i += BUCKET_B(c0, c1);
|
89
|
+
}
|
90
|
+
}
|
91
|
+
|
92
|
+
if(0 < m) {
|
93
|
+
/* Sort the type B* suffixes by their first two characters. */
|
94
|
+
PAb = SA + n - m; ISAb = SA + m;
|
95
|
+
for(i = m - 2; 0 <= i; --i) {
|
96
|
+
t = PAb[i], c0 = T[t], c1 = T[t + 1];
|
97
|
+
SA[--BUCKET_BSTAR(c0, c1)] = i;
|
98
|
+
}
|
99
|
+
t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
|
100
|
+
SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
|
101
|
+
|
102
|
+
/* Sort the type B* substrings using sssort. */
|
103
|
+
#ifdef _OPENMP
|
104
|
+
tmp = omp_get_max_threads();
|
105
|
+
buf = SA + m, bufsize = (n - (2 * m)) / tmp;
|
106
|
+
c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
|
107
|
+
#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp)
|
108
|
+
{
|
109
|
+
tmp = omp_get_thread_num();
|
110
|
+
curbuf = buf + tmp * bufsize;
|
111
|
+
k = 0;
|
112
|
+
for(;;) {
|
113
|
+
#pragma omp critical(sssort_lock)
|
114
|
+
{
|
115
|
+
if(0 < (l = j)) {
|
116
|
+
d0 = c0, d1 = c1;
|
117
|
+
do {
|
118
|
+
k = BUCKET_BSTAR(d0, d1);
|
119
|
+
if(--d1 <= d0) {
|
120
|
+
d1 = ALPHABET_SIZE - 1;
|
121
|
+
if(--d0 < 0) { break; }
|
122
|
+
}
|
123
|
+
} while(((l - k) <= 1) && (0 < (l = k)));
|
124
|
+
c0 = d0, c1 = d1, j = k;
|
125
|
+
}
|
126
|
+
}
|
127
|
+
if(l == 0) { break; }
|
128
|
+
sssort(T, PAb, SA + k, SA + l,
|
129
|
+
curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
|
130
|
+
}
|
131
|
+
}
|
132
|
+
#else
|
133
|
+
buf = SA + m, bufsize = n - (2 * m);
|
134
|
+
for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
|
135
|
+
for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
|
136
|
+
i = BUCKET_BSTAR(c0, c1);
|
137
|
+
if(1 < (j - i)) {
|
138
|
+
sssort(T, PAb, SA + i, SA + j,
|
139
|
+
buf, bufsize, 2, n, *(SA + i) == (m - 1));
|
140
|
+
}
|
141
|
+
}
|
142
|
+
}
|
143
|
+
#endif
|
144
|
+
|
145
|
+
/* Compute ranks of type B* substrings. */
|
146
|
+
for(i = m - 1; 0 <= i; --i) {
|
147
|
+
if(0 <= SA[i]) {
|
148
|
+
j = i;
|
149
|
+
do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
|
150
|
+
SA[i + 1] = i - j;
|
151
|
+
if(i <= 0) { break; }
|
152
|
+
}
|
153
|
+
j = i;
|
154
|
+
do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
|
155
|
+
ISAb[SA[i]] = j;
|
156
|
+
}
|
157
|
+
|
158
|
+
/* Construct the inverse suffix array of type B* suffixes using trsort. */
|
159
|
+
trsort(ISAb, SA, m, 1);
|
160
|
+
|
161
|
+
/* Set the sorted order of tyoe B* suffixes. */
|
162
|
+
for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
|
163
|
+
for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
|
164
|
+
if(0 <= i) {
|
165
|
+
t = i;
|
166
|
+
for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
|
167
|
+
SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
/* Calculate the index of start/end point of each bucket. */
|
172
|
+
BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
|
173
|
+
for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
|
174
|
+
i = BUCKET_A(c0 + 1) - 1;
|
175
|
+
for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
|
176
|
+
t = i - BUCKET_B(c0, c1);
|
177
|
+
BUCKET_B(c0, c1) = i; /* end point */
|
178
|
+
|
179
|
+
/* Move all type B* suffixes to the correct position. */
|
180
|
+
for(i = t, j = BUCKET_BSTAR(c0, c1);
|
181
|
+
j <= k;
|
182
|
+
--i, --k) { SA[i] = SA[k]; }
|
183
|
+
}
|
184
|
+
BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
|
185
|
+
BUCKET_B(c0, c0) = i; /* end point */
|
186
|
+
}
|
187
|
+
}
|
188
|
+
|
189
|
+
return m;
|
190
|
+
}
|
191
|
+
|
192
|
+
/* Constructs the suffix array by using the sorted order of type B* suffixes. */
|
193
|
+
static
|
194
|
+
void
|
195
|
+
construct_SA(const sauchar_t *T, saidx_t *SA,
|
196
|
+
saidx_t *bucket_A, saidx_t *bucket_B,
|
197
|
+
saidx_t n, saidx_t m) {
|
198
|
+
saidx_t *i, *j, *k;
|
199
|
+
saidx_t s;
|
200
|
+
saint_t c0, c1, c2;
|
201
|
+
|
202
|
+
if(0 < m) {
|
203
|
+
/* Construct the sorted order of type B suffixes by using
|
204
|
+
the sorted order of type B* suffixes. */
|
205
|
+
for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
|
206
|
+
/* Scan the suffix array from right to left. */
|
207
|
+
for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
|
208
|
+
j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
|
209
|
+
i <= j;
|
210
|
+
--j) {
|
211
|
+
if(0 < (s = *j)) {
|
212
|
+
assert(T[s] == c1);
|
213
|
+
assert(((s + 1) < n) && (T[s] <= T[s + 1]));
|
214
|
+
assert(T[s - 1] <= T[s]);
|
215
|
+
*j = ~s;
|
216
|
+
c0 = T[--s];
|
217
|
+
if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
|
218
|
+
if(c0 != c2) {
|
219
|
+
if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
|
220
|
+
k = SA + BUCKET_B(c2 = c0, c1);
|
221
|
+
}
|
222
|
+
assert(k < j);
|
223
|
+
*k-- = s;
|
224
|
+
} else {
|
225
|
+
assert(((s == 0) && (T[s] == c1)) || (s < 0));
|
226
|
+
*j = ~s;
|
227
|
+
}
|
228
|
+
}
|
229
|
+
}
|
230
|
+
}
|
231
|
+
|
232
|
+
/* Construct the suffix array by using
|
233
|
+
the sorted order of type B suffixes. */
|
234
|
+
k = SA + BUCKET_A(c2 = T[n - 1]);
|
235
|
+
*k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
|
236
|
+
/* Scan the suffix array from left to right. */
|
237
|
+
for(i = SA, j = SA + n; i < j; ++i) {
|
238
|
+
if(0 < (s = *i)) {
|
239
|
+
assert(T[s - 1] >= T[s]);
|
240
|
+
c0 = T[--s];
|
241
|
+
if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
|
242
|
+
if(c0 != c2) {
|
243
|
+
BUCKET_A(c2) = k - SA;
|
244
|
+
k = SA + BUCKET_A(c2 = c0);
|
245
|
+
}
|
246
|
+
assert(i < k);
|
247
|
+
*k++ = s;
|
248
|
+
} else {
|
249
|
+
assert(s < 0);
|
250
|
+
*i = ~s;
|
251
|
+
}
|
252
|
+
}
|
253
|
+
}
|
254
|
+
|
255
|
+
/* Constructs the burrows-wheeler transformed string directly
|
256
|
+
by using the sorted order of type B* suffixes. */
|
257
|
+
static
|
258
|
+
saidx_t
|
259
|
+
construct_BWT(const sauchar_t *T, saidx_t *SA,
|
260
|
+
saidx_t *bucket_A, saidx_t *bucket_B,
|
261
|
+
saidx_t n, saidx_t m) {
|
262
|
+
saidx_t *i, *j, *k, *orig;
|
263
|
+
saidx_t s;
|
264
|
+
saint_t c0, c1, c2;
|
265
|
+
|
266
|
+
if(0 < m) {
|
267
|
+
/* Construct the sorted order of type B suffixes by using
|
268
|
+
the sorted order of type B* suffixes. */
|
269
|
+
for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
|
270
|
+
/* Scan the suffix array from right to left. */
|
271
|
+
for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
|
272
|
+
j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
|
273
|
+
i <= j;
|
274
|
+
--j) {
|
275
|
+
if(0 < (s = *j)) {
|
276
|
+
assert(T[s] == c1);
|
277
|
+
assert(((s + 1) < n) && (T[s] <= T[s + 1]));
|
278
|
+
assert(T[s - 1] <= T[s]);
|
279
|
+
c0 = T[--s];
|
280
|
+
*j = ~((saidx_t)c0);
|
281
|
+
if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
|
282
|
+
if(c0 != c2) {
|
283
|
+
if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
|
284
|
+
k = SA + BUCKET_B(c2 = c0, c1);
|
285
|
+
}
|
286
|
+
assert(k < j);
|
287
|
+
*k-- = s;
|
288
|
+
} else if(s != 0) {
|
289
|
+
*j = ~s;
|
290
|
+
#ifndef NDEBUG
|
291
|
+
} else {
|
292
|
+
assert(T[s] == c1);
|
293
|
+
#endif
|
294
|
+
}
|
295
|
+
}
|
296
|
+
}
|
297
|
+
}
|
298
|
+
|
299
|
+
/* Construct the BWTed string by using
|
300
|
+
the sorted order of type B suffixes. */
|
301
|
+
k = SA + BUCKET_A(c2 = T[n - 1]);
|
302
|
+
*k++ = (T[n - 2] < c2) ? ~((saidx_t)T[n - 2]) : (n - 1);
|
303
|
+
/* Scan the suffix array from left to right. */
|
304
|
+
for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
|
305
|
+
if(0 < (s = *i)) {
|
306
|
+
assert(T[s - 1] >= T[s]);
|
307
|
+
c0 = T[--s];
|
308
|
+
*i = c0;
|
309
|
+
if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); }
|
310
|
+
if(c0 != c2) {
|
311
|
+
BUCKET_A(c2) = k - SA;
|
312
|
+
k = SA + BUCKET_A(c2 = c0);
|
313
|
+
}
|
314
|
+
assert(i < k);
|
315
|
+
*k++ = s;
|
316
|
+
} else if(s != 0) {
|
317
|
+
*i = ~s;
|
318
|
+
} else {
|
319
|
+
orig = i;
|
320
|
+
}
|
321
|
+
}
|
322
|
+
|
323
|
+
return orig - SA;
|
324
|
+
}
|
325
|
+
|
326
|
+
|
327
|
+
/*---------------------------------------------------------------------------*/
|
328
|
+
|
329
|
+
/*- Function -*/
|
330
|
+
|
331
|
+
saint_t
|
332
|
+
divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n) {
|
333
|
+
saidx_t *bucket_A, *bucket_B;
|
334
|
+
saidx_t m;
|
335
|
+
saint_t err = 0;
|
336
|
+
|
337
|
+
/* Check arguments. */
|
338
|
+
if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }
|
339
|
+
else if(n == 0) { return 0; }
|
340
|
+
else if(n == 1) { SA[0] = 0; return 0; }
|
341
|
+
else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; }
|
342
|
+
|
343
|
+
bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
|
344
|
+
bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
|
345
|
+
|
346
|
+
/* Suffixsort. */
|
347
|
+
if((bucket_A != NULL) && (bucket_B != NULL)) {
|
348
|
+
m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);
|
349
|
+
construct_SA(T, SA, bucket_A, bucket_B, n, m);
|
350
|
+
} else {
|
351
|
+
err = -2;
|
352
|
+
}
|
353
|
+
|
354
|
+
free(bucket_B);
|
355
|
+
free(bucket_A);
|
356
|
+
|
357
|
+
return err;
|
358
|
+
}
|
359
|
+
|
360
|
+
saidx_t
|
361
|
+
divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) {
|
362
|
+
saidx_t *B;
|
363
|
+
saidx_t *bucket_A, *bucket_B;
|
364
|
+
saidx_t m, pidx, i;
|
365
|
+
|
366
|
+
/* Check arguments. */
|
367
|
+
if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
|
368
|
+
else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
|
369
|
+
|
370
|
+
if((B = A) == NULL) { B = (saidx_t *)malloc((size_t)(n + 1) * sizeof(saidx_t)); }
|
371
|
+
bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
|
372
|
+
bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
|
373
|
+
|
374
|
+
/* Burrows-Wheeler Transform. */
|
375
|
+
if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
|
376
|
+
m = sort_typeBstar(T, B, bucket_A, bucket_B, n);
|
377
|
+
pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
|
378
|
+
|
379
|
+
/* Copy to output string. */
|
380
|
+
U[0] = T[n - 1];
|
381
|
+
for(i = 0; i < pidx; ++i) { U[i + 1] = (sauchar_t)B[i]; }
|
382
|
+
for(i += 1; i < n; ++i) { U[i] = (sauchar_t)B[i]; }
|
383
|
+
pidx += 1;
|
384
|
+
} else {
|
385
|
+
pidx = -2;
|
386
|
+
}
|
387
|
+
|
388
|
+
free(bucket_B);
|
389
|
+
free(bucket_A);
|
390
|
+
if(A == NULL) { free(B); }
|
391
|
+
|
392
|
+
return pidx;
|
393
|
+
}
|
394
|
+
|
395
|
+
const char *
|
396
|
+
divsufsort_version(void) {
|
397
|
+
return PROJECT_VERSION_FULL;
|
398
|
+
}
|