divsufsort 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +77 -0
- data/ext/Makefile +149 -0
- data/ext/divsufsort.c +398 -0
- data/ext/divsufsort.h +191 -0
- data/ext/divsufsort.o +0 -0
- data/ext/divsufsort.so +0 -0
- data/ext/divsufsort_private.h +207 -0
- data/ext/divsufsort_ruby.c +227 -0
- data/ext/divsufsort_ruby.o +0 -0
- data/ext/extconf.rb +18 -0
- data/ext/lfs.h +56 -0
- data/ext/mkmf.log +266 -0
- data/ext/sssort.c +815 -0
- data/ext/sssort.o +0 -0
- data/ext/trsort.c +586 -0
- data/ext/trsort.o +0 -0
- data/ext/utils.c +381 -0
- data/ext/utils.o +0 -0
- data/libdivsufsort/COPYING +27 -0
- data/libdivsufsort/divsufsort.c +398 -0
- data/libdivsufsort/divsufsort.h +191 -0
- data/libdivsufsort/divsufsort_private.h +207 -0
- data/libdivsufsort/lfs.h +56 -0
- data/libdivsufsort/sssort.c +815 -0
- data/libdivsufsort/trsort.c +586 -0
- data/libdivsufsort/utils.c +381 -0
- metadata +80 -0
@@ -0,0 +1,381 @@
|
|
1
|
+
/*
|
2
|
+
* utils.c for libdivsufsort
|
3
|
+
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
4
|
+
*
|
5
|
+
* Permission is hereby granted, free of charge, to any person
|
6
|
+
* obtaining a copy of this software and associated documentation
|
7
|
+
* files (the "Software"), to deal in the Software without
|
8
|
+
* restriction, including without limitation the rights to use,
|
9
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
* copies of the Software, and to permit persons to whom the
|
11
|
+
* Software is furnished to do so, subject to the following
|
12
|
+
* conditions:
|
13
|
+
*
|
14
|
+
* The above copyright notice and this permission notice shall be
|
15
|
+
* included in all copies or substantial portions of the Software.
|
16
|
+
*
|
17
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
19
|
+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
20
|
+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
21
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
22
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
23
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
24
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
25
|
+
*/
|
26
|
+
|
27
|
+
#include "divsufsort_private.h"
|
28
|
+
|
29
|
+
|
30
|
+
/*- Private Function -*/
|
31
|
+
|
32
|
+
/* Binary search for inverse bwt. */
|
33
|
+
static
|
34
|
+
saidx_t
|
35
|
+
binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) {
|
36
|
+
saidx_t half, i;
|
37
|
+
for(i = 0, half = size >> 1;
|
38
|
+
0 < size;
|
39
|
+
size = half, half >>= 1) {
|
40
|
+
if(A[i + half] < value) {
|
41
|
+
i += half + 1;
|
42
|
+
half -= (size & 1) ^ 1;
|
43
|
+
}
|
44
|
+
}
|
45
|
+
return i;
|
46
|
+
}
|
47
|
+
|
48
|
+
|
49
|
+
/*- Functions -*/
|
50
|
+
|
51
|
+
/* Burrows-Wheeler transform. */
|
52
|
+
saint_t
|
53
|
+
bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA,
|
54
|
+
saidx_t n, saidx_t *idx) {
|
55
|
+
saidx_t *A, i, j, p, t;
|
56
|
+
saint_t c;
|
57
|
+
|
58
|
+
/* Check arguments. */
|
59
|
+
if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; }
|
60
|
+
if(n <= 1) {
|
61
|
+
if(n == 1) { U[0] = T[0]; }
|
62
|
+
*idx = n;
|
63
|
+
return 0;
|
64
|
+
}
|
65
|
+
|
66
|
+
if((A = SA) == NULL) {
|
67
|
+
i = divbwt(T, U, NULL, n);
|
68
|
+
if(0 <= i) { *idx = i; i = 0; }
|
69
|
+
return (saint_t)i;
|
70
|
+
}
|
71
|
+
|
72
|
+
/* BW transform. */
|
73
|
+
if(T == U) {
|
74
|
+
t = n;
|
75
|
+
for(i = 0, j = 0; i < n; ++i) {
|
76
|
+
p = t - 1;
|
77
|
+
t = A[i];
|
78
|
+
if(0 <= p) {
|
79
|
+
c = T[j];
|
80
|
+
U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
|
81
|
+
A[j] = c;
|
82
|
+
j++;
|
83
|
+
} else {
|
84
|
+
*idx = i;
|
85
|
+
}
|
86
|
+
}
|
87
|
+
p = t - 1;
|
88
|
+
if(0 <= p) {
|
89
|
+
c = T[j];
|
90
|
+
U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
|
91
|
+
A[j] = c;
|
92
|
+
} else {
|
93
|
+
*idx = i;
|
94
|
+
}
|
95
|
+
} else {
|
96
|
+
U[0] = T[n - 1];
|
97
|
+
for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; }
|
98
|
+
*idx = i + 1;
|
99
|
+
for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; }
|
100
|
+
}
|
101
|
+
|
102
|
+
if(SA == NULL) {
|
103
|
+
/* Deallocate memory. */
|
104
|
+
free(A);
|
105
|
+
}
|
106
|
+
|
107
|
+
return 0;
|
108
|
+
}
|
109
|
+
|
110
|
+
/* Inverse Burrows-Wheeler transform. */
|
111
|
+
saint_t
|
112
|
+
inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A,
|
113
|
+
saidx_t n, saidx_t idx) {
|
114
|
+
saidx_t C[ALPHABET_SIZE];
|
115
|
+
sauchar_t D[ALPHABET_SIZE];
|
116
|
+
saidx_t *B;
|
117
|
+
saidx_t i, p;
|
118
|
+
saint_t c, d;
|
119
|
+
|
120
|
+
/* Check arguments. */
|
121
|
+
if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) ||
|
122
|
+
(n < idx) || ((0 < n) && (idx == 0))) {
|
123
|
+
return -1;
|
124
|
+
}
|
125
|
+
if(n <= 1) { return 0; }
|
126
|
+
|
127
|
+
if((B = A) == NULL) {
|
128
|
+
/* Allocate n*sizeof(saidx_t) bytes of memory. */
|
129
|
+
if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; }
|
130
|
+
}
|
131
|
+
|
132
|
+
/* Inverse BW transform. */
|
133
|
+
for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; }
|
134
|
+
for(i = 0; i < n; ++i) { ++C[T[i]]; }
|
135
|
+
for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) {
|
136
|
+
p = C[c];
|
137
|
+
if(0 < p) {
|
138
|
+
C[c] = i;
|
139
|
+
D[d++] = (sauchar_t)c;
|
140
|
+
i += p;
|
141
|
+
}
|
142
|
+
}
|
143
|
+
for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; }
|
144
|
+
for( ; i < n; ++i) { B[C[T[i]]++] = i + 1; }
|
145
|
+
for(c = 0; c < d; ++c) { C[c] = C[D[c]]; }
|
146
|
+
for(i = 0, p = idx; i < n; ++i) {
|
147
|
+
U[i] = D[binarysearch_lower(C, d, p)];
|
148
|
+
p = B[p - 1];
|
149
|
+
}
|
150
|
+
|
151
|
+
if(A == NULL) {
|
152
|
+
/* Deallocate memory. */
|
153
|
+
free(B);
|
154
|
+
}
|
155
|
+
|
156
|
+
return 0;
|
157
|
+
}
|
158
|
+
|
159
|
+
/* Checks the suffix array SA of the string T. */
|
160
|
+
saint_t
|
161
|
+
sufcheck(const sauchar_t *T, const saidx_t *SA,
|
162
|
+
saidx_t n, saint_t verbose) {
|
163
|
+
saidx_t C[ALPHABET_SIZE];
|
164
|
+
saidx_t i, p, q, t;
|
165
|
+
saint_t c;
|
166
|
+
|
167
|
+
if(verbose) { fprintf(stderr, "sufcheck: "); }
|
168
|
+
|
169
|
+
/* Check arguments. */
|
170
|
+
if((T == NULL) || (SA == NULL) || (n < 0)) {
|
171
|
+
if(verbose) { fprintf(stderr, "Invalid arguments.\n"); }
|
172
|
+
return -1;
|
173
|
+
}
|
174
|
+
if(n == 0) {
|
175
|
+
if(verbose) { fprintf(stderr, "Done.\n"); }
|
176
|
+
return 0;
|
177
|
+
}
|
178
|
+
|
179
|
+
/* check range: [0..n-1] */
|
180
|
+
for(i = 0; i < n; ++i) {
|
181
|
+
if((SA[i] < 0) || (n <= SA[i])) {
|
182
|
+
if(verbose) {
|
183
|
+
fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n"
|
184
|
+
" SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
|
185
|
+
n - 1, i, SA[i]);
|
186
|
+
}
|
187
|
+
return -2;
|
188
|
+
}
|
189
|
+
}
|
190
|
+
|
191
|
+
/* check first characters. */
|
192
|
+
for(i = 1; i < n; ++i) {
|
193
|
+
if(T[SA[i - 1]] > T[SA[i]]) {
|
194
|
+
if(verbose) {
|
195
|
+
fprintf(stderr, "Suffixes in wrong order.\n"
|
196
|
+
" T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d"
|
197
|
+
" > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n",
|
198
|
+
i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]);
|
199
|
+
}
|
200
|
+
return -3;
|
201
|
+
}
|
202
|
+
}
|
203
|
+
|
204
|
+
/* check suffixes. */
|
205
|
+
for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; }
|
206
|
+
for(i = 0; i < n; ++i) { ++C[T[i]]; }
|
207
|
+
for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) {
|
208
|
+
t = C[i];
|
209
|
+
C[i] = p;
|
210
|
+
p += t;
|
211
|
+
}
|
212
|
+
|
213
|
+
q = C[T[n - 1]];
|
214
|
+
C[T[n - 1]] += 1;
|
215
|
+
for(i = 0; i < n; ++i) {
|
216
|
+
p = SA[i];
|
217
|
+
if(0 < p) {
|
218
|
+
c = T[--p];
|
219
|
+
t = C[c];
|
220
|
+
} else {
|
221
|
+
c = T[p = n - 1];
|
222
|
+
t = q;
|
223
|
+
}
|
224
|
+
if((t < 0) || (p != SA[t])) {
|
225
|
+
if(verbose) {
|
226
|
+
fprintf(stderr, "Suffix in wrong position.\n"
|
227
|
+
" SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n"
|
228
|
+
" SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
|
229
|
+
t, (0 <= t) ? SA[t] : -1, i, SA[i]);
|
230
|
+
}
|
231
|
+
return -4;
|
232
|
+
}
|
233
|
+
if(t != q) {
|
234
|
+
++C[c];
|
235
|
+
if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; }
|
236
|
+
}
|
237
|
+
}
|
238
|
+
|
239
|
+
if(1 <= verbose) { fprintf(stderr, "Done.\n"); }
|
240
|
+
return 0;
|
241
|
+
}
|
242
|
+
|
243
|
+
|
244
|
+
static
|
245
|
+
int
|
246
|
+
_compare(const sauchar_t *T, saidx_t Tsize,
|
247
|
+
const sauchar_t *P, saidx_t Psize,
|
248
|
+
saidx_t suf, saidx_t *match) {
|
249
|
+
saidx_t i, j;
|
250
|
+
saint_t r;
|
251
|
+
for(i = suf + *match, j = *match, r = 0;
|
252
|
+
(i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { }
|
253
|
+
*match = j;
|
254
|
+
return (r == 0) ? -(j != Psize) : r;
|
255
|
+
}
|
256
|
+
|
257
|
+
/* Search for the pattern P in the string T. */
|
258
|
+
saidx_t
|
259
|
+
sa_search(const sauchar_t *T, saidx_t Tsize,
|
260
|
+
const sauchar_t *P, saidx_t Psize,
|
261
|
+
const saidx_t *SA, saidx_t SAsize,
|
262
|
+
saidx_t *idx) {
|
263
|
+
saidx_t size, lsize, rsize, half;
|
264
|
+
saidx_t match, lmatch, rmatch;
|
265
|
+
saidx_t llmatch, lrmatch, rlmatch, rrmatch;
|
266
|
+
saidx_t i, j, k;
|
267
|
+
saint_t r;
|
268
|
+
|
269
|
+
if(idx != NULL) { *idx = -1; }
|
270
|
+
if((T == NULL) || (P == NULL) || (SA == NULL) ||
|
271
|
+
(Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; }
|
272
|
+
if((Tsize == 0) || (SAsize == 0)) { return 0; }
|
273
|
+
if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; }
|
274
|
+
|
275
|
+
for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1;
|
276
|
+
0 < size;
|
277
|
+
size = half, half >>= 1) {
|
278
|
+
match = MIN(lmatch, rmatch);
|
279
|
+
r = _compare(T, Tsize, P, Psize, SA[i + half], &match);
|
280
|
+
if(r < 0) {
|
281
|
+
i += half + 1;
|
282
|
+
half -= (size & 1) ^ 1;
|
283
|
+
lmatch = match;
|
284
|
+
} else if(r > 0) {
|
285
|
+
rmatch = match;
|
286
|
+
} else {
|
287
|
+
lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
|
288
|
+
|
289
|
+
/* left part */
|
290
|
+
for(llmatch = lmatch, lrmatch = match, half = lsize >> 1;
|
291
|
+
0 < lsize;
|
292
|
+
lsize = half, half >>= 1) {
|
293
|
+
lmatch = MIN(llmatch, lrmatch);
|
294
|
+
r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch);
|
295
|
+
if(r < 0) {
|
296
|
+
j += half + 1;
|
297
|
+
half -= (lsize & 1) ^ 1;
|
298
|
+
llmatch = lmatch;
|
299
|
+
} else {
|
300
|
+
lrmatch = lmatch;
|
301
|
+
}
|
302
|
+
}
|
303
|
+
|
304
|
+
/* right part */
|
305
|
+
for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1;
|
306
|
+
0 < rsize;
|
307
|
+
rsize = half, half >>= 1) {
|
308
|
+
rmatch = MIN(rlmatch, rrmatch);
|
309
|
+
r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch);
|
310
|
+
if(r <= 0) {
|
311
|
+
k += half + 1;
|
312
|
+
half -= (rsize & 1) ^ 1;
|
313
|
+
rlmatch = rmatch;
|
314
|
+
} else {
|
315
|
+
rrmatch = rmatch;
|
316
|
+
}
|
317
|
+
}
|
318
|
+
|
319
|
+
break;
|
320
|
+
}
|
321
|
+
}
|
322
|
+
|
323
|
+
if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
|
324
|
+
return k - j;
|
325
|
+
}
|
326
|
+
|
327
|
+
/* Search for the character c in the string T. */
|
328
|
+
saidx_t
|
329
|
+
sa_simplesearch(const sauchar_t *T, saidx_t Tsize,
|
330
|
+
const saidx_t *SA, saidx_t SAsize,
|
331
|
+
saint_t c, saidx_t *idx) {
|
332
|
+
saidx_t size, lsize, rsize, half;
|
333
|
+
saidx_t i, j, k, p;
|
334
|
+
saint_t r;
|
335
|
+
|
336
|
+
if(idx != NULL) { *idx = -1; }
|
337
|
+
if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; }
|
338
|
+
if((Tsize == 0) || (SAsize == 0)) { return 0; }
|
339
|
+
|
340
|
+
for(i = j = k = 0, size = SAsize, half = size >> 1;
|
341
|
+
0 < size;
|
342
|
+
size = half, half >>= 1) {
|
343
|
+
p = SA[i + half];
|
344
|
+
r = (p < Tsize) ? T[p] - c : -1;
|
345
|
+
if(r < 0) {
|
346
|
+
i += half + 1;
|
347
|
+
half -= (size & 1) ^ 1;
|
348
|
+
} else if(r == 0) {
|
349
|
+
lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
|
350
|
+
|
351
|
+
/* left part */
|
352
|
+
for(half = lsize >> 1;
|
353
|
+
0 < lsize;
|
354
|
+
lsize = half, half >>= 1) {
|
355
|
+
p = SA[j + half];
|
356
|
+
r = (p < Tsize) ? T[p] - c : -1;
|
357
|
+
if(r < 0) {
|
358
|
+
j += half + 1;
|
359
|
+
half -= (lsize & 1) ^ 1;
|
360
|
+
}
|
361
|
+
}
|
362
|
+
|
363
|
+
/* right part */
|
364
|
+
for(half = rsize >> 1;
|
365
|
+
0 < rsize;
|
366
|
+
rsize = half, half >>= 1) {
|
367
|
+
p = SA[k + half];
|
368
|
+
r = (p < Tsize) ? T[p] - c : -1;
|
369
|
+
if(r <= 0) {
|
370
|
+
k += half + 1;
|
371
|
+
half -= (rsize & 1) ^ 1;
|
372
|
+
}
|
373
|
+
}
|
374
|
+
|
375
|
+
break;
|
376
|
+
}
|
377
|
+
}
|
378
|
+
|
379
|
+
if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
|
380
|
+
return k - j;
|
381
|
+
}
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: divsufsort
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- winebarrel
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-10-22 00:00:00 +09:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description:
|
17
|
+
email: sgwr_dts@yahoo.co.jp
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions:
|
21
|
+
- ext/extconf.rb
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README.txt
|
24
|
+
- ext/divsufsort_ruby.c
|
25
|
+
files:
|
26
|
+
- ext/divsufsort.c
|
27
|
+
- ext/divsufsort.h
|
28
|
+
- ext/divsufsort.o
|
29
|
+
- ext/divsufsort.so
|
30
|
+
- ext/divsufsort_private.h
|
31
|
+
- ext/divsufsort_ruby.c
|
32
|
+
- ext/divsufsort_ruby.o
|
33
|
+
- ext/extconf.rb
|
34
|
+
- ext/lfs.h
|
35
|
+
- ext/Makefile
|
36
|
+
- ext/mkmf.log
|
37
|
+
- ext/sssort.c
|
38
|
+
- ext/sssort.o
|
39
|
+
- ext/trsort.c
|
40
|
+
- ext/trsort.o
|
41
|
+
- ext/utils.c
|
42
|
+
- ext/utils.o
|
43
|
+
- libdivsufsort/COPYING
|
44
|
+
- libdivsufsort/divsufsort.c
|
45
|
+
- libdivsufsort/divsufsort.h
|
46
|
+
- libdivsufsort/divsufsort_private.h
|
47
|
+
- libdivsufsort/lfs.h
|
48
|
+
- libdivsufsort/sssort.c
|
49
|
+
- libdivsufsort/trsort.c
|
50
|
+
- libdivsufsort/utils.c
|
51
|
+
- README.txt
|
52
|
+
has_rdoc: true
|
53
|
+
homepage: http://divsufsort.rubyforge.org
|
54
|
+
post_install_message:
|
55
|
+
rdoc_options:
|
56
|
+
- --title
|
57
|
+
- divsufsort - Ruby bindings for libdivsufsort.
|
58
|
+
require_paths:
|
59
|
+
- lib
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: "0"
|
65
|
+
version:
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: "0"
|
71
|
+
version:
|
72
|
+
requirements: []
|
73
|
+
|
74
|
+
rubyforge_project: divsufsort
|
75
|
+
rubygems_version: 1.2.0
|
76
|
+
signing_key:
|
77
|
+
specification_version: 2
|
78
|
+
summary: Ruby bindings for libdivsufsort.
|
79
|
+
test_files: []
|
80
|
+
|