divsufsort 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,381 @@
1
+ /*
2
+ * utils.c for libdivsufsort
3
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
4
+ *
5
+ * Permission is hereby granted, free of charge, to any person
6
+ * obtaining a copy of this software and associated documentation
7
+ * files (the "Software"), to deal in the Software without
8
+ * restriction, including without limitation the rights to use,
9
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ * copies of the Software, and to permit persons to whom the
11
+ * Software is furnished to do so, subject to the following
12
+ * conditions:
13
+ *
14
+ * The above copyright notice and this permission notice shall be
15
+ * included in all copies or substantial portions of the Software.
16
+ *
17
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24
+ * OTHER DEALINGS IN THE SOFTWARE.
25
+ */
26
+
27
+ #include "divsufsort_private.h"
28
+
29
+
30
+ /*- Private Function -*/
31
+
32
+ /* Binary search for inverse bwt. */
33
+ static
34
+ saidx_t
35
+ binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) {
36
+ saidx_t half, i;
37
+ for(i = 0, half = size >> 1;
38
+ 0 < size;
39
+ size = half, half >>= 1) {
40
+ if(A[i + half] < value) {
41
+ i += half + 1;
42
+ half -= (size & 1) ^ 1;
43
+ }
44
+ }
45
+ return i;
46
+ }
47
+
48
+
49
+ /*- Functions -*/
50
+
51
+ /* Burrows-Wheeler transform. */
52
+ saint_t
53
+ bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA,
54
+ saidx_t n, saidx_t *idx) {
55
+ saidx_t *A, i, j, p, t;
56
+ saint_t c;
57
+
58
+ /* Check arguments. */
59
+ if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; }
60
+ if(n <= 1) {
61
+ if(n == 1) { U[0] = T[0]; }
62
+ *idx = n;
63
+ return 0;
64
+ }
65
+
66
+ if((A = SA) == NULL) {
67
+ i = divbwt(T, U, NULL, n);
68
+ if(0 <= i) { *idx = i; i = 0; }
69
+ return (saint_t)i;
70
+ }
71
+
72
+ /* BW transform. */
73
+ if(T == U) {
74
+ t = n;
75
+ for(i = 0, j = 0; i < n; ++i) {
76
+ p = t - 1;
77
+ t = A[i];
78
+ if(0 <= p) {
79
+ c = T[j];
80
+ U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
81
+ A[j] = c;
82
+ j++;
83
+ } else {
84
+ *idx = i;
85
+ }
86
+ }
87
+ p = t - 1;
88
+ if(0 <= p) {
89
+ c = T[j];
90
+ U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
91
+ A[j] = c;
92
+ } else {
93
+ *idx = i;
94
+ }
95
+ } else {
96
+ U[0] = T[n - 1];
97
+ for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; }
98
+ *idx = i + 1;
99
+ for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; }
100
+ }
101
+
102
+ if(SA == NULL) {
103
+ /* Deallocate memory. */
104
+ free(A);
105
+ }
106
+
107
+ return 0;
108
+ }
109
+
110
+ /* Inverse Burrows-Wheeler transform. */
111
+ saint_t
112
+ inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A,
113
+ saidx_t n, saidx_t idx) {
114
+ saidx_t C[ALPHABET_SIZE];
115
+ sauchar_t D[ALPHABET_SIZE];
116
+ saidx_t *B;
117
+ saidx_t i, p;
118
+ saint_t c, d;
119
+
120
+ /* Check arguments. */
121
+ if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) ||
122
+ (n < idx) || ((0 < n) && (idx == 0))) {
123
+ return -1;
124
+ }
125
+ if(n <= 1) { return 0; }
126
+
127
+ if((B = A) == NULL) {
128
+ /* Allocate n*sizeof(saidx_t) bytes of memory. */
129
+ if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; }
130
+ }
131
+
132
+ /* Inverse BW transform. */
133
+ for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; }
134
+ for(i = 0; i < n; ++i) { ++C[T[i]]; }
135
+ for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) {
136
+ p = C[c];
137
+ if(0 < p) {
138
+ C[c] = i;
139
+ D[d++] = (sauchar_t)c;
140
+ i += p;
141
+ }
142
+ }
143
+ for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; }
144
+ for( ; i < n; ++i) { B[C[T[i]]++] = i + 1; }
145
+ for(c = 0; c < d; ++c) { C[c] = C[D[c]]; }
146
+ for(i = 0, p = idx; i < n; ++i) {
147
+ U[i] = D[binarysearch_lower(C, d, p)];
148
+ p = B[p - 1];
149
+ }
150
+
151
+ if(A == NULL) {
152
+ /* Deallocate memory. */
153
+ free(B);
154
+ }
155
+
156
+ return 0;
157
+ }
158
+
159
+ /* Checks the suffix array SA of the string T. */
160
+ saint_t
161
+ sufcheck(const sauchar_t *T, const saidx_t *SA,
162
+ saidx_t n, saint_t verbose) {
163
+ saidx_t C[ALPHABET_SIZE];
164
+ saidx_t i, p, q, t;
165
+ saint_t c;
166
+
167
+ if(verbose) { fprintf(stderr, "sufcheck: "); }
168
+
169
+ /* Check arguments. */
170
+ if((T == NULL) || (SA == NULL) || (n < 0)) {
171
+ if(verbose) { fprintf(stderr, "Invalid arguments.\n"); }
172
+ return -1;
173
+ }
174
+ if(n == 0) {
175
+ if(verbose) { fprintf(stderr, "Done.\n"); }
176
+ return 0;
177
+ }
178
+
179
+ /* check range: [0..n-1] */
180
+ for(i = 0; i < n; ++i) {
181
+ if((SA[i] < 0) || (n <= SA[i])) {
182
+ if(verbose) {
183
+ fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n"
184
+ " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
185
+ n - 1, i, SA[i]);
186
+ }
187
+ return -2;
188
+ }
189
+ }
190
+
191
+ /* check first characters. */
192
+ for(i = 1; i < n; ++i) {
193
+ if(T[SA[i - 1]] > T[SA[i]]) {
194
+ if(verbose) {
195
+ fprintf(stderr, "Suffixes in wrong order.\n"
196
+ " T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d"
197
+ " > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n",
198
+ i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]);
199
+ }
200
+ return -3;
201
+ }
202
+ }
203
+
204
+ /* check suffixes. */
205
+ for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; }
206
+ for(i = 0; i < n; ++i) { ++C[T[i]]; }
207
+ for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) {
208
+ t = C[i];
209
+ C[i] = p;
210
+ p += t;
211
+ }
212
+
213
+ q = C[T[n - 1]];
214
+ C[T[n - 1]] += 1;
215
+ for(i = 0; i < n; ++i) {
216
+ p = SA[i];
217
+ if(0 < p) {
218
+ c = T[--p];
219
+ t = C[c];
220
+ } else {
221
+ c = T[p = n - 1];
222
+ t = q;
223
+ }
224
+ if((t < 0) || (p != SA[t])) {
225
+ if(verbose) {
226
+ fprintf(stderr, "Suffix in wrong position.\n"
227
+ " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n"
228
+ " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
229
+ t, (0 <= t) ? SA[t] : -1, i, SA[i]);
230
+ }
231
+ return -4;
232
+ }
233
+ if(t != q) {
234
+ ++C[c];
235
+ if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; }
236
+ }
237
+ }
238
+
239
+ if(1 <= verbose) { fprintf(stderr, "Done.\n"); }
240
+ return 0;
241
+ }
242
+
243
+
244
+ static
245
+ int
246
+ _compare(const sauchar_t *T, saidx_t Tsize,
247
+ const sauchar_t *P, saidx_t Psize,
248
+ saidx_t suf, saidx_t *match) {
249
+ saidx_t i, j;
250
+ saint_t r;
251
+ for(i = suf + *match, j = *match, r = 0;
252
+ (i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { }
253
+ *match = j;
254
+ return (r == 0) ? -(j != Psize) : r;
255
+ }
256
+
257
+ /* Search for the pattern P in the string T. */
258
+ saidx_t
259
+ sa_search(const sauchar_t *T, saidx_t Tsize,
260
+ const sauchar_t *P, saidx_t Psize,
261
+ const saidx_t *SA, saidx_t SAsize,
262
+ saidx_t *idx) {
263
+ saidx_t size, lsize, rsize, half;
264
+ saidx_t match, lmatch, rmatch;
265
+ saidx_t llmatch, lrmatch, rlmatch, rrmatch;
266
+ saidx_t i, j, k;
267
+ saint_t r;
268
+
269
+ if(idx != NULL) { *idx = -1; }
270
+ if((T == NULL) || (P == NULL) || (SA == NULL) ||
271
+ (Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; }
272
+ if((Tsize == 0) || (SAsize == 0)) { return 0; }
273
+ if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; }
274
+
275
+ for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1;
276
+ 0 < size;
277
+ size = half, half >>= 1) {
278
+ match = MIN(lmatch, rmatch);
279
+ r = _compare(T, Tsize, P, Psize, SA[i + half], &match);
280
+ if(r < 0) {
281
+ i += half + 1;
282
+ half -= (size & 1) ^ 1;
283
+ lmatch = match;
284
+ } else if(r > 0) {
285
+ rmatch = match;
286
+ } else {
287
+ lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
288
+
289
+ /* left part */
290
+ for(llmatch = lmatch, lrmatch = match, half = lsize >> 1;
291
+ 0 < lsize;
292
+ lsize = half, half >>= 1) {
293
+ lmatch = MIN(llmatch, lrmatch);
294
+ r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch);
295
+ if(r < 0) {
296
+ j += half + 1;
297
+ half -= (lsize & 1) ^ 1;
298
+ llmatch = lmatch;
299
+ } else {
300
+ lrmatch = lmatch;
301
+ }
302
+ }
303
+
304
+ /* right part */
305
+ for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1;
306
+ 0 < rsize;
307
+ rsize = half, half >>= 1) {
308
+ rmatch = MIN(rlmatch, rrmatch);
309
+ r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch);
310
+ if(r <= 0) {
311
+ k += half + 1;
312
+ half -= (rsize & 1) ^ 1;
313
+ rlmatch = rmatch;
314
+ } else {
315
+ rrmatch = rmatch;
316
+ }
317
+ }
318
+
319
+ break;
320
+ }
321
+ }
322
+
323
+ if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
324
+ return k - j;
325
+ }
326
+
327
+ /* Search for the character c in the string T. */
328
+ saidx_t
329
+ sa_simplesearch(const sauchar_t *T, saidx_t Tsize,
330
+ const saidx_t *SA, saidx_t SAsize,
331
+ saint_t c, saidx_t *idx) {
332
+ saidx_t size, lsize, rsize, half;
333
+ saidx_t i, j, k, p;
334
+ saint_t r;
335
+
336
+ if(idx != NULL) { *idx = -1; }
337
+ if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; }
338
+ if((Tsize == 0) || (SAsize == 0)) { return 0; }
339
+
340
+ for(i = j = k = 0, size = SAsize, half = size >> 1;
341
+ 0 < size;
342
+ size = half, half >>= 1) {
343
+ p = SA[i + half];
344
+ r = (p < Tsize) ? T[p] - c : -1;
345
+ if(r < 0) {
346
+ i += half + 1;
347
+ half -= (size & 1) ^ 1;
348
+ } else if(r == 0) {
349
+ lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
350
+
351
+ /* left part */
352
+ for(half = lsize >> 1;
353
+ 0 < lsize;
354
+ lsize = half, half >>= 1) {
355
+ p = SA[j + half];
356
+ r = (p < Tsize) ? T[p] - c : -1;
357
+ if(r < 0) {
358
+ j += half + 1;
359
+ half -= (lsize & 1) ^ 1;
360
+ }
361
+ }
362
+
363
+ /* right part */
364
+ for(half = rsize >> 1;
365
+ 0 < rsize;
366
+ rsize = half, half >>= 1) {
367
+ p = SA[k + half];
368
+ r = (p < Tsize) ? T[p] - c : -1;
369
+ if(r <= 0) {
370
+ k += half + 1;
371
+ half -= (rsize & 1) ^ 1;
372
+ }
373
+ }
374
+
375
+ break;
376
+ }
377
+ }
378
+
379
+ if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
380
+ return k - j;
381
+ }
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: divsufsort
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - winebarrel
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-10-22 00:00:00 +09:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: sgwr_dts@yahoo.co.jp
18
+ executables: []
19
+
20
+ extensions:
21
+ - ext/extconf.rb
22
+ extra_rdoc_files:
23
+ - README.txt
24
+ - ext/divsufsort_ruby.c
25
+ files:
26
+ - ext/divsufsort.c
27
+ - ext/divsufsort.h
28
+ - ext/divsufsort.o
29
+ - ext/divsufsort.so
30
+ - ext/divsufsort_private.h
31
+ - ext/divsufsort_ruby.c
32
+ - ext/divsufsort_ruby.o
33
+ - ext/extconf.rb
34
+ - ext/lfs.h
35
+ - ext/Makefile
36
+ - ext/mkmf.log
37
+ - ext/sssort.c
38
+ - ext/sssort.o
39
+ - ext/trsort.c
40
+ - ext/trsort.o
41
+ - ext/utils.c
42
+ - ext/utils.o
43
+ - libdivsufsort/COPYING
44
+ - libdivsufsort/divsufsort.c
45
+ - libdivsufsort/divsufsort.h
46
+ - libdivsufsort/divsufsort_private.h
47
+ - libdivsufsort/lfs.h
48
+ - libdivsufsort/sssort.c
49
+ - libdivsufsort/trsort.c
50
+ - libdivsufsort/utils.c
51
+ - README.txt
52
+ has_rdoc: true
53
+ homepage: http://divsufsort.rubyforge.org
54
+ post_install_message:
55
+ rdoc_options:
56
+ - --title
57
+ - divsufsort - Ruby bindings for libdivsufsort.
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ version:
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: "0"
71
+ version:
72
+ requirements: []
73
+
74
+ rubyforge_project: divsufsort
75
+ rubygems_version: 1.2.0
76
+ signing_key:
77
+ specification_version: 2
78
+ summary: Ruby bindings for libdivsufsort.
79
+ test_files: []
80
+