gdiff 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +7 -0
- data/COPYING.suffix_array +278 -0
- data/LICENSE.suffix_array +17 -0
- data/README +40 -0
- data/README.suffix_array +274 -0
- data/bin/gdiff +25 -0
- data/bin/gpatch +25 -0
- data/doc/classes/Diff.html +117 -0
- data/doc/classes/Diff/GDiff.html +120 -0
- data/doc/classes/Diff/GDiff/EGdiffError.html +111 -0
- data/doc/classes/Diff/GDiff/ENoGdiffStream.html +113 -0
- data/doc/classes/Diff/GDiff/EPrematureEndOfStream.html +113 -0
- data/doc/classes/Diff/GDiff/Operations.html +156 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.html +246 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000014.html +19 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000015.html +39 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000016.html +25 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000017.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.html +246 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000009.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000010.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000011.html +35 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000012.html +29 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000013.html +19 -0
- data/doc/classes/SAError.html +111 -0
- data/doc/classes/SuffixArray.html +342 -0
- data/doc/classes/SuffixArray.src/M000001.html +97 -0
- data/doc/classes/SuffixArray.src/M000002.html +73 -0
- data/doc/classes/SuffixArray.src/M000003.html +102 -0
- data/doc/classes/SuffixArray.src/M000004.html +47 -0
- data/doc/classes/SuffixArray.src/M000005.html +44 -0
- data/doc/classes/SuffixArray.src/M000006.html +33 -0
- data/doc/classes/SuffixArray.src/M000007.html +24 -0
- data/doc/classes/SuffixArray.src/M000008.html +46 -0
- data/doc/created.rid +1 -0
- data/doc/files/ext/gdiff/suffix_array/extconf_rb.html +108 -0
- data/doc/files/ext/gdiff/suffix_array/lcp_c.html +101 -0
- data/doc/files/ext/gdiff/suffix_array/sarray_c.html +101 -0
- data/doc/files/ext/gdiff/suffix_array/suffix_array_c.html +101 -0
- data/doc/files/lib/gdiff_rb.html +108 -0
- data/doc/fr_class_index.html +36 -0
- data/doc/fr_file_index.html +31 -0
- data/doc/fr_method_index.html +43 -0
- data/doc/index.html +24 -0
- data/doc/rdoc-style.css +208 -0
- data/ext/gdiff/COPYING +278 -0
- data/ext/gdiff/LICENSE +17 -0
- data/ext/gdiff/README +274 -0
- data/ext/gdiff/extconf.rb +3 -0
- data/ext/gdiff/lcp.c +97 -0
- data/ext/gdiff/sarray.3 +145 -0
- data/ext/gdiff/sarray.c +372 -0
- data/ext/gdiff/sarray.h +13 -0
- data/ext/gdiff/suffix_array.c +510 -0
- data/lib/gdiff.rb +255 -0
- data/setup.rb +1551 -0
- data/test/tc_gdiff.rb +66 -0
- metadata +119 -0
data/ext/gdiff/sarray.c
ADDED
@@ -0,0 +1,372 @@
|
|
1
|
+
/*
|
2
|
+
Hybrid suffix-array builder, written by Sean Quinlan and Sean Dorward,
|
3
|
+
distributed under the Plan 9 license, which reads in part
|
4
|
+
|
5
|
+
3.3 With respect to Your distribution of Licensed Software (or any
|
6
|
+
portion thereof), You must include the following information in a
|
7
|
+
conspicuous location governing such distribution (e.g., a separate
|
8
|
+
file) and on all copies of any Source Code version of Licensed
|
9
|
+
Software You distribute:
|
10
|
+
|
11
|
+
"The contents herein includes software initially developed by
|
12
|
+
Lucent Technologies Inc. and others, and is subject to the terms
|
13
|
+
of the Lucent Technologies Inc. Plan 9 Open Source License
|
14
|
+
Agreement. A copy of the Plan 9 Open Source License Agreement is
|
15
|
+
available at: http://plan9.bell-labs.com/plan9dist/download.html
|
16
|
+
or by contacting Lucent Technologies at http: //www.lucent.com.
|
17
|
+
All software distributed under such Agreement is distributed on,
|
18
|
+
obligations and limitations under such Agreement. Portions of
|
19
|
+
the software developed by Lucent Technologies Inc. and others are
|
20
|
+
Copyright (c) 2002. All rights reserved.
|
21
|
+
Contributor(s):___________________________"
|
22
|
+
*/
|
23
|
+
/*
|
24
|
+
int sarray(int a[], int n)
|
25
|
+
Purpose
|
26
|
+
Return in a[] a suffix array for the original
|
27
|
+
contents of a[]. (The original values in a[]
|
28
|
+
are typically serial numbers of distinct tokens
|
29
|
+
in some list.)
|
30
|
+
|
31
|
+
Precondition
|
32
|
+
Array a[] holds n values, with n>=1. Exactly k
|
33
|
+
distinct values, in the range 0..k-1, are present.
|
34
|
+
Value 0, an endmark, appears exactly once, at a[n-1].
|
35
|
+
|
36
|
+
Postcondition
|
37
|
+
Array a[] is a copy of the internal array p[]
|
38
|
+
that records the sorting permutation: if i<j
|
39
|
+
then the original suffix a[p[i]..n-1] is
|
40
|
+
lexicographically less than a[p[j]..n-1].
|
41
|
+
|
42
|
+
Return value
|
43
|
+
-1 on error.
|
44
|
+
Otherwise index i such that a[i]==0, i.e. the
|
45
|
+
index of the whole-string suffix, used in
|
46
|
+
Burrows-Wheeler data compression.
|
47
|
+
*/
|
48
|
+
|
49
|
+
#include <stdlib.h>
|
50
|
+
#include <string.h>
|
51
|
+
#include "sarray.h"
|
52
|
+
|
53
|
+
#define pred(i, h) ((t=(i)-(h))<0? t+n: t)
|
54
|
+
#define succ(i, h) ((t=(i)+(h))>=n? t-n: t)
|
55
|
+
|
56
|
+
enum
|
57
|
+
{
|
58
|
+
BUCK = ~(~0u>>1), /* high bit */
|
59
|
+
MAXI = ~0u>>1, /* biggest int */
|
60
|
+
};
|
61
|
+
|
62
|
+
static void qsort2(int*, int*, int n);
|
63
|
+
static int ssortit(int a[], int p[], int n, int h, int *pe, int nbuck);
|
64
|
+
|
65
|
+
int
|
66
|
+
sarray(int a[], int n)
|
67
|
+
{
|
68
|
+
int i, l;
|
69
|
+
int c, cc, ncc, lab, cum, nbuck;
|
70
|
+
int k;
|
71
|
+
int *p = 0;
|
72
|
+
int result = -1;
|
73
|
+
int *al;
|
74
|
+
int *pl;
|
75
|
+
|
76
|
+
for(k=0,i=0; i<n; i++)
|
77
|
+
if(a[i] > k)
|
78
|
+
k = a[i]; /* max element */
|
79
|
+
k++;
|
80
|
+
if(k>n)
|
81
|
+
goto out;
|
82
|
+
|
83
|
+
nbuck = 0;
|
84
|
+
p = malloc(n*sizeof(int));
|
85
|
+
if(p == 0)
|
86
|
+
goto out;
|
87
|
+
|
88
|
+
|
89
|
+
pl = p + n - k;
|
90
|
+
al = a;
|
91
|
+
memset(pl, -1, k*sizeof(int));
|
92
|
+
|
93
|
+
for(i=0; i<n; i++) { /* (1) link */
|
94
|
+
l = a[i];
|
95
|
+
al[i] = pl[l];
|
96
|
+
pl[l] = i;
|
97
|
+
}
|
98
|
+
|
99
|
+
for(i=0; i<k; i++) /* check input - no holes */
|
100
|
+
if(pl[i]<0)
|
101
|
+
goto out;
|
102
|
+
|
103
|
+
|
104
|
+
lab = 0; /* (2) create p and label a */
|
105
|
+
cum = 0;
|
106
|
+
i = 0;
|
107
|
+
for(c = 0; c < k; c++){
|
108
|
+
for(cc = pl[c]; cc != -1; cc = ncc){
|
109
|
+
ncc = al[cc];
|
110
|
+
al[cc] = lab;
|
111
|
+
cum++;
|
112
|
+
p[i++] = cc;
|
113
|
+
}
|
114
|
+
if(lab + 1 == cum) {
|
115
|
+
i--;
|
116
|
+
} else {
|
117
|
+
p[i-1] |= BUCK;
|
118
|
+
nbuck++;
|
119
|
+
}
|
120
|
+
lab = cum;
|
121
|
+
}
|
122
|
+
|
123
|
+
result = ssortit(a, p, n, 1, p+i, nbuck);
|
124
|
+
memcpy(a, p, n*sizeof(int));
|
125
|
+
|
126
|
+
out:
|
127
|
+
free(p);
|
128
|
+
return result;
|
129
|
+
}
|
130
|
+
|
131
|
+
/* bsarray(uchar buf[], int p[], int n)
|
132
|
+
* The input, buf, is an arbitrary byte array of length n.
|
133
|
+
* The input is copied to temporary storage, relabeling
|
134
|
+
* pairs of input characters and appending a unique end marker
|
135
|
+
* having a value that is effectively less than any input byte.
|
136
|
+
* The suffix array of this extended input is computed and
|
137
|
+
* stored in p, which must have length at least n+1.
|
138
|
+
*
|
139
|
+
* Returns the index of the identity permutation (regarding
|
140
|
+
* the suffix array as a list of circular shifts),
|
141
|
+
* or -1 if there was an error.
|
142
|
+
*/
|
143
|
+
int
|
144
|
+
bsarray(const uchar buf[], int p[], int n)
|
145
|
+
{
|
146
|
+
int *a, buckets[256*256];
|
147
|
+
int i, last, cum, c, cc, ncc, lab, id, nbuck;
|
148
|
+
|
149
|
+
a = malloc((n+1)*sizeof(int));
|
150
|
+
if(a == 0)
|
151
|
+
return -1;
|
152
|
+
|
153
|
+
|
154
|
+
memset(buckets, -1, sizeof(buckets));
|
155
|
+
c = buf[n-1] << 8;
|
156
|
+
last = c;
|
157
|
+
for(i = n - 2; i >= 0; i--){
|
158
|
+
c = (buf[i] << 8) | (c >> 8);
|
159
|
+
a[i] = buckets[c];
|
160
|
+
buckets[c] = i;
|
161
|
+
}
|
162
|
+
|
163
|
+
/*
|
164
|
+
* end of string comes before anything else
|
165
|
+
*/
|
166
|
+
a[n] = 0;
|
167
|
+
|
168
|
+
lab = 1;
|
169
|
+
cum = 1;
|
170
|
+
i = 0;
|
171
|
+
nbuck = 0;
|
172
|
+
for(c = 0; c < 256*256; c++) {
|
173
|
+
/*
|
174
|
+
* last character is followed by unique end of string
|
175
|
+
*/
|
176
|
+
if(c == last) {
|
177
|
+
a[n-1] = lab;
|
178
|
+
cum++;
|
179
|
+
lab++;
|
180
|
+
}
|
181
|
+
|
182
|
+
for(cc = buckets[c]; cc != -1; cc = ncc) {
|
183
|
+
ncc = a[cc];
|
184
|
+
a[cc] = lab;
|
185
|
+
cum++;
|
186
|
+
p[i++] = cc;
|
187
|
+
}
|
188
|
+
if(lab == cum)
|
189
|
+
continue;
|
190
|
+
if(lab + 1 == cum)
|
191
|
+
i--;
|
192
|
+
else {
|
193
|
+
p[i - 1] |= BUCK;
|
194
|
+
nbuck++;
|
195
|
+
}
|
196
|
+
lab = cum;
|
197
|
+
}
|
198
|
+
|
199
|
+
id = ssortit(a, p, n+1, 2, p+i, nbuck);
|
200
|
+
free(a);
|
201
|
+
return id;
|
202
|
+
}
|
203
|
+
|
204
|
+
static int
|
205
|
+
ssortit(int a[], int p[], int n, int h, int *pe, int nbuck)
|
206
|
+
{
|
207
|
+
int *s, *ss, *packing, *sorting;
|
208
|
+
int v, sv, vv, packed, lab, t, i;
|
209
|
+
|
210
|
+
for(; h < n && p < pe; h=2*h) {
|
211
|
+
packing = p;
|
212
|
+
nbuck = 0;
|
213
|
+
|
214
|
+
for(sorting = p; sorting < pe; sorting = s){
|
215
|
+
/*
|
216
|
+
* find length of stuff to sort
|
217
|
+
*/
|
218
|
+
lab = a[*sorting];
|
219
|
+
for(s = sorting; ; s++) {
|
220
|
+
sv = *s;
|
221
|
+
v = a[succ(sv & ~BUCK, h)];
|
222
|
+
if(v & BUCK)
|
223
|
+
v = lab;
|
224
|
+
a[sv & ~BUCK] = v | BUCK;
|
225
|
+
if(sv & BUCK)
|
226
|
+
break;
|
227
|
+
}
|
228
|
+
*s++ &= ~BUCK;
|
229
|
+
nbuck++;
|
230
|
+
|
231
|
+
qsort2(sorting, a, s - sorting);
|
232
|
+
|
233
|
+
v = a[*sorting];
|
234
|
+
a[*sorting] = lab;
|
235
|
+
packed = 0;
|
236
|
+
for(ss = sorting + 1; ss < s; ss++) {
|
237
|
+
sv = *ss;
|
238
|
+
vv = a[sv];
|
239
|
+
if(vv == v) {
|
240
|
+
*packing++ = ss[-1];
|
241
|
+
packed++;
|
242
|
+
} else {
|
243
|
+
if(packed) {
|
244
|
+
*packing++ = ss[-1] | BUCK;
|
245
|
+
}
|
246
|
+
lab += packed + 1;
|
247
|
+
packed = 0;
|
248
|
+
v = vv;
|
249
|
+
}
|
250
|
+
a[sv] = lab;
|
251
|
+
}
|
252
|
+
if(packed) {
|
253
|
+
*packing++ = ss[-1] | BUCK;
|
254
|
+
}
|
255
|
+
}
|
256
|
+
pe = packing;
|
257
|
+
}
|
258
|
+
|
259
|
+
/*
|
260
|
+
* reconstuct the permutation matrix
|
261
|
+
* return index of the entire string
|
262
|
+
*/
|
263
|
+
v = a[0];
|
264
|
+
for(i = 0; i < n; i++)
|
265
|
+
p[a[i]] = i;
|
266
|
+
|
267
|
+
return v;
|
268
|
+
}
|
269
|
+
|
270
|
+
/*
|
271
|
+
* qsort from Bentley and McIlroy, Software--Practice and Experience
|
272
|
+
23 (1993) 1249-1265, specialized for sorting permutations based on
|
273
|
+
successors
|
274
|
+
*/
|
275
|
+
/* Exchange the first n ints of a with the first n ints of b; n <= 0 does nothing. */
static void
vecswap2(int *a, int *b, int n)
{
    int k, held;

    for (k = 0; k < n; k++) {
        held = a[k];
        a[k] = b[k];
        b[k] = held;
    }
}
|
284
|
+
|
285
|
+
/* Exchange *(a) and *(b); needs an int named t in the calling scope.
 * Wrapped in do/while(0) so it is a single statement, safe in an unbraced if/else. */
#define swap2(a, b) do { t = *(a); *(a) = *(b); *(b) = t; } while (0)
|
286
|
+
|
287
|
+
/*
 * Pick one of the pointers a, b, c.  The key of a pointer q is
 * asucc[*q].  If two keys are equal a pointer holding that key is
 * returned, otherwise the pointer whose key is the median of the three.
 */
static int*
med3(int *a, int *b, int *c, int *asucc)
{
    int key_a = asucc[*a];
    int key_b = asucc[*b];
    int key_c;

    if (key_a == key_b)
        return a;

    key_c = asucc[*c];
    if (key_c == key_a || key_c == key_b)
        return c;

    if (key_a < key_b) {
        if (key_b < key_c)
            return b;
        return key_a < key_c ? c : a;
    }
    if (key_b > key_c)
        return b;
    return key_a < key_c ? a : c;
}
|
300
|
+
|
301
|
+
/*
 * Insertion sort of the n entries of a into non-decreasing order of
 * asucc[entry].  Entries with equal keys keep their relative order.
 */
static void
inssort(int *a, int *asucc, int n)
{
    int i;

    for (i = 1; i < n; i++) {
        int *slot = a + i;

        /* sink the new entry towards the front while it is out of order */
        while (slot > a && asucc[slot[-1]] > asucc[*slot]) {
            int held = *slot;
            *slot = slot[-1];
            slot[-1] = held;
            slot--;
        }
    }
}
|
313
|
+
|
314
|
+
/*
 * Sort the n entries of a by the key asucc[entry].
 *
 * Fewer than 15 entries go to inssort().  Otherwise a pivot is chosen
 * with med3() (from nine samples when n > 30), the range is split
 * three ways with entries equal to the pivot parked at both ends,
 * the equal runs are swapped into the middle, and the "less" and
 * "greater" parts are sorted recursively.
 */
static void
qsort2(int *a, int *asucc, int n)
{
    int step, count, diff, pivot;
    int *eq_left, *scan_lo, *scan_hi, *eq_right;
    int *first, *mid, *last, *limit, t;

    if (n < 15) {
        inssort(a, asucc, n);
        return;
    }

    first = a;
    mid = a + (n >> 1);
    last = a + (n-1);
    if (n > 30) {       /* On big arrays, pseudomedian of 9 */
        step = (n >> 3);
        first = med3(first, first+step, first+2*step, asucc);
        mid = med3(mid-step, mid, mid+step, asucc);
        last = med3(last-2*step, last-step, last, asucc);
    }
    mid = med3(first, mid, last, asucc);
    swap2(a, mid);
    pivot = asucc[*a];

    eq_left = scan_lo = a + 1;
    scan_hi = eq_right = a + n-1;
    for (;;) {
        while (scan_lo <= scan_hi && (diff = asucc[*scan_lo]-pivot) <= 0) {
            if (diff == 0) {
                swap2(eq_left, scan_lo);
                eq_left++;
            }
            scan_lo++;
        }
        while (scan_lo <= scan_hi && (diff = asucc[*scan_hi]-pivot) >= 0) {
            if (diff == 0) {
                swap2(scan_hi, eq_right);
                eq_right--;
            }
            scan_hi--;
        }
        if (scan_lo > scan_hi)
            break;
        swap2(scan_lo, scan_hi);
        scan_lo++;
        scan_hi--;
    }

    /* move the equal runs from the two ends into the middle */
    limit = a + n;
    count = eq_left-a;
    if (scan_lo-eq_left < count)
        count = scan_lo-eq_left;
    vecswap2(a, scan_lo-count, count);

    count = limit-eq_right-1;
    if (eq_right-scan_hi < count)
        count = eq_right-scan_hi;
    vecswap2(scan_lo, limit-count, count);

    if ((count = scan_lo-eq_left) > 1)
        qsort2(a, asucc, count);
    if ((count = eq_right-scan_hi) > 1)
        qsort2(a + n-count, asucc, count);
}
|
data/ext/gdiff/sarray.h
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#ifndef sarray_h
|
2
|
+
#define sarray_h
|
3
|
+
|
4
|
+
|
5
|
+
typedef unsigned char uchar;
|
6
|
+
|
7
|
+
int sarray(int *a, int n);
|
8
|
+
int bsarray(const uchar *b, int *a, int n);
|
9
|
+
int *lcp(const int *a, const char *s, int n);
|
10
|
+
int lcpa(const int *a, const char *s, int *b, int n);
|
11
|
+
|
12
|
+
#endif
|
13
|
+
|
@@ -0,0 +1,510 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#include <sarray.h>
|
4
|
+
|
5
|
+
typedef struct SuffixArray {
|
6
|
+
int *suffix_index;
|
7
|
+
unsigned int ends[256];
|
8
|
+
unsigned int starts[256];
|
9
|
+
} SuffixArray;
|
10
|
+
|
11
|
+
|
12
|
+
#define ERR_NO_ZERO_LENGTH_INPUT "Cannot create a suffix array from a 0 length input source."
|
13
|
+
#define ERR_NOT_INITIALIZED "Initialization failed, you cannot use this object."
|
14
|
+
#define ERR_START_IF_ARRAY "You must provide a start argument if you give an array argument."
|
15
|
+
#define ERR_MISMATCH_LENGTH "The raw array length is different from the source length"
|
16
|
+
static VALUE cSAError;
|
17
|
+
|
18
|
+
|
19
|
+
/*
 * Compare target against source byte by byte.
 *
 * source/src_len  bytes to compare against
 * target          bytes being looked for
 * tgt_len         in: number of target bytes to compare;
 *                 out: number of leading bytes that matched
 *
 * Returns 0 when all *tgt_len target bytes matched, a positive value
 * when source ran out before target did (target then orders after
 * source), and otherwise the difference between the first pair of
 * differing bytes (target byte minus source byte).
 *
 * Declared static: a plain `inline` definition provides no external
 * definition in C99/C11, so the call fails to link whenever the
 * compiler decides not to inline it.
 */
static inline int scan_string(unsigned char *source, size_t src_len,
        unsigned char *target, size_t *tgt_len)
{
    size_t wanted = *tgt_len;
    size_t matched = 0;

    while(matched < wanted && matched < src_len && target[matched] == source[matched]) {
        matched++;
    }

    *tgt_len = matched;     // out parameter for the length that was found

    if(matched == wanted) {
        // found a match that's at least as long as the target, so good enough
        return 0;
    }
    if(matched == src_len) {
        // source is exhausted: do not read source[src_len], just report "greater"
        return 1;
    }

    // target and source bytes are now different, return that difference
    return target[matched] - source[matched];
}
|
44
|
+
|
45
|
+
|
46
|
+
/*
 * Binary-search the suffix array sa for the suffix of source that
 * shares the longest prefix with target.
 *
 * The search window starts at starts[first target byte] and ends one
 * slot past ends[first target byte].  Every probed suffix is compared
 * with scan_string(); the longest prefix seen so far is remembered.
 * The search stops on a full match, when the window is empty or has
 * left the array, or when the remembered length equals *tgt_len.
 *
 * tgt_len  in: number of target bytes to match; out: longest length found
 * Returns the source offset (an sa entry) of the best suffix probed;
 * sa[0] when nothing matched.
 */
size_t find_longest_match(unsigned char *source, size_t src_len,
        unsigned char *target, size_t *tgt_len,
        unsigned int starts[], unsigned int ends[], unsigned int sa[])
{
    size_t lo = starts[*target];
    size_t hi = ends[*target] + 1;
    size_t mid = (lo + hi) / 2;     // computed up front so the loop condition can test it
    size_t best_len = 0;
    size_t best_slot = 0;

    while(lo <= hi && hi <= src_len && mid < src_len && best_len != *tgt_len) {
        size_t suffix = sa[mid];
        size_t probe_len = *tgt_len;
        int order = scan_string(source + suffix, src_len - suffix, target, &probe_len);

        if(probe_len > best_len) {
            best_len = probe_len;
            best_slot = mid;
        }

        if(order == 0) {
            // complete match, nothing longer is possible
            break;
        }

        if(order < 0) {
            // target orders before this suffix, look in the lower half
            hi = mid - 1;
        } else {
            // target orders after this suffix, look in the upper half
            lo = mid + 1;
        }

        mid = (lo + hi) / 2;
    }

    *tgt_len = best_len;
    return sa[best_slot];
}
|
90
|
+
|
91
|
+
|
92
|
+
/*
|
93
|
+
* call-seq:
|
94
|
+
* sarray.source -> String
|
95
|
+
*
|
96
|
+
* Returns the source that this suffix array was constructed with.
|
97
|
+
*/
|
98
|
+
static VALUE SuffixArray_source(VALUE self)
|
99
|
+
{
|
100
|
+
return rb_iv_get(self, "@source");
|
101
|
+
}
|
102
|
+
|
103
|
+
|
104
|
+
|
105
|
+
static void SuffixArray_free(void *p) {
|
106
|
+
SuffixArray *sa = (SuffixArray *)p;
|
107
|
+
if(sa->suffix_index) free(sa->suffix_index);
|
108
|
+
if(sa) free(sa);
|
109
|
+
}
|
110
|
+
|
111
|
+
/*
 * Allocation function for SuffixArray: wraps a new SuffixArray
 * structure in a Ruby object of class klass, with no mark function and
 * SuffixArray_free as the free function.
 */
static VALUE SuffixArray_alloc(VALUE klass)
{
    SuffixArray *sa = NULL;
    VALUE wrapped = Data_Make_Struct(klass, SuffixArray, 0, SuffixArray_free, sa);

    return wrapped;
}
|
118
|
+
|
119
|
+
|
120
|
+
/*
|
121
|
+
* call-seq:
|
122
|
+
* SuffixArray.new(source, [raw_array], [start]) -> SuffixArray
|
123
|
+
*
|
124
|
+
* Given a string (anything like a string really) this will generate a
|
125
|
+
* suffix array for the string so that you can work with it. The
|
126
|
+
* source cannot be an empty string since this is a useless operation.
|
127
|
+
*
|
128
|
+
* Two optional parameters allow you to restore a suffix array without
|
129
|
+
* running the construction process again. You basically give it the
|
130
|
+
* String from SuffixArray.raw_array and the start from SuffixArray.suffix_start
|
131
|
+
* and it will skip most calculations. <b>This feature is really experimental
|
132
|
+
* and is CPU dependent since the integers in the raw_array are native.</b>
|
133
|
+
*/
|
134
|
+
static VALUE SuffixArray_initialize(int argc, VALUE *argv, VALUE self)
|
135
|
+
{
|
136
|
+
SuffixArray *sa = NULL;
|
137
|
+
size_t i = 0;
|
138
|
+
Data_Get_Struct(self, SuffixArray, sa);
|
139
|
+
assert(sa != NULL);
|
140
|
+
VALUE source;
|
141
|
+
VALUE array;
|
142
|
+
VALUE start;
|
143
|
+
|
144
|
+
// sort out the arguments and such
|
145
|
+
rb_scan_args(argc, argv, "12", &source, &array, &start);
|
146
|
+
|
147
|
+
// get the string value of the source given to us, keep it around for later
|
148
|
+
VALUE sa_source_str = StringValue(source);
|
149
|
+
rb_iv_set(self, "@source", sa_source_str);
|
150
|
+
|
151
|
+
// setup temporary variables for the source and length pointers
|
152
|
+
unsigned char *sa_source = RSTRING(sa_source_str)->ptr;
|
153
|
+
size_t sa_source_len = RSTRING(sa_source_str)->len;
|
154
|
+
|
155
|
+
// error check the whole thing
|
156
|
+
if(sa_source_len == 0) {
|
157
|
+
// we can't have this, so return a nil
|
158
|
+
rb_raise(cSAError, ERR_NO_ZERO_LENGTH_INPUT);
|
159
|
+
}
|
160
|
+
|
161
|
+
if(!NIL_P(array) && NIL_P(start)) {
|
162
|
+
rb_raise(cSAError, ERR_START_IF_ARRAY);
|
163
|
+
} else if (!NIL_P(array) && !NIL_P(start)) {
|
164
|
+
// looks like both parameters were given so check out the lengths
|
165
|
+
if(RSTRING(array)->len / sizeof(int) != sa_source_len) {
|
166
|
+
rb_raise(cSAError, ERR_MISMATCH_LENGTH);
|
167
|
+
}
|
168
|
+
}
|
169
|
+
|
170
|
+
// allocate memory for the index integers
|
171
|
+
sa->suffix_index = malloc(sizeof(int) * (sa_source_len + 1));
|
172
|
+
|
173
|
+
if(NIL_P(array)) {
|
174
|
+
// create the suffix array from the source
|
175
|
+
int st = bsarray(sa_source, sa->suffix_index, sa_source_len-1);
|
176
|
+
|
177
|
+
// set the suffix_start in our object
|
178
|
+
rb_iv_set(self, "@suffix_start", INT2NUM(st));
|
179
|
+
} else {
|
180
|
+
// convert the given array and start to the internal structures needed
|
181
|
+
// the return value is ignored since I can't seem find any consistent definition for
|
182
|
+
// it's value that will tell me if this failed.
|
183
|
+
memcpy(sa->suffix_index, RSTRING(array)->ptr, sa_source_len * sizeof(int));
|
184
|
+
rb_iv_set(self, "@suffix_start", start);
|
185
|
+
}
|
186
|
+
|
187
|
+
unsigned char c = sa_source[sa->suffix_index[0]]; // start off with the first char in the sarray list
|
188
|
+
sa->starts[c] = 0;
|
189
|
+
for(i = 0; i < sa_source_len; i++) {
|
190
|
+
// skip characters until we see a new one
|
191
|
+
if(sa_source[sa->suffix_index[i]] != c) {
|
192
|
+
sa->ends[c] = i-1; // it's -1 since this is a new character, so the end was actually behind this point
|
193
|
+
c = sa_source[sa->suffix_index[i]];
|
194
|
+
sa->starts[c] = i;
|
195
|
+
}
|
196
|
+
}
|
197
|
+
// set the last valid character to get the tail of the sa, the loop will miss it
|
198
|
+
c = sa_source[sa->suffix_index[sa_source_len-1]];
|
199
|
+
sa->ends[c] = sa_source_len-1;
|
200
|
+
|
201
|
+
return INT2FIX(sa_source_len);
|
202
|
+
}
|
203
|
+
|
204
|
+
|
205
|
+
/*
|
206
|
+
* call-seq:
|
207
|
+
* sarray.longest_match(target, from_index) -> [start, length]
|
208
|
+
*
|
209
|
+
* Takes a target string and an index inside that string, and then tries
|
210
|
+
* to find the longest match from that point in the source string for this
|
211
|
+
* SuffixArray object.
|
212
|
+
*
|
213
|
+
* It returns an array of [start, length] of where in the source a length
|
214
|
+
* string from the target would match.
|
215
|
+
*
|
216
|
+
* Refer to the unit test for examples of usage.
|
217
|
+
*/
|
218
|
+
static VALUE SuffixArray_longest_match(VALUE self, VALUE target, VALUE from_index)
|
219
|
+
{
|
220
|
+
SuffixArray *sa = NULL;
|
221
|
+
Data_Get_Struct(self, SuffixArray, sa);
|
222
|
+
|
223
|
+
VALUE sa_source = SuffixArray_source(self);
|
224
|
+
|
225
|
+
if(sa == NULL || sa->suffix_index == NULL || RSTRING(sa_source)->len == 0) {
|
226
|
+
rb_raise(cSAError, ERR_NOT_INITIALIZED);
|
227
|
+
}
|
228
|
+
|
229
|
+
// get the from and for_length arguments as unsigned ints
|
230
|
+
size_t from = NUM2UINT(from_index);
|
231
|
+
|
232
|
+
|
233
|
+
// get better pointers for the source (should already be in String form)
|
234
|
+
unsigned char *source_ptr = RSTRING(sa_source)->ptr;
|
235
|
+
size_t source_len = RSTRING(sa_source)->len;
|
236
|
+
|
237
|
+
// get the target as a string
|
238
|
+
VALUE target_str = StringValue(target);
|
239
|
+
|
240
|
+
// better pointers again, we also need target_len as an in/out parameter
|
241
|
+
unsigned char *target_ptr = RSTRING(target_str)->ptr;
|
242
|
+
size_t target_len = RSTRING(target_str)->len;
|
243
|
+
|
244
|
+
// check the input for validity, returning nil like in array operations
|
245
|
+
if(from > target_len) {
|
246
|
+
return Qnil;
|
247
|
+
}
|
248
|
+
|
249
|
+
// adjust for the from and for_length settings to be within the target len
|
250
|
+
target_ptr += from;
|
251
|
+
target_len -= from;
|
252
|
+
|
253
|
+
size_t start = find_longest_match(source_ptr, source_len, target_ptr, &target_len,
|
254
|
+
sa->starts, sa->ends, sa->suffix_index);
|
255
|
+
|
256
|
+
// create the 2 value return array
|
257
|
+
VALUE result = rb_ary_new();
|
258
|
+
|
259
|
+
rb_ary_push(result, INT2FIX(start));
|
260
|
+
rb_ary_push(result, INT2FIX(target_len));
|
261
|
+
|
262
|
+
return result;
|
263
|
+
}
|
264
|
+
|
265
|
+
|
266
|
+
/*
|
267
|
+
* call-seq:
|
268
|
+
* sarray.longest_nonmatch(target, from_index, min_match) -> [non_match_length, match_start, match_length]
|
269
|
+
*
|
270
|
+
* Mostly the inverse of longest_match, except that it first tries to find a
|
271
|
+
* non-matching region, then a matching region. The target and from_index are
|
272
|
+
* the same as in longest_match. The min_match argument is the smallest matching
|
273
|
+
* region that you'll accept as significant enough to end the non-matching search.
|
274
|
+
* Giving non_match=0 will stop at the first matching region.
|
275
|
+
*
|
276
|
+
* It works by first searching the suffix array for a non-matching region. When it
|
277
|
+
* hits a character that is in the source (according to the suffix array) it tries
|
278
|
+
* to find a matching region. If it can find a matching region that is longer than min_match
|
279
|
+
* then it stops and returns, otherwise it adds this match to the length of the non-matching
|
280
|
+
* region and continues.
|
281
|
+
*
|
282
|
+
* The return value is an Array of [non_match_length, match_start, match_length].
|
283
|
+
*/
|
284
|
+
static VALUE SuffixArray_longest_nonmatch(VALUE self, VALUE target, VALUE from_index, VALUE min_match)
|
285
|
+
{
|
286
|
+
SuffixArray *sa = NULL;
|
287
|
+
Data_Get_Struct(self, SuffixArray, sa);
|
288
|
+
|
289
|
+
VALUE sa_source = SuffixArray_source(self);
|
290
|
+
|
291
|
+
if(sa == NULL || sa->suffix_index == NULL || RSTRING(sa_source)->len == 0) {
|
292
|
+
rb_raise(cSAError, ERR_NOT_INITIALIZED);
|
293
|
+
}
|
294
|
+
|
295
|
+
// get the from and for_length arguments as unsigned ints
|
296
|
+
size_t from = NUM2UINT(from_index);
|
297
|
+
size_t min = NUM2INT(min_match);
|
298
|
+
|
299
|
+
// get better pointers for the source (should already be in String form)
|
300
|
+
unsigned char *source_ptr = RSTRING(sa_source)->ptr;
|
301
|
+
size_t source_len = RSTRING(sa_source)->len;
|
302
|
+
|
303
|
+
// get the target as a string
|
304
|
+
VALUE target_str = StringValue(target);
|
305
|
+
|
306
|
+
// better pointers again, we also need target_len as an in/out parameter
|
307
|
+
unsigned char *target_ptr = RSTRING(target_str)->ptr;
|
308
|
+
size_t target_len = RSTRING(target_str)->len;
|
309
|
+
|
310
|
+
// check the input for validity, returning nil like in array operations
|
311
|
+
if(from > target_len) {
|
312
|
+
return Qnil;
|
313
|
+
}
|
314
|
+
|
315
|
+
|
316
|
+
// adjust for the from and for_length settings to be within the target len
|
317
|
+
unsigned char *scan = target_ptr + from;
|
318
|
+
unsigned char *end = target_ptr + target_len;
|
319
|
+
size_t match_len = 0;
|
320
|
+
size_t match_start = 0;
|
321
|
+
while(scan < end) {
|
322
|
+
if(*scan != source_ptr[sa->suffix_index[sa->starts[*scan]]]) {
|
323
|
+
scan ++;
|
324
|
+
} else {
|
325
|
+
// search remaining stuff for a possible match, which return as a result as well
|
326
|
+
match_len = end - scan;
|
327
|
+
match_start = find_longest_match(source_ptr, source_len, scan, &match_len,
|
328
|
+
sa->starts, sa->ends, sa->suffix_index);
|
329
|
+
|
330
|
+
if(match_len == 0) {
|
331
|
+
// match not found, which really shouldn't happen
|
332
|
+
break;
|
333
|
+
} else if(match_len > min) {
|
334
|
+
// the match is possibly long enough, drop out
|
335
|
+
break;
|
336
|
+
} else {
|
337
|
+
// the number of possibly matching characters is much too small, so we continue by skipping them
|
338
|
+
scan += match_len;
|
339
|
+
// reset the match_len and match_start to 0 to signal that a match hasn't been found yet
|
340
|
+
match_len = match_start = 0;
|
341
|
+
}
|
342
|
+
}
|
343
|
+
}
|
344
|
+
|
345
|
+
VALUE result = rb_ary_new();
|
346
|
+
|
347
|
+
size_t nonmatch_len = (scan - (target_ptr + from));
|
348
|
+
rb_ary_push(result, INT2FIX(nonmatch_len));
|
349
|
+
rb_ary_push(result, INT2FIX(match_start));
|
350
|
+
rb_ary_push(result, INT2FIX(match_len));
|
351
|
+
|
352
|
+
return result;
|
353
|
+
}
|
354
|
+
|
355
|
+
/*
|
356
|
+
* call-seq:
|
357
|
+
* sarray.array -> Array
|
358
|
+
*
|
359
|
+
* Returns a copy of the internal suffix array as an Array of Fixnum objects. This
|
360
|
+
* array is a copy so you're free to mangle it however you wish.
|
361
|
+
*
|
362
|
+
* A suffix array is the sequence of indices into the source that mark each suffix
|
363
|
+
* as if they were sorted.
|
364
|
+
*/
|
365
|
+
static VALUE SuffixArray_array(VALUE self)
|
366
|
+
{
|
367
|
+
SuffixArray *sa = NULL;
|
368
|
+
Data_Get_Struct(self, SuffixArray, sa);
|
369
|
+
|
370
|
+
VALUE sa_source = SuffixArray_source(self);
|
371
|
+
|
372
|
+
if(sa == NULL || sa->suffix_index == NULL || RSTRING(sa_source)->len == 0) {
|
373
|
+
rb_raise(cSAError, ERR_NOT_INITIALIZED);
|
374
|
+
}
|
375
|
+
|
376
|
+
// get the length of the suffix index
|
377
|
+
size_t source_len = RSTRING(sa_source)->len;
|
378
|
+
size_t i = 0;
|
379
|
+
|
380
|
+
VALUE result = rb_ary_new();
|
381
|
+
|
382
|
+
for(i = 0; i < source_len; i++) {
|
383
|
+
rb_ary_push(result, INT2FIX(sa->suffix_index[i]));
|
384
|
+
}
|
385
|
+
|
386
|
+
return result;
|
387
|
+
}
|
388
|
+
|
389
|
+
|
390
|
+
/*
|
391
|
+
* call-seq:
|
392
|
+
* sarray.raw_array -> String
|
393
|
+
*
|
394
|
+
* Returns the "raw" internal suffix array which is an array of C int types used
|
395
|
+
* internally as the suffix array. The purpose of this function is to allow you
|
396
|
+
* to store the suffix_array and then very quickly restore it later without having
|
397
|
+
* to rebuild the suffix array.
|
398
|
+
*
|
399
|
+
* The returned String should be treated as an opaque structure. It is just a
|
400
|
+
* copy of the int[] used internally. This means that it is dependent on your
|
401
|
+
* CPU. If you want something you can use that is cross platform then use the
|
402
|
+
* SuffixArray.array function instead.
|
403
|
+
*/
|
404
|
+
static VALUE SuffixArray_raw_array(VALUE self)
|
405
|
+
{
|
406
|
+
SuffixArray *sa = NULL;
|
407
|
+
Data_Get_Struct(self, SuffixArray, sa);
|
408
|
+
|
409
|
+
VALUE sa_source = SuffixArray_source(self);
|
410
|
+
|
411
|
+
if(sa == NULL || sa->suffix_index == NULL || RSTRING(sa_source)->len == 0) {
|
412
|
+
rb_raise(cSAError, ERR_NOT_INITIALIZED);
|
413
|
+
}
|
414
|
+
|
415
|
+
// build a string that copies this stuff
|
416
|
+
VALUE result = rb_str_new((const char *)sa->suffix_index, RSTRING(sa_source)->len * sizeof(int));
|
417
|
+
|
418
|
+
return result;
|
419
|
+
}
|
420
|
+
|
421
|
+
/*
|
422
|
+
* call-seq:
|
423
|
+
* sarray.start -> Fixnum
|
424
|
+
*
|
425
|
+
* Tells you which index in the suffix array is the longest suffix (also known as the
|
426
|
+
* start of the source string). If you want to get the beginning of the source string
|
427
|
+
* in a round about way you would do this:
|
428
|
+
*
|
429
|
+
* source = "abracadabra"
|
430
|
+
* sa = SuffixArray.new source
|
431
|
+
* first = source[sa.array[sa.start]]]
|
432
|
+
*
|
433
|
+
* Remember that the start is the index into the suffix array where the source starts,
|
434
|
+
* not an index into the source string (that would just be 0).
|
435
|
+
*/
|
436
|
+
static VALUE SuffixArray_suffix_start(VALUE self)
|
437
|
+
{
|
438
|
+
return rb_iv_get(self, "@suffix_start");
|
439
|
+
}
|
440
|
+
|
441
|
+
|
442
|
+
|
443
|
+
|
444
|
+
/*
|
445
|
+
* call-seq:
|
446
|
+
* sarray.all_starts(character) -> Array
|
447
|
+
*
|
448
|
+
* Returns an array containing all the indexes into the source that start
|
449
|
+
* with the given character. This is a very fast operation since the
|
450
|
+
* SuffixArray already knows where each character starts and ends in the
|
451
|
+
* suffix array structure internally. All it does is copy the range of
|
452
|
+
* the suffix array for that region.
|
453
|
+
*/
|
454
|
+
static VALUE SuffixArray_all_starts(VALUE self, VALUE character)
|
455
|
+
{
|
456
|
+
SuffixArray *sa = NULL;
|
457
|
+
Data_Get_Struct(self, SuffixArray, sa);
|
458
|
+
|
459
|
+
VALUE result = rb_ary_new();
|
460
|
+
VALUE char_str = StringValue(character);
|
461
|
+
|
462
|
+
// must be at least one length
|
463
|
+
if(RSTRING(char_str)->len > 0) {
|
464
|
+
size_t ch = (size_t)RSTRING(char_str)->ptr[0];
|
465
|
+
|
466
|
+
// go through all the suffix array indices as indicated by sa->starts and sa->ends
|
467
|
+
size_t start = 0;
|
468
|
+
|
469
|
+
for(start = sa->starts[ch]; start <= sa->ends[ch]; start++) {
|
470
|
+
rb_ary_push(result, INT2FIX(sa->suffix_index[start]));
|
471
|
+
}
|
472
|
+
}
|
473
|
+
|
474
|
+
return result;
|
475
|
+
}
|
476
|
+
|
477
|
+
|
478
|
+
static VALUE cSuffixArray;
|
479
|
+
|
480
|
+
/**
|
481
|
+
* Implements a SuffixArray structure with functions to do useful operations
|
482
|
+
* quickly such as finding matching and non-matching regions, or finding all
|
483
|
+
* the locations of a given character. The suffix array construction algorithm
|
484
|
+
* used was written by Sean Quinlan and Sean Doward and is licensed under the
|
485
|
+
* Plan9 license. Please refer to the sarray.c file for more information.
|
486
|
+
*
|
487
|
+
* The suffix array construction algorithm used is not the fastest available,
|
488
|
+
* but it was the most correctly implemented. There is also a lcp.c file
|
489
|
+
* which implements an O(n) Longest Common Prefix algorithm, but it had
|
490
|
+
* memory errors and buffer overflows which I decided to avoid for now.
|
491
|
+
*
|
492
|
+
* This file is licensed under the GPL license (see LICENSE in the root source
|
493
|
+
* directory).
|
494
|
+
*/
|
495
|
+
void Init_suffix_array()
|
496
|
+
{
|
497
|
+
cSuffixArray = rb_define_class("SuffixArray", rb_cObject);
|
498
|
+
cSAError = rb_define_class("SAError", rb_eStandardError);
|
499
|
+
rb_define_alloc_func(cSuffixArray, SuffixArray_alloc);
|
500
|
+
|
501
|
+
rb_define_method(cSuffixArray, "initialize", SuffixArray_initialize, -1);
|
502
|
+
rb_define_method(cSuffixArray, "longest_match", SuffixArray_longest_match, 2);
|
503
|
+
rb_define_method(cSuffixArray, "longest_nonmatch", SuffixArray_longest_nonmatch, 3);
|
504
|
+
rb_define_method(cSuffixArray, "array", SuffixArray_array, 0);
|
505
|
+
rb_define_method(cSuffixArray, "raw_array", SuffixArray_raw_array, 0);
|
506
|
+
rb_define_method(cSuffixArray, "suffix_start", SuffixArray_suffix_start, 0);
|
507
|
+
rb_define_method(cSuffixArray, "source", SuffixArray_source, 0);
|
508
|
+
rb_define_method(cSuffixArray, "all_starts", SuffixArray_all_starts, 1);
|
509
|
+
|
510
|
+
}
|