gdiff 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. data/CHANGELOG +7 -0
  2. data/COPYING.suffix_array +278 -0
  3. data/LICENSE.suffix_array +17 -0
  4. data/README +40 -0
  5. data/README.suffix_array +274 -0
  6. data/bin/gdiff +25 -0
  7. data/bin/gpatch +25 -0
  8. data/doc/classes/Diff.html +117 -0
  9. data/doc/classes/Diff/GDiff.html +120 -0
  10. data/doc/classes/Diff/GDiff/EGdiffError.html +111 -0
  11. data/doc/classes/Diff/GDiff/ENoGdiffStream.html +113 -0
  12. data/doc/classes/Diff/GDiff/EPrematureEndOfStream.html +113 -0
  13. data/doc/classes/Diff/GDiff/Operations.html +156 -0
  14. data/doc/classes/Diff/GDiff/Operations/Copy.html +246 -0
  15. data/doc/classes/Diff/GDiff/Operations/Copy.src/M000014.html +19 -0
  16. data/doc/classes/Diff/GDiff/Operations/Copy.src/M000015.html +39 -0
  17. data/doc/classes/Diff/GDiff/Operations/Copy.src/M000016.html +25 -0
  18. data/doc/classes/Diff/GDiff/Operations/Copy.src/M000017.html +18 -0
  19. data/doc/classes/Diff/GDiff/Operations/Data.html +246 -0
  20. data/doc/classes/Diff/GDiff/Operations/Data.src/M000009.html +18 -0
  21. data/doc/classes/Diff/GDiff/Operations/Data.src/M000010.html +18 -0
  22. data/doc/classes/Diff/GDiff/Operations/Data.src/M000011.html +35 -0
  23. data/doc/classes/Diff/GDiff/Operations/Data.src/M000012.html +29 -0
  24. data/doc/classes/Diff/GDiff/Operations/Data.src/M000013.html +19 -0
  25. data/doc/classes/SAError.html +111 -0
  26. data/doc/classes/SuffixArray.html +342 -0
  27. data/doc/classes/SuffixArray.src/M000001.html +97 -0
  28. data/doc/classes/SuffixArray.src/M000002.html +73 -0
  29. data/doc/classes/SuffixArray.src/M000003.html +102 -0
  30. data/doc/classes/SuffixArray.src/M000004.html +47 -0
  31. data/doc/classes/SuffixArray.src/M000005.html +44 -0
  32. data/doc/classes/SuffixArray.src/M000006.html +33 -0
  33. data/doc/classes/SuffixArray.src/M000007.html +24 -0
  34. data/doc/classes/SuffixArray.src/M000008.html +46 -0
  35. data/doc/created.rid +1 -0
  36. data/doc/files/ext/gdiff/suffix_array/extconf_rb.html +108 -0
  37. data/doc/files/ext/gdiff/suffix_array/lcp_c.html +101 -0
  38. data/doc/files/ext/gdiff/suffix_array/sarray_c.html +101 -0
  39. data/doc/files/ext/gdiff/suffix_array/suffix_array_c.html +101 -0
  40. data/doc/files/lib/gdiff_rb.html +108 -0
  41. data/doc/fr_class_index.html +36 -0
  42. data/doc/fr_file_index.html +31 -0
  43. data/doc/fr_method_index.html +43 -0
  44. data/doc/index.html +24 -0
  45. data/doc/rdoc-style.css +208 -0
  46. data/ext/gdiff/COPYING +278 -0
  47. data/ext/gdiff/LICENSE +17 -0
  48. data/ext/gdiff/README +274 -0
  49. data/ext/gdiff/extconf.rb +3 -0
  50. data/ext/gdiff/lcp.c +97 -0
  51. data/ext/gdiff/sarray.3 +145 -0
  52. data/ext/gdiff/sarray.c +372 -0
  53. data/ext/gdiff/sarray.h +13 -0
  54. data/ext/gdiff/suffix_array.c +510 -0
  55. data/lib/gdiff.rb +255 -0
  56. data/setup.rb +1551 -0
  57. data/test/tc_gdiff.rb +66 -0
  58. metadata +119 -0
@@ -0,0 +1,372 @@
1
+ /*
2
+ Hybrid suffix-array builder, written by Sean Quinlan and Sean Dorward,
3
+ distributed under the Plan 9 license, which reads in part
4
+
5
+ 3.3 With respect to Your distribution of Licensed Software (or any
6
+ portion thereof), You must include the following information in a
7
+ conspicuous location governing such distribution (e.g., a separate
8
+ file) and on all copies of any Source Code version of Licensed
9
+ Software You distribute:
10
+
11
+ "The contents herein includes software initially developed by
12
+ Lucent Technologies Inc. and others, and is subject to the terms
13
+ of the Lucent Technologies Inc. Plan 9 Open Source License
14
+ Agreement. A copy of the Plan 9 Open Source License Agreement is
15
+ available at: http://plan9.bell-labs.com/plan9dist/download.html
16
+ or by contacting Lucent Technologies at http://www.lucent.com.
17
+ All software distributed under such Agreement is distributed on,
18
+ obligations and limitations under such Agreement. Portions of
19
+ the software developed by Lucent Technologies Inc. and others are
20
+ Copyright (c) 2002. All rights reserved.
21
+ Contributor(s):___________________________"
22
+ */
23
+ /*
24
+ int sarray(int a[], int n)
25
+ Purpose
26
+ Return in a[] a suffix array for the original
27
+ contents of a[]. (The original values in a[]
28
+ are typically serial numbers of distinct tokens
29
+ in some list.)
30
+
31
+ Precondition
32
+ Array a[] holds n values, with n>=1. Exactly k
33
+ distinct values, in the range 0..k-1, are present.
34
+ Value 0, an endmark, appears exactly once, at a[n-1].
35
+
36
+ Postcondition
37
+ Array a[] is a copy of the internal array p[]
38
+ that records the sorting permutation: if i<j
39
+ then the original suffix a[p[i]..n-1] is
40
+ lexicographically less than a[p[j]..n-1].
41
+
42
+ Return value
43
+ -1 on error.
44
+ Otherwise index i such that a[i]==0, i.e. the
45
+ index of the whole-string suffix, used in
46
+ Burrows-Wheeler data compression.
47
+ */
48
+
49
+ #include <stdlib.h>
50
+ #include <string.h>
51
+ #include "sarray.h"
52
+
53
+ #define pred(i, h) ((t=(i)-(h))<0? t+n: t) /* i-h, with n added once when the difference is negative; uses the caller's int t and n */
54
+ #define succ(i, h) ((t=(i)+(h))>=n? t-n: t) /* i+h, with n subtracted once when the sum reaches n; uses the caller's int t and n */
55
+
56
+ enum
57
+ {
58
+ BUCK = ~(~0u>>1), /* high bit; OR-ed into p[] entries to flag the last member of a bucket and into a[] entries as a temporary key marker */
59
+ MAXI = ~0u>>1, /* biggest int (every bit except the high bit) */
60
+ };
61
+
62
+ static void qsort2(int*, int*, int n); /* sorts a slice of the permutation by the keys in the second array */
63
+ static int ssortit(int a[], int p[], int n, int h, int *pe, int nbuck); /* refinement loop shared by sarray() and bsarray() */
64
+
65
+ int
66
+ sarray(int a[], int n)
67
+ {
68
+ int i, l;
69
+ int c, cc, ncc, lab, cum, nbuck;
70
+ int k;
71
+ int *p = 0;
72
+ int result = -1;
73
+ int *al;
74
+ int *pl;
75
+
76
+ for(k=0,i=0; i<n; i++)
77
+ if(a[i] > k)
78
+ k = a[i]; /* max element */
79
+ k++;
80
+ if(k>n)
81
+ goto out;
82
+
83
+ nbuck = 0;
84
+ p = malloc(n*sizeof(int));
85
+ if(p == 0)
86
+ goto out;
87
+
88
+
89
+ pl = p + n - k;
90
+ al = a;
91
+ memset(pl, -1, k*sizeof(int));
92
+
93
+ for(i=0; i<n; i++) { /* (1) link */
94
+ l = a[i];
95
+ al[i] = pl[l];
96
+ pl[l] = i;
97
+ }
98
+
99
+ for(i=0; i<k; i++) /* check input - no holes */
100
+ if(pl[i]<0)
101
+ goto out;
102
+
103
+
104
+ lab = 0; /* (2) create p and label a */
105
+ cum = 0;
106
+ i = 0;
107
+ for(c = 0; c < k; c++){
108
+ for(cc = pl[c]; cc != -1; cc = ncc){
109
+ ncc = al[cc];
110
+ al[cc] = lab;
111
+ cum++;
112
+ p[i++] = cc;
113
+ }
114
+ if(lab + 1 == cum) {
115
+ i--;
116
+ } else {
117
+ p[i-1] |= BUCK;
118
+ nbuck++;
119
+ }
120
+ lab = cum;
121
+ }
122
+
123
+ result = ssortit(a, p, n, 1, p+i, nbuck);
124
+ memcpy(a, p, n*sizeof(int));
125
+
126
+ out:
127
+ free(p);
128
+ return result;
129
+ }
130
+
131
+ /* bsarray(uchar buf[], int p[], int n)
132
+ * The input, buf, is an arbitrary byte array of length n.
133
+ * The input is copied to temporary storage, relabeling
134
+ * pairs of input characters and appending a unique end marker
135
+ * having a value that is effectively less than any input byte.
136
+ * The suffix array of this extended input is computed and
137
+ * stored in p, which must have length at least n+1.
138
+ *
139
+ * Returns the index of the identity permutation (regarding
140
+ * the suffix array as a list of circular shifts),
141
+ * or -1 if there was an error.
142
+ */
143
+ int
144
+ bsarray(const uchar buf[], int p[], int n)
145
+ {
146
+ int *a, buckets[256*256];
147
+ int i, last, cum, c, cc, ncc, lab, id, nbuck;
148
+
149
+ a = malloc((n+1)*sizeof(int));
150
+ if(a == 0)
151
+ return -1;
152
+
153
+
154
+ memset(buckets, -1, sizeof(buckets));
155
+ c = buf[n-1] << 8;
156
+ last = c;
157
+ for(i = n - 2; i >= 0; i--){
158
+ c = (buf[i] << 8) | (c >> 8);
159
+ a[i] = buckets[c];
160
+ buckets[c] = i;
161
+ }
162
+
163
+ /*
164
+ * end of string comes before anything else
165
+ */
166
+ a[n] = 0;
167
+
168
+ lab = 1;
169
+ cum = 1;
170
+ i = 0;
171
+ nbuck = 0;
172
+ for(c = 0; c < 256*256; c++) {
173
+ /*
174
+ * last character is followed by unique end of string
175
+ */
176
+ if(c == last) {
177
+ a[n-1] = lab;
178
+ cum++;
179
+ lab++;
180
+ }
181
+
182
+ for(cc = buckets[c]; cc != -1; cc = ncc) {
183
+ ncc = a[cc];
184
+ a[cc] = lab;
185
+ cum++;
186
+ p[i++] = cc;
187
+ }
188
+ if(lab == cum)
189
+ continue;
190
+ if(lab + 1 == cum)
191
+ i--;
192
+ else {
193
+ p[i - 1] |= BUCK;
194
+ nbuck++;
195
+ }
196
+ lab = cum;
197
+ }
198
+
199
+ id = ssortit(a, p, n+1, 2, p+i, nbuck);
200
+ free(a);
201
+ return id;
202
+ }
203
+
204
/*
 * ssortit: refinement loop shared by sarray() and bsarray().
 *
 * a[]  - one label per position
 * p[]  - p[0..pe) holds the buckets that still need sorting; the last
 *        member of each bucket has the BUCK bit set
 * n    - number of positions
 * h    - initial comparison distance, doubled on every pass
 * pe   - one past the last pending entry of p[]
 * nbuck- reset and counted here but not otherwise read in this function
 *
 * Each pass keys every pending bucket by the label found h positions
 * ahead (wrapping at n), sorts the bucket with qsort2(), relabels it and
 * repacks the groups that still share a key at the front of p[].  When
 * nothing is pending (or h reaches n) p[] is rebuilt as the inverse of
 * a[] and the label of position 0 is returned.
 */
+ static int
205
+ ssortit(int a[], int p[], int n, int h, int *pe, int nbuck)
206
+ {
207
+ int *s, *ss, *packing, *sorting;
208
+ int v, sv, vv, packed, lab, t, i; /* t is scratch storage for the succ() macro */
209
+
210
+ for(; h < n && p < pe; h=2*h) {
211
+ packing = p; /* write cursor for the buckets that survive this pass */
212
+ nbuck = 0;
213
+
214
+ for(sorting = p; sorting < pe; sorting = s){
215
+ /*
216
+ * find length of stuff to sort
217
+ */
218
+ lab = a[*sorting]; /* label currently shared by this bucket */
219
+ for(s = sorting; ; s++) {
220
+ sv = *s;
221
+ v = a[succ(sv & ~BUCK, h)]; /* label h positions ahead, wrapping at n */
222
+ if(v & BUCK) /* that entry already carries a temporary key from this bucket's scan */
223
+ v = lab;
224
+ a[sv & ~BUCK] = v | BUCK; /* temporary sort key, flagged with BUCK */
225
+ if(sv & BUCK) /* last member of the bucket */
226
+ break;
227
+ }
228
+ *s++ &= ~BUCK; /* clear the end flag; s now points one past the bucket */
229
+ nbuck++;
230
+
231
+ qsort2(sorting, a, s - sorting);
232
+
233
+ v = a[*sorting]; /* key of the current run of equal keys */
234
+ a[*sorting] = lab;
235
+ packed = 0; /* number of entries already packed for the current run */
236
+ for(ss = sorting + 1; ss < s; ss++) {
237
+ sv = *ss;
238
+ vv = a[sv];
239
+ if(vv == v) {
240
+ *packing++ = ss[-1]; /* same key as predecessor: keep predecessor pending */
241
+ packed++;
242
+ } else {
243
+ if(packed) {
244
+ *packing++ = ss[-1] | BUCK; /* close the run that just ended */
245
+ }
246
+ lab += packed + 1; /* advance the label by the size of the finished run */
247
+ packed = 0;
248
+ v = vv;
249
+ }
250
+ a[sv] = lab;
251
+ }
252
+ if(packed) {
253
+ *packing++ = ss[-1] | BUCK; /* close a run that reaches the end of the bucket */
254
+ }
255
+ }
256
+ pe = packing; /* only the repacked runs remain pending */
257
+ }
258
+
259
+ /*
260
+ * reconstruct the permutation matrix
261
+ * return index of the entire string
262
+ */
263
+ v = a[0];
264
+ for(i = 0; i < n; i++)
265
+ p[a[i]] = i; /* invert the labelling: p[label of i] = i */
266
+
267
+ return v;
268
+ }
269
+
270
+ /*
271
+ * qsort from Bentley and McIlroy, Software--Practice and Experience
272
+ 23 (1993) 1249-1265, specialized for sorting permutations based on
273
+ successors
274
+ */
275
/*
 * Exchange the first n ints of a with the first n ints of b,
 * element by element from the front.  Does nothing when n <= 0.
 */
static void
vecswap2(int *a, int *b, int n)
{
	int k;

	for (k = 0; k < n; k++) {
		int held = a[k];

		a[k] = b[k];
		b[k] = held;
	}
}
284
+
285
+ #define swap2(a, b) { t = *(a); *(a) = *(b); *(b) = t; }
286
+
287
/*
 * Pick one of a, b, c whose key asucc[*x] is a median of the three
 * keys.  Ties are resolved as follows: if a and b share a key, a is
 * returned (c's key is not examined); otherwise if c shares a key
 * with either, c is returned.
 */
static int*
med3(int *a, int *b, int *c, int *asucc)
{
	int ka = asucc[*a];
	int kb = asucc[*b];
	int kc;

	if (ka == kb)
		return a;

	kc = asucc[*c];
	if (kc == ka || kc == kb)
		return c;

	if (ka < kb) {
		if (kb < kc)
			return b;
		return ka < kc ? c : a;
	}
	if (kb > kc)
		return b;
	return ka < kc ? a : c;
}
300
+
301
/*
 * Insertion sort of the n permutation entries in a[], ordered by the
 * key asucc[a[i]].  Entries with equal keys keep their relative order.
 */
static void
inssort(int *a, int *asucc, int n)
{
	int i;

	for (i = 1; i < n; i++) {
		int *slot = a + i;

		/* sink the new entry until its left neighbour's key is not larger */
		while (slot > a && asucc[slot[-1]] > asucc[*slot]) {
			int held = *slot;

			*slot = slot[-1];
			slot[-1] = held;
			slot--;
		}
	}
}
313
+
314
/*
 * qsort2: sort the n permutation entries a[0..n-1] by the key
 * asucc[a[i]].
 *
 * Fewer than 15 entries are handed to inssort().  Otherwise a pivot is
 * chosen with med3() (three candidates, each itself a median of three
 * when n > 30), the slice is split into "less", "equal" and "greater"
 * regions, and the two outer regions are sorted recursively.
 */
+ static void
315
+ qsort2(int *a, int *asucc, int n)
316
+ {
317
+ int d, r, partval;
318
+ int *pa, *pb, *pc, *pd, *pl, *pm, *pn, t; /* t is scratch storage for swap2() */
319
+
320
+ if (n < 15) {
321
+ inssort(a, asucc, n);
322
+ return;
323
+ }
324
+ pl = a; /* first, middle and last entries are the pivot candidates */
325
+ pm = a + (n >> 1);
326
+ pn = a + (n-1);
327
+ if (n > 30) { /* On big arrays, pseudomedian of 9 */
328
+ d = (n >> 3);
329
+ pl = med3(pl, pl+d, pl+2*d, asucc);
330
+ pm = med3(pm-d, pm, pm+d, asucc);
331
+ pn = med3(pn-2*d, pn-d, pn, asucc);
332
+ }
333
+ pm = med3(pl, pm, pn, asucc);
334
+ swap2(a, pm); /* park the pivot entry in a[0] */
335
+ partval = asucc[*a]; /* pivot key */
336
+ pa = pb = a + 1; /* [a, pa) collects keys equal to the pivot; pb scans upward */
337
+ pc = pd = a + n-1; /* (pd, a+n) collects keys equal to the pivot; pc scans downward */
338
+ for (;;) {
339
+ while (pb <= pc && (r = asucc[*pb]-partval) <= 0) {
340
+ if (r == 0) {
341
+ swap2(pa, pb); /* move an equal key to the left-hand store */
342
+ pa++;
343
+ }
344
+ pb++;
345
+ }
346
+ while (pb <= pc && (r = asucc[*pc]-partval) >= 0) {
347
+ if (r == 0) {
348
+ swap2(pc, pd); /* move an equal key to the right-hand store */
349
+ pd--;
350
+ }
351
+ pc--;
352
+ }
353
+ if (pb > pc) /* scans have crossed: partitioning is complete */
354
+ break;
355
+ swap2(pb, pc); /* *pb is greater and *pc is smaller than the pivot: exchange */
356
+ pb++;
357
+ pc--;
358
+ }
359
+ pn = a + n;
360
+ r = pa-a; /* r = min(size of left equal store, size of "less" region) */
361
+ if(pb-pa < r)
362
+ r = pb-pa;
363
+ vecswap2(a, pb-r, r); /* bring the left equal store next to the middle */
364
+ r = pn-pd-1; /* r = min(size of right equal store, size of "greater" region) */
365
+ if(pd-pc < r)
366
+ r = pd-pc;
367
+ vecswap2(pb, pn-r, r); /* bring the right equal store next to the middle */
368
+ if ((r = pb-pa) > 1) /* "less" region now occupies a[0..r) */
369
+ qsort2(a, asucc, r);
370
+ if ((r = pd-pc) > 1) /* "greater" region now occupies the last r entries */
371
+ qsort2(a + n-r, asucc, r);
372
+ }
@@ -0,0 +1,13 @@
1
+ #ifndef sarray_h /* include guard */
2
+ #define sarray_h
3
+
4
+
5
+ typedef unsigned char uchar; /* raw byte type used for bsarray() input */
6
+
7
+ int sarray(int *a, int n); /* replaces a[0..n-1] with its suffix array; -1 on error */
8
+ int bsarray(const uchar *b, int *a, int n); /* suffix array of n bytes plus an end marker, written to a[0..n]; -1 on error */
9
+ int *lcp(const int *a, const char *s, int n); /* defined outside this view (presumably lcp.c) - see that file for the contract */
10
+ int lcpa(const int *a, const char *s, int *b, int n); /* defined outside this view (presumably lcp.c) - see that file for the contract */
11
+
12
+ #endif
13
+
@@ -0,0 +1,510 @@
1
+ #include <ruby.h>
2
+ #include <assert.h>
3
+ #include <sarray.h>
4
+
5
+ typedef struct SuffixArray { /* native state wrapped by each Ruby SuffixArray object */
6
+ int *suffix_index; /* malloc'd in SuffixArray_initialize, released in SuffixArray_free; NULL until initialized */
7
+ unsigned int ends[256]; /* ends[c]: last slot (inclusive) of suffix_index recorded for leading byte c */
8
+ unsigned int starts[256]; /* starts[c]: first slot of suffix_index recorded for leading byte c */
9
+ } SuffixArray;
10
+
11
+
12
+ #define ERR_NO_ZERO_LENGTH_INPUT "Cannot create a suffix array from a 0 length input source." /* raised by initialize for an empty source */
13
+ #define ERR_NOT_INITIALIZED "Initialization failed, you cannot use this object." /* raised by the query methods when suffix_index is missing */
14
+ #define ERR_START_IF_ARRAY "You must provide a start argument if you give an array argument." /* raised by initialize: raw array given without start */
15
+ #define ERR_MISMATCH_LENGTH "The raw array length is different from the source length" /* raised by initialize: raw array size does not match the source */
16
+ static VALUE cSAError; /* the SAError exception class, assigned in Init_suffix_array */
17
+
18
+
19
/*
 * Compare target against source from their first bytes and report how
 * far they agree.
 *
 *   source, src_len - bytes to compare against and how many are valid
 *   target          - bytes being looked for
 *   tgt_len         - in:  number of target bytes to compare
 *                     out: number of leading bytes that matched
 *
 * Returns 0 when all *tgt_len target bytes matched, a positive value
 * when target sorts after source and a negative value when it sorts
 * before.  If source runs out before target does, target is treated
 * as the greater string and 1 is returned; the byte at source[src_len]
 * is never read.
 *
 * Declared static: a plain "inline" definition provides no external
 * definition in C99/C11, so calls that are not inlined would fail to
 * link.
 */
static inline int scan_string(const unsigned char *source, size_t src_len,
        const unsigned char *target, size_t *tgt_len)
{
    size_t wanted = *tgt_len;
    size_t limit = wanted < src_len ? wanted : src_len;
    size_t length = 0;

    while(length < limit && target[length] == source[length]) {
        length++;
    }

    *tgt_len = length;  // out parameter for the length that was found

    if(length == wanted) {
        // the whole target matched, so good enough
        return 0;
    }

    if(length == src_len) {
        // source is a proper prefix of target, so target is the greater one
        return 1;
    }

    // first differing byte decides the order
    return (int)target[length] - (int)source[length];
}
44
+
45
+
46
/*
 * Binary-search the suffix array for the suffix of source that shares
 * the longest prefix with target.
 *
 *   source, src_len - the indexed text
 *   target          - bytes to look for
 *   tgt_len         - in:  number of target bytes available
 *                     out: length of the best match seen
 *   starts, ends    - per-byte slot ranges of sa; the search is limited
 *                     to the range recorded for target[0]
 *   sa              - the suffix array
 *
 * Returns the source offset of the best match seen (sa[0] when no
 * probe matched anything).
 */
size_t find_longest_match(unsigned char *source, size_t src_len,
        unsigned char *target, size_t *tgt_len,
        unsigned int starts[], unsigned int ends[], unsigned int sa[])
{
    size_t lo = starts[*target];
    size_t hi = ends[*target] + 1;
    size_t mid = (lo + hi) / 2;
    size_t best_len = 0;
    size_t best_slot = 0;

    for(;;) {
        // stop when the window is empty or out of range, or the full target has been matched;
        // "hi > src_len" also catches hi wrapping around after "mid - 1" with mid == 0
        if(lo > hi || hi > src_len || mid >= src_len || best_len == *tgt_len) {
            break;
        }

        size_t suffix = sa[mid];
        size_t matched = *tgt_len;
        int order = scan_string(source + suffix, src_len - suffix, target, &matched);

        // remember the longest agreement seen so far
        if(matched > best_len) {
            best_len = matched;
            best_slot = mid;
        }

        if(order == 0) {
            // complete match, nothing better is possible
            break;
        }

        if(order < 0) {
            // target sorts before this suffix: continue in the lower half
            hi = mid - 1;
        } else {
            // target sorts after this suffix: continue in the upper half
            lo = mid + 1;
        }

        mid = (lo + hi) / 2;
    }

    // report the best candidate, which may be a partial match
    size_t where = sa[best_slot];
    *tgt_len = best_len;
    return where;
}
90
+
91
+
92
+ /*
93
+ * call-seq:
94
+ * sarray.source -> String
95
+ *
96
+ * Returns the source that this suffix array was constructed with.
97
+ */
98
+ static VALUE SuffixArray_source(VALUE self)
99
+ {
100
+ return rb_iv_get(self, "@source");
101
+ }
102
+
103
+
104
+
105
+ static void SuffixArray_free(void *p) {
106
+ SuffixArray *sa = (SuffixArray *)p;
107
+ if(sa->suffix_index) free(sa->suffix_index);
108
+ if(sa) free(sa);
109
+ }
110
+
111
+ static VALUE SuffixArray_alloc(VALUE klass)
112
+ {
113
+ SuffixArray *sa = NULL;
114
+
115
+ // setup our internal memory for the suffix array structure
116
+ return Data_Make_Struct(klass, SuffixArray, 0, SuffixArray_free, sa);
117
+ }
118
+
119
+
120
+ /*
121
+ * call-seq:
122
+ * SuffixArray.new(source, [raw_array], [start]) -> SuffixArray
123
+ *
124
+ * Given a string (anything like a string really) this will generate a
125
+ * suffix array for the string so that you can work with it. The
126
+ * source cannot be an empty string since this is a useless operation.
127
+ *
128
+ * Two optional parameters allow you to restore a suffix array without
129
+ * running the construction process again. You basically give it the
130
+ * String from SuffixArray.raw_array and the start from SuffixArray.suffix_start
131
+ * and it will skip most calculations. <b>This feature is really experimental
132
+ * and is CPU dependent since the integers in the raw_array are native.</b>
133
+ */
134
+ static VALUE SuffixArray_initialize(int argc, VALUE *argv, VALUE self)
135
+ {
136
+ SuffixArray *sa = NULL;
137
+ size_t i = 0;
138
+ Data_Get_Struct(self, SuffixArray, sa);
139
+ assert(sa != NULL);
140
+ VALUE source;
141
+ VALUE array;
142
+ VALUE start;
143
+
144
+ // sort out the arguments and such
145
+ rb_scan_args(argc, argv, "12", &source, &array, &start);
146
+
147
+ // get the string value of the source given to us, keep it around for later
148
+ VALUE sa_source_str = StringValue(source);
149
+ rb_iv_set(self, "@source", sa_source_str);
150
+
151
+ // setup temporary variables for the source and length pointers
152
+ unsigned char *sa_source = RSTRING(sa_source_str)->ptr;
153
+ size_t sa_source_len = RSTRING(sa_source_str)->len;
154
+
155
+ // error check the whole thing
156
+ if(sa_source_len == 0) {
157
+ // we can't have this, so return a nil
158
+ rb_raise(cSAError, ERR_NO_ZERO_LENGTH_INPUT);
159
+ }
160
+
161
+ if(!NIL_P(array) && NIL_P(start)) {
162
+ rb_raise(cSAError, ERR_START_IF_ARRAY);
163
+ } else if (!NIL_P(array) && !NIL_P(start)) {
164
+ // looks like both parameters were given so check out the lengths
165
+ if(RSTRING(array)->len / sizeof(int) != sa_source_len) {
166
+ rb_raise(cSAError, ERR_MISMATCH_LENGTH);
167
+ }
168
+ }
169
+
170
+ // allocate memory for the index integers
171
+ sa->suffix_index = malloc(sizeof(int) * (sa_source_len + 1));
172
+
173
+ if(NIL_P(array)) {
174
+ // create the suffix array from the source
175
+ int st = bsarray(sa_source, sa->suffix_index, sa_source_len-1);
176
+
177
+ // set the suffix_start in our object
178
+ rb_iv_set(self, "@suffix_start", INT2NUM(st));
179
+ } else {
180
+ // convert the given array and start to the internal structures needed
181
+ // the return value is ignored since I can't seem find any consistent definition for
182
+ // it's value that will tell me if this failed.
183
+ memcpy(sa->suffix_index, RSTRING(array)->ptr, sa_source_len * sizeof(int));
184
+ rb_iv_set(self, "@suffix_start", start);
185
+ }
186
+
187
+ unsigned char c = sa_source[sa->suffix_index[0]]; // start off with the first char in the sarray list
188
+ sa->starts[c] = 0;
189
+ for(i = 0; i < sa_source_len; i++) {
190
+ // skip characters until we see a new one
191
+ if(sa_source[sa->suffix_index[i]] != c) {
192
+ sa->ends[c] = i-1; // it's -1 since this is a new character, so the end was actually behind this point
193
+ c = sa_source[sa->suffix_index[i]];
194
+ sa->starts[c] = i;
195
+ }
196
+ }
197
+ // set the last valid character to get the tail of the sa, the loop will miss it
198
+ c = sa_source[sa->suffix_index[sa_source_len-1]];
199
+ sa->ends[c] = sa_source_len-1;
200
+
201
+ return INT2FIX(sa_source_len);
202
+ }
203
+
204
+
205
+ /*
206
+ * call-seq:
207
+ * sarray.longest_match(target, from_index) -> [start, length]
208
+ *
209
+ * Takes a target string and an index inside that string, and then tries
210
+ * to find the longest match from that point in the source string for this
211
+ * SuffixArray object.
212
+ *
213
+ * It returns an array of [start, length] of where in the source a length
214
+ * string from the target would match.
215
+ *
216
+ * Refer to the unit test for examples of usage.
217
+ */
218
+ static VALUE SuffixArray_longest_match(VALUE self, VALUE target, VALUE from_index)
219
+ {
220
+ SuffixArray *sa = NULL;
221
+ Data_Get_Struct(self, SuffixArray, sa);
222
+
223
+ VALUE sa_source = SuffixArray_source(self);
224
+
225
+ if(sa == NULL || sa->suffix_index == NULL || RSTRING(sa_source)->len == 0) {
226
+ rb_raise(cSAError, ERR_NOT_INITIALIZED);
227
+ }
228
+
229
+ // get the from and for_length arguments as unsigned ints
230
+ size_t from = NUM2UINT(from_index);
231
+
232
+
233
+ // get better pointers for the source (should already be in String form)
234
+ unsigned char *source_ptr = RSTRING(sa_source)->ptr;
235
+ size_t source_len = RSTRING(sa_source)->len;
236
+
237
+ // get the target as a string
238
+ VALUE target_str = StringValue(target);
239
+
240
+ // better pointers again, we also need target_len as an in/out parameter
241
+ unsigned char *target_ptr = RSTRING(target_str)->ptr;
242
+ size_t target_len = RSTRING(target_str)->len;
243
+
244
+ // check the input for validity, returning nil like in array operations
245
+ if(from > target_len) {
246
+ return Qnil;
247
+ }
248
+
249
+ // adjust for the from and for_length settings to be within the target len
250
+ target_ptr += from;
251
+ target_len -= from;
252
+
253
+ size_t start = find_longest_match(source_ptr, source_len, target_ptr, &target_len,
254
+ sa->starts, sa->ends, sa->suffix_index);
255
+
256
+ // create the 2 value return array
257
+ VALUE result = rb_ary_new();
258
+
259
+ rb_ary_push(result, INT2FIX(start));
260
+ rb_ary_push(result, INT2FIX(target_len));
261
+
262
+ return result;
263
+ }
264
+
265
+
266
+ /*
267
+ * call-seq:
268
+ * sarray.longest_nonmatch(target, from_index, min_match) -> [non_match_length, match_start, match_length]
269
+ *
270
+ * Mostly the inverse of longest_match, except that it first tries to find a
271
+ * non-matching region, then a matching region. The target and from_index are
272
+ * the same as in longest_match. The min_match argument is the smallest matching
273
+ * region that you'll accept as significant enough to end the non-matching search.
274
+ * Giving min_match=0 will stop at the first matching region.
275
+ *
276
+ * It works by first searching the suffix array for a non-matching region. When it
277
+ * hits a character that is in the source (according to the suffix array) it tries
278
+ * to find a matching region. If it can find a matching region that is longer than min_match
279
+ * then it stops and returns, otherwise it adds this match to the length of the non-matching
280
+ * region and continues.
281
+ *
282
+ * The return value is an Array of [non_match_length, match_start, match_length].
283
+ */
284
+ static VALUE SuffixArray_longest_nonmatch(VALUE self, VALUE target, VALUE from_index, VALUE min_match)
285
+ {
286
+ SuffixArray *sa = NULL;
287
+ Data_Get_Struct(self, SuffixArray, sa);
288
+
289
+ VALUE sa_source = SuffixArray_source(self);
290
+
291
+ if(sa == NULL || sa->suffix_index == NULL || RSTRING(sa_source)->len == 0) {
292
+ rb_raise(cSAError, ERR_NOT_INITIALIZED);
293
+ }
294
+
295
+ // get the from and for_length arguments as unsigned ints
296
+ size_t from = NUM2UINT(from_index);
297
+ size_t min = NUM2INT(min_match);
298
+
299
+ // get better pointers for the source (should already be in String form)
300
+ unsigned char *source_ptr = RSTRING(sa_source)->ptr;
301
+ size_t source_len = RSTRING(sa_source)->len;
302
+
303
+ // get the target as a string
304
+ VALUE target_str = StringValue(target);
305
+
306
+ // better pointers again, we also need target_len as an in/out parameter
307
+ unsigned char *target_ptr = RSTRING(target_str)->ptr;
308
+ size_t target_len = RSTRING(target_str)->len;
309
+
310
+ // check the input for validity, returning nil like in array operations
311
+ if(from > target_len) {
312
+ return Qnil;
313
+ }
314
+
315
+
316
+ // adjust for the from and for_length settings to be within the target len
317
+ unsigned char *scan = target_ptr + from;
318
+ unsigned char *end = target_ptr + target_len;
319
+ size_t match_len = 0;
320
+ size_t match_start = 0;
321
+ while(scan < end) {
322
+ if(*scan != source_ptr[sa->suffix_index[sa->starts[*scan]]]) {
323
+ scan ++;
324
+ } else {
325
+ // search remaining stuff for a possible match, which return as a result as well
326
+ match_len = end - scan;
327
+ match_start = find_longest_match(source_ptr, source_len, scan, &match_len,
328
+ sa->starts, sa->ends, sa->suffix_index);
329
+
330
+ if(match_len == 0) {
331
+ // match not found, which really shouldn't happen
332
+ break;
333
+ } else if(match_len > min) {
334
+ // the match is possibly long enough, drop out
335
+ break;
336
+ } else {
337
+ // the number of possibly matching characters is much too small, so we continue by skipping them
338
+ scan += match_len;
339
+ // reset the match_len and match_start to 0 to signal that a match hasn't been found yet
340
+ match_len = match_start = 0;
341
+ }
342
+ }
343
+ }
344
+
345
+ VALUE result = rb_ary_new();
346
+
347
+ size_t nonmatch_len = (scan - (target_ptr + from));
348
+ rb_ary_push(result, INT2FIX(nonmatch_len));
349
+ rb_ary_push(result, INT2FIX(match_start));
350
+ rb_ary_push(result, INT2FIX(match_len));
351
+
352
+ return result;
353
+ }
354
+
355
+ /*
356
+ * call-seq:
357
+ * sarray.array -> Array
358
+ *
359
+ * Returns a copy of the internal suffix array as an Array of Fixnum objects. This
360
+ * array is a copy so you're free to mangle it however you wish.
361
+ *
362
+ * A suffix array is the sequence of indices into the source that mark each suffix
363
+ * as if they were sorted.
364
+ */
365
+ static VALUE SuffixArray_array(VALUE self)
366
+ {
367
+ SuffixArray *sa = NULL;
368
+ Data_Get_Struct(self, SuffixArray, sa);
369
+
370
+ VALUE sa_source = SuffixArray_source(self);
371
+
372
+ if(sa == NULL || sa->suffix_index == NULL || RSTRING(sa_source)->len == 0) {
373
+ rb_raise(cSAError, ERR_NOT_INITIALIZED);
374
+ }
375
+
376
+ // get the length of the suffix index
377
+ size_t source_len = RSTRING(sa_source)->len;
378
+ size_t i = 0;
379
+
380
+ VALUE result = rb_ary_new();
381
+
382
+ for(i = 0; i < source_len; i++) {
383
+ rb_ary_push(result, INT2FIX(sa->suffix_index[i]));
384
+ }
385
+
386
+ return result;
387
+ }
388
+
389
+
390
+ /*
391
+ * call-seq:
392
+ * sarray.raw_array -> String
393
+ *
394
+ * Returns the "raw" internal suffix array which is an array of C int types used
395
+ * internally as the suffix array. The purpose of this function is to allow you
396
+ * to store the suffix_array and then very quickly restore it later without having
397
+ * to rebuild the suffix array.
398
+ *
399
+ * The returned String should be treated as an opaque structure. It is just a
400
+ * copy of the int[] used internally. This means that it is dependent on your
401
+ * CPU. If you want something you can use that is cross platform then use the
402
+ * SuffixArray.array function instead.
403
+ */
404
+ static VALUE SuffixArray_raw_array(VALUE self)
405
+ {
406
+ SuffixArray *sa = NULL;
407
+ Data_Get_Struct(self, SuffixArray, sa);
408
+
409
+ VALUE sa_source = SuffixArray_source(self);
410
+
411
+ if(sa == NULL || sa->suffix_index == NULL || RSTRING(sa_source)->len == 0) {
412
+ rb_raise(cSAError, ERR_NOT_INITIALIZED);
413
+ }
414
+
415
+ // build a string that copies this stuff
416
+ VALUE result = rb_str_new((const char *)sa->suffix_index, RSTRING(sa_source)->len * sizeof(int));
417
+
418
+ return result;
419
+ }
420
+
421
+ /*
422
+ * call-seq:
423
+ * sarray.start -> Fixnum
424
+ *
425
+ * Tells you which index in the suffix array is the longest suffix (also known as the
426
+ * start of the source string). If you want to get the beginning of the source string
427
+ * in a round about way you would do this:
428
+ *
429
+ * source = "abracadabra"
430
+ * sa = SuffixArray.new source
431
+ * first = source[sa.array[sa.start]]
432
+ *
433
+ * Remember that the start is the index into the suffix array where the source starts,
434
+ * not an index into the source string (that would just be 0).
435
+ */
436
+ static VALUE SuffixArray_suffix_start(VALUE self)
437
+ {
438
+ return rb_iv_get(self, "@suffix_start");
439
+ }
440
+
441
+
442
+
443
+
444
+ /*
445
+ * call-seq:
446
+ * sarray.all_starts(character) -> Array
447
+ *
448
+ * Returns an array containing all the indexes into the source that start
449
+ * with the given character. This is a very fast operation since the
450
+ * SuffixArray already knows where each character starts and ends in the
451
+ * suffix array structure internally. All it does is copy the range of
452
+ * the suffix array for that region.
453
+ */
454
+ static VALUE SuffixArray_all_starts(VALUE self, VALUE character)
455
+ {
456
+ SuffixArray *sa = NULL;
457
+ Data_Get_Struct(self, SuffixArray, sa);
458
+
459
+ VALUE result = rb_ary_new();
460
+ VALUE char_str = StringValue(character);
461
+
462
+ // must be at least one length
463
+ if(RSTRING(char_str)->len > 0) {
464
+ size_t ch = (size_t)RSTRING(char_str)->ptr[0];
465
+
466
+ // go through all the suffix array indices as indicated by sa->starts and sa->ends
467
+ size_t start = 0;
468
+
469
+ for(start = sa->starts[ch]; start <= sa->ends[ch]; start++) {
470
+ rb_ary_push(result, INT2FIX(sa->suffix_index[start]));
471
+ }
472
+ }
473
+
474
+ return result;
475
+ }
476
+
477
+
478
+ static VALUE cSuffixArray;
479
+
480
+ /**
481
+ * Implements a SuffixArray structure with functions to do useful operations
482
+ * quickly such as finding matching and non-matching regions, or finding all
483
+ * the locations of a given character. The suffix array construction algorithm
484
+ * used was written by Sean Quinlan and Sean Dorward and is licensed under the
485
+ * Plan9 license. Please refer to the sarray.c file for more information.
486
+ *
487
+ * The suffix array construction algorithm used is not the fastest available,
488
+ * but it was the most correctly implemented. There is also a lcp.c file
489
+ * which implements an O(n) Longest Common Prefix algorithm, but it had
490
+ * memory errors and buffer overflows which I decided to avoid for now.
491
+ *
492
+ * This file is licensed under the GPL license (see LICENSE in the root source
493
+ * directory).
494
+ */
495
+ void Init_suffix_array()
496
+ {
497
+ cSuffixArray = rb_define_class("SuffixArray", rb_cObject);
498
+ cSAError = rb_define_class("SAError", rb_eStandardError);
499
+ rb_define_alloc_func(cSuffixArray, SuffixArray_alloc);
500
+
501
+ rb_define_method(cSuffixArray, "initialize", SuffixArray_initialize, -1);
502
+ rb_define_method(cSuffixArray, "longest_match", SuffixArray_longest_match, 2);
503
+ rb_define_method(cSuffixArray, "longest_nonmatch", SuffixArray_longest_nonmatch, 3);
504
+ rb_define_method(cSuffixArray, "array", SuffixArray_array, 0);
505
+ rb_define_method(cSuffixArray, "raw_array", SuffixArray_raw_array, 0);
506
+ rb_define_method(cSuffixArray, "suffix_start", SuffixArray_suffix_start, 0);
507
+ rb_define_method(cSuffixArray, "source", SuffixArray_source, 0);
508
+ rb_define_method(cSuffixArray, "all_starts", SuffixArray_all_starts, 1);
509
+
510
+ }