gdiff 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +7 -0
- data/COPYING.suffix_array +278 -0
- data/LICENSE.suffix_array +17 -0
- data/README +40 -0
- data/README.suffix_array +274 -0
- data/bin/gdiff +25 -0
- data/bin/gpatch +25 -0
- data/doc/classes/Diff.html +117 -0
- data/doc/classes/Diff/GDiff.html +120 -0
- data/doc/classes/Diff/GDiff/EGdiffError.html +111 -0
- data/doc/classes/Diff/GDiff/ENoGdiffStream.html +113 -0
- data/doc/classes/Diff/GDiff/EPrematureEndOfStream.html +113 -0
- data/doc/classes/Diff/GDiff/Operations.html +156 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.html +246 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000014.html +19 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000015.html +39 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000016.html +25 -0
- data/doc/classes/Diff/GDiff/Operations/Copy.src/M000017.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.html +246 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000009.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000010.html +18 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000011.html +35 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000012.html +29 -0
- data/doc/classes/Diff/GDiff/Operations/Data.src/M000013.html +19 -0
- data/doc/classes/SAError.html +111 -0
- data/doc/classes/SuffixArray.html +342 -0
- data/doc/classes/SuffixArray.src/M000001.html +97 -0
- data/doc/classes/SuffixArray.src/M000002.html +73 -0
- data/doc/classes/SuffixArray.src/M000003.html +102 -0
- data/doc/classes/SuffixArray.src/M000004.html +47 -0
- data/doc/classes/SuffixArray.src/M000005.html +44 -0
- data/doc/classes/SuffixArray.src/M000006.html +33 -0
- data/doc/classes/SuffixArray.src/M000007.html +24 -0
- data/doc/classes/SuffixArray.src/M000008.html +46 -0
- data/doc/created.rid +1 -0
- data/doc/files/ext/gdiff/suffix_array/extconf_rb.html +108 -0
- data/doc/files/ext/gdiff/suffix_array/lcp_c.html +101 -0
- data/doc/files/ext/gdiff/suffix_array/sarray_c.html +101 -0
- data/doc/files/ext/gdiff/suffix_array/suffix_array_c.html +101 -0
- data/doc/files/lib/gdiff_rb.html +108 -0
- data/doc/fr_class_index.html +36 -0
- data/doc/fr_file_index.html +31 -0
- data/doc/fr_method_index.html +43 -0
- data/doc/index.html +24 -0
- data/doc/rdoc-style.css +208 -0
- data/ext/gdiff/COPYING +278 -0
- data/ext/gdiff/LICENSE +17 -0
- data/ext/gdiff/README +274 -0
- data/ext/gdiff/extconf.rb +3 -0
- data/ext/gdiff/lcp.c +97 -0
- data/ext/gdiff/sarray.3 +145 -0
- data/ext/gdiff/sarray.c +372 -0
- data/ext/gdiff/sarray.h +13 -0
- data/ext/gdiff/suffix_array.c +510 -0
- data/lib/gdiff.rb +255 -0
- data/setup.rb +1551 -0
- data/test/tc_gdiff.rb +66 -0
- metadata +119 -0
@@ -0,0 +1,97 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>new (SuffixArray)</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
+
</head>
|
12
|
+
<body class="standalone-code">
|
13
|
+
<pre>/*
|
14
|
+
* call-seq:
|
15
|
+
* SuffixArray.new(source, [raw_array], [start]) -> SuffixArray
|
16
|
+
*
|
17
|
+
* Given a string (anything like a string really) this will generate a
|
18
|
+
* suffix array for the string so that you can work with it. The
|
19
|
+
* source cannot be an empty string since this is a useless operation.
|
20
|
+
*
|
21
|
+
* Two optional parameters allow you to restore a suffix array without
|
22
|
+
* running the construction process again. You basically give it the
|
23
|
+
* String from SuffixArray.raw_array and the start from SuffixArray.suffix_start
|
24
|
+
* and it will skip most calculations. <b>This feature is really experimental
|
25
|
+
* and is CPU dependent since the integers in the raw_array are native.</b>
|
26
|
+
*/
|
27
|
+
static VALUE SuffixArray_initialize(int argc, VALUE *argv, VALUE self)
{
    // Arguments: source (required), raw array String (optional), start (optional).
    // Side effects: sets @source and @suffix_start on self, allocates
    // sa->suffix_index and fills the per-character sa->starts / sa->ends tables.
    // Raises cSAError for an empty source, for a raw array given without a
    // start, for a raw array whose length does not match the source, and for a
    // raw array holding an index that lies outside the source.
    // Returns the source length as a Fixnum.
    SuffixArray *sa = NULL;
    size_t i = 0;
    Data_Get_Struct(self, SuffixArray, sa);
    assert(sa != NULL);
    VALUE source;
    VALUE array;
    VALUE start;

    // sort out the arguments and such
    rb_scan_args(argc, argv, "12", &source, &array, &start);

    // get the string value of the source given to us, keep it around for later
    VALUE sa_source_str = StringValue(source);
    rb_iv_set(self, "@source", sa_source_str);

    // setup temporary variables for the source and length pointers
    unsigned char *sa_source = (unsigned char *)RSTRING(sa_source_str)->ptr;
    size_t sa_source_len = RSTRING(sa_source_str)->len;

    // error check the whole thing
    if(sa_source_len == 0) {
        // an empty source is not allowed, so raise
        rb_raise(cSAError, ERR_NO_ZERO_LENGTH_INPUT);
    }

    if(!NIL_P(array) && NIL_P(start)) {
        rb_raise(cSAError, ERR_START_IF_ARRAY);
    } else if (!NIL_P(array) && !NIL_P(start)) {
        // coerce the raw array to a String before reading its RSTRING fields;
        // anything that is not string-like raises here instead of being
        // reinterpreted as a string structure
        StringValue(array);

        // looks like both parameters were given so check out the lengths
        if(RSTRING(array)->len / sizeof(int) != sa_source_len) {
            rb_raise(cSAError, ERR_MISMATCH_LENGTH);
        }
    }

    // allocate memory for the index integers
    // NOTE(review): a previous sa->suffix_index is overwritten here without
    // being freed, so calling initialize twice on one object would leak it.
    sa->suffix_index = malloc(sizeof(int) * (sa_source_len + 1));
    if(sa->suffix_index == NULL) {
        rb_raise(rb_eNoMemError, "unable to allocate memory for the suffix array");
    }

    if(NIL_P(array)) {
        // create the suffix array from the source
        int st = bsarray(sa_source, sa->suffix_index, sa_source_len-1);

        // set the suffix_start in our object
        rb_iv_set(self, "@suffix_start", INT2NUM(st));
    } else {
        // convert the given array and start to the internal structures needed
        // (original author's note: the return value is ignored since no
        // consistent definition of a failure value could be found)
        memcpy(sa->suffix_index, RSTRING(array)->ptr, sa_source_len * sizeof(int));

        // every restored entry is used below as an index into the source, so
        // reject anything outside of it rather than reading out of bounds
        for(i = 0; i < sa_source_len; i++) {
            if(sa->suffix_index[i] < 0 || (size_t)sa->suffix_index[i] >= sa_source_len) {
                // leave the object in the "not initialized" state
                free(sa->suffix_index);
                sa->suffix_index = NULL;
                rb_raise(cSAError, "raw_array contains an index outside of the source");
            }
        }

        rb_iv_set(self, "@suffix_start", start);
    }

    unsigned char c = sa_source[sa->suffix_index[0]];  // start off with the first char in the sarray list
    sa->starts[c] = 0;
    for(i = 0; i < sa_source_len; i++) {
        // skip characters until we see a new one
        if(sa_source[sa->suffix_index[i]] != c) {
            sa->ends[c] = i-1; // it's -1 since this is a new character, so the end was actually behind this point
            c = sa_source[sa->suffix_index[i]];
            sa->starts[c] = i;
        }
    }
    // set the last valid character to get the tail of the sa, the loop will miss it
    c = sa_source[sa->suffix_index[sa_source_len-1]];
    sa->ends[c] = sa_source_len-1;

    return INT2FIX(sa_source_len);
}</pre>
|
96
|
+
</body>
|
97
|
+
</html>
|
@@ -0,0 +1,73 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>longest_match (SuffixArray)</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
+
</head>
|
12
|
+
<body class="standalone-code">
|
13
|
+
<pre>/*
|
14
|
+
* call-seq:
|
15
|
+
* sarray.longest_match(target, from_index) -> [start, length]
|
16
|
+
*
|
17
|
+
* Takes a target string and an index inside that string, and then tries
|
18
|
+
* to find the longest match from that point in the source string for this
|
19
|
+
* SuffixArray object.
|
20
|
+
*
|
21
|
+
* It returns an array of [start, length] of where in the source a length
|
22
|
+
* string from the target would match.
|
23
|
+
*
|
24
|
+
* Refer to the unit test for examples of usage.
|
25
|
+
*/
|
26
|
+
static VALUE SuffixArray_longest_match(VALUE self, VALUE target, VALUE from_index)
{
    SuffixArray *sa = NULL;
    Data_Get_Struct(self, SuffixArray, sa);

    VALUE sa_source = SuffixArray_source(self);

    // refuse to work on an object whose suffix array was never built
    if(!(sa != NULL && sa->suffix_index != NULL && RSTRING(sa_source)->len != 0)) {
        rb_raise(cSAError, ERR_NOT_INITIALIZED);
    }

    // offset into the target, as an unsigned value
    size_t offset = NUM2UINT(from_index);

    // raw view of the source (should already be in String form)
    unsigned char *haystack = (unsigned char *)RSTRING(sa_source)->ptr;
    size_t haystack_len = RSTRING(sa_source)->len;

    // raw view of the target, coerced to a String first
    VALUE target_str = StringValue(target);
    unsigned char *needle = (unsigned char *)RSTRING(target_str)->ptr;
    size_t needle_len = RSTRING(target_str)->len;

    // an offset past the end of the target yields nil, like array operations
    if(offset > needle_len) {
        return Qnil;
    }

    // narrow the target down to the part that begins at the offset;
    // needle_len is passed by address because it is an in/out parameter
    needle = needle + offset;
    needle_len = needle_len - offset;

    size_t found_at = find_longest_match(haystack, haystack_len, needle, &needle_len,
            sa->starts, sa->ends, sa->suffix_index);

    // hand back [start, length]
    VALUE pair = rb_ary_new();
    rb_ary_push(pair, INT2FIX(found_at));
    rb_ary_push(pair, INT2FIX(needle_len));
    return pair;
}</pre>
|
72
|
+
</body>
|
73
|
+
</html>
|
@@ -0,0 +1,102 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>longest_nonmatch (SuffixArray)</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
+
</head>
|
12
|
+
<body class="standalone-code">
|
13
|
+
<pre>/*
|
14
|
+
* call-seq:
|
15
|
+
* sarray.longest_nonmatch(target, from_index, min_match) -> [non_match_length, match_start, match_length]
|
16
|
+
*
|
17
|
+
* Mostly the inverse of longest_match, except that it first tries to find a
|
18
|
+
* non-matching region, then a matching region. The target and from_index are
|
19
|
+
* the same as in longest_match. The min_match argument is the smallest matching
|
20
|
+
* region that you'll accept as significant enough to end the non-matching search.
|
21
|
+
* Giving min_match=0 will stop at the first matching region.
|
22
|
+
*
|
23
|
+
* It works by first searching the suffix array for a non-matching region. When it
|
24
|
+
* hits a character that is in the source (according to the suffix array) it tries
|
25
|
+
* to find a matching region. If it can find a matching region that is longer than min_match
|
26
|
+
* then it stops and returns, otherwise it adds this match to the length of the non-matching
|
27
|
+
* region and continues.
|
28
|
+
*
|
29
|
+
* The return value is an Array of [non_match_length, match_start, match_length].
|
30
|
+
*/
|
31
|
+
static VALUE SuffixArray_longest_nonmatch(VALUE self, VALUE target, VALUE from_index, VALUE min_match)
{
    SuffixArray *sa = NULL;
    Data_Get_Struct(self, SuffixArray, sa);

    VALUE sa_source = SuffixArray_source(self);

    // refuse to work on an object whose suffix array was never built
    if(!(sa != NULL && sa->suffix_index != NULL && RSTRING(sa_source)->len != 0)) {
        rb_raise(cSAError, ERR_NOT_INITIALIZED);
    }

    // numeric arguments: offset into the target and the match threshold
    size_t offset = NUM2UINT(from_index);
    size_t threshold = NUM2INT(min_match);

    // raw view of the source (should already be in String form)
    unsigned char *haystack = (unsigned char *)RSTRING(sa_source)->ptr;
    size_t haystack_len = RSTRING(sa_source)->len;

    // raw view of the target, coerced to a String first
    VALUE target_str = StringValue(target);
    unsigned char *target_ptr = (unsigned char *)RSTRING(target_str)->ptr;
    size_t target_len = RSTRING(target_str)->len;

    // an offset past the end of the target yields nil, like array operations
    if(offset > target_len) {
        return Qnil;
    }

    unsigned char *first = target_ptr + offset;     // where scanning begins
    unsigned char *limit = target_ptr + target_len; // one past the last target byte
    unsigned char *cursor = first;
    size_t found_len = 0;
    size_t found_at = 0;

    while(cursor < limit) {
        // the byte does not lead any suffix according to the starts table: step over it
        if(*cursor != haystack[sa->suffix_index[sa->starts[*cursor]]]) {
            cursor++;
            continue;
        }

        // try to match everything that is left; found_len is an in/out parameter
        found_len = limit - cursor;
        found_at = find_longest_match(haystack, haystack_len, cursor, &found_len,
                sa->starts, sa->ends, sa->suffix_index);

        // stop when nothing matched (which really shouldn't happen) or when
        // the match is longer than the threshold
        if(found_len == 0 || found_len > threshold) {
            break;
        }

        // too short to count: fold it into the non-matching region and
        // clear the match fields to signal that nothing has been found yet
        cursor += found_len;
        found_len = found_at = 0;
    }

    // hand back [non_match_length, match_start, match_length]
    size_t skipped = (cursor - first);
    VALUE triple = rb_ary_new();
    rb_ary_push(triple, INT2FIX(skipped));
    rb_ary_push(triple, INT2FIX(found_at));
    rb_ary_push(triple, INT2FIX(found_len));
    return triple;
}</pre>
|
101
|
+
</body>
|
102
|
+
</html>
|
@@ -0,0 +1,47 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>array (SuffixArray)</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
+
</head>
|
12
|
+
<body class="standalone-code">
|
13
|
+
<pre>/*
|
14
|
+
* call-seq:
|
15
|
+
* sarray.array -> Array
|
16
|
+
*
|
17
|
+
* Returns a copy of the internal suffix array as an Array of Fixnum objects. This
|
18
|
+
* array is a copy so you're free to mangle it however you wish.
|
19
|
+
*
|
20
|
+
* A suffix array is the sequence of indices into the source that mark each suffix
|
21
|
+
* as if they were sorted.
|
22
|
+
*/
|
23
|
+
static VALUE SuffixArray_array(VALUE self)
{
    SuffixArray *sa = NULL;
    Data_Get_Struct(self, SuffixArray, sa);

    VALUE sa_source = SuffixArray_source(self);

    // refuse to work on an object whose suffix array was never built
    if(!(sa != NULL && sa->suffix_index != NULL && RSTRING(sa_source)->len != 0)) {
        rb_raise(cSAError, ERR_NOT_INITIALIZED);
    }

    // one suffix index entry is copied per byte of the source
    size_t remaining = RSTRING(sa_source)->len;
    size_t pos = 0;

    VALUE copy = rb_ary_new();

    while(pos < remaining) {
        rb_ary_push(copy, INT2FIX(sa->suffix_index[pos]));
        pos++;
    }

    return copy;
}</pre>
|
46
|
+
</body>
|
47
|
+
</html>
|
@@ -0,0 +1,44 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>raw_array (SuffixArray)</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
+
</head>
|
12
|
+
<body class="standalone-code">
|
13
|
+
<pre>/*
|
14
|
+
* call-seq:
|
15
|
+
* sarray.raw_array -> String
|
16
|
+
*
|
17
|
+
* Returns the "raw" internal suffix array which is an array of C int types used
|
18
|
+
* internally as the suffix array. The purpose of this function is to allow you
|
19
|
+
* to store the suffix_array and then very quickly restore it later without having
|
20
|
+
* to rebuild the suffix array.
|
21
|
+
*
|
22
|
+
* The returned String should be treated as an opaque structure. It is just a
|
23
|
+
* copy of the int[] used internally. This means that it is dependent on your
|
24
|
+
* CPU. If you want something you can use that is cross platform then use the
|
25
|
+
* SuffixArray.array function instead.
|
26
|
+
*/
|
27
|
+
static VALUE SuffixArray_raw_array(VALUE self)
{
    SuffixArray *sa = NULL;
    Data_Get_Struct(self, SuffixArray, sa);

    VALUE sa_source = SuffixArray_source(self);

    // refuse to work on an object whose suffix array was never built
    if(!(sa != NULL && sa->suffix_index != NULL && RSTRING(sa_source)->len != 0)) {
        rb_raise(cSAError, ERR_NOT_INITIALIZED);
    }

    // one int per source byte, copied byte-for-byte into a new String
    size_t byte_count = RSTRING(sa_source)->len * sizeof(int);

    return rb_str_new((const char *)sa->suffix_index, byte_count);
}</pre>
|
43
|
+
</body>
|
44
|
+
</html>
|
@@ -0,0 +1,33 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>suffix_start (SuffixArray)</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
+
</head>
|
12
|
+
<body class="standalone-code">
|
13
|
+
<pre>/*
|
14
|
+
* call-seq:
|
15
|
+
* sarray.suffix_start -> Fixnum
|
16
|
+
*
|
17
|
+
* Tells you which index in the suffix array is the longest suffix (also known as the
|
18
|
+
* start of the source string). If you want to get the beginning of the source string
|
19
|
+
* in a round about way you would do this:
|
20
|
+
*
|
21
|
+
* source = "abracadabra"
|
22
|
+
* sa = SuffixArray.new source
|
23
|
+
* first = source[sa.array[sa.suffix_start]]
|
24
|
+
*
|
25
|
+
* Remember that the start is the index into the suffix array where the source starts,
|
26
|
+
* not an index into the source string (that would just be 0).
|
27
|
+
*/
|
28
|
+
static VALUE SuffixArray_suffix_start(VALUE self)
{
    // the value is whatever is stored in the @suffix_start instance variable
    VALUE stored_start = rb_iv_get(self, "@suffix_start");

    return stored_start;
}</pre>
|
32
|
+
</body>
|
33
|
+
</html>
|
@@ -0,0 +1,24 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>source (SuffixArray)</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
+
</head>
|
12
|
+
<body class="standalone-code">
|
13
|
+
<pre>/*
|
14
|
+
* call-seq:
|
15
|
+
* sarray.source -> String
|
16
|
+
*
|
17
|
+
* Returns the source that this suffix array was constructed with.
|
18
|
+
*/
|
19
|
+
static VALUE SuffixArray_source(VALUE self)
{
    // the value is whatever is stored in the @source instance variable
    VALUE stored_source = rb_iv_get(self, "@source");

    return stored_source;
}</pre>
|
23
|
+
</body>
|
24
|
+
</html>
|
@@ -0,0 +1,46 @@
|
|
1
|
+
<?xml version="1.0" encoding="iso-8859-1"?>
|
2
|
+
<!DOCTYPE html
|
3
|
+
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
5
|
+
|
6
|
+
<html>
|
7
|
+
<head>
|
8
|
+
<title>all_starts (SuffixArray)</title>
|
9
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
10
|
+
<link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
|
11
|
+
</head>
|
12
|
+
<body class="standalone-code">
|
13
|
+
<pre>/*
|
14
|
+
* call-seq:
|
15
|
+
* sarray.all_starts(character) -> Array
|
16
|
+
*
|
17
|
+
* Returns an array containing all the indexes into the source that start
|
18
|
+
* with the given character. This is a very fast operation since the
|
19
|
+
* SuffixArray already knows where each character starts and ends in the
|
20
|
+
* suffix array structure internally. All it does is copy the range of
|
21
|
+
* the suffix array for that region.
|
22
|
+
*/
|
23
|
+
static VALUE SuffixArray_all_starts(VALUE self, VALUE character)
{
    // Returns an Array of the suffix index entries stored between
    // sa->starts[ch] and sa->ends[ch] (inclusive), where ch is the first byte
    // of the given string. An empty string, or a byte that does not lead any
    // suffix, gives an empty Array. Raises cSAError when the suffix array has
    // not been built, matching the other accessors.
    SuffixArray *sa = NULL;
    Data_Get_Struct(self, SuffixArray, sa);

    VALUE sa_source = SuffixArray_source(self);

    // same "not initialized" guard the other methods use; without it an
    // unbuilt object would dereference a NULL suffix_index below
    if(sa == NULL || sa->suffix_index == NULL || RSTRING(sa_source)->len == 0) {
        rb_raise(cSAError, ERR_NOT_INITIALIZED);
    }

    VALUE result = rb_ary_new();
    VALUE char_str = StringValue(character);

    // must be at least one length
    if(RSTRING(char_str)->len > 0) {
        // go through unsigned char first: a plain char may be signed, and a
        // byte >= 0x80 would then turn into a huge out-of-range table index
        size_t ch = (unsigned char)RSTRING(char_str)->ptr[0];
        const unsigned char *source_ptr = (const unsigned char *)RSTRING(sa_source)->ptr;

        // same presence test longest_nonmatch uses: only walk the range when
        // the suffix at starts[ch] really begins with this byte
        if(source_ptr[sa->suffix_index[sa->starts[ch]]] == ch) {
            // go through all the suffix array indices as indicated by sa->starts and sa->ends
            size_t start = 0;

            for(start = sa->starts[ch]; start <= sa->ends[ch]; start++) {
                rb_ary_push(result, INT2FIX(sa->suffix_index[start]));
            }
        }
    }

    return result;
}</pre>
|
45
|
+
</body>
|
46
|
+
</html>
|