bktree 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,28 @@
1
+ BkTrees are pretty cool.
2
+
3
+
4
+ This one is extremely alpha though, so... You should probably look elsewhere. It's pretty fast though, and that's pretty cool.
5
+
6
+
7
+ Anyway, if you want to give it a shot just do something like this:
8
+
9
+ ruby extconf.rb && make
10
+
11
+
12
+ And using it is really simple. It only has two methods:
13
+
14
+ require 'bktree'
15
+
16
+ bk = BkTree.new
17
+ bk.add 'foo'
18
+ bk.add 'bar'
19
+ bk.add 'baz'
20
+
21
+ bk.query 'bor', 2 #=> [['baz', 2], ['bar', 1], ['foo', 2]]
22
+
23
+
24
+ Have fun.
25
+
26
+
27
+ Your pal,
28
+ Tyler McMullen
@@ -0,0 +1,15 @@
1
# Gem specification for the bktree C extension.
Gem::Specification.new do |spec|
  # Identity
  spec.name             = 'bktree'
  spec.version          = '0.0.1'
  spec.date             = '2010-02-13'
  spec.rubygems_version = '1.3.5'

  # Authorship
  spec.authors  = ['Tyler McMullen']
  spec.email    = 'tbmcmullen@gmail.com'
  spec.homepage = 'http://github.com/tyler/BkTree'

  # Description
  spec.summary     = 'bktree in c with ruby bindings'
  spec.description = 'bktree in c with ruby bindings'

  # Contents: everything at the top level plus the extension sources.
  spec.files            = Dir['*'] + Dir['ext/*.{c,h,rb}']
  spec.extra_rdoc_files = ['README']

  # Native extension build; the compiled library is loaded from ext/.
  spec.extensions    = ['ext/extconf.rb']
  spec.require_paths = ['ext']
end
@@ -0,0 +1,50 @@
1
+ #include "ruby.h"
2
+ #include "bktree.h"
3
+ #include "levenshtein.h"
4
+
5
+ VALUE cBkTree;
6
+
7
+ static VALUE rb_bktree_alloc(VALUE klass) {
8
+ VALUE obj;
9
+ obj = Data_Wrap_Struct(klass, 0, bktree_destroy, bktree_new(levenshtein_distance));
10
+ return obj;
11
+ }
12
+
13
+ static VALUE rb_bktree_add(VALUE self, VALUE word) {
14
+ StringValue(word);
15
+
16
+ BKTree * bktree;
17
+ Data_Get_Struct(self, BKTree, bktree);
18
+
19
+ bktree_add(bktree, RSTRING(word)->ptr, RSTRING(word)->len);
20
+
21
+ return Qnil;
22
+ }
23
+
24
+ static VALUE rb_bktree_query(VALUE self, VALUE word, VALUE max) {
25
+ StringValue(word);
26
+
27
+ BKTree * bktree;
28
+ Data_Get_Struct(self, BKTree, bktree);
29
+
30
+ VALUE result_out = rb_ary_new();
31
+ BKResult * result = bktree_query(bktree, RSTRING(word)->ptr, RSTRING(word)->len, FIX2INT(max));
32
+ while(result) {
33
+ VALUE result_node = rb_ary_new();
34
+
35
+ rb_ary_push(result_node, rb_str_new(BKTREE_GET_STRING(bktree, result->string_offset), BKTREE_GET_STRING_LEN(bktree, result->string_offset)));
36
+ rb_ary_push(result_node, INT2FIX(result->distance));
37
+
38
+ rb_ary_push(result_out, result_node);
39
+ result = result->next;
40
+ }
41
+
42
+ return result_out;
43
+ }
44
+
45
+ void Init_bktree() {
46
+ cBkTree = rb_define_class("BkTree", rb_cObject);
47
+ rb_define_alloc_func(cBkTree, rb_bktree_alloc);
48
+ rb_define_method(cBkTree, "add", rb_bktree_add, 1);
49
+ rb_define_method(cBkTree, "query", rb_bktree_query, 2);
50
+ }
@@ -0,0 +1,47 @@
1
/*
 * bktree.h -- data structures and public interface of the BK-tree.
 */
#ifndef BKTREE_H
#define BKTREE_H

#include <stddef.h> /* size_t */

/* Initial size, in bytes, of the string pool (it is doubled when it fills). */
#define BKTREE_STRINGS_SIZE 4096
/* Size, in bytes, of the node array allocated by bktree_new(). */
#define BKTREE_TREE_SIZE 2147483648

/* Longest word bktree_add() accepts; also the number of child slots per node. */
#define BKTREE_STRING_MAX 16

/* Status codes. */
#define BKTREE_OK 0
#define BKTREE_FAIL 1

/*
 * A word is stored in the pool as: one length byte, the characters, a NUL.
 * Given the offset of the length byte, these yield the characters and the
 * length.  Arguments are parenthesised so that any expression may be passed.
 */
#define BKTREE_GET_STRING(bktree, string_offset) ((bktree)->strings + (string_offset) + 1)

#define BKTREE_GET_STRING_LEN(bktree, string_offset) (*((bktree)->strings + (string_offset)))

/* One tree node. */
typedef struct {
    long string_offset;           /* offset of this node's word in the string pool */
    int next[BKTREE_STRING_MAX];  /* child node index per distance; 0 means no child */
} BKNode;

/* The tree: a node array, a string pool and the distance function. */
typedef struct {
    int size;                /* number of nodes stored */

    BKNode * tree;           /* node array; element 0 is the root */
    BKNode * tree_cursor;    /* next unused node */
    size_t tree_size;        /* size of the node array in bytes */

    char * strings;          /* string pool */
    char * strings_cursor;   /* next unused byte of the pool */
    size_t strings_size;     /* size of the pool in bytes */

    // word1, len(word1), word2, len(word2), max
    int (* distance)(char *, int, char *, int, int);
} BKTree;

/* One match of a query; matches form a singly linked list. */
struct BKResult_s {
    int distance;                /* distance between the query and the word */
    int string_offset;           /* offset of the matched word in the string pool */
    struct BKResult_s * next;    /* next match, or NULL */
};
typedef struct BKResult_s BKResult;


BKTree * bktree_new(int (* distance)(char *, int, char *, int, int));
void bktree_destroy(BKTree * bktree);
BKNode * bktree_add(BKTree * bktree, char * string, unsigned char len);
void bktree_node_print(BKTree * bktree, BKNode * node);

void bktree_result_destroy(BKResult * result);
BKResult * bktree_query(BKTree * bktree, char * string, unsigned char len, int max);

#endif /* BKTREE_H */
@@ -0,0 +1,156 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include <stdio.h>
4
+ #include "bktree.h"
5
+
6
+ static int write_string(BKTree * bktree, char * string, unsigned char len);
7
+ static BKNode * write_new_record(BKTree * bktree, char * string, unsigned char len);
8
+
9
+ BKTree * bktree_new(int (* distance)(char *, int, char *, int, int)) {
10
+ BKTree * bktree = malloc(sizeof(BKTree));
11
+
12
+ bktree->tree_size = BKTREE_TREE_SIZE;
13
+ bktree->tree = malloc(bktree->tree_size);
14
+ bktree->tree_cursor = bktree->tree;
15
+
16
+ bktree->strings_size = BKTREE_STRINGS_SIZE;
17
+ bktree->strings = malloc(bktree->strings_size);
18
+ bktree->strings_cursor = bktree->strings;
19
+
20
+ bktree->size = 0;
21
+
22
+ bktree->distance = distance;
23
+
24
+ return bktree;
25
+ }
26
+
27
+ void bktree_destroy(BKTree * bktree) {
28
+ free(bktree->tree);
29
+ free(bktree->strings);
30
+ free(bktree);
31
+ }
32
+
33
+ BKNode * bktree_add(BKTree * bktree, char * string, unsigned char len) {
34
+ if(len > BKTREE_STRING_MAX || len == 0)
35
+ return NULL;
36
+
37
+ if(bktree->size == 0) {
38
+ return write_new_record(bktree, string, len);
39
+ }
40
+
41
+ BKNode * node = (BKNode *) bktree->tree;
42
+ while(node) {
43
+ char * node_str = BKTREE_GET_STRING(bktree, node->string_offset);
44
+ int node_str_len = BKTREE_GET_STRING_LEN(bktree, node->string_offset);
45
+
46
+ int d = bktree->distance(node_str, node_str_len, string, len, -1);
47
+
48
+ if(d == 0)
49
+ return BKTREE_OK;
50
+
51
+ if(node->next[d] > 0) {
52
+ node = bktree->tree + node->next[d];
53
+ } else {
54
+ BKNode * new_node = write_new_record(bktree, string, len);
55
+ node->next[d] = new_node - bktree->tree;
56
+ return new_node;
57
+ }
58
+ }
59
+
60
+ return NULL;
61
+ }
62
+
63
+ BKResult * bktree_result_new(BKResult * next, BKNode * node, int distance) {
64
+ BKResult * result = malloc(sizeof(BKResult));
65
+ result->next = next;
66
+ result->distance = distance;
67
+ result->string_offset = node->string_offset;
68
+
69
+ return result;
70
+ }
71
+
72
+ void inner_query(BKTree * bktree, BKNode * node, char * string, unsigned char len, int max, BKResult * * result_ptr) {
73
+
74
+ int d = bktree->distance(BKTREE_GET_STRING(bktree, node->string_offset), BKTREE_GET_STRING_LEN(bktree, node->string_offset), string, len, -1);
75
+
76
+ int start = d - max < 1 ? 1 : d - max;
77
+ int stop = d + max + 1;
78
+ if(stop >= BKTREE_STRING_MAX)
79
+ stop = BKTREE_STRING_MAX - 1;
80
+
81
+ if(d <= max) {
82
+ *result_ptr = bktree_result_new(*result_ptr, node, d);
83
+ }
84
+
85
+ int i;
86
+ for(i = start; i <= stop; i++) {
87
+ if(node->next[i] > 0) {
88
+ inner_query(bktree, bktree->tree + node->next[i], string, len, max, result_ptr);
89
+ }
90
+ }
91
+ }
92
+
93
+ BKResult * bktree_query(BKTree * bktree, char * string, unsigned char len, int max) {
94
+ BKResult * results = NULL;
95
+ inner_query(bktree, bktree->tree, string, len, max, &results);
96
+ return results;
97
+ }
98
+
99
+ void bktree_node_print(BKTree * bktree, BKNode * node) {
100
+ if(bktree == NULL) {
101
+ printf("bktree is null\n");
102
+ return;
103
+ }
104
+
105
+ if(node == NULL) {
106
+ printf("node is null\n");
107
+ return;
108
+ }
109
+
110
+ printf("String: %s\n", BKTREE_GET_STRING(bktree, node->string_offset));
111
+ printf("Offset: %ld\n", node - bktree->tree);
112
+ int i;
113
+ for(i = 0; i < BKTREE_STRING_MAX; i++)
114
+ printf("%d ", node->next[i]);
115
+
116
+ printf("\n");
117
+ }
118
+
119
+ static int write_string(BKTree * bktree, char * string, unsigned char len) {
120
+ while(bktree->strings_cursor - bktree->strings + len + 2 >= bktree->strings_size) {
121
+ int cursor_offset = bktree->strings_cursor - bktree->strings;
122
+
123
+ char * old_strings = bktree->strings;
124
+ bktree->strings = malloc(bktree->strings_size * 2);
125
+ memcpy(bktree->strings, old_strings, bktree->strings_size);
126
+ free(old_strings);
127
+
128
+ //printf("old ptr: %p\n", old_strings);
129
+ //printf("new ptr: %p\n", bktree->strings);
130
+
131
+ bktree->strings_size *= 2;
132
+ bktree->strings_cursor = bktree->strings + cursor_offset;
133
+ }
134
+
135
+ int original_offset = bktree->strings_cursor - bktree->strings;
136
+
137
+ *(bktree->strings_cursor) = len;
138
+ memcpy(bktree->strings_cursor + 1, string, len);
139
+ *(bktree->strings_cursor + len + 1) = '\0';
140
+ bktree->strings_cursor += len + 2;
141
+
142
+ return original_offset;
143
+ }
144
+
145
+ static BKNode * write_new_record(BKTree * bktree, char * string, unsigned char len) {
146
+ BKNode * node = bktree->tree_cursor++;
147
+ node->string_offset = write_string(bktree, string, len);
148
+
149
+ int i;
150
+ for(i = 0; i < BKTREE_STRING_MAX; i++)
151
+ node->next[i] = 0;
152
+
153
+ bktree->size++;
154
+
155
+ return node;
156
+ }
@@ -0,0 +1,2 @@
1
+ require 'mkmf'
2
# Generate the Makefile for the extension named 'bktree'.
create_makefile('bktree')
@@ -0,0 +1,155 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include "levenshtein.h"
4
+
5
/* Returns the smallest of three values. */
static int minimum(int a, int b, int c)
{
    int min = a;
    if (b < min)
        min = b;
    if (c < min)
        min = c;
    return min;
}

/*
 * Computes the Levenshtein distance between s (n bytes) and t (m bytes).
 * Neither string needs to be NUL-terminated.
 *
 * noop is ignored; the parameter exists so that the function has the
 * five-argument signature of the BKTree distance callback.
 *
 * Returns the distance, or -1 when either string is NULL or has a length
 * <= 0, or when the scratch row cannot be allocated.
 */
int levenshtein_distance(char *s, int n, char*t, int m, int noop)
{
    int *row;
    int i, j, distance;

    (void)noop;

    if (s == NULL || t == NULL || n <= 0 || m <= 0)
        return -1;

    /* One row of the (m+1) x (n+1) matrix is enough: row[i] holds the
     * distance between the first i bytes of s and the first j bytes of t. */
    row = malloc(sizeof *row * ((size_t)n + 1));
    if (row == NULL)
        return -1;

    for (i = 0; i <= n; i++)
        row[i] = i;

    for (j = 1; j <= m; j++) {
        int diagonal = row[0];          /* value for (i-1, j-1) */

        row[0] = j;
        for (i = 1; i <= n; i++) {
            int above = row[i];         /* value for (i, j-1) */
            int cost = (s[i - 1] == t[j - 1]) ? 0 : 1;

            row[i] = minimum(above + 1, row[i - 1] + 1, diagonal + cost);
            diagonal = above;
        }
    }

    distance = row[n];
    free(row);
    return distance;
}
50
+
51
/*
 * Computes the Levenshtein distance between s1 (l1 bytes) and s2 (l2 bytes),
 * optionally giving up early.  Neither string needs to be NUL-terminated.
 *
 * threshold: when > -1, the computation stops and -1 is returned as soon as
 *            the smallest value of a row reaches or exceeds it.  Pass -1 to
 *            always compute the full distance.
 *
 * Returns the distance, or -1 when the threshold cut-off triggers, when an
 * argument is invalid (NULL string or negative length), or when memory
 * cannot be allocated.
 */
int levenshtein(char * s1, int l1, char * s2, int l2, int threshold) {
    int * prev_row, * curr_row, * swap;
    int col, row;
    int curr_row_min, result;
    int offset = 0;

    if (s1 == NULL || s2 == NULL || l1 < 0 || l2 < 0) {
        return -1;
    }

    /* Skip the common prefix.  The scan must stop at the end of the shorter
     * string, otherwise equal strings are read past their end. */

    while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
        offset++;
    }

    /* Skip the common postfix, keeping at least one byte after the prefix. */

    while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
        l1--;
        l2--;
    }

    l1 -= offset;
    l2 -= offset;

    /* The Levenshtein algorithm itself, on the remaining middle parts.     */

    /*       s1=              */
    /*       ERIK             */
    /*                        */
    /*      01234             */
    /* s2=V 11234             */
    /*    E 21234             */
    /*    E 32234             */
    /*    N 43334 <- prev_row */
    /*    S 54444 <- curr_row */
    /*    T 65555             */
    /*    R 76566             */
    /*    A 87667             */

    /* Allocate both rows: l1+1 ints each (not l1+1 bytes). */

    prev_row = malloc(sizeof *prev_row * ((size_t)l1 + 1));
    curr_row = malloc(sizeof *curr_row * ((size_t)l1 + 1));

    if ((prev_row == NULL) || (curr_row == NULL)) {
        /* Release whichever allocation succeeded. */
        free(prev_row);
        free(curr_row);

        return -1;
    }

    /* Initialize the current row. */

    for (col=0; col<=l1; col++) {
        curr_row[col] = col;
    }

    for (row=1; row<=l2; row++) {
        /* The current row becomes the previous one.  Every entry of the new
         * current row is written below before it is read. */

        swap = prev_row;
        prev_row = curr_row;
        curr_row = swap;

        curr_row[0] = row;
        curr_row_min = row;

        for (col=1; col<=l1; col++) {
            /* Equal (cost=0) or substitution (cost=1). */

            curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);

            /* Insertion if it's cheaper than substitution. */

            if (prev_row[col]+1 < curr_row[col]) {
                curr_row[col] = prev_row[col]+1;
            }

            /* Deletion if it's cheaper than substitution. */

            if (curr_row[col-1]+1 < curr_row[col]) {
                curr_row[col] = curr_row[col-1]+1;
            }

            /* Keep track of the minimum value on this row. */

            if (curr_row[col] < curr_row_min) {
                curr_row_min = curr_row[col];
            }
        }

        /* Give up as soon as the row minimum reaches the threshold. */

        if (threshold > -1 && curr_row_min >= threshold) {
            free(prev_row);
            free(curr_row);

            return -1;
        }
    }

    /* The result is the last value on the last row. */

    result = curr_row[l1];

    free(prev_row);
    free(curr_row);

    return result;
}
@@ -0,0 +1,3 @@
1
+ int levenshtein(char * s1, int l1, char * s2, int l2, int threshold);
2
+ int levenshtein_distance(char *s, int n, char*t, int m, int noop);
3
+
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bktree
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Tyler McMullen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-13 00:00:00 -08:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: bktree in c with ruby bindings
17
+ email: tbmcmullen@gmail.com
18
+ executables: []
19
+
20
+ extensions:
21
+ - ext/extconf.rb
22
+ extra_rdoc_files:
23
+ - README
24
+ files:
25
+ - bktree.gemspec
26
+ - README
27
+ - ext/bktree.c
28
+ - ext/bktree_c.c
29
+ - ext/levenshtein.c
30
+ - ext/bktree.h
31
+ - ext/levenshtein.h
32
+ - ext/extconf.rb
33
+ has_rdoc: true
34
+ homepage: http://github.com/tyler/BkTree
35
+ licenses: []
36
+
37
+ post_install_message:
38
+ rdoc_options: []
39
+
40
+ require_paths:
41
+ - ext
42
+ required_ruby_version: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: "0"
47
+ version:
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: "0"
53
+ version:
54
+ requirements: []
55
+
56
+ rubyforge_project:
57
+ rubygems_version: 1.3.5
58
+ signing_key:
59
+ specification_version: 3
60
+ summary: bktree in c with ruby bindings
61
+ test_files: []
62
+