bktree 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +28 -0
- data/bktree.gemspec +15 -0
- data/ext/bktree.c +50 -0
- data/ext/bktree.h +47 -0
- data/ext/bktree_c.c +156 -0
- data/ext/extconf.rb +2 -0
- data/ext/levenshtein.c +155 -0
- data/ext/levenshtein.h +3 -0
- metadata +62 -0
data/README
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
BkTrees are pretty cool.
|
2
|
+
|
3
|
+
|
4
|
+
This one is extremely alpha though, so... You should probably look elsewhere. It's pretty fast though, and that's pretty cool.
|
5
|
+
|
6
|
+
|
7
|
+
Anyway, if you want to give it a shot just do something like this:
|
8
|
+
|
9
|
+
ruby extconf.rb && make
|
10
|
+
|
11
|
+
|
12
|
+
And using it is really simple. It only has two methods:
|
13
|
+
|
14
|
+
require 'bktree'
|
15
|
+
|
16
|
+
bk = BkTree.new
|
17
|
+
bk.add 'foo'
|
18
|
+
bk.add 'bar'
|
19
|
+
bk.add 'baz'
|
20
|
+
|
21
|
+
bk.query 'bor', 2 #=> [['baz', 2], ['bar', 1], ['foo', 2]]
|
22
|
+
|
23
|
+
|
24
|
+
Have fun.
|
25
|
+
|
26
|
+
|
27
|
+
Your pal,
|
28
|
+
Tyler McMullen
|
data/bktree.gemspec
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
Gem::Specification.new do |s|
|
2
|
+
s.name = 'bktree'
|
3
|
+
s.version = '0.0.1'
|
4
|
+
s.authors = ["Tyler McMullen"]
|
5
|
+
s.date = '2010-02-13'
|
6
|
+
s.description = 'bktree in c with ruby bindings'
|
7
|
+
s.email = 'tbmcmullen@gmail.com'
|
8
|
+
s.extensions = ["ext/extconf.rb"]
|
9
|
+
s.extra_rdoc_files = ["README"]
|
10
|
+
s.files = Dir['*'] + Dir['ext/*.{c,h,rb}']
|
11
|
+
s.homepage = 'http://github.com/tyler/BkTree'
|
12
|
+
s.require_paths = ["ext"]
|
13
|
+
s.rubygems_version = '1.3.5'
|
14
|
+
s.summary = 'bktree in c with ruby bindings'
|
15
|
+
end
|
data/ext/bktree.c
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#include <stdlib.h>

#include "ruby.h"
#include "bktree.h"
#include "levenshtein.h"
|
4
|
+
|
5
|
+
VALUE cBkTree;
|
6
|
+
|
7
|
+
/*
 * Allocation function for BkTree: builds a native tree that uses
 * levenshtein_distance as its metric and wraps it in a Ruby object whose
 * free function is bktree_destroy.
 *
 * Raises NoMemoryError instead of wrapping a tree that could not be fully
 * allocated, so later add/query calls never see a NULL or half-built tree.
 */
static VALUE rb_bktree_alloc(VALUE klass) {
  BKTree * bktree = bktree_new(levenshtein_distance);

  if(bktree == NULL)
    rb_raise(rb_eNoMemError, "failed to allocate BkTree");

  // The node and string buffers are allocated separately from the handle;
  // reject the tree if either of them is missing.
  if(bktree->tree == NULL || bktree->strings == NULL) {
    bktree_destroy(bktree);
    rb_raise(rb_eNoMemError, "failed to allocate BkTree storage");
  }

  return Data_Wrap_Struct(klass, 0, bktree_destroy, bktree);
}
|
12
|
+
|
13
|
+
/*
 * BkTree#add(word) -> nil
 *
 * Converts `word` to a String and inserts it into the wrapped tree.
 * Words that are empty or longer than BKTREE_STRING_MAX bytes are ignored.
 *
 * The length is range-checked as a long *before* it is narrowed to the
 * unsigned char that bktree_add() takes; otherwise a 259-byte word would
 * wrap around and be stored as its 3-byte prefix.
 */
static VALUE rb_bktree_add(VALUE self, VALUE word) {
  BKTree * bktree;
  long len;

  StringValue(word);
  Data_Get_Struct(self, BKTree, bktree);

  len = RSTRING_LEN(word);
  if(len < 1 || len > BKTREE_STRING_MAX)
    return Qnil;

  bktree_add(bktree, RSTRING_PTR(word), (unsigned char) len);

  return Qnil;
}
|
23
|
+
|
24
|
+
/*
 * BkTree#query(word, max) -> [[string, distance], ...]
 *
 * Returns every stored word whose distance to `word` is at most `max`,
 * each as a two-element array of the word and its distance.
 *
 * `max` is converted with NUM2INT, so a non-integer argument raises instead
 * of being reinterpreted bit-for-bit. An empty tree, an empty query, or a
 * query longer than 255 bytes (the most the unsigned char length parameter
 * of bktree_query() can express) yields an empty array.
 *
 * Each node of the native result list is released with free() as soon as it
 * has been copied into the Ruby array, so the list no longer leaks.
 */
static VALUE rb_bktree_query(VALUE self, VALUE word, VALUE max) {
  enum { QUERY_LEN_LIMIT = 255 };

  BKTree * bktree;
  BKResult * result;
  VALUE result_out;
  long len;
  int max_distance;

  StringValue(word);
  max_distance = NUM2INT(max);
  Data_Get_Struct(self, BKTree, bktree);

  result_out = rb_ary_new();

  len = RSTRING_LEN(word);
  if(bktree->size == 0 || len < 1 || len > QUERY_LEN_LIMIT)
    return result_out;

  result = bktree_query(bktree, RSTRING_PTR(word), (unsigned char) len, max_distance);
  while(result) {
    // Remember the successor first: this node is freed below.
    BKResult * following = result->next;
    VALUE result_node = rb_ary_new();

    rb_ary_push(result_node, rb_str_new(BKTREE_GET_STRING(bktree, result->string_offset), BKTREE_GET_STRING_LEN(bktree, result->string_offset)));
    rb_ary_push(result_node, INT2FIX(result->distance));
    rb_ary_push(result_out, result_node);

    free(result);
    result = following;
  }

  return result_out;
}
|
44
|
+
|
45
|
+
void Init_bktree() {
|
46
|
+
cBkTree = rb_define_class("BkTree", rb_cObject);
|
47
|
+
rb_define_alloc_func(cBkTree, rb_bktree_alloc);
|
48
|
+
rb_define_method(cBkTree, "add", rb_bktree_add, 1);
|
49
|
+
rb_define_method(cBkTree, "query", rb_bktree_query, 2);
|
50
|
+
}
|
data/ext/bktree.h
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
#define BKTREE_STRINGS_SIZE 4096
|
2
|
+
#define BKTREE_TREE_SIZE 2147483648
|
3
|
+
|
4
|
+
#define BKTREE_STRING_MAX 16
|
5
|
+
|
6
|
+
#define BKTREE_OK 0
|
7
|
+
#define BKTREE_FAIL 1
|
8
|
+
|
9
|
+
/*
 * Accessors for the packed string buffer. Each entry is laid out as
 * [length byte][string bytes][NUL], and `string_offset` is the position of
 * the length byte.
 *
 * Arguments and expansions are fully parenthesised so that expressions such
 * as `&tree` or `base + 1` can be passed safely.
 */

/* Pointer to the first character of the entry at `string_offset`. */
#define BKTREE_GET_STRING(bktree, string_offset) ((bktree)->strings + (string_offset) + 1)

/* Length byte of the entry at `string_offset`. */
#define BKTREE_GET_STRING_LEN(bktree, string_offset) (*((bktree)->strings + (string_offset)))
|
12
|
+
|
13
|
+
typedef struct {
|
14
|
+
long string_offset;
|
15
|
+
int next[BKTREE_STRING_MAX];
|
16
|
+
} BKNode;
|
17
|
+
|
18
|
+
typedef struct {
|
19
|
+
int size;
|
20
|
+
|
21
|
+
BKNode * tree;
|
22
|
+
BKNode * tree_cursor;
|
23
|
+
size_t tree_size;
|
24
|
+
|
25
|
+
char * strings;
|
26
|
+
char * strings_cursor;
|
27
|
+
size_t strings_size;
|
28
|
+
|
29
|
+
// word1, len(word1), word2, len(word2), max
|
30
|
+
int (* distance)(char *, int, char *, int, int);
|
31
|
+
} BKTree;
|
32
|
+
|
33
|
+
/*
 * One query match, chained into a singly linked list.
 *
 * distance       distance between the query and the matched string
 * string_offset  position of the matched entry in BKTree.strings
 * next           following match, or NULL at the end of the list
 */
typedef struct BKResult_s BKResult;

struct BKResult_s {
  int distance;
  int string_offset;
  struct BKResult_s * next;
};
|
39
|
+
|
40
|
+
|
41
|
+
BKTree * bktree_new(int (* distance)(char *, int, char *, int, int));
|
42
|
+
void bktree_destroy(BKTree * bktree);
|
43
|
+
BKNode * bktree_add(BKTree * bktree, char * string, unsigned char len);
|
44
|
+
void bktree_node_print(BKTree * bktree, BKNode * node);
|
45
|
+
|
46
|
+
void bktree_result_destroy(BKResult * result);
|
47
|
+
BKResult * bktree_query(BKTree * bktree, char * string, unsigned char len, int max);
|
data/ext/bktree_c.c
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include "bktree.h"
|
5
|
+
|
6
|
+
static int write_string(BKTree * bktree, char * string, unsigned char len);
|
7
|
+
static BKNode * write_new_record(BKTree * bktree, char * string, unsigned char len);
|
8
|
+
|
9
|
+
BKTree * bktree_new(int (* distance)(char *, int, char *, int, int)) {
|
10
|
+
BKTree * bktree = malloc(sizeof(BKTree));
|
11
|
+
|
12
|
+
bktree->tree_size = BKTREE_TREE_SIZE;
|
13
|
+
bktree->tree = malloc(bktree->tree_size);
|
14
|
+
bktree->tree_cursor = bktree->tree;
|
15
|
+
|
16
|
+
bktree->strings_size = BKTREE_STRINGS_SIZE;
|
17
|
+
bktree->strings = malloc(bktree->strings_size);
|
18
|
+
bktree->strings_cursor = bktree->strings;
|
19
|
+
|
20
|
+
bktree->size = 0;
|
21
|
+
|
22
|
+
bktree->distance = distance;
|
23
|
+
|
24
|
+
return bktree;
|
25
|
+
}
|
26
|
+
|
27
|
+
/*
 * Releases the node array, the string buffer and the handle itself.
 * Passing NULL is allowed and does nothing.
 */
void bktree_destroy(BKTree * bktree) {
  if(bktree == NULL)
    return;

  free(bktree->tree);
  free(bktree->strings);
  free(bktree);
}
|
32
|
+
|
33
|
+
/*
 * Inserts `string` (`len` bytes) into the tree.
 *
 * Returns the newly written node, or NULL when nothing was inserted:
 *   - `len` is 0 or greater than BKTREE_STRING_MAX,
 *   - the string is already present (distance 0 to an existing node),
 *   - the node array is full,
 *   - the distance callback returns a value that cannot index a node's
 *     child table (negative, or >= BKTREE_STRING_MAX).
 */
BKNode * bktree_add(BKTree * bktree, char * string, unsigned char len) {
  if(bktree == NULL || string == NULL)
    return NULL;

  if(len > BKTREE_STRING_MAX || len == 0)
    return NULL;

  // Refuse to write past the end of the preallocated node array.
  if((size_t)(bktree->tree_cursor - bktree->tree) >= bktree->tree_size / sizeof(BKNode))
    return NULL;

  if(bktree->size == 0)
    return write_new_record(bktree, string, len);

  BKNode * node = bktree->tree;
  for(;;) {
    char * node_str = BKTREE_GET_STRING(bktree, node->string_offset);
    int node_str_len = BKTREE_GET_STRING_LEN(bktree, node->string_offset);

    int d = bktree->distance(node_str, node_str_len, string, len, -1);

    // Distance 0 means the string is already stored.
    if(d == 0)
      return NULL;

    // next[] has BKTREE_STRING_MAX slots; anything else would index outside it.
    if(d < 0 || d >= BKTREE_STRING_MAX)
      return NULL;

    if(node->next[d] > 0) {
      // A child already sits at this distance: descend into it.
      node = bktree->tree + node->next[d];
      continue;
    }

    BKNode * new_node = write_new_record(bktree, string, len);
    node->next[d] = (int)(new_node - bktree->tree);
    return new_node;
  }
}
|
62
|
+
|
63
|
+
/*
 * Prepends a match to a result list.
 *
 * Allocates a BKResult recording `distance` and the string offset of `node`,
 * links it in front of `next`, and returns it as the new head.
 *
 * If the allocation fails the match is dropped and `next` is returned
 * unchanged, so the matches collected so far stay reachable.
 */
BKResult * bktree_result_new(BKResult * next, BKNode * node, int distance) {
  BKResult * result = malloc(sizeof *result);
  if(result == NULL)
    return next;

  result->next = next;
  result->distance = distance;
  result->string_offset = node->string_offset;

  return result;
}
|
71
|
+
|
72
|
+
/*
 * Recursive search step: tests `node` against the query and descends into
 * every child that could still contain a match.
 *
 * A node at distance d from the query is prepended to *result_ptr when
 * d <= max. Only children stored at distances d-max .. d+max are visited;
 * by the triangle inequality no other subtree can hold a string within
 * `max` of the query.
 *
 * A negative `max` matches nothing. A negative distance (the callback's
 * failure value) ends the search of this subtree instead of being reported
 * as a match.
 */
void inner_query(BKTree * bktree, BKNode * node, char * string, unsigned char len, int max, BKResult * * result_ptr) {
  int d, first, last, i;

  if(max < 0)
    return;

  d = bktree->distance(BKTREE_GET_STRING(bktree, node->string_offset), BKTREE_GET_STRING_LEN(bktree, node->string_offset), string, len, -1);
  if(d < 0)
    return;

  if(d <= max) {
    *result_ptr = bktree_result_new(*result_ptr, node, d);
  }

  // Clamp the child range to the valid slots 1 .. BKTREE_STRING_MAX-1.
  // The upper bound is compared before d + max is formed so that a huge
  // `max` cannot overflow.
  first = 1;
  last = BKTREE_STRING_MAX - 1;
  if(d - max > first)
    first = d - max;
  if(max < last && d < last - max)
    last = d + max;

  for(i = first; i <= last; i++) {
    if(node->next[i] > 0) {
      inner_query(bktree, bktree->tree + node->next[i], string, len, max, result_ptr);
    }
  }
}
|
92
|
+
|
93
|
+
/*
 * Finds every stored string within `max` of `string` (`len` bytes).
 *
 * Returns the head of a linked list of matches built by inner_query(), or
 * NULL when there are none. The caller owns the returned list.
 *
 * An empty tree is handled here: its root slot has never been written, so
 * it must not be handed to inner_query().
 */
BKResult * bktree_query(BKTree * bktree, char * string, unsigned char len, int max) {
  BKResult * results = NULL;

  if(bktree == NULL || string == NULL || bktree->size == 0)
    return NULL;

  inner_query(bktree, bktree->tree, string, len, max, &results);
  return results;
}
|
98
|
+
|
99
|
+
/*
 * Debug helper: prints a node's string, its index in the node array and
 * its child table to stdout. Reports a NULL tree or node instead of
 * dereferencing it.
 */
void bktree_node_print(BKTree * bktree, BKNode * node) {
  if(bktree == NULL) {
    printf("bktree is null\n");
    return;
  }

  if(node == NULL) {
    printf("node is null\n");
    return;
  }

  printf("String: %s\n", BKTREE_GET_STRING(bktree, node->string_offset));

  // A pointer difference is a ptrdiff_t, whose conversion is %td, not %ld.
  printf("Offset: %td\n", node - bktree->tree);

  int i;
  for(i = 0; i < BKTREE_STRING_MAX; i++)
    printf("%d ", node->next[i]);

  printf("\n");
}
|
118
|
+
|
119
|
+
/*
 * Appends one entry to the string buffer and returns the offset at which
 * it starts.
 *
 * An entry is [length byte][`len` string bytes][NUL]. While the entry would
 * not fit, the buffer is replaced by one twice as large.
 *
 * NOTE(review): the result of malloc() is not checked here.
 */
static int write_string(BKTree * bktree, char * string, unsigned char len) {
  size_t used = (size_t)(bktree->strings_cursor - bktree->strings);
  size_t needed = used + len + 2;

  while(needed >= bktree->strings_size) {
    size_t doubled = bktree->strings_size * 2;
    char * grown = malloc(doubled);

    memcpy(grown, bktree->strings, bktree->strings_size);
    free(bktree->strings);

    bktree->strings = grown;
    bktree->strings_size = doubled;
  }

  // Recompute the write position: the buffer may have moved.
  char * slot = bktree->strings + used;

  slot[0] = len;
  memcpy(slot + 1, string, len);
  slot[len + 1] = '\0';

  bktree->strings_cursor = slot + len + 2;

  return (int) used;
}
|
144
|
+
|
145
|
+
/*
 * Claims the next free node slot, stores `string` in the string buffer,
 * clears the node's child table, bumps the node count and returns the node.
 */
static BKNode * write_new_record(BKTree * bktree, char * string, unsigned char len) {
  BKNode * record = bktree->tree_cursor;
  bktree->tree_cursor = record + 1;

  record->string_offset = write_string(bktree, string, len);

  // 0 marks "no child" in every slot.
  memset(record->next, 0, sizeof record->next);

  bktree->size += 1;

  return record;
}
|
data/ext/extconf.rb
ADDED
data/ext/levenshtein.c
ADDED
@@ -0,0 +1,155 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include "levenshtein.h"
|
4
|
+
|
5
|
+
/* Returns the smallest of the three values. */
static int minimum(int a,int b,int c)
{
  int smaller = (b < a) ? b : a;

  return (c < smaller) ? c : smaller;
}
|
15
|
+
|
16
|
+
/*
 * Computes the Levenshtein distance between s (n bytes) and t (m bytes).
 *
 * `noop` is accepted only so the function matches the tree's distance
 * callback signature; it is not used.
 *
 * Returns the distance, or -1 when either string is empty or when the
 * working row cannot be allocated.
 *
 * Only one row of the edit matrix is kept, so the work buffer is n+1 ints
 * rather than (n+1)*(m+1).
 */
int levenshtein_distance(char *s, int n, char*t, int m, int noop)
{
  size_t cells;
  int *row;
  int i, j, distance;

  (void) noop;

  if (n <= 0 || m <= 0)
    return -1; //a negative return value means that one or both strings are empty.

  cells = (size_t) n + 1;
  if (cells > (size_t) -1 / sizeof *row)
    return -1;

  row = malloc(cells * sizeof *row);
  if (row == NULL)
    return -1;

  // Row 0: turning the empty prefix of t into s[0..i) costs i insertions.
  for (i = 0; i <= n; i++)
    row[i] = i;

  for (j = 1; j <= m; j++) {
    int diagonal = row[0];   // value of cell (j-1, i-1)

    row[0] = j;
    for (i = 1; i <= n; i++) {
      int above = row[i];    // value of cell (j-1, i)
      int best = diagonal + (s[i - 1] == t[j - 1] ? 0 : 1);

      if (above + 1 < best)
        best = above + 1;
      if (row[i - 1] + 1 < best)
        best = row[i - 1] + 1;

      row[i] = best;
      diagonal = above;
    }
  }

  distance = row[n];
  free(row);

  return distance;
}
|
50
|
+
|
51
|
+
/*
 * Levenshtein distance between s1 (l1 bytes) and s2 (l2 bytes) with an
 * optional cut-off.
 *
 * The common prefix and common suffix are skipped first, then the distance
 * is computed over the remainder using two rows of the edit matrix.
 *
 * threshold  when greater than -1, the function gives up and returns -1 as
 *            soon as the smallest value in a row reaches the threshold
 *
 * Returns the distance, or -1 if the threshold was reached, a length is
 * negative, or memory could not be allocated.
 */
int levenshtein(char * s1, int l1, char * s2, int l2, int threshold) {
  int * prev_row, * curr_row, * swap;
  int col, row;
  int curr_row_min, result;
  int offset = 0;
  size_t width;

  if (l1 < 0 || l2 < 0) {
    return -1;
  }

  /* Skip the common prefix, never reading past the end of either string. */

  while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
    offset++;
  }

  /* Skip the common postfix. */

  while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
    l1--;
    l2--;
  }

  l1 -= offset;
  l2 -= offset;

  /* Allocate both rows: l1+1 ints each (not l1+1 bytes). */

  width = (size_t)l1 + 1;
  if (width > (size_t)-1 / sizeof *prev_row) {
    return -1;
  }

  prev_row = malloc(width * sizeof *prev_row);
  curr_row = malloc(width * sizeof *curr_row);

  if ((prev_row == NULL) || (curr_row == NULL)) {
    /* free(NULL) is a no-op, so the row that did succeed is released. */
    free(prev_row);
    free(curr_row);

    return -1;
  }

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = col;
  }

  for (row=1; row<=l2; row++) {
    /* The row just finished becomes the previous row; every cell of the
       recycled buffer is overwritten below. */

    swap = prev_row;
    prev_row = curr_row;
    curr_row = swap;

    curr_row[0] = row;
    curr_row_min = row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      int best = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < best) {
        best = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < best) {
        best = curr_row[col-1]+1;
      }

      curr_row[col] = best;

      /* Keep track of the minimum value on this row. */

      if (best < curr_row_min) {
        curr_row_min = best;
      }
    }

    /* Give up as soon as the whole row has reached the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      free(prev_row);
      free(curr_row);

      return -1;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  free(prev_row);
  free(curr_row);

  return result;
}
|
data/ext/levenshtein.h
ADDED
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bktree
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tyler McMullen
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-13 00:00:00 -08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: bktree in c with ruby bindings
|
17
|
+
email: tbmcmullen@gmail.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions:
|
21
|
+
- ext/extconf.rb
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README
|
24
|
+
files:
|
25
|
+
- bktree.gemspec
|
26
|
+
- README
|
27
|
+
- ext/bktree.c
|
28
|
+
- ext/bktree_c.c
|
29
|
+
- ext/levenshtein.c
|
30
|
+
- ext/bktree.h
|
31
|
+
- ext/levenshtein.h
|
32
|
+
- ext/extconf.rb
|
33
|
+
has_rdoc: true
|
34
|
+
homepage: http://github.com/tyler/BkTree
|
35
|
+
licenses: []
|
36
|
+
|
37
|
+
post_install_message:
|
38
|
+
rdoc_options: []
|
39
|
+
|
40
|
+
require_paths:
|
41
|
+
- ext
|
42
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: "0"
|
47
|
+
version:
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: "0"
|
53
|
+
version:
|
54
|
+
requirements: []
|
55
|
+
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 1.3.5
|
58
|
+
signing_key:
|
59
|
+
specification_version: 3
|
60
|
+
summary: bktree in c with ruby bindings
|
61
|
+
test_files: []
|
62
|
+
|