bktree 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +28 -0
- data/bktree.gemspec +15 -0
- data/ext/bktree.c +50 -0
- data/ext/bktree.h +47 -0
- data/ext/bktree_c.c +156 -0
- data/ext/extconf.rb +2 -0
- data/ext/levenshtein.c +155 -0
- data/ext/levenshtein.h +3 -0
- metadata +62 -0
data/README
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
BkTrees are pretty cool.
|
2
|
+
|
3
|
+
|
4
|
+
This one is extremely alpha though, so... You should probably look elsewhere. It's pretty fast though, and that's pretty cool.
|
5
|
+
|
6
|
+
|
7
|
+
Anyway, if you want to give it a shot just do something like this:
|
8
|
+
|
9
|
+
ruby extconf.rb && make
|
10
|
+
|
11
|
+
|
12
|
+
And using it is really simple. It only has two methods:
|
13
|
+
|
14
|
+
require 'bktree'
|
15
|
+
|
16
|
+
bk = BkTree.new
|
17
|
+
bk.add 'foo'
|
18
|
+
bk.add 'bar'
|
19
|
+
bk.add 'baz'
|
20
|
+
|
21
|
+
bk.query 'bor', 2 #=> [['baz', 2], ['bar', 1], ['foo', 2]]
|
22
|
+
|
23
|
+
|
24
|
+
Have fun.
|
25
|
+
|
26
|
+
|
27
|
+
Your pal,
|
28
|
+
Tyler McMullen
|
data/bktree.gemspec
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Gem specification for bktree: a BK-tree written in C with Ruby bindings.
# The native extension is built from ext/extconf.rb and loaded from ext/.
Gem::Specification.new do |spec|
  spec.name             = 'bktree'
  spec.version          = '0.0.1'
  spec.authors          = ["Tyler McMullen"]
  spec.date             = '2010-02-13'
  spec.description      = 'bktree in c with ruby bindings'
  spec.email            = 'tbmcmullen@gmail.com'
  spec.extensions       = ["ext/extconf.rb"]
  spec.extra_rdoc_files = ["README"]
  # Everything at the top level plus the C sources, headers and extconf.
  spec.files            = Dir['*'] + Dir['ext/*.{c,h,rb}']
  spec.homepage         = 'http://github.com/tyler/BkTree'
  spec.require_paths    = ["ext"]
  spec.rubygems_version = '1.3.5'
  spec.summary          = 'bktree in c with ruby bindings'
end
|
data/ext/bktree.c
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "bktree.h"
|
3
|
+
#include "levenshtein.h"
|
4
|
+
|
5
|
+
VALUE cBkTree;
|
6
|
+
|
7
|
+
/*
 * Allocation function for BkTree objects.
 *
 * Creates a native tree that uses levenshtein_distance as its distance
 * function and wraps it in a Ruby data object. bktree_destroy is registered
 * as the free function; no mark function is passed.
 *
 * Raises NoMemoryError if the native tree could not be created, so that a
 * NULL tree is never wrapped (add/query dereference the pointer directly).
 */
static VALUE rb_bktree_alloc(VALUE klass) {
    BKTree * bktree = bktree_new(levenshtein_distance);

    if (bktree == NULL)
        rb_raise(rb_eNoMemError, "failed to allocate BkTree");

    return Data_Wrap_Struct(klass, 0, bktree_destroy, bktree);
}
|
12
|
+
|
13
|
+
/*
 * BkTree#add(word) -> nil
 *
 * Inserts word into the tree. word is converted with StringValue, so
 * non-string arguments raise TypeError.
 *
 * Empty words and words longer than BKTREE_STRING_MAX bytes are ignored.
 * The length is checked here, before it is narrowed to the unsigned char
 * that bktree_add takes; otherwise a 257-byte word would wrap around and be
 * stored as a 1-byte word.
 */
static VALUE rb_bktree_add(VALUE self, VALUE word) {
    BKTree * bktree;
    long len;

    StringValue(word);
    Data_Get_Struct(self, BKTree, bktree);

    len = RSTRING_LEN(word);
    if (len <= 0 || len > BKTREE_STRING_MAX)
        return Qnil;

    bktree_add(bktree, RSTRING_PTR(word), (unsigned char) len);

    return Qnil;
}
|
23
|
+
|
24
|
+
/*
 * BkTree#query(word, max) -> [[string, distance], ...]
 *
 * Returns one [string, distance] pair for every stored word whose distance
 * to word is at most max, in the order of the list produced by bktree_query.
 *
 * word is converted with StringValue and max with NUM2INT, so invalid
 * arguments raise instead of being reinterpreted. A word whose byte length
 * does not fit the unsigned char taken by bktree_query raises ArgumentError
 * rather than being silently truncated.
 */
static VALUE rb_bktree_query(VALUE self, VALUE word, VALUE max) {
    BKTree * bktree;
    BKResult * result;
    VALUE result_out;
    long len;
    int max_distance;

    StringValue(word);
    max_distance = NUM2INT(max);
    Data_Get_Struct(self, BKTree, bktree);

    len = RSTRING_LEN(word);
    if ((long) (unsigned char) len != len)
        rb_raise(rb_eArgError, "query word is too long");

    result_out = rb_ary_new();

    /* An empty tree has no initialised root node to start the search from. */
    if (bktree->size == 0)
        return result_out;

    result = bktree_query(bktree, RSTRING_PTR(word), (unsigned char) len, max_distance);
    while(result) {
        /* Remember the successor first: the node is released below. */
        BKResult * next = result->next;
        VALUE result_node = rb_ary_new();

        rb_ary_push(result_node, rb_str_new(BKTREE_GET_STRING(bktree, result->string_offset), BKTREE_GET_STRING_LEN(bktree, result->string_offset)));
        rb_ary_push(result_node, INT2FIX(result->distance));
        rb_ary_push(result_out, result_node);

        /* Result nodes are malloc'ed by the query and owned by the caller. */
        free(result);
        result = next;
    }

    return result_out;
}
|
44
|
+
|
45
|
+
void Init_bktree() {
|
46
|
+
cBkTree = rb_define_class("BkTree", rb_cObject);
|
47
|
+
rb_define_alloc_func(cBkTree, rb_bktree_alloc);
|
48
|
+
rb_define_method(cBkTree, "add", rb_bktree_add, 1);
|
49
|
+
rb_define_method(cBkTree, "query", rb_bktree_query, 2);
|
50
|
+
}
|
data/ext/bktree.h
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
#ifndef BKTREE_H
#define BKTREE_H

#include <stddef.h> /* size_t */

/* Initial size, in bytes, of the string buffer (doubled on demand). */
#define BKTREE_STRINGS_SIZE 4096
/* Size, in bytes, of the node array allocated by bktree_new. */
#define BKTREE_TREE_SIZE 2147483648

/* Longest word accepted by bktree_add, and the number of child slots per node. */
#define BKTREE_STRING_MAX 16

#define BKTREE_OK 0
#define BKTREE_FAIL 1

/*
 * Strings are stored as: one length byte, the characters, a terminating NUL.
 * string_offset is the position of the length byte inside bktree->strings.
 */

/* Pointer to the first character of the string stored at string_offset. */
#define BKTREE_GET_STRING(bktree, string_offset) ((bktree)->strings + (string_offset) + 1)

/* Length byte of the string stored at string_offset. */
#define BKTREE_GET_STRING_LEN(bktree, string_offset) (*((bktree)->strings + (string_offset)))

/* One stored word plus its children, keyed by distance to this word. */
typedef struct {
    long string_offset;          /* offset of the word in BKTree.strings */
    int next[BKTREE_STRING_MAX]; /* next[d]: node index of the child at distance d, 0 = none */
} BKNode;

typedef struct {
    int size;                /* number of nodes written so far */

    BKNode * tree;           /* node array; element 0 is the root */
    BKNode * tree_cursor;    /* next free node */
    size_t tree_size;        /* bytes allocated for tree */

    char * strings;          /* packed string buffer */
    char * strings_cursor;   /* next free byte in strings */
    size_t strings_size;     /* bytes allocated for strings */

    // word1, len(word1), word2, len(word2), max
    int (* distance)(char *, int, char *, int, int);
} BKTree;

/* Singly linked list of query matches. */
struct BKResult_s {
    int distance;              /* distance between the query and the match */
    int string_offset;         /* offset of the matched word in BKTree.strings */
    struct BKResult_s * next;  /* next match, or NULL */
};
typedef struct BKResult_s BKResult;


BKTree * bktree_new(int (* distance)(char *, int, char *, int, int));
void bktree_destroy(BKTree * bktree);
BKNode * bktree_add(BKTree * bktree, char * string, unsigned char len);
void bktree_node_print(BKTree * bktree, BKNode * node);

/* NOTE(review): declared here, but no definition appears in bktree_c.c. */
void bktree_result_destroy(BKResult * result);
BKResult * bktree_query(BKTree * bktree, char * string, unsigned char len, int max);

#endif /* BKTREE_H */
|
data/ext/bktree_c.c
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include "bktree.h"
|
5
|
+
|
6
|
+
static int write_string(BKTree * bktree, char * string, unsigned char len);
|
7
|
+
static BKNode * write_new_record(BKTree * bktree, char * string, unsigned char len);
|
8
|
+
|
9
|
+
BKTree * bktree_new(int (* distance)(char *, int, char *, int, int)) {
|
10
|
+
BKTree * bktree = malloc(sizeof(BKTree));
|
11
|
+
|
12
|
+
bktree->tree_size = BKTREE_TREE_SIZE;
|
13
|
+
bktree->tree = malloc(bktree->tree_size);
|
14
|
+
bktree->tree_cursor = bktree->tree;
|
15
|
+
|
16
|
+
bktree->strings_size = BKTREE_STRINGS_SIZE;
|
17
|
+
bktree->strings = malloc(bktree->strings_size);
|
18
|
+
bktree->strings_cursor = bktree->strings;
|
19
|
+
|
20
|
+
bktree->size = 0;
|
21
|
+
|
22
|
+
bktree->distance = distance;
|
23
|
+
|
24
|
+
return bktree;
|
25
|
+
}
|
26
|
+
|
27
|
+
/*
 * Releases the node array, the string buffer and the tree header.
 * Passing NULL is allowed and does nothing, mirroring free().
 */
void bktree_destroy(BKTree * bktree) {
    if(bktree == NULL)
        return;

    free(bktree->tree);
    free(bktree->strings);
    free(bktree);
}
|
32
|
+
|
33
|
+
/*
 * Inserts the len-byte word `string` into the tree.
 *
 * Starting at the root, the distance d between the word and the current node
 * is computed; the search follows child slot next[d] until that slot is
 * empty, and the new node is linked there.
 *
 * Returns the new node, or NULL when nothing was inserted:
 *   - len is 0 or greater than BKTREE_STRING_MAX,
 *   - the word is already present (distance 0 to an existing node),
 *   - the distance is not a valid child slot index (negative, or
 *     >= BKTREE_STRING_MAX, which can happen for a BKTREE_STRING_MAX-byte
 *     word). Indexing next[] with it would write outside the node.
 */
BKNode * bktree_add(BKTree * bktree, char * string, unsigned char len) {
    if(len == 0 || len > BKTREE_STRING_MAX)
        return NULL;

    /* First word becomes the root (node index 0). */
    if(bktree->size == 0)
        return write_new_record(bktree, string, len);

    BKNode * node = bktree->tree;
    for(;;) {
        char * node_str = BKTREE_GET_STRING(bktree, node->string_offset);
        int node_str_len = BKTREE_GET_STRING_LEN(bktree, node->string_offset);

        int d = bktree->distance(node_str, node_str_len, string, len, -1);

        /* Duplicate word: nothing to add. */
        if(d == 0)
            return NULL;

        /* d must be a valid index into next[]. */
        if(d < 0 || d >= BKTREE_STRING_MAX)
            return NULL;

        if(node->next[d] > 0) {
            /* Slot taken: descend into that child. Index 0 is the root, so 0 means "empty". */
            node = bktree->tree + node->next[d];
            continue;
        }

        BKNode * new_node = write_new_record(bktree, string, len);
        if(new_node != NULL)
            node->next[d] = (int) (new_node - bktree->tree);
        return new_node;
    }
}
|
62
|
+
|
63
|
+
/*
 * Allocates a result entry for `node` found at `distance` and puts it in
 * front of the list `next`. Returns the new list head.
 */
BKResult * bktree_result_new(BKResult * next, BKNode * node, int distance) {
    BKResult * entry = malloc(sizeof(BKResult));

    entry->distance = distance;
    entry->string_offset = node->string_offset;
    entry->next = next;

    return entry;
}
|
71
|
+
|
72
|
+
/*
 * Recursive search step: tests `node` against the query word and descends
 * into the children whose slot index could still hold a match.
 *
 * With d the distance between the query and this node, the node itself is
 * reported when d <= max (prepended to *result_ptr), and the child slots
 * from max(1, d - max) up to min(d + max + 1, BKTREE_STRING_MAX - 1) are
 * searched.
 *
 * The slot range is computed without evaluating d + max + 1 when that could
 * exceed the slot count, so a very large max (e.g. INT_MAX) no longer causes
 * signed overflow.
 */
void inner_query(BKTree * bktree, BKNode * node, char * string, unsigned char len, int max, BKResult * * result_ptr) {

    int d = bktree->distance(BKTREE_GET_STRING(bktree, node->string_offset), BKTREE_GET_STRING_LEN(bktree, node->string_offset), string, len, -1);

    if(d <= max) {
        *result_ptr = bktree_result_new(*result_ptr, node, d);
    }

    /* For a negative max the slot range [d - max, d + max + 1] is always empty. */
    if(max < 0)
        return;

    int start = d - max < 1 ? 1 : d - max;

    /* Same as clamping d + max + 1 to the last slot, but overflow-free. */
    int stop;
    if(d >= BKTREE_STRING_MAX - 1 - max)
        stop = BKTREE_STRING_MAX - 1;
    else
        stop = d + max + 1;

    int i;
    for(i = start; i <= stop; i++) {
        if(node->next[i] > 0) {
            inner_query(bktree, bktree->tree + node->next[i], string, len, max, result_ptr);
        }
    }
}
|
92
|
+
|
93
|
+
/*
 * Finds every stored word whose distance to the len-byte word `string` is at
 * most max.
 *
 * Returns a malloc'ed singly linked list of matches (NULL when there are
 * none); the caller owns the list. An empty tree yields NULL without
 * touching the node array, whose root has not been initialised yet.
 */
BKResult * bktree_query(BKTree * bktree, char * string, unsigned char len, int max) {
    BKResult * results = NULL;

    if(bktree->size == 0)
        return NULL;

    inner_query(bktree, bktree->tree, string, len, max, &results);
    return results;
}
|
98
|
+
|
99
|
+
/*
 * Debug helper: prints a node's word, its index in the node array and all of
 * its child slots to stdout. Reports a NULL tree or node instead of
 * dereferencing it.
 */
void bktree_node_print(BKTree * bktree, BKNode * node) {
    if(bktree == NULL) {
        printf("bktree is null\n");
        return;
    }

    if(node == NULL) {
        printf("node is null\n");
        return;
    }

    printf("String: %s\n", BKTREE_GET_STRING(bktree, node->string_offset));
    /* The pointer difference is a ptrdiff_t; convert it to match %ld. */
    printf("Offset: %ld\n", (long) (node - bktree->tree));

    int i;
    for(i = 0; i < BKTREE_STRING_MAX; i++)
        printf("%d ", node->next[i]);

    printf("\n");
}
|
118
|
+
|
119
|
+
/*
 * Appends a word to the string buffer as: length byte, characters, NUL.
 * The buffer is doubled (new allocation, copy, free of the old one) for as
 * long as the record would reach the end of it.
 *
 * Returns the offset of the record's length byte within bktree->strings.
 */
static int write_string(BKTree * bktree, char * string, unsigned char len) {
    while(bktree->strings_cursor - bktree->strings + len + 2 >= bktree->strings_size) {
        int used = bktree->strings_cursor - bktree->strings;
        size_t doubled = bktree->strings_size * 2;
        char * grown = malloc(doubled);

        memcpy(grown, bktree->strings, bktree->strings_size);
        free(bktree->strings);

        /* Re-anchor the buffer and its cursor on the new allocation. */
        bktree->strings = grown;
        bktree->strings_size = doubled;
        bktree->strings_cursor = grown + used;
    }

    char * slot = bktree->strings_cursor;
    int record_offset = slot - bktree->strings;

    slot[0] = len;
    memcpy(slot + 1, string, len);
    slot[len + 1] = '\0';

    bktree->strings_cursor = slot + len + 2;

    return record_offset;
}
|
144
|
+
|
145
|
+
/*
 * Takes the next free node from the node array, stores the word in the
 * string buffer, clears all child slots and bumps the node count.
 * Returns the freshly written node.
 */
static BKNode * write_new_record(BKTree * bktree, char * string, unsigned char len) {
    BKNode * record = bktree->tree_cursor;
    bktree->tree_cursor = record + 1;

    record->string_offset = write_string(bktree, string, len);

    /* 0 in a child slot means "no child". */
    memset(record->next, 0, sizeof record->next);

    bktree->size += 1;

    return record;
}
|
data/ext/extconf.rb
ADDED
data/ext/levenshtein.c
ADDED
@@ -0,0 +1,155 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include "levenshtein.h"
|
4
|
+
|
5
|
+
/* Returns the smallest of the three values. */
static int minimum(int a, int b, int c)
{
    int smaller = (b < a) ? b : a;

    return (c < smaller) ? c : smaller;
}
|
15
|
+
|
16
|
+
/*
 * Levenshtein distance between s (n bytes) and t (m bytes), computed with a
 * full (m+1) x (n+1) table. The last parameter is not used.
 *
 * Returns the distance, or -1 when either string is empty.
 */
int levenshtein_distance(char *s, int n, char*t, int m, int noop)
{
    if (n == 0 || m == 0)
        return -1; /* a negative return value means that one or both strings are empty. */

    int *d = malloc((sizeof(int)) * (m + 1) * (n + 1));
    int cols = n + 1; /* one column per prefix of s */
    int rows = m + 1; /* one row per prefix of t */
    int row, col;

    /* Distance from the empty prefix: first row and first column. */
    for (col = 0; col < cols; col++)
        d[col] = col;
    for (row = 0; row < rows; row++)
        d[row * cols] = row;

    for (row = 1; row < rows; row++) {
        for (col = 1; col < cols; col++) {
            int cost = (s[col - 1] == t[row - 1]) ? 0 : 1;
            int above = d[(row - 1) * cols + col];
            int left = d[row * cols + col - 1];
            int diagonal = d[(row - 1) * cols + col - 1];

            d[row * cols + col] = minimum(above + 1, left + 1, diagonal + cost);
        }
    }

    /* Bottom-right cell holds the distance between the full strings. */
    int distance = d[rows * cols - 1];
    free(d);
    return distance;
}
|
50
|
+
|
51
|
+
/*
 * Levenshtein distance between s1 (l1 bytes) and s2 (l2 bytes), computed
 * with two rolling rows after trimming the common prefix and suffix.
 *
 * threshold: when greater than -1, the computation stops as soon as the
 * smallest value of a row reaches threshold, and -1 is returned.
 *
 * Returns the distance, or -1 on allocation failure or when the threshold
 * was reached.
 */
int levenshtein(char * s1, int l1, char * s2, int l2, int threshold) {
    int * prev_row, * curr_row;
    int col, row;
    int curr_row_min, result;
    int offset = 0;

    /* Skip the common prefix, never reading past the end of either string. */
    while (offset < l1 && offset < l2 && s1[offset] == s2[offset]) {
        offset++;
    }

    /* Skip the common suffix. */
    while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
        l1--;
        l2--;
    }

    l1 -= offset;
    l2 -= offset;

    /* Each row holds l1+1 ints (one per prefix of the trimmed s1). */
    prev_row = malloc(sizeof *prev_row * (size_t) (l1 + 1));
    curr_row = malloc(sizeof *curr_row * (size_t) (l1 + 1));

    if ((prev_row == NULL) || (curr_row == NULL)) {
        /* Release whichever allocation succeeded. */
        free(prev_row);
        free(curr_row);
        return -1;
    }

    /* Row 0: distance from the empty prefix of s2. */
    for (col = 0; col <= l1; col++) {
        curr_row[col] = col;
    }

    for (row = 1; row <= l2; row++) {
        /* The row just computed becomes the previous row. */
        memcpy(prev_row, curr_row, sizeof(int) * (size_t) (l1 + 1));

        curr_row[0] = row;
        curr_row_min = row;

        for (col = 1; col <= l1; col++) {
            /* Equal (cost=0) or substitution (cost=1). */
            curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);

            /* Insertion if it's cheaper than substitution. */
            if (prev_row[col]+1 < curr_row[col]) {
                curr_row[col] = prev_row[col]+1;
            }

            /* Deletion if it's cheaper than substitution. */
            if (curr_row[col-1]+1 < curr_row[col]) {
                curr_row[col] = curr_row[col-1]+1;
            }

            /* Keep track of the minimum value on this row. */
            if (curr_row[col] < curr_row_min) {
                curr_row_min = curr_row[col];
            }
        }

        /* Give up as soon as the whole row has reached the threshold. */
        if (threshold > -1 && curr_row_min >= threshold) {
            free(prev_row);
            free(curr_row);

            return -1;
        }
    }

    /* The result is the last value on the last row. */
    result = curr_row[l1];

    free(prev_row);
    free(curr_row);

    return result;
}
|
data/ext/levenshtein.h
ADDED
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bktree
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tyler McMullen
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-13 00:00:00 -08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: bktree in c with ruby bindings
|
17
|
+
email: tbmcmullen@gmail.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions:
|
21
|
+
- ext/extconf.rb
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README
|
24
|
+
files:
|
25
|
+
- bktree.gemspec
|
26
|
+
- README
|
27
|
+
- ext/bktree.c
|
28
|
+
- ext/bktree_c.c
|
29
|
+
- ext/levenshtein.c
|
30
|
+
- ext/bktree.h
|
31
|
+
- ext/levenshtein.h
|
32
|
+
- ext/extconf.rb
|
33
|
+
has_rdoc: true
|
34
|
+
homepage: http://github.com/tyler/BkTree
|
35
|
+
licenses: []
|
36
|
+
|
37
|
+
post_install_message:
|
38
|
+
rdoc_options: []
|
39
|
+
|
40
|
+
require_paths:
|
41
|
+
- ext
|
42
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: "0"
|
47
|
+
version:
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: "0"
|
53
|
+
version:
|
54
|
+
requirements: []
|
55
|
+
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 1.3.5
|
58
|
+
signing_key:
|
59
|
+
specification_version: 3
|
60
|
+
summary: bktree in c with ruby bindings
|
61
|
+
test_files: []
|
62
|
+
|