RubyTrie 1.1 → 2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +3 -1
- data/README +10 -2
- data/ext/trie/levenshtein_distance.c +77 -0
- data/ext/trie/levenshtein_distance.h +13 -0
- data/ext/trie/otrie2.c +161 -0
- data/ext/trie/otrie2.h +41 -0
- data/ext/trie/ruby_trie.c +199 -0
- data/ext/trie/t.rb +6 -4
- data/test/trie_test.rb +8 -7
- metadata +10 -6
- data/ext/trie/trie.c +0 -408
data/ChangeLog
CHANGED
@@ -1,2 +1,4 @@
|
|
1
1
|
1.0 added children and each methods
|
2
|
-
1.1 added levenshtein algorithm implementation - inspired from: http://stevehanov.ca/blog/index.php?id=114
|
2
|
+
1.1 added levenshtein algorithm implementation - inspired from: http://stevehanov.ca/blog/index.php?id=114
|
3
|
+
2.0 completely rewrote the trie implementation to use a more optimized data structure. For /usr/share/dict/web2 it reduces memory usage from about 40Mb to about 6Mb. The levenshtein search now takes a block and passes in the value, but it can easily be extended to return an array. It also has a glitch where some values are, for some reason, just false... :|
|
4
|
+
2.1 bug fixes, fixing levenshtein search
|
data/README
CHANGED
@@ -22,6 +22,11 @@ t["goes"] = 2
|
|
22
22
|
t["gone"] = 3
|
23
23
|
t["other"] = 4
|
24
24
|
|
25
|
+
By default, assigning multiple values to the same key appends them to an array.
|
26
|
+
t['a'] = 1
|
27
|
+
t['a'] = 2
|
28
|
+
t['a'] will then return [1,2]
|
29
|
+
|
25
30
|
t.children("go") => [1,2,3]
|
26
31
|
|
27
32
|
t.each("partial key") will yield to the given block all values that are matched by the partial key
|
@@ -29,8 +34,11 @@ t.each("go") {|v| puts v } => prints 1, 2 & 3
|
|
29
34
|
|
30
35
|
Levenshtein search
|
31
36
|
|
32
|
-
t.levenshtein_search("go", 2) - will
|
33
|
-
|
37
|
+
t.levenshtein_search("go", 2) {|value| puts value} - will print the values of all words in the trie that are within an edit distance of 2 from the search word
|
38
|
+
Eg.
|
39
|
+
1
|
40
|
+
2
|
41
|
+
3
|
34
42
|
|
35
43
|
== Bugs
|
36
44
|
|
@@ -0,0 +1,77 @@
|
|
1
|
+
/*
|
2
|
+
* levenshtein_distance.c
|
3
|
+
* otrie
|
4
|
+
*
|
5
|
+
* Created by Petrica Ghiurca on 18.03.2011.
|
6
|
+
* Copyright 2011 __MyCompanyName__. All rights reserved.
|
7
|
+
*
|
8
|
+
*/
|
9
|
+
|
10
|
+
#include <stdlib.h>
|
11
|
+
#include <string.h> /* for memcmp, memmove */
|
12
|
+
#include <stdio.h>
|
13
|
+
|
14
|
+
#include "otrie2.h"
|
15
|
+
#include "levenshtein_distance.h"
|
16
|
+
|
17
|
+
|
18
|
+
void recursive_levenshtein_search(Node* trie, int node_offset, levenshtein_distance_callback cb, int* prev_line, int max_dist, const char* word, int word_length);
|
19
|
+
|
20
|
+
/*
 * Search the trie for every stored key whose edit distance from `word`
 * is at most `max_distance`.
 *
 * trie         - root node of the trie; its next_sibling chain is where the
 *                search starts.
 * word         - NUL-terminated search word.
 * max_distance - largest edit distance that still counts as a match.
 * cb           - invoked once per match with the matching node and its
 *                edit distance.
 *
 * Does nothing when the trie is empty or any pointer argument is NULL.
 */
void levenshtein_distance(Node* trie, const char* word, int max_distance, levenshtein_distance_callback cb) {
    size_t word_length;
    size_t i;

    /* An empty trie has no next_sibling; the recursive search would
     * dereference that NULL pointer, so bail out first. */
    if (trie == NULL || word == NULL || cb == NULL || trie->next_sibling == NULL) {
        return;
    }

    /* Measure the word once instead of on every loop iteration. */
    word_length = strlen(word);

    /* First row of the distance matrix: turning the empty prefix into the
     * first i characters of the word costs i insertions. */
    int first_line[word_length + 1];
    for (i = 0; i <= word_length; i++) {
        first_line[i] = (int)i;
    }

    recursive_levenshtein_search(trie->next_sibling, 0, cb, first_line, max_distance, word, (int)word_length);
}
|
28
|
+
|
29
|
+
|
30
|
+
/*
 * Return the smallest of the first `len` entries of `numbers`.
 * numbers[0] is always read, so the array must hold at least one element.
 */
int minimum(int* numbers, int len) {
    int smallest = numbers[0];
    int idx = 1;

    while (idx < len) {
        smallest = (numbers[idx] < smallest) ? numbers[idx] : smallest;
        idx++;
    }
    return smallest;
}
|
38
|
+
|
39
|
+
/* Return the smallest of the three integers a, b and c. */
int min3(int a, int b, int c) {
    int smaller_of_ab = (b < a) ? b : a;

    return (c < smaller_of_ab) ? c : smaller_of_ab;
}
|
45
|
+
|
46
|
+
/*
 * One step of the trie-based Levenshtein search.
 *
 * Processes the character trie->data[node_offset]: computes the next row of
 * the edit-distance matrix from `prev_line`, reports the node through `cb`
 * when this character ends a stored key within `max_dist`, and then walks on.
 *
 * trie        - node being examined.
 * node_offset - index into trie->data of the character being processed.
 * cb          - called with (node, distance) for every match.
 * prev_line   - matrix row for the prefix that precedes this character
 *               (word_length + 1 entries).
 * max_dist    - largest distance that still counts as a match.
 * word        - the search word; word_length is its length.
 *
 * Traversal rules:
 *  - going deeper (rest of this node's data, then its children) is pruned
 *    when every entry of the current row already exceeds max_dist;
 *  - children are entered only after the LAST character of the node's data,
 *    because they extend the node's complete string;
 *  - siblings are alternatives for the same position, so they are visited
 *    with `prev_line`, only from offset 0, and regardless of whether this
 *    node was pruned.
 */
void recursive_levenshtein_search(Node* trie, int node_offset, levenshtein_distance_callback cb, int* prev_line, int max_dist, const char* word, int word_length) {
    int cur_line[word_length + 1];
    int i, insert_cost, replace_cost, delete_cost;
    size_t data_len;
    int at_node_end;

    if (trie == NULL || trie->data == NULL) {
        return;
    }

    cur_line[0] = prev_line[0] + 1;

    for (i = 1; i < word_length + 1; i++) {
        insert_cost = cur_line[i - 1] + 1;
        delete_cost = prev_line[i] + 1;
        if (trie->data[node_offset] != word[i - 1]) {
            replace_cost = prev_line[i - 1] + 1;
        } else {
            replace_cost = prev_line[i - 1];
        }
        cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
    }

    data_len = strlen(trie->data);
    at_node_end = (data_len == (size_t)node_offset + 1);

    /* A key ends here only on the node's last character. */
    if (at_node_end && cur_line[word_length] <= max_dist && trie->value != Qnil) {
        cb(trie, cur_line[word_length]);
    }

    /* Descend only while some entry of the row is still within range. */
    if (minimum(cur_line, word_length + 1) <= max_dist) {
        if (data_len > (size_t)node_offset + 1) {
            recursive_levenshtein_search(trie, node_offset + 1, cb, cur_line, max_dist, word, word_length);
        } else if (at_node_end && trie->first_child != NULL) {
            recursive_levenshtein_search(trie->first_child, 0, cb, cur_line, max_dist, word, word_length);
        }
    }

    /* Siblings share this node's parent prefix, so they use prev_line and
     * are visited exactly once (from offset 0), even if this node was pruned. */
    if (node_offset == 0 && trie->next_sibling != NULL) {
        recursive_levenshtein_search(trie->next_sibling, 0, cb, prev_line, max_dist, word, word_length);
    }
}
|
@@ -0,0 +1,13 @@
|
|
1
|
+
/*
|
2
|
+
* levenshtein_distance.h
|
3
|
+
* otrie
|
4
|
+
*
|
5
|
+
* Created by Petrica Ghiurca on 18.03.2011.
|
6
|
+
* Copyright 2011 __MyCompanyName__. All rights reserved.
|
7
|
+
*
|
8
|
+
*/
|
9
|
+
|
10
|
+
#ifndef LEVENSHTEIN_DISTANCE_H
#define LEVENSHTEIN_DISTANCE_H

#include "otrie2.h"

/*
 * Callback invoked for every match found by levenshtein_distance().
 * `node` is the matching trie node; `length` receives the edit distance
 * computed for it.
 */
typedef void (*levenshtein_distance_callback)(Node* node, int length);

/*
 * Call `cb` for every key stored in `trie` whose edit distance from `word`
 * is at most `max_distance`.
 */
void levenshtein_distance(Node* trie, const char* word, int max_distance, levenshtein_distance_callback cb);

#endif /* LEVENSHTEIN_DISTANCE_H */
|
data/ext/trie/otrie2.c
ADDED
@@ -0,0 +1,161 @@
|
|
1
|
+
/*
|
2
|
+
* otrie2.c
|
3
|
+
* otrie
|
4
|
+
*
|
5
|
+
* Created by Petrica Ghiurca on 18.03.2011.
|
6
|
+
* Copyright 2011 __MyCompanyName__. All rights reserved.
|
7
|
+
*
|
8
|
+
*/
|
9
|
+
|
10
|
+
#include <stdlib.h> /* for malloc, free */
|
11
|
+
#include <string.h> /* for memcmp, memmove */
|
12
|
+
#include <stdio.h>
|
13
|
+
|
14
|
+
#include "otrie2.h"
|
15
|
+
|
16
|
+
Node* new_node() {
|
17
|
+
Node *node = malloc(sizeof(Node));
|
18
|
+
memset(node, 0, sizeof(Node));
|
19
|
+
return node;
|
20
|
+
}
|
21
|
+
|
22
|
+
Node* new_node_string_len(const char* string, const int len) {
|
23
|
+
Node *node = new_node();
|
24
|
+
node->data = malloc(len+1);
|
25
|
+
strncpy(node->data, string, len);
|
26
|
+
node->data[len] = 0;
|
27
|
+
return node;
|
28
|
+
}
|
29
|
+
|
30
|
+
Node* new_node_string(const char* string) {
|
31
|
+
Node *node = new_node();
|
32
|
+
int len = strlen(string);
|
33
|
+
node->data = malloc(len+1);
|
34
|
+
strcpy(node->data, string);
|
35
|
+
return node;
|
36
|
+
}
|
37
|
+
|
38
|
+
/*
 * Replace the node's data with a NUL-terminated copy of at most `len`
 * characters of `string`.
 *
 * The copy is completed before the old buffer is released, so `string` may
 * point into the node's current data.
 */
void node_update_data(Node* node, const char* string, int len) {
    char *previous = node->data;
    char *replacement = malloc(len + 1);

    strncpy(replacement, string, len);
    replacement[len] = 0;

    node->data = replacement;
    free(previous);
}
|
45
|
+
|
46
|
+
/*
 * Release `node`, everything below it (first_child subtree) and everything
 * after it on its sibling chain, including each node's data buffer.
 * `node` must not be NULL.
 */
void free_node(Node *node) {
    Node *current = node;

    do {
        /* Remember the successor before the node itself is released. */
        Node *following = current->next_sibling;

        if (current->first_child) {
            free_node(current->first_child);
        }
        free(current->data);
        free(current);

        current = following;
    } while (current);
}
|
52
|
+
|
53
|
+
/*
 * Store `value` under the key `string` in the trie rooted at `node`,
 * creating or splitting nodes as needed.
 *
 * The cursor lives on the stack, so nothing is leaked per insertion.
 *
 * When the key ends in the middle of an existing node's data (for example
 * inserting "ab" after "abc"), that node is split first: the remainder of
 * its data, its value and its children move into a new child, so the longer
 * key keeps its value instead of being overwritten.
 *
 * Returns silently without storing anything if a node allocation fails.
 */
void node_insert(Node* node, const char* string, const VALUE value) {
    Pos cur;
    size_t len = strlen(string);
    size_t i;

    cur.node = node;
    cur.offset = 0;

    for (i = 0; i < len; i++) {
        pos_next(&cur, string + i, true);
        if (cur.node == NULL) {
            return;
        }
    }

    /* Key ends before the end of this node's data: split off the tail. */
    if (cur.node->data != NULL && (size_t)cur.offset + 1 < strlen(cur.node->data)) {
        Node *tail = new_node_string(cur.node->data + cur.offset + 1);

        if (tail == NULL) {
            return;
        }
        tail->value = cur.node->value;
        tail->first_child = cur.node->first_child;
        cur.node->first_child = tail;
        node_update_data(cur.node, cur.node->data, cur.offset + 1);
    }

    cur.node->value = value;
}
|
62
|
+
|
63
|
+
/*
 * Look up the node at which the key `string` ends, starting from the root
 * `this`.
 *
 * Returns the node only when the key ends exactly on the last character of
 * that node's data; returns NULL when the key is absent, ends in the middle
 * of a node, or is empty (the root carries no data to match against).
 *
 * The cursor lives on the stack, so no memory is leaked per lookup.
 */
Node* node_find(Node* this, const char* string) {
    Pos cur;
    size_t len = strlen(string);
    size_t i;

    cur.node = this;
    cur.offset = 0;

    for (i = 0; i < len; i++) {
        pos_next(&cur, string + i, false);
        if (cur.node == NULL) {
            return NULL;
        }
    }

    /* Still on a data-less node (empty key): there is nothing to measure. */
    if (cur.node->data == NULL) {
        return NULL;
    }

    if (strlen(cur.node->data) == (size_t)cur.offset + 1) {
        return cur.node;
    }
    return NULL;
}
|
75
|
+
|
76
|
+
/*
 * Allocate a cursor pointing at character `offset` of `node`'s data.
 *
 * Returns NULL if the allocation fails. The caller owns the cursor and
 * releases it with free().
 */
Pos* new_pos(Node *node, int offset) {
    Pos *pos = malloc(sizeof *pos);

    if (pos == NULL) {
        return NULL;
    }
    pos->node = node;
    pos->offset = offset;
    return pos;
}
|
82
|
+
|
83
|
+
/*
 * Search a node list for the entry whose data starts with string[0].
 *
 * The list searched is the cursor node's first_child chain when `down` is
 * true and its next_sibling chain otherwise. If no entry matches and
 * `insert` is true, a new node holding a copy of `string` is appended to the
 * end of that list and returned; without `insert` the result is NULL.
 */
Node* pos_find_or_create_child(Pos* this, const char* string, bool down, bool insert) {
    Node **head = down ? &this->node->first_child : &this->node->next_sibling;
    Node *candidate = *head;
    Node *previous = NULL;

    while (candidate != NULL && candidate->data[0] != string[0]) {
        previous = candidate;
        candidate = candidate->next_sibling;
    }

    if (candidate != NULL || !insert) {
        return candidate;
    }

    candidate = new_node_string(string);
    if (*head != NULL) {
        /* Non-empty list: the loop ran to its end, so `previous` is the tail. */
        previous->next_sibling = candidate;
    } else {
        *head = candidate;
    }
    return candidate;
}
|
111
|
+
|
112
|
+
/*
 * Advance the cursor by one key character, string[0].
 *
 * - On a node without data (the root), the match is looked up in, or
 *   appended to, the node's next_sibling chain.
 * - At the last character of a node's data, the match is looked up in, or
 *   appended to, the node's children.
 * - Inside a node's data, the cursor simply moves on when the next stored
 *   character matches. On a mismatch the node is split when `insert` is
 *   true; otherwise the cursor's node becomes NULL ("not found").
 *
 * New nodes receive a copy of the whole remaining `string`.
 *
 * When splitting, the tail half inherits the original node's value AND its
 * existing children, because those children continue the node's complete
 * original data; without this the subtree would be dropped and leaked.
 */
void pos_next(Pos *this, const char* string, bool insert) {
    Node *node = this->node;

    if (node->data == NULL) {
        this->node = pos_find_or_create_child(this, string, false, insert);
        this->offset = 0;
        return;
    }

    if ((size_t)this->offset + 1 >= strlen(node->data)) {
        /* reached end of data... find a child */
        this->node = pos_find_or_create_child(this, string, true, insert);
        this->offset = 0;
        return;
    }

    if (node->data[this->offset + 1] == string[0]) {
        this->offset++;
        return;
    }

    if (!insert) {
        this->node = NULL;
        this->offset = 0;
        return;
    }

    /* split paths
     *  - new child node with the old partial content
     *  - new child node with the new content */
    Node *splitChild = new_node_string(node->data + this->offset + 1);
    Node *newChild = new_node_string(string);

    splitChild->value = node->value;
    splitChild->first_child = node->first_child;
    splitChild->next_sibling = newChild;
    newChild->value = Qnil;

    node_update_data(node, node->data, this->offset + 1);
    node->first_child = splitChild;
    node->value = Qnil;

    this->node = newChild;
    this->offset = 0;
}
|
152
|
+
|
153
|
+
/*
 * Call `func(node, context)` for `this`, for every node below it, and for
 * every node on its sibling chain together with their subtrees.
 *
 * Order: a node first, then its children, then its next sibling.
 * `this` must not be NULL.
 */
void node_visit(Node* this, node_iterator func, VALUE context) {
    Node *current = this;

    do {
        func(current, context);
        if (current->first_child != NULL) {
            node_visit(current->first_child, func, context);
        }
        current = current->next_sibling;
    } while (current != NULL);
}
|
data/ext/trie/otrie2.h
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
/*
|
2
|
+
* otrie2.h
|
3
|
+
* otrie
|
4
|
+
*
|
5
|
+
* Created by Petrica Ghiurca on 18.03.2011.
|
6
|
+
* Copyright 2011 __MyCompanyName__. All rights reserved.
|
7
|
+
*
|
8
|
+
*/
|
9
|
+
#include <ruby.h>

#ifndef TRIE_NODE
#define TRIE_NODE

/* Boolean fallbacks, defined only when nothing else (for example
 * <stdbool.h>) has already provided them, to avoid macro redefinitions. */
#ifndef bool
#define bool int
#endif
#ifndef true
#define true 1
#endif
#ifndef false
#define false 0
#endif

/* One trie node. `data` holds one or more key characters. */
typedef struct trie_node {
    char *data;                     /* NUL-terminated key fragment; NULL on a node created by new_node() */
    struct trie_node* first_child;  /* first node continuing this node's data */
    struct trie_node* next_sibling; /* next alternative at the same position */
    VALUE value;                    /* Ruby value stored for the key ending here; Qnil is treated as "no value" */
} Node;

/* Cursor used while walking a key through the trie. */
typedef struct pos_struct {
    Node *node;  /* node the cursor is on */
    int offset;  /* index of the current character inside node->data */
} Pos;

/* Callback type used by node_visit(). */
typedef void (*node_iterator)(Node* node, VALUE context);

Node* new_node();
void free_node(Node*);
void node_insert(Node* node, const char* string, const VALUE value);
Node* node_find(Node* this, const char* string);

Pos* new_pos(Node *node, int offset);
Node* pos_find_or_create_child(Pos* this, const char* string, bool down, bool insert);
void pos_next(Pos *this, const char* string, bool);
void node_visit(Node* this, node_iterator func, VALUE context);
#endif
|
@@ -0,0 +1,199 @@
|
|
1
|
+
/*
|
2
|
+
* ruby_trie.c
|
3
|
+
* otrie
|
4
|
+
*
|
5
|
+
* Created by Petrica Ghiurca on 18.03.2011.
|
6
|
+
* Copyright 2011 Petrica Ghiurca. All rights reserved.
|
7
|
+
*
|
8
|
+
*/
|
9
|
+
|
10
|
+
#include <ruby.h>
|
11
|
+
#include <stdlib.h> /* for malloc, free */
|
12
|
+
#include <string.h> /* for memcmp, memmove */
|
13
|
+
#include "otrie2.h"
|
14
|
+
#include "levenshtein_distance.h"
|
15
|
+
|
16
|
+
static VALUE rb_cTrie;
|
17
|
+
|
18
|
+
static void count_nodes_callback(Node *trie, VALUE accum);
|
19
|
+
static VALUE rb_trie_count_nodes(VALUE self);
|
20
|
+
static VALUE rb_trie_allocate(VALUE klass);
|
21
|
+
static VALUE rb_trie_get_key(VALUE self, VALUE key);
|
22
|
+
static void trie_mark_value(Node*, VALUE);
|
23
|
+
static void rb_trie_mark(Node* t);
|
24
|
+
static void rb_trie_free(Node * t);
|
25
|
+
void tree_collect_values(Node *node, VALUE rary);
|
26
|
+
static VALUE rb_trie_find_children(VALUE self, VALUE key);
|
27
|
+
static void trie_collect_values_with_yield(Node * node, VALUE context);
|
28
|
+
static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
|
29
|
+
static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value);
|
30
|
+
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
|
31
|
+
|
32
|
+
// extension init
|
33
|
+
void Init_trie() {
|
34
|
+
rb_cTrie = rb_define_class("Trie", rb_cObject);
|
35
|
+
|
36
|
+
rb_define_alloc_func(rb_cTrie, rb_trie_allocate);
|
37
|
+
|
38
|
+
int arg_count = 0;
|
39
|
+
//rb_define_method(rb_cTrie, "inspect", rb_trie_inspect, arg_count);
|
40
|
+
rb_define_method(rb_cTrie, "memory", rb_trie_count_nodes, arg_count);
|
41
|
+
|
42
|
+
arg_count = 1;
|
43
|
+
rb_define_method(rb_cTrie, "[]", rb_trie_get_key, arg_count);
|
44
|
+
// rb_define_method(rb_cTrie, "delete", rb_trie_undef_key, arg_count);
|
45
|
+
rb_define_method(rb_cTrie, "children", rb_trie_find_children, arg_count);
|
46
|
+
rb_define_method(rb_cTrie, "each", rb_trie_find_children_with_block, arg_count);
|
47
|
+
|
48
|
+
arg_count = 2;
|
49
|
+
rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
|
50
|
+
// trie.levenshtein_search(word, max_distance)
|
51
|
+
rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
|
52
|
+
}
|
53
|
+
|
54
|
+
static int total_memory;
|
55
|
+
static void count_nodes_callback(Node *trie, VALUE accum) {
|
56
|
+
int len = 0;
|
57
|
+
if (trie->data) len = strlen(trie->data);
|
58
|
+
rb_big_plus(accum, rb_uint2big(len + sizeof(Node)));
|
59
|
+
total_memory += len + sizeof(Node);
|
60
|
+
}
|
61
|
+
|
62
|
+
/*
 * Trie#memory: visits every node and returns the byte total accumulated in
 * total_memory by count_nodes_callback.
 */
static VALUE rb_trie_count_nodes(VALUE self) {
    Node *root;
    VALUE accum;

    Data_Get_Struct(self, Node, root);

    /* Reset the shared counter before each walk. */
    total_memory = 0;
    accum = rb_uint2big(0);
    node_visit(root, count_nodes_callback, accum);

    return rb_uint2big(total_memory);
}
|
71
|
+
|
72
|
+
/*
 * Allocator for Trie: creates the root node and wraps it in a Ruby object
 * that uses rb_trie_mark / rb_trie_free as its GC hooks.
 */
static VALUE rb_trie_allocate(VALUE klass) {
    Node *root = new_node();
    VALUE wrapped = Data_Wrap_Struct(klass, rb_trie_mark, rb_trie_free, root);

    return wrapped;
}
|
76
|
+
|
77
|
+
/*
 * Trie#[]: returns the value stored under `key`, or nil when node_find()
 * finds no node for it. Raises TypeError unless `key` is a String.
 */
static VALUE rb_trie_get_key(VALUE self, VALUE key) {
    Node *root;
    Node *match;

    Check_Type(key, T_STRING);
    Data_Get_Struct(self, Node, root);

    match = node_find(root, StringValuePtr(key));
    return (match != NULL) ? match->value : Qnil;
}
|
91
|
+
|
92
|
+
|
93
|
+
/* node_visit() callback: marks the node's stored value for the GC. */
static void trie_mark_value(Node * t, VALUE context) {
    (void)context;
    rb_gc_mark(t->value);
}

/* GC mark hook: marks the value of every node reachable from `t`. */
static void rb_trie_mark(Node* t) {
    node_visit(t, trie_mark_value, Qnil);
}

/* GC free hook: releases the whole trie starting at `t`. */
static void rb_trie_free(Node * t) {
    free_node(t);
}
|
104
|
+
|
105
|
+
/* node_visit() callback: appends the node's value to `rary` unless it is nil. */
void tree_collect_values(Node *node, VALUE rary) {
    if (node->value == Qnil) {
        return;
    }
    rb_ary_push(rary, node->value);
}
|
110
|
+
|
111
|
+
/*
 * Trie#children: returns an array holding the value of the node found for
 * `key` (if it has one) followed by every non-nil value stored below that
 * node. The array is empty when node_find() finds no node.
 */
static VALUE rb_trie_find_children(VALUE self, VALUE key) {
    Node *root;
    Node *match;
    VALUE collected = rb_ary_new();
    char *prefix = StringValuePtr(key);

    Data_Get_Struct(self, Node, root);
    match = node_find(root, prefix);

    if (match == NULL) {
        return collected;
    }
    if (match->value != Qnil) {
        rb_ary_push(collected, match->value);
    }
    if (match->first_child != NULL) {
        node_visit(match->first_child, tree_collect_values, collected);
    }
    return collected;
}
|
131
|
+
|
132
|
+
/* node_visit() callback: yields the node's value to the block unless it is nil. */
static void trie_collect_values_with_yield(Node * node, VALUE context) {
    (void)context;

    if (node->value == Qnil) {
        return;
    }
    rb_yield(node->value);
}
|
137
|
+
|
138
|
+
/*
 * Trie#each: yields the value of the node found for `key` (if it has one)
 * and then every non-nil value stored below that node.
 * Always returns an empty array.
 */
static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
    Node *root;
    Node *match;
    VALUE empty_result = rb_ary_new();
    char *prefix = StringValuePtr(key);

    Data_Get_Struct(self, Node, root);
    match = node_find(root, prefix);

    if (match == NULL) {
        return empty_result;
    }
    if (match->value != Qnil) {
        rb_yield(match->value);
    }
    if (match->first_child != NULL) {
        node_visit(match->first_child, trie_collect_values_with_yield, Qnil);
    }
    return empty_result;
}
|
158
|
+
|
159
|
+
|
160
|
+
/*
 * Trie#[]=: stores `value` under `key`.
 *
 * Values are kept in an array per key: the first assignment inserts a new
 * one-element array, later assignments append to the existing array.
 * Raises TypeError unless `key` is a String. Returns nil.
 */
static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value) {
    Node *root;
    Node *existing;
    char *key_cstring;
    VALUE bucket;

    Check_Type(key, T_STRING);
    key_cstring = StringValuePtr(key);

    Data_Get_Struct(self, Node, root);

    existing = node_find(root, key_cstring);
    if (existing != NULL && existing->value != Qnil) {
        /* Key already holds an array: append to it. */
        rb_ary_push(existing->value, value);
        return Qnil;
    }

    /* First value for this key: start a fresh array and insert it. */
    bucket = rb_ary_new();
    rb_ary_push(bucket, value);
    node_insert(root, key_cstring, bucket);

    return Qnil;
}
|
182
|
+
|
183
|
+
/*
 * Match callback for the Levenshtein search: yields the node's value to the
 * block unless it is nil. The distance is not passed on.
 */
void rb_levensthtein_cb(Node* node, int distance) {
    (void)distance;

    if (node->value == Qnil) {
        return;
    }
    rb_yield(node->value);
}
|
187
|
+
|
188
|
+
/*
 * Trie#levenshtein_search(word, max_distance): yields the value of every
 * stored key whose edit distance from `word` is at most `max_distance`.
 * Returns nil.
 *
 * max_distance is converted with NUM2INT, which raises for non-numeric
 * arguments. FIX2INT does not check its argument and would turn a
 * non-Fixnum VALUE into a meaningless number.
 */
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
    Node *root;
    char *word_cstring;
    int distance_limit;

    Data_Get_Struct(self, Node, root);

    word_cstring = StringValuePtr(word);
    distance_limit = NUM2INT(max_distance);

    levenshtein_distance(root, word_cstring, distance_limit, rb_levensthtein_cb);

    return Qnil;
}
|
data/ext/trie/t.rb
CHANGED
@@ -1,20 +1,22 @@
|
|
1
1
|
require 'trie'
|
2
|
+
require 'rubygems'
|
2
3
|
require 'benchmark'
|
3
4
|
|
4
5
|
t = Trie.new
|
5
6
|
c = 0
|
7
|
+
a = 0
|
6
8
|
s1 = (Benchmark.measure do
|
7
|
-
open('/usr/share/dict/
|
8
|
-
t[w.chop]
|
9
|
+
open('/usr/share/dict/web2').each_line do |w|
|
10
|
+
t[w.chop]= w
|
9
11
|
c += 1
|
10
12
|
end
|
11
13
|
end)
|
12
14
|
|
13
15
|
# %w(gol golas golaster lux xal).each {|w| t[w] = w}
|
14
16
|
s2 = (Benchmark.measure do
|
15
|
-
t.levenshtein_search('
|
17
|
+
t.levenshtein_search('food', 1) {|p| puts p}
|
16
18
|
end)
|
17
19
|
|
18
|
-
puts "#{t.memory/(1024*1024)}Mb, #{c} words"
|
20
|
+
puts "#{t.memory/(1024*1024)}Mb, #{c} words, #{a} unique"
|
19
21
|
puts s1
|
20
22
|
puts s2
|
data/test/trie_test.rb
CHANGED
@@ -23,10 +23,11 @@ time do
|
|
23
23
|
1.upto(max) do |i|
|
24
24
|
t["item #{i}"].class
|
25
25
|
end
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
26
|
+
|
27
|
+
# not implemented yet
|
28
|
+
# 1.upto(max) do |i|
|
29
|
+
# t.delete("item #{i}")
|
30
|
+
# end
|
30
31
|
end
|
31
32
|
|
32
33
|
|
@@ -41,9 +42,9 @@ time do
|
|
41
42
|
t["item #{i}"].class
|
42
43
|
end
|
43
44
|
|
44
|
-
1.upto(max) do |i|
|
45
|
-
|
46
|
-
end
|
45
|
+
# 1.upto(max) do |i|
|
46
|
+
# t.delete("item #{i}")
|
47
|
+
# end
|
47
48
|
end
|
48
49
|
|
49
50
|
# puts "With a Ruby Trie..."
|
metadata
CHANGED
@@ -1,21 +1,21 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: RubyTrie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 1
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
|
+
- 2
|
7
8
|
- 1
|
8
|
-
|
9
|
-
version: "1.1"
|
9
|
+
version: "2.1"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
|
-
- Matt Freels
|
13
12
|
- Petrica Ghiurca
|
13
|
+
- Matt Freels
|
14
14
|
autorequire: trie
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-03-21 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|
@@ -30,7 +30,11 @@ extra_rdoc_files:
|
|
30
30
|
files:
|
31
31
|
- README
|
32
32
|
- ChangeLog
|
33
|
-
- ext/trie/
|
33
|
+
- ext/trie/levenshtein_distance.h
|
34
|
+
- ext/trie/otrie2.h
|
35
|
+
- ext/trie/levenshtein_distance.c
|
36
|
+
- ext/trie/otrie2.c
|
37
|
+
- ext/trie/ruby_trie.c
|
34
38
|
- ext/trie/extconf.rb
|
35
39
|
- ext/trie/t.rb
|
36
40
|
- lib/ruby_trie.rb
|
data/ext/trie/trie.c
DELETED
@@ -1,408 +0,0 @@
|
|
1
|
-
#include <stdlib.h> /* for malloc, free */
|
2
|
-
#include <string.h> /* for memcmp, memmove */
|
3
|
-
#include "ruby.h"
|
4
|
-
|
5
|
-
// typdefs!
|
6
|
-
typedef enum { false = 0, true} bool;
|
7
|
-
|
8
|
-
typedef struct node {
|
9
|
-
char character;
|
10
|
-
VALUE value;
|
11
|
-
struct node * first_child;
|
12
|
-
struct node * next_sibling;
|
13
|
-
} trie_node;
|
14
|
-
|
15
|
-
static VALUE rb_cTrie;
|
16
|
-
|
17
|
-
// =========================
|
18
|
-
// = function declarations =
|
19
|
-
// =========================
|
20
|
-
|
21
|
-
//trie implementation
|
22
|
-
static trie_node * trie_node_for_key(trie_node * root, char * key, bool create_missing_nodes);
|
23
|
-
static trie_node * trie_sibling_for_char(trie_node * node, char ch);
|
24
|
-
static trie_node * trie_add_sibling_for_char(trie_node * node, char ch);
|
25
|
-
static trie_node * trie_new_node_with_char(char ch);
|
26
|
-
static trie_node * trie_new_node();
|
27
|
-
static VALUE rb_trie_find_children(VALUE self, VALUE key);
|
28
|
-
static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
|
29
|
-
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
|
30
|
-
static void trie_collect_values(void * t, VALUE prary);
|
31
|
-
static void trie_collect_values_with_yield(void * t);
|
32
|
-
static void trie_traverse(trie_node * trie, void (*lambda_func)(void *));
|
33
|
-
static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*lambda_func)(void *, VALUE));
|
34
|
-
static void free_trie(trie_node * trie);
|
35
|
-
static void count_nodes_callback(void *n, VALUE accum);
|
36
|
-
static VALUE rb_trie_count_nodes(VALUE self);
|
37
|
-
static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length);
|
38
|
-
// int print_arr(char c, int* arr, int len);
|
39
|
-
|
40
|
-
|
41
|
-
// ========================
|
42
|
-
// = function definitions =
|
43
|
-
// ========================
|
44
|
-
|
45
|
-
// instance methods
|
46
|
-
static VALUE rb_trie_get_key(VALUE self, VALUE key) {
|
47
|
-
trie_node * root;
|
48
|
-
trie_node * node;
|
49
|
-
char * key_cstring;
|
50
|
-
|
51
|
-
//Check_Type(key, T_STRING);
|
52
|
-
key_cstring = StringValuePtr(key);
|
53
|
-
|
54
|
-
Data_Get_Struct(self, trie_node, root);
|
55
|
-
|
56
|
-
node = trie_node_for_key(root, key_cstring, false);
|
57
|
-
if (node == NULL) return Qnil;
|
58
|
-
return node->value;
|
59
|
-
}
|
60
|
-
|
61
|
-
static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value) {
|
62
|
-
trie_node * root;
|
63
|
-
trie_node * node;
|
64
|
-
char * key_cstring;
|
65
|
-
|
66
|
-
//Check_Type(key, T_STRING);
|
67
|
-
key_cstring = StringValuePtr(key);
|
68
|
-
|
69
|
-
Data_Get_Struct(self, trie_node, root);
|
70
|
-
|
71
|
-
node = trie_node_for_key(root, key_cstring, true);
|
72
|
-
node->value = value;
|
73
|
-
|
74
|
-
return Qnil;
|
75
|
-
}
|
76
|
-
|
77
|
-
static uint mem_count = 0;
|
78
|
-
|
79
|
-
static void count_nodes_callback(void *n, VALUE accum) {
|
80
|
-
trie_node *node = (trie_node*)n;
|
81
|
-
// rb_big_plus(accum, rb_uint2big(sizeof(*node)));
|
82
|
-
mem_count+=sizeof(*node);
|
83
|
-
}
|
84
|
-
|
85
|
-
static VALUE rb_trie_count_nodes(VALUE self) {
|
86
|
-
trie_node *root;
|
87
|
-
Data_Get_Struct(self, trie_node, root);
|
88
|
-
VALUE accum = rb_uint2big(0);
|
89
|
-
mem_count = 0;
|
90
|
-
trie_traverse_with_context(root, accum, count_nodes_callback);
|
91
|
-
return rb_uint2big(mem_count);
|
92
|
-
}
|
93
|
-
|
94
|
-
static VALUE rb_trie_undef_key(VALUE self, VALUE key) {
|
95
|
-
trie_node * root, * node, * prev, * next;
|
96
|
-
VALUE return_value;
|
97
|
-
char * key_cstring;
|
98
|
-
int steps;
|
99
|
-
int i;
|
100
|
-
|
101
|
-
//Check_Type(key, T_STRING);
|
102
|
-
key_cstring = StringValuePtr(key);
|
103
|
-
|
104
|
-
Data_Get_Struct(self, trie_node, root);
|
105
|
-
next = root;
|
106
|
-
node = NULL;
|
107
|
-
prev = NULL;
|
108
|
-
|
109
|
-
steps = strlen(key_cstring);
|
110
|
-
|
111
|
-
for (i = 0; i < steps; i++) {
|
112
|
-
if (next == NULL) return Qnil;
|
113
|
-
|
114
|
-
while(next->character != key_cstring[i]) {
|
115
|
-
if (next == NULL) return Qnil;
|
116
|
-
next = next->next_sibling;
|
117
|
-
}
|
118
|
-
prev = node;
|
119
|
-
node = next;
|
120
|
-
next = node->first_child;
|
121
|
-
}
|
122
|
-
|
123
|
-
return_value = node->value;
|
124
|
-
node->value = Qnil;
|
125
|
-
|
126
|
-
if (node->first_child == NULL) { //node has no children. we can delete it.
|
127
|
-
if (prev == NULL) {
|
128
|
-
//printf("should delete root");
|
129
|
-
} else if (prev->first_child == node) {
|
130
|
-
prev->first_child = node->next_sibling;
|
131
|
-
free(node);
|
132
|
-
} else if (prev->next_sibling == node) {
|
133
|
-
prev->next_sibling = node->next_sibling;
|
134
|
-
free(node);
|
135
|
-
}
|
136
|
-
}
|
137
|
-
|
138
|
-
return return_value;
|
139
|
-
}
|
140
|
-
|
141
|
-
// garbage collection and allocation
|
142
|
-
static void trie_mark_value(void * t) {
|
143
|
-
rb_gc_mark( ((trie_node *)t)->value );
|
144
|
-
}
|
145
|
-
|
146
|
-
static void rb_trie_mark(trie_node * t) {
|
147
|
-
trie_traverse(t, trie_mark_value);
|
148
|
-
}
|
149
|
-
|
150
|
-
static void rb_trie_free(trie_node * t) {
|
151
|
-
free_trie(t);
|
152
|
-
}
|
153
|
-
|
154
|
-
static VALUE rb_trie_allocate (VALUE klass) {
|
155
|
-
trie_node * t = trie_new_node();
|
156
|
-
|
157
|
-
return Data_Wrap_Struct(klass, rb_trie_mark, rb_trie_free, t);
|
158
|
-
}
|
159
|
-
|
160
|
-
// extension init
|
161
|
-
void Init_trie() {
|
162
|
-
rb_cTrie = rb_define_class("Trie", rb_cObject);
|
163
|
-
|
164
|
-
rb_define_alloc_func (rb_cTrie, rb_trie_allocate);
|
165
|
-
|
166
|
-
int arg_count = 0;
|
167
|
-
//rb_define_method(rb_cTrie, "inspect", rb_trie_inspect, arg_count);
|
168
|
-
rb_define_method(rb_cTrie, "memory", rb_trie_count_nodes, arg_count);
|
169
|
-
|
170
|
-
arg_count = 1;
|
171
|
-
rb_define_method(rb_cTrie, "[]", rb_trie_get_key, arg_count);
|
172
|
-
rb_define_method(rb_cTrie, "delete", rb_trie_undef_key, arg_count);
|
173
|
-
rb_define_method(rb_cTrie, "children", rb_trie_find_children, arg_count);
|
174
|
-
rb_define_method(rb_cTrie, "each", rb_trie_find_children_with_block, arg_count);
|
175
|
-
|
176
|
-
arg_count = 2;
|
177
|
-
rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
|
178
|
-
// trie.levenshtein_search(word, max_distance)
|
179
|
-
rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
|
180
|
-
}
|
181
|
-
|
182
|
-
|
183
|
-
// =======================
|
184
|
-
// = trie implementation =
|
185
|
-
// =======================
|
186
|
-
|
187
|
-
static trie_node * trie_node_for_key(trie_node * root, char * key, bool create_missing_nodes) {
|
188
|
-
int steps, i;
|
189
|
-
trie_node * next, * node;
|
190
|
-
|
191
|
-
steps = strlen(key);
|
192
|
-
next = root;
|
193
|
-
|
194
|
-
for (i = 0; i < steps; i++) {
|
195
|
-
if (next == NULL) {
|
196
|
-
if (create_missing_nodes) {
|
197
|
-
node->first_child = trie_new_node();
|
198
|
-
next = node->first_child;
|
199
|
-
}
|
200
|
-
else return NULL;
|
201
|
-
}
|
202
|
-
|
203
|
-
node = trie_sibling_for_char(next, key[i]);
|
204
|
-
|
205
|
-
if (node == NULL) {
|
206
|
-
if (create_missing_nodes) {
|
207
|
-
node = trie_add_sibling_for_char(next, key[i]);
|
208
|
-
}
|
209
|
-
else return NULL;
|
210
|
-
}
|
211
|
-
|
212
|
-
next = node->first_child;
|
213
|
-
}
|
214
|
-
|
215
|
-
return node;
|
216
|
-
}
|
217
|
-
|
218
|
-
static void trie_collect_values(void * t, VALUE rary) {
|
219
|
-
trie_node *node = (trie_node*)t;
|
220
|
-
if (node->value != Qnil) {
|
221
|
-
rb_ary_push(rary, node->value);
|
222
|
-
}
|
223
|
-
}
|
224
|
-
|
225
|
-
static void trie_collect_values_with_yield(void * t) {
|
226
|
-
trie_node *node = (trie_node*)t;
|
227
|
-
if (node->value != Qnil) {
|
228
|
-
// rb_ary_push(rary, node->value);
|
229
|
-
rb_yield(node->value);
|
230
|
-
}
|
231
|
-
}
|
232
|
-
|
233
|
-
static VALUE rb_trie_find_children(VALUE self, VALUE key) {
|
234
|
-
trie_node * root;
|
235
|
-
trie_node * node;
|
236
|
-
char * key_cstring;
|
237
|
-
VALUE rary = rb_ary_new();
|
238
|
-
|
239
|
-
key_cstring = StringValuePtr(key);
|
240
|
-
Data_Get_Struct(self, trie_node, root);
|
241
|
-
|
242
|
-
node = trie_node_for_key(root, key_cstring, false);
|
243
|
-
|
244
|
-
if (node != NULL && node->value != Qnil) {
|
245
|
-
rb_ary_push(rary, node->value);
|
246
|
-
}
|
247
|
-
|
248
|
-
if (node == NULL || node->first_child == NULL) return rary;
|
249
|
-
|
250
|
-
trie_traverse_with_context(node->first_child, rary, trie_collect_values);
|
251
|
-
return rary;
|
252
|
-
}
|
253
|
-
|
254
|
-
|
255
|
-
static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
|
256
|
-
trie_node * root;
|
257
|
-
trie_node * node;
|
258
|
-
char * key_cstring;
|
259
|
-
VALUE rary = rb_ary_new();
|
260
|
-
|
261
|
-
key_cstring = StringValuePtr(key);
|
262
|
-
Data_Get_Struct(self, trie_node, root);
|
263
|
-
|
264
|
-
node = trie_node_for_key(root, key_cstring, false);
|
265
|
-
|
266
|
-
if (node != NULL && node->value != Qnil) {
|
267
|
-
rb_yield(node->value);
|
268
|
-
}
|
269
|
-
|
270
|
-
if (node == NULL || node->first_child == NULL) return rary;
|
271
|
-
|
272
|
-
trie_traverse(node->first_child, trie_collect_values_with_yield);
|
273
|
-
return rary;
|
274
|
-
}
|
275
|
-
|
276
|
-
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
|
277
|
-
trie_node *root;
|
278
|
-
trie_node *node;
|
279
|
-
char *word_cstring;
|
280
|
-
VALUE rary = rb_ary_new();
|
281
|
-
int i=0;
|
282
|
-
|
283
|
-
Data_Get_Struct(self, trie_node, root);
|
284
|
-
|
285
|
-
word_cstring = StringValuePtr(word);
|
286
|
-
|
287
|
-
int first_line[strlen(word_cstring) + 1];
|
288
|
-
for(; i < strlen(word_cstring) + 1; i++) {
|
289
|
-
first_line[i] = i;
|
290
|
-
}
|
291
|
-
// print_arr('R', first_line, strlen(word_cstring)+1);
|
292
|
-
recursive_levenshtein_search(root->next_sibling, rary, first_line, FIX2INT(max_distance), word_cstring, strlen(word_cstring));
|
293
|
-
|
294
|
-
return rary;
|
295
|
-
}
|
296
|
-
|
297
|
-
int minimum(int* numbers, int len) {
|
298
|
-
int minValue = numbers[0];
|
299
|
-
int i;
|
300
|
-
for(i=1; i<len; i++) {
|
301
|
-
if (numbers[i] < minValue) minValue = numbers[i];
|
302
|
-
}
|
303
|
-
return minValue;
|
304
|
-
}
|
305
|
-
|
306
|
-
int min3(int a, int b, int c) {
|
307
|
-
int min = a;
|
308
|
-
if (b < min) min = b;
|
309
|
-
if (c < min) min = c;
|
310
|
-
return min;
|
311
|
-
}
|
312
|
-
|
313
|
-
static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length) {
|
314
|
-
int cur_line[word_length + 1];
|
315
|
-
int i,j, insert_cost, replace_cost, delete_cost;
|
316
|
-
VALUE carr;
|
317
|
-
|
318
|
-
cur_line[0] = prev_line[0] + 1;
|
319
|
-
|
320
|
-
for(i=1; i < word_length + 1; i++) {
|
321
|
-
insert_cost = cur_line[i-1] + 1;
|
322
|
-
delete_cost = prev_line[i] + 1;
|
323
|
-
if (trie->character != word[i-1]) {
|
324
|
-
replace_cost = prev_line[i-1] + 1;
|
325
|
-
} else {
|
326
|
-
replace_cost = prev_line[i-1];
|
327
|
-
}
|
328
|
-
cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
|
329
|
-
}
|
330
|
-
|
331
|
-
|
332
|
-
if (cur_line[word_length] <= max_dist && trie->value != Qnil) {
|
333
|
-
carr = rb_ary_new();
|
334
|
-
rb_ary_push(carr, trie->value);
|
335
|
-
rb_ary_push(carr, INT2FIX(cur_line[word_length]));
|
336
|
-
rb_ary_push(rary, carr);
|
337
|
-
}
|
338
|
-
|
339
|
-
if (minimum(cur_line, word_length + 1) <= max_dist) {
|
340
|
-
if (trie->first_child != NULL)
|
341
|
-
recursive_levenshtein_search(trie->first_child, rary, cur_line, max_dist, word, word_length);
|
342
|
-
if (trie->next_sibling != NULL)
|
343
|
-
recursive_levenshtein_search(trie->next_sibling, rary, prev_line, max_dist, word, word_length);
|
344
|
-
}
|
345
|
-
}
|
346
|
-
|
347
|
-
static trie_node * trie_sibling_for_char(trie_node * node, char ch) {
|
348
|
-
while(true) {
|
349
|
-
if (node == NULL) return NULL;
|
350
|
-
|
351
|
-
if (node->character == ch) return node;
|
352
|
-
|
353
|
-
node = node->next_sibling;
|
354
|
-
}
|
355
|
-
return node;
|
356
|
-
}
|
357
|
-
|
358
|
-
static trie_node * trie_add_sibling_for_char(trie_node * node, char ch) {
|
359
|
-
trie_node * current_next;
|
360
|
-
|
361
|
-
current_next = node->next_sibling;
|
362
|
-
node->next_sibling = trie_new_node_with_char(ch);
|
363
|
-
node->next_sibling->next_sibling = current_next;
|
364
|
-
|
365
|
-
return node->next_sibling;
|
366
|
-
}
|
367
|
-
|
368
|
-
static trie_node * trie_new_node_with_char(char ch) {
|
369
|
-
trie_node * trie;
|
370
|
-
trie = malloc(sizeof(trie_node));
|
371
|
-
trie->character = ch;
|
372
|
-
trie->value = Qnil;
|
373
|
-
trie->first_child = NULL;
|
374
|
-
trie->next_sibling = NULL;
|
375
|
-
return trie;
|
376
|
-
}
|
377
|
-
|
378
|
-
static trie_node * trie_new_node() {
|
379
|
-
return trie_new_node_with_char('s'); //insert most common starting letter here.
|
380
|
-
}
|
381
|
-
|
382
|
-
static void trie_traverse(trie_node * trie, void (* lambda_func)(void *)) {
|
383
|
-
if (trie->next_sibling != NULL) {
|
384
|
-
trie_traverse(trie->next_sibling, lambda_func);
|
385
|
-
}
|
386
|
-
|
387
|
-
if (trie->first_child != NULL) {
|
388
|
-
trie_traverse(trie->first_child, lambda_func);
|
389
|
-
}
|
390
|
-
|
391
|
-
lambda_func(trie);
|
392
|
-
}
|
393
|
-
|
394
|
-
static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*lambda_func)(void *, VALUE)) {
|
395
|
-
if (trie->next_sibling != NULL) {
|
396
|
-
trie_traverse_with_context(trie->next_sibling, context, lambda_func);
|
397
|
-
}
|
398
|
-
|
399
|
-
if (trie->first_child != NULL) {
|
400
|
-
trie_traverse_with_context(trie->first_child, context, lambda_func);
|
401
|
-
}
|
402
|
-
|
403
|
-
lambda_func(trie, context);
|
404
|
-
}
|
405
|
-
|
406
|
-
static void free_trie(trie_node * trie) {
|
407
|
-
trie_traverse(trie, free);
|
408
|
-
}
|