RubyTrie 1.1 → 2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +3 -1
- data/README +10 -2
- data/ext/trie/levenshtein_distance.c +77 -0
- data/ext/trie/levenshtein_distance.h +13 -0
- data/ext/trie/otrie2.c +161 -0
- data/ext/trie/otrie2.h +41 -0
- data/ext/trie/ruby_trie.c +199 -0
- data/ext/trie/t.rb +6 -4
- data/test/trie_test.rb +8 -7
- metadata +10 -6
- data/ext/trie/trie.c +0 -408
data/ChangeLog
CHANGED
@@ -1,2 +1,4 @@
|
|
1
1
|
1.0 added children and each methods
|
2
|
-
1.1 added levenshtein algorithm implementation - inspired from: http://stevehanov.ca/blog/index.php?id=114
|
2
|
+
1.1 added levenshtein algorithm implementation - inspired from: http://stevehanov.ca/blog/index.php?id=114
|
3
|
+
2.0 completely rewrote the trie implementation to use a more optimized data structure. For /usr/share/dict/web2 it reduces memory use from about 40Mb to about 6Mb. The levenshtein search now takes a block and passes in each value, but it can easily be extended to return an array. It also has a glitch where some values are for some reason just false... :|
|
4
|
+
2.1 bug fixes, fixing levenshtein search
|
data/README
CHANGED
@@ -22,6 +22,11 @@ t["goes"] = 2
|
|
22
22
|
t["gone"] = 3
|
23
23
|
t["other"] = 4
|
24
24
|
|
25
|
+
By default, assigning multiple values to the same key will add them to an array.
|
26
|
+
t['a'] = 1
|
27
|
+
t['a'] = 2
|
28
|
+
will return for t['a'] => [1,2]
|
29
|
+
|
25
30
|
t.children("go") => [1,2,3]
|
26
31
|
|
27
32
|
t.each("partial key") will yield to the given block all values that are matched by the partial key
|
@@ -29,8 +34,11 @@ t.each("go") {|v| puts v } => prints 1, 2 & 3
|
|
29
34
|
|
30
35
|
Levenshtein search
|
31
36
|
|
32
|
-
t.levenshtein_search("go", 2) - will
|
33
|
-
|
37
|
+
t.levenshtein_search("go", 2) {|value| puts value} - will print all values for the words in the trie that are within distance 2 of the search word
|
38
|
+
Eg.
|
39
|
+
1
|
40
|
+
2
|
41
|
+
3
|
34
42
|
|
35
43
|
== Bugs
|
36
44
|
|
@@ -0,0 +1,77 @@
|
|
1
|
+
/*
|
2
|
+
* levenshtein_distance.c
|
3
|
+
* otrie
|
4
|
+
*
|
5
|
+
* Created by Petrica Ghiurca on 18.03.2011.
|
6
|
+
* Copyright 2011 __MyCompanyName__. All rights reserved.
|
7
|
+
*
|
8
|
+
*/
|
9
|
+
|
10
|
+
#include <stdlib.h>
|
11
|
+
#include <string.h> /* for memcmp, memmove */
|
12
|
+
#include <stdio.h>
|
13
|
+
|
14
|
+
#include "otrie2.h"
|
15
|
+
#include "levenshtein_distance.h"
|
16
|
+
|
17
|
+
|
18
|
+
void recursive_levenshtein_search(Node* trie, int node_offset, levenshtein_distance_callback cb, int* prev_line, int max_dist, const char* word, int word_length);
|
19
|
+
|
20
|
+
/*
 * Starts a Levenshtein search over the trie rooted at `trie`.
 *
 * trie         - root node; the search begins at trie->next_sibling
 * word         - NUL-terminated search word
 * max_distance - largest edit distance that still counts as a match
 * cb           - invoked as cb(node, distance) for every matching node
 *
 * Does nothing when any pointer argument is NULL or the root has no
 * next_sibling (nothing to search).
 */
void levenshtein_distance(Node* trie, const char* word, int max_distance, levenshtein_distance_callback cb) {
    if (trie == NULL || word == NULL || cb == NULL) return;
    /* The recursive search dereferences its node unconditionally, so never hand it NULL. */
    if (trie->next_sibling == NULL) return;

    /* Measure the word once instead of on every loop iteration. */
    int word_length = (int) strlen(word);
    int first_line[word_length + 1];
    int i;

    /* Row 0 of the edit-distance matrix: distance from the empty prefix is i insertions. */
    for (i = 0; i <= word_length; i++) {
        first_line[i] = i;
    }
    recursive_levenshtein_search(trie->next_sibling, 0, cb, first_line, max_distance, word, word_length);
}
|
28
|
+
|
29
|
+
|
30
|
+
/*
 * Returns the smallest of the first `len` entries of `numbers`.
 * numbers[0] is always read, so the array must hold at least one element.
 */
int minimum(int* numbers, int len) {
    int lowest = numbers[0];
    int idx = len;

    /* Walk from the back down to index 1; index 0 already seeded `lowest`. */
    while (--idx > 0) {
        if (numbers[idx] < lowest) {
            lowest = numbers[idx];
        }
    }
    return lowest;
}
|
38
|
+
|
39
|
+
/* Returns the smallest of the three integers. */
int min3(int a, int b, int c) {
    int smaller_of_ab = (b < a) ? b : a;
    return (c < smaller_of_ab) ? c : smaller_of_ab;
}
|
45
|
+
|
46
|
+
/*
 * Walks the trie while maintaining one row of the Levenshtein matrix per
 * consumed character, and calls cb(node, distance) for every node that
 * carries a value and whose full key is within max_dist of `word`.
 *
 * trie        - node to examine
 * node_offset - index into trie->data of the character to consume
 * prev_line   - matrix row for the prefix consumed so far (word_length + 1 ints)
 * word_length - strlen(word), passed in so it is not recomputed
 *
 * Called with node_offset == 0 it processes `trie` and every following
 * sibling, all against the same prev_line. Called with node_offset > 0 it
 * only continues inside `trie`: its siblings do not share the characters
 * already consumed from this node.
 */
void recursive_levenshtein_search(Node* trie, int node_offset, levenshtein_distance_callback cb, int* prev_line, int max_dist, const char* word, int word_length) {
    int cur_line[word_length + 1];
    int i, insert_cost, replace_cost, delete_cost;

    while (trie != NULL) {
        if (trie->data != NULL) {
            int at_end = (strlen(trie->data) == (size_t) (node_offset + 1));
            char ch = trie->data[node_offset];

            cur_line[0] = prev_line[0] + 1;
            for (i = 1; i < word_length + 1; i++) {
                insert_cost = cur_line[i-1] + 1;
                delete_cost = prev_line[i] + 1;
                replace_cost = prev_line[i-1] + ((ch != word[i-1]) ? 1 : 0);
                cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
            }

            /* Only the last character of a node's data completes a key. */
            if (at_end && cur_line[word_length] <= max_dist && trie->value != Qnil) {
                cb(trie, cur_line[word_length]);
            }

            /* Row minima never decrease, so a row above max_dist prunes
             * everything below this point -- but not the siblings. */
            if (minimum(cur_line, word_length + 1) <= max_dist) {
                if (!at_end) {
                    recursive_levenshtein_search(trie, node_offset + 1, cb, cur_line, max_dist, word, word_length);
                } else if (trie->first_child != NULL) {
                    /* Children continue the key only after all of this node's data. */
                    recursive_levenshtein_search(trie->first_child, 0, cb, cur_line, max_dist, word, word_length);
                }
            }
        }

        /* Siblings are alternatives for the first character only. */
        if (node_offset != 0) break;
        trie = trie->next_sibling;
    }
}
|
@@ -0,0 +1,13 @@
|
|
1
|
+
/*
|
2
|
+
* levenshtein_distance.h
|
3
|
+
* otrie
|
4
|
+
*
|
5
|
+
* Created by Petrica Ghiurca on 18.03.2011.
|
6
|
+
* Copyright 2011 __MyCompanyName__. All rights reserved.
|
7
|
+
*
|
8
|
+
*/
|
9
|
+
|
10
|
+
#ifndef LEVENSHTEIN_DISTANCE_H
#define LEVENSHTEIN_DISTANCE_H

#include "otrie2.h"

/* Called for each matching node; `length` receives the edit distance of the match. */
typedef void (*levenshtein_distance_callback)(Node* node, int length);

/* Reports, through `cb`, the nodes of `trie` whose key is within `max_distance` edits of `word`. */
void levenshtein_distance(Node* trie, const char* word, int max_distance, levenshtein_distance_callback cb);

#endif
|
data/ext/trie/otrie2.c
ADDED
@@ -0,0 +1,161 @@
|
|
1
|
+
/*
|
2
|
+
* otrie2.c
|
3
|
+
* otrie
|
4
|
+
*
|
5
|
+
* Created by Petrica Ghiurca on 18.03.2011.
|
6
|
+
* Copyright 2011 __MyCompanyName__. All rights reserved.
|
7
|
+
*
|
8
|
+
*/
|
9
|
+
|
10
|
+
#include <stdlib.h> /* for malloc, free */
|
11
|
+
#include <string.h> /* for memcmp, memmove */
|
12
|
+
#include <stdio.h>
|
13
|
+
|
14
|
+
#include "otrie2.h"
|
15
|
+
|
16
|
+
Node* new_node() {
|
17
|
+
Node *node = malloc(sizeof(Node));
|
18
|
+
memset(node, 0, sizeof(Node));
|
19
|
+
return node;
|
20
|
+
}
|
21
|
+
|
22
|
+
Node* new_node_string_len(const char* string, const int len) {
|
23
|
+
Node *node = new_node();
|
24
|
+
node->data = malloc(len+1);
|
25
|
+
strncpy(node->data, string, len);
|
26
|
+
node->data[len] = 0;
|
27
|
+
return node;
|
28
|
+
}
|
29
|
+
|
30
|
+
Node* new_node_string(const char* string) {
|
31
|
+
Node *node = new_node();
|
32
|
+
int len = strlen(string);
|
33
|
+
node->data = malloc(len+1);
|
34
|
+
strcpy(node->data, string);
|
35
|
+
return node;
|
36
|
+
}
|
37
|
+
|
38
|
+
/*
 * Replaces node->data with a fresh copy of the first `len` characters of
 * `string`. The copy is made before the old buffer is released, so
 * `string` may point into the node's current data.
 */
void node_update_data(Node* node, const char* string, int len) {
    char *previous = node->data;
    char *replacement = malloc(len+1);

    strncpy(replacement, string, len);
    replacement[len] = 0;
    node->data = replacement;
    free(previous);
}
|
45
|
+
|
46
|
+
/*
 * Frees `node`, its data buffer, all of its descendants and all of its
 * following siblings. Passing NULL is a no-op, like free(NULL).
 * Siblings are handled in a loop, so recursion depth follows only the
 * first_child links.
 */
void free_node(Node *node) {
    while (node != NULL) {
        Node *next = node->next_sibling; /* read before the node is released */
        if (node->first_child != NULL) free_node(node->first_child);
        free(node->data);
        free(node);
        node = next;
    }
}
|
52
|
+
|
53
|
+
/*
 * Stores `value` under the key `string`, creating nodes as needed.
 *
 * node   - root of the trie
 * string - NUL-terminated key
 * value  - Ruby object to attach to the key's final node
 *
 * When the key ends in the middle of an existing node's data, that node is
 * split first so the value lands on a node that ends exactly at the key and
 * the longer key keeps its own value and children. An empty key stores the
 * value on `node` itself.
 */
void node_insert(Node* node, const char* string, const VALUE value) {
    Pos cur; /* cursor lives on the stack, so nothing has to be freed */
    int len = (int) strlen(string);
    int i;

    cur.node = node;
    cur.offset = 0;
    for (i = 0; i < len; i++) {
        pos_next(&cur, string + i, true);
    }

    if (len > 0 && cur.node->data != NULL
            && strlen(cur.node->data) > (size_t) (cur.offset + 1)) {
        /* Key ends mid-node: move the remainder of the data, together with
         * the old value and children, into a new child. */
        Node *tail = new_node_string(cur.node->data + cur.offset + 1);
        tail->value = cur.node->value;
        tail->first_child = cur.node->first_child;
        node_update_data(cur.node, cur.node->data, cur.offset + 1);
        cur.node->first_child = tail;
    }

    cur.node->value = value;
}
|
62
|
+
|
63
|
+
/*
 * Looks up the node at which the key `string` ends.
 *
 * Returns the node only when the key ends exactly at the end of that
 * node's data; returns NULL when the key is absent, ends in the middle of
 * a node, or is empty (the cursor is still on a node without data).
 */
Node* node_find(Node* this, const char* string) {
    Pos cur; /* stack cursor: the lookup no longer leaks a heap Pos */
    int len = (int) strlen(string);
    int i;

    cur.node = this;
    cur.offset = 0;
    for (i = 0; i < len; i++) {
        pos_next(&cur, string + i, false);
        if (cur.node == NULL) { return NULL; }
    }

    /* A node without data (the starting node for an empty key) has no key ending on it. */
    if (cur.node->data == NULL) return NULL;

    if (strlen(cur.node->data) == (size_t) (cur.offset + 1))
        return cur.node;
    return NULL;
}
|
75
|
+
|
76
|
+
/*
 * Heap-allocates a cursor pointing at character `offset` of `node`'s data.
 * The caller owns the returned Pos.
 */
Pos* new_pos(Node *node, int offset) {
    Pos *cursor = malloc(sizeof(Pos));

    cursor->offset = offset;
    cursor->node = node;
    return cursor;
}
|
82
|
+
|
83
|
+
/*
 * Searches a sibling chain for the node whose data starts with string[0].
 *
 * down   - true: search the chain starting at this->node->first_child;
 *          false: search the chain starting at this->node->next_sibling
 * insert - when no node matches, append a new node holding a copy of
 *          `string` to the end of that chain and return it
 *
 * Returns the matching (or newly created) node, or NULL when nothing
 * matches and `insert` is false.
 */
Node* pos_find_or_create_child(Pos* this, const char* string, bool down, bool insert) {
    Node *owner = this->node;
    Node *candidate = down ? owner->first_child : owner->next_sibling;
    Node *tail = NULL; /* last node seen in the chain, NULL while the chain is empty */

    for (; candidate != NULL; candidate = candidate->next_sibling) {
        if (*candidate->data == *string) return candidate;
        tail = candidate;
    }

    if (!insert) return NULL;

    candidate = new_node_string(string);
    if (tail != NULL) {
        tail->next_sibling = candidate;
    } else if (down) {
        owner->first_child = candidate;
    } else {
        owner->next_sibling = candidate;
    }
    return candidate;
}
|
111
|
+
|
112
|
+
/*
 * Advances the cursor by one key character, string[0].
 *
 * - On a node without data, the match is looked up in the next_sibling chain.
 * - Inside a node's data, the cursor steps forward when the next stored
 *   character matches.
 * - At the end of a node's data, the match is looked up among the children.
 *
 * With `insert` set, missing nodes are created, and a mismatch inside a
 * node splits it. Without `insert`, a mismatch inside a node sets
 * this->node to NULL.
 */
void pos_next(Pos *this, const char* string, bool insert) {
    Node *node = this->node;
    int next_offset;

    if (node->data == NULL) {
        this->node = pos_find_or_create_child(this, string, false, insert);
        this->offset = 0;
        return;
    }

    next_offset = this->offset + 1;
    if (next_offset >= (int) strlen(node->data)) {
        /* reached end of data... find a child */
        this->node = pos_find_or_create_child(this, string, true, insert);
        this->offset = 0;
        return;
    }

    if (node->data[next_offset] == string[0]) {
        this->offset = next_offset;
        return;
    }

    if (!insert) {
        this->node = NULL;
        this->offset = 0;
        return;
    }

    /* Split: this node keeps the shared prefix; one new child takes the old
     * remainder, a second new child takes the new key's remainder. */
    Node *splitChild = new_node_string(node->data + next_offset);
    Node *newChild = new_node_string(string);

    splitChild->value = node->value;
    /* The old remainder keeps the existing children; overwriting
     * first_child without this would orphan them. */
    splitChild->first_child = node->first_child;
    splitChild->next_sibling = newChild;
    newChild->value = Qnil;

    node_update_data(node, node->data, next_offset);
    node->first_child = splitChild;
    node->value = Qnil;

    this->node = newChild;
    this->offset = 0;
}
|
152
|
+
|
153
|
+
/*
 * Pre-order traversal: calls func(node, context) on `this`, then on its
 * first_child subtree, then on each following sibling and its subtree.
 * `this` must not be NULL.
 */
void node_visit(Node* this, node_iterator func, VALUE context) {
    Node *current = this;

    do {
        func(current, context);
        if (current->first_child != NULL) {
            node_visit(current->first_child, func, context);
        }
        current = current->next_sibling;
    } while (current != NULL);
}
|
data/ext/trie/otrie2.h
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
/*
|
2
|
+
* otrie2.h
|
3
|
+
* otrie
|
4
|
+
*
|
5
|
+
* Created by Petrica Ghiurca on 18.03.2011.
|
6
|
+
* Copyright 2011 __MyCompanyName__. All rights reserved.
|
7
|
+
*
|
8
|
+
*/
|
9
|
+
#ifndef TRIE_NODE
#define TRIE_NODE

#include <ruby.h>

/* Fallback boolean names, defined only when <stdbool.h> (possibly pulled in
 * by ruby.h) has not already provided them. */
#if !defined(__bool_true_false_are_defined) && !defined(bool)
#define bool int
#define true 1
#define false 0
#endif

/* One trie node. `data` holds one or more key characters. */
typedef struct trie_node {
    char *data;                     /* NUL-terminated key fragment; NULL on a freshly allocated node */
    struct trie_node* first_child;  /* head of the chain of nodes that continue after `data` */
    struct trie_node* next_sibling; /* next alternative in the same chain */
    VALUE value;                    /* Ruby object stored for the key ending at this node */
} Node;

/* Cursor into the trie: a node plus an index into that node's data. */
typedef struct pos_struct {
    Node *node;
    int offset;
} Pos;

/* Callback type used by node_visit. */
typedef void (*node_iterator)(Node* node, VALUE context);

Node* new_node();
void free_node(Node*);
void node_insert(Node* node, const char* string, const VALUE value);
Node* node_find(Node* this, const char* string);

Pos* new_pos(Node *node, int offset);
Node* pos_find_or_create_child(Pos* this, const char* string, bool down, bool insert);
void pos_next(Pos *this, const char* string, bool);
void node_visit(Node* this, node_iterator func, VALUE context);
#endif
|
@@ -0,0 +1,199 @@
|
|
1
|
+
/*
|
2
|
+
* ruby_trie.c
|
3
|
+
* otrie
|
4
|
+
*
|
5
|
+
* Created by Petrica Ghiurca on 18.03.2011.
|
6
|
+
* Copyright 2011 Petrica Ghiurca. All rights reserved.
|
7
|
+
*
|
8
|
+
*/
|
9
|
+
|
10
|
+
#include <ruby.h>
|
11
|
+
#include <stdlib.h> /* for malloc, free */
|
12
|
+
#include <string.h> /* for memcmp, memmove */
|
13
|
+
#include "otrie2.h"
|
14
|
+
#include "levenshtein_distance.h"
|
15
|
+
|
16
|
+
static VALUE rb_cTrie;
|
17
|
+
|
18
|
+
static void count_nodes_callback(Node *trie, VALUE accum);
|
19
|
+
static VALUE rb_trie_count_nodes(VALUE self);
|
20
|
+
static VALUE rb_trie_allocate(VALUE klass);
|
21
|
+
static VALUE rb_trie_get_key(VALUE self, VALUE key);
|
22
|
+
static void trie_mark_value(Node*, VALUE);
|
23
|
+
static void rb_trie_mark(Node* t);
|
24
|
+
static void rb_trie_free(Node * t);
|
25
|
+
void tree_collect_values(Node *node, VALUE rary);
|
26
|
+
static VALUE rb_trie_find_children(VALUE self, VALUE key);
|
27
|
+
static void trie_collect_values_with_yield(Node * node, VALUE context);
|
28
|
+
static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
|
29
|
+
static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value);
|
30
|
+
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
|
31
|
+
|
32
|
+
// extension init
|
33
|
+
void Init_trie() {
|
34
|
+
rb_cTrie = rb_define_class("Trie", rb_cObject);
|
35
|
+
|
36
|
+
rb_define_alloc_func(rb_cTrie, rb_trie_allocate);
|
37
|
+
|
38
|
+
int arg_count = 0;
|
39
|
+
//rb_define_method(rb_cTrie, "inspect", rb_trie_inspect, arg_count);
|
40
|
+
rb_define_method(rb_cTrie, "memory", rb_trie_count_nodes, arg_count);
|
41
|
+
|
42
|
+
arg_count = 1;
|
43
|
+
rb_define_method(rb_cTrie, "[]", rb_trie_get_key, arg_count);
|
44
|
+
// rb_define_method(rb_cTrie, "delete", rb_trie_undef_key, arg_count);
|
45
|
+
rb_define_method(rb_cTrie, "children", rb_trie_find_children, arg_count);
|
46
|
+
rb_define_method(rb_cTrie, "each", rb_trie_find_children_with_block, arg_count);
|
47
|
+
|
48
|
+
arg_count = 2;
|
49
|
+
rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
|
50
|
+
// trie.levenshtein_search(word, max_distance)
|
51
|
+
rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
|
52
|
+
}
|
53
|
+
|
54
|
+
static int total_memory;
|
55
|
+
/*
 * node_visit callback: adds this node's footprint (the Node struct plus the
 * length of its data string, terminator not counted) to total_memory.
 * `accum` is unused. The former rb_big_plus call was dropped because its
 * result was discarded, so it only allocated two throwaway objects per node.
 */
static void count_nodes_callback(Node *trie, VALUE accum) {
    int len = 0;
    (void) accum;
    if (trie->data) len = strlen(trie->data);
    total_memory += len + sizeof(Node);
}
|
61
|
+
|
62
|
+
/*
 * Trie#memory: returns the number of bytes counted by count_nodes_callback
 * over every node of the trie.
 */
static VALUE rb_trie_count_nodes(VALUE self) {
    Node *root;
    Data_Get_Struct(self, Node, root);

    total_memory = 0;
    /* The callback ignores its context, so no accumulator object is needed. */
    node_visit(root, count_nodes_callback, Qnil);

    /* INT2NUM yields a properly normalized Integer; rb_uint2big always
     * builds a Bignum object, even for small values. */
    return INT2NUM(total_memory);
}
|
71
|
+
|
72
|
+
/*
 * Allocator for Trie: wraps a fresh root node in a Ruby object and
 * registers rb_trie_mark / rb_trie_free as its GC hooks.
 */
static VALUE rb_trie_allocate(VALUE klass) {
    Node *root = new_node();
    VALUE wrapped = Data_Wrap_Struct(klass, rb_trie_mark, rb_trie_free, root);
    return wrapped;
}
|
76
|
+
|
77
|
+
/*
 * Trie#[]: returns the value stored under `key`, or nil when no node ends
 * exactly at that key. Raises TypeError unless `key` is a String, and
 * ArgumentError when it contains a NUL byte.
 */
static VALUE rb_trie_get_key(VALUE self, VALUE key) {
    Node * root;
    Node * node;
    char * key_cstring;

    Check_Type(key, T_STRING);
    /* The trie code uses strlen on the key, so require a real C string:
     * StringValueCStr guarantees termination and rejects embedded NULs. */
    key_cstring = StringValueCStr(key);

    Data_Get_Struct(self, Node, root);

    node = node_find(root, key_cstring);
    if (node == NULL) return Qnil;
    return node->value;
}
|
91
|
+
|
92
|
+
|
93
|
+
/* node_visit callback: marks the node's stored Ruby value for the GC. `context` is unused. */
static void trie_mark_value(Node * t, VALUE context) {
    VALUE stored = t->value;
    rb_gc_mark(stored);
}
|
96
|
+
|
97
|
+
/* GC mark hook: visits every node reachable from `t` and marks its value. */
static void rb_trie_mark(Node* t) {
    node_iterator marker = trie_mark_value;
    node_visit(t, marker, Qnil);
}
|
100
|
+
|
101
|
+
/* GC free hook: releases the trie by handing its root to free_node. */
static void rb_trie_free(Node * t) {
    Node *root = t;
    free_node(root);
}
|
104
|
+
|
105
|
+
/* node_visit callback: appends the node's value to the array `rary`, skipping nil values. */
void tree_collect_values(Node *node, VALUE rary) {
    VALUE stored = node->value;

    if (stored == Qnil) return;
    rb_ary_push(rary, stored);
}
|
110
|
+
|
111
|
+
/*
 * Trie#children: returns an array with the value stored at `key` (if any)
 * followed by every non-nil value found below that node. Returns an empty
 * array when no node ends exactly at `key`. Raises ArgumentError when the
 * key contains a NUL byte.
 */
static VALUE rb_trie_find_children(VALUE self, VALUE key) {
    Node * root;
    Node * node;
    char * key_cstring;
    VALUE rary = rb_ary_new();

    /* The trie code uses strlen on the key, so require a real C string. */
    key_cstring = StringValueCStr(key);
    Data_Get_Struct(self, Node, root);

    node = node_find(root, key_cstring);
    if (node == NULL) return rary;

    if (node->value != Qnil) {
        rb_ary_push(rary, node->value);
    }

    if (node->first_child == NULL) return rary;

    /* Starting at first_child keeps the matched node's own siblings out of the result. */
    node_visit(node->first_child, tree_collect_values, rary);
    return rary;
}
|
131
|
+
|
132
|
+
/* node_visit callback: yields the node's value to the current block, skipping nil values. `context` is unused. */
static void trie_collect_values_with_yield(Node * node, VALUE context) {
    VALUE stored = node->value;

    if (stored == Qnil) return;
    rb_yield(stored);
}
|
137
|
+
|
138
|
+
/*
 * Trie#each: yields the value stored at `key` (if any), then every non-nil
 * value found below that node. Always returns a new empty array. Raises
 * ArgumentError when the key contains a NUL byte.
 */
static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
    Node * root;
    Node * node;
    char * key_cstring;
    VALUE rary = rb_ary_new(); /* never filled; kept so the return value stays an empty array */

    /* The trie code uses strlen on the key, so require a real C string. */
    key_cstring = StringValueCStr(key);
    Data_Get_Struct(self, Node, root);

    node = node_find(root, key_cstring);
    if (node == NULL) return rary;

    if (node->value != Qnil) {
        rb_yield(node->value);
    }

    if (node->first_child == NULL) return rary;

    node_visit(node->first_child, trie_collect_values_with_yield, Qnil);
    return rary;
}
|
158
|
+
|
159
|
+
|
160
|
+
/*
 * Trie#[]=: appends `value` to the array stored under `key`, creating the
 * entry (a one-element array) when the key has no array yet. Returns nil.
 * Raises TypeError unless `key` is a String, and ArgumentError when it
 * contains a NUL byte.
 */
static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value) {
    Node * root, *node;
    char * key_cstring;

    Check_Type(key, T_STRING);
    /* The trie code uses strlen on the key, so require a real C string. */
    key_cstring = StringValueCStr(key);

    Data_Get_Struct(self, Node, root);

    node = node_find(root, key_cstring);
    if (node != NULL && TYPE(node->value) == T_ARRAY) {
        rb_ary_push(node->value, value);
    } else {
        /* No array yet. This also covers a node whose value is neither nil
         * nor an array (e.g. Qfalse from zero-filled memory), which must
         * not be handed to rb_ary_push. */
        VALUE arr = rb_ary_new();
        rb_ary_push(arr, value);
        node_insert(root, key_cstring, arr);
    }

    return Qnil;
}
|
182
|
+
|
183
|
+
/* Levenshtein match callback: yields the matched node's value to the block unless it is nil. `distance` is not passed on. */
void rb_levensthtein_cb(Node* node, int distance) {
    VALUE stored = node->value;

    if (stored == Qnil) return;
    rb_yield(stored);
}
|
187
|
+
|
188
|
+
/*
 * Trie#levenshtein_search(word, max_distance) { |value| ... }
 *
 * Yields the value of every entry that levenshtein_distance reports for
 * `word` and `max_distance`. Returns nil. Raises TypeError when
 * `max_distance` is not numeric and ArgumentError when `word` contains a
 * NUL byte.
 */
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
    Node *root;
    char *word_cstring;
    int distance_limit;

    Data_Get_Struct(self, Node, root);

    /* The search uses strlen on the word, so require a real C string. */
    word_cstring = StringValueCStr(word);
    /* NUM2INT type-checks the argument; FIX2INT would blindly decode any VALUE. */
    distance_limit = NUM2INT(max_distance);

    levenshtein_distance(root, word_cstring, distance_limit, rb_levensthtein_cb);

    return Qnil;
}
|
data/ext/trie/t.rb
CHANGED
@@ -1,20 +1,22 @@
|
|
1
1
|
require 'trie'
|
2
|
+
require 'rubygems'
|
2
3
|
require 'benchmark'
|
3
4
|
|
4
5
|
t = Trie.new
|
5
6
|
c = 0
|
7
|
+
a = 0
|
6
8
|
s1 = (Benchmark.measure do
|
7
|
-
open('/usr/share/dict/
|
8
|
-
t[w.chop]
|
9
|
+
open('/usr/share/dict/web2').each_line do |w|
|
10
|
+
t[w.chop]= w
|
9
11
|
c += 1
|
10
12
|
end
|
11
13
|
end)
|
12
14
|
|
13
15
|
# %w(gol golas golaster lux xal).each {|w| t[w] = w}
|
14
16
|
s2 = (Benchmark.measure do
|
15
|
-
t.levenshtein_search('
|
17
|
+
t.levenshtein_search('food', 1) {|p| puts p}
|
16
18
|
end)
|
17
19
|
|
18
|
-
puts "#{t.memory/(1024*1024)}Mb, #{c} words"
|
20
|
+
puts "#{t.memory/(1024*1024)}Mb, #{c} words, #{a} unique"
|
19
21
|
puts s1
|
20
22
|
puts s2
|
data/test/trie_test.rb
CHANGED
@@ -23,10 +23,11 @@ time do
|
|
23
23
|
1.upto(max) do |i|
|
24
24
|
t["item #{i}"].class
|
25
25
|
end
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
26
|
+
|
27
|
+
# not implemented yet
|
28
|
+
# 1.upto(max) do |i|
|
29
|
+
# t.delete("item #{i}")
|
30
|
+
# end
|
30
31
|
end
|
31
32
|
|
32
33
|
|
@@ -41,9 +42,9 @@ time do
|
|
41
42
|
t["item #{i}"].class
|
42
43
|
end
|
43
44
|
|
44
|
-
1.upto(max) do |i|
|
45
|
-
|
46
|
-
end
|
45
|
+
# 1.upto(max) do |i|
|
46
|
+
# t.delete("item #{i}")
|
47
|
+
# end
|
47
48
|
end
|
48
49
|
|
49
50
|
# puts "With a Ruby Trie..."
|
metadata
CHANGED
@@ -1,21 +1,21 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: RubyTrie
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 1
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
|
+
- 2
|
7
8
|
- 1
|
8
|
-
|
9
|
-
version: "1.1"
|
9
|
+
version: "2.1"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
|
-
- Matt Freels
|
13
12
|
- Petrica Ghiurca
|
13
|
+
- Matt Freels
|
14
14
|
autorequire: trie
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-03-21 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|
@@ -30,7 +30,11 @@ extra_rdoc_files:
|
|
30
30
|
files:
|
31
31
|
- README
|
32
32
|
- ChangeLog
|
33
|
-
- ext/trie/
|
33
|
+
- ext/trie/levenshtein_distance.h
|
34
|
+
- ext/trie/otrie2.h
|
35
|
+
- ext/trie/levenshtein_distance.c
|
36
|
+
- ext/trie/otrie2.c
|
37
|
+
- ext/trie/ruby_trie.c
|
34
38
|
- ext/trie/extconf.rb
|
35
39
|
- ext/trie/t.rb
|
36
40
|
- lib/ruby_trie.rb
|
data/ext/trie/trie.c
DELETED
@@ -1,408 +0,0 @@
|
|
1
|
-
#include <stdlib.h> /* for malloc, free */
|
2
|
-
#include <string.h> /* for memcmp, memmove */
|
3
|
-
#include "ruby.h"
|
4
|
-
|
5
|
-
// typdefs!
|
6
|
-
typedef enum { false = 0, true} bool;
|
7
|
-
|
8
|
-
typedef struct node {
|
9
|
-
char character;
|
10
|
-
VALUE value;
|
11
|
-
struct node * first_child;
|
12
|
-
struct node * next_sibling;
|
13
|
-
} trie_node;
|
14
|
-
|
15
|
-
static VALUE rb_cTrie;
|
16
|
-
|
17
|
-
// =========================
|
18
|
-
// = function declarations =
|
19
|
-
// =========================
|
20
|
-
|
21
|
-
//trie implementation
|
22
|
-
static trie_node * trie_node_for_key(trie_node * root, char * key, bool create_missing_nodes);
|
23
|
-
static trie_node * trie_sibling_for_char(trie_node * node, char ch);
|
24
|
-
static trie_node * trie_add_sibling_for_char(trie_node * node, char ch);
|
25
|
-
static trie_node * trie_new_node_with_char(char ch);
|
26
|
-
static trie_node * trie_new_node();
|
27
|
-
static VALUE rb_trie_find_children(VALUE self, VALUE key);
|
28
|
-
static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key);
|
29
|
-
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance);
|
30
|
-
static void trie_collect_values(void * t, VALUE prary);
|
31
|
-
static void trie_collect_values_with_yield(void * t);
|
32
|
-
static void trie_traverse(trie_node * trie, void (*lambda_func)(void *));
|
33
|
-
static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*lambda_func)(void *, VALUE));
|
34
|
-
static void free_trie(trie_node * trie);
|
35
|
-
static void count_nodes_callback(void *n, VALUE accum);
|
36
|
-
static VALUE rb_trie_count_nodes(VALUE self);
|
37
|
-
static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length);
|
38
|
-
// int print_arr(char c, int* arr, int len);
|
39
|
-
|
40
|
-
|
41
|
-
// ========================
|
42
|
-
// = function definitions =
|
43
|
-
// ========================
|
44
|
-
|
45
|
-
// instance methods
|
46
|
-
static VALUE rb_trie_get_key(VALUE self, VALUE key) {
|
47
|
-
trie_node * root;
|
48
|
-
trie_node * node;
|
49
|
-
char * key_cstring;
|
50
|
-
|
51
|
-
//Check_Type(key, T_STRING);
|
52
|
-
key_cstring = StringValuePtr(key);
|
53
|
-
|
54
|
-
Data_Get_Struct(self, trie_node, root);
|
55
|
-
|
56
|
-
node = trie_node_for_key(root, key_cstring, false);
|
57
|
-
if (node == NULL) return Qnil;
|
58
|
-
return node->value;
|
59
|
-
}
|
60
|
-
|
61
|
-
static VALUE rb_trie_set_key_to_value(VALUE self, VALUE key, VALUE value) {
|
62
|
-
trie_node * root;
|
63
|
-
trie_node * node;
|
64
|
-
char * key_cstring;
|
65
|
-
|
66
|
-
//Check_Type(key, T_STRING);
|
67
|
-
key_cstring = StringValuePtr(key);
|
68
|
-
|
69
|
-
Data_Get_Struct(self, trie_node, root);
|
70
|
-
|
71
|
-
node = trie_node_for_key(root, key_cstring, true);
|
72
|
-
node->value = value;
|
73
|
-
|
74
|
-
return Qnil;
|
75
|
-
}
|
76
|
-
|
77
|
-
static uint mem_count = 0;
|
78
|
-
|
79
|
-
static void count_nodes_callback(void *n, VALUE accum) {
|
80
|
-
trie_node *node = (trie_node*)n;
|
81
|
-
// rb_big_plus(accum, rb_uint2big(sizeof(*node)));
|
82
|
-
mem_count+=sizeof(*node);
|
83
|
-
}
|
84
|
-
|
85
|
-
static VALUE rb_trie_count_nodes(VALUE self) {
|
86
|
-
trie_node *root;
|
87
|
-
Data_Get_Struct(self, trie_node, root);
|
88
|
-
VALUE accum = rb_uint2big(0);
|
89
|
-
mem_count = 0;
|
90
|
-
trie_traverse_with_context(root, accum, count_nodes_callback);
|
91
|
-
return rb_uint2big(mem_count);
|
92
|
-
}
|
93
|
-
|
94
|
-
static VALUE rb_trie_undef_key(VALUE self, VALUE key) {
|
95
|
-
trie_node * root, * node, * prev, * next;
|
96
|
-
VALUE return_value;
|
97
|
-
char * key_cstring;
|
98
|
-
int steps;
|
99
|
-
int i;
|
100
|
-
|
101
|
-
//Check_Type(key, T_STRING);
|
102
|
-
key_cstring = StringValuePtr(key);
|
103
|
-
|
104
|
-
Data_Get_Struct(self, trie_node, root);
|
105
|
-
next = root;
|
106
|
-
node = NULL;
|
107
|
-
prev = NULL;
|
108
|
-
|
109
|
-
steps = strlen(key_cstring);
|
110
|
-
|
111
|
-
for (i = 0; i < steps; i++) {
|
112
|
-
if (next == NULL) return Qnil;
|
113
|
-
|
114
|
-
while(next->character != key_cstring[i]) {
|
115
|
-
if (next == NULL) return Qnil;
|
116
|
-
next = next->next_sibling;
|
117
|
-
}
|
118
|
-
prev = node;
|
119
|
-
node = next;
|
120
|
-
next = node->first_child;
|
121
|
-
}
|
122
|
-
|
123
|
-
return_value = node->value;
|
124
|
-
node->value = Qnil;
|
125
|
-
|
126
|
-
if (node->first_child == NULL) { //node has no children. we can delete it.
|
127
|
-
if (prev == NULL) {
|
128
|
-
//printf("should delete root");
|
129
|
-
} else if (prev->first_child == node) {
|
130
|
-
prev->first_child = node->next_sibling;
|
131
|
-
free(node);
|
132
|
-
} else if (prev->next_sibling == node) {
|
133
|
-
prev->next_sibling = node->next_sibling;
|
134
|
-
free(node);
|
135
|
-
}
|
136
|
-
}
|
137
|
-
|
138
|
-
return return_value;
|
139
|
-
}
|
140
|
-
|
141
|
-
// garbage collection and allocation
|
142
|
-
static void trie_mark_value(void * t) {
|
143
|
-
rb_gc_mark( ((trie_node *)t)->value );
|
144
|
-
}
|
145
|
-
|
146
|
-
static void rb_trie_mark(trie_node * t) {
|
147
|
-
trie_traverse(t, trie_mark_value);
|
148
|
-
}
|
149
|
-
|
150
|
-
static void rb_trie_free(trie_node * t) {
|
151
|
-
free_trie(t);
|
152
|
-
}
|
153
|
-
|
154
|
-
static VALUE rb_trie_allocate (VALUE klass) {
|
155
|
-
trie_node * t = trie_new_node();
|
156
|
-
|
157
|
-
return Data_Wrap_Struct(klass, rb_trie_mark, rb_trie_free, t);
|
158
|
-
}
|
159
|
-
|
160
|
-
// extension init
|
161
|
-
void Init_trie() {
|
162
|
-
rb_cTrie = rb_define_class("Trie", rb_cObject);
|
163
|
-
|
164
|
-
rb_define_alloc_func (rb_cTrie, rb_trie_allocate);
|
165
|
-
|
166
|
-
int arg_count = 0;
|
167
|
-
//rb_define_method(rb_cTrie, "inspect", rb_trie_inspect, arg_count);
|
168
|
-
rb_define_method(rb_cTrie, "memory", rb_trie_count_nodes, arg_count);
|
169
|
-
|
170
|
-
arg_count = 1;
|
171
|
-
rb_define_method(rb_cTrie, "[]", rb_trie_get_key, arg_count);
|
172
|
-
rb_define_method(rb_cTrie, "delete", rb_trie_undef_key, arg_count);
|
173
|
-
rb_define_method(rb_cTrie, "children", rb_trie_find_children, arg_count);
|
174
|
-
rb_define_method(rb_cTrie, "each", rb_trie_find_children_with_block, arg_count);
|
175
|
-
|
176
|
-
arg_count = 2;
|
177
|
-
rb_define_method(rb_cTrie, "[]=", rb_trie_set_key_to_value, arg_count);
|
178
|
-
// trie.levenshtein_search(word, max_distance)
|
179
|
-
rb_define_method(rb_cTrie, "levenshtein_search", rb_trie_levenshtein_search, arg_count);
|
180
|
-
}
|
181
|
-
|
182
|
-
|
183
|
-
// =======================
|
184
|
-
// = trie implementation =
|
185
|
-
// =======================
|
186
|
-
|
187
|
-
static trie_node * trie_node_for_key(trie_node * root, char * key, bool create_missing_nodes) {
|
188
|
-
int steps, i;
|
189
|
-
trie_node * next, * node;
|
190
|
-
|
191
|
-
steps = strlen(key);
|
192
|
-
next = root;
|
193
|
-
|
194
|
-
for (i = 0; i < steps; i++) {
|
195
|
-
if (next == NULL) {
|
196
|
-
if (create_missing_nodes) {
|
197
|
-
node->first_child = trie_new_node();
|
198
|
-
next = node->first_child;
|
199
|
-
}
|
200
|
-
else return NULL;
|
201
|
-
}
|
202
|
-
|
203
|
-
node = trie_sibling_for_char(next, key[i]);
|
204
|
-
|
205
|
-
if (node == NULL) {
|
206
|
-
if (create_missing_nodes) {
|
207
|
-
node = trie_add_sibling_for_char(next, key[i]);
|
208
|
-
}
|
209
|
-
else return NULL;
|
210
|
-
}
|
211
|
-
|
212
|
-
next = node->first_child;
|
213
|
-
}
|
214
|
-
|
215
|
-
return node;
|
216
|
-
}
|
217
|
-
|
218
|
-
static void trie_collect_values(void * t, VALUE rary) {
|
219
|
-
trie_node *node = (trie_node*)t;
|
220
|
-
if (node->value != Qnil) {
|
221
|
-
rb_ary_push(rary, node->value);
|
222
|
-
}
|
223
|
-
}
|
224
|
-
|
225
|
-
static void trie_collect_values_with_yield(void * t) {
|
226
|
-
trie_node *node = (trie_node*)t;
|
227
|
-
if (node->value != Qnil) {
|
228
|
-
// rb_ary_push(rary, node->value);
|
229
|
-
rb_yield(node->value);
|
230
|
-
}
|
231
|
-
}
|
232
|
-
|
233
|
-
static VALUE rb_trie_find_children(VALUE self, VALUE key) {
|
234
|
-
trie_node * root;
|
235
|
-
trie_node * node;
|
236
|
-
char * key_cstring;
|
237
|
-
VALUE rary = rb_ary_new();
|
238
|
-
|
239
|
-
key_cstring = StringValuePtr(key);
|
240
|
-
Data_Get_Struct(self, trie_node, root);
|
241
|
-
|
242
|
-
node = trie_node_for_key(root, key_cstring, false);
|
243
|
-
|
244
|
-
if (node != NULL && node->value != Qnil) {
|
245
|
-
rb_ary_push(rary, node->value);
|
246
|
-
}
|
247
|
-
|
248
|
-
if (node == NULL || node->first_child == NULL) return rary;
|
249
|
-
|
250
|
-
trie_traverse_with_context(node->first_child, rary, trie_collect_values);
|
251
|
-
return rary;
|
252
|
-
}
|
253
|
-
|
254
|
-
|
255
|
-
static VALUE rb_trie_find_children_with_block(VALUE self, VALUE key) {
|
256
|
-
trie_node * root;
|
257
|
-
trie_node * node;
|
258
|
-
char * key_cstring;
|
259
|
-
VALUE rary = rb_ary_new();
|
260
|
-
|
261
|
-
key_cstring = StringValuePtr(key);
|
262
|
-
Data_Get_Struct(self, trie_node, root);
|
263
|
-
|
264
|
-
node = trie_node_for_key(root, key_cstring, false);
|
265
|
-
|
266
|
-
if (node != NULL && node->value != Qnil) {
|
267
|
-
rb_yield(node->value);
|
268
|
-
}
|
269
|
-
|
270
|
-
if (node == NULL || node->first_child == NULL) return rary;
|
271
|
-
|
272
|
-
trie_traverse(node->first_child, trie_collect_values_with_yield);
|
273
|
-
return rary;
|
274
|
-
}
|
275
|
-
|
276
|
-
static VALUE rb_trie_levenshtein_search(VALUE self, VALUE word, VALUE max_distance) {
|
277
|
-
trie_node *root;
|
278
|
-
trie_node *node;
|
279
|
-
char *word_cstring;
|
280
|
-
VALUE rary = rb_ary_new();
|
281
|
-
int i=0;
|
282
|
-
|
283
|
-
Data_Get_Struct(self, trie_node, root);
|
284
|
-
|
285
|
-
word_cstring = StringValuePtr(word);
|
286
|
-
|
287
|
-
int first_line[strlen(word_cstring) + 1];
|
288
|
-
for(; i < strlen(word_cstring) + 1; i++) {
|
289
|
-
first_line[i] = i;
|
290
|
-
}
|
291
|
-
// print_arr('R', first_line, strlen(word_cstring)+1);
|
292
|
-
recursive_levenshtein_search(root->next_sibling, rary, first_line, FIX2INT(max_distance), word_cstring, strlen(word_cstring));
|
293
|
-
|
294
|
-
return rary;
|
295
|
-
}
|
296
|
-
|
297
|
-
int minimum(int* numbers, int len) {
|
298
|
-
int minValue = numbers[0];
|
299
|
-
int i;
|
300
|
-
for(i=1; i<len; i++) {
|
301
|
-
if (numbers[i] < minValue) minValue = numbers[i];
|
302
|
-
}
|
303
|
-
return minValue;
|
304
|
-
}
|
305
|
-
|
306
|
-
int min3(int a, int b, int c) {
|
307
|
-
int min = a;
|
308
|
-
if (b < min) min = b;
|
309
|
-
if (c < min) min = c;
|
310
|
-
return min;
|
311
|
-
}
|
312
|
-
|
313
|
-
static recursive_levenshtein_search(trie_node* trie, VALUE rary, int* prev_line, int max_dist, char* word, int word_length) {
|
314
|
-
int cur_line[word_length + 1];
|
315
|
-
int i,j, insert_cost, replace_cost, delete_cost;
|
316
|
-
VALUE carr;
|
317
|
-
|
318
|
-
cur_line[0] = prev_line[0] + 1;
|
319
|
-
|
320
|
-
for(i=1; i < word_length + 1; i++) {
|
321
|
-
insert_cost = cur_line[i-1] + 1;
|
322
|
-
delete_cost = prev_line[i] + 1;
|
323
|
-
if (trie->character != word[i-1]) {
|
324
|
-
replace_cost = prev_line[i-1] + 1;
|
325
|
-
} else {
|
326
|
-
replace_cost = prev_line[i-1];
|
327
|
-
}
|
328
|
-
cur_line[i] = min3(insert_cost, delete_cost, replace_cost);
|
329
|
-
}
|
330
|
-
|
331
|
-
|
332
|
-
if (cur_line[word_length] <= max_dist && trie->value != Qnil) {
|
333
|
-
carr = rb_ary_new();
|
334
|
-
rb_ary_push(carr, trie->value);
|
335
|
-
rb_ary_push(carr, INT2FIX(cur_line[word_length]));
|
336
|
-
rb_ary_push(rary, carr);
|
337
|
-
}
|
338
|
-
|
339
|
-
if (minimum(cur_line, word_length + 1) <= max_dist) {
|
340
|
-
if (trie->first_child != NULL)
|
341
|
-
recursive_levenshtein_search(trie->first_child, rary, cur_line, max_dist, word, word_length);
|
342
|
-
if (trie->next_sibling != NULL)
|
343
|
-
recursive_levenshtein_search(trie->next_sibling, rary, prev_line, max_dist, word, word_length);
|
344
|
-
}
|
345
|
-
}
|
346
|
-
|
347
|
-
static trie_node * trie_sibling_for_char(trie_node * node, char ch) {
|
348
|
-
while(true) {
|
349
|
-
if (node == NULL) return NULL;
|
350
|
-
|
351
|
-
if (node->character == ch) return node;
|
352
|
-
|
353
|
-
node = node->next_sibling;
|
354
|
-
}
|
355
|
-
return node;
|
356
|
-
}
|
357
|
-
|
358
|
-
static trie_node * trie_add_sibling_for_char(trie_node * node, char ch) {
|
359
|
-
trie_node * current_next;
|
360
|
-
|
361
|
-
current_next = node->next_sibling;
|
362
|
-
node->next_sibling = trie_new_node_with_char(ch);
|
363
|
-
node->next_sibling->next_sibling = current_next;
|
364
|
-
|
365
|
-
return node->next_sibling;
|
366
|
-
}
|
367
|
-
|
368
|
-
static trie_node * trie_new_node_with_char(char ch) {
|
369
|
-
trie_node * trie;
|
370
|
-
trie = malloc(sizeof(trie_node));
|
371
|
-
trie->character = ch;
|
372
|
-
trie->value = Qnil;
|
373
|
-
trie->first_child = NULL;
|
374
|
-
trie->next_sibling = NULL;
|
375
|
-
return trie;
|
376
|
-
}
|
377
|
-
|
378
|
-
static trie_node * trie_new_node() {
|
379
|
-
return trie_new_node_with_char('s'); //insert most common starting letter here.
|
380
|
-
}
|
381
|
-
|
382
|
-
static void trie_traverse(trie_node * trie, void (* lambda_func)(void *)) {
|
383
|
-
if (trie->next_sibling != NULL) {
|
384
|
-
trie_traverse(trie->next_sibling, lambda_func);
|
385
|
-
}
|
386
|
-
|
387
|
-
if (trie->first_child != NULL) {
|
388
|
-
trie_traverse(trie->first_child, lambda_func);
|
389
|
-
}
|
390
|
-
|
391
|
-
lambda_func(trie);
|
392
|
-
}
|
393
|
-
|
394
|
-
static void trie_traverse_with_context(trie_node * trie, VALUE context, void (*lambda_func)(void *, VALUE)) {
|
395
|
-
if (trie->next_sibling != NULL) {
|
396
|
-
trie_traverse_with_context(trie->next_sibling, context, lambda_func);
|
397
|
-
}
|
398
|
-
|
399
|
-
if (trie->first_child != NULL) {
|
400
|
-
trie_traverse_with_context(trie->first_child, context, lambda_func);
|
401
|
-
}
|
402
|
-
|
403
|
-
lambda_func(trie, context);
|
404
|
-
}
|
405
|
-
|
406
|
-
static void free_trie(trie_node * trie) {
|
407
|
-
trie_traverse(trie, free);
|
408
|
-
}
|