wordtriez 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/extconf.rb +1 -1
- data/ext/hat-trie/text.c +38 -20
- data/ext/hat-trie/text.h +2 -2
- data/ext/{triez.cc → wordtriez.cc} +9 -5
- data/lib/wordtriez.rb +11 -3
- data/test/{triez_test.rb → wordtriez_test.rb} +19 -19
- metadata +4 -4
data/ext/extconf.rb
CHANGED
data/ext/hat-trie/text.c
CHANGED
@@ -101,21 +101,44 @@ void text_clean(char* text)
|
|
101
101
|
*write = '\0';
|
102
102
|
}
|
103
103
|
|
104
|
-
void add_ngrams(hattrie_t* trie, int upto_n, char* text)
|
104
|
+
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
|
105
105
|
{
|
106
106
|
char blank_suffix[] = "\0";
|
107
|
-
add_ngrams_with_suffix(trie, upto_n, text, blank_suffix);
|
107
|
+
add_ngrams_with_suffix(trie, upto_n, text, blank_suffix, incr_existing_keys_only);
|
108
108
|
}
|
109
109
|
|
110
|
-
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix)
|
110
|
+
inline void incr_value(
|
111
|
+
hattrie_t* trie,
|
112
|
+
char* buffer,
|
113
|
+
char* buffer_pre,
|
114
|
+
char* head,
|
115
|
+
size_t len,
|
116
|
+
size_t suffix_len,
|
117
|
+
uint8_t incr_existing_keys_only)
|
118
|
+
{
|
119
|
+
value_t* value = NULL;
|
120
|
+
|
121
|
+
assert(buffer_pre - len >= buffer);
|
122
|
+
memcpy(buffer_pre - len, head, len);
|
123
|
+
if (incr_existing_keys_only) {
|
124
|
+
value = hattrie_tryget(trie, buffer_pre - len, len + suffix_len);
|
125
|
+
if (value) {
|
126
|
+
(*value)++;
|
127
|
+
}
|
128
|
+
} else {
|
129
|
+
value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
|
130
|
+
(*value)++;
|
131
|
+
}
|
132
|
+
|
133
|
+
}
|
134
|
+
|
135
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only)
|
111
136
|
{
|
112
137
|
char* head = text;
|
113
138
|
char* tail = text;
|
114
139
|
char* next_head = text;
|
115
140
|
char* next_tail = text;
|
116
141
|
int word_count = 0;
|
117
|
-
value_t* value = NULL;
|
118
|
-
size_t len = 0;
|
119
142
|
|
120
143
|
if (*text == '\0') return;
|
121
144
|
|
@@ -134,11 +157,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
134
157
|
next_tail = tail;
|
135
158
|
}
|
136
159
|
if (word_count <= upto_n) {
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
|
141
|
-
(*value)++;
|
160
|
+
incr_value(trie, buffer, buffer_pre,
|
161
|
+
head, tail - head, suffix_len,
|
162
|
+
incr_existing_keys_only);
|
142
163
|
}
|
143
164
|
if (word_count == upto_n) {
|
144
165
|
head = next_head;
|
@@ -153,20 +174,17 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
153
174
|
} while(*tail);
|
154
175
|
|
155
176
|
// add the last ngram of size upto_n
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
|
160
|
-
(*value)++;
|
177
|
+
incr_value(trie, buffer, buffer_pre,
|
178
|
+
head, tail - head, suffix_len,
|
179
|
+
incr_existing_keys_only);
|
161
180
|
|
181
|
+
// add the 1..(upto_n-1) sized ngrams at the tail
|
162
182
|
if (upto_n > 1) {
|
163
183
|
while(*head) {
|
164
184
|
if(*head == ' ' || *head == '.') {
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
|
169
|
-
(*value)++;
|
185
|
+
incr_value(trie, buffer, buffer_pre,
|
186
|
+
head + 1, tail - head - 1, suffix_len,
|
187
|
+
incr_existing_keys_only);
|
170
188
|
}
|
171
189
|
head++;
|
172
190
|
}
|
data/ext/hat-trie/text.h
CHANGED
@@ -12,8 +12,8 @@ extern "C" {
|
|
12
12
|
#define NGRAM_BUFFER_SIZE 4096
|
13
13
|
|
14
14
|
void text_clean(char* text);
|
15
|
-
void add_ngrams(hattrie_t* trie, int upto_n, char* text);
|
16
|
-
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix);
|
15
|
+
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
|
16
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
|
17
17
|
|
18
18
|
#ifdef __cplusplus
|
19
19
|
}
|
data/ext/{triez.cc → wordtriez.cc}
CHANGED
@@ -273,7 +273,7 @@ static VALUE hat_walk(VALUE self, VALUE key) {
|
|
273
273
|
return data.arr;
|
274
274
|
}
|
275
275
|
|
276
|
-
static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
|
276
|
+
static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VALUE incr_existing_keys_only) {
|
277
277
|
// rb_str_dup
|
278
278
|
hattrie_t* p;
|
279
279
|
HatTrie* ht;
|
@@ -283,7 +283,11 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
|
|
283
283
|
char* ctext = StringValueCStr(text);
|
284
284
|
text_clean(ctext);
|
285
285
|
|
286
|
-
add_ngrams_with_suffix(p,
|
286
|
+
add_ngrams_with_suffix(p,
|
287
|
+
FIX2INT(ngrams),
|
288
|
+
ctext,
|
289
|
+
StringValueCStr(suffix),
|
290
|
+
RTEST(incr_existing_keys_only));
|
287
291
|
|
288
292
|
return self;
|
289
293
|
// rb_str_substr
|
@@ -292,8 +296,8 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
|
|
292
296
|
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
293
297
|
|
294
298
|
extern "C"
|
295
|
-
void Init_triez() {
|
296
|
-
hat_class = rb_define_class("Triez", rb_cObject);
|
299
|
+
void Init_wordtriez() {
|
300
|
+
hat_class = rb_define_class("Wordtriez", rb_cObject);
|
297
301
|
u8_enc = rb_utf8_encoding();
|
298
302
|
bin_enc = rb_ascii8bit_encoding();
|
299
303
|
|
@@ -309,5 +313,5 @@ void Init_triez() {
|
|
309
313
|
DEF(hat_class, "delete", hat_del, 1);
|
310
314
|
DEF(hat_class, "_internal_search", hat_search, 4);
|
311
315
|
DEF(hat_class, "_internal_walk", hat_walk, 1);
|
312
|
-
DEF(hat_class, "
|
316
|
+
DEF(hat_class, "_internal_add_text", hat_add_text, 4);
|
313
317
|
}
|
data/lib/wordtriez.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
require_relative "../ext/triez"
|
1
|
+
require_relative "../ext/wordtriez"
|
2
2
|
|
3
|
-
class Triez
|
4
|
-
VERSION = '0.0.1'
|
3
|
+
class Wordtriez
|
4
|
+
VERSION = '0.0.2'
|
5
5
|
|
6
6
|
private :_internal_set_type
|
7
7
|
private :_internal_search
|
@@ -62,4 +62,12 @@ class Triez
|
|
62
62
|
a
|
63
63
|
end
|
64
64
|
end
|
65
|
+
|
66
|
+
def add_text! text, ngrams, suffix=""
|
67
|
+
_internal_add_text(text, ngrams, suffix, false)
|
68
|
+
end
|
69
|
+
|
70
|
+
def union_text! text, ngrams, suffix=""
|
71
|
+
_internal_add_text(text, ngrams, suffix, true)
|
72
|
+
end
|
65
73
|
end
|
data/test/{triez_test.rb → wordtriez_test.rb}
CHANGED
@@ -1,28 +1,28 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
require "test/unit"
|
3
|
-
require_relative "../lib/
|
3
|
+
require_relative "../lib/wordtriez"
|
4
4
|
|
5
5
|
GC.stress
|
6
6
|
|
7
|
-
class TriezTest < Test::Unit::TestCase
|
7
|
+
class WordtriezTest < Test::Unit::TestCase
|
8
8
|
def test_init_type_options
|
9
|
-
t = Triez.new value_type: :int64
|
9
|
+
t = Wordtriez.new value_type: :int64
|
10
10
|
assert_equal :int64, t.value_type
|
11
|
-
t = Triez.new value_type: :object
|
11
|
+
t = Wordtriez.new value_type: :object
|
12
12
|
assert_equal :object, t.value_type
|
13
|
-
t = Triez.new
|
13
|
+
t = Wordtriez.new
|
14
14
|
assert_equal :int64, t.value_type
|
15
15
|
|
16
16
|
assert_raise ArgumentError do
|
17
|
-
|
17
|
+
Wordtriez.new value_type: :string
|
18
18
|
end
|
19
19
|
assert_raise ArgumentError do
|
20
|
-
|
20
|
+
Wordtriez.new invalid_option: :int64
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
24
|
def test_hat_trie
|
25
|
-
t = Triez.new value_type: :object
|
25
|
+
t = Wordtriez.new value_type: :object
|
26
26
|
|
27
27
|
v1 = (1 << 40)
|
28
28
|
v2 = (1 << 141)
|
@@ -47,7 +47,7 @@ class TriezTest < Test::Unit::TestCase
|
|
47
47
|
end
|
48
48
|
|
49
49
|
def test_insertion_and_search_on_many_keys
|
50
|
-
t = Triez.new
|
50
|
+
t = Wordtriez.new
|
51
51
|
as = ('A'..'z').to_a
|
52
52
|
bs = ('一'..'百').to_a
|
53
53
|
as.each do |a|
|
@@ -70,7 +70,7 @@ class TriezTest < Test::Unit::TestCase
|
|
70
70
|
end
|
71
71
|
|
72
72
|
def test_each_and_raise
|
73
|
-
t = Triez.new
|
73
|
+
t = Wordtriez.new
|
74
74
|
t['abcd'] = 0
|
75
75
|
t['abc'] = 1
|
76
76
|
|
@@ -86,7 +86,7 @@ class TriezTest < Test::Unit::TestCase
|
|
86
86
|
end
|
87
87
|
|
88
88
|
def test_append
|
89
|
-
t = Triez.new
|
89
|
+
t = Wordtriez.new
|
90
90
|
('a'..'z').each do |c|
|
91
91
|
t << c
|
92
92
|
end
|
@@ -101,7 +101,7 @@ class TriezTest < Test::Unit::TestCase
|
|
101
101
|
'ATACGGTCCA' => 2,
|
102
102
|
'GCTTGTACGT' => 3
|
103
103
|
}
|
104
|
-
t = Triez.new
|
104
|
+
t = Wordtriez.new
|
105
105
|
sequences.each do |seq, id|
|
106
106
|
t.change_all(:suffix, seq){ id }
|
107
107
|
end
|
@@ -109,7 +109,7 @@ class TriezTest < Test::Unit::TestCase
|
|
109
109
|
end
|
110
110
|
|
111
111
|
def test_nul_char_in_keys
|
112
|
-
t = Triez.new
|
112
|
+
t = Wordtriez.new
|
113
113
|
t["a\0b"] = 1
|
114
114
|
assert_equal 1, t["a\0b"]
|
115
115
|
assert_equal 1, t.size
|
@@ -118,7 +118,7 @@ class TriezTest < Test::Unit::TestCase
|
|
118
118
|
|
119
119
|
def test_change_all_with_prefix
|
120
120
|
default = 10
|
121
|
-
t = Triez.new default: default
|
121
|
+
t = Wordtriez.new default: default
|
122
122
|
t['regexp'] = 1
|
123
123
|
t['readme'] = 2
|
124
124
|
t.change_all :prefix, 'readme' do |v|
|
@@ -131,7 +131,7 @@ class TriezTest < Test::Unit::TestCase
|
|
131
131
|
end
|
132
132
|
|
133
133
|
def test_change_all_with_suffix
|
134
|
-
t = Triez.new
|
134
|
+
t = Wordtriez.new
|
135
135
|
t['regexp'] = 1
|
136
136
|
t['exp'] = 2
|
137
137
|
t['reg'] = 3
|
@@ -145,7 +145,7 @@ class TriezTest < Test::Unit::TestCase
|
|
145
145
|
end
|
146
146
|
|
147
147
|
def test_change_all_with_substring
|
148
|
-
t = Triez.new value_type: :object
|
148
|
+
t = Wordtriez.new value_type: :object
|
149
149
|
t.change_all :substring, 'abc' do
|
150
150
|
1
|
151
151
|
end
|
@@ -163,7 +163,7 @@ class TriezTest < Test::Unit::TestCase
|
|
163
163
|
/users/12/edit
|
164
164
|
/posts
|
165
165
|
]
|
166
|
-
t = Triez.new value_type: :object
|
166
|
+
t = Wordtriez.new value_type: :object
|
167
167
|
urls.each_with_index do |url, i|
|
168
168
|
t[url] = i.to_s
|
169
169
|
end
|
@@ -195,7 +195,7 @@ class TriezTest < Test::Unit::TestCase
|
|
195
195
|
|
196
196
|
# value is bitset representing id of the sentence
|
197
197
|
# in ruby we can use integers of arbitrary length as bitsets
|
198
|
-
t = Triez.new value_type: :object, default: 0
|
198
|
+
t = Wordtriez.new value_type: :object, default: 0
|
199
199
|
|
200
200
|
sentences.each_with_index do |sentence, i|
|
201
201
|
elem = 1 << i
|
@@ -215,7 +215,7 @@ class TriezTest < Test::Unit::TestCase
|
|
215
215
|
end
|
216
216
|
|
217
217
|
def test_should_not_segfault_when_search_with_prefix
|
218
|
-
t = Triez.new
|
218
|
+
t = Wordtriez.new
|
219
219
|
# bursts when 16384
|
220
220
|
16_385.times{ |i| t["a#{i}"] = i }
|
221
221
|
expected_postfices = 16_385.times.map &:to_s
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wordtriez
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-09-
|
13
|
+
date: 2014-09-23 00:00:00.000000000 Z
|
14
14
|
dependencies: []
|
15
15
|
description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
|
16
16
|
email:
|
@@ -23,8 +23,8 @@ files:
|
|
23
23
|
- changes
|
24
24
|
- readme.md
|
25
25
|
- lib/wordtriez.rb
|
26
|
-
- test/triez_test.rb
|
27
|
-
- ext/triez.cc
|
26
|
+
- test/wordtriez_test.rb
|
27
|
+
- ext/wordtriez.cc
|
28
28
|
- ext/common.h
|
29
29
|
- ext/extconf.rb
|
30
30
|
- ext/hat-trie/ahtable.c
|