wordtriez 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/extconf.rb +1 -1
- data/ext/hat-trie/text.c +38 -20
- data/ext/hat-trie/text.h +2 -2
- data/ext/{triez.cc → wordtriez.cc} +9 -5
- data/lib/wordtriez.rb +11 -3
- data/test/{triez_test.rb → wordtriez_test.rb} +19 -19
- metadata +4 -4
data/ext/extconf.rb
CHANGED
data/ext/hat-trie/text.c
CHANGED
@@ -101,21 +101,44 @@ void text_clean(char* text)
|
|
101
101
|
*write = '\0';
|
102
102
|
}
|
103
103
|
|
104
|
-
void add_ngrams(hattrie_t* trie, int upto_n, char* text)
|
104
|
+
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
|
105
105
|
{
|
106
106
|
char blank_suffix[] = "\0";
|
107
|
-
add_ngrams_with_suffix(trie, upto_n, text, blank_suffix);
|
107
|
+
add_ngrams_with_suffix(trie, upto_n, text, blank_suffix, incr_existing_keys_only);
|
108
108
|
}
|
109
109
|
|
110
|
-
void
|
110
|
+
inline void incr_value(
|
111
|
+
hattrie_t* trie,
|
112
|
+
char* buffer,
|
113
|
+
char* buffer_pre,
|
114
|
+
char* head,
|
115
|
+
size_t len,
|
116
|
+
size_t suffix_len,
|
117
|
+
uint8_t incr_existing_keys_only)
|
118
|
+
{
|
119
|
+
value_t* value = NULL;
|
120
|
+
|
121
|
+
assert(buffer_pre - len >= buffer);
|
122
|
+
memcpy(buffer_pre - len, head, len);
|
123
|
+
if (incr_existing_keys_only) {
|
124
|
+
value = hattrie_tryget(trie, buffer_pre - len, len + suffix_len);
|
125
|
+
if (value) {
|
126
|
+
(*value)++;
|
127
|
+
}
|
128
|
+
} else {
|
129
|
+
value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
|
130
|
+
(*value)++;
|
131
|
+
}
|
132
|
+
|
133
|
+
}
|
134
|
+
|
135
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only)
|
111
136
|
{
|
112
137
|
char* head = text;
|
113
138
|
char* tail = text;
|
114
139
|
char* next_head = text;
|
115
140
|
char* next_tail = text;
|
116
141
|
int word_count = 0;
|
117
|
-
value_t* value = NULL;
|
118
|
-
size_t len = 0;
|
119
142
|
|
120
143
|
if (*text == '\0') return;
|
121
144
|
|
@@ -134,11 +157,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
134
157
|
next_tail = tail;
|
135
158
|
}
|
136
159
|
if (word_count <= upto_n) {
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
|
141
|
-
(*value)++;
|
160
|
+
incr_value(trie, buffer, buffer_pre,
|
161
|
+
head, tail - head, suffix_len,
|
162
|
+
incr_existing_keys_only);
|
142
163
|
}
|
143
164
|
if (word_count == upto_n) {
|
144
165
|
head = next_head;
|
@@ -153,20 +174,17 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
153
174
|
} while(*tail);
|
154
175
|
|
155
176
|
// add the last ngram of size upto_n
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
|
160
|
-
(*value)++;
|
177
|
+
incr_value(trie, buffer, buffer_pre,
|
178
|
+
head, tail - head, suffix_len,
|
179
|
+
incr_existing_keys_only);
|
161
180
|
|
181
|
+
// add the 1..(upto_n-1) sized ngrams at the tail
|
162
182
|
if (upto_n > 1) {
|
163
183
|
while(*head) {
|
164
184
|
if(*head == ' ' || *head == '.') {
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
|
169
|
-
(*value)++;
|
185
|
+
incr_value(trie, buffer, buffer_pre,
|
186
|
+
head + 1, tail - head - 1, suffix_len,
|
187
|
+
incr_existing_keys_only);
|
170
188
|
}
|
171
189
|
head++;
|
172
190
|
}
|
data/ext/hat-trie/text.h
CHANGED
@@ -12,8 +12,8 @@ extern "C" {
|
|
12
12
|
#define NGRAM_BUFFER_SIZE 4096
|
13
13
|
|
14
14
|
void text_clean(char* text);
|
15
|
-
void add_ngrams(hattrie_t* trie, int upto_n, char* text);
|
16
|
-
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix);
|
15
|
+
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
|
16
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
|
17
17
|
|
18
18
|
#ifdef __cplusplus
|
19
19
|
}
|
@@ -273,7 +273,7 @@ static VALUE hat_walk(VALUE self, VALUE key) {
|
|
273
273
|
return data.arr;
|
274
274
|
}
|
275
275
|
|
276
|
-
static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
|
276
|
+
static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VALUE incr_existing_keys_only) {
|
277
277
|
// rb_str_dup
|
278
278
|
hattrie_t* p;
|
279
279
|
HatTrie* ht;
|
@@ -283,7 +283,11 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
|
|
283
283
|
char* ctext = StringValueCStr(text);
|
284
284
|
text_clean(ctext);
|
285
285
|
|
286
|
-
add_ngrams_with_suffix(p,
|
286
|
+
add_ngrams_with_suffix(p,
|
287
|
+
FIX2INT(ngrams),
|
288
|
+
ctext,
|
289
|
+
StringValueCStr(suffix),
|
290
|
+
RTEST(incr_existing_keys_only));
|
287
291
|
|
288
292
|
return self;
|
289
293
|
// rb_str_substr
|
@@ -292,8 +296,8 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
|
|
292
296
|
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
293
297
|
|
294
298
|
extern "C"
|
295
|
-
void
|
296
|
-
hat_class = rb_define_class("
|
299
|
+
void Init_wordtriez() {
|
300
|
+
hat_class = rb_define_class("Wordtriez", rb_cObject);
|
297
301
|
u8_enc = rb_utf8_encoding();
|
298
302
|
bin_enc = rb_ascii8bit_encoding();
|
299
303
|
|
@@ -309,5 +313,5 @@ void Init_triez() {
|
|
309
313
|
DEF(hat_class, "delete", hat_del, 1);
|
310
314
|
DEF(hat_class, "_internal_search", hat_search, 4);
|
311
315
|
DEF(hat_class, "_internal_walk", hat_walk, 1);
|
312
|
-
DEF(hat_class, "
|
316
|
+
DEF(hat_class, "_internal_add_text", hat_add_text, 4);
|
313
317
|
}
|
data/lib/wordtriez.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
require_relative "../ext/
|
1
|
+
require_relative "../ext/wordtriez"
|
2
2
|
|
3
|
-
class
|
4
|
-
VERSION = '
|
3
|
+
class Wordtriez
|
4
|
+
VERSION = '0.0.2'
|
5
5
|
|
6
6
|
private :_internal_set_type
|
7
7
|
private :_internal_search
|
@@ -62,4 +62,12 @@ class Triez
|
|
62
62
|
a
|
63
63
|
end
|
64
64
|
end
|
65
|
+
|
66
|
+
# Feed +text+ to the native n-gram counter.
#
# Delegates to _internal_add_text with the final flag set to false; on the C
# side that flag is forwarded as incr_existing_keys_only, so keys are looked
# up with hattrie_get. +ngrams+ is passed through as the maximum n-gram size
# and +suffix+ (default "") as the key suffix.
# NOTE(review): the native method runs text_clean on the string's own buffer,
# so +text+ looks to be modified in place -- confirm before passing a string
# that must stay intact.
# Returns the result of _internal_add_text.
def add_text!(text, ngrams, suffix = "")
existing_keys_only = false
_internal_add_text(text, ngrams, suffix, existing_keys_only)
end
|
69
|
+
|
70
|
+
# Feed +text+ to the native n-gram counter, touching only existing keys.
#
# Delegates to _internal_add_text with the final flag set to true; on the C
# side that flag is forwarded as incr_existing_keys_only, so keys are looked
# up with hattrie_tryget and an n-gram whose key is not found is left out.
# +ngrams+ is passed through as the maximum n-gram size and +suffix+
# (default "") as the key suffix.
# NOTE(review): the native method runs text_clean on the string's own buffer,
# so +text+ looks to be modified in place -- confirm before passing a string
# that must stay intact.
# Returns the result of _internal_add_text.
def union_text!(text, ngrams, suffix = "")
existing_keys_only = true
_internal_add_text(text, ngrams, suffix, existing_keys_only)
end
|
65
73
|
end
|
@@ -1,28 +1,28 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
require "test/unit"
|
3
|
-
require_relative "../lib/
|
3
|
+
require_relative "../lib/wordtriez"
|
4
4
|
|
5
5
|
GC.stress
|
6
6
|
|
7
|
-
class
|
7
|
+
class WordtriezTest < Test::Unit::TestCase
|
8
8
|
def test_init_type_options
|
9
|
-
t =
|
9
|
+
t = Wordtriez.new value_type: :int64
|
10
10
|
assert_equal :int64, t.value_type
|
11
|
-
t =
|
11
|
+
t = Wordtriez.new value_type: :object
|
12
12
|
assert_equal :object, t.value_type
|
13
|
-
t =
|
13
|
+
t = Wordtriez.new
|
14
14
|
assert_equal :int64, t.value_type
|
15
15
|
|
16
16
|
assert_raise ArgumentError do
|
17
|
-
|
17
|
+
Wordtriez.new value_type: :string
|
18
18
|
end
|
19
19
|
assert_raise ArgumentError do
|
20
|
-
|
20
|
+
Wordtriez.new invalid_option: :int64
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
24
|
def test_hat_trie
|
25
|
-
t =
|
25
|
+
t = Wordtriez.new value_type: :object
|
26
26
|
|
27
27
|
v1 = (1 << 40)
|
28
28
|
v2 = (1 << 141)
|
@@ -47,7 +47,7 @@ class TriezTest < Test::Unit::TestCase
|
|
47
47
|
end
|
48
48
|
|
49
49
|
def test_insertion_and_search_on_many_keys
|
50
|
-
t =
|
50
|
+
t = Wordtriez.new
|
51
51
|
as = ('A'..'z').to_a
|
52
52
|
bs = ('一'..'百').to_a
|
53
53
|
as.each do |a|
|
@@ -70,7 +70,7 @@ class TriezTest < Test::Unit::TestCase
|
|
70
70
|
end
|
71
71
|
|
72
72
|
def test_each_and_raise
|
73
|
-
t =
|
73
|
+
t = Wordtriez.new
|
74
74
|
t['abcd'] = 0
|
75
75
|
t['abc'] = 1
|
76
76
|
|
@@ -86,7 +86,7 @@ class TriezTest < Test::Unit::TestCase
|
|
86
86
|
end
|
87
87
|
|
88
88
|
def test_append
|
89
|
-
t =
|
89
|
+
t = Wordtriez.new
|
90
90
|
('a'..'z').each do |c|
|
91
91
|
t << c
|
92
92
|
end
|
@@ -101,7 +101,7 @@ class TriezTest < Test::Unit::TestCase
|
|
101
101
|
'ATACGGTCCA' => 2,
|
102
102
|
'GCTTGTACGT' => 3
|
103
103
|
}
|
104
|
-
t =
|
104
|
+
t = Wordtriez.new
|
105
105
|
sequences.each do |seq, id|
|
106
106
|
t.change_all(:suffix, seq){ id }
|
107
107
|
end
|
@@ -109,7 +109,7 @@ class TriezTest < Test::Unit::TestCase
|
|
109
109
|
end
|
110
110
|
|
111
111
|
def test_nul_char_in_keys
|
112
|
-
t =
|
112
|
+
t = Wordtriez.new
|
113
113
|
t["a\0b"] = 1
|
114
114
|
assert_equal 1, t["a\0b"]
|
115
115
|
assert_equal 1, t.size
|
@@ -118,7 +118,7 @@ class TriezTest < Test::Unit::TestCase
|
|
118
118
|
|
119
119
|
def test_change_all_with_prefix
|
120
120
|
default = 10
|
121
|
-
t =
|
121
|
+
t = Wordtriez.new default: default
|
122
122
|
t['regexp'] = 1
|
123
123
|
t['readme'] = 2
|
124
124
|
t.change_all :prefix, 'readme' do |v|
|
@@ -131,7 +131,7 @@ class TriezTest < Test::Unit::TestCase
|
|
131
131
|
end
|
132
132
|
|
133
133
|
def test_change_all_with_suffix
|
134
|
-
t =
|
134
|
+
t = Wordtriez.new
|
135
135
|
t['regexp'] = 1
|
136
136
|
t['exp'] = 2
|
137
137
|
t['reg'] = 3
|
@@ -145,7 +145,7 @@ class TriezTest < Test::Unit::TestCase
|
|
145
145
|
end
|
146
146
|
|
147
147
|
def test_change_all_with_substring
|
148
|
-
t =
|
148
|
+
t = Wordtriez.new value_type: :object
|
149
149
|
t.change_all :substring, 'abc' do
|
150
150
|
1
|
151
151
|
end
|
@@ -163,7 +163,7 @@ class TriezTest < Test::Unit::TestCase
|
|
163
163
|
/users/12/edit
|
164
164
|
/posts
|
165
165
|
]
|
166
|
-
t =
|
166
|
+
t = Wordtriez.new value_type: :object
|
167
167
|
urls.each_with_index do |url, i|
|
168
168
|
t[url] = i.to_s
|
169
169
|
end
|
@@ -195,7 +195,7 @@ class TriezTest < Test::Unit::TestCase
|
|
195
195
|
|
196
196
|
# value is bitset representing id of the sentence
|
197
197
|
# in ruby we can use integers of arbitrary length as bitsets
|
198
|
-
t =
|
198
|
+
t = Wordtriez.new value_type: :object, default: 0
|
199
199
|
|
200
200
|
sentences.each_with_index do |sentence, i|
|
201
201
|
elem = 1 << i
|
@@ -215,7 +215,7 @@ class TriezTest < Test::Unit::TestCase
|
|
215
215
|
end
|
216
216
|
|
217
217
|
def test_should_not_segfault_when_search_with_prefix
|
218
|
-
t =
|
218
|
+
t = Wordtriez.new
|
219
219
|
# bursts when 16384
|
220
220
|
16_385.times{ |i| t["a#{i}"] = i }
|
221
221
|
expected_postfices = 16_385.times.map &:to_s
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wordtriez
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2014-09-
|
13
|
+
date: 2014-09-23 00:00:00.000000000 Z
|
14
14
|
dependencies: []
|
15
15
|
description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
|
16
16
|
email:
|
@@ -23,8 +23,8 @@ files:
|
|
23
23
|
- changes
|
24
24
|
- readme.md
|
25
25
|
- lib/wordtriez.rb
|
26
|
-
- test/
|
27
|
-
- ext/
|
26
|
+
- test/wordtriez_test.rb
|
27
|
+
- ext/wordtriez.cc
|
28
28
|
- ext/common.h
|
29
29
|
- ext/extconf.rb
|
30
30
|
- ext/hat-trie/ahtable.c
|