wordtriez 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/ext/extconf.rb CHANGED
@@ -3,7 +3,7 @@ require "mkmf"
3
3
  $CFLAGS << ' -Ihat-trie'
4
4
  $CPPFLAGS << ' -Ihat-trie'
5
5
  $LDFLAGS << ' -Lbuild -ltries'
6
- create_makefile 'triez'
6
+ create_makefile 'wordtriez'
7
7
 
8
8
  # respect header changes
9
9
  headers = Dir.glob('*.{hpp,h}').join ' '
data/ext/hat-trie/text.c CHANGED
@@ -101,21 +101,44 @@ void text_clean(char* text)
101
101
  *write = '\0';
102
102
  }
103
103
 
104
- void add_ngrams(hattrie_t* trie, int upto_n, char* text)
104
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
105
105
  {
106
106
  char blank_suffix[] = "\0";
107
- add_ngrams_with_suffix(trie, upto_n, text, blank_suffix);
107
+ add_ngrams_with_suffix(trie, upto_n, text, blank_suffix, incr_existing_keys_only);
108
108
  }
109
109
 
110
- void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix)
110
+ inline void incr_value(
111
+ hattrie_t* trie,
112
+ char* buffer,
113
+ char* buffer_pre,
114
+ char* head,
115
+ size_t len,
116
+ size_t suffix_len,
117
+ uint8_t incr_existing_keys_only)
118
+ {
119
+ value_t* value = NULL;
120
+
121
+ assert(buffer_pre - len >= buffer);
122
+ memcpy(buffer_pre - len, head, len);
123
+ if (incr_existing_keys_only) {
124
+ value = hattrie_tryget(trie, buffer_pre - len, len + suffix_len);
125
+ if (value) {
126
+ (*value)++;
127
+ }
128
+ } else {
129
+ value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
130
+ (*value)++;
131
+ }
132
+
133
+ }
134
+
135
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only)
111
136
  {
112
137
  char* head = text;
113
138
  char* tail = text;
114
139
  char* next_head = text;
115
140
  char* next_tail = text;
116
141
  int word_count = 0;
117
- value_t* value = NULL;
118
- size_t len = 0;
119
142
 
120
143
  if (*text == '\0') return;
121
144
 
@@ -134,11 +157,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
134
157
  next_tail = tail;
135
158
  }
136
159
  if (word_count <= upto_n) {
137
- len = tail - head;
138
- assert(buffer_pre - len >= buffer);
139
- memcpy(buffer_pre - len, head, len);
140
- value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
141
- (*value)++;
160
+ incr_value(trie, buffer, buffer_pre,
161
+ head, tail - head, suffix_len,
162
+ incr_existing_keys_only);
142
163
  }
143
164
  if (word_count == upto_n) {
144
165
  head = next_head;
@@ -153,20 +174,17 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
153
174
  } while(*tail);
154
175
 
155
176
  // add the last ngram of size upto_n
156
- len = tail - head;
157
- assert(buffer_pre - len >= buffer);
158
- memcpy(buffer_pre - len, head, len);
159
- value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
160
- (*value)++;
177
+ incr_value(trie, buffer, buffer_pre,
178
+ head, tail - head, suffix_len,
179
+ incr_existing_keys_only);
161
180
 
181
+ // add the 1..(upto_n-1) sized ngrams at the tail
162
182
  if (upto_n > 1) {
163
183
  while(*head) {
164
184
  if(*head == ' ' || *head == '.') {
165
- len = tail - head - 1;
166
- assert(buffer_pre - len >= buffer);
167
- memcpy(buffer_pre - len, head + 1, len);
168
- value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
169
- (*value)++;
185
+ incr_value(trie, buffer, buffer_pre,
186
+ head + 1, tail - head - 1, suffix_len,
187
+ incr_existing_keys_only);
170
188
  }
171
189
  head++;
172
190
  }
data/ext/hat-trie/text.h CHANGED
@@ -12,8 +12,8 @@ extern "C" {
12
12
  #define NGRAM_BUFFER_SIZE 4096
13
13
 
14
14
  void text_clean(char* text);
15
- void add_ngrams(hattrie_t* trie, int upto_n, char* text);
16
- void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix);
15
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
16
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
17
17
 
18
18
  #ifdef __cplusplus
19
19
  }
@@ -273,7 +273,7 @@ static VALUE hat_walk(VALUE self, VALUE key) {
273
273
  return data.arr;
274
274
  }
275
275
 
276
- static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
276
+ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VALUE incr_existing_keys_only) {
277
277
  // rb_str_dup
278
278
  hattrie_t* p;
279
279
  HatTrie* ht;
@@ -283,7 +283,11 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
283
283
  char* ctext = StringValueCStr(text);
284
284
  text_clean(ctext);
285
285
 
286
- add_ngrams_with_suffix(p, FIX2INT(ngrams), ctext, StringValueCStr(suffix));
286
+ add_ngrams_with_suffix(p,
287
+ FIX2INT(ngrams),
288
+ ctext,
289
+ StringValueCStr(suffix),
290
+ RTEST(incr_existing_keys_only));
287
291
 
288
292
  return self;
289
293
  // rb_str_substr
@@ -292,8 +296,8 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
292
296
  #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
293
297
 
294
298
  extern "C"
295
- void Init_triez() {
296
- hat_class = rb_define_class("Triez", rb_cObject);
299
+ void Init_wordtriez() {
300
+ hat_class = rb_define_class("Wordtriez", rb_cObject);
297
301
  u8_enc = rb_utf8_encoding();
298
302
  bin_enc = rb_ascii8bit_encoding();
299
303
 
@@ -309,5 +313,5 @@ void Init_triez() {
309
313
  DEF(hat_class, "delete", hat_del, 1);
310
314
  DEF(hat_class, "_internal_search", hat_search, 4);
311
315
  DEF(hat_class, "_internal_walk", hat_walk, 1);
312
- DEF(hat_class, "add_text!", hat_add_text, 3);
316
+ DEF(hat_class, "_internal_add_text", hat_add_text, 4);
313
317
  }
data/lib/wordtriez.rb CHANGED
@@ -1,7 +1,7 @@
1
- require_relative "../ext/triez"
1
+ require_relative "../ext/wordtriez"
2
2
 
3
- class Triez
4
- VERSION = '1.0.4'
3
+ class Wordtriez
4
+ VERSION = '0.0.2'
5
5
 
6
6
  private :_internal_set_type
7
7
  private :_internal_search
@@ -62,4 +62,12 @@ class Triez
62
62
  a
63
63
  end
64
64
  end
65
+
66
+ def add_text! text, ngrams, suffix=""
67
+ _internal_add_text(text, ngrams, suffix, false)
68
+ end
69
+
70
+ def union_text! text, ngrams, suffix=""
71
+ _internal_add_text(text, ngrams, suffix, true)
72
+ end
65
73
  end
@@ -1,28 +1,28 @@
1
1
  # coding: utf-8
2
2
  require "test/unit"
3
- require_relative "../lib/triez"
3
+ require_relative "../lib/wordtriez"
4
4
 
5
5
  GC.stress
6
6
 
7
- class TriezTest < Test::Unit::TestCase
7
+ class WordtriezTest < Test::Unit::TestCase
8
8
  def test_init_type_options
9
- t = Triez.new value_type: :int64
9
+ t = Wordtriez.new value_type: :int64
10
10
  assert_equal :int64, t.value_type
11
- t = Triez.new value_type: :object
11
+ t = Wordtriez.new value_type: :object
12
12
  assert_equal :object, t.value_type
13
- t = Triez.new
13
+ t = Wordtriez.new
14
14
  assert_equal :int64, t.value_type
15
15
 
16
16
  assert_raise ArgumentError do
17
- Triez.new value_type: :string
17
+ Wordtriez.new value_type: :string
18
18
  end
19
19
  assert_raise ArgumentError do
20
- Triez.new invalid_option: :int64
20
+ Wordtriez.new invalid_option: :int64
21
21
  end
22
22
  end
23
23
 
24
24
  def test_hat_trie
25
- t = Triez.new value_type: :object
25
+ t = Wordtriez.new value_type: :object
26
26
 
27
27
  v1 = (1 << 40)
28
28
  v2 = (1 << 141)
@@ -47,7 +47,7 @@ class TriezTest < Test::Unit::TestCase
47
47
  end
48
48
 
49
49
  def test_insertion_and_search_on_many_keys
50
- t = Triez.new
50
+ t = Wordtriez.new
51
51
  as = ('A'..'z').to_a
52
52
  bs = ('一'..'百').to_a
53
53
  as.each do |a|
@@ -70,7 +70,7 @@ class TriezTest < Test::Unit::TestCase
70
70
  end
71
71
 
72
72
  def test_each_and_raise
73
- t = Triez.new
73
+ t = Wordtriez.new
74
74
  t['abcd'] = 0
75
75
  t['abc'] = 1
76
76
 
@@ -86,7 +86,7 @@ class TriezTest < Test::Unit::TestCase
86
86
  end
87
87
 
88
88
  def test_append
89
- t = Triez.new
89
+ t = Wordtriez.new
90
90
  ('a'..'z').each do |c|
91
91
  t << c
92
92
  end
@@ -101,7 +101,7 @@ class TriezTest < Test::Unit::TestCase
101
101
  'ATACGGTCCA' => 2,
102
102
  'GCTTGTACGT' => 3
103
103
  }
104
- t = Triez.new
104
+ t = Wordtriez.new
105
105
  sequences.each do |seq, id|
106
106
  t.change_all(:suffix, seq){ id }
107
107
  end
@@ -109,7 +109,7 @@ class TriezTest < Test::Unit::TestCase
109
109
  end
110
110
 
111
111
  def test_nul_char_in_keys
112
- t = Triez.new
112
+ t = Wordtriez.new
113
113
  t["a\0b"] = 1
114
114
  assert_equal 1, t["a\0b"]
115
115
  assert_equal 1, t.size
@@ -118,7 +118,7 @@ class TriezTest < Test::Unit::TestCase
118
118
 
119
119
  def test_change_all_with_prefix
120
120
  default = 10
121
- t = Triez.new default: default
121
+ t = Wordtriez.new default: default
122
122
  t['regexp'] = 1
123
123
  t['readme'] = 2
124
124
  t.change_all :prefix, 'readme' do |v|
@@ -131,7 +131,7 @@ class TriezTest < Test::Unit::TestCase
131
131
  end
132
132
 
133
133
  def test_change_all_with_suffix
134
- t = Triez.new
134
+ t = Wordtriez.new
135
135
  t['regexp'] = 1
136
136
  t['exp'] = 2
137
137
  t['reg'] = 3
@@ -145,7 +145,7 @@ class TriezTest < Test::Unit::TestCase
145
145
  end
146
146
 
147
147
  def test_change_all_with_substring
148
- t = Triez.new value_type: :object
148
+ t = Wordtriez.new value_type: :object
149
149
  t.change_all :substring, 'abc' do
150
150
  1
151
151
  end
@@ -163,7 +163,7 @@ class TriezTest < Test::Unit::TestCase
163
163
  /users/12/edit
164
164
  /posts
165
165
  ]
166
- t = Triez.new value_type: :object
166
+ t = Wordtriez.new value_type: :object
167
167
  urls.each_with_index do |url, i|
168
168
  t[url] = i.to_s
169
169
  end
@@ -195,7 +195,7 @@ class TriezTest < Test::Unit::TestCase
195
195
 
196
196
  # value is bitset representing id of the sentence
197
197
  # in ruby we can use integers of arbitrary length as bitsets
198
- t = Triez.new value_type: :object, default: 0
198
+ t = Wordtriez.new value_type: :object, default: 0
199
199
 
200
200
  sentences.each_with_index do |sentence, i|
201
201
  elem = 1 << i
@@ -215,7 +215,7 @@ class TriezTest < Test::Unit::TestCase
215
215
  end
216
216
 
217
217
  def test_should_not_segfault_when_search_with_prefix
218
- t = Triez.new
218
+ t = Wordtriez.new
219
219
  # bursts when 16384
220
220
  16_385.times{ |i| t["a#{i}"] = i }
221
221
  expected_postfices = 16_385.times.map &:to_s
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordtriez
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-09-21 00:00:00.000000000 Z
13
+ date: 2014-09-23 00:00:00.000000000 Z
14
14
  dependencies: []
15
15
  description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
16
16
  email:
@@ -23,8 +23,8 @@ files:
23
23
  - changes
24
24
  - readme.md
25
25
  - lib/wordtriez.rb
26
- - test/triez_test.rb
27
- - ext/triez.cc
26
+ - test/wordtriez_test.rb
27
+ - ext/wordtriez.cc
28
28
  - ext/common.h
29
29
  - ext/extconf.rb
30
30
  - ext/hat-trie/ahtable.c