wordtriez 0.0.1 → 0.0.2

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/ext/extconf.rb CHANGED
@@ -3,7 +3,7 @@ require "mkmf"
3
3
  $CFLAGS << ' -Ihat-trie'
4
4
  $CPPFLAGS << ' -Ihat-trie'
5
5
  $LDFLAGS << ' -Lbuild -ltries'
6
- create_makefile 'triez'
6
+ create_makefile 'wordtriez'
7
7
 
8
8
  # respect header changes
9
9
  headers = Dir.glob('*.{hpp,h}').join ' '
data/ext/hat-trie/text.c CHANGED
@@ -101,21 +101,44 @@ void text_clean(char* text)
101
101
  *write = '\0';
102
102
  }
103
103
 
104
- void add_ngrams(hattrie_t* trie, int upto_n, char* text)
104
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
105
105
  {
106
106
  char blank_suffix[] = "\0";
107
- add_ngrams_with_suffix(trie, upto_n, text, blank_suffix);
107
+ add_ngrams_with_suffix(trie, upto_n, text, blank_suffix, incr_existing_keys_only);
108
108
  }
109
109
 
110
- void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix)
110
+ inline void incr_value(
111
+ hattrie_t* trie,
112
+ char* buffer,
113
+ char* buffer_pre,
114
+ char* head,
115
+ size_t len,
116
+ size_t suffix_len,
117
+ uint8_t incr_existing_keys_only)
118
+ {
119
+ value_t* value = NULL;
120
+
121
+ assert(buffer_pre - len >= buffer);
122
+ memcpy(buffer_pre - len, head, len);
123
+ if (incr_existing_keys_only) {
124
+ value = hattrie_tryget(trie, buffer_pre - len, len + suffix_len);
125
+ if (value) {
126
+ (*value)++;
127
+ }
128
+ } else {
129
+ value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
130
+ (*value)++;
131
+ }
132
+
133
+ }
134
+
135
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only)
111
136
  {
112
137
  char* head = text;
113
138
  char* tail = text;
114
139
  char* next_head = text;
115
140
  char* next_tail = text;
116
141
  int word_count = 0;
117
- value_t* value = NULL;
118
- size_t len = 0;
119
142
 
120
143
  if (*text == '\0') return;
121
144
 
@@ -134,11 +157,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
134
157
  next_tail = tail;
135
158
  }
136
159
  if (word_count <= upto_n) {
137
- len = tail - head;
138
- assert(buffer_pre - len >= buffer);
139
- memcpy(buffer_pre - len, head, len);
140
- value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
141
- (*value)++;
160
+ incr_value(trie, buffer, buffer_pre,
161
+ head, tail - head, suffix_len,
162
+ incr_existing_keys_only);
142
163
  }
143
164
  if (word_count == upto_n) {
144
165
  head = next_head;
@@ -153,20 +174,17 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
153
174
  } while(*tail);
154
175
 
155
176
  // add the last ngram of size upto_n
156
- len = tail - head;
157
- assert(buffer_pre - len >= buffer);
158
- memcpy(buffer_pre - len, head, len);
159
- value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
160
- (*value)++;
177
+ incr_value(trie, buffer, buffer_pre,
178
+ head, tail - head, suffix_len,
179
+ incr_existing_keys_only);
161
180
 
181
+ // add the 1..(upto_n-1) sized ngrams at the tail
162
182
  if (upto_n > 1) {
163
183
  while(*head) {
164
184
  if(*head == ' ' || *head == '.') {
165
- len = tail - head - 1;
166
- assert(buffer_pre - len >= buffer);
167
- memcpy(buffer_pre - len, head + 1, len);
168
- value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
169
- (*value)++;
185
+ incr_value(trie, buffer, buffer_pre,
186
+ head + 1, tail - head - 1, suffix_len,
187
+ incr_existing_keys_only);
170
188
  }
171
189
  head++;
172
190
  }
data/ext/hat-trie/text.h CHANGED
@@ -12,8 +12,8 @@ extern "C" {
12
12
  #define NGRAM_BUFFER_SIZE 4096
13
13
 
14
14
  void text_clean(char* text);
15
- void add_ngrams(hattrie_t* trie, int upto_n, char* text);
16
- void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix);
15
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
16
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
17
17
 
18
18
  #ifdef __cplusplus
19
19
  }
data/ext/triez.cc → data/ext/wordtriez.cc
@@ -273,7 +273,7 @@ static VALUE hat_walk(VALUE self, VALUE key) {
273
273
  return data.arr;
274
274
  }
275
275
 
276
- static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
276
+ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VALUE incr_existing_keys_only) {
277
277
  // rb_str_dup
278
278
  hattrie_t* p;
279
279
  HatTrie* ht;
@@ -283,7 +283,11 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
283
283
  char* ctext = StringValueCStr(text);
284
284
  text_clean(ctext);
285
285
 
286
- add_ngrams_with_suffix(p, FIX2INT(ngrams), ctext, StringValueCStr(suffix));
286
+ add_ngrams_with_suffix(p,
287
+ FIX2INT(ngrams),
288
+ ctext,
289
+ StringValueCStr(suffix),
290
+ RTEST(incr_existing_keys_only));
287
291
 
288
292
  return self;
289
293
  // rb_str_substr
@@ -292,8 +296,8 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
292
296
  #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
293
297
 
294
298
  extern "C"
295
- void Init_triez() {
296
- hat_class = rb_define_class("Triez", rb_cObject);
299
+ void Init_wordtriez() {
300
+ hat_class = rb_define_class("Wordtriez", rb_cObject);
297
301
  u8_enc = rb_utf8_encoding();
298
302
  bin_enc = rb_ascii8bit_encoding();
299
303
 
@@ -309,5 +313,5 @@ void Init_triez() {
309
313
  DEF(hat_class, "delete", hat_del, 1);
310
314
  DEF(hat_class, "_internal_search", hat_search, 4);
311
315
  DEF(hat_class, "_internal_walk", hat_walk, 1);
312
- DEF(hat_class, "add_text!", hat_add_text, 3);
316
+ DEF(hat_class, "_internal_add_text", hat_add_text, 4);
313
317
  }
data/lib/wordtriez.rb CHANGED
@@ -1,7 +1,7 @@
1
- require_relative "../ext/triez"
1
+ require_relative "../ext/wordtriez"
2
2
 
3
- class Triez
4
- VERSION = '1.0.4'
3
+ class Wordtriez
4
+ VERSION = '0.0.2'
5
5
 
6
6
  private :_internal_set_type
7
7
  private :_internal_search
@@ -62,4 +62,12 @@ class Triez
62
62
  a
63
63
  end
64
64
  end
65
+
66
+ def add_text! text, ngrams, suffix=""
67
+ _internal_add_text(text, ngrams, suffix, false)
68
+ end
69
+
70
+ def union_text! text, ngrams, suffix=""
71
+ _internal_add_text(text, ngrams, suffix, true)
72
+ end
65
73
  end
data/test/triez_test.rb → data/test/wordtriez_test.rb
@@ -1,28 +1,28 @@
1
1
  # coding: utf-8
2
2
  require "test/unit"
3
- require_relative "../lib/triez"
3
+ require_relative "../lib/wordtriez"
4
4
 
5
5
  GC.stress
6
6
 
7
- class TriezTest < Test::Unit::TestCase
7
+ class WordtriezTest < Test::Unit::TestCase
8
8
  def test_init_type_options
9
- t = Triez.new value_type: :int64
9
+ t = Wordtriez.new value_type: :int64
10
10
  assert_equal :int64, t.value_type
11
- t = Triez.new value_type: :object
11
+ t = Wordtriez.new value_type: :object
12
12
  assert_equal :object, t.value_type
13
- t = Triez.new
13
+ t = Wordtriez.new
14
14
  assert_equal :int64, t.value_type
15
15
 
16
16
  assert_raise ArgumentError do
17
- Triez.new value_type: :string
17
+ Wordtriez.new value_type: :string
18
18
  end
19
19
  assert_raise ArgumentError do
20
- Triez.new invalid_option: :int64
20
+ Wordtriez.new invalid_option: :int64
21
21
  end
22
22
  end
23
23
 
24
24
  def test_hat_trie
25
- t = Triez.new value_type: :object
25
+ t = Wordtriez.new value_type: :object
26
26
 
27
27
  v1 = (1 << 40)
28
28
  v2 = (1 << 141)
@@ -47,7 +47,7 @@ class TriezTest < Test::Unit::TestCase
47
47
  end
48
48
 
49
49
  def test_insertion_and_search_on_many_keys
50
- t = Triez.new
50
+ t = Wordtriez.new
51
51
  as = ('A'..'z').to_a
52
52
  bs = ('一'..'百').to_a
53
53
  as.each do |a|
@@ -70,7 +70,7 @@ class TriezTest < Test::Unit::TestCase
70
70
  end
71
71
 
72
72
  def test_each_and_raise
73
- t = Triez.new
73
+ t = Wordtriez.new
74
74
  t['abcd'] = 0
75
75
  t['abc'] = 1
76
76
 
@@ -86,7 +86,7 @@ class TriezTest < Test::Unit::TestCase
86
86
  end
87
87
 
88
88
  def test_append
89
- t = Triez.new
89
+ t = Wordtriez.new
90
90
  ('a'..'z').each do |c|
91
91
  t << c
92
92
  end
@@ -101,7 +101,7 @@ class TriezTest < Test::Unit::TestCase
101
101
  'ATACGGTCCA' => 2,
102
102
  'GCTTGTACGT' => 3
103
103
  }
104
- t = Triez.new
104
+ t = Wordtriez.new
105
105
  sequences.each do |seq, id|
106
106
  t.change_all(:suffix, seq){ id }
107
107
  end
@@ -109,7 +109,7 @@ class TriezTest < Test::Unit::TestCase
109
109
  end
110
110
 
111
111
  def test_nul_char_in_keys
112
- t = Triez.new
112
+ t = Wordtriez.new
113
113
  t["a\0b"] = 1
114
114
  assert_equal 1, t["a\0b"]
115
115
  assert_equal 1, t.size
@@ -118,7 +118,7 @@ class TriezTest < Test::Unit::TestCase
118
118
 
119
119
  def test_change_all_with_prefix
120
120
  default = 10
121
- t = Triez.new default: default
121
+ t = Wordtriez.new default: default
122
122
  t['regexp'] = 1
123
123
  t['readme'] = 2
124
124
  t.change_all :prefix, 'readme' do |v|
@@ -131,7 +131,7 @@ class TriezTest < Test::Unit::TestCase
131
131
  end
132
132
 
133
133
  def test_change_all_with_suffix
134
- t = Triez.new
134
+ t = Wordtriez.new
135
135
  t['regexp'] = 1
136
136
  t['exp'] = 2
137
137
  t['reg'] = 3
@@ -145,7 +145,7 @@ class TriezTest < Test::Unit::TestCase
145
145
  end
146
146
 
147
147
  def test_change_all_with_substring
148
- t = Triez.new value_type: :object
148
+ t = Wordtriez.new value_type: :object
149
149
  t.change_all :substring, 'abc' do
150
150
  1
151
151
  end
@@ -163,7 +163,7 @@ class TriezTest < Test::Unit::TestCase
163
163
  /users/12/edit
164
164
  /posts
165
165
  ]
166
- t = Triez.new value_type: :object
166
+ t = Wordtriez.new value_type: :object
167
167
  urls.each_with_index do |url, i|
168
168
  t[url] = i.to_s
169
169
  end
@@ -195,7 +195,7 @@ class TriezTest < Test::Unit::TestCase
195
195
 
196
196
  # value is bitset representing id of the sentence
197
197
  # in ruby we can use integers of arbitrary length as bitsets
198
- t = Triez.new value_type: :object, default: 0
198
+ t = Wordtriez.new value_type: :object, default: 0
199
199
 
200
200
  sentences.each_with_index do |sentence, i|
201
201
  elem = 1 << i
@@ -215,7 +215,7 @@ class TriezTest < Test::Unit::TestCase
215
215
  end
216
216
 
217
217
  def test_should_not_segfault_when_search_with_prefix
218
- t = Triez.new
218
+ t = Wordtriez.new
219
219
  # bursts when 16384
220
220
  16_385.times{ |i| t["a#{i}"] = i }
221
221
  expected_postfices = 16_385.times.map &:to_s
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordtriez
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: bin
12
12
  cert_chain: []
13
- date: 2014-09-21 00:00:00.000000000 Z
13
+ date: 2014-09-23 00:00:00.000000000 Z
14
14
  dependencies: []
15
15
  description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
16
16
  email:
@@ -23,8 +23,8 @@ files:
23
23
  - changes
24
24
  - readme.md
25
25
  - lib/wordtriez.rb
26
- - test/triez_test.rb
27
- - ext/triez.cc
26
+ - test/wordtriez_test.rb
27
+ - ext/wordtriez.cc
28
28
  - ext/common.h
29
29
  - ext/extconf.rb
30
30
  - ext/hat-trie/ahtable.c