RubyGems - wordtriez - Versions diffs - 0.0.1 → 0.0.2 - Mend

wordtriez 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/ext/extconf.rb +1 -1
data/ext/hat-trie/text.c +38 -20
data/ext/hat-trie/text.h +2 -2
data/ext/{triez.cc → wordtriez.cc} +9 -5
data/lib/wordtriez.rb +11 -3
data/test/{triez_test.rb → wordtriez_test.rb} +19 -19
metadata +4 -4

data/ext/extconf.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require "mkmf"
 $CFLAGS << ' -Ihat-trie'
 $CPPFLAGS << ' -Ihat-trie'
 $LDFLAGS << ' -Lbuild -ltries'
-create_makefile 'triez'
+create_makefile 'wordtriez'
 # respect header changes
 headers = Dir.glob('*.{hpp,h}').join ' '

data/ext/hat-trie/text.c CHANGED Viewed

@@ -101,21 +101,44 @@ void text_clean(char* text)
   *write = '\0';
 }
-void add_ngrams(hattrie_t* trie, int upto_n, char* text)
+void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
 {
   char blank_suffix[] = "\0";
-  add_ngrams_with_suffix(trie, upto_n, text, blank_suffix);
+  add_ngrams_with_suffix(trie, upto_n, text, blank_suffix, incr_existing_keys_only);
 }
-void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix)
+inline void incr_value(
+  hattrie_t* trie,
+  char* buffer,
+  char* buffer_pre,
+  char* head,
+  size_t len,
+  size_t suffix_len,
+  uint8_t incr_existing_keys_only)
+{
+  value_t* value = NULL;
+  assert(buffer_pre - len >= buffer);
+  memcpy(buffer_pre - len, head, len);
+  if (incr_existing_keys_only) {
+    value = hattrie_tryget(trie, buffer_pre - len, len + suffix_len);
+    if (value) {
+      (*value)++;
+    }
+  } else {
+    value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
+    (*value)++;
+  }
+}
+void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only)
 {
   char* head = text;
   char* tail = text;
   char* next_head = text;
   char* next_tail = text;
   int word_count = 0;
-  value_t* value = NULL;
-  size_t len = 0;
   if (*text == '\0') return;
@@ -134,11 +157,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
         next_tail = tail;
       }
       if (word_count <= upto_n) {
-        len = tail - head;
-        assert(buffer_pre - len >= buffer);
-        memcpy(buffer_pre - len, head, len);
-        value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
-        (*value)++;
+        incr_value(trie, buffer, buffer_pre,
+          head, tail - head, suffix_len,
+          incr_existing_keys_only);
       }
       if (word_count == upto_n) {
         head = next_head;
@@ -153,20 +174,17 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
   } while(*tail);
   // add the last ngram of size upto_n
-  len = tail - head;
-  assert(buffer_pre - len >= buffer);
-  memcpy(buffer_pre - len, head, len);
-  value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
-  (*value)++;
+  incr_value(trie, buffer, buffer_pre,
+    head, tail - head, suffix_len,
+    incr_existing_keys_only);
+  // add the 1..(upto_n-1) sized ngrams at the tail
   if (upto_n > 1) {
     while(*head) {
       if(*head == ' ' || *head == '.') {
-        len = tail - head - 1;
-        assert(buffer_pre - len >= buffer);
-        memcpy(buffer_pre - len, head + 1, len);
-        value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
-        (*value)++;
+        incr_value(trie, buffer, buffer_pre,
+          head + 1, tail - head - 1, suffix_len,
+          incr_existing_keys_only);
       }
       head++;
     }

data/ext/hat-trie/text.h CHANGED Viewed

@@ -12,8 +12,8 @@ extern "C" {
 #define NGRAM_BUFFER_SIZE 4096
 void text_clean(char* text);
-void add_ngrams(hattrie_t* trie, int upto_n, char* text);
-void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix);
+void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
+void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
 #ifdef __cplusplus
 }

data/ext/{triez.cc → wordtriez.cc} RENAMED Viewed

@@ -273,7 +273,7 @@ static VALUE hat_walk(VALUE self, VALUE key) {
     return data.arr;
 }
-static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
+static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VALUE incr_existing_keys_only) {
     // rb_str_dup
     hattrie_t* p;
     HatTrie* ht;
@@ -283,7 +283,11 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
     char* ctext = StringValueCStr(text);
     text_clean(ctext);
-    add_ngrams_with_suffix(p, FIX2INT(ngrams), ctext, StringValueCStr(suffix));
+    add_ngrams_with_suffix(p,
+        FIX2INT(ngrams),
+        ctext,
+        StringValueCStr(suffix),
+        RTEST(incr_existing_keys_only));
     return self;
     // rb_str_substr
@@ -292,8 +296,8 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
 #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
 extern "C"
-void Init_triez() {
-    hat_class = rb_define_class("Triez", rb_cObject);
+void Init_wordtriez() {
+    hat_class = rb_define_class("Wordtriez", rb_cObject);
     u8_enc = rb_utf8_encoding();
     bin_enc = rb_ascii8bit_encoding();
@@ -309,5 +313,5 @@ void Init_triez() {
     DEF(hat_class, "delete", hat_del, 1);
     DEF(hat_class, "_internal_search", hat_search, 4);
     DEF(hat_class, "_internal_walk", hat_walk, 1);
-    DEF(hat_class, "add_text!", hat_add_text, 3);
+    DEF(hat_class, "_internal_add_text", hat_add_text, 4);
 }

data/lib/wordtriez.rb CHANGED Viewed

@@ -1,7 +1,7 @@
-require_relative "../ext/triez"
+require_relative "../ext/wordtriez"
-class Triez
-  VERSION = '1.0.4'
+class Wordtriez
+  VERSION = '0.0.2'
   private :_internal_set_type
   private :_internal_search
@@ -62,4 +62,12 @@ class Triez
       a
     end
   end
+  def add_text! text, ngrams, suffix=""
+    _internal_add_text(text, ngrams, suffix, false)
+  end
+  def union_text! text, ngrams, suffix=""
+    _internal_add_text(text, ngrams, suffix, true)
+  end
 end

data/test/{triez_test.rb → wordtriez_test.rb} RENAMED Viewed

@@ -1,28 +1,28 @@
 # coding: utf-8
 require "test/unit"
-require_relative "../lib/triez"
+require_relative "../lib/wordtriez"
 GC.stress
-class TriezTest < Test::Unit::TestCase
+class WordtriezTest < Test::Unit::TestCase
   def test_init_type_options
-    t = Triez.new value_type: :int64
+    t = Wordtriez.new value_type: :int64
     assert_equal :int64, t.value_type
-    t = Triez.new value_type: :object
+    t = Wordtriez.new value_type: :object
     assert_equal :object, t.value_type
-    t = Triez.new
+    t = Wordtriez.new
     assert_equal :int64, t.value_type
     assert_raise ArgumentError do
-      Triez.new value_type: :string
+      Wordtriez.new value_type: :string
     end
     assert_raise ArgumentError do
-      Triez.new invalid_option: :int64
+      Wordtriez.new invalid_option: :int64
     end
   end
   def test_hat_trie
-    t = Triez.new value_type: :object
+    t = Wordtriez.new value_type: :object
     v1 = (1 << 40)
     v2 = (1 << 141)
@@ -47,7 +47,7 @@ class TriezTest < Test::Unit::TestCase
   end
   def test_insertion_and_search_on_many_keys
-    t = Triez.new
+    t = Wordtriez.new
     as = ('A'..'z').to_a
     bs = ('一'..'百').to_a
     as.each do |a|
@@ -70,7 +70,7 @@ class TriezTest < Test::Unit::TestCase
   end
   def test_each_and_raise
-    t = Triez.new
+    t = Wordtriez.new
     t['abcd'] = 0
     t['abc'] = 1
@@ -86,7 +86,7 @@ class TriezTest < Test::Unit::TestCase
   end
   def test_append
-    t = Triez.new
+    t = Wordtriez.new
     ('a'..'z').each do |c|
       t << c
     end
@@ -101,7 +101,7 @@ class TriezTest < Test::Unit::TestCase
       'ATACGGTCCA' => 2,
       'GCTTGTACGT' => 3
     }
-    t = Triez.new
+    t = Wordtriez.new
     sequences.each do |seq, id|
       t.change_all(:suffix, seq){ id }
     end
@@ -109,7 +109,7 @@ class TriezTest < Test::Unit::TestCase
   end
   def test_nul_char_in_keys
-    t = Triez.new
+    t = Wordtriez.new
     t["a\0b"] = 1
     assert_equal 1, t["a\0b"]
     assert_equal 1, t.size
@@ -118,7 +118,7 @@ class TriezTest < Test::Unit::TestCase
   def test_change_all_with_prefix
     default = 10
-    t = Triez.new default: default
+    t = Wordtriez.new default: default
     t['regexp'] = 1
     t['readme'] = 2
     t.change_all :prefix, 'readme' do |v|
@@ -131,7 +131,7 @@ class TriezTest < Test::Unit::TestCase
   end
   def test_change_all_with_suffix
-    t = Triez.new
+    t = Wordtriez.new
     t['regexp'] = 1
     t['exp'] = 2
     t['reg'] = 3
@@ -145,7 +145,7 @@ class TriezTest < Test::Unit::TestCase
   end
   def test_change_all_with_substring
-    t = Triez.new value_type: :object
+    t = Wordtriez.new value_type: :object
     t.change_all :substring, 'abc' do
       1
     end
@@ -163,7 +163,7 @@ class TriezTest < Test::Unit::TestCase
       /users/12/edit
       /posts
     ]
-    t = Triez.new value_type: :object
+    t = Wordtriez.new value_type: :object
     urls.each_with_index do |url, i|
       t[url] = i.to_s
     end
@@ -195,7 +195,7 @@ class TriezTest < Test::Unit::TestCase
     # value is bitset representing id of the sentence
     # in ruby we can use integers of arbitrary length as bitsets
-    t = Triez.new value_type: :object, default: 0
+    t = Wordtriez.new value_type: :object, default: 0
     sentences.each_with_index do |sentence, i|
       elem = 1 << i
@@ -215,7 +215,7 @@ class TriezTest < Test::Unit::TestCase
   end
   def test_should_not_segfault_when_search_with_prefix
-    t = Triez.new
+    t = Wordtriez.new
     # bursts when 16384
     16_385.times{ |i| t["a#{i}"] = i }
     expected_postfices = 16_385.times.map &:to_s

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wordtriez
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
   prerelease:
 platform: ruby
 authors:
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-09-21 00:00:00.000000000 Z
+date: 2014-09-23 00:00:00.000000000 Z
 dependencies: []
 description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
 email:
@@ -23,8 +23,8 @@ files:
 - changes
 - readme.md
 - lib/wordtriez.rb
-- test/triez_test.rb
-- ext/triez.cc
+- test/wordtriez_test.rb
+- ext/wordtriez.cc
 - ext/common.h
 - ext/extconf.rb
 - ext/hat-trie/ahtable.c