RubyGems - wordtriez - Versions diffs - 0.0.3 → 0.1.0 - Mend

wordtriez 0.0.3 → 0.1.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 8006468978af919cac247d68b46533ff84461141
+  data.tar.gz: a1b1cc4a7ff1aeaee6af7e6be38c95fc244f4a53
+SHA512:
+  metadata.gz: ecabfb5cd2767f95bb5c69d8f417c60f8c049a1a2196d67e65337296df4362a19e7ddc0d580de632b72e2fc0eae7a5b89572289d223e37da4296f7be8b6e2d73
+  data.tar.gz: 56a46e9f54537062a419c39647f4149351f2e0f998eeab1123225a0148e538449d347924e8b1818911058d7d256ab24aeb36f68072d8631846986bb47b297d95

data/ext/hat-trie/text.c CHANGED Viewed

@@ -3,43 +3,6 @@
 #include <string.h>
 #include <assert.h>
- /* Chris' C Code Version of the above (self.clean_text)**
- * Credit: "most efficient way to remove special characters from string" By Guffa
- *  http://stackoverflow.com/questions/1120198/most-efficient-way-to-remove-special-characters-from-string
- *
- * How fast is this code?
- *
- * Regular expression: 294.4 ms.
- * Original function: 54.5 ms.
- * My suggested change: 47.1 ms.
- * Mine with setting StringBuilder capacity: 43.3 ms.
- * I tested the lookup+char[] solution, and it runs in about 13 ms.
- */
-/*
-private static bool[] _lookup;
-static Program() {
- _lookup = new bool[65535];
- for (char c = '0'; c <= '9'; c++) _lookup[c] = true;
- for (char c = 'A'; c <= 'Z'; c++) _lookup[c] = true;
- for (char c = 'a'; c <= 'z'; c++) _lookup[c] = true;
- _lookup['.'] = true;
- _lookup['_'] = true;
-}
-public static string RemoveSpecialCharacters(string str) {
- char[] buffer = new char[str.Length];
- int index = 0;
- foreach (char c in str) {
-   if (_lookup[c]) {
-      buffer[index] = c;
-      index++;
-   }
- }
- return new string(buffer, 0, index);
-}
-*/
 /** Transforms text such as the following:
  *
  *   And behold, I said, "This is no good!"
@@ -52,7 +15,7 @@ public static string RemoveSpecialCharacters(string str) {
  *
  * Spaces indicate word boundaries, while periods indicate sentence boundaries.
  */
-void text_clean(char* text)
+size_t text_clean(char* text)
 {
   char* read;
   char* write = text;
@@ -99,6 +62,9 @@ void text_clean(char* text)
   if (just_added_space) write--;
   // terminate the string at its new length
   *write = '\0';
+  // Return the new length of the string
+  return (size_t)(write - text);
 }
 void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
@@ -151,6 +117,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
   char* buffer_pre = buffer + buffer_offset;
   strcpy(buffer_pre, suffix);
+  // skip any spaces at beginning
+  while(*head == ' ') head++;
   do {
     if (*tail == ' ' || *tail == '.' || *tail == '\0') {
       word_count++;

data/ext/hat-trie/text.h CHANGED Viewed

@@ -11,7 +11,7 @@ extern "C" {
 #define NGRAM_BUFFER_SIZE 4096
-void text_clean(char* text);
+size_t text_clean(char* text);
 void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
 void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);

data/ext/wordtriez.cc CHANGED Viewed

@@ -281,7 +281,9 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VA
     p = ht->p;
     char* ctext = StringValueCStr(text);
-    text_clean(ctext);
+    size_t new_length = text_clean(ctext);
+    rb_str_resize(text, (long)new_length);
     add_ngrams_with_suffix(p,
         FIX2INT(ngrams),

metadata CHANGED Viewed

@@ -1,8 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wordtriez
 version: !ruby/object:Gem::Version
-  version: 0.0.3
-  prerelease:
+  version: 0.1.0
 platform: ruby
 authors:
 - Zete Lui
@@ -10,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-09-23 00:00:00.000000000 Z
+date: 2014-09-25 00:00:00.000000000 Z
 dependencies: []
 description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
 email:
@@ -19,12 +18,8 @@ extensions:
 - ext/extconf.rb
 extra_rdoc_files: []
 files:
-- copying
 - changes
-- readme.md
-- lib/wordtriez.rb
-- test/wordtriez_test.rb
-- ext/wordtriez.cc
+- copying
 - ext/common.h
 - ext/extconf.rb
 - ext/hat-trie/ahtable.c
@@ -39,29 +34,31 @@ files:
 - ext/hat-trie/pstdint.h
 - ext/hat-trie/text.c
 - ext/hat-trie/text.h
+- ext/wordtriez.cc
+- lib/wordtriez.rb
+- readme.md
+- test/wordtriez_test.rb
 homepage: https://github.com/canadaduane/triez
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: 1.9.2
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.23
+rubygems_version: 2.2.2
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: fast, efficient, unicode aware HAT trie with prefix / suffix support
 test_files: []
-has_rdoc: false