wordtriez 0.0.3 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8006468978af919cac247d68b46533ff84461141
4
+ data.tar.gz: a1b1cc4a7ff1aeaee6af7e6be38c95fc244f4a53
5
+ SHA512:
6
+ metadata.gz: ecabfb5cd2767f95bb5c69d8f417c60f8c049a1a2196d67e65337296df4362a19e7ddc0d580de632b72e2fc0eae7a5b89572289d223e37da4296f7be8b6e2d73
7
+ data.tar.gz: 56a46e9f54537062a419c39647f4149351f2e0f998eeab1123225a0148e538449d347924e8b1818911058d7d256ab24aeb36f68072d8631846986bb47b297d95
data/ext/hat-trie/text.c CHANGED
@@ -3,43 +3,6 @@
3
3
  #include <string.h>
4
4
  #include <assert.h>
5
5
 
6
- /* Chris' C Code Version of the above (self.clean_text)**
7
-
8
- * Credit: "most efficient way to remove special characters from string" By Guffa
9
- * http://stackoverflow.com/questions/1120198/most-efficient-way-to-remove-special-characters-from-string
10
- *
11
- * How fast is this code?
12
- *
13
- * Regular expression: 294.4 ms.
14
- * Original function: 54.5 ms.
15
- * My suggested change: 47.1 ms.
16
- * Mine with setting StringBuilder capacity: 43.3 ms.
17
- * I tested the lookup+char[] solution, and it runs in about 13 ms.
18
- */
19
-
20
- /*
21
- private static bool[] _lookup;
22
- static Program() {
23
- _lookup = new bool[65535];
24
- for (char c = '0'; c <= '9'; c++) _lookup[c] = true;
25
- for (char c = 'A'; c <= 'Z'; c++) _lookup[c] = true;
26
- for (char c = 'a'; c <= 'z'; c++) _lookup[c] = true;
27
- _lookup['.'] = true;
28
- _lookup['_'] = true;
29
- }
30
- public static string RemoveSpecialCharacters(string str) {
31
- char[] buffer = new char[str.Length];
32
- int index = 0;
33
- foreach (char c in str) {
34
- if (_lookup[c]) {
35
- buffer[index] = c;
36
- index++;
37
- }
38
- }
39
- return new string(buffer, 0, index);
40
- }
41
- */
42
-
43
6
  /** Transforms text such as the following:
44
7
  *
45
8
  * And behold, I said, "This is no good!"
@@ -52,7 +15,7 @@ public static string RemoveSpecialCharacters(string str) {
52
15
  *
53
16
  * Spaces indicate word boundaries, while periods indicate sentence boundaries.
54
17
  */
55
- void text_clean(char* text)
18
+ size_t text_clean(char* text)
56
19
  {
57
20
  char* read;
58
21
  char* write = text;
@@ -99,6 +62,9 @@ void text_clean(char* text)
99
62
  if (just_added_space) write--;
100
63
  // terminate the string at its new length
101
64
  *write = '\0';
65
+
66
+ // Return the new length of the string
67
+ return (size_t)(write - text);
102
68
  }
103
69
 
104
70
  void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
@@ -151,6 +117,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
151
117
  char* buffer_pre = buffer + buffer_offset;
152
118
  strcpy(buffer_pre, suffix);
153
119
 
120
+ // skip any spaces at beginning
121
+ while(*head == ' ') head++;
122
+
154
123
  do {
155
124
  if (*tail == ' ' || *tail == '.' || *tail == '\0') {
156
125
  word_count++;
data/ext/hat-trie/text.h CHANGED
@@ -11,7 +11,7 @@ extern "C" {
11
11
 
12
12
  #define NGRAM_BUFFER_SIZE 4096
13
13
 
14
- void text_clean(char* text);
14
+ size_t text_clean(char* text);
15
15
  void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
16
16
  void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
17
17
 
data/ext/wordtriez.cc CHANGED
@@ -281,7 +281,9 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VA
281
281
  p = ht->p;
282
282
 
283
283
  char* ctext = StringValueCStr(text);
284
- text_clean(ctext);
284
+ size_t new_length = text_clean(ctext);
285
+
286
+ rb_str_resize(text, (long)new_length);
285
287
 
286
288
  add_ngrams_with_suffix(p,
287
289
  FIX2INT(ngrams),
metadata CHANGED
@@ -1,8 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordtriez
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
5
- prerelease:
4
+ version: 0.1.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Zete Lui
@@ -10,7 +9,7 @@ authors:
10
9
  autorequire:
11
10
  bindir: bin
12
11
  cert_chain: []
13
- date: 2014-09-23 00:00:00.000000000 Z
12
+ date: 2014-09-25 00:00:00.000000000 Z
14
13
  dependencies: []
15
14
  description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
16
15
  email:
@@ -19,12 +18,8 @@ extensions:
19
18
  - ext/extconf.rb
20
19
  extra_rdoc_files: []
21
20
  files:
22
- - copying
23
21
  - changes
24
- - readme.md
25
- - lib/wordtriez.rb
26
- - test/wordtriez_test.rb
27
- - ext/wordtriez.cc
22
+ - copying
28
23
  - ext/common.h
29
24
  - ext/extconf.rb
30
25
  - ext/hat-trie/ahtable.c
@@ -39,29 +34,31 @@ files:
39
34
  - ext/hat-trie/pstdint.h
40
35
  - ext/hat-trie/text.c
41
36
  - ext/hat-trie/text.h
37
+ - ext/wordtriez.cc
38
+ - lib/wordtriez.rb
39
+ - readme.md
40
+ - test/wordtriez_test.rb
42
41
  homepage: https://github.com/canadaduane/triez
43
42
  licenses: []
43
+ metadata: {}
44
44
  post_install_message:
45
45
  rdoc_options: []
46
46
  require_paths:
47
47
  - lib
48
48
  required_ruby_version: !ruby/object:Gem::Requirement
49
- none: false
50
49
  requirements:
51
- - - ! '>='
50
+ - - ">="
52
51
  - !ruby/object:Gem::Version
53
52
  version: 1.9.2
54
53
  required_rubygems_version: !ruby/object:Gem::Requirement
55
- none: false
56
54
  requirements:
57
- - - ! '>='
55
+ - - ">="
58
56
  - !ruby/object:Gem::Version
59
57
  version: '0'
60
58
  requirements: []
61
59
  rubyforge_project:
62
- rubygems_version: 1.8.23
60
+ rubygems_version: 2.2.2
63
61
  signing_key:
64
- specification_version: 3
62
+ specification_version: 4
65
63
  summary: fast, efficient, unicode aware HAT trie with prefix / suffix support
66
64
  test_files: []
67
- has_rdoc: false