wordtriez 0.0.3 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8006468978af919cac247d68b46533ff84461141
4
+ data.tar.gz: a1b1cc4a7ff1aeaee6af7e6be38c95fc244f4a53
5
+ SHA512:
6
+ metadata.gz: ecabfb5cd2767f95bb5c69d8f417c60f8c049a1a2196d67e65337296df4362a19e7ddc0d580de632b72e2fc0eae7a5b89572289d223e37da4296f7be8b6e2d73
7
+ data.tar.gz: 56a46e9f54537062a419c39647f4149351f2e0f998eeab1123225a0148e538449d347924e8b1818911058d7d256ab24aeb36f68072d8631846986bb47b297d95
data/ext/hat-trie/text.c CHANGED
@@ -3,43 +3,6 @@
3
3
  #include <string.h>
4
4
  #include <assert.h>
5
5
 
6
- /* Chris' C Code Version of the above (self.clean_text)**
7
-
8
- * Credit: "most efficient way to remove special characters from string" By Guffa
9
- * http://stackoverflow.com/questions/1120198/most-efficient-way-to-remove-special-characters-from-string
10
- *
11
- * How fast is this code?
12
- *
13
- * Regular expression: 294.4 ms.
14
- * Original function: 54.5 ms.
15
- * My suggested change: 47.1 ms.
16
- * Mine with setting StringBuilder capacity: 43.3 ms.
17
- * I tested the lookup+char[] solution, and it runs in about 13 ms.
18
- */
19
-
20
- /*
21
- private static bool[] _lookup;
22
- static Program() {
23
- _lookup = new bool[65535];
24
- for (char c = '0'; c <= '9'; c++) _lookup[c] = true;
25
- for (char c = 'A'; c <= 'Z'; c++) _lookup[c] = true;
26
- for (char c = 'a'; c <= 'z'; c++) _lookup[c] = true;
27
- _lookup['.'] = true;
28
- _lookup['_'] = true;
29
- }
30
- public static string RemoveSpecialCharacters(string str) {
31
- char[] buffer = new char[str.Length];
32
- int index = 0;
33
- foreach (char c in str) {
34
- if (_lookup[c]) {
35
- buffer[index] = c;
36
- index++;
37
- }
38
- }
39
- return new string(buffer, 0, index);
40
- }
41
- */
42
-
43
6
  /** Transforms text such as the following:
44
7
  *
45
8
  * And behold, I said, "This is no good!"
@@ -52,7 +15,7 @@ public static string RemoveSpecialCharacters(string str) {
52
15
  *
53
16
  * Spaces indicate word boundaries, while periods indicate sentence boundaries.
54
17
  */
55
- void text_clean(char* text)
18
+ size_t text_clean(char* text)
56
19
  {
57
20
  char* read;
58
21
  char* write = text;
@@ -99,6 +62,9 @@ void text_clean(char* text)
99
62
  if (just_added_space) write--;
100
63
  // terminate the string at its new length
101
64
  *write = '\0';
65
+
66
+ // Return the new length of the string
67
+ return (size_t)(write - text);
102
68
  }
103
69
 
104
70
  void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
@@ -151,6 +117,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
151
117
  char* buffer_pre = buffer + buffer_offset;
152
118
  strcpy(buffer_pre, suffix);
153
119
 
120
+ // skip any spaces at beginning
121
+ while(*head == ' ') head++;
122
+
154
123
  do {
155
124
  if (*tail == ' ' || *tail == '.' || *tail == '\0') {
156
125
  word_count++;
data/ext/hat-trie/text.h CHANGED
@@ -11,7 +11,7 @@ extern "C" {
11
11
 
12
12
  #define NGRAM_BUFFER_SIZE 4096
13
13
 
14
- void text_clean(char* text);
14
+ size_t text_clean(char* text);
15
15
  void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
16
16
  void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
17
17
 
data/ext/wordtriez.cc CHANGED
@@ -281,7 +281,9 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VA
281
281
  p = ht->p;
282
282
 
283
283
  char* ctext = StringValueCStr(text);
284
- text_clean(ctext);
284
+ size_t new_length = text_clean(ctext);
285
+
286
+ rb_str_resize(text, (long)new_length);
285
287
 
286
288
  add_ngrams_with_suffix(p,
287
289
  FIX2INT(ngrams),
metadata CHANGED
@@ -1,8 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordtriez
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
5
- prerelease:
4
+ version: 0.1.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Zete Lui
@@ -10,7 +9,7 @@ authors:
10
9
  autorequire:
11
10
  bindir: bin
12
11
  cert_chain: []
13
- date: 2014-09-23 00:00:00.000000000 Z
12
+ date: 2014-09-25 00:00:00.000000000 Z
14
13
  dependencies: []
15
14
  description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
16
15
  email:
@@ -19,12 +18,8 @@ extensions:
19
18
  - ext/extconf.rb
20
19
  extra_rdoc_files: []
21
20
  files:
22
- - copying
23
21
  - changes
24
- - readme.md
25
- - lib/wordtriez.rb
26
- - test/wordtriez_test.rb
27
- - ext/wordtriez.cc
22
+ - copying
28
23
  - ext/common.h
29
24
  - ext/extconf.rb
30
25
  - ext/hat-trie/ahtable.c
@@ -39,29 +34,31 @@ files:
39
34
  - ext/hat-trie/pstdint.h
40
35
  - ext/hat-trie/text.c
41
36
  - ext/hat-trie/text.h
37
+ - ext/wordtriez.cc
38
+ - lib/wordtriez.rb
39
+ - readme.md
40
+ - test/wordtriez_test.rb
42
41
  homepage: https://github.com/canadaduane/triez
43
42
  licenses: []
43
+ metadata: {}
44
44
  post_install_message:
45
45
  rdoc_options: []
46
46
  require_paths:
47
47
  - lib
48
48
  required_ruby_version: !ruby/object:Gem::Requirement
49
- none: false
50
49
  requirements:
51
- - - ! '>='
50
+ - - ">="
52
51
  - !ruby/object:Gem::Version
53
52
  version: 1.9.2
54
53
  required_rubygems_version: !ruby/object:Gem::Requirement
55
- none: false
56
54
  requirements:
57
- - - ! '>='
55
+ - - ">="
58
56
  - !ruby/object:Gem::Version
59
57
  version: '0'
60
58
  requirements: []
61
59
  rubyforge_project:
62
- rubygems_version: 1.8.23
60
+ rubygems_version: 2.2.2
63
61
  signing_key:
64
- specification_version: 3
62
+ specification_version: 4
65
63
  summary: fast, efficient, unicode aware HAT trie with prefix / suffix support
66
64
  test_files: []
67
- has_rdoc: false