wordtriez 0.0.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/hat-trie/text.c +7 -38
- data/ext/hat-trie/text.h +1 -1
- data/ext/wordtriez.cc +3 -1
- metadata +12 -15
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8006468978af919cac247d68b46533ff84461141
|
4
|
+
data.tar.gz: a1b1cc4a7ff1aeaee6af7e6be38c95fc244f4a53
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ecabfb5cd2767f95bb5c69d8f417c60f8c049a1a2196d67e65337296df4362a19e7ddc0d580de632b72e2fc0eae7a5b89572289d223e37da4296f7be8b6e2d73
|
7
|
+
data.tar.gz: 56a46e9f54537062a419c39647f4149351f2e0f998eeab1123225a0148e538449d347924e8b1818911058d7d256ab24aeb36f68072d8631846986bb47b297d95
|
data/ext/hat-trie/text.c
CHANGED
@@ -3,43 +3,6 @@
|
|
3
3
|
#include <string.h>
|
4
4
|
#include <assert.h>
|
5
5
|
|
6
|
-
/* Chris' C Code Version of the above (self.clean_text)**
|
7
|
-
|
8
|
-
* Credit: "most efficient way to remove special characters from string" By Guffa
|
9
|
-
* http://stackoverflow.com/questions/1120198/most-efficient-way-to-remove-special-characters-from-string
|
10
|
-
*
|
11
|
-
* How fast is this code?
|
12
|
-
*
|
13
|
-
* Regular expression: 294.4 ms.
|
14
|
-
* Original function: 54.5 ms.
|
15
|
-
* My suggested change: 47.1 ms.
|
16
|
-
* Mine with setting StringBuilder capacity: 43.3 ms.
|
17
|
-
* I tested the lookup+char[] solution, and it runs in about 13 ms.
|
18
|
-
*/
|
19
|
-
|
20
|
-
/*
|
21
|
-
private static bool[] _lookup;
|
22
|
-
static Program() {
|
23
|
-
_lookup = new bool[65535];
|
24
|
-
for (char c = '0'; c <= '9'; c++) _lookup[c] = true;
|
25
|
-
for (char c = 'A'; c <= 'Z'; c++) _lookup[c] = true;
|
26
|
-
for (char c = 'a'; c <= 'z'; c++) _lookup[c] = true;
|
27
|
-
_lookup['.'] = true;
|
28
|
-
_lookup['_'] = true;
|
29
|
-
}
|
30
|
-
public static string RemoveSpecialCharacters(string str) {
|
31
|
-
char[] buffer = new char[str.Length];
|
32
|
-
int index = 0;
|
33
|
-
foreach (char c in str) {
|
34
|
-
if (_lookup[c]) {
|
35
|
-
buffer[index] = c;
|
36
|
-
index++;
|
37
|
-
}
|
38
|
-
}
|
39
|
-
return new string(buffer, 0, index);
|
40
|
-
}
|
41
|
-
*/
|
42
|
-
|
43
6
|
/** Transforms text such as the following:
|
44
7
|
*
|
45
8
|
* And behold, I said, "This is no good!"
|
@@ -52,7 +15,7 @@ public static string RemoveSpecialCharacters(string str) {
|
|
52
15
|
*
|
53
16
|
* Spaces indicate word boundaries, while periods indicate sentence boundaries.
|
54
17
|
*/
|
55
|
-
|
18
|
+
size_t text_clean(char* text)
|
56
19
|
{
|
57
20
|
char* read;
|
58
21
|
char* write = text;
|
@@ -99,6 +62,9 @@ void text_clean(char* text)
|
|
99
62
|
if (just_added_space) write--;
|
100
63
|
// terminate the string at its new length
|
101
64
|
*write = '\0';
|
65
|
+
|
66
|
+
// Return the new length of the string
|
67
|
+
return (size_t)(write - text);
|
102
68
|
}
|
103
69
|
|
104
70
|
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
|
@@ -151,6 +117,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
151
117
|
char* buffer_pre = buffer + buffer_offset;
|
152
118
|
strcpy(buffer_pre, suffix);
|
153
119
|
|
120
|
+
// skip any spaces at beginning
|
121
|
+
while(*head == ' ') head++;
|
122
|
+
|
154
123
|
do {
|
155
124
|
if (*tail == ' ' || *tail == '.' || *tail == '\0') {
|
156
125
|
word_count++;
|
data/ext/hat-trie/text.h
CHANGED
@@ -11,7 +11,7 @@ extern "C" {
|
|
11
11
|
|
12
12
|
#define NGRAM_BUFFER_SIZE 4096
|
13
13
|
|
14
|
-
|
14
|
+
size_t text_clean(char* text);
|
15
15
|
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
|
16
16
|
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
|
17
17
|
|
data/ext/wordtriez.cc
CHANGED
@@ -281,7 +281,9 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VA
|
|
281
281
|
p = ht->p;
|
282
282
|
|
283
283
|
char* ctext = StringValueCStr(text);
|
284
|
-
text_clean(ctext);
|
284
|
+
size_t new_length = text_clean(ctext);
|
285
|
+
|
286
|
+
rb_str_resize(text, (long)new_length);
|
285
287
|
|
286
288
|
add_ngrams_with_suffix(p,
|
287
289
|
FIX2INT(ngrams),
|
metadata
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wordtriez
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Zete Lui
|
@@ -10,7 +9,7 @@ authors:
|
|
10
9
|
autorequire:
|
11
10
|
bindir: bin
|
12
11
|
cert_chain: []
|
13
|
-
date: 2014-09-
|
12
|
+
date: 2014-09-25 00:00:00.000000000 Z
|
14
13
|
dependencies: []
|
15
14
|
description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
|
16
15
|
email:
|
@@ -19,12 +18,8 @@ extensions:
|
|
19
18
|
- ext/extconf.rb
|
20
19
|
extra_rdoc_files: []
|
21
20
|
files:
|
22
|
-
- copying
|
23
21
|
- changes
|
24
|
-
-
|
25
|
-
- lib/wordtriez.rb
|
26
|
-
- test/wordtriez_test.rb
|
27
|
-
- ext/wordtriez.cc
|
22
|
+
- copying
|
28
23
|
- ext/common.h
|
29
24
|
- ext/extconf.rb
|
30
25
|
- ext/hat-trie/ahtable.c
|
@@ -39,29 +34,31 @@ files:
|
|
39
34
|
- ext/hat-trie/pstdint.h
|
40
35
|
- ext/hat-trie/text.c
|
41
36
|
- ext/hat-trie/text.h
|
37
|
+
- ext/wordtriez.cc
|
38
|
+
- lib/wordtriez.rb
|
39
|
+
- readme.md
|
40
|
+
- test/wordtriez_test.rb
|
42
41
|
homepage: https://github.com/canadaduane/triez
|
43
42
|
licenses: []
|
43
|
+
metadata: {}
|
44
44
|
post_install_message:
|
45
45
|
rdoc_options: []
|
46
46
|
require_paths:
|
47
47
|
- lib
|
48
48
|
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
49
|
requirements:
|
51
|
-
- -
|
50
|
+
- - ">="
|
52
51
|
- !ruby/object:Gem::Version
|
53
52
|
version: 1.9.2
|
54
53
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
-
none: false
|
56
54
|
requirements:
|
57
|
-
- -
|
55
|
+
- - ">="
|
58
56
|
- !ruby/object:Gem::Version
|
59
57
|
version: '0'
|
60
58
|
requirements: []
|
61
59
|
rubyforge_project:
|
62
|
-
rubygems_version:
|
60
|
+
rubygems_version: 2.2.2
|
63
61
|
signing_key:
|
64
|
-
specification_version:
|
62
|
+
specification_version: 4
|
65
63
|
summary: fast, efficient, unicode aware HAT trie with prefix / suffix support
|
66
64
|
test_files: []
|
67
|
-
has_rdoc: false
|