wordtriez 0.0.3 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ext/hat-trie/text.c +7 -38
- data/ext/hat-trie/text.h +1 -1
- data/ext/wordtriez.cc +3 -1
- metadata +12 -15
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8006468978af919cac247d68b46533ff84461141
|
4
|
+
data.tar.gz: a1b1cc4a7ff1aeaee6af7e6be38c95fc244f4a53
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ecabfb5cd2767f95bb5c69d8f417c60f8c049a1a2196d67e65337296df4362a19e7ddc0d580de632b72e2fc0eae7a5b89572289d223e37da4296f7be8b6e2d73
|
7
|
+
data.tar.gz: 56a46e9f54537062a419c39647f4149351f2e0f998eeab1123225a0148e538449d347924e8b1818911058d7d256ab24aeb36f68072d8631846986bb47b297d95
|
data/ext/hat-trie/text.c
CHANGED
@@ -3,43 +3,6 @@
|
|
3
3
|
#include <string.h>
|
4
4
|
#include <assert.h>
|
5
5
|
|
6
|
-
/* Chris' C Code Version of the above (self.clean_text)**
|
7
|
-
|
8
|
-
* Credit: "most efficient way to remove special characters from string" By Guffa
|
9
|
-
* http://stackoverflow.com/questions/1120198/most-efficient-way-to-remove-special-characters-from-string
|
10
|
-
*
|
11
|
-
* How fast is this code?
|
12
|
-
*
|
13
|
-
* Regular expression: 294.4 ms.
|
14
|
-
* Original function: 54.5 ms.
|
15
|
-
* My suggested change: 47.1 ms.
|
16
|
-
* Mine with setting StringBuilder capacity: 43.3 ms.
|
17
|
-
* I tested the lookup+char[] solution, and it runs in about 13 ms.
|
18
|
-
*/
|
19
|
-
|
20
|
-
/*
|
21
|
-
private static bool[] _lookup;
|
22
|
-
static Program() {
|
23
|
-
_lookup = new bool[65535];
|
24
|
-
for (char c = '0'; c <= '9'; c++) _lookup[c] = true;
|
25
|
-
for (char c = 'A'; c <= 'Z'; c++) _lookup[c] = true;
|
26
|
-
for (char c = 'a'; c <= 'z'; c++) _lookup[c] = true;
|
27
|
-
_lookup['.'] = true;
|
28
|
-
_lookup['_'] = true;
|
29
|
-
}
|
30
|
-
public static string RemoveSpecialCharacters(string str) {
|
31
|
-
char[] buffer = new char[str.Length];
|
32
|
-
int index = 0;
|
33
|
-
foreach (char c in str) {
|
34
|
-
if (_lookup[c]) {
|
35
|
-
buffer[index] = c;
|
36
|
-
index++;
|
37
|
-
}
|
38
|
-
}
|
39
|
-
return new string(buffer, 0, index);
|
40
|
-
}
|
41
|
-
*/
|
42
|
-
|
43
6
|
/** Transforms text such as the following:
|
44
7
|
*
|
45
8
|
* And behold, I said, "This is no good!"
|
@@ -52,7 +15,7 @@ public static string RemoveSpecialCharacters(string str) {
|
|
52
15
|
*
|
53
16
|
* Spaces indicate word boundaries, while periods indicate sentence boundaries.
|
54
17
|
*/
|
55
|
-
|
18
|
+
size_t text_clean(char* text)
|
56
19
|
{
|
57
20
|
char* read;
|
58
21
|
char* write = text;
|
@@ -99,6 +62,9 @@ void text_clean(char* text)
|
|
99
62
|
if (just_added_space) write--;
|
100
63
|
// terminate the string at its new length
|
101
64
|
*write = '\0';
|
65
|
+
|
66
|
+
// Return the new length of the string
|
67
|
+
return (size_t)(write - text);
|
102
68
|
}
|
103
69
|
|
104
70
|
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
|
@@ -151,6 +117,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
151
117
|
char* buffer_pre = buffer + buffer_offset;
|
152
118
|
strcpy(buffer_pre, suffix);
|
153
119
|
|
120
|
+
// skip any spaces at beginning
|
121
|
+
while(*head == ' ') head++;
|
122
|
+
|
154
123
|
do {
|
155
124
|
if (*tail == ' ' || *tail == '.' || *tail == '\0') {
|
156
125
|
word_count++;
|
data/ext/hat-trie/text.h
CHANGED
@@ -11,7 +11,7 @@ extern "C" {
|
|
11
11
|
|
12
12
|
#define NGRAM_BUFFER_SIZE 4096
|
13
13
|
|
14
|
-
|
14
|
+
size_t text_clean(char* text);
|
15
15
|
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
|
16
16
|
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
|
17
17
|
|
data/ext/wordtriez.cc
CHANGED
@@ -281,7 +281,9 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VA
|
|
281
281
|
p = ht->p;
|
282
282
|
|
283
283
|
char* ctext = StringValueCStr(text);
|
284
|
-
text_clean(ctext);
|
284
|
+
size_t new_length = text_clean(ctext);
|
285
|
+
|
286
|
+
rb_str_resize(text, (long)new_length);
|
285
287
|
|
286
288
|
add_ngrams_with_suffix(p,
|
287
289
|
FIX2INT(ngrams),
|
metadata
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wordtriez
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
5
|
-
prerelease:
|
4
|
+
version: 0.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Zete Lui
|
@@ -10,7 +9,7 @@ authors:
|
|
10
9
|
autorequire:
|
11
10
|
bindir: bin
|
12
11
|
cert_chain: []
|
13
|
-
date: 2014-09-
|
12
|
+
date: 2014-09-25 00:00:00.000000000 Z
|
14
13
|
dependencies: []
|
15
14
|
description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
|
16
15
|
email:
|
@@ -19,12 +18,8 @@ extensions:
|
|
19
18
|
- ext/extconf.rb
|
20
19
|
extra_rdoc_files: []
|
21
20
|
files:
|
22
|
-
- copying
|
23
21
|
- changes
|
24
|
-
-
|
25
|
-
- lib/wordtriez.rb
|
26
|
-
- test/wordtriez_test.rb
|
27
|
-
- ext/wordtriez.cc
|
22
|
+
- copying
|
28
23
|
- ext/common.h
|
29
24
|
- ext/extconf.rb
|
30
25
|
- ext/hat-trie/ahtable.c
|
@@ -39,29 +34,31 @@ files:
|
|
39
34
|
- ext/hat-trie/pstdint.h
|
40
35
|
- ext/hat-trie/text.c
|
41
36
|
- ext/hat-trie/text.h
|
37
|
+
- ext/wordtriez.cc
|
38
|
+
- lib/wordtriez.rb
|
39
|
+
- readme.md
|
40
|
+
- test/wordtriez_test.rb
|
42
41
|
homepage: https://github.com/canadaduane/triez
|
43
42
|
licenses: []
|
43
|
+
metadata: {}
|
44
44
|
post_install_message:
|
45
45
|
rdoc_options: []
|
46
46
|
require_paths:
|
47
47
|
- lib
|
48
48
|
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
49
|
requirements:
|
51
|
-
- -
|
50
|
+
- - ">="
|
52
51
|
- !ruby/object:Gem::Version
|
53
52
|
version: 1.9.2
|
54
53
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
-
none: false
|
56
54
|
requirements:
|
57
|
-
- -
|
55
|
+
- - ">="
|
58
56
|
- !ruby/object:Gem::Version
|
59
57
|
version: '0'
|
60
58
|
requirements: []
|
61
59
|
rubyforge_project:
|
62
|
-
rubygems_version:
|
60
|
+
rubygems_version: 2.2.2
|
63
61
|
signing_key:
|
64
|
-
specification_version:
|
62
|
+
specification_version: 4
|
65
63
|
summary: fast, efficient, unicode aware HAT trie with prefix / suffix support
|
66
64
|
test_files: []
|
67
|
-
has_rdoc: false
|