wordtriez 0.1.3 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/hat-trie/text.c +6 -7
- data/ext/hat-trie/text.h +2 -2
- data/ext/wordtriez.cc +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1166b687e599b75c7c6423e177e661e6e01557fc
|
|
4
|
+
data.tar.gz: 7157ecc077b1be637ceb067bae8e170acef367bf
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a674980638b76c5ad7893897d945986b56634b49055b807cded3bc367a05917a207f7ea92a731692a74a4335580b758ac31398466654b01a3a3e291fe13fea1c
|
|
7
|
+
data.tar.gz: 4c113c405f97f8033a11e0f501c4e3043466ade43717dc762347c5283d5064a00096b1b27f578940ee4a97978a72977a79651755a81c3342c2c5b6059b31cb2c
|
data/ext/hat-trie/text.c
CHANGED
|
@@ -15,16 +15,16 @@
|
|
|
15
15
|
*
|
|
16
16
|
* Spaces indicate word boundaries, while periods indicate sentence boundaries.
|
|
17
17
|
*/
|
|
18
|
-
size_t text_clean(char* text)
|
|
18
|
+
size_t text_clean(char* text, size_t len)
|
|
19
19
|
{
|
|
20
|
-
if (
|
|
20
|
+
if (len == 0) return 0;
|
|
21
21
|
|
|
22
22
|
char* read;
|
|
23
23
|
char* write = text;
|
|
24
24
|
uint8_t join_lines = false,
|
|
25
25
|
just_added_space = true, // prevent prefix spaces
|
|
26
26
|
just_added_period = false;
|
|
27
|
-
for (read=text;
|
|
27
|
+
for (read=text; read<text+len; read++) {
|
|
28
28
|
char c = *read;
|
|
29
29
|
if (c >= 'A' && c <= 'Z') {
|
|
30
30
|
// Change upper case to lowercase
|
|
@@ -70,7 +70,7 @@ size_t text_clean(char* text)
|
|
|
70
70
|
void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only)
|
|
71
71
|
{
|
|
72
72
|
char blank_suffix[] = "\0";
|
|
73
|
-
add_ngrams_with_suffix(trie, upto_n, text, text_len, blank_suffix, incr_existing_keys_only);
|
|
73
|
+
add_ngrams_with_suffix(trie, upto_n, text, text_len, blank_suffix, 0, incr_existing_keys_only);
|
|
74
74
|
}
|
|
75
75
|
|
|
76
76
|
static inline void incr_value(
|
|
@@ -101,7 +101,7 @@ static inline void incr_value(
|
|
|
101
101
|
|
|
102
102
|
}
|
|
103
103
|
|
|
104
|
-
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only)
|
|
104
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, size_t suffix_len, uint8_t incr_existing_keys_only)
|
|
105
105
|
{
|
|
106
106
|
char* head = text;
|
|
107
107
|
char* tail = text;
|
|
@@ -112,10 +112,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text
|
|
|
112
112
|
if (text_len == 0) return;
|
|
113
113
|
|
|
114
114
|
char buffer[NGRAM_BUFFER_SIZE];
|
|
115
|
-
size_t suffix_len = strlen(suffix);
|
|
116
115
|
size_t buffer_offset = NGRAM_BUFFER_SIZE - suffix_len - 1;
|
|
117
116
|
char* buffer_pre = buffer + buffer_offset;
|
|
118
|
-
|
|
117
|
+
memcpy(buffer_pre, suffix, suffix_len);
|
|
119
118
|
|
|
120
119
|
do {
|
|
121
120
|
if (*tail == ' ' || *tail == '.' || tail >= head+text_len) {
|
data/ext/hat-trie/text.h
CHANGED
|
@@ -11,9 +11,9 @@ extern "C" {
|
|
|
11
11
|
|
|
12
12
|
#define NGRAM_BUFFER_SIZE 4096
|
|
13
13
|
|
|
14
|
-
size_t text_clean(char* text);
|
|
14
|
+
size_t text_clean(char* text, size_t len);
|
|
15
15
|
void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only);
|
|
16
|
-
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only);
|
|
16
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, size_t suffix_len, uint8_t incr_existing_keys_only);
|
|
17
17
|
|
|
18
18
|
#ifdef __cplusplus
|
|
19
19
|
}
|
data/ext/wordtriez.cc
CHANGED
|
@@ -276,8 +276,7 @@ static VALUE hat_walk(VALUE self, VALUE key) {
|
|
|
276
276
|
static VALUE hat_text_clean(VALUE self, VALUE text) {
|
|
277
277
|
rb_str_modify(text);
|
|
278
278
|
|
|
279
|
-
|
|
280
|
-
size_t new_length = text_clean(ctext);
|
|
279
|
+
size_t new_length = text_clean(RSTRING_PTR(text), RSTRING_LEN(text));
|
|
281
280
|
|
|
282
281
|
rb_str_set_len(text, (long)new_length);
|
|
283
282
|
|
|
@@ -296,7 +295,8 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VA
|
|
|
296
295
|
FIX2INT(ngrams),
|
|
297
296
|
RSTRING_PTR(text),
|
|
298
297
|
RSTRING_LEN(text),
|
|
299
|
-
|
|
298
|
+
RSTRING_PTR(suffix),
|
|
299
|
+
RSTRING_LEN(suffix),
|
|
300
300
|
RTEST(incr_existing_keys_only));
|
|
301
301
|
|
|
302
302
|
return self;
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: wordtriez
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.3
|
|
4
|
+
version: 0.1.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Zete Lui
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2014-09-
|
|
12
|
+
date: 2014-09-28 00:00:00.000000000 Z
|
|
13
13
|
dependencies: []
|
|
14
14
|
description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
|
|
15
15
|
email:
|