wordtriez 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2d703337bf6fdd17a5ac29691d7cc22bc0aa94c2
4
- data.tar.gz: 0ae1078629cf2e13ddf365575c5b271f32de2e32
3
+ metadata.gz: 2cf93aec498957ae648674251b2051ab4a08381c
4
+ data.tar.gz: 8fe39195a015704d6afee86762d304f84d485456
5
5
  SHA512:
6
- metadata.gz: 1356429e081fed8cdf8bc1cf893d9acc845e512c2a6a6bd444d180e0c9c15b2179b48aff350da11eb3f42b81f93a838d910c68c1f5218eeeb1ed19ef29917a50
7
- data.tar.gz: c3eb416caf1b40b860c622300c81fae29430eeb576f1ae26bfca68189dbc3be812ee4a66f52a7e8784c3fe10cbaf5dfbd65dcce545e9873389c5ccc8b224e6fc
6
+ metadata.gz: 23f5c58246dec87b35d3c11476c4e8d82c6fdc013b1f534eaa8dc47881608c74a33b002dae5a2d744a223b38de59233eddf50fef8ac239a07f9fff64d03559f3
7
+ data.tar.gz: 1033906d59479f13f251af341905315ae637e34663cee293a785614299baf0e883715d9d61d0bab3b3e2f68e945e298c50b2861f6191c2918fd6cf308605218d
data/ext/hat-trie/text.c CHANGED
@@ -20,7 +20,7 @@ size_t text_clean(char* text)
20
20
  char* read;
21
21
  char* write = text;
22
22
  uint8_t join_lines = false,
23
- just_added_space = false,
23
+ just_added_space = true, // prevent prefix spaces
24
24
  just_added_period = false;
25
25
  for (read=text; *read; read++) {
26
26
  char c = *read;
@@ -60,17 +60,15 @@ size_t text_clean(char* text)
60
60
  }
61
61
  // erase space at end of text
62
62
  if (just_added_space) write--;
63
- // terminate the string at its new length
64
- *write = '\0';
65
63
 
66
64
  // Return the new length of the string
67
65
  return (size_t)(write - text);
68
66
  }
69
67
 
70
- void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
68
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only)
71
69
  {
72
70
  char blank_suffix[] = "\0";
73
- add_ngrams_with_suffix(trie, upto_n, text, blank_suffix, incr_existing_keys_only);
71
+ add_ngrams_with_suffix(trie, upto_n, text, text_len, blank_suffix, incr_existing_keys_only);
74
72
  }
75
73
 
76
74
  static inline void incr_value(
@@ -101,7 +99,7 @@ static inline void incr_value(
101
99
 
102
100
  }
103
101
 
104
- void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only)
102
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only)
105
103
  {
106
104
  char* head = text;
107
105
  char* tail = text;
@@ -109,7 +107,7 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
109
107
  char* next_tail = text;
110
108
  int word_count = 0;
111
109
 
112
- if (*text == '\0') return;
110
+ if (text_len == 0) return;
113
111
 
114
112
  char buffer[NGRAM_BUFFER_SIZE];
115
113
  size_t suffix_len = strlen(suffix);
@@ -117,11 +115,8 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
117
115
  char* buffer_pre = buffer + buffer_offset;
118
116
  strcpy(buffer_pre, suffix);
119
117
 
120
- // skip any spaces at beginning
121
- while(*head == ' ') head++;
122
-
123
118
  do {
124
- if (*tail == ' ' || *tail == '.' || *tail == '\0') {
119
+ if (*tail == ' ' || *tail == '.' || tail >= head+text_len) {
125
120
  word_count++;
126
121
  if (word_count == 1 || upto_n == 1) {
127
122
  next_head = next_tail = tail + 1;
@@ -152,7 +147,7 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
152
147
 
153
148
  // add the 1..(upto_n-1) sized ngrams at the tail
154
149
  if (upto_n > 1) {
155
- while(*head) {
150
+ while(head < text+text_len) {
156
151
  if(*head == ' ' || *head == '.') {
157
152
  incr_value(trie, buffer, buffer_pre,
158
153
  head + 1, tail - head - 1, suffix_len,
data/ext/hat-trie/text.h CHANGED
@@ -12,8 +12,8 @@ extern "C" {
12
12
  #define NGRAM_BUFFER_SIZE 4096
13
13
 
14
14
  size_t text_clean(char* text);
15
- void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
16
- void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
15
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only);
16
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only);
17
17
 
18
18
  #ifdef __cplusplus
19
19
  }
data/ext/wordtriez.cc CHANGED
@@ -273,26 +273,33 @@ static VALUE hat_walk(VALUE self, VALUE key) {
273
273
  return data.arr;
274
274
  }
275
275
 
276
+ static VALUE hat_text_clean(VALUE self, VALUE text) {
277
+ rb_str_modify(text);
278
+
279
+ char* ctext = StringValueCStr(text);
280
+ size_t new_length = text_clean(ctext);
281
+
282
+ rb_str_set_len(text, (long)new_length);
283
+
284
+ return text;
285
+ }
286
+
276
287
  static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VALUE incr_existing_keys_only) {
277
- // rb_str_dup
278
288
  hattrie_t* p;
279
289
  HatTrie* ht;
280
290
  Data_Get_Struct(self, HatTrie, ht);
281
291
  p = ht->p;
282
292
 
283
- char* ctext = StringValueCStr(text);
284
- size_t new_length = text_clean(ctext);
293
+ hat_text_clean(self, text);
285
294
 
286
295
  add_ngrams_with_suffix(p,
287
296
  FIX2INT(ngrams),
288
- ctext,
297
+ RSTRING_PTR(text),
298
+ RSTRING_LEN(text),
289
299
  StringValueCStr(suffix),
290
300
  RTEST(incr_existing_keys_only));
291
301
 
292
- rb_str_resize(text, (long)new_length);
293
-
294
302
  return self;
295
- // rb_str_substr
296
303
  }
297
304
 
298
305
  #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
@@ -316,4 +323,5 @@ void Init_wordtriez() {
316
323
  DEF(hat_class, "_internal_search", hat_search, 4);
317
324
  DEF(hat_class, "_internal_walk", hat_walk, 1);
318
325
  DEF(hat_class, "_internal_add_text", hat_add_text, 4);
326
+ DEF(hat_class, "text_clean", hat_text_clean, 1);
319
327
  }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordtriez
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Zete Lui