wordtriez 0.1.1 → 0.1.2

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2d703337bf6fdd17a5ac29691d7cc22bc0aa94c2
4
- data.tar.gz: 0ae1078629cf2e13ddf365575c5b271f32de2e32
3
+ metadata.gz: 2cf93aec498957ae648674251b2051ab4a08381c
4
+ data.tar.gz: 8fe39195a015704d6afee86762d304f84d485456
5
5
  SHA512:
6
- metadata.gz: 1356429e081fed8cdf8bc1cf893d9acc845e512c2a6a6bd444d180e0c9c15b2179b48aff350da11eb3f42b81f93a838d910c68c1f5218eeeb1ed19ef29917a50
7
- data.tar.gz: c3eb416caf1b40b860c622300c81fae29430eeb576f1ae26bfca68189dbc3be812ee4a66f52a7e8784c3fe10cbaf5dfbd65dcce545e9873389c5ccc8b224e6fc
6
+ metadata.gz: 23f5c58246dec87b35d3c11476c4e8d82c6fdc013b1f534eaa8dc47881608c74a33b002dae5a2d744a223b38de59233eddf50fef8ac239a07f9fff64d03559f3
7
+ data.tar.gz: 1033906d59479f13f251af341905315ae637e34663cee293a785614299baf0e883715d9d61d0bab3b3e2f68e945e298c50b2861f6191c2918fd6cf308605218d
data/ext/hat-trie/text.c CHANGED
@@ -20,7 +20,7 @@ size_t text_clean(char* text)
20
20
  char* read;
21
21
  char* write = text;
22
22
  uint8_t join_lines = false,
23
- just_added_space = false,
23
+ just_added_space = true, // prevent prefix spaces
24
24
  just_added_period = false;
25
25
  for (read=text; *read; read++) {
26
26
  char c = *read;
@@ -60,17 +60,15 @@ size_t text_clean(char* text)
60
60
  }
61
61
  // erase space at end of text
62
62
  if (just_added_space) write--;
63
- // terminate the string at its new length
64
- *write = '\0';
65
63
 
66
64
  // Return the new length of the string
67
65
  return (size_t)(write - text);
68
66
  }
69
67
 
70
- void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
68
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only)
71
69
  {
72
70
  char blank_suffix[] = "\0";
73
- add_ngrams_with_suffix(trie, upto_n, text, blank_suffix, incr_existing_keys_only);
71
+ add_ngrams_with_suffix(trie, upto_n, text, text_len, blank_suffix, incr_existing_keys_only);
74
72
  }
75
73
 
76
74
  static inline void incr_value(
@@ -101,7 +99,7 @@ static inline void incr_value(
101
99
 
102
100
  }
103
101
 
104
- void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only)
102
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only)
105
103
  {
106
104
  char* head = text;
107
105
  char* tail = text;
@@ -109,7 +107,7 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
109
107
  char* next_tail = text;
110
108
  int word_count = 0;
111
109
 
112
- if (*text == '\0') return;
110
+ if (text_len == 0) return;
113
111
 
114
112
  char buffer[NGRAM_BUFFER_SIZE];
115
113
  size_t suffix_len = strlen(suffix);
@@ -117,11 +115,8 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
117
115
  char* buffer_pre = buffer + buffer_offset;
118
116
  strcpy(buffer_pre, suffix);
119
117
 
120
- // skip any spaces at beginning
121
- while(*head == ' ') head++;
122
-
123
118
  do {
124
- if (*tail == ' ' || *tail == '.' || *tail == '\0') {
119
+ if (*tail == ' ' || *tail == '.' || tail >= head+text_len) {
125
120
  word_count++;
126
121
  if (word_count == 1 || upto_n == 1) {
127
122
  next_head = next_tail = tail + 1;
@@ -152,7 +147,7 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
152
147
 
153
148
  // add the 1..(upto_n-1) sized ngrams at the tail
154
149
  if (upto_n > 1) {
155
- while(*head) {
150
+ while(head < text+text_len) {
156
151
  if(*head == ' ' || *head == '.') {
157
152
  incr_value(trie, buffer, buffer_pre,
158
153
  head + 1, tail - head - 1, suffix_len,
data/ext/hat-trie/text.h CHANGED
@@ -12,8 +12,8 @@ extern "C" {
12
12
  #define NGRAM_BUFFER_SIZE 4096
13
13
 
14
14
  size_t text_clean(char* text);
15
- void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
16
- void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
15
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only);
16
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only);
17
17
 
18
18
  #ifdef __cplusplus
19
19
  }
data/ext/wordtriez.cc CHANGED
@@ -273,26 +273,33 @@ static VALUE hat_walk(VALUE self, VALUE key) {
273
273
  return data.arr;
274
274
  }
275
275
 
276
+ static VALUE hat_text_clean(VALUE self, VALUE text) {
277
+ rb_str_modify(text);
278
+
279
+ char* ctext = StringValueCStr(text);
280
+ size_t new_length = text_clean(ctext);
281
+
282
+ rb_str_set_len(text, (long)new_length);
283
+
284
+ return text;
285
+ }
286
+
276
287
  static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VALUE incr_existing_keys_only) {
277
- // rb_str_dup
278
288
  hattrie_t* p;
279
289
  HatTrie* ht;
280
290
  Data_Get_Struct(self, HatTrie, ht);
281
291
  p = ht->p;
282
292
 
283
- char* ctext = StringValueCStr(text);
284
- size_t new_length = text_clean(ctext);
293
+ hat_text_clean(self, text);
285
294
 
286
295
  add_ngrams_with_suffix(p,
287
296
  FIX2INT(ngrams),
288
- ctext,
297
+ RSTRING_PTR(text),
298
+ RSTRING_LEN(text),
289
299
  StringValueCStr(suffix),
290
300
  RTEST(incr_existing_keys_only));
291
301
 
292
- rb_str_resize(text, (long)new_length);
293
-
294
302
  return self;
295
- // rb_str_substr
296
303
  }
297
304
 
298
305
  #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
@@ -316,4 +323,5 @@ void Init_wordtriez() {
316
323
  DEF(hat_class, "_internal_search", hat_search, 4);
317
324
  DEF(hat_class, "_internal_walk", hat_walk, 1);
318
325
  DEF(hat_class, "_internal_add_text", hat_add_text, 4);
326
+ DEF(hat_class, "text_clean", hat_text_clean, 1);
319
327
  }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordtriez
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Zete Lui