wordtree 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7bcb5a59a130a24ca1bede26a9f3b8efa651230c
4
- data.tar.gz: 03ef94ffd836a11f891065fd7eb3c73d4aa8bdd2
3
+ metadata.gz: c57c8ec838d33d0f963f3f6316bcb165b65de21f
4
+ data.tar.gz: 08b411c2499a4a11a5b94cc160e57b4a24f33b96
5
5
  SHA512:
6
- metadata.gz: cbf2b847f90fdffd1a52a4f7ea2c2fd78ce339ded803cbf30720fba1dced77b2ea2e9b1b84f5fee147d472d0e673437ad0e58ccffcb1a5fd2794fa87ec739384
7
- data.tar.gz: 0abb233dd846913d69e13087d84c174136e8a1511c417f5239e3395972f17af71f43091079e8fc605354843c09bb73b6e82ca358801303d77f692c3d8d140ea5
6
+ metadata.gz: 89a3efcd77faaa357b4240f26a3f70291f9c340f33d83209417b0b12b3324b12280d14fdea786300634b0daa2d2c0c56ad013f00460799da3e82efc46691b41c
7
+ data.tar.gz: cf66225761d294969e41dc9e2d2d8de7d2ac518553240a40bd5bc4f72c9b488d7d64b507837a8cbf37310fa997d9b29a1d9305797ca00b3adb99eefcc35ec98b
@@ -112,6 +112,114 @@ static VALUE text_clean(VALUE self, VALUE text) {
112
112
  return text;
113
113
  }
114
114
 
115
+ static inline void _incr_value(
116
+ VALUE hash, // Hash
117
+ VALUE key, // String
118
+ VALUE suffix, // Symbol or nil
119
+ VALUE incr_existing_keys_only) // true/false
120
+ {
121
+ // rb_funcall(rb_mKernel, rb_intern("p"), 4, hash, key, suffix, incr_existing_keys_only);
122
+ if (suffix == Qnil) {
123
+ // We know the hash is shallow, and has just integer values
124
+ VALUE val = rb_hash_lookup(hash, key);
125
+ if (val != Qnil) {
126
+ // Increment the key's value by 1
127
+ rb_hash_aset(hash, key, INT2FIX(FIX2INT(val) + 1));
128
+ } else if (!RTEST(incr_existing_keys_only)) {
129
+ // Add this key and start the value at 1
130
+ rb_hash_aset(hash, key, INT2FIX(1));
131
+ }
132
+ } else {
133
+ // The hash contains a hash
134
+ VALUE inner_hash = rb_hash_lookup(hash, key);
135
+ if (inner_hash != Qnil) {
136
+ Check_Type(inner_hash, T_HASH);
137
+ VALUE val = rb_hash_lookup(inner_hash, suffix);
138
+ if (val == Qnil) {
139
+ // Start this key.suffix's value at 1
140
+ rb_hash_aset(inner_hash, suffix, INT2FIX(1));
141
+ } else {
142
+ // Increment the key.suffix's value by 1
143
+ rb_hash_aset(inner_hash, suffix, INT2FIX(FIX2INT(val) + 1));
144
+ }
145
+ } else if (!RTEST(incr_existing_keys_only)) {
146
+ // Create an inner hash for this key (to contain suffixes)
147
+ inner_hash = rb_hash_new();
148
+ rb_hash_aset(inner_hash, suffix, INT2FIX(1));
149
+ // Add suffix inner_hash to this key
150
+ rb_hash_aset(hash, key, inner_hash);
151
+ }
152
+ }
153
+ }
154
+
155
+ VALUE text_incr_value(VALUE self, VALUE hash, VALUE key, VALUE suffix, VALUE incr_existing_keys_only)
156
+ {
157
+ Check_Type(hash, T_HASH);
158
+ Check_Type(key, T_STRING);
159
+ if (suffix != Qnil) Check_Type(suffix, T_SYMBOL);
160
+
161
+ _incr_value(hash, key, suffix, incr_existing_keys_only);
162
+ return self;
163
+ }
164
+
165
+ VALUE text_add_ngrams_with_suffix(
166
+ VALUE self,
167
+ VALUE text,
168
+ VALUE hash,
169
+ VALUE upto_n_value,
170
+ VALUE suffix,
171
+ VALUE incr_existing_keys_only)
172
+ {
173
+ char* head = RSTRING_PTR(text);
174
+ char* tail = RSTRING_PTR(text);
175
+ char* next_head = head;
176
+ char* next_tail = tail;
177
+ int word_count = 0;
178
+ int text_len = RSTRING_LEN(text);
179
+ int incr_existing = RTEST(incr_existing_keys_only);
180
+ int upto_n = FIX2INT(upto_n_value);
181
+
182
+ if (text_len == 0) return self;
183
+
184
+ do {
185
+ if (*tail == ' ' || *tail == '.' || tail >= head+text_len) {
186
+ word_count++;
187
+ if (word_count == 1 || upto_n == 1) {
188
+ next_head = next_tail = tail + 1;
189
+ } else if (word_count == 2) {
190
+ next_tail = tail;
191
+ }
192
+ if (word_count <= upto_n) {
193
+ _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);
194
+ }
195
+ if (word_count == upto_n) {
196
+ head = next_head;
197
+ tail = next_tail;
198
+ word_count = 0;
199
+ } else {
200
+ tail++;
201
+ }
202
+ } else {
203
+ tail++;
204
+ }
205
+ } while(*tail);
206
+
207
+ // add the last ngram of size upto_n
208
+ _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);
209
+
210
+ // add the 1..(upto_n-1) sized ngrams at the tail
211
+ if (upto_n > 1) {
212
+ while(head < RSTRING_PTR(text)+text_len) {
213
+ if(*head == ' ' || *head == '.') {
214
+ _incr_value(hash, rb_str_new(head + 1, tail - head - 1), suffix, incr_existing_keys_only);
215
+ }
216
+ head++;
217
+ }
218
+ }
219
+
220
+ return self;
221
+ }
222
+
115
223
  extern "C"
116
224
  void Init_wordtree() {
117
225
  VALUE rb_mWordTree = rb_define_module("WordTree");
@@ -122,4 +230,6 @@ void Init_wordtree() {
122
230
 
123
231
  rb_define_module_function(rb_mText, "clean", RUBY_METHOD_FUNC(text_clean), 1);
124
232
  rb_define_module_function(rb_mText, "common_trigrams", RUBY_METHOD_FUNC(text_common_trigrams), 1);
233
+ rb_define_module_function(rb_mText, "incr_value", RUBY_METHOD_FUNC(text_incr_value), 4);
234
+ rb_define_module_function(rb_mText, "_add_ngrams_with_suffix", RUBY_METHOD_FUNC(text_add_ngrams_with_suffix), 5);
125
235
  }
@@ -33,5 +33,8 @@ module WordTree
33
33
  return wrapped_output
34
34
  end
35
35
 
36
# Adds the ngrams of up to +upto_n+ words found in +text+ to +hash+ by
# forwarding every argument, with defaults filled in, to the native
# +_add_ngrams_with_suffix+ module function.
#
# @param text [String] the text to scan
# @param hash [Hash] the counter hash that is updated in place
# @param upto_n [Integer] largest ngram size, in words (defaults to 4)
# @param suffix [Symbol, nil] inner counter name, or nil for flat integer counts
# @param incr_existing_keys_only [Boolean] when truthy, keys missing from
#   +hash+ are not created
# @return whatever the native function returns (it returns its receiver)
def self.add_ngrams_with_suffix(text, hash, upto_n = 4, suffix = nil, incr_existing_keys_only = false)
  _add_ngrams_with_suffix(
    text,
    hash,
    upto_n,
    suffix,
    incr_existing_keys_only
  )
end
36
39
  end
37
40
  end
@@ -1,3 +1,3 @@
1
1
  module WordTree
2
- VERSION = "0.4.0"
2
+ VERSION = "0.5.0"
3
3
  end
@@ -78,4 +78,70 @@ describe WordTree::Text do
78
78
  end
79
79
  end
80
80
 
81
# WordTree::Text.incr_value(hash, key, suffix, incr_existing_keys_only):
# a nil suffix counts directly under the key, a Symbol suffix counts inside
# a per-key inner hash.
describe "#incr_value" do
  context "existing keys only" do
    it "does not add keys" do
      counts = { "hello" => 1 }

      WordTree::Text.incr_value(counts, "goodbye", nil, true)

      expect(counts.size).to eq(1)
      expect(counts).not_to have_key("goodbye")
    end

    it "creates suffixes to existing keys" do
      counts = { "hello" => {} }

      WordTree::Text.incr_value(counts, "hello", :greeting, true)

      expect(counts.size).to eq(1)
      expect(counts["hello"]).to be_kind_of(Hash)
      expect(counts["hello"][:greeting]).to eq(1)
    end

    it "adds values for suffixes to existing keys" do
      counts = { "hello" => { greeting: 1 } }

      WordTree::Text.incr_value(counts, "hello", :greeting, true)
      WordTree::Text.incr_value(counts, "hello", :other, true)

      expect(counts.size).to eq(1)
      expect(counts["hello"]).to eq({ greeting: 2, other: 1 })
    end
  end

  context "open ended keys" do
    it "adds keys" do
      counts = {}

      WordTree::Text.incr_value(counts, "hello", nil, false)

      expect(counts).to eq({ "hello" => 1 })
    end

    it "adds key and suffix" do
      counts = {}

      WordTree::Text.incr_value(counts, "hello", :greeting, false)

      expect(counts).to eq({ "hello" => { greeting: 1 } })
    end
  end
end
121
+
122
# WordTree::Text.add_ngrams_with_suffix(text, hash, upto_n, suffix):
# both ' ' and '.' separate words, and multi-word keys keep the separator.
# NOTE(review): only upto_n values of 1 and 2 are exercised here.
describe "#add_ngrams_with_suffix" do
  it "adds ngrams to a hash" do
    counts = {}
    sample = "some text.text"

    WordTree::Text.add_ngrams_with_suffix(sample, counts, 2)

    expected = {
      "some" => 1,
      "text" => 2,
      "some text" => 1,
      "text.text" => 1
    }
    expect(counts).to eq(expected)
  end

  it "adds suffixes to hash of hashes" do
    counts = {}
    sample = "some text.text"

    WordTree::Text.add_ngrams_with_suffix(sample, counts, 1, :a)
    WordTree::Text.add_ngrams_with_suffix(sample, counts, 2, :b)

    expected = {
      "some" => { a: 1, b: 1 },
      "text" => { a: 2, b: 2 },
      "some text" => { b: 1 },
      "text.text" => { b: 1 }
    }
    expect(counts).to eq(expected)
  end
end
81
147
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordtree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Duane Johnson
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-26 00:00:00.000000000 Z
11
+ date: 2014-09-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: virtus