wordtree 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7bcb5a59a130a24ca1bede26a9f3b8efa651230c
4
- data.tar.gz: 03ef94ffd836a11f891065fd7eb3c73d4aa8bdd2
3
+ metadata.gz: c57c8ec838d33d0f963f3f6316bcb165b65de21f
4
+ data.tar.gz: 08b411c2499a4a11a5b94cc160e57b4a24f33b96
5
5
  SHA512:
6
- metadata.gz: cbf2b847f90fdffd1a52a4f7ea2c2fd78ce339ded803cbf30720fba1dced77b2ea2e9b1b84f5fee147d472d0e673437ad0e58ccffcb1a5fd2794fa87ec739384
7
- data.tar.gz: 0abb233dd846913d69e13087d84c174136e8a1511c417f5239e3395972f17af71f43091079e8fc605354843c09bb73b6e82ca358801303d77f692c3d8d140ea5
6
+ metadata.gz: 89a3efcd77faaa357b4240f26a3f70291f9c340f33d83209417b0b12b3324b12280d14fdea786300634b0daa2d2c0c56ad013f00460799da3e82efc46691b41c
7
+ data.tar.gz: cf66225761d294969e41dc9e2d2d8de7d2ac518553240a40bd5bc4f72c9b488d7d64b507837a8cbf37310fa997d9b29a1d9305797ca00b3adb99eefcc35ec98b
@@ -112,6 +112,114 @@ static VALUE text_clean(VALUE self, VALUE text) {
112
112
  return text;
113
113
  }
114
114
 
115
+ static inline void _incr_value( // Add one to the counter stored at hash[key] (suffix nil) or hash[key][suffix] (suffix given)
116
+ VALUE hash, // Hash: key => count when suffix is nil, key => {suffix => count} otherwise
117
+ VALUE key, // String (not type-checked here; text_incr_value checks it before calling)
118
+ VALUE suffix, // Symbol or nil; nil selects the flat, non-nested counting mode
119
+ VALUE incr_existing_keys_only) // true/false; when truthy, a key missing from hash is never added
120
+ {
121
+ // Debug aid (disabled): rb_funcall(rb_mKernel, rb_intern("p"), 4, hash, key, suffix, incr_existing_keys_only);
122
+ if (suffix == Qnil) {
123
+ // We know the hash is shallow, and has just integer values. NOTE(review): nothing here verifies that before FIX2INT is applied - confirm callers never mix flat and nested use on one hash
124
+ VALUE val = rb_hash_lookup(hash, key); // Qnil means the key was not found
125
+ if (val != Qnil) {
126
+ // Increment the key's value by 1. NOTE(review): no overflow guard around FIX2INT/INT2FIX - confirm counts stay small
127
+ rb_hash_aset(hash, key, INT2FIX(FIX2INT(val) + 1));
128
+ } else if (!RTEST(incr_existing_keys_only)) {
129
+ // Add this key and start the value at 1 (skipped entirely when incr_existing_keys_only is truthy)
130
+ rb_hash_aset(hash, key, INT2FIX(1));
131
+ }
132
+ } else {
133
+ // Suffix given: hash[key] is expected to be an inner hash of suffix => count
134
+ VALUE inner_hash = rb_hash_lookup(hash, key);
135
+ if (inner_hash != Qnil) {
136
+ Check_Type(inner_hash, T_HASH); // rejects a key that already holds a non-Hash value (e.g. a flat count)
137
+ VALUE val = rb_hash_lookup(inner_hash, suffix);
138
+ if (val == Qnil) {
139
+ // Start this key.suffix's value at 1; a new suffix is created even when incr_existing_keys_only is truthy, because only the outer key is gated
140
+ rb_hash_aset(inner_hash, suffix, INT2FIX(1));
141
+ } else {
142
+ // Increment the key.suffix's value by 1. NOTE(review): val is not type-checked before FIX2INT
143
+ rb_hash_aset(inner_hash, suffix, INT2FIX(FIX2INT(val) + 1));
144
+ }
145
+ } else if (!RTEST(incr_existing_keys_only)) {
146
+ // Create an inner hash for this key (to contain suffixes)
147
+ inner_hash = rb_hash_new();
148
+ rb_hash_aset(inner_hash, suffix, INT2FIX(1));
149
+ // Store the new inner hash under key in the outer hash
150
+ rb_hash_aset(hash, key, inner_hash);
151
+ }
152
+ }
153
+ }
154
+
155
+ VALUE text_incr_value(VALUE self, VALUE hash, VALUE key, VALUE suffix, VALUE incr_existing_keys_only) // Type-checking wrapper around _incr_value; registered in Init_wordtree as "incr_value" with 4 arguments
156
+ {
157
+ Check_Type(hash, T_HASH); // hash must be a Hash
158
+ Check_Type(key, T_STRING); // key must be a String
159
+ if (suffix != Qnil) Check_Type(suffix, T_SYMBOL); // suffix is optional: nil is accepted, anything else must be a Symbol
160
+
161
+ _incr_value(hash, key, suffix, incr_existing_keys_only); // mutates hash in place
162
+ return self; // the updated counts are only visible through hash, not the return value
163
+ }
164
+
165
+ VALUE text_add_ngrams_with_suffix( // Scan text for words delimited by ' ' or '.' and count word runs of up to upto_n words into hash via _incr_value
166
+ VALUE self,
167
+ VALUE text, // NOTE(review): used with RSTRING_PTR/RSTRING_LEN without a Check_Type - presumably always a String; verify callers
168
+ VALUE hash, // NOTE(review): passed to _incr_value without a Check_Type - presumably always a Hash; verify callers
169
+ VALUE upto_n_value, // largest ngram size, in words; converted with FIX2INT below without a type check
170
+ VALUE suffix, // forwarded unchanged to _incr_value (nil => flat counts, otherwise nested under suffix)
171
+ VALUE incr_existing_keys_only) // forwarded unchanged to _incr_value
172
+ {
173
+ char* head = RSTRING_PTR(text); // start of the ngram currently being built
174
+ char* tail = RSTRING_PTR(text); // scan cursor; the candidate ngram is the bytes in [head, tail)
175
+ char* next_head = head; // where head restarts once the current window is complete (set after the window's first word)
176
+ char* next_tail = tail; // where tail restarts once the current window is complete
177
+ int word_count = 0; // delimiters seen since head was last reset
178
+ int text_len = RSTRING_LEN(text);
179
+ int incr_existing = RTEST(incr_existing_keys_only); // NOTE(review): computed but never read below; the raw VALUE is passed on instead
180
+ int upto_n = FIX2INT(upto_n_value);
181
+
182
+ if (text_len == 0) return self; // nothing to scan; also keeps the do/while from reading an empty buffer
183
+
184
+ do {
185
+ if (*tail == ' ' || *tail == '.' || tail >= head+text_len) { // NOTE(review): the bound is relative to the moving head, not the buffer start, and is tested only after *tail is read - confirm intent
186
+ word_count++;
187
+ if (word_count == 1 || upto_n == 1) {
188
+ next_head = next_tail = tail + 1; // next window starts at the byte after this delimiter
189
+ } else if (word_count == 2) {
190
+ next_tail = tail; // resume on this delimiter so it closes the first word of the next window
191
+ }
192
+ if (word_count <= upto_n) {
193
+ _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only); // key is the raw byte span, so inner delimiters are kept (e.g. "text.text")
194
+ }
195
+ if (word_count == upto_n) {
196
+ head = next_head; // window is full: restart from the saved positions
197
+ tail = next_tail;
198
+ word_count = 0;
199
+ } else {
200
+ tail++;
201
+ }
202
+ } else {
203
+ tail++;
204
+ }
205
+ } while(*tail); // stops at the first NUL byte; presumably relies on the string buffer being NUL-terminated - TODO confirm
206
+
207
+ // add whatever span is left between head and tail when the scan stops (the final ngram; may hold fewer than upto_n words)
208
+ _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);
209
+
210
+ // add the 1..(upto_n-1) sized ngrams at the tail: one per delimiter between head and the end of text
211
+ if (upto_n > 1) {
212
+ while(head < RSTRING_PTR(text)+text_len) {
213
+ if(*head == ' ' || *head == '.') {
214
+ _incr_value(hash, rb_str_new(head + 1, tail - head - 1), suffix, incr_existing_keys_only); // bytes after this delimiter up to tail
215
+ }
216
+ head++;
217
+ }
218
+ }
219
+
220
+ return self; // results are delivered by mutating hash
221
+ }
222
+
115
223
  extern "C"
116
224
  void Init_wordtree() {
117
225
  VALUE rb_mWordTree = rb_define_module("WordTree");
@@ -122,4 +230,6 @@ void Init_wordtree() {
122
230
 
123
231
  rb_define_module_function(rb_mText, "clean", RUBY_METHOD_FUNC(text_clean), 1);
124
232
  rb_define_module_function(rb_mText, "common_trigrams", RUBY_METHOD_FUNC(text_common_trigrams), 1);
233
+ rb_define_module_function(rb_mText, "incr_value", RUBY_METHOD_FUNC(text_incr_value), 4);
234
+ rb_define_module_function(rb_mText, "_add_ngrams_with_suffix", RUBY_METHOD_FUNC(text_add_ngrams_with_suffix), 5);
125
235
  }
@@ -33,5 +33,8 @@ module WordTree
33
33
  return wrapped_output
34
34
  end
35
35
 
36
+ def self.add_ngrams_with_suffix(text, hash, upto_n=4, suffix=nil, incr_existing_keys_only=false) # Supplies defaults (upto_n 4, no suffix, new keys allowed) for _add_ngrams_with_suffix
37
+ _add_ngrams_with_suffix(text, hash, upto_n, suffix, incr_existing_keys_only) # presumably the extension function registered under this name with 5 arguments - verify; hash is updated in place
38
+ end
36
39
  end
37
40
  end
@@ -1,3 +1,3 @@
1
1
  module WordTree
2
- VERSION = "0.4.0"
2
+ VERSION = "0.5.0"
3
3
  end
@@ -78,4 +78,70 @@ describe WordTree::Text do
78
78
  end
79
79
  end
80
80
 
81
+ describe "#incr_value" do # Arguments throughout: (hash, key, suffix, incr_existing_keys_only)
82
+ context "existing keys only" do # fourth argument true
83
+ it "does not add keys" do
84
+ hash = {"hello" => 1}
85
+ WordTree::Text.incr_value(hash, "goodbye", nil, true) # "goodbye" is absent, so nothing may change
86
+ expect(hash.size).to eq 1
87
+ expect(hash).to_not have_key("goodbye")
88
+ end
89
+
90
+ it "creates suffixes to existing keys" do
91
+ hash = {"hello" => {}}
92
+ WordTree::Text.incr_value(hash, "hello", :greeting, true) # key exists, so a new suffix may still be created
93
+ expect(hash.size).to eq 1
94
+ expect(hash["hello"]).to be_a(Hash)
95
+ expect(hash["hello"][:greeting]).to eq 1
96
+ end
97
+
98
+ it "adds values for suffixes to existing keys" do
99
+ hash = {"hello" => {:greeting => 1}}
100
+ WordTree::Text.incr_value(hash, "hello", :greeting, true) # existing suffix: 1 -> 2
101
+ WordTree::Text.incr_value(hash, "hello", :other, true) # new suffix under an existing key: starts at 1
102
+ expect(hash.size).to eq 1
103
+ expect(hash["hello"]).to eq(:greeting => 2, :other => 1)
104
+ end
105
+ end
106
+
107
+ context "open ended keys" do # fourth argument false
108
+ it "adds keys" do
109
+ hash = {}
110
+ WordTree::Text.incr_value(hash, "hello", nil, false) # nil suffix: flat integer count
111
+ expect(hash).to eq("hello" => 1)
112
+ end
113
+
114
+ it "adds key and suffix" do
115
+ hash = {}
116
+ WordTree::Text.incr_value(hash, "hello", :greeting, false) # suffix given: count is nested under the key
117
+ expect(hash).to eq("hello" => {:greeting => 1})
118
+ end
119
+ end
120
+ end
121
+
122
+ describe "#add_ngrams_with_suffix" do # Arguments: (text, hash, upto_n, suffix); both ' ' and '.' separate words in the sample text
123
+ it "adds ngrams to a hash" do
124
+ hash = {}
125
+ text = "some text.text"
126
+ WordTree::Text.add_ngrams_with_suffix(text, hash, 2) # unigrams and bigrams, no suffix => flat counts
127
+ expect(hash).to eq(
128
+ "some" => 1,
129
+ "text" => 2, # "text" occurs twice in the input
130
+ "some text" => 1,
131
+ "text.text" => 1) # the original delimiter is kept inside the bigram key
132
+ end
133
+
134
+ it "adds suffixes to hash of hashes" do
135
+ hash = {}
136
+ text = "some text.text"
137
+ WordTree::Text.add_ngrams_with_suffix(text, hash, 1, :a) # unigrams only, counted under :a
138
+ WordTree::Text.add_ngrams_with_suffix(text, hash, 2, :b) # unigrams and bigrams, counted under :b in the same hash
139
+ expect(hash).to eq(
140
+ "some" => {:a => 1, :b => 1},
141
+ "text" => {:a => 2, :b => 2},
142
+ "some text" => {:b => 1}, # bigrams appear only under :b
143
+ "text.text" => {:b => 1}
144
+ )
145
+ end
146
+ end
81
147
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wordtree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Duane Johnson
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-26 00:00:00.000000000 Z
11
+ date: 2014-09-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: virtus