wordtree 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/wordtree.cc +110 -0
- data/lib/wordtree/text.rb +3 -0
- data/lib/wordtree/version.rb +1 -1
- data/spec/wordtree/text_spec.rb +66 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c57c8ec838d33d0f963f3f6316bcb165b65de21f
|
4
|
+
data.tar.gz: 08b411c2499a4a11a5b94cc160e57b4a24f33b96
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 89a3efcd77faaa357b4240f26a3f70291f9c340f33d83209417b0b12b3324b12280d14fdea786300634b0daa2d2c0c56ad013f00460799da3e82efc46691b41c
|
7
|
+
data.tar.gz: cf66225761d294969e41dc9e2d2d8de7d2ac518553240a40bd5bc4f72c9b488d7d64b507837a8cbf37310fa997d9b29a1d9305797ca00b3adb99eefcc35ec98b
|
data/ext/wordtree.cc
CHANGED
@@ -112,6 +112,114 @@ static VALUE text_clean(VALUE self, VALUE text) {
|
|
112
112
|
return text;
|
113
113
|
}
|
114
114
|
|
115
|
+
// Increment a counter stored in a Ruby Hash, in place.
//
//   hash   - the Hash to update.
//   key    - the top-level key (callers in this file pass a String).
//   suffix - nil, or a second-level key (callers in this file pass a Symbol).
//            nil:     `hash` is shallow and hash[key] is the integer counter.
//            non-nil: hash[key] is an inner Hash and inner[suffix] is the
//                     integer counter.
//   incr_existing_keys_only - when truthy, a `key` that is absent from `hash`
//            is left absent. A missing `suffix` under an existing `key` is
//            still created in either mode.
//
// Counters are read with NUM2LONG and written with LONG2NUM, so a stored value
// that is not an Integer raises TypeError instead of being reinterpreted, and
// counts past the Fixnum range are promoted rather than truncated to `int`.
// Raises TypeError (via Check_Type) if `suffix` is given and hash[key] exists
// but is not a Hash.
static inline void _incr_value(
  VALUE hash,                     // Hash
  VALUE key,                      // String
  VALUE suffix,                   // Symbol or nil
  VALUE incr_existing_keys_only)  // true/false
{
  const int may_add_key = !RTEST(incr_existing_keys_only);

  if (suffix == Qnil) {
    // Shallow hash: key => count
    VALUE count = rb_hash_lookup(hash, key);
    if (count != Qnil) {
      rb_hash_aset(hash, key, LONG2NUM(NUM2LONG(count) + 1));
    } else if (may_add_key) {
      // First sighting of this key
      rb_hash_aset(hash, key, INT2FIX(1));
    }
    return;
  }

  // Nested hash: key => { suffix => count }
  VALUE inner_hash = rb_hash_lookup(hash, key);
  if (inner_hash != Qnil) {
    Check_Type(inner_hash, T_HASH);
    VALUE count = rb_hash_lookup(inner_hash, suffix);
    if (count == Qnil) {
      // First sighting of this suffix under an existing key
      rb_hash_aset(inner_hash, suffix, INT2FIX(1));
    } else {
      rb_hash_aset(inner_hash, suffix, LONG2NUM(NUM2LONG(count) + 1));
    }
  } else if (may_add_key) {
    // First sighting of this key: create its inner hash with the suffix at 1
    inner_hash = rb_hash_new();
    rb_hash_aset(inner_hash, suffix, INT2FIX(1));
    rb_hash_aset(hash, key, inner_hash);
  }
}
|
154
|
+
|
155
|
+
// Type-checking entry point around _incr_value.
//
// Validates the arguments, bumps the counter for `key` (or for `key` + `suffix`
// when a suffix is given) inside `hash`, and returns `self`.
// Check_Type raises TypeError when `hash` is not a Hash, `key` is not a String,
// or `suffix` is neither nil nor a Symbol.
VALUE text_incr_value(VALUE self, VALUE hash, VALUE key, VALUE suffix, VALUE incr_existing_keys_only)
{
  Check_Type(hash, T_HASH);
  Check_Type(key, T_STRING);

  // A nil suffix selects the shallow (key => count) layout, so only a
  // non-nil suffix needs to be validated.
  if (Qnil != suffix) {
    Check_Type(suffix, T_SYMBOL);
  }

  _incr_value(hash, key, suffix, incr_existing_keys_only);

  return self;
}
|
164
|
+
|
165
|
+
// Count the n-grams of `text` (1 up to `upto_n_value` words long) into `hash`.
//
//   text         - String to scan. Words are delimited by ' ' and '.'; an
//                  n-gram is the raw byte slice spanning its words, so the
//                  original separators between the words are kept.
//   hash         - Hash that receives the counts (see _incr_value for layout).
//   upto_n_value - Integer, the largest n-gram size to emit.
//   suffix       - nil or a second-level key, passed through to _incr_value.
//   incr_existing_keys_only - passed through to _incr_value.
//
// Returns `self`. Raises TypeError if `text` is not a String, `hash` is not a
// Hash, or `upto_n_value` cannot be converted to an integer.
//
// The scan is bounded by the string's length rather than by a NUL terminator,
// so text containing embedded NUL bytes is processed to its end.
VALUE text_add_ngrams_with_suffix(
  VALUE self,
  VALUE text,
  VALUE hash,
  VALUE upto_n_value,
  VALUE suffix,
  VALUE incr_existing_keys_only)
{
  Check_Type(text, T_STRING);
  Check_Type(hash, T_HASH);

  const int upto_n = NUM2INT(upto_n_value);
  const long text_len = RSTRING_LEN(text);

  if (text_len == 0) return self;

  const char* const start = RSTRING_PTR(text);
  const char* const end = start + text_len;

  const char* head = start;       // first byte of the n-gram being grown
  const char* tail = start;       // scan cursor; one past the n-gram's last byte
  const char* next_head = head;   // where `head` restarts after a full n-gram
  const char* next_tail = tail;   // where `tail` restarts after a full n-gram
  int word_count = 0;             // words in [head, tail) so far

  while (tail < end) {
    if (*tail == ' ' || *tail == '.') {
      word_count++;
      if (word_count == 1 || upto_n == 1) {
        // The next window starts at the word following the first one
        next_head = next_tail = tail + 1;
      } else if (word_count == 2) {
        // Resume the scan at the end of the second word so that it is
        // counted again as the first word of the next window
        next_tail = tail;
      }
      if (word_count <= upto_n) {
        _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);
      }
      if (word_count == upto_n) {
        // Window is full: slide it forward by one word
        head = next_head;
        tail = next_tail;
        word_count = 0;
      } else {
        tail++;
      }
    } else {
      tail++;
    }
  }

  // add the last ngram of size upto_n
  _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);

  // add the 1..(upto_n-1) sized ngrams at the tail
  if (upto_n > 1) {
    while (head < end) {
      if (*head == ' ' || *head == '.') {
        _incr_value(hash, rb_str_new(head + 1, tail - head - 1), suffix, incr_existing_keys_only);
      }
      head++;
    }
  }

  // Raw pointers into `text` were held across allocating calls; keep the
  // String itself visible to the GC until they are no longer used.
  RB_GC_GUARD(text);

  return self;
}
|
222
|
+
|
115
223
|
extern "C"
|
116
224
|
void Init_wordtree() {
|
117
225
|
VALUE rb_mWordTree = rb_define_module("WordTree");
|
@@ -122,4 +230,6 @@ void Init_wordtree() {
|
|
122
230
|
|
123
231
|
rb_define_module_function(rb_mText, "clean", RUBY_METHOD_FUNC(text_clean), 1);
|
124
232
|
rb_define_module_function(rb_mText, "common_trigrams", RUBY_METHOD_FUNC(text_common_trigrams), 1);
|
233
|
+
rb_define_module_function(rb_mText, "incr_value", RUBY_METHOD_FUNC(text_incr_value), 4);
|
234
|
+
rb_define_module_function(rb_mText, "_add_ngrams_with_suffix", RUBY_METHOD_FUNC(text_add_ngrams_with_suffix), 5);
|
125
235
|
}
|
data/lib/wordtree/text.rb
CHANGED
data/lib/wordtree/version.rb
CHANGED
data/spec/wordtree/text_spec.rb
CHANGED
@@ -78,4 +78,70 @@ describe WordTree::Text do
|
|
78
78
|
end
|
79
79
|
end
|
80
80
|
|
81
|
+
describe "#incr_value" do
  # Fourth argument true: keys that are absent from the hash must stay absent,
  # but suffixes may still be added under keys that already exist.
  context "existing keys only" do
    it "does not add keys" do
      counts = {"hello" => 1}
      WordTree::Text.incr_value(counts, "goodbye", nil, true)
      expect(counts.size).to eq 1
      expect(counts).not_to have_key("goodbye")
    end

    it "creates suffixes to existing keys" do
      counts = {"hello" => {}}
      WordTree::Text.incr_value(counts, "hello", :greeting, true)
      expect(counts.size).to eq 1
      expect(counts["hello"]).to be_a(Hash)
      expect(counts["hello"][:greeting]).to eq 1
    end

    it "adds values for suffixes to existing keys" do
      counts = {"hello" => {:greeting => 1}}
      [:greeting, :other].each do |suffix|
        WordTree::Text.incr_value(counts, "hello", suffix, true)
      end
      expect(counts.size).to eq 1
      expect(counts["hello"]).to eq(:greeting => 2, :other => 1)
    end
  end

  # Fourth argument false: unseen keys are created on demand.
  context "open ended keys" do
    it "adds keys" do
      counts = {}
      WordTree::Text.incr_value(counts, "hello", nil, false)
      expect(counts).to eq("hello" => 1)
    end

    it "adds key and suffix" do
      counts = {}
      WordTree::Text.incr_value(counts, "hello", :greeting, false)
      expect(counts).to eq("hello" => {:greeting => 1})
    end
  end
end
|
121
|
+
|
122
|
+
describe "#add_ngrams_with_suffix" do
  # Without a suffix the hash maps each n-gram straight to its count.
  it "adds ngrams to a hash" do
    ngrams = {}
    sample = "some text.text"
    WordTree::Text.add_ngrams_with_suffix(sample, ngrams, 2)
    expect(ngrams).to eq(
      "some" => 1,
      "text" => 2,
      "some text" => 1,
      "text.text" => 1)
  end

  # With a suffix each n-gram maps to an inner hash of per-suffix counts.
  it "adds suffixes to hash of hashes" do
    ngrams = {}
    sample = "some text.text"
    WordTree::Text.add_ngrams_with_suffix(sample, ngrams, 1, :a)
    WordTree::Text.add_ngrams_with_suffix(sample, ngrams, 2, :b)
    expect(ngrams).to eq(
      "some" => {:a => 1, :b => 1},
      "text" => {:a => 2, :b => 2},
      "some text" => {:b => 1},
      "text.text" => {:b => 1}
    )
  end
end
|
81
147
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wordtree
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Duane Johnson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: virtus
|