wordtree 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/wordtree.cc +110 -0
- data/lib/wordtree/text.rb +3 -0
- data/lib/wordtree/version.rb +1 -1
- data/spec/wordtree/text_spec.rb +66 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c57c8ec838d33d0f963f3f6316bcb165b65de21f
|
|
4
|
+
data.tar.gz: 08b411c2499a4a11a5b94cc160e57b4a24f33b96
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 89a3efcd77faaa357b4240f26a3f70291f9c340f33d83209417b0b12b3324b12280d14fdea786300634b0daa2d2c0c56ad013f00460799da3e82efc46691b41c
|
|
7
|
+
data.tar.gz: cf66225761d294969e41dc9e2d2d8de7d2ac518553240a40bd5bc4f72c9b488d7d64b507837a8cbf37310fa997d9b29a1d9305797ca00b3adb99eefcc35ec98b
|
data/ext/wordtree.cc
CHANGED
|
@@ -112,6 +112,114 @@ static VALUE text_clean(VALUE self, VALUE text) {
|
|
|
112
112
|
return text;
|
|
113
113
|
}
|
|
114
114
|
|
|
115
|
+
static inline void _incr_value(
|
|
116
|
+
VALUE hash, // Hash
|
|
117
|
+
VALUE key, // String
|
|
118
|
+
VALUE suffix, // Symbol or nil
|
|
119
|
+
VALUE incr_existing_keys_only) // true/false
|
|
120
|
+
{
|
|
121
|
+
// rb_funcall(rb_mKernel, rb_intern("p"), 4, hash, key, suffix, incr_existing_keys_only);
|
|
122
|
+
if (suffix == Qnil) {
|
|
123
|
+
// We know the hash is shallow, and has just integer values
|
|
124
|
+
VALUE val = rb_hash_lookup(hash, key);
|
|
125
|
+
if (val != Qnil) {
|
|
126
|
+
// Increment the key's value by 1
|
|
127
|
+
rb_hash_aset(hash, key, INT2FIX(FIX2INT(val) + 1));
|
|
128
|
+
} else if (!RTEST(incr_existing_keys_only)) {
|
|
129
|
+
// Add this key and start the value at 1
|
|
130
|
+
rb_hash_aset(hash, key, INT2FIX(1));
|
|
131
|
+
}
|
|
132
|
+
} else {
|
|
133
|
+
// The hash contains a hash
|
|
134
|
+
VALUE inner_hash = rb_hash_lookup(hash, key);
|
|
135
|
+
if (inner_hash != Qnil) {
|
|
136
|
+
Check_Type(inner_hash, T_HASH);
|
|
137
|
+
VALUE val = rb_hash_lookup(inner_hash, suffix);
|
|
138
|
+
if (val == Qnil) {
|
|
139
|
+
// Start this key.suffix's value at 1
|
|
140
|
+
rb_hash_aset(inner_hash, suffix, INT2FIX(1));
|
|
141
|
+
} else {
|
|
142
|
+
// Increment the key.suffix's value by 1
|
|
143
|
+
rb_hash_aset(inner_hash, suffix, INT2FIX(FIX2INT(val) + 1));
|
|
144
|
+
}
|
|
145
|
+
} else if (!RTEST(incr_existing_keys_only)) {
|
|
146
|
+
// Create an inner hash for this key (to contain suffixes)
|
|
147
|
+
inner_hash = rb_hash_new();
|
|
148
|
+
rb_hash_aset(inner_hash, suffix, INT2FIX(1));
|
|
149
|
+
// Add suffix inner_hash to this key
|
|
150
|
+
rb_hash_aset(hash, key, inner_hash);
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// Ruby-facing entry point for incrementing a single counter.
//
// Validates the argument types (hash: Hash, key: String, suffix: Symbol or
// nil), delegates the update to _incr_value, and returns `self`.
// Raises TypeError when an argument has the wrong type.
VALUE text_incr_value(VALUE self, VALUE hash, VALUE key, VALUE suffix, VALUE incr_existing_keys_only)
{
  Check_Type(hash, T_HASH);
  Check_Type(key, T_STRING);
  if (!NIL_P(suffix)) {
    Check_Type(suffix, T_SYMBOL);
  }

  _incr_value(hash, key, suffix, incr_existing_keys_only);

  return self;
}
|
|
164
|
+
|
|
165
|
+
// Counts the word n-grams of `text` into `hash`.
//
// Words are delimited by ' ' or '.'. For each word position the scan records
// the 1-word, 2-word, ... up to `upto_n`-word sequences starting there, then
// moves on to the next word. Once the end of the text is reached, the final
// window and (when upto_n > 1) the shorter sequences that end at the end of
// the text are recorded as well. Every sequence is counted via _incr_value.
//
// text                    - String to scan; an empty string is a no-op.
// hash                    - Hash updated in place.
// upto_n_value            - Integer, maximum number of words per n-gram.
// suffix                  - nil for flat counts, otherwise the inner-hash key
//                           (see _incr_value).
// incr_existing_keys_only - when truthy, keys missing from `hash` are not added.
//
// Returns `self`. Raises TypeError if `text` is not a String, `hash` is not a
// Hash, or `upto_n_value` cannot be converted to an int.
//
// NOTE(review): keys are built with rb_str_new, which does not carry over the
// encoding of `text` -- confirm this is intended for non-ASCII input.
VALUE text_add_ngrams_with_suffix(
  VALUE self,
  VALUE text,
  VALUE hash,
  VALUE upto_n_value,
  VALUE suffix,
  VALUE incr_existing_keys_only)
{
  // RSTRING_PTR / rb_hash_lookup on the wrong object type is undefined, so
  // reject bad arguments up front.
  Check_Type(text, T_STRING);
  Check_Type(hash, T_HASH);

  const long text_len = RSTRING_LEN(text);
  const int upto_n = NUM2INT(upto_n_value);

  if (text_len == 0) return self;

  const char* const start = RSTRING_PTR(text);
  const char* const end = start + text_len;

  const char* head = start;      // first byte of the current n-gram window
  const char* tail = start;      // scan position / one past the current n-gram
  const char* next_head = head;  // where the following window will begin
  const char* next_tail = tail;  // where scanning resumes for that window
  int word_count = 0;            // words seen so far in the current window

  // The scan is bounded by the string length rather than a NUL sentinel, so
  // it neither depends on the buffer being NUL-terminated nor stops early at
  // an embedded NUL byte.
  do {
    if (*tail == ' ' || *tail == '.') {
      word_count++;
      if (word_count == 1 || upto_n == 1) {
        // The next window starts right after the first word of this one
        next_head = next_tail = tail + 1;
      } else if (word_count == 2) {
        // Resume at the end of the second word: the next window re-reads this
        // separator as the end of its own first word
        next_tail = tail;
      }
      if (word_count <= upto_n) {
        _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);
      }
      if (word_count == upto_n) {
        // Window is full: slide it forward by one word
        head = next_head;
        tail = next_tail;
        word_count = 0;
      } else {
        tail++;
      }
    } else {
      tail++;
    }
  } while (tail < end);

  // add the last ngram of size upto_n (tail == end here)
  _incr_value(hash, rb_str_new(head, tail - head), suffix, incr_existing_keys_only);

  // add the 1..(upto_n-1) sized ngrams at the tail
  if (upto_n > 1) {
    for (; head < end; head++) {
      if (*head == ' ' || *head == '.') {
        _incr_value(hash, rb_str_new(head + 1, tail - head - 1), suffix, incr_existing_keys_only);
      }
    }
  }

  // Only raw pointers into `text` are used above; keep the String itself
  // visible to the GC until we are done with its buffer.
  RB_GC_GUARD(text);

  return self;
}
|
|
222
|
+
|
|
115
223
|
extern "C"
|
|
116
224
|
void Init_wordtree() {
|
|
117
225
|
VALUE rb_mWordTree = rb_define_module("WordTree");
|
|
@@ -122,4 +230,6 @@ void Init_wordtree() {
|
|
|
122
230
|
|
|
123
231
|
rb_define_module_function(rb_mText, "clean", RUBY_METHOD_FUNC(text_clean), 1);
|
|
124
232
|
rb_define_module_function(rb_mText, "common_trigrams", RUBY_METHOD_FUNC(text_common_trigrams), 1);
|
|
233
|
+
rb_define_module_function(rb_mText, "incr_value", RUBY_METHOD_FUNC(text_incr_value), 4);
|
|
234
|
+
rb_define_module_function(rb_mText, "_add_ngrams_with_suffix", RUBY_METHOD_FUNC(text_add_ngrams_with_suffix), 5);
|
|
125
235
|
}
|
data/lib/wordtree/text.rb
CHANGED
data/lib/wordtree/version.rb
CHANGED
data/spec/wordtree/text_spec.rb
CHANGED
|
@@ -78,4 +78,70 @@ describe WordTree::Text do
|
|
|
78
78
|
end
|
|
79
79
|
end
|
|
80
80
|
|
|
81
|
+
# Specs for WordTree::Text.incr_value(hash, key, suffix, incr_existing_keys_only)
describe "#incr_value" do
  context "existing keys only" do
    it "does not add keys" do
      counts = { "hello" => 1 }

      WordTree::Text.incr_value(counts, "goodbye", nil, true)

      expect(counts.size).to eq(1)
      expect(counts).not_to have_key("goodbye")
    end

    it "creates suffixes to existing keys" do
      counts = { "hello" => {} }

      WordTree::Text.incr_value(counts, "hello", :greeting, true)

      expect(counts.size).to eq(1)
      greeting_counts = counts["hello"]
      expect(greeting_counts).to be_kind_of(Hash)
      expect(greeting_counts[:greeting]).to eq(1)
    end

    it "adds values for suffixes to existing keys" do
      counts = { "hello" => { :greeting => 1 } }

      # One suffix that already exists, one that is new
      [:greeting, :other].each do |suffix|
        WordTree::Text.incr_value(counts, "hello", suffix, true)
      end

      expect(counts.size).to eq(1)
      expect(counts["hello"]).to eq({ :greeting => 2, :other => 1 })
    end
  end

  context "open ended keys" do
    it "adds keys" do
      counts = {}

      WordTree::Text.incr_value(counts, "hello", nil, false)

      expect(counts).to eq({ "hello" => 1 })
    end

    it "adds key and suffix" do
      counts = {}

      WordTree::Text.incr_value(counts, "hello", :greeting, false)

      expect(counts).to eq({ "hello" => { :greeting => 1 } })
    end
  end
end
|
|
121
|
+
|
|
122
|
+
# Specs for WordTree::Text.add_ngrams_with_suffix(text, hash, upto_n[, suffix])
describe "#add_ngrams_with_suffix" do
  it "adds ngrams to a hash" do
    ngram_counts = {}
    sample = "some text.text"

    WordTree::Text.add_ngrams_with_suffix(sample, ngram_counts, 2)

    # Both ' ' and '.' separate words, so "text" is seen twice
    expected = {
      "some" => 1,
      "text" => 2,
      "some text" => 1,
      "text.text" => 1
    }
    expect(ngram_counts).to eq(expected)
  end

  it "adds suffixes to hash of hashes" do
    ngram_counts = {}
    sample = "some text.text"

    # Unigrams tagged :a, then uni- and bigrams tagged :b
    WordTree::Text.add_ngrams_with_suffix(sample, ngram_counts, 1, :a)
    WordTree::Text.add_ngrams_with_suffix(sample, ngram_counts, 2, :b)

    expected = {
      "some" => { :a => 1, :b => 1 },
      "text" => { :a => 2, :b => 2 },
      "some text" => { :b => 1 },
      "text.text" => { :b => 1 }
    }
    expect(ngram_counts).to eq(expected)
  end
end
|
|
81
147
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: wordtree
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.0
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Duane Johnson
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2014-09-
|
|
11
|
+
date: 2014-09-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: virtus
|