categorize 0.0.11 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/categorize/categorize.c +4 -4
- data/lib/categorize.rb +1 -1
- data/lib/categorize/model.rb +12 -11
- data/lib/categorize/version.rb +1 -1
- metadata +1 -4
data/ext/categorize/categorize.c
CHANGED
@@ -12,7 +12,7 @@ VALUE Categorize = Qnil;
|
|
12
12
|
VALUE CBagOfWords = Qnil;
|
13
13
|
VALUE Models = Qnil;
|
14
14
|
|
15
|
-
static VALUE
|
15
|
+
static VALUE method_model(VALUE, VALUE);
|
16
16
|
static int add_or_update_gram_from_index(int, char *);
|
17
17
|
|
18
18
|
// Store all grams, used in compare_top_grams.
|
@@ -25,7 +25,7 @@ void Init_categorize()
|
|
25
25
|
Categorize = rb_define_module("Categorize");
|
26
26
|
Models = rb_define_module_under(Categorize, "Models");
|
27
27
|
CBagOfWords = rb_define_class_under(Models, "CBagOfWords", rb_cObject);
|
28
|
-
rb_define_method(CBagOfWords, "
|
28
|
+
rb_define_method(CBagOfWords, "model", method_model, 1);
|
29
29
|
}
|
30
30
|
|
31
31
|
const bool DEBUG = false;
|
@@ -122,13 +122,13 @@ int compare_top_grams(const void *idx1, const void *idx2)
|
|
122
122
|
}
|
123
123
|
|
124
124
|
/*
|
125
|
-
*
|
125
|
+
* model(array_of_tokens);
|
126
126
|
* ==== Return
|
127
127
|
* Top terms
|
128
128
|
* ==== Parameters
|
129
129
|
* array_of_tokens: Tokens to turn into grams and extract phrases from.
|
130
130
|
*/
|
131
|
-
static VALUE
|
131
|
+
static VALUE method_model(VALUE self, VALUE array_of_tokens)
|
132
132
|
{
|
133
133
|
int i, j;
|
134
134
|
long array_of_tokens_len = RARRAY_LEN(array_of_tokens);
|
data/lib/categorize.rb
CHANGED
data/lib/categorize/model.rb
CHANGED
@@ -4,6 +4,7 @@ module Categorize
|
|
4
4
|
module Model
|
5
5
|
MIN_WORD_LENGTH = 3
|
6
6
|
@bag_of_words = Models::BagOfWords.new
|
7
|
+
@c_bag_of_words = Models::CBagOfWords.new
|
7
8
|
|
8
9
|
class << self
|
9
10
|
#include Bow
|
@@ -21,12 +22,13 @@ module Categorize
|
|
21
22
|
# ==== Parameters
|
22
23
|
# items:: the items to be classified
|
23
24
|
def make_model_c(strings)
|
24
|
-
strings.map { |s| preprocess(s) }
|
25
|
-
|
25
|
+
array_of_tokens = strings.map { |s| preprocess(s) }
|
26
|
+
ret = @c_bag_of_words.model(array_of_tokens);
|
26
27
|
count = 0
|
27
28
|
ret.reduce({}) do |hash, term|
|
28
29
|
hash[term] ||= []
|
29
|
-
hash[term] << count
|
30
|
+
hash[term] << count
|
31
|
+
count += 1
|
30
32
|
hash
|
31
33
|
end
|
32
34
|
end
|
@@ -37,15 +39,14 @@ module Categorize
|
|
37
39
|
]
|
38
40
|
end
|
39
41
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
word
|
46
|
-
Constants::Words::COMMON.include?(word)
|
47
|
-
end
|
42
|
+
def preprocess(string)
|
43
|
+
split_lower_strings = string.split(
|
44
|
+
Constants::Words::SPLIT_REGEX).map(&:downcase)
|
45
|
+
split_lower_strings.delete_if do |word|
|
46
|
+
word.length < MIN_WORD_LENGTH ||
|
47
|
+
Constants::Words::COMMON.include?(word)
|
48
48
|
end
|
49
|
+
end
|
49
50
|
end
|
50
51
|
end
|
51
52
|
end
|
data/lib/categorize/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -46,9 +46,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
46
46
|
- - ! '>='
|
47
47
|
- !ruby/object:Gem::Version
|
48
48
|
version: '0'
|
49
|
-
segments:
|
50
|
-
- 0
|
51
|
-
hash: -3553060524054293255
|
52
49
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
53
50
|
none: false
|
54
51
|
requirements:
|