categorize 0.0.11 → 0.0.12
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/categorize/categorize.c +4 -4
- data/lib/categorize.rb +1 -1
- data/lib/categorize/model.rb +12 -11
- data/lib/categorize/version.rb +1 -1
- metadata +1 -4
data/ext/categorize/categorize.c
CHANGED
@@ -12,7 +12,7 @@ VALUE Categorize = Qnil;
|
|
12
12
|
VALUE CBagOfWords = Qnil;
|
13
13
|
VALUE Models = Qnil;
|
14
14
|
|
15
|
-
static VALUE
|
15
|
+
static VALUE method_model(VALUE, VALUE);
|
16
16
|
static int add_or_update_gram_from_index(int, char *);
|
17
17
|
|
18
18
|
// Store all grams, used in compare_top_grams.
|
@@ -25,7 +25,7 @@ void Init_categorize()
|
|
25
25
|
Categorize = rb_define_module("Categorize");
|
26
26
|
Models = rb_define_module_under(Categorize, "Models");
|
27
27
|
CBagOfWords = rb_define_class_under(Models, "CBagOfWords", rb_cObject);
|
28
|
-
rb_define_method(CBagOfWords, "
|
28
|
+
rb_define_method(CBagOfWords, "model", method_model, 1);
|
29
29
|
}
|
30
30
|
|
31
31
|
const bool DEBUG = false;
|
@@ -122,13 +122,13 @@ int compare_top_grams(const void *idx1, const void *idx2)
|
|
122
122
|
}
|
123
123
|
|
124
124
|
/*
|
125
|
-
*
|
125
|
+
* model(array_of_tokens);
|
126
126
|
* ==== Return
|
127
127
|
* Top terms
|
128
128
|
* ==== Parameters
|
129
129
|
* array_of_tokens: Tokens to turn into grams and extract phrases from.
|
130
130
|
*/
|
131
|
-
static VALUE
|
131
|
+
static VALUE method_model(VALUE self, VALUE array_of_tokens)
|
132
132
|
{
|
133
133
|
int i, j;
|
134
134
|
long array_of_tokens_len = RARRAY_LEN(array_of_tokens);
|
data/lib/categorize.rb
CHANGED
data/lib/categorize/model.rb
CHANGED
@@ -4,6 +4,7 @@ module Categorize
|
|
4
4
|
module Model
|
5
5
|
MIN_WORD_LENGTH = 3
|
6
6
|
@bag_of_words = Models::BagOfWords.new
|
7
|
+
@c_bag_of_words = Models::CBagOfWords.new
|
7
8
|
|
8
9
|
class << self
|
9
10
|
#include Bow
|
@@ -21,12 +22,13 @@ module Categorize
|
|
21
22
|
# ==== Parameters
|
22
23
|
# items:: the items to be classified
|
23
24
|
def make_model_c(strings)
|
24
|
-
strings.map { |s| preprocess(s) }
|
25
|
-
|
25
|
+
array_of_tokens = strings.map { |s| preprocess(s) }
|
26
|
+
ret = @c_bag_of_words.model(array_of_tokens);
|
26
27
|
count = 0
|
27
28
|
ret.reduce({}) do |hash, term|
|
28
29
|
hash[term] ||= []
|
29
|
-
hash[term] << count
|
30
|
+
hash[term] << count
|
31
|
+
count += 1
|
30
32
|
hash
|
31
33
|
end
|
32
34
|
end
|
@@ -37,15 +39,14 @@ module Categorize
|
|
37
39
|
]
|
38
40
|
end
|
39
41
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
word
|
46
|
-
Constants::Words::COMMON.include?(word)
|
47
|
-
end
|
42
|
+
def preprocess(string)
|
43
|
+
split_lower_strings = string.split(
|
44
|
+
Constants::Words::SPLIT_REGEX).map(&:downcase)
|
45
|
+
split_lower_strings.delete_if do |word|
|
46
|
+
word.length < MIN_WORD_LENGTH ||
|
47
|
+
Constants::Words::COMMON.include?(word)
|
48
48
|
end
|
49
|
+
end
|
49
50
|
end
|
50
51
|
end
|
51
52
|
end
|
data/lib/categorize/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: categorize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -46,9 +46,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
46
46
|
- - ! '>='
|
47
47
|
- !ruby/object:Gem::Version
|
48
48
|
version: '0'
|
49
|
-
segments:
|
50
|
-
- 0
|
51
|
-
hash: -3553060524054293255
|
52
49
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
53
50
|
none: false
|
54
51
|
requirements:
|