categorize 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/categorize/model.rb +35 -33
- metadata +1 -1
data/lib/categorize/model.rb
CHANGED
@@ -1,49 +1,51 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
module Categorize
|
4
|
-
|
5
|
-
|
4
|
+
module Model
|
5
|
+
MIN_WORD_LENGTH = 3
|
6
|
+
@bag_of_words = Models::BagOfWords.new
|
6
7
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
8
|
+
class << self
|
9
|
+
#include Bow
|
10
|
+
# ==== Return
|
11
|
+
# Hash - category => results
|
12
|
+
# ==== Parameters
|
13
|
+
# documents:: a list of documents to be classified
|
14
|
+
def make_model(query, documents, modeler = @bag_of_words)
|
15
|
+
records_to_tokens = lexicalize(documents)
|
16
|
+
modeler.model(query.downcase.strip, records_to_tokens)
|
17
|
+
end
|
17
18
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
19
|
+
# ==== Return
|
20
|
+
# Hash - category => results
|
21
|
+
# ==== Parameters
|
22
|
+
# strings:: the strings to be classified
|
23
|
+
def make_model_c(strings)
|
24
|
+
strings.map { |s| preprocess(s) }
|
25
|
+
#ret = model_bow(array_of_tokens);
|
26
|
+
count = 0
|
27
|
+
ret.reduce({}) do |hash, term|
|
28
|
+
hash[term] ||= []
|
29
|
+
hash[term] << count += 1
|
30
|
+
hash
|
31
|
+
end
|
30
32
|
end
|
31
|
-
end
|
32
33
|
|
33
|
-
private
|
34
34
|
def lexicalize(strings)
|
35
35
|
Hash[
|
36
36
|
(0..(strings.length - 1)).zip(strings.map { |s| preprocess(s) })
|
37
37
|
]
|
38
38
|
end
|
39
39
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
40
|
+
private
|
41
|
+
def preprocess(string)
|
42
|
+
split_lower_strings = string.split(
|
43
|
+
Constants::Words::SPLIT_REGEX).map(&:downcase)
|
44
|
+
split_lower_strings.delete_if do |word|
|
45
|
+
word.length < MIN_WORD_LENGTH ||
|
46
|
+
Constants::Words::COMMON.include?(word)
|
47
|
+
end
|
46
48
|
end
|
47
|
-
|
49
|
+
end
|
48
50
|
end
|
49
51
|
end
|