yanbi-ml 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/bayes/bayes.rb +11 -5
- data/lib/bayes/fisher.rb +3 -10
- data/lib/version.rb +1 -1
- metadata +1 -1
data/lib/bayes/bayes.rb
CHANGED
@@ -20,6 +20,7 @@ module Yanbi
|
|
20
20
|
@categories = categories
|
21
21
|
@category_counts = {}
|
22
22
|
@document_counts = {}
|
23
|
+
@category_sizes = {}
|
23
24
|
|
24
25
|
@categories.each do |category|
|
25
26
|
cat = category.to_sym
|
@@ -54,11 +55,13 @@ module Yanbi
|
|
54
55
|
@category_counts[cat][word] ||= 0
|
55
56
|
@category_counts[cat][word] += 1
|
56
57
|
end
|
58
|
+
|
59
|
+
@category_sizes[cat] = category_size(cat)
|
57
60
|
end
|
58
61
|
|
59
62
|
def classify(document)
|
60
63
|
max_score(document) do |cat, doc|
|
61
|
-
|
64
|
+
score(cat, doc)
|
62
65
|
end
|
63
66
|
end
|
64
67
|
|
@@ -75,6 +78,7 @@ module Yanbi
|
|
75
78
|
categories.each do |category|
|
76
79
|
cat = category.to_sym
|
77
80
|
@category_counts[cat].reject! {|k,v| v < cutoff}
|
81
|
+
@category_sizes[cat] = category_size(cat)
|
78
82
|
end
|
79
83
|
end
|
80
84
|
|
@@ -84,16 +88,15 @@ module Yanbi
|
|
84
88
|
|
85
89
|
private
|
86
90
|
|
87
|
-
def
|
91
|
+
def score(cat, document)
|
88
92
|
total_docs = @document_counts.values.reduce(:+).to_f
|
89
93
|
document_prob = document.words.uniq.map {|word| word_prob(cat, word)}.reduce(:+)
|
90
94
|
document_prob + Math.log(@document_counts[cat] / total_docs)
|
91
95
|
end
|
92
96
|
|
93
97
|
def word_prob(cat, word)
|
94
|
-
all_word_count = @category_counts[cat].values.reduce(&:+)
|
95
98
|
count = @category_counts[cat].has_key?(word) ? @category_counts[cat][word].to_f : 0.1
|
96
|
-
Math.log(count / all_word_count)
|
99
|
+
Math.log(count / @category_sizes[cat])
|
97
100
|
end
|
98
101
|
|
99
102
|
def max_score(document)
|
@@ -107,7 +110,10 @@ module Yanbi
|
|
107
110
|
i = scores.rindex(scores.max)
|
108
111
|
@categories[i]
|
109
112
|
end
|
110
|
-
|
113
|
+
|
114
|
+
def category_size(cat)
|
115
|
+
@category_counts[cat].values.reduce(&:+).to_i
|
116
|
+
end
|
111
117
|
end
|
112
118
|
|
113
119
|
end
|
data/lib/bayes/fisher.rb
CHANGED
@@ -5,16 +5,10 @@
|
|
5
5
|
module Yanbi
|
6
6
|
|
7
7
|
class Fisher < Bayes
|
8
|
-
|
9
|
-
def classify(text)
|
10
|
-
max_score(text) do |cat, doc|
|
11
|
-
fisher_score(cat, doc)
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
8
|
+
|
15
9
|
private
|
16
10
|
|
17
|
-
def fisher_score(category, document)
|
11
|
+
def score(category, document)
|
18
12
|
features = document.words.uniq
|
19
13
|
probs = features.map {|x| weighted_prob(x, category)}
|
20
14
|
pscores = probs.reduce(&:*)
|
@@ -30,9 +24,8 @@ module Yanbi
|
|
30
24
|
end
|
31
25
|
|
32
26
|
def word_prob(cat, word)
|
33
|
-
all_word_count = @category_counts[cat].values.reduce(&:+)
|
34
27
|
count = @category_counts[cat].has_key?(word) ? @category_counts[cat][word].to_f : 0
|
35
|
-
count / all_word_count
|
28
|
+
count / @category_sizes[cat]
|
36
29
|
end
|
37
30
|
|
38
31
|
def weighted_prob(word, category, basicprob=nil, weight=1.0, ap=0.5)
|
data/lib/version.rb
CHANGED