yanbi-ml 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/bayes/bayes.rb CHANGED
@@ -20,6 +20,7 @@ module Yanbi
20
20
  @categories = categories
21
21
  @category_counts = {}
22
22
  @document_counts = {}
23
+ @category_sizes = {}
23
24
 
24
25
  @categories.each do |category|
25
26
  cat = category.to_sym
@@ -54,11 +55,13 @@ module Yanbi
54
55
  @category_counts[cat][word] ||= 0
55
56
  @category_counts[cat][word] += 1
56
57
  end
58
+
59
+ @category_sizes[cat] = category_size(cat)
57
60
  end
58
61
 
59
62
  def classify(document)
60
63
  max_score(document) do |cat, doc|
61
- cond_prob(cat, doc)
64
+ score(cat, doc)
62
65
  end
63
66
  end
64
67
 
@@ -75,6 +78,7 @@ module Yanbi
75
78
  categories.each do |category|
76
79
  cat = category.to_sym
77
80
  @category_counts[cat].reject! {|k,v| v < cutoff}
81
+ @category_sizes[cat] = category_size(cat)
78
82
  end
79
83
  end
80
84
 
@@ -84,16 +88,15 @@ module Yanbi
84
88
 
85
89
  private
86
90
 
87
- def cond_prob(cat, document)
91
+ def score(cat, document)
88
92
  total_docs = @document_counts.values.reduce(:+).to_f
89
93
  document_prob = document.words.uniq.map {|word| word_prob(cat, word)}.reduce(:+)
90
94
  document_prob + Math.log(@document_counts[cat] / total_docs)
91
95
  end
92
96
 
93
97
  def word_prob(cat, word)
94
- all_word_count = @category_counts[cat].values.reduce(&:+)
95
98
  count = @category_counts[cat].has_key?(word) ? @category_counts[cat][word].to_f : 0.1
96
- Math.log(count / all_word_count)
99
+ Math.log(count / @category_sizes[cat])
97
100
  end
98
101
 
99
102
  def max_score(document)
@@ -107,7 +110,10 @@ module Yanbi
107
110
  i = scores.rindex(scores.max)
108
111
  @categories[i]
109
112
  end
110
-
113
+
114
+ def category_size(cat)
115
+ @category_counts[cat].values.reduce(&:+).to_i
116
+ end
111
117
  end
112
118
 
113
119
  end
data/lib/bayes/fisher.rb CHANGED
@@ -5,16 +5,10 @@
5
5
  module Yanbi
6
6
 
7
7
  class Fisher < Bayes
8
-
9
- def classify(text)
10
- max_score(text) do |cat, doc|
11
- fisher_score(cat, doc)
12
- end
13
- end
14
-
8
+
15
9
  private
16
10
 
17
- def fisher_score(category, document)
11
+ def score(category, document)
18
12
  features = document.words.uniq
19
13
  probs = features.map {|x| weighted_prob(x, category)}
20
14
  pscores = probs.reduce(&:*)
@@ -30,9 +24,8 @@ module Yanbi
30
24
  end
31
25
 
32
26
  def word_prob(cat, word)
33
- all_word_count = @category_counts[cat].values.reduce(&:+)
34
27
  count = @category_counts[cat].has_key?(word) ? @category_counts[cat][word].to_f : 0
35
- count / all_word_count
28
+ count / @category_sizes[cat]
36
29
  end
37
30
 
38
31
  def weighted_prob(word, category, basicprob=nil, weight=1.0, ap=0.5)
data/lib/version.rb CHANGED
@@ -3,5 +3,5 @@
3
3
  # License:: MIT
4
4
 
5
5
  module Yanbi
6
- VERSION = "0.2.1"
6
+ VERSION = "0.2.2"
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yanbi-ml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: