reclassifier 0.4.5 → 0.4.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,8 +2,11 @@
2
2
  # Bayesian classifier for arbitrary text.
3
3
  #
4
4
  # Implementation is translated from
5
- # <em>Introduction to Information Retrieval</em> by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
6
- # Cambridge University Press. 2008, ISBN 0521865719.
5
+ # <em>Introduction to Information Retrieval</em> by Christopher D. Manning,
6
+ # Prabhakar Raghavan and Hinrich Schütze, Cambridge University Press. 2008,
7
+ # ISBN 0521865719.
8
+ #
9
+ # Derived quantities are cached to improve performance of repeated #classify calls.
7
10
  #
8
11
  class Reclassifier::Bayes
9
12
  include Reclassifier::WordHash
@@ -33,7 +36,7 @@ class Reclassifier::Bayes
33
36
  def train(classification, text)
34
37
  ensure_classification_exists(classification)
35
38
 
36
- @docs_in_classification_count[classification] += 1
39
+ update_doc_count(classification, 1)
37
40
 
38
41
  smart_word_hash(text).each do |word, count|
39
42
  @classifications[classification][word] ||= 0
@@ -53,7 +56,7 @@ class Reclassifier::Bayes
53
56
  def untrain(classification, text)
54
57
  ensure_classification_exists(classification)
55
58
 
56
- @docs_in_classification_count[classification] -= 1
59
+ update_doc_count(classification, -1)
57
60
 
58
61
  smart_word_hash(text).each do |word, count|
59
62
  @classifications[classification][word] -= count if @classifications[classification].include?(word)
@@ -68,17 +71,21 @@ class Reclassifier::Bayes
68
71
  def calculate_scores(text)
69
72
  scores = {}
70
73
 
74
+ @cache[:total_docs_classified_log] ||= Math.log(@docs_in_classification_count.values.reduce(:+))
75
+ @cache[:words_classified] ||= @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}
76
+
71
77
  @classifications.each do |classification, classification_word_counts|
72
78
  # prior
73
79
  scores[classification] = Math.log(@docs_in_classification_count[classification])
74
- scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
80
+ scores[classification] -= @cache[:total_docs_classified_log]
75
81
 
76
82
  # likelihood
83
+ classification_word_count = classification_word_counts.values.reduce(:+).to_i
77
84
  smart_word_hash(text).each do |word, count|
78
- if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
85
+ if @cache[:words_classified].include?(word)
79
86
  scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
80
87
 
81
- scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+).to_i + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
88
+ scores[classification] -= count * Math.log(classification_word_count + @cache[:words_classified].count)
82
89
  end
83
90
  end
84
91
  end
@@ -135,7 +142,52 @@ class Reclassifier::Bayes
135
142
  return_value
136
143
  end
137
144
 
145
+ # Invalidates the cache.
146
+ #
147
+ # classifier = Reclassifier::Bayes.new([:one, :other])
148
+ #
149
+ # classifier.train(:one, 'bbb')
150
+ # classifier.train(:other, 'aaa')
151
+ #
152
+ # classifier.classify('aaa')
153
+ #
154
+ # classifier.cache_set?
155
+ # => true
156
+ #
157
+ # classifier.invalidate_cache
158
+ #
159
+ # classifier.cache_set?
160
+ # => false
161
+ #
162
+ def invalidate_cache
163
+ @cache = {}
164
+ end
165
+
166
+ # Returns true if the cache has been set (i.e. #classify has been run).
167
+ # Returns false otherwise.
168
+ # classifier = Reclassifier::Bayes.new([:one, :other])
169
+ #
170
+ # classifier.cache_set?
171
+ # => false
172
+ #
173
+ # classifier.train(:one, 'bbb')
174
+ # classifier.train(:other, 'aaa')
175
+ #
176
+ # classifier.classify('aaa')
177
+ #
178
+ # classifier.cache_set?
179
+ # => true
180
+ #
181
+ def cache_set?
182
+ @cache.present?
183
+ end
184
+
138
185
  private
186
+ def update_doc_count(classification, value)
187
+ @docs_in_classification_count[classification] += value
188
+
189
+ invalidate_cache
190
+ end
139
191
 
140
192
  def ensure_classification_exists(classification)
141
193
  raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
@@ -1,3 +1,3 @@
1
1
  module Reclassifier
2
- VERSION = "0.4.5"
2
+ VERSION = "0.4.6"
3
3
  end
data/lib/reclassifier.rb CHANGED
@@ -2,6 +2,7 @@
2
2
  require 'fast-stemmer'
3
3
  require 'gsl'
4
4
  require 'matrix'
5
+ require 'active_support/core_ext/object/blank'
5
6
 
6
7
  # files
7
8
  require 'reclassifier/version'
data/reclassifier.gemspec CHANGED
@@ -24,4 +24,5 @@ Gem::Specification.new do |spec|
24
24
 
25
25
  spec.add_dependency 'fast-stemmer'
26
26
  spec.add_dependency 'gsl'
27
+ spec.add_dependency 'activesupport'
27
28
  end
data/spec/bayes_spec.rb CHANGED
@@ -1,134 +1,184 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Reclassifier::Bayes do
4
+ subject(:classifier) { Reclassifier::Bayes.new }
5
+
6
+ shared_examples 'cache invalidator' do |method|
7
+ it 'should invalidate the cache' do
8
+ classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
9
+
10
+ classifier.should_receive(:invalidate_cache)
11
+
12
+ classifier.send(method, :in_china, 'Chinese Beijing Chinese')
13
+ end
14
+ end
15
+
4
16
  describe "classifications" do
5
17
  it "should return the classifications" do
6
- subject = described_class.new([:interesting, :uninteresting])
18
+ classifier = Reclassifier::Bayes.new([:interesting, :uninteresting])
7
19
 
8
- subject.classifications.sort.should eq([:interesting, :uninteresting])
20
+ classifier.classifications.sort.should eq([:interesting, :uninteresting])
9
21
  end
10
22
  end
11
23
 
12
24
  describe "train" do
13
25
  it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
14
- expect {subject.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
26
+ expect { classifier.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
15
27
  end
16
28
 
17
29
  it "should train the classifier to the (classification, document) pair" do
18
- subject = described_class.new([:in_china, :not_in_china])
30
+ classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
19
31
 
20
- subject.train(:in_china, 'Chinese Beijing Chinese')
21
- subject.train(:in_china, 'Chinese Chinese Shanghai')
22
- subject.train(:in_china, 'Chinese Macao')
23
- subject.train(:not_in_china, 'Tokyo Japan Chinese')
32
+ classifier.train(:in_china, 'Chinese Beijing Chinese')
33
+ classifier.train(:in_china, 'Chinese Chinese Shanghai')
34
+ classifier.train(:in_china, 'Chinese Macao')
35
+ classifier.train(:not_in_china, 'Tokyo Japan Chinese')
24
36
 
25
- subject.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
37
+ classifier.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
26
38
  end
39
+
40
+ it_should_behave_like 'cache invalidator', :train
27
41
  end
28
42
 
29
- describe "untrain" do
43
+ describe 'untrain' do
30
44
  it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
31
- expect {subject.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
45
+ expect {classifier.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
32
46
  end
33
47
 
34
- it "should untrain the classifier against the (classification, document) pair" do
35
- subject = described_class.new([:in_china, :not_in_china])
48
+ it 'should untrain the classifier against the (classification, document) pair' do
49
+ classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
36
50
 
37
- subject.train(:in_china, 'Chinese Chinese')
38
- subject.train(:not_in_china, 'Chinese Macao')
51
+ classifier.train(:in_china, 'Chinese Chinese')
52
+ classifier.train(:not_in_china, 'Chinese Macao')
39
53
 
40
- subject.classify('Chinese').should eq(:in_china)
54
+ classifier.classify('Chinese').should eq(:in_china)
41
55
 
42
- subject.untrain(:in_china, 'Chinese Chinese')
56
+ classifier.untrain(:in_china, 'Chinese Chinese')
43
57
 
44
- subject.classify('Chinese').should eq(:not_in_china)
58
+ classifier.classify('Chinese').should eq(:not_in_china)
45
59
  end
60
+
61
+ it 'should not result in negative word counts'
62
+
63
+ it_should_behave_like 'cache invalidator', :untrain
46
64
  end
47
65
 
48
66
  describe "calculate_scores" do
49
67
  it "should return a score hash with the correct scores" do
50
- subject = described_class.new([:in_china, :not_in_china])
68
+ classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
51
69
 
52
- subject.train(:in_china, 'Chinese Beijing Chinese')
53
- subject.train(:in_china, 'Chinese Chinese Shanghai')
54
- subject.train(:in_china, 'Chinese Macao')
55
- subject.train(:not_in_china, 'Tokyo Japan Chinese')
70
+ classifier.train(:in_china, 'Chinese Beijing Chinese')
71
+ classifier.train(:in_china, 'Chinese Chinese Shanghai')
72
+ classifier.train(:in_china, 'Chinese Macao')
73
+ classifier.train(:not_in_china, 'Tokyo Japan Chinese')
56
74
 
57
- scores = subject.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
75
+ scores = classifier.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
58
76
 
59
77
  scores[:in_china].should eq(-8.107690312843907)
60
78
  scores[:not_in_china].should eq(-8.906681345001262)
61
79
  end
62
80
 
63
81
  it "should handle the case when no documents are classified for a particular classification" do
64
- subject = described_class.new([:in_china, :not_in_china])
82
+ classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
65
83
 
66
- subject.train(:in_china, 'Chinese Beijing Chinese')
84
+ classifier.train(:in_china, 'Chinese Beijing Chinese')
67
85
 
68
- subject.calculate_scores('Chinese Beijing')
86
+ classifier.calculate_scores('Chinese Beijing')
69
87
  end
70
88
  end
71
89
 
72
90
  describe "add_classification" do
73
91
  it "should add the classification to the set of classifications" do
74
- subject.classifications.should be_empty
92
+ classifier.classifications.should be_empty
75
93
 
76
- subject.add_classification(:niner)
94
+ classifier.add_classification(:niner)
77
95
 
78
- subject.classifications.should eq([:niner])
96
+ classifier.classifications.should eq([:niner])
79
97
  end
80
98
 
81
99
  it "should return the classification" do
82
- subject.add_classification(:niner).should eq(:niner)
100
+ classifier.add_classification(:niner).should eq(:niner)
83
101
  end
84
102
  end
85
103
 
86
104
  describe "remove_classification" do
87
105
  it "should remove the classification from the set of classifications" do
88
- subject.add_classification(:niner)
106
+ classifier.add_classification(:niner)
89
107
 
90
- subject.remove_classification(:niner)
108
+ classifier.remove_classification(:niner)
91
109
 
92
- subject.classifications.should be_empty
110
+ classifier.classifications.should be_empty
93
111
  end
94
112
 
95
113
  it "should return the classification" do
96
- subject.add_classification(:niner)
114
+ classifier.add_classification(:niner)
97
115
 
98
- subject.remove_classification(:niner).should eq(:niner)
116
+ classifier.remove_classification(:niner).should eq(:niner)
99
117
  end
100
118
 
101
119
  it "should return nil if the classification didn't exist" do
102
- subject.remove_classification(:niner).should be(nil)
120
+ classifier.remove_classification(:niner).should be(nil)
121
+ end
122
+ end
123
+
124
+ describe 'cache_present?' do
125
+ it 'should return true if the cache has been set' do
126
+ classifier = Reclassifier::Bayes.new([:one, :other])
127
+
128
+ classifier.train(:one, 'bbb')
129
+ classifier.train(:other, 'aaa')
130
+
131
+ classifier.classify('')
132
+
133
+ classifier.cache_set?.should be(true)
134
+ end
135
+
136
+ it 'should return false if the cache has not been set' do
137
+ classifier.cache_set?.should be(false)
138
+ end
139
+
140
+ it 'should return false if the cache has been invalidated' do
141
+ classifier = Reclassifier::Bayes.new([:one, :other])
142
+
143
+ classifier.train(:one, 'bbb')
144
+ classifier.train(:other, 'aaa')
145
+
146
+ classifier.classify('')
147
+
148
+ classifier.cache_set?.should be(true)
149
+
150
+ classifier.invalidate_cache
151
+
152
+ classifier.cache_set?.should be(false)
103
153
  end
104
154
  end
105
155
 
106
156
  context ':clean option' do
107
157
  it 'should cause punctuation to be omitted if it is set to true' do
108
- subject = described_class.new([:one, :other], {:clean => true})
158
+ classifier = Reclassifier::Bayes.new([:one, :other], {:clean => true})
109
159
 
110
- subject.train(:one, '! ! ! ! bbb')
111
- subject.train(:other, 'aaa')
160
+ classifier.train(:one, '! ! ! ! bbb')
161
+ classifier.train(:other, 'aaa')
112
162
 
113
- subject.classify('! aaa !').should eq(:other)
163
+ classifier.classify('! aaa !').should eq(:other)
114
164
  end
115
165
 
116
166
  it 'should default to true' do
117
- subject = described_class.new([:one, :other])
167
+ classifier = Reclassifier::Bayes.new([:one, :other])
118
168
 
119
- subject.train(:one, '! ! ! ! bbb')
120
- subject.train(:other, 'aaa')
169
+ classifier.train(:one, '! ! ! ! bbb')
170
+ classifier.train(:other, 'aaa')
121
171
 
122
- subject.classify('! aaa !').should eq(:other)
172
+ classifier.classify('! aaa !').should eq(:other)
123
173
  end
124
174
 
125
175
  it 'should cause punctuation not to be omitted if it is set to false' do
126
- subject = described_class.new([:one, :other], {:clean => false})
176
+ classifier = Reclassifier::Bayes.new([:one, :other], {:clean => false})
127
177
 
128
- subject.train(:one, '! ! ! ! bbb')
129
- subject.train(:other, 'aaa')
178
+ classifier.train(:one, '! ! ! ! bbb')
179
+ classifier.train(:other, 'aaa')
130
180
 
131
- subject.classify('! aaa !').should eq(:one)
181
+ classifier.classify('! aaa !').should eq(:one)
132
182
  end
133
183
  end
134
184
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: reclassifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.5
4
+ version: 0.4.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-24 00:00:00.000000000 Z
12
+ date: 2013-04-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -91,6 +91,22 @@ dependencies:
91
91
  - - ! '>='
92
92
  - !ruby/object:Gem::Version
93
93
  version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: activesupport
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
94
110
  description: Bayesian and Latent Semantic Indexing classification of text.
95
111
  email:
96
112
  - rroblak@gmail.com