reclassifier 0.4.5 → 0.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,8 +2,11 @@
2
2
  # Bayesian classifier for arbitrary text.
3
3
  #
4
4
  # Implementation is translated from
5
- # <em>Introduction to Information Retrieval</em> by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
6
- # Cambridge University Press. 2008, ISBN 0521865719.
5
+ # <em>Introduction to Information Retrieval</em> by Christopher D. Manning,
6
+ # Prabhakar Raghavan and Hinrich Schütze, Cambridge University Press. 2008,
7
+ # ISBN 0521865719.
8
+ #
9
+ # Derived quantities are cached to improve performance of repeated #classify calls.
7
10
  #
8
11
  class Reclassifier::Bayes
9
12
  include Reclassifier::WordHash
@@ -33,7 +36,7 @@ class Reclassifier::Bayes
33
36
  def train(classification, text)
34
37
  ensure_classification_exists(classification)
35
38
 
36
- @docs_in_classification_count[classification] += 1
39
+ update_doc_count(classification, 1)
37
40
 
38
41
  smart_word_hash(text).each do |word, count|
39
42
  @classifications[classification][word] ||= 0
@@ -53,7 +56,7 @@ class Reclassifier::Bayes
53
56
  def untrain(classification, text)
54
57
  ensure_classification_exists(classification)
55
58
 
56
- @docs_in_classification_count[classification] -= 1
59
+ update_doc_count(classification, -1)
57
60
 
58
61
  smart_word_hash(text).each do |word, count|
59
62
  @classifications[classification][word] -= count if @classifications[classification].include?(word)
@@ -68,17 +71,21 @@ class Reclassifier::Bayes
68
71
  def calculate_scores(text)
69
72
  scores = {}
70
73
 
74
+ @cache[:total_docs_classified_log] ||= Math.log(@docs_in_classification_count.values.reduce(:+))
75
+ @cache[:words_classified] ||= @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}
76
+
71
77
  @classifications.each do |classification, classification_word_counts|
72
78
  # prior
73
79
  scores[classification] = Math.log(@docs_in_classification_count[classification])
74
- scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
80
+ scores[classification] -= @cache[:total_docs_classified_log]
75
81
 
76
82
  # likelihood
83
+ classification_word_count = classification_word_counts.values.reduce(:+).to_i
77
84
  smart_word_hash(text).each do |word, count|
78
- if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
85
+ if @cache[:words_classified].include?(word)
79
86
  scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
80
87
 
81
- scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+).to_i + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
88
+ scores[classification] -= count * Math.log(classification_word_count + @cache[:words_classified].count)
82
89
  end
83
90
  end
84
91
  end
@@ -135,7 +142,52 @@ class Reclassifier::Bayes
135
142
  return_value
136
143
  end
137
144
 
145
+ # Invalidates the cache.
146
+ #
147
+ # classifier = Reclassifier::Bayes.new([:one, :other])
148
+ #
149
+ # classifier.train(:one, 'bbb')
150
+ # classifier.train(:other, 'aaa')
151
+ #
152
+ # classifier.classify('aaa')
153
+ #
154
+ # classifier.cache_set?
155
+ # => true
156
+ #
157
+ # classifier.invalidate_cache
158
+ #
159
+ # classifier.cache_set?
160
+ # => false
161
+ #
162
+ def invalidate_cache
163
+ @cache = {}
164
+ end
165
+
166
+ # Returns true if the cache is currently populated (i.e. scores have been calculated, e.g. via #classify, since the cache was last invalidated).
167
+ # Returns false otherwise.
168
+ # classifier = Reclassifier::Bayes.new([:one, :other])
169
+ #
170
+ # classifier.cache_set?
171
+ # => false
172
+ #
173
+ # classifier.train(:one, 'bbb')
174
+ # classifier.train(:other, 'aaa')
175
+ #
176
+ # classifier.classify('aaa')
177
+ #
178
+ # classifier.cache_set?
179
+ # => true
180
+ #
181
+ def cache_set?
182
+ @cache.present?
183
+ end
184
+
138
185
  private
186
+ def update_doc_count(classification, value)
187
+ @docs_in_classification_count[classification] += value
188
+
189
+ invalidate_cache
190
+ end
139
191
 
140
192
  def ensure_classification_exists(classification)
141
193
  raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
@@ -1,3 +1,3 @@
1
1
  module Reclassifier
2
- VERSION = "0.4.5"
2
+ VERSION = "0.4.6"
3
3
  end
data/lib/reclassifier.rb CHANGED
@@ -2,6 +2,7 @@
2
2
  require 'fast-stemmer'
3
3
  require 'gsl'
4
4
  require 'matrix'
5
+ require 'active_support/core_ext/object/blank'
5
6
 
6
7
  # files
7
8
  require 'reclassifier/version'
data/reclassifier.gemspec CHANGED
@@ -24,4 +24,5 @@ Gem::Specification.new do |spec|
24
24
 
25
25
  spec.add_dependency 'fast-stemmer'
26
26
  spec.add_dependency 'gsl'
27
+ spec.add_dependency 'activesupport'
27
28
  end
data/spec/bayes_spec.rb CHANGED
@@ -1,134 +1,184 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Reclassifier::Bayes do
4
+ subject(:classifier) { Reclassifier::Bayes.new }
5
+
6
+ shared_examples 'cache invalidator' do |method|
7
+ it 'should invalidate the cache' do
8
+ classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
9
+
10
+ classifier.should_receive(:invalidate_cache)
11
+
12
+ classifier.send(method, :in_china, 'Chinese Beijing Chinese')
13
+ end
14
+ end
15
+
4
16
  describe "classifications" do
5
17
  it "should return the classifications" do
6
- subject = described_class.new([:interesting, :uninteresting])
18
+ classifier = Reclassifier::Bayes.new([:interesting, :uninteresting])
7
19
 
8
- subject.classifications.sort.should eq([:interesting, :uninteresting])
20
+ classifier.classifications.sort.should eq([:interesting, :uninteresting])
9
21
  end
10
22
  end
11
23
 
12
24
  describe "train" do
13
25
  it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
14
- expect {subject.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
26
+ expect { classifier.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
15
27
  end
16
28
 
17
29
  it "should train the classifier to the (classification, document) pair" do
18
- subject = described_class.new([:in_china, :not_in_china])
30
+ classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
19
31
 
20
- subject.train(:in_china, 'Chinese Beijing Chinese')
21
- subject.train(:in_china, 'Chinese Chinese Shanghai')
22
- subject.train(:in_china, 'Chinese Macao')
23
- subject.train(:not_in_china, 'Tokyo Japan Chinese')
32
+ classifier.train(:in_china, 'Chinese Beijing Chinese')
33
+ classifier.train(:in_china, 'Chinese Chinese Shanghai')
34
+ classifier.train(:in_china, 'Chinese Macao')
35
+ classifier.train(:not_in_china, 'Tokyo Japan Chinese')
24
36
 
25
- subject.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
37
+ classifier.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
26
38
  end
39
+
40
+ it_should_behave_like 'cache invalidator', :train
27
41
  end
28
42
 
29
- describe "untrain" do
43
+ describe 'untrain' do
30
44
  it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
31
- expect {subject.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
45
+ expect {classifier.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
32
46
  end
33
47
 
34
- it "should untrain the classifier against the (classification, document) pair" do
35
- subject = described_class.new([:in_china, :not_in_china])
48
+ it 'should untrain the classifier against the (classification, document) pair' do
49
+ classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
36
50
 
37
- subject.train(:in_china, 'Chinese Chinese')
38
- subject.train(:not_in_china, 'Chinese Macao')
51
+ classifier.train(:in_china, 'Chinese Chinese')
52
+ classifier.train(:not_in_china, 'Chinese Macao')
39
53
 
40
- subject.classify('Chinese').should eq(:in_china)
54
+ classifier.classify('Chinese').should eq(:in_china)
41
55
 
42
- subject.untrain(:in_china, 'Chinese Chinese')
56
+ classifier.untrain(:in_china, 'Chinese Chinese')
43
57
 
44
- subject.classify('Chinese').should eq(:not_in_china)
58
+ classifier.classify('Chinese').should eq(:not_in_china)
45
59
  end
60
+
61
+ it 'should not result in negative word counts'
62
+
63
+ it_should_behave_like 'cache invalidator', :untrain
46
64
  end
47
65
 
48
66
  describe "calculate_scores" do
49
67
  it "should return a score hash with the correct scores" do
50
- subject = described_class.new([:in_china, :not_in_china])
68
+ classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
51
69
 
52
- subject.train(:in_china, 'Chinese Beijing Chinese')
53
- subject.train(:in_china, 'Chinese Chinese Shanghai')
54
- subject.train(:in_china, 'Chinese Macao')
55
- subject.train(:not_in_china, 'Tokyo Japan Chinese')
70
+ classifier.train(:in_china, 'Chinese Beijing Chinese')
71
+ classifier.train(:in_china, 'Chinese Chinese Shanghai')
72
+ classifier.train(:in_china, 'Chinese Macao')
73
+ classifier.train(:not_in_china, 'Tokyo Japan Chinese')
56
74
 
57
- scores = subject.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
75
+ scores = classifier.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
58
76
 
59
77
  scores[:in_china].should eq(-8.107690312843907)
60
78
  scores[:not_in_china].should eq(-8.906681345001262)
61
79
  end
62
80
 
63
81
  it "should handle the case when no documents are classified for a particular classification" do
64
- subject = described_class.new([:in_china, :not_in_china])
82
+ classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
65
83
 
66
- subject.train(:in_china, 'Chinese Beijing Chinese')
84
+ classifier.train(:in_china, 'Chinese Beijing Chinese')
67
85
 
68
- subject.calculate_scores('Chinese Beijing')
86
+ classifier.calculate_scores('Chinese Beijing')
69
87
  end
70
88
  end
71
89
 
72
90
  describe "add_classification" do
73
91
  it "should add the classification to the set of classifications" do
74
- subject.classifications.should be_empty
92
+ classifier.classifications.should be_empty
75
93
 
76
- subject.add_classification(:niner)
94
+ classifier.add_classification(:niner)
77
95
 
78
- subject.classifications.should eq([:niner])
96
+ classifier.classifications.should eq([:niner])
79
97
  end
80
98
 
81
99
  it "should return the classification" do
82
- subject.add_classification(:niner).should eq(:niner)
100
+ classifier.add_classification(:niner).should eq(:niner)
83
101
  end
84
102
  end
85
103
 
86
104
  describe "remove_classification" do
87
105
  it "should remove the classification from the set of classifications" do
88
- subject.add_classification(:niner)
106
+ classifier.add_classification(:niner)
89
107
 
90
- subject.remove_classification(:niner)
108
+ classifier.remove_classification(:niner)
91
109
 
92
- subject.classifications.should be_empty
110
+ classifier.classifications.should be_empty
93
111
  end
94
112
 
95
113
  it "should return the classification" do
96
- subject.add_classification(:niner)
114
+ classifier.add_classification(:niner)
97
115
 
98
- subject.remove_classification(:niner).should eq(:niner)
116
+ classifier.remove_classification(:niner).should eq(:niner)
99
117
  end
100
118
 
101
119
  it "should return nil if the classification didn't exist" do
102
- subject.remove_classification(:niner).should be(nil)
120
+ classifier.remove_classification(:niner).should be(nil)
121
+ end
122
+ end
123
+
124
+ describe 'cache_present?' do
125
+ it 'should return true if the cache has been set' do
126
+ classifier = Reclassifier::Bayes.new([:one, :other])
127
+
128
+ classifier.train(:one, 'bbb')
129
+ classifier.train(:other, 'aaa')
130
+
131
+ classifier.classify('')
132
+
133
+ classifier.cache_set?.should be(true)
134
+ end
135
+
136
+ it 'should return false if the cache has not been set' do
137
+ classifier.cache_set?.should be(false)
138
+ end
139
+
140
+ it 'should return false if the cache has been invalidated' do
141
+ classifier = Reclassifier::Bayes.new([:one, :other])
142
+
143
+ classifier.train(:one, 'bbb')
144
+ classifier.train(:other, 'aaa')
145
+
146
+ classifier.classify('')
147
+
148
+ classifier.cache_set?.should be(true)
149
+
150
+ classifier.invalidate_cache
151
+
152
+ classifier.cache_set?.should be(false)
103
153
  end
104
154
  end
105
155
 
106
156
  context ':clean option' do
107
157
  it 'should cause punctuation to be omitted if it is set to true' do
108
- subject = described_class.new([:one, :other], {:clean => true})
158
+ classifier = Reclassifier::Bayes.new([:one, :other], {:clean => true})
109
159
 
110
- subject.train(:one, '! ! ! ! bbb')
111
- subject.train(:other, 'aaa')
160
+ classifier.train(:one, '! ! ! ! bbb')
161
+ classifier.train(:other, 'aaa')
112
162
 
113
- subject.classify('! aaa !').should eq(:other)
163
+ classifier.classify('! aaa !').should eq(:other)
114
164
  end
115
165
 
116
166
  it 'should default to true' do
117
- subject = described_class.new([:one, :other])
167
+ classifier = Reclassifier::Bayes.new([:one, :other])
118
168
 
119
- subject.train(:one, '! ! ! ! bbb')
120
- subject.train(:other, 'aaa')
169
+ classifier.train(:one, '! ! ! ! bbb')
170
+ classifier.train(:other, 'aaa')
121
171
 
122
- subject.classify('! aaa !').should eq(:other)
172
+ classifier.classify('! aaa !').should eq(:other)
123
173
  end
124
174
 
125
175
  it 'should cause punctuation not to be omitted if it is set to false' do
126
- subject = described_class.new([:one, :other], {:clean => false})
176
+ classifier = Reclassifier::Bayes.new([:one, :other], {:clean => false})
127
177
 
128
- subject.train(:one, '! ! ! ! bbb')
129
- subject.train(:other, 'aaa')
178
+ classifier.train(:one, '! ! ! ! bbb')
179
+ classifier.train(:other, 'aaa')
130
180
 
131
- subject.classify('! aaa !').should eq(:one)
181
+ classifier.classify('! aaa !').should eq(:one)
132
182
  end
133
183
  end
134
184
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: reclassifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.5
4
+ version: 0.4.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-24 00:00:00.000000000 Z
12
+ date: 2013-04-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -91,6 +91,22 @@ dependencies:
91
91
  - - ! '>='
92
92
  - !ruby/object:Gem::Version
93
93
  version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: activesupport
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
94
110
  description: Bayesian and Latent Semantic Indexing classification of text.
95
111
  email:
96
112
  - rroblak@gmail.com