reclassifier 0.4.5 → 0.4.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/reclassifier/bayes.rb +59 -7
- data/lib/reclassifier/version.rb +1 -1
- data/lib/reclassifier.rb +1 -0
- data/reclassifier.gemspec +1 -0
- data/spec/bayes_spec.rb +99 -49
- metadata +18 -2
data/lib/reclassifier/bayes.rb
CHANGED
@@ -2,8 +2,11 @@
|
|
2
2
|
# Bayesian classifier for arbitrary text.
|
3
3
|
#
|
4
4
|
# Implementation is translated from
|
5
|
-
# <em>Introduction to Information Retrieval</em> by Christopher D. Manning,
|
6
|
-
# Cambridge University Press. 2008,
|
5
|
+
# <em>Introduction to Information Retrieval</em> by Christopher D. Manning,
|
6
|
+
# Prabhakar Raghavan and Hinrich Schütze, Cambridge University Press. 2008,
|
7
|
+
# ISBN 0521865719.
|
8
|
+
#
|
9
|
+
# Derived quantities are cached to improve performance of repeated #classify calls.
|
7
10
|
#
|
8
11
|
class Reclassifier::Bayes
|
9
12
|
include Reclassifier::WordHash
|
@@ -33,7 +36,7 @@ class Reclassifier::Bayes
|
|
33
36
|
def train(classification, text)
|
34
37
|
ensure_classification_exists(classification)
|
35
38
|
|
36
|
-
|
39
|
+
update_doc_count(classification, 1)
|
37
40
|
|
38
41
|
smart_word_hash(text).each do |word, count|
|
39
42
|
@classifications[classification][word] ||= 0
|
@@ -53,7 +56,7 @@ class Reclassifier::Bayes
|
|
53
56
|
def untrain(classification, text)
|
54
57
|
ensure_classification_exists(classification)
|
55
58
|
|
56
|
-
|
59
|
+
update_doc_count(classification, -1)
|
57
60
|
|
58
61
|
smart_word_hash(text).each do |word, count|
|
59
62
|
@classifications[classification][word] -= count if @classifications[classification].include?(word)
|
@@ -68,17 +71,21 @@ class Reclassifier::Bayes
|
|
68
71
|
def calculate_scores(text)
|
69
72
|
scores = {}
|
70
73
|
|
74
|
+
@cache[:total_docs_classified_log] ||= Math.log(@docs_in_classification_count.values.reduce(:+))
|
75
|
+
@cache[:words_classified] ||= @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}
|
76
|
+
|
71
77
|
@classifications.each do |classification, classification_word_counts|
|
72
78
|
# prior
|
73
79
|
scores[classification] = Math.log(@docs_in_classification_count[classification])
|
74
|
-
scores[classification] -=
|
80
|
+
scores[classification] -= @cache[:total_docs_classified_log]
|
75
81
|
|
76
82
|
# likelihood
|
83
|
+
classification_word_count = classification_word_counts.values.reduce(:+).to_i
|
77
84
|
smart_word_hash(text).each do |word, count|
|
78
|
-
if @
|
85
|
+
if @cache[:words_classified].include?(word)
|
79
86
|
scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
|
80
87
|
|
81
|
-
scores[classification] -= count * Math.log(
|
88
|
+
scores[classification] -= count * Math.log(classification_word_count + @cache[:words_classified].count)
|
82
89
|
end
|
83
90
|
end
|
84
91
|
end
|
@@ -135,7 +142,52 @@ class Reclassifier::Bayes
|
|
135
142
|
return_value
|
136
143
|
end
|
137
144
|
|
145
|
+
# Invalidates the cache.
|
146
|
+
#
|
147
|
+
# classifier = Reclassifier::Bayes.new([:one, :other])
|
148
|
+
#
|
149
|
+
# classifier.train(:one, 'bbb')
|
150
|
+
# classifier.train(:other, 'aaa')
|
151
|
+
#
|
152
|
+
# classifier.classify('aaa')
|
153
|
+
#
|
154
|
+
# classifier.cache_set?
|
155
|
+
# => true
|
156
|
+
#
|
157
|
+
# classifier.invalidate_cache
|
158
|
+
#
|
159
|
+
# classifier.cache_set?
|
160
|
+
# => false
|
161
|
+
#
|
162
|
+
def invalidate_cache
|
163
|
+
@cache = {}
|
164
|
+
end
|
165
|
+
|
166
|
+
# Returns true if the cache has been set (i.e. #classify has been run).
|
167
|
+
# Returns false otherwise.
|
168
|
+
# classifier = Reclassifier::Bayes.new([:one, :other])
|
169
|
+
#
|
170
|
+
# classifier.cache_set?
|
171
|
+
# => false
|
172
|
+
#
|
173
|
+
# classifier.train(:one, 'bbb')
|
174
|
+
# classifier.train(:other, 'aaa')
|
175
|
+
#
|
176
|
+
# classifier.classify('aaa')
|
177
|
+
#
|
178
|
+
# classifier.cache_set?
|
179
|
+
# => true
|
180
|
+
#
|
181
|
+
def cache_set?
|
182
|
+
@cache.present?
|
183
|
+
end
|
184
|
+
|
138
185
|
private
|
186
|
+
def update_doc_count(classification, value)
|
187
|
+
@docs_in_classification_count[classification] += value
|
188
|
+
|
189
|
+
invalidate_cache
|
190
|
+
end
|
139
191
|
|
140
192
|
def ensure_classification_exists(classification)
|
141
193
|
raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
|
data/lib/reclassifier/version.rb
CHANGED
data/lib/reclassifier.rb
CHANGED
data/reclassifier.gemspec
CHANGED
data/spec/bayes_spec.rb
CHANGED
@@ -1,134 +1,184 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Reclassifier::Bayes do
|
4
|
+
subject(:classifier) { Reclassifier::Bayes.new }
|
5
|
+
|
6
|
+
shared_examples 'cache invalidator' do |method|
|
7
|
+
it 'should invalidate the cache' do
|
8
|
+
classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
|
9
|
+
|
10
|
+
classifier.should_receive(:invalidate_cache)
|
11
|
+
|
12
|
+
classifier.send(method, :in_china, 'Chinese Beijing Chinese')
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
4
16
|
describe "classifications" do
|
5
17
|
it "should return the classifications" do
|
6
|
-
|
18
|
+
classifier = Reclassifier::Bayes.new([:interesting, :uninteresting])
|
7
19
|
|
8
|
-
|
20
|
+
classifier.classifications.sort.should eq([:interesting, :uninteresting])
|
9
21
|
end
|
10
22
|
end
|
11
23
|
|
12
24
|
describe "train" do
|
13
25
|
it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
|
14
|
-
expect {
|
26
|
+
expect { classifier.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
|
15
27
|
end
|
16
28
|
|
17
29
|
it "should train the classifier to the (classification, document) pair" do
|
18
|
-
|
30
|
+
classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
|
19
31
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
32
|
+
classifier.train(:in_china, 'Chinese Beijing Chinese')
|
33
|
+
classifier.train(:in_china, 'Chinese Chinese Shanghai')
|
34
|
+
classifier.train(:in_china, 'Chinese Macao')
|
35
|
+
classifier.train(:not_in_china, 'Tokyo Japan Chinese')
|
24
36
|
|
25
|
-
|
37
|
+
classifier.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
|
26
38
|
end
|
39
|
+
|
40
|
+
it_should_behave_like 'cache invalidator', :train
|
27
41
|
end
|
28
42
|
|
29
|
-
describe
|
43
|
+
describe 'untrain' do
|
30
44
|
it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
|
31
|
-
expect {
|
45
|
+
expect {classifier.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
|
32
46
|
end
|
33
47
|
|
34
|
-
it
|
35
|
-
|
48
|
+
it 'should untrain the classifier against the (classification, document) pair' do
|
49
|
+
classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
|
36
50
|
|
37
|
-
|
38
|
-
|
51
|
+
classifier.train(:in_china, 'Chinese Chinese')
|
52
|
+
classifier.train(:not_in_china, 'Chinese Macao')
|
39
53
|
|
40
|
-
|
54
|
+
classifier.classify('Chinese').should eq(:in_china)
|
41
55
|
|
42
|
-
|
56
|
+
classifier.untrain(:in_china, 'Chinese Chinese')
|
43
57
|
|
44
|
-
|
58
|
+
classifier.classify('Chinese').should eq(:not_in_china)
|
45
59
|
end
|
60
|
+
|
61
|
+
it 'should not result in negative word counts'
|
62
|
+
|
63
|
+
it_should_behave_like 'cache invalidator', :untrain
|
46
64
|
end
|
47
65
|
|
48
66
|
describe "calculate_scores" do
|
49
67
|
it "should return a score hash with the correct scores" do
|
50
|
-
|
68
|
+
classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
|
51
69
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
70
|
+
classifier.train(:in_china, 'Chinese Beijing Chinese')
|
71
|
+
classifier.train(:in_china, 'Chinese Chinese Shanghai')
|
72
|
+
classifier.train(:in_china, 'Chinese Macao')
|
73
|
+
classifier.train(:not_in_china, 'Tokyo Japan Chinese')
|
56
74
|
|
57
|
-
scores =
|
75
|
+
scores = classifier.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
|
58
76
|
|
59
77
|
scores[:in_china].should eq(-8.107690312843907)
|
60
78
|
scores[:not_in_china].should eq(-8.906681345001262)
|
61
79
|
end
|
62
80
|
|
63
81
|
it "should handle the case when no documents are classified for a particular classification" do
|
64
|
-
|
82
|
+
classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
|
65
83
|
|
66
|
-
|
84
|
+
classifier.train(:in_china, 'Chinese Beijing Chinese')
|
67
85
|
|
68
|
-
|
86
|
+
classifier.calculate_scores('Chinese Beijing')
|
69
87
|
end
|
70
88
|
end
|
71
89
|
|
72
90
|
describe "add_classification" do
|
73
91
|
it "should add the classification to the set of classifications" do
|
74
|
-
|
92
|
+
classifier.classifications.should be_empty
|
75
93
|
|
76
|
-
|
94
|
+
classifier.add_classification(:niner)
|
77
95
|
|
78
|
-
|
96
|
+
classifier.classifications.should eq([:niner])
|
79
97
|
end
|
80
98
|
|
81
99
|
it "should return the classification" do
|
82
|
-
|
100
|
+
classifier.add_classification(:niner).should eq(:niner)
|
83
101
|
end
|
84
102
|
end
|
85
103
|
|
86
104
|
describe "remove_classification" do
|
87
105
|
it "should remove the classification from the set of classifications" do
|
88
|
-
|
106
|
+
classifier.add_classification(:niner)
|
89
107
|
|
90
|
-
|
108
|
+
classifier.remove_classification(:niner)
|
91
109
|
|
92
|
-
|
110
|
+
classifier.classifications.should be_empty
|
93
111
|
end
|
94
112
|
|
95
113
|
it "should return the classification" do
|
96
|
-
|
114
|
+
classifier.add_classification(:niner)
|
97
115
|
|
98
|
-
|
116
|
+
classifier.remove_classification(:niner).should eq(:niner)
|
99
117
|
end
|
100
118
|
|
101
119
|
it "should return nil if the classification didn't exist" do
|
102
|
-
|
120
|
+
classifier.remove_classification(:niner).should be(nil)
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
describe 'cache_set?' do
|
125
|
+
it 'should return true if the cache has been set' do
|
126
|
+
classifier = Reclassifier::Bayes.new([:one, :other])
|
127
|
+
|
128
|
+
classifier.train(:one, 'bbb')
|
129
|
+
classifier.train(:other, 'aaa')
|
130
|
+
|
131
|
+
classifier.classify('')
|
132
|
+
|
133
|
+
classifier.cache_set?.should be(true)
|
134
|
+
end
|
135
|
+
|
136
|
+
it 'should return false if the cache has not been set' do
|
137
|
+
classifier.cache_set?.should be(false)
|
138
|
+
end
|
139
|
+
|
140
|
+
it 'should return false if the cache has been invalidated' do
|
141
|
+
classifier = Reclassifier::Bayes.new([:one, :other])
|
142
|
+
|
143
|
+
classifier.train(:one, 'bbb')
|
144
|
+
classifier.train(:other, 'aaa')
|
145
|
+
|
146
|
+
classifier.classify('')
|
147
|
+
|
148
|
+
classifier.cache_set?.should be(true)
|
149
|
+
|
150
|
+
classifier.invalidate_cache
|
151
|
+
|
152
|
+
classifier.cache_set?.should be(false)
|
103
153
|
end
|
104
154
|
end
|
105
155
|
|
106
156
|
context ':clean option' do
|
107
157
|
it 'should cause punctuation to be omitted if it is set to true' do
|
108
|
-
|
158
|
+
classifier = Reclassifier::Bayes.new([:one, :other], {:clean => true})
|
109
159
|
|
110
|
-
|
111
|
-
|
160
|
+
classifier.train(:one, '! ! ! ! bbb')
|
161
|
+
classifier.train(:other, 'aaa')
|
112
162
|
|
113
|
-
|
163
|
+
classifier.classify('! aaa !').should eq(:other)
|
114
164
|
end
|
115
165
|
|
116
166
|
it 'should default to true' do
|
117
|
-
|
167
|
+
classifier = Reclassifier::Bayes.new([:one, :other])
|
118
168
|
|
119
|
-
|
120
|
-
|
169
|
+
classifier.train(:one, '! ! ! ! bbb')
|
170
|
+
classifier.train(:other, 'aaa')
|
121
171
|
|
122
|
-
|
172
|
+
classifier.classify('! aaa !').should eq(:other)
|
123
173
|
end
|
124
174
|
|
125
175
|
it 'should cause punctuation not to be omitted if it is set to false' do
|
126
|
-
|
176
|
+
classifier = Reclassifier::Bayes.new([:one, :other], {:clean => false})
|
127
177
|
|
128
|
-
|
129
|
-
|
178
|
+
classifier.train(:one, '! ! ! ! bbb')
|
179
|
+
classifier.train(:other, 'aaa')
|
130
180
|
|
131
|
-
|
181
|
+
classifier.classify('! aaa !').should eq(:one)
|
132
182
|
end
|
133
183
|
end
|
134
184
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reclassifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.5
|
4
|
+
version: 0.4.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -91,6 +91,22 @@ dependencies:
|
|
91
91
|
- - ! '>='
|
92
92
|
- !ruby/object:Gem::Version
|
93
93
|
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: activesupport
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
94
110
|
description: Bayesian and Latent Semantic Indexing classification of text.
|
95
111
|
email:
|
96
112
|
- rroblak@gmail.com
|