reclassifier 0.4.5 → 0.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/reclassifier/bayes.rb +59 -7
- data/lib/reclassifier/version.rb +1 -1
- data/lib/reclassifier.rb +1 -0
- data/reclassifier.gemspec +1 -0
- data/spec/bayes_spec.rb +99 -49
- metadata +18 -2
data/lib/reclassifier/bayes.rb
CHANGED
@@ -2,8 +2,11 @@
|
|
2
2
|
# Bayesian classifier for arbitrary text.
|
3
3
|
#
|
4
4
|
# Implementation is translated from
|
5
|
-
# <em>Introduction to Information Retrieval</em> by Christopher D. Manning,
|
6
|
-
# Cambridge University Press. 2008,
|
5
|
+
# <em>Introduction to Information Retrieval</em> by Christopher D. Manning,
|
6
|
+
# Prabhakar Raghavan and Hinrich Schütze, Cambridge University Press. 2008,
|
7
|
+
# ISBN 0521865719.
|
8
|
+
#
|
9
|
+
# Derived quantities are cached to improve performance of repeated #classify calls.
|
7
10
|
#
|
8
11
|
class Reclassifier::Bayes
|
9
12
|
include Reclassifier::WordHash
|
@@ -33,7 +36,7 @@ class Reclassifier::Bayes
|
|
33
36
|
def train(classification, text)
|
34
37
|
ensure_classification_exists(classification)
|
35
38
|
|
36
|
-
|
39
|
+
update_doc_count(classification, 1)
|
37
40
|
|
38
41
|
smart_word_hash(text).each do |word, count|
|
39
42
|
@classifications[classification][word] ||= 0
|
@@ -53,7 +56,7 @@ class Reclassifier::Bayes
|
|
53
56
|
def untrain(classification, text)
|
54
57
|
ensure_classification_exists(classification)
|
55
58
|
|
56
|
-
|
59
|
+
update_doc_count(classification, -1)
|
57
60
|
|
58
61
|
smart_word_hash(text).each do |word, count|
|
59
62
|
@classifications[classification][word] -= count if @classifications[classification].include?(word)
|
@@ -68,17 +71,21 @@ class Reclassifier::Bayes
|
|
68
71
|
def calculate_scores(text)
|
69
72
|
scores = {}
|
70
73
|
|
74
|
+
@cache[:total_docs_classified_log] ||= Math.log(@docs_in_classification_count.values.reduce(:+))
|
75
|
+
@cache[:words_classified] ||= @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}
|
76
|
+
|
71
77
|
@classifications.each do |classification, classification_word_counts|
|
72
78
|
# prior
|
73
79
|
scores[classification] = Math.log(@docs_in_classification_count[classification])
|
74
|
-
scores[classification] -=
|
80
|
+
scores[classification] -= @cache[:total_docs_classified_log]
|
75
81
|
|
76
82
|
# likelihood
|
83
|
+
classification_word_count = classification_word_counts.values.reduce(:+).to_i
|
77
84
|
smart_word_hash(text).each do |word, count|
|
78
|
-
if @
|
85
|
+
if @cache[:words_classified].include?(word)
|
79
86
|
scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
|
80
87
|
|
81
|
-
scores[classification] -= count * Math.log(
|
88
|
+
scores[classification] -= count * Math.log(classification_word_count + @cache[:words_classified].count)
|
82
89
|
end
|
83
90
|
end
|
84
91
|
end
|
@@ -135,7 +142,52 @@ class Reclassifier::Bayes
|
|
135
142
|
return_value
|
136
143
|
end
|
137
144
|
|
145
|
+
# Invalidates the cache.
|
146
|
+
#
|
147
|
+
# classifier = Reclassifier::Bayes.new([:one, :other])
|
148
|
+
#
|
149
|
+
# classifier.train(:one, 'bbb')
|
150
|
+
# classifier.train(:other, 'aaa')
|
151
|
+
#
|
152
|
+
# classifier.classify('aaa')
|
153
|
+
#
|
154
|
+
# classifier.cache_set?
|
155
|
+
# => true
|
156
|
+
#
|
157
|
+
# classifier.invalidate_cache
|
158
|
+
#
|
159
|
+
# classifier.cache_set?
|
160
|
+
# => false
|
161
|
+
#
|
162
|
+
def invalidate_cache
|
163
|
+
@cache = {}
|
164
|
+
end
|
165
|
+
|
166
|
+
# Returns true if the cache has been set (i.e. #classify has been run).
|
167
|
+
# Returns false otherwise.
|
168
|
+
# classifier = Reclassifier::Bayes.new([:one, :other])
|
169
|
+
#
|
170
|
+
# classifier.cache_set?
|
171
|
+
# => false
|
172
|
+
#
|
173
|
+
# classifier.train(:one, 'bbb')
|
174
|
+
# classifier.train(:other, 'aaa')
|
175
|
+
#
|
176
|
+
# classifier.classify('aaa')
|
177
|
+
#
|
178
|
+
# classifier.cache_set?
|
179
|
+
# => true
|
180
|
+
#
|
181
|
+
def cache_set?
|
182
|
+
@cache.present?
|
183
|
+
end
|
184
|
+
|
138
185
|
private
|
186
|
+
def update_doc_count(classification, value)
|
187
|
+
@docs_in_classification_count[classification] += value
|
188
|
+
|
189
|
+
invalidate_cache
|
190
|
+
end
|
139
191
|
|
140
192
|
def ensure_classification_exists(classification)
|
141
193
|
raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
|
data/lib/reclassifier/version.rb
CHANGED
data/lib/reclassifier.rb
CHANGED
data/reclassifier.gemspec
CHANGED
data/spec/bayes_spec.rb
CHANGED
@@ -1,134 +1,184 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Reclassifier::Bayes do
|
4
|
+
subject(:classifier) { Reclassifier::Bayes.new }
|
5
|
+
|
6
|
+
shared_examples 'cache invalidator' do |method|
|
7
|
+
it 'should invalidate the cache' do
|
8
|
+
classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
|
9
|
+
|
10
|
+
classifier.should_receive(:invalidate_cache)
|
11
|
+
|
12
|
+
classifier.send(method, :in_china, 'Chinese Beijing Chinese')
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
4
16
|
describe "classifications" do
|
5
17
|
it "should return the classifications" do
|
6
|
-
|
18
|
+
classifier = Reclassifier::Bayes.new([:interesting, :uninteresting])
|
7
19
|
|
8
|
-
|
20
|
+
classifier.classifications.sort.should eq([:interesting, :uninteresting])
|
9
21
|
end
|
10
22
|
end
|
11
23
|
|
12
24
|
describe "train" do
|
13
25
|
it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
|
14
|
-
expect {
|
26
|
+
expect { classifier.train(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
|
15
27
|
end
|
16
28
|
|
17
29
|
it "should train the classifier to the (classification, document) pair" do
|
18
|
-
|
30
|
+
classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
|
19
31
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
32
|
+
classifier.train(:in_china, 'Chinese Beijing Chinese')
|
33
|
+
classifier.train(:in_china, 'Chinese Chinese Shanghai')
|
34
|
+
classifier.train(:in_china, 'Chinese Macao')
|
35
|
+
classifier.train(:not_in_china, 'Tokyo Japan Chinese')
|
24
36
|
|
25
|
-
|
37
|
+
classifier.classify('Chinese Chinese Chinese Tokyo Japan').should eq(:in_china)
|
26
38
|
end
|
39
|
+
|
40
|
+
it_should_behave_like 'cache invalidator', :train
|
27
41
|
end
|
28
42
|
|
29
|
-
describe
|
43
|
+
describe 'untrain' do
|
30
44
|
it "should raise an UnknownClassificationError if the specified classification hasn't been added" do
|
31
|
-
expect {
|
45
|
+
expect {classifier.untrain(:blargle, '')}.to raise_error(Reclassifier::UnknownClassificationError)
|
32
46
|
end
|
33
47
|
|
34
|
-
it
|
35
|
-
|
48
|
+
it 'should untrain the classifier against the (classification, document) pair' do
|
49
|
+
classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
|
36
50
|
|
37
|
-
|
38
|
-
|
51
|
+
classifier.train(:in_china, 'Chinese Chinese')
|
52
|
+
classifier.train(:not_in_china, 'Chinese Macao')
|
39
53
|
|
40
|
-
|
54
|
+
classifier.classify('Chinese').should eq(:in_china)
|
41
55
|
|
42
|
-
|
56
|
+
classifier.untrain(:in_china, 'Chinese Chinese')
|
43
57
|
|
44
|
-
|
58
|
+
classifier.classify('Chinese').should eq(:not_in_china)
|
45
59
|
end
|
60
|
+
|
61
|
+
it 'should not result in negative word counts'
|
62
|
+
|
63
|
+
it_should_behave_like 'cache invalidator', :untrain
|
46
64
|
end
|
47
65
|
|
48
66
|
describe "calculate_scores" do
|
49
67
|
it "should return a score hash with the correct scores" do
|
50
|
-
|
68
|
+
classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
|
51
69
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
70
|
+
classifier.train(:in_china, 'Chinese Beijing Chinese')
|
71
|
+
classifier.train(:in_china, 'Chinese Chinese Shanghai')
|
72
|
+
classifier.train(:in_china, 'Chinese Macao')
|
73
|
+
classifier.train(:not_in_china, 'Tokyo Japan Chinese')
|
56
74
|
|
57
|
-
scores =
|
75
|
+
scores = classifier.calculate_scores('Chinese Chinese Chinese Tokyo Japan')
|
58
76
|
|
59
77
|
scores[:in_china].should eq(-8.107690312843907)
|
60
78
|
scores[:not_in_china].should eq(-8.906681345001262)
|
61
79
|
end
|
62
80
|
|
63
81
|
it "should handle the case when no documents are classified for a particular classification" do
|
64
|
-
|
82
|
+
classifier = Reclassifier::Bayes.new([:in_china, :not_in_china])
|
65
83
|
|
66
|
-
|
84
|
+
classifier.train(:in_china, 'Chinese Beijing Chinese')
|
67
85
|
|
68
|
-
|
86
|
+
classifier.calculate_scores('Chinese Beijing')
|
69
87
|
end
|
70
88
|
end
|
71
89
|
|
72
90
|
describe "add_classification" do
|
73
91
|
it "should add the classification to the set of classifications" do
|
74
|
-
|
92
|
+
classifier.classifications.should be_empty
|
75
93
|
|
76
|
-
|
94
|
+
classifier.add_classification(:niner)
|
77
95
|
|
78
|
-
|
96
|
+
classifier.classifications.should eq([:niner])
|
79
97
|
end
|
80
98
|
|
81
99
|
it "should return the classification" do
|
82
|
-
|
100
|
+
classifier.add_classification(:niner).should eq(:niner)
|
83
101
|
end
|
84
102
|
end
|
85
103
|
|
86
104
|
describe "remove_classification" do
|
87
105
|
it "should remove the classification from the set of classifications" do
|
88
|
-
|
106
|
+
classifier.add_classification(:niner)
|
89
107
|
|
90
|
-
|
108
|
+
classifier.remove_classification(:niner)
|
91
109
|
|
92
|
-
|
110
|
+
classifier.classifications.should be_empty
|
93
111
|
end
|
94
112
|
|
95
113
|
it "should return the classification" do
|
96
|
-
|
114
|
+
classifier.add_classification(:niner)
|
97
115
|
|
98
|
-
|
116
|
+
classifier.remove_classification(:niner).should eq(:niner)
|
99
117
|
end
|
100
118
|
|
101
119
|
it "should return nil if the classification didn't exist" do
|
102
|
-
|
120
|
+
classifier.remove_classification(:niner).should be(nil)
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
describe 'cache_present?' do
|
125
|
+
it 'should return true if the cache has been set' do
|
126
|
+
classifier = Reclassifier::Bayes.new([:one, :other])
|
127
|
+
|
128
|
+
classifier.train(:one, 'bbb')
|
129
|
+
classifier.train(:other, 'aaa')
|
130
|
+
|
131
|
+
classifier.classify('')
|
132
|
+
|
133
|
+
classifier.cache_set?.should be(true)
|
134
|
+
end
|
135
|
+
|
136
|
+
it 'should return false if the cache has not been set' do
|
137
|
+
classifier.cache_set?.should be(false)
|
138
|
+
end
|
139
|
+
|
140
|
+
it 'should return false if the cache has been invalidated' do
|
141
|
+
classifier = Reclassifier::Bayes.new([:one, :other])
|
142
|
+
|
143
|
+
classifier.train(:one, 'bbb')
|
144
|
+
classifier.train(:other, 'aaa')
|
145
|
+
|
146
|
+
classifier.classify('')
|
147
|
+
|
148
|
+
classifier.cache_set?.should be(true)
|
149
|
+
|
150
|
+
classifier.invalidate_cache
|
151
|
+
|
152
|
+
classifier.cache_set?.should be(false)
|
103
153
|
end
|
104
154
|
end
|
105
155
|
|
106
156
|
context ':clean option' do
|
107
157
|
it 'should cause punctuation to be omitted if it is set to true' do
|
108
|
-
|
158
|
+
classifier = Reclassifier::Bayes.new([:one, :other], {:clean => true})
|
109
159
|
|
110
|
-
|
111
|
-
|
160
|
+
classifier.train(:one, '! ! ! ! bbb')
|
161
|
+
classifier.train(:other, 'aaa')
|
112
162
|
|
113
|
-
|
163
|
+
classifier.classify('! aaa !').should eq(:other)
|
114
164
|
end
|
115
165
|
|
116
166
|
it 'should default to true' do
|
117
|
-
|
167
|
+
classifier = Reclassifier::Bayes.new([:one, :other])
|
118
168
|
|
119
|
-
|
120
|
-
|
169
|
+
classifier.train(:one, '! ! ! ! bbb')
|
170
|
+
classifier.train(:other, 'aaa')
|
121
171
|
|
122
|
-
|
172
|
+
classifier.classify('! aaa !').should eq(:other)
|
123
173
|
end
|
124
174
|
|
125
175
|
it 'should cause punctuation not to be omitted if it is set to false' do
|
126
|
-
|
176
|
+
classifier = Reclassifier::Bayes.new([:one, :other], {:clean => false})
|
127
177
|
|
128
|
-
|
129
|
-
|
178
|
+
classifier.train(:one, '! ! ! ! bbb')
|
179
|
+
classifier.train(:other, 'aaa')
|
130
180
|
|
131
|
-
|
181
|
+
classifier.classify('! aaa !').should eq(:one)
|
132
182
|
end
|
133
183
|
end
|
134
184
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reclassifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.
|
4
|
+
version: 0.4.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -91,6 +91,22 @@ dependencies:
|
|
91
91
|
- - ! '>='
|
92
92
|
- !ruby/object:Gem::Version
|
93
93
|
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: activesupport
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
94
110
|
description: Bayesian and Latent Semantic Indexing classification of text.
|
95
111
|
email:
|
96
112
|
- rroblak@gmail.com
|