reclassifier 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +6 -1
- data/lib/reclassifier/bayes.rb +108 -110
- data/lib/reclassifier/version.rb +1 -1
- metadata +2 -2
data/Rakefile
CHANGED
data/lib/reclassifier/bayes.rb
CHANGED
@@ -2,136 +2,134 @@
|
|
2
2
|
# Bayesian classifier for arbitrary text.
|
3
3
|
#
|
4
4
|
# Implementation is translated from
|
5
|
-
# Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
|
5
|
+
# <em>Introduction to Information Retrieval</em> by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
|
6
6
|
# Cambridge University Press. 2008, ISBN 0521865719.
|
7
7
|
#
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
8
|
+
class Reclassifier::Bayes
|
9
|
+
# Can be created with zero or more classifications, each of which will be
|
10
|
+
# initialized and given a training method. The classifications are specified as
|
11
|
+
# symbols. E.g.,
|
12
|
+
# b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
|
13
|
+
def initialize(*classifications)
|
14
|
+
@classifications = {}
|
15
|
+
classifications.each {|classification| @classifications[classification] = {}}
|
16
|
+
|
17
|
+
@docs_in_classification_count = {}
|
18
|
+
end
|
20
19
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
20
|
+
#
|
21
|
+
# Provides a general training method for all classifications specified in Bayes#new
|
22
|
+
# For example:
|
23
|
+
# b = Reclassifier::Bayes.new :this, :that
|
24
|
+
# b.train :this, "This text"
|
25
|
+
# b.train :that, "That text"
|
26
|
+
def train(classification, text)
|
27
|
+
ensure_classification_exists(classification)
|
29
28
|
|
30
|
-
|
31
|
-
|
29
|
+
@docs_in_classification_count[classification] ||= 0
|
30
|
+
@docs_in_classification_count[classification] += 1
|
32
31
|
|
33
|
-
|
34
|
-
|
32
|
+
text.word_hash.each do |word, count|
|
33
|
+
@classifications[classification][word] ||= 0
|
35
34
|
|
36
|
-
|
37
|
-
|
35
|
+
@classifications[classification][word] += count
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
#
|
40
|
+
# Untrain a (classification, text) pair.
|
41
|
+
# Be very careful with this method.
|
42
|
+
#
|
43
|
+
# For example:
|
44
|
+
# b = Reclassifier::Bayes.new :this, :that, :the_other
|
45
|
+
# b.train :this, "This text"
|
46
|
+
# b.untrain :this, "This text"
|
47
|
+
def untrain(classification, text)
|
48
|
+
ensure_classification_exists(classification)
|
49
|
+
|
50
|
+
@docs_in_classification_count[classification] -= 1
|
51
|
+
|
52
|
+
text.word_hash.each do |word, count|
|
53
|
+
@classifications[classification][word] -= count if @classifications[classification].include?(word)
|
38
54
|
end
|
55
|
+
end
|
39
56
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
# b.untrain :this, "This text"
|
48
|
-
def untrain(classification, text)
|
49
|
-
ensure_classification_exists(classification)
|
57
|
+
#
|
58
|
+
# Returns the scores of the specified text for each classification. E.g.,
|
59
|
+
# b.calculate_scores "I hate bad words and you"
|
60
|
+
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
61
|
+
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
62
|
+
def calculate_scores(text)
|
63
|
+
scores = {}
|
50
64
|
|
51
|
-
|
65
|
+
@classifications.each do |classification, classification_word_counts|
|
66
|
+
# prior
|
67
|
+
scores[classification] = Math.log(@docs_in_classification_count[classification])
|
68
|
+
scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
|
52
69
|
|
70
|
+
# likelihood
|
53
71
|
text.word_hash.each do |word, count|
|
54
|
-
@classifications
|
55
|
-
|
56
|
-
end
|
72
|
+
if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
|
73
|
+
scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
|
57
74
|
|
58
|
-
|
59
|
-
# Returns the scores of the specified text for each classification. E.g.,
|
60
|
-
# b.classifications "I hate bad words and you"
|
61
|
-
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
62
|
-
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
63
|
-
def calculate_scores(text)
|
64
|
-
scores = {}
|
65
|
-
|
66
|
-
@classifications.each do |classification, classification_word_counts|
|
67
|
-
# prior
|
68
|
-
scores[classification] = Math.log(@docs_in_classification_count[classification])
|
69
|
-
scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
|
70
|
-
|
71
|
-
# likelihood
|
72
|
-
text.word_hash.each do |word, count|
|
73
|
-
if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
|
74
|
-
scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
|
75
|
-
|
76
|
-
scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
|
77
|
-
end
|
75
|
+
scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
|
78
76
|
end
|
79
77
|
end
|
80
|
-
|
81
|
-
scores
|
82
|
-
end
|
83
|
-
|
84
|
-
#
|
85
|
-
# Returns the classification of the specified text, which is one of the
|
86
|
-
# classifications given in the initializer. E.g.,
|
87
|
-
# b.classify "I hate bad words and you"
|
88
|
-
# => :uninteresting
|
89
|
-
def classify(text)
|
90
|
-
calculate_scores(text).max_by {|classification| classification[1]}[0]
|
91
78
|
end
|
92
79
|
|
93
|
-
|
94
|
-
|
95
|
-
# For example:
|
96
|
-
# b.classifications
|
97
|
-
# => [:this, :that, :the_other]
|
98
|
-
def classifications
|
99
|
-
@classifications.keys
|
100
|
-
end
|
80
|
+
scores
|
81
|
+
end
|
101
82
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
83
|
+
#
|
84
|
+
# Returns the classification of the specified text, which is one of the
|
85
|
+
# classifications given in the initializer. E.g.,
|
86
|
+
# b.classify "I hate bad words and you"
|
87
|
+
# => :uninteresting
|
88
|
+
def classify(text)
|
89
|
+
calculate_scores(text).max_by {|classification| classification[1]}[0]
|
90
|
+
end
|
110
91
|
|
111
|
-
|
112
|
-
|
92
|
+
#
|
93
|
+
# Provides a list of classification names
|
94
|
+
# For example:
|
95
|
+
# b.classifications
|
96
|
+
# => [:this, :that, :the_other]
|
97
|
+
def classifications
|
98
|
+
@classifications.keys
|
99
|
+
end
|
113
100
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
else
|
123
|
-
nil
|
124
|
-
end
|
125
|
-
|
126
|
-
@classifications.delete(classification)
|
127
|
-
|
128
|
-
return_value
|
129
|
-
end
|
101
|
+
#
|
102
|
+
# Adds the classification to the classifier.
|
103
|
+
# Has no effect if the classification already existed.
|
104
|
+
# Returns the classification.
|
105
|
+
# For example:
|
106
|
+
# b.add_classification(:not_spam)
|
107
|
+
def add_classification(classification)
|
108
|
+
@classifications[classification] ||= {}
|
130
109
|
|
131
|
-
|
110
|
+
classification
|
111
|
+
end
|
132
112
|
|
133
|
-
|
134
|
-
|
135
|
-
|
113
|
+
#
|
114
|
+
# Removes the classification from the classifier.
|
115
|
+
# Returns the classification if it existed, else nil.
|
116
|
+
# For example:
|
117
|
+
# b.remove_classification(:not_spam)
|
118
|
+
def remove_classification(classification)
|
119
|
+
return_value = if @classifications.include?(classification)
|
120
|
+
classification
|
121
|
+
else
|
122
|
+
nil
|
123
|
+
end
|
124
|
+
|
125
|
+
@classifications.delete(classification)
|
126
|
+
|
127
|
+
return_value
|
136
128
|
end
|
129
|
+
|
130
|
+
private
|
131
|
+
|
132
|
+
def ensure_classification_exists(classification)
|
133
|
+
raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
|
134
|
+
end
|
137
135
|
end
|
data/lib/reclassifier/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reclassifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|