reclassifier 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +6 -1
- data/lib/reclassifier/bayes.rb +108 -110
- data/lib/reclassifier/version.rb +1 -1
- metadata +2 -2
data/Rakefile
CHANGED
data/lib/reclassifier/bayes.rb
CHANGED
@@ -2,136 +2,134 @@
|
|
2
2
|
# Bayesian classifier for arbitrary text.
|
3
3
|
#
|
4
4
|
# Implementation is translated from
|
5
|
-
# Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
|
5
|
+
# <em>Introduction to Information Retrieval</em> by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
|
6
6
|
# Cambridge University Press. 2008, ISBN 0521865719.
|
7
7
|
#
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
8
|
+
class Reclassifier::Bayes
|
9
|
+
# Can be created with zero or more classifications, each of which will be
|
10
|
+
# initialized and given a training method. The classifications are specified as
|
11
|
+
# symbols. E.g.,
|
12
|
+
# b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
|
13
|
+
def initialize(*classifications)
|
14
|
+
@classifications = {}
|
15
|
+
classifications.each {|classification| @classifications[classification] = {}}
|
16
|
+
|
17
|
+
@docs_in_classification_count = {}
|
18
|
+
end
|
20
19
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
20
|
+
#
|
21
|
+
# Provides a general training method for all classifications specified in Bayes#new
|
22
|
+
# For example:
|
23
|
+
# b = Reclassifier::Bayes.new :this, :that
|
24
|
+
# b.train :this, "This text"
|
25
|
+
# b.train :that, "That text"
|
26
|
+
def train(classification, text)
|
27
|
+
ensure_classification_exists(classification)
|
29
28
|
|
30
|
-
|
31
|
-
|
29
|
+
@docs_in_classification_count[classification] ||= 0
|
30
|
+
@docs_in_classification_count[classification] += 1
|
32
31
|
|
33
|
-
|
34
|
-
|
32
|
+
text.word_hash.each do |word, count|
|
33
|
+
@classifications[classification][word] ||= 0
|
35
34
|
|
36
|
-
|
37
|
-
|
35
|
+
@classifications[classification][word] += count
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
#
|
40
|
+
# Untrain a (classification, text) pair.
|
41
|
+
# Be very careful with this method.
|
42
|
+
#
|
43
|
+
# For example:
|
44
|
+
# b = Reclassifier::Bayes.new :this, :that, :the_other
|
45
|
+
# b.train :this, "This text"
|
46
|
+
# b.untrain :this, "This text"
|
47
|
+
def untrain(classification, text)
|
48
|
+
ensure_classification_exists(classification)
|
49
|
+
|
50
|
+
@docs_in_classification_count[classification] -= 1
|
51
|
+
|
52
|
+
text.word_hash.each do |word, count|
|
53
|
+
@classifications[classification][word] -= count if @classifications[classification].include?(word)
|
38
54
|
end
|
55
|
+
end
|
39
56
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
# b.untrain :this, "This text"
|
48
|
-
def untrain(classification, text)
|
49
|
-
ensure_classification_exists(classification)
|
57
|
+
#
|
58
|
+
# Returns the scores of the specified text for each classification. E.g.,
|
59
|
+
# b.calculate_scores "I hate bad words and you"
|
60
|
+
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
61
|
+
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
62
|
+
def calculate_scores(text)
|
63
|
+
scores = {}
|
50
64
|
|
51
|
-
|
65
|
+
@classifications.each do |classification, classification_word_counts|
|
66
|
+
# prior
|
67
|
+
scores[classification] = Math.log(@docs_in_classification_count[classification])
|
68
|
+
scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
|
52
69
|
|
70
|
+
# likelihood
|
53
71
|
text.word_hash.each do |word, count|
|
54
|
-
@classifications
|
55
|
-
|
56
|
-
end
|
72
|
+
if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
|
73
|
+
scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
|
57
74
|
|
58
|
-
|
59
|
-
# Returns the scores of the specified text for each classification. E.g.,
|
60
|
-
# b.classifications "I hate bad words and you"
|
61
|
-
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
62
|
-
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
63
|
-
def calculate_scores(text)
|
64
|
-
scores = {}
|
65
|
-
|
66
|
-
@classifications.each do |classification, classification_word_counts|
|
67
|
-
# prior
|
68
|
-
scores[classification] = Math.log(@docs_in_classification_count[classification])
|
69
|
-
scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
|
70
|
-
|
71
|
-
# likelihood
|
72
|
-
text.word_hash.each do |word, count|
|
73
|
-
if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
|
74
|
-
scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
|
75
|
-
|
76
|
-
scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
|
77
|
-
end
|
75
|
+
scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
|
78
76
|
end
|
79
77
|
end
|
80
|
-
|
81
|
-
scores
|
82
|
-
end
|
83
|
-
|
84
|
-
#
|
85
|
-
# Returns the classification of the specified text, which is one of the
|
86
|
-
# classifications given in the initializer. E.g.,
|
87
|
-
# b.classify "I hate bad words and you"
|
88
|
-
# => :uninteresting
|
89
|
-
def classify(text)
|
90
|
-
calculate_scores(text).max_by {|classification| classification[1]}[0]
|
91
78
|
end
|
92
79
|
|
93
|
-
|
94
|
-
|
95
|
-
# For example:
|
96
|
-
# b.classifications
|
97
|
-
# => [:this, :that, :the_other]
|
98
|
-
def classifications
|
99
|
-
@classifications.keys
|
100
|
-
end
|
80
|
+
scores
|
81
|
+
end
|
101
82
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
83
|
+
#
|
84
|
+
# Returns the classification of the specified text, which is one of the
|
85
|
+
# classifications given in the initializer. E.g.,
|
86
|
+
# b.classify "I hate bad words and you"
|
87
|
+
# => :uninteresting
|
88
|
+
def classify(text)
|
89
|
+
calculate_scores(text).max_by {|classification| classification[1]}[0]
|
90
|
+
end
|
110
91
|
|
111
|
-
|
112
|
-
|
92
|
+
#
|
93
|
+
# Provides a list of classification names
|
94
|
+
# For example:
|
95
|
+
# b.classifications
|
96
|
+
# => [:this, :that, :the_other]
|
97
|
+
def classifications
|
98
|
+
@classifications.keys
|
99
|
+
end
|
113
100
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
else
|
123
|
-
nil
|
124
|
-
end
|
125
|
-
|
126
|
-
@classifications.delete(classification)
|
127
|
-
|
128
|
-
return_value
|
129
|
-
end
|
101
|
+
#
|
102
|
+
# Adds the classification to the classifier.
|
103
|
+
# Has no effect if the classification already existed.
|
104
|
+
# Returns the classification.
|
105
|
+
# For example:
|
106
|
+
# b.add_classification(:not_spam)
|
107
|
+
def add_classification(classification)
|
108
|
+
@classifications[classification] ||= {}
|
130
109
|
|
131
|
-
|
110
|
+
classification
|
111
|
+
end
|
132
112
|
|
133
|
-
|
134
|
-
|
135
|
-
|
113
|
+
#
|
114
|
+
# Removes the classification from the classifier.
|
115
|
+
# Returns the classification if it existed, else nil.
|
116
|
+
# For example:
|
117
|
+
# b.remove_classification(:not_spam)
|
118
|
+
def remove_classification(classification)
|
119
|
+
return_value = if @classifications.include?(classification)
|
120
|
+
classification
|
121
|
+
else
|
122
|
+
nil
|
123
|
+
end
|
124
|
+
|
125
|
+
@classifications.delete(classification)
|
126
|
+
|
127
|
+
return_value
|
136
128
|
end
|
129
|
+
|
130
|
+
private
|
131
|
+
|
132
|
+
def ensure_classification_exists(classification)
|
133
|
+
raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
|
134
|
+
end
|
137
135
|
end
|
data/lib/reclassifier/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reclassifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|