reclassifier 0.4.6 → 0.4.7
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/reclassifier/bayes.rb +26 -19
- data/lib/reclassifier/version.rb +1 -1
- metadata +1 -1
data/lib/reclassifier/bayes.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
# Implementation is translated from
|
5
5
|
# <em>Introduction to Information Retrieval</em> by Christopher D. Manning,
|
6
|
-
# Prabhakar Raghavan and Hinrich Schütze,
|
6
|
+
# Prabhakar Raghavan and Hinrich Schütze, Cambridge University Press. 2008,
|
7
7
|
# ISBN 0521865719.
|
8
8
|
#
|
9
9
|
# Derived quantities are cached to improve performance of repeated #classify calls.
|
@@ -19,6 +19,7 @@ class Reclassifier::Bayes
|
|
19
19
|
# * :clean - If false, punctuation will be included in the classifier. Otherwise, punctuation will be omitted. Default is true.
|
20
20
|
#
|
21
21
|
# b = Reclassifier::Bayes.new([:interesting, :uninteresting, :spam], :clean => true)
|
22
|
+
#
|
22
23
|
def initialize(classifications = [], options = {})
|
23
24
|
@classifications = {}
|
24
25
|
@docs_in_classification_count = {}
|
@@ -29,10 +30,11 @@ class Reclassifier::Bayes
|
|
29
30
|
|
30
31
|
#
|
31
32
|
# Provides a general training method for all classifications specified in Bayes#new
|
32
|
-
#
|
33
|
-
# b = Reclassifier::Bayes.new
|
34
|
-
# b.train
|
35
|
-
# b.train
|
33
|
+
#
|
34
|
+
# b = Reclassifier::Bayes.new([:this, :that])
|
35
|
+
# b.train(:this, "This text")
|
36
|
+
# b.train(:that, "That text")
|
37
|
+
#
|
36
38
|
def train(classification, text)
|
37
39
|
ensure_classification_exists(classification)
|
38
40
|
|
@@ -49,10 +51,10 @@ class Reclassifier::Bayes
|
|
49
51
|
# Untrain a (classification, text) pair.
|
50
52
|
# Be very careful with this method.
|
51
53
|
#
|
52
|
-
#
|
53
|
-
# b
|
54
|
-
# b.
|
55
|
-
#
|
54
|
+
# b = Reclassifier::Bayes.new([:this, :that])
|
55
|
+
# b.train(:this, "This text")
|
56
|
+
# b.untrain(:this, "This text")
|
57
|
+
#
|
56
58
|
def untrain(classification, text)
|
57
59
|
ensure_classification_exists(classification)
|
58
60
|
|
@@ -63,11 +65,13 @@ class Reclassifier::Bayes
|
|
63
65
|
end
|
64
66
|
end
|
65
67
|
|
68
|
+
# Returns the scores of the specified text for each classification.
|
66
69
|
#
|
67
|
-
#
|
68
|
-
# b.classifications "I hate bad words and you"
|
70
|
+
# b.calculate_scores("I hate bad words and you")
|
69
71
|
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
72
|
+
#
|
70
73
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
74
|
+
#
|
71
75
|
def calculate_scores(text)
|
72
76
|
scores = {}
|
73
77
|
|
@@ -93,30 +97,31 @@ class Reclassifier::Bayes
|
|
93
97
|
scores
|
94
98
|
end
|
95
99
|
|
96
|
-
#
|
97
100
|
# Returns the classification of the specified text, which is one of the
|
98
|
-
# classifications given in the initializer.
|
99
|
-
#
|
101
|
+
# classifications given in the initializer.
|
102
|
+
#
|
103
|
+
# b.classify("I hate bad words and you")
|
100
104
|
# => :uninteresting
|
105
|
+
#
|
101
106
|
def classify(text)
|
102
107
|
calculate_scores(text).max_by {|classification| classification[1]}[0]
|
103
108
|
end
|
104
109
|
|
105
|
-
#
|
106
110
|
# Provides a list of classification names
|
107
|
-
#
|
111
|
+
#
|
108
112
|
# b.classifications
|
109
113
|
# => [:this, :that, :the_other]
|
114
|
+
#
|
110
115
|
def classifications
|
111
116
|
@classifications.keys
|
112
117
|
end
|
113
118
|
|
114
|
-
#
|
115
119
|
# Adds the classification to the classifier.
|
116
120
|
# Has no effect if the classification already existed.
|
117
121
|
# Returns the classification.
|
118
|
-
#
|
122
|
+
#
|
119
123
|
# b.add_classification(:not_spam)
|
124
|
+
#
|
120
125
|
def add_classification(classification)
|
121
126
|
@classifications[classification] ||= {}
|
122
127
|
|
@@ -128,8 +133,9 @@ class Reclassifier::Bayes
|
|
128
133
|
#
|
129
134
|
# Removes the classification from the classifier.
|
130
135
|
# Returns the classification if it existed, else nil.
|
131
|
-
#
|
136
|
+
#
|
132
137
|
# b.remove_classification(:not_spam)
|
138
|
+
#
|
133
139
|
def remove_classification(classification)
|
134
140
|
return_value = if @classifications.include?(classification)
|
135
141
|
classification
|
@@ -183,6 +189,7 @@ class Reclassifier::Bayes
|
|
183
189
|
end
|
184
190
|
|
185
191
|
private
|
192
|
+
|
186
193
|
def update_doc_count(classification, value)
|
187
194
|
@docs_in_classification_count[classification] += value
|
188
195
|
|
data/lib/reclassifier/version.rb
CHANGED