reclassifier 0.4.6 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/reclassifier/bayes.rb +26 -19
- data/lib/reclassifier/version.rb +1 -1
- metadata +1 -1
data/lib/reclassifier/bayes.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
# Implementation is translated from
|
5
5
|
# <em>Introduction to Information Retrieval</em> by Christopher D. Manning,
|
6
|
-
# Prabhakar Raghavan and Hinrich Schütze,
|
6
|
+
# Prabhakar Raghavan and Hinrich Schütze, Cambridge University Press. 2008,
|
7
7
|
# ISBN 0521865719.
|
8
8
|
#
|
9
9
|
# Derived quantities are cached to improve performance of repeated #classify calls.
|
@@ -19,6 +19,7 @@ class Reclassifier::Bayes
|
|
19
19
|
# * :clean - If false, punctuation will be included in the classifier. Otherwise, punctuation will be omitted. Default is true.
|
20
20
|
#
|
21
21
|
# b = Reclassifier::Bayes.new([:interesting, :uninteresting, :spam], :clean => true)
|
22
|
+
#
|
22
23
|
def initialize(classifications = [], options = {})
|
23
24
|
@classifications = {}
|
24
25
|
@docs_in_classification_count = {}
|
@@ -29,10 +30,11 @@ class Reclassifier::Bayes
|
|
29
30
|
|
30
31
|
#
|
31
32
|
# Provides a general training method for all classifications specified in Bayes#new
|
32
|
-
#
|
33
|
-
# b = Reclassifier::Bayes.new
|
34
|
-
# b.train
|
35
|
-
# b.train
|
33
|
+
#
|
34
|
+
# b = Reclassifier::Bayes.new([:this, :that])
|
35
|
+
# b.train(:this, "This text")
|
36
|
+
# b.train(:that, "That text")
|
37
|
+
#
|
36
38
|
def train(classification, text)
|
37
39
|
ensure_classification_exists(classification)
|
38
40
|
|
@@ -49,10 +51,10 @@ class Reclassifier::Bayes
|
|
49
51
|
# Untrain a (classification, text) pair.
|
50
52
|
# Be very careful with this method.
|
51
53
|
#
|
52
|
-
#
|
53
|
-
# b
|
54
|
-
# b.
|
55
|
-
#
|
54
|
+
# b = Reclassifier::Bayes.new([:this, :that])
|
55
|
+
# b.train(:this, "This text")
|
56
|
+
# b.untrain(:this, "This text")
|
57
|
+
#
|
56
58
|
def untrain(classification, text)
|
57
59
|
ensure_classification_exists(classification)
|
58
60
|
|
@@ -63,11 +65,13 @@ class Reclassifier::Bayes
|
|
63
65
|
end
|
64
66
|
end
|
65
67
|
|
68
|
+
# Returns the scores of the specified text for each classification.
|
66
69
|
#
|
67
|
-
#
|
68
|
-
# b.classifications "I hate bad words and you"
|
70
|
+
# b.calculate_scores("I hate bad words and you")
|
69
71
|
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
72
|
+
#
|
70
73
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
74
|
+
#
|
71
75
|
def calculate_scores(text)
|
72
76
|
scores = {}
|
73
77
|
|
@@ -93,30 +97,31 @@ class Reclassifier::Bayes
|
|
93
97
|
scores
|
94
98
|
end
|
95
99
|
|
96
|
-
#
|
97
100
|
# Returns the classification of the specified text, which is one of the
|
98
|
-
# classifications given in the initializer.
|
99
|
-
#
|
101
|
+
# classifications given in the initializer.
|
102
|
+
#
|
103
|
+
# b.classify("I hate bad words and you")
|
100
104
|
# => :uninteresting
|
105
|
+
#
|
101
106
|
def classify(text)
|
102
107
|
calculate_scores(text).max_by {|classification| classification[1]}[0]
|
103
108
|
end
|
104
109
|
|
105
|
-
#
|
106
110
|
# Provides a list of classification names
|
107
|
-
#
|
111
|
+
#
|
108
112
|
# b.classifications
|
109
113
|
# => [:this, :that, :the_other]
|
114
|
+
#
|
110
115
|
def classifications
|
111
116
|
@classifications.keys
|
112
117
|
end
|
113
118
|
|
114
|
-
#
|
115
119
|
# Adds the classification to the classifier.
|
116
120
|
# Has no effect if the classification already existed.
|
117
121
|
# Returns the classification.
|
118
|
-
#
|
122
|
+
#
|
119
123
|
# b.add_classification(:not_spam)
|
124
|
+
#
|
120
125
|
def add_classification(classification)
|
121
126
|
@classifications[classification] ||= {}
|
122
127
|
|
@@ -128,8 +133,9 @@ class Reclassifier::Bayes
|
|
128
133
|
#
|
129
134
|
# Removes the classification from the classifier.
|
130
135
|
# Returns the classification if it existed, else nil.
|
131
|
-
#
|
136
|
+
#
|
132
137
|
# b.remove_classification(:not_spam)
|
138
|
+
#
|
133
139
|
def remove_classification(classification)
|
134
140
|
return_value = if @classifications.include?(classification)
|
135
141
|
classification
|
@@ -183,6 +189,7 @@ class Reclassifier::Bayes
|
|
183
189
|
end
|
184
190
|
|
185
191
|
private
|
192
|
+
|
186
193
|
def update_doc_count(classification, value)
|
187
194
|
@docs_in_classification_count[classification] += value
|
188
195
|
|
data/lib/reclassifier/version.rb
CHANGED