reclassifier 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -1,4 +1,9 @@
1
- require "bundler/gem_tasks"
1
+ require 'bundler/gem_tasks'
2
+ require 'rdoc/task'
2
3
  require 'rspec/core/rake_task'
3
4
 
5
+ Rake::RDocTask.new do |rdoc|
6
+ rdoc.rdoc_files.include('lib/**/*.rb')
7
+ end
8
+
4
9
  RSpec::Core::RakeTask.new(:spec)
@@ -2,136 +2,134 @@
2
2
  # Bayesian classifier for arbitrary text.
3
3
  #
4
4
  # Implementation is translated from
5
- # Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
5
+ # <em>Introduction to Information Retrieval</em> by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
6
6
  # Cambridge University Press. 2008, ISBN 0521865719.
7
7
  #
8
- module Reclassifier
9
- class Bayes
10
- # Can be created with zero or more classifications, each of which will be
11
- # initialized and given a training method. The classifications are specified as
12
- # symbols. E.g.,
13
- # b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
14
- def initialize(*classifications)
15
- @classifications = {}
16
- classifications.each {|classification| @classifications[classification] = {}}
17
-
18
- @docs_in_classification_count = {}
19
- end
8
+ class Reclassifier::Bayes
9
+ # Can be created with zero or more classifications, each of which will be
10
+ # initialized and given a training method. The classifications are specified as
11
+ # symbols. E.g.,
12
+ # b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
13
+ def initialize(*classifications)
14
+ @classifications = {}
15
+ classifications.each {|classification| @classifications[classification] = {}}
16
+
17
+ @docs_in_classification_count = {}
18
+ end
20
19
 
21
- #
22
- # Provides a general training method for all classifications specified in Bayes#new
23
- # For example:
24
- # b = Reclassifier::Bayes.new :this, :that
25
- # b.train :this, "This text"
26
- # b.train :that, "That text"
27
- def train(classification, text)
28
- ensure_classification_exists(classification)
20
+ #
21
+ # Provides a general training method for all classifications specified in Bayes#new.
22
+ # For example:
23
+ # b = Reclassifier::Bayes.new :this, :that
24
+ # b.train :this, "This text"
25
+ # b.train :that, "That text"
26
+ def train(classification, text)
27
+ ensure_classification_exists(classification)
29
28
 
30
- @docs_in_classification_count[classification] ||= 0
31
- @docs_in_classification_count[classification] += 1
29
+ @docs_in_classification_count[classification] ||= 0
30
+ @docs_in_classification_count[classification] += 1
32
31
 
33
- text.word_hash.each do |word, count|
34
- @classifications[classification][word] ||= 0
32
+ text.word_hash.each do |word, count|
33
+ @classifications[classification][word] ||= 0
35
34
 
36
- @classifications[classification][word] += count
37
- end
35
+ @classifications[classification][word] += count
36
+ end
37
+ end
38
+
39
+ #
40
+ # Untrain a (classification, text) pair.
41
+ # Be very careful with this method.
42
+ #
43
+ # For example:
44
+ # b = Reclassifier::Bayes.new :this, :that, :the_other
45
+ # b.train :this, "This text"
46
+ # b.untrain :this, "This text"
47
+ def untrain(classification, text)
48
+ ensure_classification_exists(classification)
49
+
50
+ @docs_in_classification_count[classification] -= 1
51
+
52
+ text.word_hash.each do |word, count|
53
+ @classifications[classification][word] -= count if @classifications[classification].include?(word)
38
54
  end
55
+ end
39
56
 
40
- #
41
- # Untrain a (classification, text) pair.
42
- # Be very careful with this method.
43
- #
44
- # For example:
45
- # b = Reclassifier::Bayes.new :this, :that, :the_other
46
- # b.train :this, "This text"
47
- # b.untrain :this, "This text"
48
- def untrain(classification, text)
49
- ensure_classification_exists(classification)
57
+ #
58
+ # Returns the scores of the specified text for each classification. E.g.,
59
+ # b.calculate_scores "I hate bad words and you"
60
+ # => {:uninteresting=>-12.6997928013932, :interesting=>-18.4206807439524}
61
+ # The largest of these scores (the one closest to 0) is the one picked out by #classify
62
+ def calculate_scores(text)
63
+ scores = {}
50
64
 
51
- @docs_in_classification_count[classification] -= 1
65
+ @classifications.each do |classification, classification_word_counts|
66
+ # prior
67
+ scores[classification] = Math.log(@docs_in_classification_count[classification])
68
+ scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
52
69
 
70
+ # likelihood
53
71
  text.word_hash.each do |word, count|
54
- @classifications[classification][word] -= count if @classifications[classification].include?(word)
55
- end
56
- end
72
+ if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
73
+ scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
57
74
 
58
- #
59
- # Returns the scores of the specified text for each classification. E.g.,
60
- # b.classifications "I hate bad words and you"
61
- # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
62
- # The largest of these scores (the one closest to 0) is the one picked out by #classify
63
- def calculate_scores(text)
64
- scores = {}
65
-
66
- @classifications.each do |classification, classification_word_counts|
67
- # prior
68
- scores[classification] = Math.log(@docs_in_classification_count[classification])
69
- scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
70
-
71
- # likelihood
72
- text.word_hash.each do |word, count|
73
- if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
74
- scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
75
-
76
- scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
77
- end
75
+ scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
78
76
  end
79
77
  end
80
-
81
- scores
82
- end
83
-
84
- #
85
- # Returns the classification of the specified text, which is one of the
86
- # classifications given in the initializer. E.g.,
87
- # b.classify "I hate bad words and you"
88
- # => :uninteresting
89
- def classify(text)
90
- calculate_scores(text).max_by {|classification| classification[1]}[0]
91
78
  end
92
79
 
93
- #
94
- # Provides a list of classification names
95
- # For example:
96
- # b.classifications
97
- # => [:this, :that, :the_other]
98
- def classifications
99
- @classifications.keys
100
- end
80
+ scores
81
+ end
101
82
 
102
- #
103
- # Adds the classification to the classifier.
104
- # Has no effect if the classification already existed.
105
- # Returns the classification.
106
- # For example:
107
- # b.add_classification(:not_spam)
108
- def add_classification(classification)
109
- @classifications[classification] ||= {}
83
+ #
84
+ # Returns the classification of the specified text, which is one of the
85
+ # classifications given in the initializer. E.g.,
86
+ # b.classify "I hate bad words and you"
87
+ # => :uninteresting
88
+ def classify(text)
89
+ calculate_scores(text).max_by {|classification| classification[1]}[0]
90
+ end
110
91
 
111
- classification
112
- end
92
+ #
93
+ # Provides a list of classification names
94
+ # For example:
95
+ # b.classifications
96
+ # => [:this, :that, :the_other]
97
+ def classifications
98
+ @classifications.keys
99
+ end
113
100
 
114
- #
115
- # Removes the classification from the classifier.
116
- # Returns the classifier if the classification existed, else nil.
117
- # For example:
118
- # b.remove_classification(:not_spam)
119
- def remove_classification(classification)
120
- return_value = if @classifications.include?(classification)
121
- classification
122
- else
123
- nil
124
- end
125
-
126
- @classifications.delete(classification)
127
-
128
- return_value
129
- end
101
+ #
102
+ # Adds the classification to the classifier.
103
+ # Has no effect if the classification already existed.
104
+ # Returns the classification.
105
+ # For example:
106
+ # b.add_classification(:not_spam)
107
+ def add_classification(classification)
108
+ @classifications[classification] ||= {}
130
109
 
131
- private
110
+ classification
111
+ end
132
112
 
133
- def ensure_classification_exists(classification)
134
- raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
135
- end
113
+ #
114
+ # Removes the classification from the classifier.
115
+ # Returns the classification if it existed, else nil.
116
+ # For example:
117
+ # b.remove_classification(:not_spam)
118
+ def remove_classification(classification)
119
+ return_value = if @classifications.include?(classification)
120
+ classification
121
+ else
122
+ nil
123
+ end
124
+
125
+ @classifications.delete(classification)
126
+
127
+ return_value
136
128
  end
129
+
130
+ private
131
+
132
+ def ensure_classification_exists(classification)
133
+ raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
134
+ end
137
135
  end
@@ -1,3 +1,3 @@
1
1
  module Reclassifier
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: reclassifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-22 00:00:00.000000000 Z
12
+ date: 2013-04-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler