reclassifier 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,4 +1,9 @@
1
- require "bundler/gem_tasks"
1
+ require 'bundler/gem_tasks'
2
+ require 'rdoc/task'
2
3
  require 'rspec/core/rake_task'
3
4
 
5
+ Rake::RDocTask.new do |rdoc|
6
+ rdoc.rdoc_files.include('lib/**/*.rb')
7
+ end
8
+
4
9
  RSpec::Core::RakeTask.new(:spec)
@@ -2,136 +2,134 @@
2
2
  # Bayesian classifier for arbitrary text.
3
3
  #
4
4
  # Implementation is translated from
5
- # Introduction to Information Retrieval by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
5
+ # <em>Introduction to Information Retrieval</em> by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze,
6
6
  # Cambridge University Press. 2008, ISBN 0521865719.
7
7
  #
8
- module Reclassifier
9
- class Bayes
10
- # Can be created with zero or more classifications, each of which will be
11
- # initialized and given a training method. The classifications are specified as
12
- # symbols. E.g.,
13
- # b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
14
- def initialize(*classifications)
15
- @classifications = {}
16
- classifications.each {|classification| @classifications[classification] = {}}
17
-
18
- @docs_in_classification_count = {}
19
- end
8
+ class Reclassifier::Bayes
9
+ # Can be created with zero or more classifications, each of which will be
10
+ # initialized and given a training method. The classifications are specified as
11
+ # symbols. E.g.,
12
+ # b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
13
+ def initialize(*classifications)
14
+ @classifications = {}
15
+ classifications.each {|classification| @classifications[classification] = {}}
16
+
17
+ @docs_in_classification_count = {}
18
+ end
20
19
 
21
- #
22
- # Provides a general training method for all classifications specified in Bayes#new
23
- # For example:
24
- # b = Reclassifier::Bayes.new :this, :that
25
- # b.train :this, "This text"
26
- # b.train :that, "That text"
27
- def train(classification, text)
28
- ensure_classification_exists(classification)
20
+ #
21
+ # Provides a general training method for all classifications specified in Bayes#new
22
+ # For example:
23
+ # b = Reclassifier::Bayes.new :this, :that
24
+ # b.train :this, "This text"
25
+ # b.train :that, "That text"
26
+ def train(classification, text)
27
+ ensure_classification_exists(classification)
29
28
 
30
- @docs_in_classification_count[classification] ||= 0
31
- @docs_in_classification_count[classification] += 1
29
+ @docs_in_classification_count[classification] ||= 0
30
+ @docs_in_classification_count[classification] += 1
32
31
 
33
- text.word_hash.each do |word, count|
34
- @classifications[classification][word] ||= 0
32
+ text.word_hash.each do |word, count|
33
+ @classifications[classification][word] ||= 0
35
34
 
36
- @classifications[classification][word] += count
37
- end
35
+ @classifications[classification][word] += count
36
+ end
37
+ end
38
+
39
+ #
40
+ # Untrain a (classification, text) pair.
41
+ # Be very careful with this method.
42
+ #
43
+ # For example:
44
+ # b = Reclassifier::Bayes.new :this, :that, :the_other
45
+ # b.train :this, "This text"
46
+ # b.untrain :this, "This text"
47
+ def untrain(classification, text)
48
+ ensure_classification_exists(classification)
49
+
50
+ @docs_in_classification_count[classification] -= 1
51
+
52
+ text.word_hash.each do |word, count|
53
+ @classifications[classification][word] -= count if @classifications[classification].include?(word)
38
54
  end
55
+ end
39
56
 
40
- #
41
- # Untrain a (classification, text) pair.
42
- # Be very careful with this method.
43
- #
44
- # For example:
45
- # b = Reclassifier::Bayes.new :this, :that, :the_other
46
- # b.train :this, "This text"
47
- # b.untrain :this, "This text"
48
- def untrain(classification, text)
49
- ensure_classification_exists(classification)
57
+ #
58
+ # Returns the scores of the specified text for each classification. E.g.,
59
+ # b.calculate_scores "I hate bad words and you"
60
+ # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
61
+ # The largest of these scores (the one closest to 0) is the one picked out by #classify
62
+ def calculate_scores(text)
63
+ scores = {}
50
64
 
51
- @docs_in_classification_count[classification] -= 1
65
+ @classifications.each do |classification, classification_word_counts|
66
+ # prior
67
+ scores[classification] = Math.log(@docs_in_classification_count[classification])
68
+ scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
52
69
 
70
+ # likelihood
53
71
  text.word_hash.each do |word, count|
54
- @classifications[classification][word] -= count if @classifications[classification].include?(word)
55
- end
56
- end
72
+ if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
73
+ scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
57
74
 
58
- #
59
- # Returns the scores of the specified text for each classification. E.g.,
60
- # b.classifications "I hate bad words and you"
61
- # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
62
- # The largest of these scores (the one closest to 0) is the one picked out by #classify
63
- def calculate_scores(text)
64
- scores = {}
65
-
66
- @classifications.each do |classification, classification_word_counts|
67
- # prior
68
- scores[classification] = Math.log(@docs_in_classification_count[classification])
69
- scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
70
-
71
- # likelihood
72
- text.word_hash.each do |word, count|
73
- if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
74
- scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
75
-
76
- scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
77
- end
75
+ scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
78
76
  end
79
77
  end
80
-
81
- scores
82
- end
83
-
84
- #
85
- # Returns the classification of the specified text, which is one of the
86
- # classifications given in the initializer. E.g.,
87
- # b.classify "I hate bad words and you"
88
- # => :uninteresting
89
- def classify(text)
90
- calculate_scores(text).max_by {|classification| classification[1]}[0]
91
78
  end
92
79
 
93
- #
94
- # Provides a list of classification names
95
- # For example:
96
- # b.classifications
97
- # => [:this, :that, :the_other]
98
- def classifications
99
- @classifications.keys
100
- end
80
+ scores
81
+ end
101
82
 
102
- #
103
- # Adds the classification to the classifier.
104
- # Has no effect if the classification already existed.
105
- # Returns the classification.
106
- # For example:
107
- # b.add_classification(:not_spam)
108
- def add_classification(classification)
109
- @classifications[classification] ||= {}
83
+ #
84
+ # Returns the classification of the specified text, which is one of the
85
+ # classifications given in the initializer. E.g.,
86
+ # b.classify "I hate bad words and you"
87
+ # => :uninteresting
88
+ def classify(text)
89
+ calculate_scores(text).max_by {|classification| classification[1]}[0]
90
+ end
110
91
 
111
- classification
112
- end
92
+ #
93
+ # Provides a list of classification names
94
+ # For example:
95
+ # b.classifications
96
+ # => [:this, :that, :the_other]
97
+ def classifications
98
+ @classifications.keys
99
+ end
113
100
 
114
- #
115
- # Removes the classification from the classifier.
116
- # Returns the classifier if the classification existed, else nil.
117
- # For example:
118
- # b.remove_classification(:not_spam)
119
- def remove_classification(classification)
120
- return_value = if @classifications.include?(classification)
121
- classification
122
- else
123
- nil
124
- end
125
-
126
- @classifications.delete(classification)
127
-
128
- return_value
129
- end
101
+ #
102
+ # Adds the classification to the classifier.
103
+ # Has no effect if the classification already existed.
104
+ # Returns the classification.
105
+ # For example:
106
+ # b.add_classification(:not_spam)
107
+ def add_classification(classification)
108
+ @classifications[classification] ||= {}
130
109
 
131
- private
110
+ classification
111
+ end
132
112
 
133
- def ensure_classification_exists(classification)
134
- raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
135
- end
113
+ #
114
+ # Removes the classification from the classifier.
115
+ # Returns the classification if it existed, else nil.
116
+ # For example:
117
+ # b.remove_classification(:not_spam)
118
+ def remove_classification(classification)
119
+ return_value = if @classifications.include?(classification)
120
+ classification
121
+ else
122
+ nil
123
+ end
124
+
125
+ @classifications.delete(classification)
126
+
127
+ return_value
136
128
  end
129
+
130
+ private
131
+
132
+ def ensure_classification_exists(classification)
133
+ raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
134
+ end
137
135
  end
@@ -1,3 +1,3 @@
1
1
  module Reclassifier
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: reclassifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-22 00:00:00.000000000 Z
12
+ date: 2013-04-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler