otherinbox-classifier 1.3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,96 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'rake/rdoctask'
5
+ require 'rake/gempackagetask'
6
+ require 'rake/contrib/rubyforgepublisher'
7
+
8
# Version string stamped onto the gem specification below.
PKG_VERSION = "1.3.1"

# Everything that goes into the package: library sources, executables, tests,
# top-level entries whose names start with a capital letter, this Rakefile and
# the generated HTML documentation.
PKG_FILES = FileList[
  "lib/**/*",
  "bin/*",
  "test/**/*",
  "[A-Z]*",
  "Rakefile",
  "html/**/*"
]

desc "Default Task"
task :default => [:test]
16
+
17
+ # Run the unit tests
18
desc "Run all unit tests"
Rake::TestTask.new("test") do |test_task|
  # Put lib/ on the load path and pick up test files exactly one directory
  # below test/.
  test_task.libs.push("lib")
  test_task.pattern = 'test/*/*_test.rb'
  test_task.verbose = true
end
24
+
25
+ # Make a console, useful when working on tests
26
desc "Generate a test console"
task :console do
  # Silence rake's command echo while the interactive session runs.
  verbose(false) do
    sh "irb -I lib/ -r 'classifier'"
  end
end
30
+
31
+ # Generate the RDoc documentation
32
desc "Create documentation"
Rake::RDocTask.new("doc") do |rdoc_task|
  rdoc_task.title    = "Ruby Classifier - Bayesian and LSI classification library"
  rdoc_task.rdoc_dir = 'html'
  # Document the README together with every library source file.
  rdoc_task.rdoc_files.include('README', 'lib/**/*.rb')
end
39
+
40
+ # Generate the package
41
# Gem specification consumed by the package task defined right after it.
spec = Gem::Specification.new do |gem|
  # One blurb is used for both the summary and the description.
  blurb = "A general classifier module to allow Bayesian and other types of classifications.\n"

  #### Basic information.
  gem.name        = 'classifier'
  gem.version     = PKG_VERSION
  gem.summary     = blurb
  gem.description = blurb

  #### Which files are to be included in this gem? Everything! (Except CVS directories.)
  gem.files = PKG_FILES

  #### Load-time details: library and application (you will need one or both).
  gem.require_path = 'lib'
  gem.autorequire  = 'classifier'

  #### Documentation and testing.
  gem.has_rdoc = true

  #### Dependencies and requirements.
  gem.add_dependency('stemmer', '>= 1.0.0')
  gem.requirements << "A porter-stemmer module to split word stems."

  #### Author and project details.
  gem.author   = "Lucas Carlson"
  gem.email    = "lucas@rufy.com"
  gem.homepage = "http://classifier.rufy.com/"
end

# Build .gem, .zip and .tar archives from the specification above.
Rake::GemPackageTask.new(spec) do |package|
  package.need_zip = true
  package.need_tar = true
end
82
+
83
desc "Report code statistics (KLOCs, etc) from the application"
task :stats do
  # Loaded lazily so the other tasks do not depend on code_statistics.
  require 'code_statistics'
  labelled_directories = [["Library", "lib"], ["Units", "test"]]
  CodeStatistics.new(*labelled_directories).to_s
end
91
+
92
desc "Publish new documentation"
task :publish do
  # Run the remote update command first; its output and exit status are
  # discarded, then the upload proceeds regardless.
  %x(ssh rufy update-classifier-doc)
  publisher = Rake::RubyForgePublisher.new('classifier', 'cardmagic')
  publisher.upload
end
@@ -0,0 +1,36 @@
1
#!/usr/bin/env ruby

# Command-line front end for a two-category ('Interesting' / 'Uninteresting')
# Bayes classifier whose state is persisted with Madeleine in ~/.bayes_data.
#
#   add [category] [file]   train the given category with the file's contents
#   classify [file]         print the category the file is classified as

begin
  require 'rubygems'
  require 'classifier'
rescue LoadError
  # A bare `rescue` only traps StandardError, and LoadError is not one, so the
  # fallback must name LoadError explicitly to ever run without rubygems.
  require 'classifier'
end

require 'madeleine'

usage = "Invalid option: choose add [category] [file] or classify [file]"

# Returns the contents of the file at +path+, closing the handle afterwards.
# A missing path argument is reported as a usage error rather than a TypeError.
read_document = lambda do |path|
  if path.nil?
    puts usage
    exit(-1)
  end
  File.open(path) { |file| file.read }
end

m = SnapshotMadeleine.new(File.expand_path("~/.bayes_data")) {
  Classifier::Bayes.new 'Interesting', 'Uninteresting'
}

case ARGV[0]
when "add"
  # to_s keeps a missing category on the "Invalid category" path instead of
  # raising NoMethodError on nil.
  case ARGV[1].to_s.downcase
  when "interesting"
    m.system.train_interesting read_document.call(ARGV[2])
    puts "#{ARGV[2]} has been classified as interesting"
  when "uninteresting"
    m.system.train_uninteresting read_document.call(ARGV[2])
    puts "#{ARGV[2]} has been classified as uninteresting"
  else
    puts "Invalid category: choose between interesting and uninteresting"
    exit(1)
  end
when "classify"
  puts m.system.classify(read_document.call(ARGV[1]))
else
  puts usage
  exit(-1)
end

# Persist whatever state the classifier now holds.
m.take_snapshot
@@ -0,0 +1,16 @@
1
#!/usr/bin/env ruby

# Prints a summary of the document named by the first argument (a local path
# or an http/https/ftp URL). The optional second argument is passed to
# String#summary as its count and defaults to 10 when absent or below 1.
#
# NOTE(review): String#summary is not defined in the sources visible here (the
# 'classifier/lsi' require in classifier.rb is commented out) - confirm it is
# available before relying on this script.

begin
  require 'rubygems'
  require 'classifier'
rescue LoadError
  # A bare `rescue` does not trap LoadError, so name it explicitly to make the
  # no-rubygems fallback reachable.
  require 'classifier'
end

require 'open-uri'

source = ARGV.first
if source.nil?
  warn "Usage: #{File.basename($0)} <file-or-url> [count]"
  exit(1)
end

num = ARGV[1].to_i
num = num < 1 ? 10 : num

# Kernel#open on a user-supplied string would run "|command" arguments as a
# subprocess, and open-uri stopped patching Kernel#open in Ruby 3.0. Dispatch
# explicitly: URLs go through open-uri, everything else is read as a file.
text =
  if source =~ %r{\A(?:https?|ftp)://}i
    # URI.open exists from Ruby 2.5; older open-uri only offers Kernel.open.
    opener = URI.respond_to?(:open) ? URI : Kernel
    opener.open(source) { |io| io.read }
  else
    File.open(source) { |io| io.read }
  end

puts text.gsub(/<[^>]+>/,"").gsub(/[\s]+/," ").summary(num)
@@ -0,0 +1,30 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'rubygems'
28
+ require 'classifier/extensions/string'
29
+ require 'classifier/bayes'
30
+ # require 'classifier/lsi'
@@ -0,0 +1,172 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
module Classifier

  # Word-count based Bayesian text classifier.
  #
  # Training data is kept as a Hash of category Symbol => { word => count }.
  # Category names are normalised with #prepare_category_name and texts are
  # tokenised with #word_hash; both helpers are defined outside this class.
  class Bayes
    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
    def initialize(*categories)
      @categories = {}
      categories.each { |category| @categories[category.prepare_category_name] = {} }
      @total_words = 0
    end

    #
    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.train "that", "That text"
    #     b.train "The other", "The other text"
    def train(category, text)
      words = @categories[category.prepare_category_name]
      text.word_hash.each do |word, count|
        words[word] = (words[word] || 0) + count
        @total_words += count
      end
    end

    #
    # Provides a untraining method for all categories specified in Bayes#new
    # Be very careful with this method.
    #
    # Words that were never trained in the category are skipped, and a word's
    # count never drops below zero: once it reaches zero the word is removed
    # and only the amount actually stored is taken off the running total.
    #
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.untrain :this, "This text"
    def untrain(category, text)
      words = @categories[category.prepare_category_name]
      text.word_hash.each do |word, count|
        trained = words[word]
        # Sometimes items can be untrained before they are trained,
        # be tolerant of that case
        next if trained.nil?
        if trained > count
          words[word] = trained - count
          @total_words -= count
        else
          words.delete(word)
          @total_words -= trained
        end
      end
    end

    #
    # Returns the scores in each category the provided +text+. E.g.,
    #    b.classifications "I hate bad words and you"
    #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    #
    # Each distinct word of +text+ contributes log(count / total) for the
    # category, with 0.1 standing in for the count of unseen words.
    # NOTE(review): a category with no trained words has total == 0, so every
    # word scores +Infinity there and that category wins #classify.
    def classifications(text)
      words = text.word_hash.keys
      scores = {}
      @categories.each do |category, category_words|
        total = category_words.values.inject(0) { |sum, n| sum + n }.to_f
        scores[category.to_s] = words.inject(0) do |score, word|
          score + Math.log((category_words[word] || 0.1) / total)
        end
      end
      scores
    end

    # These assume that the classes are Member and Not Member
    def myclassify(text)
      myclassify_with_word_hash(text.word_hash)
    end

    # http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
    #
    # Scores +word_hash+ (word => occurrences) as the sum of
    # log(p(word|Member) / p(word|Not member)) * occurrences, using add-one
    # smoothed counts, and returns [label, score]; the label is "Member" when
    # the score is positive and "Not member" otherwise. Words unknown to both
    # categories are ignored. Requires the :Member and :"Not member" categories.
    #
    # Category totals are recomputed on every call so that training done after
    # an earlier classification is taken into account.
    # NOTE(review): the smoothing term adds the two per-category vocabulary
    # sizes, so words present in both categories are counted twice - confirm
    # this is intended.
    def myclassify_with_word_hash(word_hash)
      members = @categories[:Member]
      nonmembers = @categories[:"Not member"]
      term_count = members.size + nonmembers.size
      member_denominator = (total_member_count + term_count).to_f
      nonmember_denominator = (total_nonmember_count + term_count).to_f

      score = 0
      word_hash.each do |word, count|
        # count of words in each category
        member_count = members[word].to_i + 1
        nonmember_count = nonmembers[word].to_i + 1
        next if member_count == 1 && nonmember_count == 1

        # find relative prob word is in class -- p(w|c)
        word_member_p = member_count / member_denominator
        word_nonmember_p = nonmember_count / nonmember_denominator

        score += Math.log(word_member_p / word_nonmember_p) * count
      end
      score > 0 ? ["Member", score] : ["Not member", score]
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #    b.classify "I hate bad words and you"
    #    =>  'Uninteresting'
    def classify(text)
      classifications(text).max_by { |_category, score| score }.first
    end

    #
    # Provides training and untraining methods for the categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train_this "This text"
    #     b.train_that "That text"
    #     b.untrain_that "That text"
    #     b.train_the_other "The other text"
    #
    # Only names of the exact form train_<category> / untrain_<category> are
    # handled; each argument is passed to #train or #untrain in turn. A
    # well-formed name with an unknown category raises StandardError, any
    # other name goes to the default NoMethodError handling.
    def method_missing(name, *args)
      action, category = trainer_for(name)
      return super if action.nil?
      raise StandardError, "No such category: #{category}" unless @categories.has_key?(category)
      args.each { |text| send(action, category, text) }
    end

    # Keeps respond_to? consistent with the dynamic trainers handled by
    # #method_missing.
    def respond_to_missing?(name, include_private = false)
      action, category = trainer_for(name)
      (!action.nil? && @categories.has_key?(category)) || super
    end

    #
    # Provides a list of category names
    # For example:
    #     b.categories
    #     =>   ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.map { |category| category.to_s }
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #     b.add_category "Not spam"
    #
    # Adding a category that already exists leaves its training data intact.
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      @categories[category.prepare_category_name] ||= {}
    end

    alias append_category add_category

    private

    # Splits a dynamic method name into [:train or :untrain, category Symbol];
    # returns nil when the name is not of the train_/untrain_ form.
    def trainer_for(name)
      match = /\A(un)?train_(\w+)\z/.match(name.to_s)
      return nil if match.nil?
      [match[1] ? :untrain : :train, match[2].prepare_category_name]
    end

    # Total number of word occurrences trained into :Member.
    def total_member_count
      @categories[:Member].values.inject(0) { |sum, n| sum + n }
    end

    # Total number of word occurrences trained into :"Not member".
    def total_nonmember_count
      @categories[:"Not member"].values.inject(0) { |sum, n| sum + n }
    end

  end

end
@@ -0,0 +1,16 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ begin
6
+ require 'stemmer'
7
+ rescue LoadError
8
+ puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
9
+ exit(-1)
10
+ end
11
+
12
+ require 'classifier/extensions/word_hash'
13
+
14
class Object
  # Turns any object into the Symbol used as a category key: its string form
  # with underscores replaced by spaces, first letter upper-cased and the rest
  # lower-cased (e.g. :the_other => :"The other").
  def prepare_category_name
    to_s.tr("_", " ").capitalize.intern
  end
end
@@ -0,0 +1,106 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
class Array
  # Adds up all elements, starting from 0, and returns the total as a Float.
  # NOTE(review): on Ruby 2.4+ this replaces the built-in Array#sum, which
  # also accepts an initial value and a block - callers relying on those
  # forms will break while this file is loaded.
  def sum
    inject(0) { |total, term| total + term }.to_f
  end
end
14
+
15
class Vector
  # Euclidean length of the vector: the square root of the sum of the squared
  # components, always computed in floating point.
  def magnitude
    sum_of_squares = to_a.inject(0.0) { |total, component| total + component ** 2.0 }
    Math.sqrt(sum_of_squares)
  end

  # Returns a new Vector with every component divided by #magnitude.
  # NOTE(review): a zero vector has magnitude 0.0, so its components come out
  # as NaN rather than raising.
  def normalize
    length = magnitude
    Vector[*to_a.map { |component| component / length }]
  end
end
34
+
35
class Matrix
  # Builds a square matrix with the elements of the array +s+ on its diagonal.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # Pure-Ruby singular value decomposition by repeated plane rotations.
  #
  # The smaller of (self^T * self) and (self * self^T) is diagonalised; sweeps
  # stop once no diagonal entry moved by more than 0.001 since the previous
  # sweep, or after +maxSweeps+ sweeps.
  #
  # Returns [mu, v, s]:
  # * +s+  - Array of the square roots of the diagonal entries;
  # * +v+  - the accumulated rotation matrix;
  # * +mu+ - self * v * diag(s)^-1 when rows >= columns, otherwise
  #   self^T * v * diag(s)^-1.
  #
  # Nothing is written to standard output.
  # NOTE(review): diag(s) is inverted, so a zero entry in +s+ makes
  # Matrix#inverse fail - confirm callers only pass full-rank matrices.
  def SV_decomp(maxSweeps = 20)
    tall = row_size >= column_size
    q = tall ? trans * self : self * trans
    size = q.row_size

    qrot = q
    v = Matrix.identity(size)
    previous = nil
    sweeps = 0

    loop do
      sweeps += 1
      (0...size - 1).each do |row|
        (1..size - 1).each do |col|
          next if row == col

          # Rotation angle that zeroes qrot[row, col]. Work in floats so
          # integer input is not truncated, and treat 0/0 (equal diagonal
          # entries, nothing to eliminate) as "no rotation" instead of letting
          # it become NaN or a ZeroDivisionError.
          numerator = 2.0 * qrot[row, col]
          gap = qrot[row, row] - qrot[col, col]
          h = numerator.zero? && gap.zero? ? 0.0 : Math.atan(numerator / gap) / 2.0
          hcos = Math.cos(h)
          hsin = Math.sin(h)

          rotation = Matrix.identity(size)
          rotation[row, row] = hcos
          rotation[row, col] = -hsin
          rotation[col, row] = hsin
          rotation[col, col] = hcos

          qrot = rotation.trans * qrot * rotation
          v = v * rotation
        end
      end

      # Convergence is only judged from the second sweep on, by comparing the
      # diagonal against the one left by the previous sweep.
      converged = !previous.nil? &&
                  (0...size).none? { |r| (qrot[r, r] - previous[r, r]).abs > 0.001 }
      previous = qrot
      break if converged || sweeps >= maxSweeps
    end

    s = (0...size).map { |r| Math.sqrt(qrot[r, r]) }
    base = tall ? self : trans
    mu = base * v * Matrix.diagonal(*s).inverse
    [mu, v, s]
  end

  # Stores +val+ at row +i+, column +j+ by writing straight into the row arrays.
  def []=(i,j,val)
    @rows[i][j] = val
  end
end