otherinbox-classifier 1.3.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,96 @@
1
# Build script for the classifier gem: unit tests, an irb console, RDoc
# generation, gem packaging, code statistics and documentation publishing.
require 'rubygems'
require 'rake'
require 'rake/testtask'
require 'rake/rdoctask'
require 'rake/gempackagetask'
require 'rake/contrib/rubyforgepublisher'

PKG_VERSION = "1.3.1"

# Everything that ships in the packaged gem.
PKG_FILES = FileList[
  "lib/**/*", "bin/*", "test/**/*", "[A-Z]*", "Rakefile", "html/**/*"
]

desc "Default Task"
task :default => [ :test ]

# Run the unit tests
desc "Run all unit tests"
Rake::TestTask.new("test") do |t|
  t.libs << "lib"
  t.pattern = 'test/*/*_test.rb'
  t.verbose = true
end

# Make a console, useful when working on tests
desc "Generate a test console"
task :console do
  verbose(false) { sh "irb -I lib/ -r 'classifier'" }
end

# Generate the RDoc documentation
desc "Create documentation"
Rake::RDocTask.new("doc") do |rdoc|
  rdoc.title = "Ruby Classifier - Bayesian and LSI classification library"
  rdoc.rdoc_dir = 'html'
  rdoc.rdoc_files.include('README')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# Generate the package
spec = Gem::Specification.new do |s|

  #### Basic information.

  s.name = 'classifier'
  s.version = PKG_VERSION
  # Plain strings rather than indented heredocs, so the gem metadata carries
  # no leading indentation or trailing newline.
  s.summary = "A general classifier module to allow Bayesian and other types of classifications."
  s.description = "A general classifier module to allow Bayesian and other types of classifications."

  #### Which files are to be included in this gem?  Everything!  (Except CVS directories.)

  s.files = PKG_FILES

  #### Load-time details: library and application (you will need one or both).

  s.require_path = 'lib'
  s.autorequire = 'classifier'

  #### Documentation and testing.

  s.has_rdoc = true

  #### Dependencies and requirements.

  s.add_dependency('stemmer', '>= 1.0.0')
  s.requirements << "A porter-stemmer module to split word stems."

  #### Author and project details.
  s.author = "Lucas Carlson"
  s.email = "lucas@rufy.com"
  s.homepage = "http://classifier.rufy.com/"
end

# Build .gem, .zip and .tgz packages from the specification above.
Rake::GemPackageTask.new(spec) do |pkg|
  pkg.need_zip = true
  pkg.need_tar = true
end

desc "Report code statistics (KLOCs, etc) from the application"
task :stats do
  # Loaded lazily so the other tasks do not depend on code_statistics.
  require 'code_statistics'
  CodeStatistics.new(
    ["Library", "lib"],
    ["Units", "test"]
  ).to_s
end

desc "Publish new documentation"
task :publish do
  # The remote command's output and exit status are deliberately ignored,
  # exactly as before; the upload below runs regardless.
  `ssh rufy update-classifier-doc`
  Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
end
@@ -0,0 +1,36 @@
1
#!/usr/bin/env ruby
#
# Command-line front end for a two-category ('Interesting' / 'Uninteresting')
# Bayesian classifier. The classifier is kept in a SnapshotMadeleine store
# rooted at ~/.bayes_data and a snapshot is taken after every successful run.
#
# Usage:
#   add interesting|uninteresting FILE   train a category with FILE's contents
#   classify FILE                        print the category chosen for FILE

begin
  require 'rubygems'
rescue LoadError
  # RubyGems is optional. A bare `rescue` would not catch LoadError (it is
  # not a StandardError), so it has to be named explicitly.
end
require 'classifier'
require 'madeleine'

USAGE = "Invalid option: choose add [category] [file] or classify [file]"

# Returns the full contents of +path+, closing the file afterwards.
# Prints the usage message and exits with status -1 when +path+ is missing.
def read_document(path)
  if path.nil?
    puts USAGE
    exit(-1)
  end
  File.read(path)
end

m = SnapshotMadeleine.new(File.expand_path("~/.bayes_data")) {
  Classifier::Bayes.new 'Interesting', 'Uninteresting'
}

case ARGV[0]
when "add"
  # to_s keeps a missing category argument on the "Invalid category" path
  # instead of raising NoMethodError on nil.
  case ARGV[1].to_s.downcase
  when "interesting"
    m.system.train_interesting read_document(ARGV[2])
    puts "#{ARGV[2]} has been classified as interesting"
  when "uninteresting"
    m.system.train_uninteresting read_document(ARGV[2])
    puts "#{ARGV[2]} has been classified as uninteresting"
  else
    puts "Invalid category: choose between interesting and uninteresting"
    exit(1)
  end
when "classify"
  puts m.system.classify(read_document(ARGV[1]))
else
  puts USAGE
  exit(-1)
end

# Only reached when the command succeeded; the error paths above exit first.
m.take_snapshot
@@ -0,0 +1,16 @@
1
#!/usr/bin/env ruby
#
# Prints a summary of a document given as a local path or an http/https/ftp
# URL. Markup tags are stripped and whitespace is collapsed before summarising.
#
# Usage:
#   SOURCE [SENTENCES]   SENTENCES defaults to 10 when absent or below 1

begin
  require 'rubygems'
rescue LoadError
  # RubyGems is optional. A bare `rescue` would not catch LoadError (it is
  # not a StandardError), so it has to be named explicitly.
end
require 'classifier'
require 'open-uri'

source = ARGV.first
if source.nil?
  warn "Usage: summarize [file or url] [number of sentences]"
  exit(-1)
end

num = ARGV[1].to_i
num = 10 if num < 1

# Never hand a command-line argument to Kernel#open: a leading "|" would
# spawn a subprocess. URLs go through open-uri, everything else is a file.
text =
  if source =~ %r{\A(?:https?|ftp)://}i
    if URI.respond_to?(:open)
      URI.open(source) { |io| io.read }
    else
      # Older Rubies: open-uri only patches Kernel#open. Safe here because
      # the argument is known to start with a URL scheme.
      open(source) { |io| io.read }
    end
  else
    File.read(source)
  end

# NOTE(review): String#summary is not defined in any code visible here;
# presumably it comes from the classifier library - confirm it is loaded.
puts text.gsub(/<[^>]+>/,"").gsub(/[\s]+/," ").summary(num)
@@ -0,0 +1,30 @@
1
+ #--
2
+ # Copyright (c) 2005 Lucas Carlson
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
24
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
25
+ # License:: LGPL
26
+
27
+ require 'rubygems'
28
+ require 'classifier/extensions/string'
29
+ require 'classifier/bayes'
30
+ # require 'classifier/lsi'
@@ -0,0 +1,172 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
module Classifier

  # Naive Bayes text classifier.
  #
  # Category names are normalised with Object#prepare_category_name and texts
  # are tokenised with String#word_hash; both are expected to be provided by
  # the library's core extensions.
  class Bayes
    # Matches the dynamic train_<category> / untrain_<category> method names.
    # Anchored so that unrelated method names never reach the training code.
    TRAINER_METHOD = /\A(un)?train_(\w+)\z/

    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
    def initialize(*categories)
      @categories = Hash.new
      categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
      @total_words = 0
    end

    #
    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.train "that", "That text"
    #     b.train "The other", "The other text"
    def train(category, text)
      words = @categories[category.prepare_category_name]
      text.word_hash.each do |word, count|
        words[word] = (words[word] || 0) + count
        @total_words += count
      end
    end

    #
    # Provides a untraining method for all categories specified in Bayes#new
    # Be very careful with this method.
    #
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.untrain :this, "This text"
    def untrain(category, text)
      words = @categories[category.prepare_category_name]
      text.word_hash.each do |word, count|
        next unless @total_words >= 0
        trained = words[word]
        # Sometimes items can be untrained before they are trained,
        # be tolerant of that case
        next if trained.nil?
        if trained - count <= 0
          # Never remove more occurrences than were actually recorded.
          words.delete(word)
          count = trained
        else
          words[word] = trained - count
        end
        @total_words -= count
      end
    end

    #
    # Returns the scores in each category the provided +text+. E.g.,
    #    b.classifications "I hate bad words and you"
    #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    def classifications(text)
      score = Hash.new
      @categories.each do |category, category_words|
        score[category.to_s] = 0
        total = category_words.values.inject(0) { |sum, element| sum + element }
        text.word_hash.each do |word, _count|
          # Words never seen in this category get a small pseudo-count.
          s = category_words.has_key?(word) ? category_words[word] : 0.1
          score[category.to_s] += Math.log(s / total.to_f)
        end
      end
      score
    end

    # Classifies +text+ as "Member" or "Not member".
    # These assume that the classes are Member and Not Member
    def myclassify(text)
      myclassify_with_word_hash(text.word_hash)
    end

    # Log-odds classification of an already tokenised text; +word_hash+ maps
    # each word to its occurrence count. Returns [category_name, score], with
    # "Member" when the score is positive and "Not member" otherwise.
    #
    # http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
    def myclassify_with_word_hash(word_hash)
      members = @categories[:Member]
      nonmembers = @categories[:"Not member"]
      term_count = members.size + nonmembers.size
      # Totals are recomputed on every call so that training or untraining
      # done between classifications is always taken into account.
      member_denominator = (total_member_count + term_count).to_f
      nonmember_denominator = (total_nonmember_count + term_count).to_f

      score = 0
      word_hash.each do |word, count|
        # count of words in each category (add-one smoothed)
        member_count = members[word].to_i + 1
        nonmember_count = nonmembers[word].to_i + 1
        # Words unknown to both categories carry no information.
        next if member_count == 1 && nonmember_count == 1

        # find relative prob word is in class -- p(w|c)
        word_member_p = member_count / member_denominator
        word_nonmember_p = nonmember_count / nonmember_denominator

        score += Math.log(word_member_p / word_nonmember_p) * count
      end

      if score > 0
        return "Member", score
      else
        return "Not member", score
      end
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #    b.classify "I hate bad words and you"
    #    =>  'Uninteresting'
    def classify(text)
      (classifications(text).sort_by { |a| -a[1] })[0][0]
    end

    #
    # Provides training and untraining methods for the categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train_this "This text"
    #     b.train_that "That text"
    #     b.untrain_that "That text"
    #     b.train_the_other "The other text"
    #
    # Raises StandardError for a train_*/untrain_* call naming an unknown
    # category; any other unknown method gets the usual NoMethodError.
    def method_missing(name, *args)
      match = TRAINER_METHOD.match(name.to_s)
      return super unless match

      category = match[2].prepare_category_name
      unless @categories.has_key? category
        raise StandardError, "No such category: #{category}"
      end

      action = match[1] ? :untrain : :train
      args.each { |text| send(action, category, text) }
    end

    # Reports the dynamic train_*/untrain_* methods of known categories to
    # respond_to? and Object#method.
    def respond_to_missing?(name, include_private = false)
      match = TRAINER_METHOD.match(name.to_s)
      return true if match && @categories.has_key?(match[2].prepare_category_name)
      super
    end

    #
    # Provides a list of category names
    # For example:
    #     b.categories
    #     =>   ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.collect { |c| c.to_s }
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #     b.add_category "Not spam"
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      @categories[category.prepare_category_name] = Hash.new
    end

    alias append_category add_category

    private

    # Total number of word occurrences trained into the Member category.
    def total_member_count
      @categories[:Member].values.inject(0) { |sum, element| sum + element }
    end

    # Total number of word occurrences trained into the "Not member" category.
    def total_nonmember_count
      @categories[:"Not member"].values.inject(0) { |sum, element| sum + element }
    end

  end

end
@@ -0,0 +1,16 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ begin
6
+ require 'stemmer'
7
+ rescue LoadError
8
+ puts "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
9
+ exit(-1)
10
+ end
11
+
12
+ require 'classifier/extensions/word_hash'
13
+
14
class Object
  # Normalises any object into a category key: takes its string form,
  # turns underscores into spaces, capitalises it (first character upper
  # case, the rest lower case) and returns the result as a Symbol.
  #
  #   "the_other".prepare_category_name  # => :"The other"
  #   :this.prepare_category_name        # => :This
  def prepare_category_name
    to_s.tr("_", " ").capitalize.intern
  end
end
@@ -0,0 +1,106 @@
1
+ # Author:: Ernest Ellingson
2
+ # Copyright:: Copyright (c) 2005
3
+
4
+ # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
+
6
+ require 'matrix'
7
+ require 'mathn'
8
+
9
class Array
  # Returns the sum of the elements as a Float.
  #
  # +init+ (default 0) is the starting value. When a block is given, each
  # element is passed through it before being added. Both are accepted so
  # that this override stays call-compatible with the core
  # Array#sum(init = 0) { |e| ... } signature used by other code.
  #
  #   [1, 2, 3].sum   # => 6.0
  #   [].sum          # => 0.0
  def sum(init = 0)
    total = inject(init) { |acc, term| acc + (block_given? ? yield(term) : term) }
    total.to_f
  end
end
14
+
15
class Vector
  # Euclidean length of the vector: the square root of the sum of the
  # squared components. Always returns a Float.
  def magnitude
    Math.sqrt(to_a.inject(0.0) { |sumsqs, component| sumsqs + component ** 2.0 })
  end

  # Returns a new Vector with every component divided by #magnitude.
  # No zero check is performed: for a zero vector the magnitude is 0.0 and
  # the division result is whatever the component type yields for that.
  def normalize
    mag = magnitude
    Vector[*to_a.map { |component| component / mag }]
  end
end
34
+
35
class Matrix
  # Builds a square diagonal matrix whose diagonal is the array +s+.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # Singular value decomposition by repeated Jacobi rotations.
  #
  # The rotations diagonalise q, which is (self^T * self) when the matrix has
  # at least as many rows as columns and (self * self^T) otherwise. Sweeps
  # stop once, after the first sweep, the summed change of the diagonal
  # entries (counting only changes above 0.001) is at most 0.001, or after
  # +maxSweeps+ sweeps.
  #
  # Returns [mu, v, s]:
  # * +s+  - Array with the square roots of the diagonal of the rotated q
  # * +v+  - Matrix accumulating all rotations applied to q
  # * +mu+ - self * v * diag(s)^-1 for tall/square matrices,
  #   self^T * v * diag(s)^-1 for wide ones
  #
  # diag(s) is inverted, so the call raises when any entry of +s+ is zero.
  def SV_decomp(maxSweeps = 20)
    tall = row_size >= column_size
    q = tall ? trans * self : self * trans

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    cnt = 0
    s_old = nil

    loop do
      cnt += 1
      (0...qrot.row_size - 1).each do |row|
        (1..qrot.row_size - 1).each do |col|
          next if row == col

          off_diagonal = qrot[row, col]
          # A zero off-diagonal entry needs no rotation. Computing the angle
          # anyway would give 0/0 (NaN, or ZeroDivisionError for Integers)
          # whenever the two diagonal entries are equal.
          h = if off_diagonal == 0
                0.0
              else
                # 2.0 forces float division so Integer matrices neither
                # truncate the quotient nor raise on a zero denominator.
                Math.atan((2.0 * off_diagonal) / (qrot[row, row] - qrot[col, col])) / 2.0
              end
          hcos = Math.cos(h)
          hsin = Math.sin(h)

          # Plane rotation acting on the (row, col) coordinate pair.
          mzrot = Matrix.identity(qrot.row_size)
          mzrot[row, row] = hcos
          mzrot[row, col] = -hsin
          mzrot[col, row] = hsin
          mzrot[col, col] = hcos

          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end

      # Convergence measure: how much the diagonal moved during this sweep.
      sum_qrot = 0.0
      if cnt > 1
        qrot.row_size.times do |r|
          delta = (qrot[r, r] - s_old[r, r]).abs
          sum_qrot += delta if delta > 0.001
        end
      end
      # qrot is rebound, never mutated, so no copy is needed here.
      s_old = qrot

      break if (sum_qrot <= 0.001 && cnt > 1) || cnt >= maxSweeps
    end

    s = []
    qrot.row_size.times do |r|
      s << Math.sqrt(qrot[r, r])
    end

    base = tall ? self : trans
    mu = base * v * Matrix.diagonal(*s).inverse
    [mu, v, s]
  end

  # Stores +val+ at row +i+, column +j+ by writing straight into the
  # matrix's internal row storage.
  def []=(i,j,val)
    @rows[i][j] = val
  end
end