otherinbox-classifier 1.3.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +429 -0
- data/README +88 -0
- data/Rakefile +96 -0
- data/bin/bayes.rb +36 -0
- data/bin/summarize.rb +16 -0
- data/lib/classifier.rb +30 -0
- data/lib/classifier/bayes.rb +172 -0
- data/lib/classifier/extensions/string.rb +16 -0
- data/lib/classifier/extensions/vector.rb +106 -0
- data/lib/classifier/extensions/vector_serialize.rb +20 -0
- data/lib/classifier/extensions/word_hash.rb +154 -0
- data/lib/classifier/lsi.rb +318 -0
- data/lib/classifier/lsi/content_node.rb +72 -0
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/lsi/word_list.rb +36 -0
- data/test/bayes/bayesian_test.rb +33 -0
- data/test/extensions/word_hash_test.rb +14 -0
- data/test/lsi/lsi_test.rb +123 -0
- data/test/test_helper.rb +4 -0
- metadata +85 -0
data/Rakefile
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'rake/testtask'
|
4
|
+
require 'rake/rdoctask'
|
5
|
+
require 'rake/gempackagetask'
|
6
|
+
require 'rake/contrib/rubyforgepublisher'
|
7
|
+
|
8
|
+
# Build script for the classifier gem: runs the tests, opens a console,
# generates RDoc, packages the gem and publishes the documentation.

PKG_VERSION = "1.3.1"

# Everything that ships inside the packaged gem.
PKG_FILES = FileList[
  "lib/**/*", "bin/*", "test/**/*", "[A-Z]*", "Rakefile", "html/**/*"
]

desc "Default Task"
task :default => [ :test ]

# Run the unit tests
desc "Run all unit tests"
Rake::TestTask.new("test") { |t|
  t.libs << "lib"
  t.pattern = 'test/*/*_test.rb'
  t.verbose = true
}

# Make a console, useful when working on tests
desc "Generate a test console"
task :console do
  # verbose(false) keeps rake from echoing the irb command line.
  verbose( false ) { sh "irb -I lib/ -r 'classifier'" }
end

# Generate the RDoc documentation
desc "Create documentation"
Rake::RDocTask.new("doc") { |rdoc|
  rdoc.title = "Ruby Classifier - Bayesian and LSI classification library"
  rdoc.rdoc_dir = 'html'
  rdoc.rdoc_files.include('README')
  rdoc.rdoc_files.include('lib/**/*.rb')
}

# Generate the package
spec = Gem::Specification.new do |s|

  #### Basic information.

  s.name = 'classifier'
  s.version = PKG_VERSION
  s.summary = <<-EOF
A general classifier module to allow Bayesian and other types of classifications.
  EOF
  s.description = <<-EOF
A general classifier module to allow Bayesian and other types of classifications.
  EOF

  #### Which files are to be included in this gem? Everything! (Except CVS directories.)

  s.files = PKG_FILES

  #### Load-time details: library and application (you will need one or both).

  s.require_path = 'lib'
  s.autorequire = 'classifier'

  #### Documentation and testing.

  s.has_rdoc = true

  #### Dependencies and requirements.

  s.add_dependency('stemmer', '>= 1.0.0')
  s.requirements << "A porter-stemmer module to split word stems."

  #### Author and project details.
  s.author = "Lucas Carlson"
  s.email = "lucas@rufy.com"
  s.homepage = "http://classifier.rufy.com/"
end

# Package the gem described by +spec+, plus zip and tar archives.
Rake::GemPackageTask.new(spec) do |pkg|
  pkg.need_zip = true
  pkg.need_tar = true
end

desc "Report code statistics (KLOCs, etc) from the application"
task :stats do
  # Loaded lazily so the other tasks do not need code_statistics installed.
  require 'code_statistics'
  CodeStatistics.new(
    ["Library", "lib"],
    ["Units", "test"]
  ).to_s
end

desc "Publish new documentation"
task :publish do
  # Backticks: the remote command's output and exit status are ignored.
  `ssh rufy update-classifier-doc`
  Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
end
|
data/bin/bayes.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Command-line front end for Classifier::Bayes.
#
#   bayes.rb add interesting|uninteresting FILE   train on FILE
#   bayes.rb classify FILE                        print the category of FILE
#
# The trained classifier is persisted through Madeleine in ~/.bayes_data.

begin
  require 'rubygems'
  require 'classifier'
rescue LoadError
  # A bare `rescue` only traps StandardError, and LoadError is not one, so the
  # fallback below was never reached. Retry without RubyGems in case the
  # library is already on the load path.
  require 'classifier'
end

require 'madeleine'

# Restore the persisted classifier, or start a fresh two-category one.
m = SnapshotMadeleine.new(File.expand_path("~/.bayes_data")) {
  Classifier::Bayes.new 'Interesting', 'Uninteresting'
}

case ARGV[0]
when "add"
  # to_s lets a missing category fall through to the error branch below
  # instead of failing with NoMethodError on nil.
  case ARGV[1].to_s.downcase
  when "interesting"
    # File.read closes the handle; File.open(...).read left it open.
    m.system.train_interesting File.read(ARGV[2])
    puts "#{ARGV[2]} has been classified as interesting"
  when "uninteresting"
    m.system.train_uninteresting File.read(ARGV[2])
    puts "#{ARGV[2]} has been classified as uninteresting"
  else
    puts "Invalid category: choose between interesting and uninteresting"
    exit(1)
  end
when "classify"
  puts m.system.classify(File.read(ARGV[1]))
else
  puts "Invalid option: choose add [category] [file] or classify [file]"
  exit(-1)
end

# Persist whatever training happened during this run.
m.take_snapshot
|
data/bin/summarize.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Command-line summarizer.
#
#   summarize.rb FILE_OR_URL [SENTENCES]
#
# Reads the given file or URL, strips markup and prints a summary.
# SENTENCES defaults to 10 when it is missing or less than 1.

begin
  require 'rubygems'
  require 'classifier'
rescue LoadError
  # A bare `rescue` only traps StandardError, and LoadError is not one, so the
  # fallback below was never reached. Retry without RubyGems in case the
  # library is already on the load path.
  require 'classifier'
end

require 'open-uri'

source = ARGV.first
abort "Usage: summarize.rb <file-or-url> [sentences]" if source.nil?

num = ARGV[1].to_i
num = 10 if num < 1

# Kernel#open on a command-line argument would execute "|command" strings and
# no longer opens URLs on current Rubies. Dispatch explicitly instead:
# anything that parses to an openable URI goes through open-uri, everything
# else is read as a local file.
uri = source =~ %r{\A[A-Za-z][A-Za-z0-9+.\-]*://} ? URI.parse(source) : nil
text =
  if uri && uri.respond_to?(:open)
    uri.open { |io| io.read }
  else
    File.read(source)
  end

# Drop tags, collapse whitespace, then summarize.
# NOTE(review): String#summary is not defined in this script; presumably it
# comes from the classifier LSI extensions -- confirm `require 'classifier'`
# loads them.
puts text.gsub(/<[^>]+>/,"").gsub(/[\s]+/," ").summary(num)
|
data/lib/classifier.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2005 Lucas Carlson
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
#++
|
23
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
24
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
25
|
+
# License:: LGPL
|
26
|
+
|
27
|
+
require 'rubygems'
|
28
|
+
require 'classifier/extensions/string'
|
29
|
+
require 'classifier/bayes'
|
30
|
+
# require 'classifier/lsi'
|
@@ -0,0 +1,172 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module Classifier

  # Naive Bayes text classifier that keeps a word => count table per category.
  #
  # Category names are normalized with Object#prepare_category_name and texts
  # are tokenized with String#word_hash; both are defined outside this class.
  class Bayes
    # Shape of the dynamic trainer names served by #method_missing:
    # capture 1 is "un" for untraining, capture 2 is the category part.
    TRAINER_PATTERN = /\A(un)?train_(\w+)\z/

    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
    def initialize(*categories)
      @categories = {}
      categories.each { |category| @categories[category.prepare_category_name] = {} }
      @total_words = 0
    end

    #
    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.train "that", "That text"
    #     b.train "The other", "The other text"
    def train(category, text)
      category = category.prepare_category_name
      # Word counts are about to change, so the memoized totals are stale.
      reset_cached_totals
      text.word_hash.each do |word, count|
        @categories[category][word] ||= 0
        @categories[category][word] += count
        @total_words += count
      end
    end

    #
    # Provides a untraining method for all categories specified in Bayes#new
    # Be very careful with this method.
    #
    # Words that were never trained in the category are skipped. A word whose
    # count would drop to zero or below is removed, and only the count that
    # was actually stored is subtracted from the running total.
    #
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.untrain :this, "This text"
    def untrain(category, text)
      category = category.prepare_category_name
      reset_cached_totals
      text.word_hash.each do |word, count|
        next unless @total_words >= 0

        # Sometimes items can be untrained before they are trained,
        # be tolerant of that case
        trained = @categories[category][word]
        next if trained.nil?

        if trained - count <= 0
          @categories[category].delete(word)
          count = trained
        else
          @categories[category][word] = trained - count
        end
        @total_words -= count
      end
    end

    #
    # Returns the scores in each category the provided +text+. E.g.,
    #    b.classifications "I hate bad words and you"
    #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    #
    # Each word adds Math.log(count / category_total); a word the category has
    # never seen uses 0.1 in place of its count.
    # NOTE: a category with no trained words has a total of 0, so every word
    # scores Math.log(0.1 / 0.0), which is Infinity.
    def classifications(text)
      words = text.word_hash # tokenize once, not once per category
      score = {}
      @categories.each do |category, category_words|
        label = category.to_s
        score[label] = 0
        total = category_words.values.inject(0) { |sum, element| sum + element }
        words.each do |word, _count|
          s = category_words.has_key?(word) ? category_words[word] : 0.1
          score[label] += Math.log(s / total.to_f)
        end
      end
      score
    end

    # Two-class variant of #classify; see #myclassify_with_word_hash.
    # These assume that the classes are Member and Not Member
    def myclassify(text)
      myclassify_with_word_hash(text.word_hash)
    end

    # Sums add-one-smoothed log-likelihood ratios over +word_hash+ and returns
    # ["Member", score] when the sum is positive, ["Not member", score]
    # otherwise. Words seen in neither category are ignored.
    #
    # The smoothing term is the sum of the two categories' distinct-word
    # counts, so a word present in both categories is counted twice.
    #
    # http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
    def myclassify_with_word_hash(word_hash)
      members = @categories[:Member]
      nonmembers = @categories[:"Not member"]
      term_count = members.size + nonmembers.size
      member_denominator = (total_member_count + term_count).to_f
      nonmember_denominator = (total_nonmember_count + term_count).to_f

      score = 0
      word_hash.each do |word, count|
        # count of words in each category, plus one for smoothing
        member_count = members[word].to_i + 1
        nonmember_count = nonmembers[word].to_i + 1
        next if member_count == 1 && nonmember_count == 1

        # find relative prob word is in class -- p(w|c)
        word_member_p = member_count / member_denominator
        word_nonmember_p = nonmember_count / nonmember_denominator

        score += Math.log(word_member_p / word_nonmember_p) * count
      end

      score > 0 ? ["Member", score] : ["Not member", score]
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #    b.classify "I hate bad words and you"
    #    =>  'Uninteresting'
    def classify(text)
      (classifications(text).sort_by { |a| -a[1] })[0][0]
    end

    #
    # Provides training and untraining methods for the categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train_this "This text"
    #     b.train_that "That text"
    #     b.untrain_that "That text"
    #     b.train_the_other "The other text"
    #
    # Only names of the form train_<category> / untrain_<category> are handled.
    # Such a name with an unknown category raises StandardError; any other
    # name goes to the default NoMethodError handling.
    def method_missing(name, *args)
      match = TRAINER_PATTERN.match(name.to_s)
      return super unless match

      category = match[2].prepare_category_name
      raise StandardError, "No such category: #{category}" unless @categories.has_key?(category)

      # Dispatch directly rather than building a string for eval.
      action = match[1] ? :untrain : :train
      args.each { |text| send(action, category, text) }
    end

    #
    # Provides a list of category names
    # For example:
    #     b.categories
    #     =>   ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.map { |c| c.to_s }
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #     b.add_category "Not spam"
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      # An existing category of the same name is replaced by an empty one.
      reset_cached_totals
      @categories[category.prepare_category_name] = {}
    end

    alias append_category add_category

    private

    # Keeps respond_to? in step with #method_missing: true for
    # train_<category> / untrain_<category> when the category exists.
    def respond_to_missing?(name, include_private = false)
      match = TRAINER_PATTERN.match(name.to_s)
      (match && @categories.has_key?(match[2].prepare_category_name)) || super
    end

    # Forget the memoized per-category totals; called whenever counts change.
    def reset_cached_totals
      @total_member_count = nil
      @total_nonmember_count = nil
    end

    # Sum of all word counts trained into the Member category (memoized).
    def total_member_count
      @total_member_count ||= @categories[:Member].values.inject(0) { |sum, element| sum + element }
    end

    # Sum of all word counts trained into the "Not member" category (memoized).
    def total_nonmember_count
      @total_nonmember_count ||= @categories[:"Not member"].values.inject(0) { |sum, element| sum + element }
    end

  end

end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
# Load the stemmer library. If it cannot be loaded, print an install hint on
# stderr (so it does not mix with a script's normal output) and exit with
# the same status as before.
begin
  require 'stemmer'
rescue LoadError
  warn "Please install stemmer from http://rubyforge.org/projects/stemmer or 'gem install stemmer'"
  exit(-1)
end
|
11
|
+
|
12
|
+
require 'classifier/extensions/word_hash'
|
13
|
+
|
14
|
+
class Object
  # Normalizes any object into a category key: its string form with
  # underscores turned into spaces, capitalized, as a Symbol.
  # E.g. :the_other and "The other" both become :"The other".
  def prepare_category_name
    to_s.tr("_", " ").capitalize.to_sym
  end
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
# Author:: Ernest Ellingson
|
2
|
+
# Copyright:: Copyright (c) 2005
|
3
|
+
|
4
|
+
# These are extensions to the std-lib 'matrix' to allow an all ruby SVD
|
5
|
+
|
6
|
+
require 'matrix'
|
7
|
+
require 'mathn'
|
8
|
+
|
9
|
+
class Array
  # Adds up all elements starting from 0 and returns the total as a Float
  # (0.0 for an empty array). Takes no arguments and no block.
  def sum
    inject(0) { |total, term| total + term }.to_f
  end
end
|
14
|
+
|
15
|
+
class Vector
  # Euclidean length: the square root of the sum of squared components.
  # Always a Float.
  def magnitude
    sum_of_squares = to_a.inject(0.0) { |acc, component| acc + component ** 2.0 }
    Math.sqrt(sum_of_squares)
  end

  # Returns a new Vector with every component divided by #magnitude.
  # No special handling for a zero-length vector: the division by 0.0 is
  # performed as-is.
  def normalize
    length = magnitude
    Vector[*to_a.map { |component| component / length }]
  end
end
|
34
|
+
|
35
|
+
class Matrix
  # Builds a diagonal matrix from the array +s+.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # Singular value decomposition by repeated Jacobi rotations.
  #
  # The rotations diagonalize q, which is (self^T * self) when the matrix has
  # at least as many rows as columns and (self * self^T) otherwise. Sweeps
  # stop once the diagonal changes by no more than 0.001 in total between two
  # sweeps, or after +maxSweeps+ sweeps.
  #
  # Returns [mu, v, s]:
  #   s  - Array holding the square roots of the diagonal of the rotated q
  #   v  - product of all rotations applied
  #   mu - (self or self^T, matching the choice of q) * v * diag(s)^-1
  #
  # The diagonal matrix of +s+ is inverted, so a zero singular value makes
  # the final step fail.
  def SV_decomp(maxSweeps = 20)
    tall = row_size >= column_size
    q = tall ? trans * self : self * trans

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    cnt = 0
    s_old = nil

    loop do
      cnt += 1
      (0...qrot.row_size - 1).each do |row|
        (1..qrot.row_size - 1).each do |col|
          # Nothing to eliminate when the off-diagonal entry is already zero.
          # Without this guard, equal diagonal entries gave 0/0 = NaN, which
          # then spread through the whole result.
          next if row == col || qrot[row, col] == 0

          # 2.0 forces float division, so Integer matrices do not depend on
          # mathn and equal diagonals yield +/-Infinity (atan -> +/-pi/2)
          # rather than ZeroDivisionError.
          h = Math.atan((2.0 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
          hcos = Math.cos(h)
          hsin = Math.sin(h)

          # Plane rotation acting on the (row, col) coordinates only.
          mzrot = Matrix.identity(qrot.row_size)
          mzrot[row, row] = hcos
          mzrot[row, col] = -hsin
          mzrot[col, row] = hsin
          mzrot[col, col] = hcos

          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end

      # The first sweep has nothing to compare against; from the second on,
      # add up the diagonal changes that exceed 0.001.
      sum_qrot = 0.0
      if cnt == 1
        s_old = qrot.dup
      else
        qrot.row_size.times do |r|
          delta = (qrot[r, r] - s_old[r, r]).abs
          sum_qrot += delta if delta > 0.001
        end
        s_old = qrot.dup
      end

      break if (sum_qrot <= 0.001 && cnt > 1) || cnt >= maxSweeps
    end

    s = []
    qrot.row_size.times { |r| s << Math.sqrt(qrot[r, r]) }

    basis = tall ? self : trans
    mu = basis * v * Matrix.diagonal(*s).inverse
    [mu, v, s]
  end

  # In-place element assignment, used above to build the rotation matrices.
  # Writes straight into the row storage.
  def []=(i,j,val)
    @rows[i][j] = val
  end
end
|