classifier-reborn 2.0.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +429 -0
- data/README.markdown +97 -0
- data/bin/bayes.rb +36 -0
- data/bin/summarize.rb +16 -0
- data/lib/classifier-reborn.rb +30 -0
- data/lib/classifier-reborn/bayes.rb +126 -0
- data/lib/classifier-reborn/extensions/string.rb +10 -0
- data/lib/classifier-reborn/extensions/vector.rb +112 -0
- data/lib/classifier-reborn/extensions/vector_serialize.rb +20 -0
- data/lib/classifier-reborn/extensions/word_hash.rb +136 -0
- data/lib/classifier-reborn/lsi.rb +317 -0
- data/lib/classifier-reborn/lsi/content_node.rb +72 -0
- data/lib/classifier-reborn/lsi/summary.rb +31 -0
- data/lib/classifier-reborn/lsi/word_list.rb +36 -0
- data/lib/classifier-reborn/version.rb +3 -0
- metadata +108 -0
data/bin/bayes.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line trainer/classifier built on ClassifierReborn::Bayes.
#
# Usage:
#   bayes.rb add interesting|uninteresting FILE   # train on FILE's contents
#   bayes.rb classify FILE                        # print the predicted category
#
# The classifier lives in a Madeleine snapshot store rooted at ~/.bayes_data;
# a snapshot is taken after every command that completes successfully.

require 'rubygems'
# The script uses ClassifierReborn::Bayes, so it must load this gem's own
# entry point rather than a library named 'classifier'.
require 'classifier-reborn'
require 'madeleine'

usage = "Invalid option: choose add [category] [file] or classify [file]"

# Returns the contents of +path+, or prints the usage line and exits when the
# file argument was not supplied. File.read closes the handle for us.
read_input = lambda do |path|
  if path.nil?
    puts usage
    exit(-1)
  end
  File.read(path)
end

m = SnapshotMadeleine.new(File.expand_path("~/.bayes_data")) {
  ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
}

case ARGV[0]
when "add"
  # to_s lets a missing category fall through to the "Invalid category" branch
  # instead of failing on nil.downcase.
  case ARGV[1].to_s.downcase
  when "interesting"
    m.system.train_interesting read_input.call(ARGV[2])
    puts "#{ARGV[2]} has been classified as interesting"
  when "uninteresting"
    m.system.train_uninteresting read_input.call(ARGV[2])
    puts "#{ARGV[2]} has been classified as uninteresting"
  else
    puts "Invalid category: choose between interesting and uninteresting"
    exit(1)
  end
when "classify"
  puts m.system.classify(read_input.call(ARGV[1]))
else
  puts usage
  exit(-1)
end

m.take_snapshot
|
data/bin/summarize.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Prints a summary of a document with HTML tags stripped and whitespace
# collapsed.
#
# Usage: summarize.rb SOURCE [COUNT]
#   SOURCE  an http(s)/ftp URL or a local file path
#   COUNT   value handed to String#summary; defaults to 10 when missing or < 1

require 'rubygems'
# Load this gem's own entry point (lib/classifier-reborn.rb).
require 'classifier-reborn'
require 'open-uri'

source = ARGV.first
if source.nil?
  puts "Usage: summarize.rb SOURCE [COUNT]"
  exit(1)
end

num = ARGV[1].to_i
num = 10 if num < 1

# Kernel#open is avoided on purpose: it treats "|cmd" arguments as commands to
# run, and newer Rubies no longer let open-uri extend it to URLs. URLs go
# through open-uri's URI#open; everything else is read as a plain file.
text = if source =~ %r{\A(?:https?|ftp)://}i
         URI.parse(source).open(&:read)
       else
         File.read(source)
       end

puts text.gsub(/<[^>]+>/,"").gsub(/[\s]+/," ").summary(num)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2005 Lucas Carlson
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
#++
|
23
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
24
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
25
|
+
# License:: LGPL
|
26
|
+
|
27
|
+
require 'rubygems'
|
28
|
+
require_relative 'classifier-reborn/extensions/string'
|
29
|
+
require_relative 'classifier-reborn/bayes'
|
30
|
+
require_relative 'classifier-reborn/lsi'
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module ClassifierReborn
  # Bayesian text classifier. Categories are stored under the key produced by
  # +prepare_category_name+; word counts come from String#word_hash.
  class Bayes
    # Shape of the dynamic trainer names served by #method_missing:
    # an optional "un" prefix, "train_", then the category name.
    TRAINER_PATTERN = /(un)?train_([\w]+)/

    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #      b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
    def initialize(*categories)
      @categories = {}
      categories.each { |category| @categories[category.prepare_category_name] = {} }
      @total_words = 0
      @category_counts = Hash.new(0)
    end

    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #     b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.train "that", "That text"
    #     b.train "The other", "The other text"
    #
    # Raises StandardError ("No such category: ...") for an unknown category,
    # before any counter is touched.
    def train(category, text)
      category = category.prepare_category_name
      words = words_for(category)
      @category_counts[category] += 1
      text.word_hash.each do |word, count|
        words[word] = (words[word] || 0) + count
        @total_words += count
      end
    end

    # Provides a untraining method for all categories specified in Bayes#new
    # Be very careful with this method.
    #
    # For example:
    #     b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.untrain :this, "This text"
    #
    # Raises StandardError ("No such category: ...") for an unknown category.
    def untrain(category, text)
      category = category.prepare_category_name
      words = words_for(category)
      @category_counts[category] -= 1
      text.word_hash.each do |word, count|
        next unless @total_words >= 0

        # A word that was never trained counts as 0, so nothing is subtracted
        # from the running total for it.
        trained = words[word] || 0
        if trained > count
          words[word] = trained - count
        else
          # Never remove more occurrences than were actually recorded.
          words.delete(word)
          count = trained
        end
        @total_words -= count
      end
    end

    # Returns the scores in each category the provided +text+. E.g.,
    #    b.classifications "I hate bad words and you"
    #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    #
    # NOTE(review): a category with no trained words has a word total of 0, so
    # each ratio below is x / 0.0 and its score becomes Infinity. That makes an
    # untrained category outrank trained ones; left unchanged here.
    def classifications(text)
      score = {}
      word_hash = text.word_hash # same for every category, so compute once
      training_count = @category_counts.values.inject(0) { |sum, n| sum + n }.to_f
      @categories.each do |category, category_words|
        label = category.to_s
        score[label] = 0
        total = category_words.values.inject(0) { |sum, element| sum + element }
        word_hash.each_key do |word|
          # unseen words get a small pseudo-count instead of zero
          s = category_words.has_key?(word) ? category_words[word] : 0.1
          score[label] += Math.log(s / total.to_f)
        end
        # now add prior probability for the category
        s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
        score[label] += Math.log(s / training_count)
      end
      score
    end

    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #    b.classify "I hate bad words and you"
    #    =>  'Uninteresting'
    def classify(text)
      classifications(text).max_by { |_category, value| value }.first
    end

    # Provides training and untraining methods for the categories specified in Bayes#new
    # For example:
    #     b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'
    #     b.train_this "This text"
    #     b.train_that "That text"
    #     b.untrain_that "That text"
    #     b.train_the_other "The other text"
    #
    # Every argument is trained (or untrained) in turn. A trainer-shaped name
    # for an unknown category raises StandardError; anything else goes to super.
    def method_missing(name, *args)
      method_name = name.to_s
      match = TRAINER_PATTERN.match(method_name)
      category = method_name.gsub(TRAINER_PATTERN, '\2').prepare_category_name
      if @categories.has_key? category
        # Direct dispatch instead of eval; the "un" capture picks the method.
        trainer = match && match[1] ? :untrain : :train
        args.each { |text| send(trainer, category, text) }
      elsif match
        raise StandardError, "No such category: #{category}"
      else
        super
      end
    end

    # Keeps respond_to? in step with #method_missing: true for any name that
    # resolves to an existing category.
    def respond_to_missing?(name, include_private = false)
      category = name.to_s.gsub(TRAINER_PATTERN, '\2').prepare_category_name
      (!@categories.nil? && @categories.has_key?(category)) || super
    end

    # Provides a list of category names
    # For example:
    #     b.categories
    #     =>   ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.map { |c| c.to_s }
    end

    # Allows you to add categories to the classifier.
    # For example:
    #     b.add_category "Not spam"
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      @categories[category.prepare_category_name] = {}
    end

    alias append_category add_category

    private

    # Returns the word-count hash for an already normalized category key, or
    # raises StandardError when the category does not exist.
    def words_for(category)
      @categories.fetch(category) { raise StandardError, "No such category: #{category}" }
    end
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
require 'fast_stemmer'
|
6
|
+
require 'classifier-reborn/extensions/word_hash'
|
7
|
+
|
8
|
+
class Object
  # Turns the receiver into a category key: its string form with underscores
  # replaced by spaces, capitalized, and returned as a Symbol.
  #     :the_other.prepare_category_name   # => :"The other"
  def prepare_category_name
    label = to_s.gsub("_", " ")
    label.capitalize.to_sym
  end
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# Author:: Ernest Ellingson
|
2
|
+
# Copyright:: Copyright (c) 2005
|
3
|
+
|
4
|
+
# These are extensions to the std-lib 'matrix' to allow an all ruby SVD
|
5
|
+
|
6
|
+
require 'matrix'
|
7
|
+
require 'mathn'
|
8
|
+
|
9
|
+
class Array
  # Adds up the elements with +, or the block's results when a block is given.
  #
  # +identity+ is only the value returned for an empty array; it is not added
  # to the total of a non-empty one.
  def sum(identity = 0, &block)
    return identity if size <= 0

    terms = block_given? ? map(&block) : self
    terms.reduce(:+)
  end
end
|
20
|
+
|
21
|
+
class Vector
  # Euclidean length of the vector, computed in floating point.
  def magnitude
    squares = to_a.inject(0.0) { |acc, component| acc + component ** 2.0 }
    Math.sqrt(squares)
  end

  # Returns a new Vector with every component divided by #magnitude.
  def normalize
    length = magnitude
    Vector[*to_a.map { |component| component / length }]
  end
end
|
40
|
+
|
41
|
+
class Matrix
  # Builds a diagonal matrix from the array +s+.
  def Matrix.diag(s)
    Matrix.diagonal(*s)
  end

  alias :trans :transpose

  # Singular value decomposition by repeated plane rotations of the smaller
  # Gram matrix (self' * self when rows >= columns, self * self' otherwise).
  #
  # maxSweeps:: upper bound on the number of rotation sweeps (default 20).
  #
  # Sweeping stops early once the diagonal changes by no more than 0.001 in
  # total between two consecutive sweeps.
  #
  # Returns [mu, v, s]: +v+ is the accumulated rotation matrix, +s+ the array
  # of square roots of the rotated diagonal, and +mu+ is
  # self * v * diag(s)^-1 (self' * v * diag(s)^-1 for a wide matrix).
  def SV_decomp(maxSweeps = 20)
    tall = row_size >= column_size
    q = tall ? trans * self : self * trans

    qrot = q.dup
    v = Matrix.identity(q.row_size)
    cnt = 0
    s_old = nil

    loop do
      cnt += 1
      (0...qrot.row_size - 1).each do |row|
        (1..qrot.row_size - 1).each do |col|
          next if row == col

          # rotation angle computed from the (row, col) entry and the two
          # diagonal entries it couples
          h = Math.atan((2 * qrot[row, col]) / (qrot[row, row] - qrot[col, col])) / 2.0
          hcos = Math.cos(h)
          hsin = Math.sin(h)
          mzrot = Matrix.identity(qrot.row_size)
          mzrot[row, row] = hcos
          mzrot[row, col] = -hsin
          mzrot[col, row] = hsin
          mzrot[col, col] = hcos
          qrot = mzrot.trans * qrot * mzrot
          v = v * mzrot
        end
      end

      # Convergence: total change of the diagonal since the previous sweep,
      # ignoring per-entry changes of 0.001 or less.
      s_old = qrot.dup if cnt == 1
      sum_qrot = 0.0
      if cnt > 1
        qrot.row_size.times do |r|
          delta = (qrot[r, r] - s_old[r, r]).abs
          sum_qrot += delta if delta > 0.001
        end
        s_old = qrot.dup
      end
      break if (sum_qrot <= 0.001 && cnt > 1) || cnt >= maxSweeps
    end

    s = []
    qrot.row_size.times do |r|
      s << Math.sqrt(qrot[r, r])
    end

    # No diagnostic output here: a library method must not write to stdout.
    scale = Matrix.diagonal(*s).inverse
    mu = tall ? self * v * scale : trans * v * scale
    [mu, v, s]
  end

  # In-place element assignment, writing straight into the row storage.
  def []=(i, j, val)
    @rows[i][j] = val
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module GSL
  class Vector
    # Marshal hook: serializes the vector as the Marshal dump of its element
    # array. The argument Marshal passes in is not used.
    def _dump(v)
      Marshal.dump(to_a)
    end

    # Marshal hook: rebuilds a GSL::Vector from data written by #_dump.
    def self._load(arr)
      GSL::Vector.alloc(Marshal.load(arr))
    end
  end

  class Matrix
    class << self
      # Lets GSL::Matrix.diag be called as a synonym for GSL::Matrix.diagonal.
      alias_method :diag, :diagonal
    end
  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
require "set"
|
6
|
+
|
7
|
+
# These are extensions to the String class to provide convenience
|
8
|
+
# methods for the Classifier package.
|
9
|
+
class String
  # Lower-cased words that the word-hash helpers leave out of their counts.
  CORPUS_SKIP_WORDS = Set.new(%w[
    a again all along are also an and as at
    but by came can cant couldnt did didn didnt do
    doesnt dont ever first from have her here him how
    i if in into is isnt it itll just last
    least like most my new no not now of on
    or should sinc so some th than this that the
    their then those to told too true try until url
    us were when whether while with within yes you youll
  ])

  # Returns a copy with common punctuation handled: each listed symbol becomes
  # one space, while apostrophes and hyphens are removed outright.
  # E.g.,
  #   "a.b'c-d".without_punctuation
  #   => "a bcd"
  def without_punctuation
    spaced = tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " )
    spaced.tr( "'\-", "")
  end

  # Returns a Hash of symbols => ints: the stemmed-word counts from
  # #clean_word_hash merged with counts of the runs of non-word characters
  # (punctuation) found in the string.
  def word_hash
    stemmed_counts = clean_word_hash
    punctuation_runs = gsub(/[\w]/," ").split
    stemmed_counts.merge(word_hash_for_symbols(punctuation_runs))
  end

  # Returns only the stemmed-word counts: everything that is neither a word
  # character nor whitespace is dropped before the string is split.
  def clean_word_hash
    word_hash_for_words(gsub(/[^\w\s]/,"").split)
  end

  private

  # Counts each word after lower-casing (in place) and stemming it. Skip words
  # and words of one or two characters are not counted.
  def word_hash_for_words(words)
    words.each_with_object(Hash.new(0)) do |word, counts|
      word.downcase!
      next if CORPUS_SKIP_WORDS.include?(word) || word.length <= 2

      counts[word.stem.intern] += 1
    end
  end

  # Counts each token as-is, keyed by its symbol.
  def word_hash_for_symbols(words)
    words.each_with_object(Hash.new(0)) { |token, counts| counts[token.intern] += 1 }
  end
end
|