classifier 1.3.4 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/LICENSE +2 -2
- data/lib/classifier/bayes.rb +132 -124
- data/lib/classifier/extensions/string.rb +1 -1
- data/lib/classifier/extensions/vector.rb +72 -78
- data/lib/classifier/extensions/vector_serialize.rb +8 -10
- data/lib/classifier/extensions/word_hash.rb +114 -120
- data/lib/classifier/lsi/content_node.rb +39 -37
- data/lib/classifier/lsi/summary.rb +24 -24
- data/lib/classifier/lsi/word_list.rb +7 -8
- data/lib/classifier/lsi.rb +174 -151
- data/lib/classifier.rb +2 -1
- data/test/test_helper.rb +3 -2
- metadata +60 -27
- data/Gemfile +0 -5
- data/Gemfile.lock +0 -26
- data/README.markdown +0 -97
- data/Rakefile +0 -84
- data/test/bayes/bayesian_test.rb +0 -33
- data/test/extensions/word_hash_test.rb +0 -35
- data/test/lsi/lsi_test.rb +0 -123
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c20227e9f55f35fe93f2cbb6e5fc132127159973da74d7af68d1680a25021e73
|
4
|
+
data.tar.gz: ab8146db131a32b455b4e9b47dc3db7964e1e1a3bf9c69e953fdc6e6305eab89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2d024e1d46b3529f7e328110e8b59161e0f6fa0d738b3de0d973073974ad69fd7d3bfb529d5a1b827a444266c14f5dff37630af2e522522d07d74f18324064eb
|
7
|
+
data.tar.gz: 12fa59e6fc0f3e5ffb4fe1dc838b00ca529e630875729c78a3a0847bfdb6367856799c0f95831ecd75dbb36d0f4fad7fea2a6082083a965a5529d8bbf75a1542
|
data/LICENSE
CHANGED
@@ -146,7 +146,7 @@ such a program is covered only if its contents constitute a work based
|
|
146
146
|
on the Library (independent of the use of the Library in a tool for
|
147
147
|
writing it). Whether that is true depends on what the Library does
|
148
148
|
and what the program that uses the Library does.
|
149
|
-
|
149
|
+
|
150
150
|
1. You may copy and distribute verbatim copies of the Library's
|
151
151
|
complete source code as you receive it, in any medium, provided that
|
152
152
|
you conspicuously and appropriately publish on each copy an
|
@@ -426,4 +426,4 @@ the Free Software Foundation.
|
|
426
426
|
14. If you wish to incorporate parts of the Library into other free
|
427
427
|
programs whose distribution conditions are incompatible with these,
|
428
428
|
write to the author to ask for permission. For software which is
|
429
|
-
copyrighted by
|
429
|
+
copyrighted by
|
data/lib/classifier/bayes.rb
CHANGED
@@ -3,133 +3,141 @@
|
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
5
|
module Classifier
|
6
|
+
class Bayes
|
7
|
+
# The class can be created with one or more categories, each of which will be
|
8
|
+
# initialized and given a training method. E.g.,
|
9
|
+
# b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
10
|
+
# Builds an empty classifier.
#
# Every name in +categories+ is normalised with #prepare_category_name and
# given its own, initially empty, word table. All counters start at zero.
def initialize(*categories)
  @categories = categories.each_with_object({}) do |name, table|
    table[name.prepare_category_name] = {}
  end
  @total_words = 0
  @category_counts = Hash.new(0)
  @category_word_count = Hash.new(0)
end
|
6
17
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
18
|
+
#
|
19
|
+
# Provides a general training method for all categories specified in Bayes#new
|
20
|
+
# For example:
|
21
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
22
|
+
# b.train :this, "This text"
|
23
|
+
# b.train "that", "That text"
|
24
|
+
# b.train "The other", "The other text"
|
25
|
+
# Adds the words of +text+ to +category+.
#
# category:: anything responding to #prepare_category_name (String or Symbol)
# text::     object responding to #word_hash (word => occurrence count)
#
# Raises StandardError ("No such category: ...") when the category was never
# registered. The check runs before any counter is touched, so a failed call
# leaves the classifier unchanged.
def train(category, text)
  category = category.prepare_category_name
  words = @categories[category]
  raise StandardError, "No such category: #{category}" unless words

  @category_counts[category] += 1
  text.word_hash.each do |word, count|
    words[word] = (words[word] || 0) + count
    @total_words += count
    @category_word_count[category] += count
  end
end
|
17
35
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
@total_words += count
|
32
|
-
end
|
33
|
-
end
|
36
|
+
#
|
37
|
+
# Provides an untraining method for all categories specified in Bayes#new
|
38
|
+
# Be very careful with this method.
|
39
|
+
#
|
40
|
+
# For example:
|
41
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
42
|
+
# b.train :this, "This text"
|
43
|
+
# b.untrain :this, "This text"
|
44
|
+
# Reverses a previous #train call for +category+ using the words of +text+.
#
# category:: anything responding to #prepare_category_name (String or Symbol)
# text::     object responding to #word_hash (word => occurrence count)
#
# A word is never decremented below what was recorded for it: when the
# request exceeds the stored count, the word is dropped and only the stored
# amount is subtracted from the totals. The per-category document counter is
# clamped at zero so that over-untraining cannot produce a negative prior.
#
# Raises StandardError ("No such category: ...") for an unregistered
# category, before any counter is touched.
def untrain(category, text)
  category = category.prepare_category_name
  words = @categories[category]
  raise StandardError, "No such category: #{category}" unless words

  @category_counts[category] -= 1 if @category_counts[category].positive?
  text.word_hash.each do |word, count|
    next unless @total_words >= 0

    known = words[word] || 0
    if known <= count
      # Asked to remove at least as much as is stored: forget the word and
      # only account for what was really there.
      words.delete(word)
      count = known
    else
      words[word] = known - count
    end
    @category_word_count[category] -= count if @category_word_count[category] >= count
    @total_words -= count
  end
end
|
81
61
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
62
|
+
#
|
63
|
+
# Returns the scores in each category for the provided +text+. E.g.,
|
64
|
+
# b.classifications "I hate bad words and you"
|
65
|
+
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
66
|
+
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
67
|
+
# Scores +text+ against every category.
#
# text:: object responding to #word_hash; only the distinct words (keys) are used.
#
# Returns a Hash mapping each category name (String) to a log-probability
# score: the sum over the words of log(word count / category word total),
# plus the log of the category's share of all training calls. Words unseen in
# a category count as 0.1.
#
# Categories with no recorded words use a word total of 1, and a classifier
# with no training at all uses a training count of 1. Dividing by zero here
# would otherwise yield +Infinity and let an untrained category win.
def classifications(text)
  words = text.word_hash.keys
  training_count = @category_counts.values.inject(0) { |sum, n| sum + n }.to_f
  training_count = 1.0 unless training_count.positive?

  @categories.each_with_object({}) do |(category, category_words), score|
    word_total = @category_word_count[category] || 0
    total = word_total.positive? ? word_total.to_f : 1.0
    likelihood = words.inject(0) do |sum, word|
      sum + Math.log(category_words.fetch(word, 0.1) / total)
    end

    # Prior probability of the category; never-trained categories count as 0.1.
    trained = @category_counts[category] || 0
    prior = Math.log((trained.positive? ? trained : 0.1) / training_count)

    score[category.to_s] = likelihood + prior
  end
end
|
84
|
+
|
85
|
+
#
|
86
|
+
# Returns the classification of the provided +text+, which is one of the
|
87
|
+
# categories given in the initializer. E.g.,
|
88
|
+
# b.classify "I hate bad words and you"
|
89
|
+
# => 'Uninteresting'
|
90
|
+
# Returns the name of the best-scoring category for +text+, i.e. the key of
# the largest value returned by #classifications, or nil when the classifier
# has no categories.
def classify(text)
  best = classifications(text).max_by { |_category, score| score }
  best && best.first
end
|
93
|
+
|
94
|
+
#
|
95
|
+
# Provides training and untraining methods for the categories specified in Bayes#new
|
96
|
+
# For example:
|
97
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
98
|
+
# b.train_this "This text"
|
99
|
+
# b.train_that "That text"
|
100
|
+
# b.untrain_that "That text"
|
101
|
+
# b.train_the_other "The other text"
|
102
|
+
# Implements the dynamic train_<category> / untrain_<category> helpers.
#
# Only method names that are entirely of the form (un)train_<category> are
# handled. Every argument is passed, one by one, to #train or #untrain for
# that category. Any other name goes to +super+, so a call such as
# +b.spam("text")+ raises NoMethodError instead of silently training.
#
# Raises StandardError ("No such category: ...") when the name has the right
# shape but the category is not registered.
def method_missing(name, *args)
  match = /\A(un)?train_(\w+)\z/.match(name.to_s)
  return super unless match

  category = match[2].prepare_category_name
  raise StandardError, "No such category: #{category}" unless @categories.key?(category)

  action = match[1] ? :untrain : :train
  args.each { |text| send(action, category, text) }
end

# Keeps #respond_to? consistent with #method_missing: true for
# (un)train_<category> when the category is registered.
def respond_to_missing?(name, include_private = false)
  match = /\A(un)?train_(\w+)\z/.match(name.to_s)
  return true if match && @categories.key?(match[2].prepare_category_name)

  super
end
|
118
|
+
|
119
|
+
#
|
120
|
+
# Provides a list of category names
|
121
|
+
# For example:
|
122
|
+
# b.categories
|
123
|
+
# => ['This', 'That', 'the_other']
|
124
|
+
# Returns the registered category names as Strings, in registration order.
def categories # :nodoc:
  @categories.each_key.map(&:to_s)
end
|
127
|
+
|
128
|
+
#
|
129
|
+
# Allows you to add categories to the classifier.
|
130
|
+
# For example:
|
131
|
+
# b.add_category "Not spam"
|
132
|
+
#
|
133
|
+
# WARNING: Adding categories to a trained classifier will
|
134
|
+
# result in an undertrained category that will tend to match
|
135
|
+
# more criteria than the trained selective categories. In short,
|
136
|
+
# try to initialize your categories at initialization.
|
137
|
+
# Registers +category+ (normalised with #prepare_category_name) with an empty
# word table and returns that table.
#
# Adding a category that already exists keeps its trained words. Replacing
# the table would discard the words while the per-category counters still
# reflect the earlier training.
def add_category(category)
  @categories[category.prepare_category_name] ||= {}
end
|
134
140
|
|
141
|
+
alias append_category add_category
|
142
|
+
end
|
135
143
|
end
|
@@ -1,112 +1,106 @@
|
|
1
1
|
# Author:: Ernest Ellingson
|
2
|
-
# Copyright:: Copyright (c) 2005
|
2
|
+
# Copyright:: Copyright (c) 2005
|
3
3
|
|
4
4
|
# These are extensions to the std-lib 'matrix' to allow an all ruby SVD
|
5
5
|
|
6
6
|
require 'matrix'
|
7
|
-
require 'mathn'
|
8
7
|
|
9
8
|
class Array
  # Sums the elements, returning +identity+ when there is nothing to sum.
  #
  # identity:: value returned for an empty array (as given) or for an array
  #            holding only nils (converted with #to_f)
  # block::    optional; when given, each element is mapped through it first
  #
  # nil elements are ignored. A non-empty sum is returned as a Float.
  def sum_with_identity(identity = 0.0, &block)
    return identity if empty?
    return map(&block).sum_with_identity(identity) if block_given?

    values = compact
    # An all-nil array has nothing to add up, so fall back to the identity
    # rather than letting nil.to_f report 0.0.
    values.empty? ? identity.to_f : values.reduce(:+).to_f
  end
end
|
20
19
|
|
21
|
-
|
20
|
+
# Length helpers for any indexable object that responds to #size and #[].
module VectorExtensions
  # Euclidean length: the square root of the sum of the squared components.
  # The squares are accumulated starting from a Rational zero and converted
  # to Float only for the final square root.
  def magnitude
    squares = (0...size).inject(0.to_r) { |total, i| total + self[i]**2.to_r }
    Math.sqrt(squares.to_f)
  end

  # Returns a new Vector whose components are this object's components
  # divided by its magnitude (taken as a Rational).
  def normalize
    scale = magnitude.to_r
    Vector[*Array.new(size) { |i| self[i] / scale }]
  end
end
|
40
38
|
|
39
|
+
# Mixes VectorExtensions into Vector.
# NOTE(review): methods a class defines itself take precedence over included
# ones, so if the loaded matrix library already provides Vector#magnitude or
# Vector#normalize, the versions from VectorExtensions are shadowed — confirm
# against the targeted Ruby versions.
Vector.include(VectorExtensions)
|
42
|
+
|
41
43
|
class Matrix
  # Builds a square matrix with +diagonal_elements+ (an Array) on the diagonal.
  def self.diag(diagonal_elements)
    Matrix.diagonal(*diagonal_elements)
  end

  alias trans transpose

  # Pure-Ruby singular value decomposition using repeated plane rotations.
  #
  # max_sweeps:: upper bound on the number of passes over the matrix
  #
  # Works on A^T*A (or A*A^T when there are more columns than rows), rotating
  # it until its diagonal changes by no more than 0.001 between two passes or
  # +max_sweeps+ is reached.
  #
  # Returns [u_matrix, v_matrix, singular_values], where singular_values is an
  # Array holding the square roots of the final diagonal.
  def SV_decomp(max_sweeps = 20)
    q_matrix = row_size >= column_size ? trans * self : self * trans

    q_rotation_matrix = q_matrix.dup
    v_matrix = Matrix.identity(q_matrix.row_size)
    iteration_count = 0
    previous_s_matrix = nil

    loop do
      iteration_count += 1
      (0...q_rotation_matrix.row_size - 1).each do |row|
        (1..q_rotation_matrix.row_size - 1).each do |col|
          next if row == col

          numerator = 2.to_r * q_rotation_matrix[row, col]
          denominator = q_rotation_matrix[row, row] - q_rotation_matrix[col, col]
          if denominator.zero?
            # Equal diagonal entries: dividing would raise ZeroDivisionError
            # on exact numbers or give NaN for 0.0 / 0.0. If the off-diagonal
            # entry is zero too there is nothing to rotate; otherwise use
            # +/- pi/4, the value atan(+/-Infinity) / 2 would give.
            next if numerator.zero?

            angle = numerator.positive? ? Math::PI / 4.0 : -Math::PI / 4.0
          else
            angle = Math.atan(numerator / denominator) / 2.0
          end

          cosine = Math.cos(angle)
          sine = Math.sin(angle)
          rotation_matrix = Matrix.identity(q_rotation_matrix.row_size)
          rotation_matrix[row, row] = cosine
          rotation_matrix[row, col] = -sine
          rotation_matrix[col, row] = sine
          rotation_matrix[col, col] = cosine
          q_rotation_matrix = rotation_matrix.trans * q_rotation_matrix * rotation_matrix
          v_matrix *= rotation_matrix
        end
      end

      previous_s_matrix = q_rotation_matrix.dup if iteration_count == 1
      sum_of_differences = 0.to_r
      if iteration_count > 1
        # Only diagonal changes larger than 0.001 count towards the total.
        q_rotation_matrix.row_size.times do |r|
          difference = (q_rotation_matrix[r, r] - previous_s_matrix[r, r]).abs
          sum_of_differences += difference.to_r if difference > 0.001
        end
        previous_s_matrix = q_rotation_matrix.dup
      end
      break if (sum_of_differences <= 0.001 && iteration_count > 1) || iteration_count >= max_sweeps
    end

    singular_values = []
    q_rotation_matrix.row_size.times do |r|
      singular_values << Math.sqrt(q_rotation_matrix[r, r].to_f)
    end
    u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*singular_values).inverse
    [u_matrix, v_matrix, singular_values]
  end

  # In-place element assignment; writes straight into the row storage.
  def []=(row_index, col_index, value)
    @rows[row_index][col_index] = value
  end
end
|
@@ -1,20 +1,18 @@
|
|
1
1
|
# Marshal support and a naming shim for the GSL bindings.
module GSL
  class Vector
    # Marshal hook: serialises the vector as a marshalled plain Array of its
    # elements. The depth argument is ignored.
    def _dump(_v)
      elements = to_a
      Marshal.dump(elements)
    end

    # Marshal hook: rebuilds a GSL::Vector from the payload written by #_dump.
    # NOTE(review): this unmarshals +arr+ as-is; only load dumps from a
    # trusted source.
    def self._load(arr)
      GSL::Vector.alloc(Marshal.load(arr))
    end
  end

  class Matrix
    # Makes GSL::Matrix.diag available as another name for GSL::Matrix.diagonal.
    singleton_class.send(:alias_method, :diag, :diagonal)
  end
end
|