classifier 1.3.5 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/classifier/bayes.rb +128 -120
- data/lib/classifier/extensions/string.rb +1 -1
- data/lib/classifier/extensions/vector.rb +66 -72
- data/lib/classifier/extensions/vector_serialize.rb +6 -8
- data/lib/classifier/extensions/word_hash.rb +108 -114
- data/lib/classifier/lsi/content_node.rb +25 -23
- data/lib/classifier/lsi/summary.rb +20 -20
- data/lib/classifier/lsi/word_list.rb +1 -2
- data/lib/classifier/lsi.rb +112 -89
- data/lib/classifier.rb +1 -0
- data/test/test_helper.rb +5 -0
- metadata +7 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c20227e9f55f35fe93f2cbb6e5fc132127159973da74d7af68d1680a25021e73
|
4
|
+
data.tar.gz: ab8146db131a32b455b4e9b47dc3db7964e1e1a3bf9c69e953fdc6e6305eab89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2d024e1d46b3529f7e328110e8b59161e0f6fa0d738b3de0d973073974ad69fd7d3bfb529d5a1b827a444266c14f5dff37630af2e522522d07d74f18324064eb
|
7
|
+
data.tar.gz: 12fa59e6fc0f3e5ffb4fe1dc838b00ca529e630875729c78a3a0847bfdb6367856799c0f95831ecd75dbb36d0f4fad7fea2a6082083a965a5529d8bbf75a1542
|
data/lib/classifier/bayes.rb
CHANGED
@@ -3,133 +3,141 @@
|
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
5
|
module Classifier
|
6
|
+
class Bayes
|
7
|
+
# The class can be created with one or more categories, each of which will be
|
8
|
+
# initialized and given a training method. E.g.,
|
9
|
+
# b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
10
|
+
def initialize(*categories)
|
11
|
+
@categories = {}
|
12
|
+
categories.each { |category| @categories[category.prepare_category_name] = {} }
|
13
|
+
@total_words = 0
|
14
|
+
@category_counts = Hash.new(0)
|
15
|
+
@category_word_count = Hash.new(0)
|
16
|
+
end
|
6
17
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
18
|
+
#
|
19
|
+
# Provides a general training method for all categories specified in Bayes#new
|
20
|
+
# For example:
|
21
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
22
|
+
# b.train :this, "This text"
|
23
|
+
# b.train "that", "That text"
|
24
|
+
# b.train "The other", "The other text"
|
25
|
+
def train(category, text)
|
26
|
+
category = category.prepare_category_name
|
27
|
+
@category_counts[category] += 1
|
28
|
+
text.word_hash.each do |word, count|
|
29
|
+
@categories[category][word] ||= 0
|
30
|
+
@categories[category][word] += count
|
31
|
+
@total_words += count
|
32
|
+
@category_word_count[category] += count
|
33
|
+
end
|
34
|
+
end
|
17
35
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
@total_words += count
|
32
|
-
end
|
33
|
-
end
|
36
|
+
#
|
37
|
+
# Provides an untraining method for all categories specified in Bayes#new
|
38
|
+
# Be very careful with this method.
|
39
|
+
#
|
40
|
+
# For example:
|
41
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
42
|
+
# b.train :this, "This text"
|
43
|
+
# b.untrain :this, "This text"
|
44
|
+
def untrain(category, text)
|
45
|
+
category = category.prepare_category_name
|
46
|
+
@category_counts[category] -= 1
|
47
|
+
text.word_hash.each do |word, count|
|
48
|
+
next unless @total_words >= 0
|
34
49
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
text.word_hash.each do |word, count|
|
47
|
-
if @total_words >= 0
|
48
|
-
orig = @categories[category][word]
|
49
|
-
@categories[category][word] ||= 0
|
50
|
-
@categories[category][word] -= count
|
51
|
-
if @categories[category][word] <= 0
|
52
|
-
@categories[category].delete(word)
|
53
|
-
count = orig
|
54
|
-
end
|
55
|
-
@total_words -= count
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
50
|
+
orig = @categories[category][word] || 0
|
51
|
+
@categories[category][word] ||= 0
|
52
|
+
@categories[category][word] -= count
|
53
|
+
if @categories[category][word] <= 0
|
54
|
+
@categories[category].delete(word)
|
55
|
+
count = orig
|
56
|
+
end
|
57
|
+
@category_word_count[category] -= count if @category_word_count[category] >= count
|
58
|
+
@total_words -= count
|
59
|
+
end
|
60
|
+
end
|
59
61
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
62
|
+
#
|
63
|
+
# Returns the scores in each category for the provided +text+. E.g.,
|
64
|
+
# b.classifications "I hate bad words and you"
|
65
|
+
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
66
|
+
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
67
|
+
def classifications(text)
|
68
|
+
score = {}
|
69
|
+
word_hash = text.word_hash
|
70
|
+
training_count = @category_counts.values.inject { |x, y| x + y }.to_f
|
71
|
+
@categories.each do |category, category_words|
|
72
|
+
score[category.to_s] = 0
|
73
|
+
total = (@category_word_count[category] || 1).to_f
|
74
|
+
word_hash.each_key do |word|
|
75
|
+
s = category_words.key?(word) ? category_words[word] : 0.1
|
76
|
+
score[category.to_s] += Math.log(s / total)
|
77
|
+
end
|
78
|
+
# now add prior probability for the category
|
79
|
+
s = @category_counts.key?(category) ? @category_counts[category] : 0.1
|
80
|
+
score[category.to_s] += Math.log(s / training_count)
|
81
|
+
end
|
82
|
+
score
|
83
|
+
end
|
81
84
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
85
|
+
#
|
86
|
+
# Returns the classification of the provided +text+, which is one of the
|
87
|
+
# categories given in the initializer. E.g.,
|
88
|
+
# b.classify "I hate bad words and you"
|
89
|
+
# => 'Uninteresting'
|
90
|
+
def classify(text)
|
91
|
+
(classifications(text).sort_by { |a| -a[1] })[0][0]
|
92
|
+
end
|
90
93
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
94
|
+
#
|
95
|
+
# Provides training and untraining methods for the categories specified in Bayes#new
|
96
|
+
# For example:
|
97
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
98
|
+
# b.train_this "This text"
|
99
|
+
# b.train_that "That text"
|
100
|
+
# b.untrain_that "That text"
|
101
|
+
# b.train_the_other "The other text"
|
102
|
+
def method_missing(name, *args)
|
103
|
+
category = name.to_s.gsub(/(un)?train_(\w+)/, '\2').prepare_category_name
|
104
|
+
if @categories.key?(category)
|
105
|
+
args.each do |text|
|
106
|
+
if name.to_s.start_with?('untrain_')
|
107
|
+
untrain(category, text)
|
108
|
+
else
|
109
|
+
train(category, text)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
elsif name.to_s =~ /(un)?train_(\w+)/
|
113
|
+
raise StandardError, "No such category: #{category}"
|
114
|
+
else
|
115
|
+
super
|
116
|
+
end
|
117
|
+
end
|
109
118
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
119
|
+
#
|
120
|
+
# Provides a list of category names
|
121
|
+
# For example:
|
122
|
+
# b.categories
|
123
|
+
# => ['This', 'That', 'the_other']
|
124
|
+
def categories # :nodoc:
|
125
|
+
@categories.keys.collect(&:to_s)
|
126
|
+
end
|
118
127
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
alias append_category add_category
|
133
|
-
end
|
128
|
+
#
|
129
|
+
# Allows you to add categories to the classifier.
|
130
|
+
# For example:
|
131
|
+
# b.add_category "Not spam"
|
132
|
+
#
|
133
|
+
# WARNING: Adding categories to a trained classifier will
|
134
|
+
# result in an undertrained category that will tend to match
|
135
|
+
# more criteria than the trained selective categories. In short,
|
136
|
+
# try to initialize your categories at initialization.
|
137
|
+
def add_category(category)
|
138
|
+
@categories[category.prepare_category_name] = {}
|
139
|
+
end
|
134
140
|
|
141
|
+
alias append_category add_category
|
142
|
+
end
|
135
143
|
end
|
@@ -4,109 +4,103 @@
|
|
4
4
|
# These are extensions to the std-lib 'matrix' to allow an all ruby SVD
|
5
5
|
|
6
6
|
require 'matrix'
|
7
|
-
require 'mathn'
|
8
7
|
|
9
8
|
class Array
|
10
|
-
def
|
11
|
-
return identity unless size
|
9
|
+
def sum_with_identity(identity = 0.0, &block)
|
10
|
+
return identity unless size.to_i.positive?
|
12
11
|
|
13
12
|
if block_given?
|
14
|
-
map(&block).
|
13
|
+
map(&block).sum_with_identity(identity)
|
15
14
|
else
|
16
|
-
reduce(:+)
|
15
|
+
compact.reduce(:+).to_f || identity.to_f
|
17
16
|
end
|
18
17
|
end
|
19
18
|
end
|
20
19
|
|
21
|
-
|
20
|
+
module VectorExtensions
|
22
21
|
def magnitude
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
sum_of_squares = 0.to_r
|
23
|
+
size.times do |i|
|
24
|
+
sum_of_squares += self[i]**2.to_r
|
26
25
|
end
|
27
|
-
Math.sqrt(
|
26
|
+
Math.sqrt(sum_of_squares.to_f)
|
28
27
|
end
|
29
|
-
def normalize
|
30
|
-
nv = []
|
31
|
-
mag = self.magnitude
|
32
|
-
self.size.times do |i|
|
33
|
-
|
34
|
-
nv << (self[i] / mag)
|
35
28
|
|
29
|
+
def normalize
|
30
|
+
normalized_values = []
|
31
|
+
magnitude_value = magnitude.to_r
|
32
|
+
size.times do |i|
|
33
|
+
normalized_values << (self[i] / magnitude_value)
|
36
34
|
end
|
37
|
-
Vector[*
|
35
|
+
Vector[*normalized_values]
|
38
36
|
end
|
39
37
|
end
|
40
38
|
|
39
|
+
class Vector
|
40
|
+
include VectorExtensions
|
41
|
+
end
|
42
|
+
|
41
43
|
class Matrix
|
42
|
-
def
|
43
|
-
|
44
|
+
def self.diag(diagonal_elements)
|
45
|
+
Matrix.diagonal(*diagonal_elements)
|
44
46
|
end
|
45
47
|
|
46
|
-
alias
|
48
|
+
alias trans transpose
|
47
49
|
|
48
|
-
def SV_decomp(
|
49
|
-
if
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
50
|
+
def SV_decomp(max_sweeps = 20)
|
51
|
+
q_matrix = if row_size >= column_size
|
52
|
+
trans * self
|
53
|
+
else
|
54
|
+
self * trans
|
55
|
+
end
|
54
56
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
cnt = 0
|
60
|
-
s_old = nil
|
61
|
-
mu = nil
|
57
|
+
q_rotation_matrix = q_matrix.dup
|
58
|
+
v_matrix = Matrix.identity(q_matrix.row_size)
|
59
|
+
iteration_count = 0
|
60
|
+
previous_s_matrix = nil
|
62
61
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
62
|
+
loop do
|
63
|
+
iteration_count += 1
|
64
|
+
(0...q_rotation_matrix.row_size - 1).each do |row|
|
65
|
+
(1..q_rotation_matrix.row_size - 1).each do |col|
|
67
66
|
next if row == col
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
67
|
+
|
68
|
+
angle = Math.atan((2.to_r * q_rotation_matrix[row,
|
69
|
+
col]) / (q_rotation_matrix[row,
|
70
|
+
row] - q_rotation_matrix[col,
|
71
|
+
col])) / 2.0
|
72
|
+
cosine = Math.cos(angle)
|
73
|
+
sine = Math.sin(angle)
|
74
|
+
rotation_matrix = Matrix.identity(q_rotation_matrix.row_size)
|
75
|
+
rotation_matrix[row, row] = cosine
|
76
|
+
rotation_matrix[row, col] = -sine
|
77
|
+
rotation_matrix[col, row] = sine
|
78
|
+
rotation_matrix[col, col] = cosine
|
79
|
+
q_rotation_matrix = rotation_matrix.trans * q_rotation_matrix * rotation_matrix
|
80
|
+
v_matrix *= rotation_matrix
|
78
81
|
end
|
79
82
|
end
|
80
|
-
|
81
|
-
|
82
|
-
if
|
83
|
-
|
84
|
-
|
83
|
+
previous_s_matrix = q_rotation_matrix.dup if iteration_count == 1
|
84
|
+
sum_of_differences = 0.to_r
|
85
|
+
if iteration_count > 1
|
86
|
+
q_rotation_matrix.row_size.times do |r|
|
87
|
+
difference = (q_rotation_matrix[r, r] - previous_s_matrix[r, r]).abs
|
88
|
+
sum_of_differences += difference.to_r if difference > 0.001
|
85
89
|
end
|
86
|
-
|
90
|
+
previous_s_matrix = q_rotation_matrix.dup
|
87
91
|
end
|
88
|
-
break if (
|
89
|
-
end # of do while true
|
90
|
-
s = []
|
91
|
-
qrot.row_size.times do |r|
|
92
|
-
s << Math.sqrt(qrot[r,r])
|
92
|
+
break if (sum_of_differences <= 0.001 && iteration_count > 1) || iteration_count >= max_sweeps
|
93
93
|
end
|
94
|
-
#puts "cnt = #{cnt}"
|
95
|
-
if self.row_size >= self.column_size
|
96
|
-
mu = self * v * Matrix.diagonal(*s).inverse
|
97
|
-
return [mu, v, s]
|
98
|
-
else
|
99
|
-
puts v.row_size
|
100
|
-
puts v.column_size
|
101
|
-
puts self.row_size
|
102
|
-
puts self.column_size
|
103
|
-
puts s.size
|
104
94
|
|
105
|
-
|
106
|
-
|
95
|
+
singular_values = []
|
96
|
+
q_rotation_matrix.row_size.times do |r|
|
97
|
+
singular_values << Math.sqrt(q_rotation_matrix[r, r].to_f)
|
107
98
|
end
|
99
|
+
u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*singular_values).inverse
|
100
|
+
[u_matrix, v_matrix, singular_values]
|
108
101
|
end
|
109
|
-
|
110
|
-
|
102
|
+
|
103
|
+
def []=(row_index, col_index, value)
|
104
|
+
@rows[row_index][col_index] = value
|
111
105
|
end
|
112
106
|
end
|
@@ -1,20 +1,18 @@
|
|
1
1
|
module GSL
|
2
|
-
|
3
2
|
class Vector
|
4
|
-
def _dump(
|
5
|
-
Marshal.dump(
|
3
|
+
def _dump(_v)
|
4
|
+
Marshal.dump(to_a)
|
6
5
|
end
|
7
6
|
|
8
7
|
def self._load(arr)
|
9
8
|
arry = Marshal.load(arr)
|
10
|
-
|
9
|
+
GSL::Vector.alloc(arry)
|
11
10
|
end
|
12
|
-
|
13
11
|
end
|
14
12
|
|
15
13
|
class Matrix
|
16
|
-
|
17
|
-
|
18
|
-
|
14
|
+
class << self
|
15
|
+
alias diag diagonal
|
16
|
+
end
|
19
17
|
end
|
20
18
|
end
|