classifier 1.3.4 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/LICENSE +2 -2
- data/lib/classifier/bayes.rb +132 -124
- data/lib/classifier/extensions/string.rb +1 -1
- data/lib/classifier/extensions/vector.rb +72 -78
- data/lib/classifier/extensions/vector_serialize.rb +8 -10
- data/lib/classifier/extensions/word_hash.rb +114 -120
- data/lib/classifier/lsi/content_node.rb +39 -37
- data/lib/classifier/lsi/summary.rb +24 -24
- data/lib/classifier/lsi/word_list.rb +7 -8
- data/lib/classifier/lsi.rb +174 -151
- data/lib/classifier.rb +2 -1
- data/test/test_helper.rb +3 -2
- metadata +60 -27
- data/Gemfile +0 -5
- data/Gemfile.lock +0 -26
- data/README.markdown +0 -97
- data/Rakefile +0 -84
- data/test/bayes/bayesian_test.rb +0 -33
- data/test/extensions/word_hash_test.rb +0 -35
- data/test/lsi/lsi_test.rb +0 -123
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c20227e9f55f35fe93f2cbb6e5fc132127159973da74d7af68d1680a25021e73
|
4
|
+
data.tar.gz: ab8146db131a32b455b4e9b47dc3db7964e1e1a3bf9c69e953fdc6e6305eab89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2d024e1d46b3529f7e328110e8b59161e0f6fa0d738b3de0d973073974ad69fd7d3bfb529d5a1b827a444266c14f5dff37630af2e522522d07d74f18324064eb
|
7
|
+
data.tar.gz: 12fa59e6fc0f3e5ffb4fe1dc838b00ca529e630875729c78a3a0847bfdb6367856799c0f95831ecd75dbb36d0f4fad7fea2a6082083a965a5529d8bbf75a1542
|
data/LICENSE
CHANGED
@@ -146,7 +146,7 @@ such a program is covered only if its contents constitute a work based
|
|
146
146
|
on the Library (independent of the use of the Library in a tool for
|
147
147
|
writing it). Whether that is true depends on what the Library does
|
148
148
|
and what the program that uses the Library does.
|
149
|
-
|
149
|
+
|
150
150
|
1. You may copy and distribute verbatim copies of the Library's
|
151
151
|
complete source code as you receive it, in any medium, provided that
|
152
152
|
you conspicuously and appropriately publish on each copy an
|
@@ -426,4 +426,4 @@ the Free Software Foundation.
|
|
426
426
|
14. If you wish to incorporate parts of the Library into other free
|
427
427
|
programs whose distribution conditions are incompatible with these,
|
428
428
|
write to the author to ask for permission. For software which is
|
429
|
-
copyrighted by
|
429
|
+
copyrighted by
|
data/lib/classifier/bayes.rb
CHANGED
@@ -3,133 +3,141 @@
|
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
5
|
module Classifier
|
6
|
+
class Bayes
|
7
|
+
# The class can be created with one or more categories, each of which will be
|
8
|
+
# initialized and given a training method. E.g.,
|
9
|
+
# b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
10
|
+
def initialize(*categories)
|
11
|
+
@categories = {}
|
12
|
+
categories.each { |category| @categories[category.prepare_category_name] = {} }
|
13
|
+
@total_words = 0
|
14
|
+
@category_counts = Hash.new(0)
|
15
|
+
@category_word_count = Hash.new(0)
|
16
|
+
end
|
6
17
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
18
|
+
#
|
19
|
+
# Provides a general training method for all categories specified in Bayes#new
|
20
|
+
# For example:
|
21
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
22
|
+
# b.train :this, "This text"
|
23
|
+
# b.train "that", "That text"
|
24
|
+
# b.train "The other", "The other text"
|
25
|
+
def train(category, text)
|
26
|
+
category = category.prepare_category_name
|
27
|
+
@category_counts[category] += 1
|
28
|
+
text.word_hash.each do |word, count|
|
29
|
+
@categories[category][word] ||= 0
|
30
|
+
@categories[category][word] += count
|
31
|
+
@total_words += count
|
32
|
+
@category_word_count[category] += count
|
33
|
+
end
|
34
|
+
end
|
17
35
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
@total_words += count
|
32
|
-
end
|
33
|
-
end
|
36
|
+
#
|
37
|
+
# Provides an untraining method for all categories specified in Bayes#new
|
38
|
+
# Be very careful with this method.
|
39
|
+
#
|
40
|
+
# For example:
|
41
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
42
|
+
# b.train :this, "This text"
|
43
|
+
# b.untrain :this, "This text"
|
44
|
+
def untrain(category, text)
|
45
|
+
category = category.prepare_category_name
|
46
|
+
@category_counts[category] -= 1
|
47
|
+
text.word_hash.each do |word, count|
|
48
|
+
next unless @total_words >= 0
|
34
49
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
text.word_hash.each do |word, count|
|
47
|
-
if @total_words >= 0
|
48
|
-
orig = @categories[category][word]
|
49
|
-
@categories[category][word] ||= 0
|
50
|
-
@categories[category][word] -= count
|
51
|
-
if @categories[category][word] <= 0
|
52
|
-
@categories[category].delete(word)
|
53
|
-
count = orig
|
54
|
-
end
|
55
|
-
@total_words -= count
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
#
|
61
|
-
# Returns the scores in each category the provided +text+. E.g.,
|
62
|
-
# b.classifications "I hate bad words and you"
|
63
|
-
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
64
|
-
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
65
|
-
def classifications(text)
|
66
|
-
score = Hash.new
|
67
|
-
training_count = @category_counts.values.inject { |x,y| x+y }.to_f
|
68
|
-
@categories.each do |category, category_words|
|
69
|
-
score[category.to_s] = 0
|
70
|
-
total = category_words.values.inject(0) {|sum, element| sum+element}
|
71
|
-
text.word_hash.each do |word, count|
|
72
|
-
s = category_words.has_key?(word) ? category_words[word] : 0.1
|
73
|
-
score[category.to_s] += Math.log(s/total.to_f)
|
74
|
-
end
|
75
|
-
# now add prior probability for the category
|
76
|
-
s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
|
77
|
-
score[category.to_s] += Math.log(s / training_count)
|
78
|
-
end
|
79
|
-
return score
|
80
|
-
end
|
50
|
+
orig = @categories[category][word] || 0
|
51
|
+
@categories[category][word] ||= 0
|
52
|
+
@categories[category][word] -= count
|
53
|
+
if @categories[category][word] <= 0
|
54
|
+
@categories[category].delete(word)
|
55
|
+
count = orig
|
56
|
+
end
|
57
|
+
@category_word_count[category] -= count if @category_word_count[category] >= count
|
58
|
+
@total_words -= count
|
59
|
+
end
|
60
|
+
end
|
81
61
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
62
|
+
#
|
63
|
+
# Returns the scores in each category for the provided +text+. E.g.,
|
64
|
+
# b.classifications "I hate bad words and you"
|
65
|
+
# => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
|
66
|
+
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
67
|
+
def classifications(text)
|
68
|
+
score = {}
|
69
|
+
word_hash = text.word_hash
|
70
|
+
training_count = @category_counts.values.inject { |x, y| x + y }.to_f
|
71
|
+
@categories.each do |category, category_words|
|
72
|
+
score[category.to_s] = 0
|
73
|
+
total = (@category_word_count[category] || 1).to_f
|
74
|
+
word_hash.each_key do |word|
|
75
|
+
s = category_words.key?(word) ? category_words[word] : 0.1
|
76
|
+
score[category.to_s] += Math.log(s / total)
|
77
|
+
end
|
78
|
+
# now add prior probability for the category
|
79
|
+
s = @category_counts.key?(category) ? @category_counts[category] : 0.1
|
80
|
+
score[category.to_s] += Math.log(s / training_count)
|
81
|
+
end
|
82
|
+
score
|
83
|
+
end
|
84
|
+
|
85
|
+
#
|
86
|
+
# Returns the classification of the provided +text+, which is one of the
|
87
|
+
# categories given in the initializer. E.g.,
|
88
|
+
# b.classify "I hate bad words and you"
|
89
|
+
# => 'Uninteresting'
|
90
|
+
def classify(text)
|
91
|
+
(classifications(text).sort_by { |a| -a[1] })[0][0]
|
92
|
+
end
|
93
|
+
|
94
|
+
#
|
95
|
+
# Provides training and untraining methods for the categories specified in Bayes#new
|
96
|
+
# For example:
|
97
|
+
# b = Classifier::Bayes.new 'This', 'That', 'the_other'
|
98
|
+
# b.train_this "This text"
|
99
|
+
# b.train_that "That text"
|
100
|
+
# b.untrain_that "That text"
|
101
|
+
# b.train_the_other "The other text"
|
102
|
+
def method_missing(name, *args)
|
103
|
+
category = name.to_s.gsub(/(un)?train_(\w+)/, '\2').prepare_category_name
|
104
|
+
if @categories.key?(category)
|
105
|
+
args.each do |text|
|
106
|
+
if name.to_s.start_with?('untrain_')
|
107
|
+
untrain(category, text)
|
108
|
+
else
|
109
|
+
train(category, text)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
elsif name.to_s =~ /(un)?train_(\w+)/
|
113
|
+
raise StandardError, "No such category: #{category}"
|
114
|
+
else
|
115
|
+
super
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
#
|
120
|
+
# Provides a list of category names
|
121
|
+
# For example:
|
122
|
+
# b.categories
|
123
|
+
# => ['This', 'That', 'the_other']
|
124
|
+
def categories # :nodoc:
|
125
|
+
@categories.keys.collect(&:to_s)
|
126
|
+
end
|
127
|
+
|
128
|
+
#
|
129
|
+
# Allows you to add categories to the classifier.
|
130
|
+
# For example:
|
131
|
+
# b.add_category "Not spam"
|
132
|
+
#
|
133
|
+
# WARNING: Adding categories to a trained classifier will
|
134
|
+
# result in an undertrained category that will tend to match
|
135
|
+
# more criteria than the trained selective categories. In short,
|
136
|
+
# try to initialize your categories at initialization.
|
137
|
+
def add_category(category)
|
138
|
+
@categories[category.prepare_category_name] = {}
|
139
|
+
end
|
134
140
|
|
141
|
+
alias append_category add_category
|
142
|
+
end
|
135
143
|
end
|
data/lib/classifier/extensions/vector.rb
CHANGED
@@ -1,112 +1,106 @@
|
|
1
1
|
# Author:: Ernest Ellingson
|
2
|
-
# Copyright:: Copyright (c) 2005
|
2
|
+
# Copyright:: Copyright (c) 2005
|
3
3
|
|
4
4
|
# These are extensions to the std-lib 'matrix' to allow an all ruby SVD
|
5
5
|
|
6
6
|
require 'matrix'
|
7
|
-
require 'mathn'
|
8
7
|
|
9
8
|
class Array
|
10
|
-
def
|
11
|
-
return identity unless size
|
12
|
-
|
9
|
+
def sum_with_identity(identity = 0.0, &block)
|
10
|
+
return identity unless size.to_i.positive?
|
11
|
+
|
13
12
|
if block_given?
|
14
|
-
map(&block).
|
13
|
+
map(&block).sum_with_identity(identity)
|
15
14
|
else
|
16
|
-
reduce(:+)
|
15
|
+
compact.reduce(:+).to_f || identity.to_f
|
17
16
|
end
|
18
17
|
end
|
19
18
|
end
|
20
19
|
|
21
|
-
|
20
|
+
module VectorExtensions
|
22
21
|
def magnitude
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
sum_of_squares = 0.to_r
|
23
|
+
size.times do |i|
|
24
|
+
sum_of_squares += self[i]**2.to_r
|
26
25
|
end
|
27
|
-
Math.sqrt(
|
26
|
+
Math.sqrt(sum_of_squares.to_f)
|
28
27
|
end
|
29
|
-
def normalize
|
30
|
-
nv = []
|
31
|
-
mag = self.magnitude
|
32
|
-
self.size.times do |i|
|
33
|
-
|
34
|
-
nv << (self[i] / mag)
|
35
28
|
|
29
|
+
def normalize
|
30
|
+
normalized_values = []
|
31
|
+
magnitude_value = magnitude.to_r
|
32
|
+
size.times do |i|
|
33
|
+
normalized_values << (self[i] / magnitude_value)
|
36
34
|
end
|
37
|
-
Vector[*
|
35
|
+
Vector[*normalized_values]
|
38
36
|
end
|
39
37
|
end
|
40
38
|
|
39
|
+
class Vector
|
40
|
+
include VectorExtensions
|
41
|
+
end
|
42
|
+
|
41
43
|
class Matrix
|
42
|
-
def
|
43
|
-
|
44
|
+
def self.diag(diagonal_elements)
|
45
|
+
Matrix.diagonal(*diagonal_elements)
|
44
46
|
end
|
45
|
-
|
46
|
-
alias :trans :transpose
|
47
47
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
v = Matrix.identity(q.row_size)
|
57
|
-
azrot = nil
|
58
|
-
mzrot = nil
|
59
|
-
cnt = 0
|
60
|
-
s_old = nil
|
61
|
-
mu = nil
|
48
|
+
alias trans transpose
|
49
|
+
|
50
|
+
def SV_decomp(max_sweeps = 20)
|
51
|
+
q_matrix = if row_size >= column_size
|
52
|
+
trans * self
|
53
|
+
else
|
54
|
+
self * trans
|
55
|
+
end
|
62
56
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
57
|
+
q_rotation_matrix = q_matrix.dup
|
58
|
+
v_matrix = Matrix.identity(q_matrix.row_size)
|
59
|
+
iteration_count = 0
|
60
|
+
previous_s_matrix = nil
|
61
|
+
|
62
|
+
loop do
|
63
|
+
iteration_count += 1
|
64
|
+
(0...q_rotation_matrix.row_size - 1).each do |row|
|
65
|
+
(1..q_rotation_matrix.row_size - 1).each do |col|
|
67
66
|
next if row == col
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
67
|
+
|
68
|
+
angle = Math.atan((2.to_r * q_rotation_matrix[row,
|
69
|
+
col]) / (q_rotation_matrix[row,
|
70
|
+
row] - q_rotation_matrix[col,
|
71
|
+
col])) / 2.0
|
72
|
+
cosine = Math.cos(angle)
|
73
|
+
sine = Math.sin(angle)
|
74
|
+
rotation_matrix = Matrix.identity(q_rotation_matrix.row_size)
|
75
|
+
rotation_matrix[row, row] = cosine
|
76
|
+
rotation_matrix[row, col] = -sine
|
77
|
+
rotation_matrix[col, row] = sine
|
78
|
+
rotation_matrix[col, col] = cosine
|
79
|
+
q_rotation_matrix = rotation_matrix.trans * q_rotation_matrix * rotation_matrix
|
80
|
+
v_matrix *= rotation_matrix
|
81
|
+
end
|
79
82
|
end
|
80
|
-
|
81
|
-
|
82
|
-
if
|
83
|
-
|
84
|
-
|
83
|
+
previous_s_matrix = q_rotation_matrix.dup if iteration_count == 1
|
84
|
+
sum_of_differences = 0.to_r
|
85
|
+
if iteration_count > 1
|
86
|
+
q_rotation_matrix.row_size.times do |r|
|
87
|
+
difference = (q_rotation_matrix[r, r] - previous_s_matrix[r, r]).abs
|
88
|
+
sum_of_differences += difference.to_r if difference > 0.001
|
85
89
|
end
|
86
|
-
|
87
|
-
end
|
88
|
-
break if (
|
89
|
-
end # of do while true
|
90
|
-
s = []
|
91
|
-
qrot.row_size.times do |r|
|
92
|
-
s << Math.sqrt(qrot[r,r])
|
90
|
+
previous_s_matrix = q_rotation_matrix.dup
|
91
|
+
end
|
92
|
+
break if (sum_of_differences <= 0.001 && iteration_count > 1) || iteration_count >= max_sweeps
|
93
93
|
end
|
94
|
-
#puts "cnt = #{cnt}"
|
95
|
-
if self.row_size >= self.column_size
|
96
|
-
mu = self * v * Matrix.diagonal(*s).inverse
|
97
|
-
return [mu, v, s]
|
98
|
-
else
|
99
|
-
puts v.row_size
|
100
|
-
puts v.column_size
|
101
|
-
puts self.row_size
|
102
|
-
puts self.column_size
|
103
|
-
puts s.size
|
104
94
|
|
105
|
-
|
106
|
-
|
95
|
+
singular_values = []
|
96
|
+
q_rotation_matrix.row_size.times do |r|
|
97
|
+
singular_values << Math.sqrt(q_rotation_matrix[r, r].to_f)
|
107
98
|
end
|
99
|
+
u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*singular_values).inverse
|
100
|
+
[u_matrix, v_matrix, singular_values]
|
108
101
|
end
|
109
|
-
|
110
|
-
|
102
|
+
|
103
|
+
def []=(row_index, col_index, value)
|
104
|
+
@rows[row_index][col_index] = value
|
111
105
|
end
|
112
106
|
end
|
data/lib/classifier/extensions/vector_serialize.rb
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
module GSL
|
2
|
-
|
3
2
|
class Vector
|
4
|
-
def _dump(
|
5
|
-
Marshal.dump(
|
3
|
+
def _dump(_v)
|
4
|
+
Marshal.dump(to_a)
|
6
5
|
end
|
7
|
-
|
6
|
+
|
8
7
|
def self._load(arr)
|
9
8
|
arry = Marshal.load(arr)
|
10
|
-
|
9
|
+
GSL::Vector.alloc(arry)
|
11
10
|
end
|
12
|
-
|
13
11
|
end
|
14
|
-
|
12
|
+
|
15
13
|
class Matrix
|
16
|
-
|
17
|
-
|
18
|
-
|
14
|
+
class << self
|
15
|
+
alias diag diagonal
|
16
|
+
end
|
19
17
|
end
|
20
18
|
end
|