classifier 1.3.5 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1d453b4ca9e0a0c44a2b8d3c9ef55db5b55a17efe5ff0cfbcab93f24965cd536
4
- data.tar.gz: 26f5d9595ddd35c8d7c239f946afa8251a4b68f24f5b8bc41e30be13ded60547
3
+ metadata.gz: c20227e9f55f35fe93f2cbb6e5fc132127159973da74d7af68d1680a25021e73
4
+ data.tar.gz: ab8146db131a32b455b4e9b47dc3db7964e1e1a3bf9c69e953fdc6e6305eab89
5
5
  SHA512:
6
- metadata.gz: c13c1666c7d2fe92d47ab7ced0a885fec7b13719aea25f283a013d5015744d6aea7d473706af54042972ba687b214c2a3a619f58d75560d90e57c3569d38f957
7
- data.tar.gz: 23261ba6708307ecf6faac636d8572c50720fbf9a2e6db6ee736a07ce3de445daa431afb3bbf2c49dd4d2bd699327ca1c419029b4972236c52a9a5c1f00ab5a2
6
+ metadata.gz: 2d024e1d46b3529f7e328110e8b59161e0f6fa0d738b3de0d973073974ad69fd7d3bfb529d5a1b827a444266c14f5dff37630af2e522522d07d74f18324064eb
7
+ data.tar.gz: 12fa59e6fc0f3e5ffb4fe1dc838b00ca529e630875729c78a3a0847bfdb6367856799c0f95831ecd75dbb36d0f4fad7fea2a6082083a965a5529d8bbf75a1542
@@ -3,133 +3,141 @@
3
3
  # License:: LGPL
4
4
 
5
5
  module Classifier
6
+ class Bayes
7
+ # The class can be created with one or more categories, each of which will be
8
+ # initialized and given a training method. E.g.,
9
+ # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
10
+ def initialize(*categories)
11
+ @categories = {}
12
+ categories.each { |category| @categories[category.prepare_category_name] = {} }
13
+ @total_words = 0
14
+ @category_counts = Hash.new(0)
15
+ @category_word_count = Hash.new(0)
16
+ end
6
17
 
7
- class Bayes
8
- # The class can be created with one or more categories, each of which will be
9
- # initialized and given a training method. E.g.,
10
- # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
11
- def initialize(*categories)
12
- @categories = Hash.new
13
- categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
14
- @total_words = 0
15
- @category_counts = Hash.new(0)
16
- end
18
+ #
19
+ # Provides a general training method for all categories specified in Bayes#new
20
+ # For example:
21
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
22
+ # b.train :this, "This text"
23
+ # b.train "that", "That text"
24
+ # b.train "The other", "The other text"
25
+ def train(category, text)
26
+ category = category.prepare_category_name
27
+ @category_counts[category] += 1
28
+ text.word_hash.each do |word, count|
29
+ @categories[category][word] ||= 0
30
+ @categories[category][word] += count
31
+ @total_words += count
32
+ @category_word_count[category] += count
33
+ end
34
+ end
17
35
 
18
- #
19
- # Provides a general training method for all categories specified in Bayes#new
20
- # For example:
21
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
22
- # b.train :this, "This text"
23
- # b.train "that", "That text"
24
- # b.train "The other", "The other text"
25
- def train(category, text)
26
- category = category.prepare_category_name
27
- @category_counts[category] += 1
28
- text.word_hash.each do |word, count|
29
- @categories[category][word] ||= 0
30
- @categories[category][word] += count
31
- @total_words += count
32
- end
33
- end
36
+ #
37
+ # Provides a untraining method for all categories specified in Bayes#new
38
+ # Be very careful with this method.
39
+ #
40
+ # For example:
41
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
42
+ # b.train :this, "This text"
43
+ # b.untrain :this, "This text"
44
+ def untrain(category, text)
45
+ category = category.prepare_category_name
46
+ @category_counts[category] -= 1
47
+ text.word_hash.each do |word, count|
48
+ next unless @total_words >= 0
34
49
 
35
- #
36
- # Provides a untraining method for all categories specified in Bayes#new
37
- # Be very careful with this method.
38
- #
39
- # For example:
40
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
41
- # b.train :this, "This text"
42
- # b.untrain :this, "This text"
43
- def untrain(category, text)
44
- category = category.prepare_category_name
45
- @category_counts[category] -= 1
46
- text.word_hash.each do |word, count|
47
- if @total_words >= 0
48
- orig = @categories[category][word]
49
- @categories[category][word] ||= 0
50
- @categories[category][word] -= count
51
- if @categories[category][word] <= 0
52
- @categories[category].delete(word)
53
- count = orig
54
- end
55
- @total_words -= count
56
- end
57
- end
58
- end
50
+ orig = @categories[category][word] || 0
51
+ @categories[category][word] ||= 0
52
+ @categories[category][word] -= count
53
+ if @categories[category][word] <= 0
54
+ @categories[category].delete(word)
55
+ count = orig
56
+ end
57
+ @category_word_count[category] -= count if @category_word_count[category] >= count
58
+ @total_words -= count
59
+ end
60
+ end
59
61
 
60
- #
61
- # Returns the scores in each category the provided +text+. E.g.,
62
- # b.classifications "I hate bad words and you"
63
- # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
64
- # The largest of these scores (the one closest to 0) is the one picked out by #classify
65
- def classifications(text)
66
- score = Hash.new
67
- training_count = @category_counts.values.inject { |x,y| x+y }.to_f
68
- @categories.each do |category, category_words|
69
- score[category.to_s] = 0
70
- total = category_words.values.inject(0) {|sum, element| sum+element}
71
- text.word_hash.each do |word, count|
72
- s = category_words.has_key?(word) ? category_words[word] : 0.1
73
- score[category.to_s] += Math.log(s/total.to_f)
74
- end
75
- # now add prior probability for the category
76
- s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
77
- score[category.to_s] += Math.log(s / training_count)
78
- end
79
- return score
80
- end
62
+ #
63
+ # Returns the scores in each category the provided +text+. E.g.,
64
+ # b.classifications "I hate bad words and you"
65
+ # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
66
+ # The largest of these scores (the one closest to 0) is the one picked out by #classify
67
+ def classifications(text)
68
+ score = {}
69
+ word_hash = text.word_hash
70
+ training_count = @category_counts.values.inject { |x, y| x + y }.to_f
71
+ @categories.each do |category, category_words|
72
+ score[category.to_s] = 0
73
+ total = (@category_word_count[category] || 1).to_f
74
+ word_hash.each_key do |word|
75
+ s = category_words.key?(word) ? category_words[word] : 0.1
76
+ score[category.to_s] += Math.log(s / total)
77
+ end
78
+ # now add prior probability for the category
79
+ s = @category_counts.key?(category) ? @category_counts[category] : 0.1
80
+ score[category.to_s] += Math.log(s / training_count)
81
+ end
82
+ score
83
+ end
81
84
 
82
- #
83
- # Returns the classification of the provided +text+, which is one of the
84
- # categories given in the initializer. E.g.,
85
- # b.classify "I hate bad words and you"
86
- # => 'Uninteresting'
87
- def classify(text)
88
- (classifications(text).sort_by { |a| -a[1] })[0][0]
89
- end
85
+ #
86
+ # Returns the classification of the provided +text+, which is one of the
87
+ # categories given in the initializer. E.g.,
88
+ # b.classify "I hate bad words and you"
89
+ # => 'Uninteresting'
90
+ def classify(text)
91
+ (classifications(text).sort_by { |a| -a[1] })[0][0]
92
+ end
90
93
 
91
- #
92
- # Provides training and untraining methods for the categories specified in Bayes#new
93
- # For example:
94
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
95
- # b.train_this "This text"
96
- # b.train_that "That text"
97
- # b.untrain_that "That text"
98
- # b.train_the_other "The other text"
99
- def method_missing(name, *args)
100
- category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
101
- if @categories.has_key? category
102
- args.each { |text| eval("#{$1}train(category, text)") }
103
- elsif name.to_s =~ /(un)?train_([\w]+)/
104
- raise StandardError, "No such category: #{category}"
105
- else
106
- super #raise StandardError, "No such method: #{name}"
107
- end
108
- end
94
+ #
95
+ # Provides training and untraining methods for the categories specified in Bayes#new
96
+ # For example:
97
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
98
+ # b.train_this "This text"
99
+ # b.train_that "That text"
100
+ # b.untrain_that "That text"
101
+ # b.train_the_other "The other text"
102
+ def method_missing(name, *args)
103
+ category = name.to_s.gsub(/(un)?train_(\w+)/, '\2').prepare_category_name
104
+ if @categories.key?(category)
105
+ args.each do |text|
106
+ if name.to_s.start_with?('untrain_')
107
+ untrain(category, text)
108
+ else
109
+ train(category, text)
110
+ end
111
+ end
112
+ elsif name.to_s =~ /(un)?train_(\w+)/
113
+ raise StandardError, "No such category: #{category}"
114
+ else
115
+ super
116
+ end
117
+ end
109
118
 
110
- #
111
- # Provides a list of category names
112
- # For example:
113
- # b.categories
114
- # => ['This', 'That', 'the_other']
115
- def categories # :nodoc:
116
- @categories.keys.collect {|c| c.to_s}
117
- end
119
+ #
120
+ # Provides a list of category names
121
+ # For example:
122
+ # b.categories
123
+ # => ['This', 'That', 'the_other']
124
+ def categories # :nodoc:
125
+ @categories.keys.collect(&:to_s)
126
+ end
118
127
 
119
- #
120
- # Allows you to add categories to the classifier.
121
- # For example:
122
- # b.add_category "Not spam"
123
- #
124
- # WARNING: Adding categories to a trained classifier will
125
- # result in an undertrained category that will tend to match
126
- # more criteria than the trained selective categories. In short,
127
- # try to initialize your categories at initialization.
128
- def add_category(category)
129
- @categories[category.prepare_category_name] = Hash.new
130
- end
131
-
132
- alias append_category add_category
133
- end
128
+ #
129
+ # Allows you to add categories to the classifier.
130
+ # For example:
131
+ # b.add_category "Not spam"
132
+ #
133
+ # WARNING: Adding categories to a trained classifier will
134
+ # result in an undertrained category that will tend to match
135
+ # more criteria than the trained selective categories. In short,
136
+ # try to initialize your categories at initialization.
137
+ def add_category(category)
138
+ @categories[category.prepare_category_name] = {}
139
+ end
134
140
 
141
+ alias append_category add_category
142
+ end
135
143
  end
@@ -6,5 +6,5 @@ require 'fast_stemmer'
6
6
  require 'classifier/extensions/word_hash'
7
7
 
8
8
  class Object
9
- def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
9
+ def prepare_category_name = to_s.gsub('_', ' ').capitalize.intern
10
10
  end
@@ -4,109 +4,103 @@
4
4
  # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
5
 
6
6
  require 'matrix'
7
- require 'mathn'
8
7
 
9
8
  class Array
10
- def sum(identity = 0, &block)
11
- return identity unless size > 0
9
+ def sum_with_identity(identity = 0.0, &block)
10
+ return identity unless size.to_i.positive?
12
11
 
13
12
  if block_given?
14
- map(&block).sum
13
+ map(&block).sum_with_identity(identity)
15
14
  else
16
- reduce(:+)
15
+ compact.reduce(:+).to_f || identity.to_f
17
16
  end
18
17
  end
19
18
  end
20
19
 
21
- class Vector
20
+ module VectorExtensions
22
21
  def magnitude
23
- sumsqs = 0.0
24
- self.size.times do |i|
25
- sumsqs += self[i] ** 2.0
22
+ sum_of_squares = 0.to_r
23
+ size.times do |i|
24
+ sum_of_squares += self[i]**2.to_r
26
25
  end
27
- Math.sqrt(sumsqs)
26
+ Math.sqrt(sum_of_squares.to_f)
28
27
  end
29
- def normalize
30
- nv = []
31
- mag = self.magnitude
32
- self.size.times do |i|
33
-
34
- nv << (self[i] / mag)
35
28
 
29
+ def normalize
30
+ normalized_values = []
31
+ magnitude_value = magnitude.to_r
32
+ size.times do |i|
33
+ normalized_values << (self[i] / magnitude_value)
36
34
  end
37
- Vector[*nv]
35
+ Vector[*normalized_values]
38
36
  end
39
37
  end
40
38
 
39
+ class Vector
40
+ include VectorExtensions
41
+ end
42
+
41
43
  class Matrix
42
- def Matrix.diag(s)
43
- Matrix.diagonal(*s)
44
+ def self.diag(diagonal_elements)
45
+ Matrix.diagonal(*diagonal_elements)
44
46
  end
45
47
 
46
- alias :trans :transpose
48
+ alias trans transpose
47
49
 
48
- def SV_decomp(maxSweeps = 20)
49
- if self.row_size >= self.column_size
50
- q = self.trans * self
51
- else
52
- q = self * self.trans
53
- end
50
+ def SV_decomp(max_sweeps = 20)
51
+ q_matrix = if row_size >= column_size
52
+ trans * self
53
+ else
54
+ self * trans
55
+ end
54
56
 
55
- qrot = q.dup
56
- v = Matrix.identity(q.row_size)
57
- azrot = nil
58
- mzrot = nil
59
- cnt = 0
60
- s_old = nil
61
- mu = nil
57
+ q_rotation_matrix = q_matrix.dup
58
+ v_matrix = Matrix.identity(q_matrix.row_size)
59
+ iteration_count = 0
60
+ previous_s_matrix = nil
62
61
 
63
- while true do
64
- cnt += 1
65
- for row in (0...qrot.row_size-1) do
66
- for col in (1..qrot.row_size-1) do
62
+ loop do
63
+ iteration_count += 1
64
+ (0...q_rotation_matrix.row_size - 1).each do |row|
65
+ (1..q_rotation_matrix.row_size - 1).each do |col|
67
66
  next if row == col
68
- h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
69
- hcos = Math.cos(h)
70
- hsin = Math.sin(h)
71
- mzrot = Matrix.identity(qrot.row_size)
72
- mzrot[row,row] = hcos
73
- mzrot[row,col] = -hsin
74
- mzrot[col,row] = hsin
75
- mzrot[col,col] = hcos
76
- qrot = mzrot.trans * qrot * mzrot
77
- v = v * mzrot
67
+
68
+ angle = Math.atan((2.to_r * q_rotation_matrix[row,
69
+ col]) / (q_rotation_matrix[row,
70
+ row] - q_rotation_matrix[col,
71
+ col])) / 2.0
72
+ cosine = Math.cos(angle)
73
+ sine = Math.sin(angle)
74
+ rotation_matrix = Matrix.identity(q_rotation_matrix.row_size)
75
+ rotation_matrix[row, row] = cosine
76
+ rotation_matrix[row, col] = -sine
77
+ rotation_matrix[col, row] = sine
78
+ rotation_matrix[col, col] = cosine
79
+ q_rotation_matrix = rotation_matrix.trans * q_rotation_matrix * rotation_matrix
80
+ v_matrix *= rotation_matrix
78
81
  end
79
82
  end
80
- s_old = qrot.dup if cnt == 1
81
- sum_qrot = 0.0
82
- if cnt > 1
83
- qrot.row_size.times do |r|
84
- sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
83
+ previous_s_matrix = q_rotation_matrix.dup if iteration_count == 1
84
+ sum_of_differences = 0.to_r
85
+ if iteration_count > 1
86
+ q_rotation_matrix.row_size.times do |r|
87
+ difference = (q_rotation_matrix[r, r] - previous_s_matrix[r, r]).abs
88
+ sum_of_differences += difference.to_r if difference > 0.001
85
89
  end
86
- s_old = qrot.dup
90
+ previous_s_matrix = q_rotation_matrix.dup
87
91
  end
88
- break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
89
- end # of do while true
90
- s = []
91
- qrot.row_size.times do |r|
92
- s << Math.sqrt(qrot[r,r])
92
+ break if (sum_of_differences <= 0.001 && iteration_count > 1) || iteration_count >= max_sweeps
93
93
  end
94
- #puts "cnt = #{cnt}"
95
- if self.row_size >= self.column_size
96
- mu = self * v * Matrix.diagonal(*s).inverse
97
- return [mu, v, s]
98
- else
99
- puts v.row_size
100
- puts v.column_size
101
- puts self.row_size
102
- puts self.column_size
103
- puts s.size
104
94
 
105
- mu = (self.trans * v * Matrix.diagonal(*s).inverse)
106
- return [mu, v, s]
95
+ singular_values = []
96
+ q_rotation_matrix.row_size.times do |r|
97
+ singular_values << Math.sqrt(q_rotation_matrix[r, r].to_f)
107
98
  end
99
+ u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*singular_values).inverse
100
+ [u_matrix, v_matrix, singular_values]
108
101
  end
109
- def []=(i,j,val)
110
- @rows[i][j] = val
102
+
103
+ def []=(row_index, col_index, value)
104
+ @rows[row_index][col_index] = value
111
105
  end
112
106
  end
@@ -1,20 +1,18 @@
1
1
  module GSL
2
-
3
2
  class Vector
4
- def _dump(v)
5
- Marshal.dump( self.to_a )
3
+ def _dump(_v)
4
+ Marshal.dump(to_a)
6
5
  end
7
6
 
8
7
  def self._load(arr)
9
8
  arry = Marshal.load(arr)
10
- return GSL::Vector.alloc(arry)
9
+ GSL::Vector.alloc(arry)
11
10
  end
12
-
13
11
  end
14
12
 
15
13
  class Matrix
16
- class <<self
17
- alias :diag :diagonal
18
- end
14
+ class << self
15
+ alias diag diagonal
16
+ end
19
17
  end
20
18
  end