classifier 1.3.4 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 3c53668ddd328fb78862c67723b185df9c2aa717
4
- data.tar.gz: 3655405d082fdd8a01e4ca893a70360ca9f62322
2
+ SHA256:
3
+ metadata.gz: c20227e9f55f35fe93f2cbb6e5fc132127159973da74d7af68d1680a25021e73
4
+ data.tar.gz: ab8146db131a32b455b4e9b47dc3db7964e1e1a3bf9c69e953fdc6e6305eab89
5
5
  SHA512:
6
- metadata.gz: 40b7395e2f04f56bdbabb49a4d0013dba36e9c1325ae66e5bff92451059c5b559677aaea30e50f8f2fbbae58e50bf0f084925ef38e0e3d3fb729e37e357469d4
7
- data.tar.gz: 150f8f387706d870a37e86b0418c5e68ad386b82518294bdf21585ab3509fd98515648bc5e06dfb78b97f1e544099fe1da5ddcd69413826e0ccc39780d457940
6
+ metadata.gz: 2d024e1d46b3529f7e328110e8b59161e0f6fa0d738b3de0d973073974ad69fd7d3bfb529d5a1b827a444266c14f5dff37630af2e522522d07d74f18324064eb
7
+ data.tar.gz: 12fa59e6fc0f3e5ffb4fe1dc838b00ca529e630875729c78a3a0847bfdb6367856799c0f95831ecd75dbb36d0f4fad7fea2a6082083a965a5529d8bbf75a1542
data/LICENSE CHANGED
@@ -146,7 +146,7 @@ such a program is covered only if its contents constitute a work based
146
146
  on the Library (independent of the use of the Library in a tool for
147
147
  writing it). Whether that is true depends on what the Library does
148
148
  and what the program that uses the Library does.
149
-
149
+
150
150
  1. You may copy and distribute verbatim copies of the Library's
151
151
  complete source code as you receive it, in any medium, provided that
152
152
  you conspicuously and appropriately publish on each copy an
@@ -426,4 +426,4 @@ the Free Software Foundation.
426
426
  14. If you wish to incorporate parts of the Library into other free
427
427
  programs whose distribution conditions are incompatible with these,
428
428
  write to the author to ask for permission. For software which is
429
- copyrighted by
429
+ copyrighted by
@@ -3,133 +3,141 @@
3
3
  # License:: LGPL
4
4
 
5
5
  module Classifier
6
+ class Bayes
7
+ # The class can be created with one or more categories, each of which will be
8
+ # initialized and given a training method. E.g.,
9
+ # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
10
+ def initialize(*categories)
11
+ @categories = {}
12
+ categories.each { |category| @categories[category.prepare_category_name] = {} }
13
+ @total_words = 0
14
+ @category_counts = Hash.new(0)
15
+ @category_word_count = Hash.new(0)
16
+ end
6
17
 
7
- class Bayes
8
- # The class can be created with one or more categories, each of which will be
9
- # initialized and given a training method. E.g.,
10
- # b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
11
- def initialize(*categories)
12
- @categories = Hash.new
13
- categories.each { |category| @categories[category.prepare_category_name] = Hash.new }
14
- @total_words = 0
15
- @category_counts = Hash.new(0)
16
- end
18
+ #
19
+ # Provides a general training method for all categories specified in Bayes#new
20
+ # For example:
21
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
22
+ # b.train :this, "This text"
23
+ # b.train "that", "That text"
24
+ # b.train "The other", "The other text"
25
+ def train(category, text)
26
+ category = category.prepare_category_name
27
+ @category_counts[category] += 1
28
+ text.word_hash.each do |word, count|
29
+ @categories[category][word] ||= 0
30
+ @categories[category][word] += count
31
+ @total_words += count
32
+ @category_word_count[category] += count
33
+ end
34
+ end
17
35
 
18
- #
19
- # Provides a general training method for all categories specified in Bayes#new
20
- # For example:
21
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
22
- # b.train :this, "This text"
23
- # b.train "that", "That text"
24
- # b.train "The other", "The other text"
25
- def train(category, text)
26
- category = category.prepare_category_name
27
- @category_counts[category] += 1
28
- text.word_hash.each do |word, count|
29
- @categories[category][word] ||= 0
30
- @categories[category][word] += count
31
- @total_words += count
32
- end
33
- end
36
+ #
37
+ # Provides a untraining method for all categories specified in Bayes#new
38
+ # Be very careful with this method.
39
+ #
40
+ # For example:
41
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
42
+ # b.train :this, "This text"
43
+ # b.untrain :this, "This text"
44
+ def untrain(category, text)
45
+ category = category.prepare_category_name
46
+ @category_counts[category] -= 1
47
+ text.word_hash.each do |word, count|
48
+ next unless @total_words >= 0
34
49
 
35
- #
36
- # Provides a untraining method for all categories specified in Bayes#new
37
- # Be very careful with this method.
38
- #
39
- # For example:
40
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
41
- # b.train :this, "This text"
42
- # b.untrain :this, "This text"
43
- def untrain(category, text)
44
- category = category.prepare_category_name
45
- @category_counts[category] -= 1
46
- text.word_hash.each do |word, count|
47
- if @total_words >= 0
48
- orig = @categories[category][word]
49
- @categories[category][word] ||= 0
50
- @categories[category][word] -= count
51
- if @categories[category][word] <= 0
52
- @categories[category].delete(word)
53
- count = orig
54
- end
55
- @total_words -= count
56
- end
57
- end
58
- end
59
-
60
- #
61
- # Returns the scores in each category the provided +text+. E.g.,
62
- # b.classifications "I hate bad words and you"
63
- # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
64
- # The largest of these scores (the one closest to 0) is the one picked out by #classify
65
- def classifications(text)
66
- score = Hash.new
67
- training_count = @category_counts.values.inject { |x,y| x+y }.to_f
68
- @categories.each do |category, category_words|
69
- score[category.to_s] = 0
70
- total = category_words.values.inject(0) {|sum, element| sum+element}
71
- text.word_hash.each do |word, count|
72
- s = category_words.has_key?(word) ? category_words[word] : 0.1
73
- score[category.to_s] += Math.log(s/total.to_f)
74
- end
75
- # now add prior probability for the category
76
- s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
77
- score[category.to_s] += Math.log(s / training_count)
78
- end
79
- return score
80
- end
50
+ orig = @categories[category][word] || 0
51
+ @categories[category][word] ||= 0
52
+ @categories[category][word] -= count
53
+ if @categories[category][word] <= 0
54
+ @categories[category].delete(word)
55
+ count = orig
56
+ end
57
+ @category_word_count[category] -= count if @category_word_count[category] >= count
58
+ @total_words -= count
59
+ end
60
+ end
81
61
 
82
- #
83
- # Returns the classification of the provided +text+, which is one of the
84
- # categories given in the initializer. E.g.,
85
- # b.classify "I hate bad words and you"
86
- # => 'Uninteresting'
87
- def classify(text)
88
- (classifications(text).sort_by { |a| -a[1] })[0][0]
89
- end
90
-
91
- #
92
- # Provides training and untraining methods for the categories specified in Bayes#new
93
- # For example:
94
- # b = Classifier::Bayes.new 'This', 'That', 'the_other'
95
- # b.train_this "This text"
96
- # b.train_that "That text"
97
- # b.untrain_that "That text"
98
- # b.train_the_other "The other text"
99
- def method_missing(name, *args)
100
- category = name.to_s.gsub(/(un)?train_([\w]+)/, '\2').prepare_category_name
101
- if @categories.has_key? category
102
- args.each { |text| eval("#{$1}train(category, text)") }
103
- elsif name.to_s =~ /(un)?train_([\w]+)/
104
- raise StandardError, "No such category: #{category}"
105
- else
106
- super #raise StandardError, "No such method: #{name}"
107
- end
108
- end
109
-
110
- #
111
- # Provides a list of category names
112
- # For example:
113
- # b.categories
114
- # => ['This', 'That', 'the_other']
115
- def categories # :nodoc:
116
- @categories.keys.collect {|c| c.to_s}
117
- end
118
-
119
- #
120
- # Allows you to add categories to the classifier.
121
- # For example:
122
- # b.add_category "Not spam"
123
- #
124
- # WARNING: Adding categories to a trained classifier will
125
- # result in an undertrained category that will tend to match
126
- # more criteria than the trained selective categories. In short,
127
- # try to initialize your categories at initialization.
128
- def add_category(category)
129
- @categories[category.prepare_category_name] = Hash.new
130
- end
131
-
132
- alias append_category add_category
133
- end
62
+ #
63
+ # Returns the scores in each category the provided +text+. E.g.,
64
+ # b.classifications "I hate bad words and you"
65
+ # => {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
66
+ # The largest of these scores (the one closest to 0) is the one picked out by #classify
67
+ def classifications(text)
68
+ score = {}
69
+ word_hash = text.word_hash
70
+ training_count = @category_counts.values.inject { |x, y| x + y }.to_f
71
+ @categories.each do |category, category_words|
72
+ score[category.to_s] = 0
73
+ total = (@category_word_count[category] || 1).to_f
74
+ word_hash.each_key do |word|
75
+ s = category_words.key?(word) ? category_words[word] : 0.1
76
+ score[category.to_s] += Math.log(s / total)
77
+ end
78
+ # now add prior probability for the category
79
+ s = @category_counts.key?(category) ? @category_counts[category] : 0.1
80
+ score[category.to_s] += Math.log(s / training_count)
81
+ end
82
+ score
83
+ end
84
+
85
+ #
86
+ # Returns the classification of the provided +text+, which is one of the
87
+ # categories given in the initializer. E.g.,
88
+ # b.classify "I hate bad words and you"
89
+ # => 'Uninteresting'
90
+ def classify(text)
91
+ (classifications(text).sort_by { |a| -a[1] })[0][0]
92
+ end
93
+
94
+ #
95
+ # Provides training and untraining methods for the categories specified in Bayes#new
96
+ # For example:
97
+ # b = Classifier::Bayes.new 'This', 'That', 'the_other'
98
+ # b.train_this "This text"
99
+ # b.train_that "That text"
100
+ # b.untrain_that "That text"
101
+ # b.train_the_other "The other text"
102
+ def method_missing(name, *args)
103
+ category = name.to_s.gsub(/(un)?train_(\w+)/, '\2').prepare_category_name
104
+ if @categories.key?(category)
105
+ args.each do |text|
106
+ if name.to_s.start_with?('untrain_')
107
+ untrain(category, text)
108
+ else
109
+ train(category, text)
110
+ end
111
+ end
112
+ elsif name.to_s =~ /(un)?train_(\w+)/
113
+ raise StandardError, "No such category: #{category}"
114
+ else
115
+ super
116
+ end
117
+ end
118
+
119
+ #
120
+ # Provides a list of category names
121
+ # For example:
122
+ # b.categories
123
+ # => ['This', 'That', 'the_other']
124
+ def categories # :nodoc:
125
+ @categories.keys.collect(&:to_s)
126
+ end
127
+
128
+ #
129
+ # Allows you to add categories to the classifier.
130
+ # For example:
131
+ # b.add_category "Not spam"
132
+ #
133
+ # WARNING: Adding categories to a trained classifier will
134
+ # result in an undertrained category that will tend to match
135
+ # more criteria than the trained selective categories. In short,
136
+ # try to initialize your categories at initialization.
137
+ def add_category(category)
138
+ @categories[category.prepare_category_name] = {}
139
+ end
134
140
 
141
+ alias append_category add_category
142
+ end
135
143
  end
@@ -6,5 +6,5 @@ require 'fast_stemmer'
6
6
  require 'classifier/extensions/word_hash'
7
7
 
8
8
  class Object
9
- def prepare_category_name; to_s.gsub("_"," ").capitalize.intern end
9
+ def prepare_category_name = to_s.gsub('_', ' ').capitalize.intern
10
10
  end
@@ -1,112 +1,106 @@
1
1
  # Author:: Ernest Ellingson
2
- # Copyright:: Copyright (c) 2005
2
+ # Copyright:: Copyright (c) 2005
3
3
 
4
4
  # These are extensions to the std-lib 'matrix' to allow an all ruby SVD
5
5
 
6
6
  require 'matrix'
7
- require 'mathn'
8
7
 
9
8
  class Array
10
- def sum(identity = 0, &block)
11
- return identity unless size > 0
12
-
9
+ def sum_with_identity(identity = 0.0, &block)
10
+ return identity unless size.to_i.positive?
11
+
13
12
  if block_given?
14
- map(&block).sum
13
+ map(&block).sum_with_identity(identity)
15
14
  else
16
- reduce(:+)
15
+ compact.reduce(:+).to_f || identity.to_f
17
16
  end
18
17
  end
19
18
  end
20
19
 
21
- class Vector
20
+ module VectorExtensions
22
21
  def magnitude
23
- sumsqs = 0.0
24
- self.size.times do |i|
25
- sumsqs += self[i] ** 2.0
22
+ sum_of_squares = 0.to_r
23
+ size.times do |i|
24
+ sum_of_squares += self[i]**2.to_r
26
25
  end
27
- Math.sqrt(sumsqs)
26
+ Math.sqrt(sum_of_squares.to_f)
28
27
  end
29
- def normalize
30
- nv = []
31
- mag = self.magnitude
32
- self.size.times do |i|
33
-
34
- nv << (self[i] / mag)
35
28
 
29
+ def normalize
30
+ normalized_values = []
31
+ magnitude_value = magnitude.to_r
32
+ size.times do |i|
33
+ normalized_values << (self[i] / magnitude_value)
36
34
  end
37
- Vector[*nv]
35
+ Vector[*normalized_values]
38
36
  end
39
37
  end
40
38
 
39
+ class Vector
40
+ include VectorExtensions
41
+ end
42
+
41
43
  class Matrix
42
- def Matrix.diag(s)
43
- Matrix.diagonal(*s)
44
+ def self.diag(diagonal_elements)
45
+ Matrix.diagonal(*diagonal_elements)
44
46
  end
45
-
46
- alias :trans :transpose
47
47
 
48
- def SV_decomp(maxSweeps = 20)
49
- if self.row_size >= self.column_size
50
- q = self.trans * self
51
- else
52
- q = self * self.trans
53
- end
54
-
55
- qrot = q.dup
56
- v = Matrix.identity(q.row_size)
57
- azrot = nil
58
- mzrot = nil
59
- cnt = 0
60
- s_old = nil
61
- mu = nil
48
+ alias trans transpose
49
+
50
+ def SV_decomp(max_sweeps = 20)
51
+ q_matrix = if row_size >= column_size
52
+ trans * self
53
+ else
54
+ self * trans
55
+ end
62
56
 
63
- while true do
64
- cnt += 1
65
- for row in (0...qrot.row_size-1) do
66
- for col in (1..qrot.row_size-1) do
57
+ q_rotation_matrix = q_matrix.dup
58
+ v_matrix = Matrix.identity(q_matrix.row_size)
59
+ iteration_count = 0
60
+ previous_s_matrix = nil
61
+
62
+ loop do
63
+ iteration_count += 1
64
+ (0...q_rotation_matrix.row_size - 1).each do |row|
65
+ (1..q_rotation_matrix.row_size - 1).each do |col|
67
66
  next if row == col
68
- h = Math.atan((2 * qrot[row,col])/(qrot[row,row]-qrot[col,col]))/2.0
69
- hcos = Math.cos(h)
70
- hsin = Math.sin(h)
71
- mzrot = Matrix.identity(qrot.row_size)
72
- mzrot[row,row] = hcos
73
- mzrot[row,col] = -hsin
74
- mzrot[col,row] = hsin
75
- mzrot[col,col] = hcos
76
- qrot = mzrot.trans * qrot * mzrot
77
- v = v * mzrot
78
- end
67
+
68
+ angle = Math.atan((2.to_r * q_rotation_matrix[row,
69
+ col]) / (q_rotation_matrix[row,
70
+ row] - q_rotation_matrix[col,
71
+ col])) / 2.0
72
+ cosine = Math.cos(angle)
73
+ sine = Math.sin(angle)
74
+ rotation_matrix = Matrix.identity(q_rotation_matrix.row_size)
75
+ rotation_matrix[row, row] = cosine
76
+ rotation_matrix[row, col] = -sine
77
+ rotation_matrix[col, row] = sine
78
+ rotation_matrix[col, col] = cosine
79
+ q_rotation_matrix = rotation_matrix.trans * q_rotation_matrix * rotation_matrix
80
+ v_matrix *= rotation_matrix
81
+ end
79
82
  end
80
- s_old = qrot.dup if cnt == 1
81
- sum_qrot = 0.0
82
- if cnt > 1
83
- qrot.row_size.times do |r|
84
- sum_qrot += (qrot[r,r]-s_old[r,r]).abs if (qrot[r,r]-s_old[r,r]).abs > 0.001
83
+ previous_s_matrix = q_rotation_matrix.dup if iteration_count == 1
84
+ sum_of_differences = 0.to_r
85
+ if iteration_count > 1
86
+ q_rotation_matrix.row_size.times do |r|
87
+ difference = (q_rotation_matrix[r, r] - previous_s_matrix[r, r]).abs
88
+ sum_of_differences += difference.to_r if difference > 0.001
85
89
  end
86
- s_old = qrot.dup
87
- end
88
- break if (sum_qrot <= 0.001 and cnt > 1) or cnt >= maxSweeps
89
- end # of do while true
90
- s = []
91
- qrot.row_size.times do |r|
92
- s << Math.sqrt(qrot[r,r])
90
+ previous_s_matrix = q_rotation_matrix.dup
91
+ end
92
+ break if (sum_of_differences <= 0.001 && iteration_count > 1) || iteration_count >= max_sweeps
93
93
  end
94
- #puts "cnt = #{cnt}"
95
- if self.row_size >= self.column_size
96
- mu = self * v * Matrix.diagonal(*s).inverse
97
- return [mu, v, s]
98
- else
99
- puts v.row_size
100
- puts v.column_size
101
- puts self.row_size
102
- puts self.column_size
103
- puts s.size
104
94
 
105
- mu = (self.trans * v * Matrix.diagonal(*s).inverse)
106
- return [mu, v, s]
95
+ singular_values = []
96
+ q_rotation_matrix.row_size.times do |r|
97
+ singular_values << Math.sqrt(q_rotation_matrix[r, r].to_f)
107
98
  end
99
+ u_matrix = (row_size >= column_size ? self : trans) * v_matrix * Matrix.diagonal(*singular_values).inverse
100
+ [u_matrix, v_matrix, singular_values]
108
101
  end
109
- def []=(i,j,val)
110
- @rows[i][j] = val
102
+
103
+ def []=(row_index, col_index, value)
104
+ @rows[row_index][col_index] = value
111
105
  end
112
106
  end
@@ -1,20 +1,18 @@
1
1
  module GSL
2
-
3
2
  class Vector
4
- def _dump(v)
5
- Marshal.dump( self.to_a )
3
+ def _dump(_v)
4
+ Marshal.dump(to_a)
6
5
  end
7
-
6
+
8
7
  def self._load(arr)
9
8
  arry = Marshal.load(arr)
10
- return GSL::Vector.alloc(arry)
9
+ GSL::Vector.alloc(arry)
11
10
  end
12
-
13
11
  end
14
-
12
+
15
13
  class Matrix
16
- class <<self
17
- alias :diag :diagonal
18
- end
14
+ class << self
15
+ alias diag diagonal
16
+ end
19
17
  end
20
18
  end